def __init__(
    self,
    backbone: str = "sentence-transformers/all-MiniLM-L6-v2",
    max_length: int = 128,
    tokenizer_backbone: Optional[str] = None,
    tokenizer_kwargs: Optional[Dict[str, Any]] = None,
    enable_ort: bool = False,
):
    os.environ["TOKENIZERS_PARALLELISM"] = "TRUE"
    # silence the flood of Hugging Face warnings
    warnings.simplefilter("ignore")
    # propagate the warning filter to worker processes via the environment
    os.environ["PYTHONWARNINGS"] = "ignore"
    super().__init__()

    if tokenizer_backbone is None:
        tokenizer_backbone = backbone
    self.max_length = max_length
    self.collate_fn = TextClassificationCollate(
        backbone=tokenizer_backbone,
        max_length=max_length,
        tokenizer_kwargs=tokenizer_kwargs,
    )
    self.model = self.backbones.get(backbone)()
    self.pooling = Pooling(self.model.config.hidden_size)
    self.enable_ort = enable_ort
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Transformer, Pooling


def getmodel():
    # Local BERT checkpoint used as the word-embedding module
    word_embedding_model = Transformer(
        'D:\\greedySchool\\myproject\\sentence-transformers\\sentence_transformers\\bert-base-uncased'
    )
    # Mean pooling over token embeddings yields one fixed-size sentence vector
    pooling_model = Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False,
    )
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model],
                                device="cpu")
    return model
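For context, a minimal usage sketch of the helper above; the sentences are placeholders, encode() is the standard sentence-transformers inference call, and the sketch assumes the local checkpoint path actually exists:

# Usage sketch: embed a couple of sentences with the model built by getmodel()
model = getmodel()
sentences = ["The weather is lovely today.", "It is sunny outside."]
embeddings = model.encode(sentences)
print(len(embeddings), len(embeddings[0]))  # two vectors, one per sentence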
def load(self, path):
    modelhub = self.config.get("modelhub", True)

    # Download model from the model hub (default)
    if modelhub:
        model = Transformer(path)
        pooling = Pooling(model.get_word_embedding_dimension())

        return SentenceTransformer(modules=[model, pooling])

    # Download model directly from sentence-transformers if model hub disabled
    return SentenceTransformer(path)
def load(self, path):
    modelhub = self.config.get("modelhub", True)

    # Download model from the model hub (default)
    if modelhub:
        model = Transformer(path)
        pooling = Pooling(model.get_word_embedding_dimension())

        # Detect unbounded tokenizer typically found in older models
        Models.checklength(model.auto_model, model.tokenizer)

        return SentenceTransformer(modules=[model, pooling])

    # Download model directly from sentence-transformers if model hub disabled
    return SentenceTransformer(path)
def __init__(
    self,
    lemmatizer_label: str = "spacy-fr",
    embedding_model_label: str = "camembert-base",
    document_tokenizer: str = "only-words",
    mean_tokens: bool = True,
    cls_token: bool = True,
    max_tokens: bool = False,
    context_retrieval: bool = True,
):
    self.__lemmatizer_label = lemmatizer_label

    if context_retrieval:
        self.__embedding_model_label = embedding_model_label
        print(f"Loading {embedding_model_label} model...")
        self.__embedding_model = MODELS[
            CAMEMBERT_LABEL_TRANSLATOR[embedding_model_label]](
                CAMEMBERT_LABEL_TRANSLATOR[embedding_model_label])
        print(f"Finished loading {embedding_model_label} model!")

        print("Creating pooling model...\nMean tokens: {}\nCLS token: {}\nMax tokens: {}"
              .format(mean_tokens, cls_token, max_tokens))
        self.__pooling_model = Pooling(
            self.__embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=mean_tokens,
            pooling_mode_cls_token=cls_token,
            pooling_mode_max_tokens=max_tokens,
        )
        print("Pooling model created!")

        self.__sentence_transformer = SentenceTransformer(
            modules=[self.__embedding_model, self.__pooling_model])

        # Record the active pooling modes as a string such as "mean_cls"
        self.__pooling_modes: str = ""
        modes = []
        if mean_tokens:
            modes.append("mean")
        if cls_token:
            modes.append("cls")
        if max_tokens:
            modes.append("max")
        self.__pooling_modes = "_".join(modes)
    else:
        self.__embedding_model_label = ""
        self.__sentence_transformer = None
        self.__pooling_model = None
        self.__pooling_modes = ""

    self.__lemmatizer = LEMMATIZERS[lemmatizer_label]()
    self.__document_tokenizer = RegexpTokenizer(TOKENIZERS[document_tokenizer])
    self.context_retrieval = context_retrieval
def load(self, path, blocking):
    # Build a Transformer + mean-pooling SentenceTransformer from a local model path
    # (the blocking argument is accepted for interface compatibility but unused here)
    model = Transformer(path)
    pooling = Pooling(model.get_word_embedding_dimension())

    return SentenceTransformer(modules=[model, pooling])
def __init__(self, model_name_or_path: str = None, modules: Iterable[nn.Module] = None, device: str = None):
    if model_name_or_path is not None and model_name_or_path != "":
        logging.info("Load pretrained SentenceTransformer: {}".format(model_name_or_path))
        model_path = model_name_or_path

        if not os.path.isdir(model_path) and not model_path.startswith('http://') and not model_path.startswith('https://'):
            logging.info("Did not find folder {}. Assume to download model from server.".format(model_path))
            model_path = __DOWNLOAD_SERVER__ + model_path + '.zip'

        if model_path.startswith('http://') or model_path.startswith('https://'):
            model_url = model_path
            folder_name = model_url.replace("https://", "").replace("http://", "").replace("/", "_")[:250].rstrip('.zip')

            try:
                from torch.hub import _get_torch_home
                torch_cache_home = _get_torch_home()
            except ImportError:
                torch_cache_home = os.path.expanduser(
                    os.getenv('TORCH_HOME',
                              os.path.join(os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
            default_cache_path = os.path.join(torch_cache_home, 'sentence_transformers')
            model_path = os.path.join(default_cache_path, folder_name)
            os.makedirs(model_path, exist_ok=True)

            if not os.listdir(model_path):
                if model_url[-1] == "/":
                    model_url = model_url[:-1]
                logging.info("Downloading sentence transformer model from {} and saving it at {}".format(model_url, model_path))
                try:
                    zip_save_path = os.path.join(model_path, 'model.zip')
                    http_get(model_url, zip_save_path)
                    with ZipFile(zip_save_path, 'r') as zip:
                        zip.extractall(model_path)
                    os.remove(zip_save_path)
                except requests.exceptions.HTTPError as e:
                    shutil.rmtree(model_path)
                    if e.response.status_code == 404:
                        logging.warning('SentenceTransformer-Model {} not found. Try to create it from scratch'.format(model_url))
                        logging.warning('Try to create Transformer Model {} with mean pooling'.format(model_name_or_path))

                        model_path = None
                        transformer_model = Transformer(model_name_or_path)
                        pooling_model = Pooling(transformer_model.get_word_embedding_dimension())
                        modules = [transformer_model, pooling_model]
                    else:
                        raise e
                except Exception as e:
                    shutil.rmtree(model_path)
                    raise e

        #### Load from disk
        if model_path is not None:
            logging.info("Load SentenceTransformer from folder: {}".format(model_path))

            if os.path.exists(os.path.join(model_path, 'config.json')):
                with open(os.path.join(model_path, 'config.json')) as fIn:
                    config = json.load(fIn)
                    #if config['__version__'] > __version__:
                    #    logging.warning("You try to use a model that was created with version {}, however, your version is {}. This might cause unexpected behavior or errors. In that case, try to update to the latest version.\n\n\n".format(config['__version__'], __version__))

            with open(os.path.join(model_path, 'modules.json')) as fIn:
                contained_modules = json.load(fIn)

            modules = OrderedDict()
            for module_config in contained_modules:
                module_class = import_from_string(module_config['type'])
                module = module_class.load(os.path.join(model_path, module_config['path']))
                modules[module_config['name']] = module

    if modules is not None and not isinstance(modules, OrderedDict):
        modules = OrderedDict([(str(idx), module) for idx, module in enumerate(modules)])

    super().__init__(modules)
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logging.info("Use pytorch device: {}".format(device))

    self._target_device = torch.device(device)
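A brief sketch of the two construction paths this __init__ supports: loading a published model by name (downloaded from the model server and cached), or assembling one explicitly from modules. The model names are used purely as illustrations:

from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Transformer, Pooling

# Path 1: load by name; the archive is fetched from the download server and cached locally
model_by_name = SentenceTransformer('bert-base-nli-mean-tokens')

# Path 2: build from explicit modules; the pooling dimension comes from the transformer
word_embedding_model = Transformer('bert-base-uncased')
pooling_model = Pooling(word_embedding_model.get_word_embedding_dimension())
model_from_modules = SentenceTransformer(modules=[word_embedding_model, pooling_model])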
from sentence_transformers import SentenceTransformer, SentencesDataset
from sentence_transformers.models import CamemBERT, Pooling
from sentence_transformers.readers import NLIDataReader
from sentence_transformers.losses import SoftmaxLoss
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
import math
import logging
from datetime import datetime

# Use CamemBERT for mapping tokens to embeddings
model_name = 'camembert-base'
word_embedding_model = CamemBERT(model_name)

# Apply mean pooling to get one fixed-sized sentence vector
pooling_model = Pooling(word_embedding_model.get_word_embedding_dimension(),
                        pooling_mode_mean_tokens=True,
                        pooling_mode_cls_token=False,
                        pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

model_save_path = 'output/training_fquad_' + model_name + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

fquad_reader = NLIDataReader('datasets/FQuad')
batch_size = 4
train_num_labels = fquad_reader.get_num_labels()

train_data = SentencesDataset(fquad_reader.get_examples('train.gz'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = SoftmaxLoss(model=model,
                         sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                         num_labels=train_num_labels)
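The example stops once the loss is defined. Under the usual sentence-transformers NLI recipe, training might continue as in the sketch below; the epoch count, the 10% warmup heuristic, and the omission of a dev-set evaluator are assumptions, not part of the original script:

# Sketch only: linear warmup over roughly 10% of the training steps,
# as in the library's NLI training examples
num_epochs = 1
warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1)

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=None,          # a dev-set evaluator would normally be passed here
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)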
def train(model_name_or_path: str, hf_dataset: str, aspect: str, fold: Union[int, str], output_path: str,
          train_epochs: int = 3, train_batch_size: int = 25, eval_batch_size: int = 32,
          evaluation_steps: int = 5000, train_on_test: bool = False,
          loss: str = 'multiple_negatives_ranking', override: bool = False):
    """
    # $MODEL_NAME $HF_DATASET $ASPECT $FOLD $OUTPUT_DIR --train_epochs=3 --train_batch_size=$TRAIN_BATCH_SIZE --eval_batch_size=$EVAL_BATCH_SIZE

    Run with:
    $ export CUDA_VISIBLE_DEVICES=1
    $ ./sentence_transformer_cli.py train scibert-scivocab-uncased paperswithcode_task_docs 1 ./output/st_scibert/1 --train_epochs=3 --train_batch_size=25 --eval_batch_size=32

    :param loss: Training loss function (choices: multiple_negatives_ranking, cosine)
    :param train_on_test: If True, joint training on train and test set (validation disabled)
    :param aspect:
    :param evaluation_steps:
    :param train_epochs:
    :param model_name_or_path:
    :param hf_dataset:
    :param fold:
    :param output_path:
    :param train_batch_size:
    :param eval_batch_size:
    :param override:
    :return:
    """
    top_ks = [5, 10, 25, 50]

    # cuda_device = -1
    # hf_dataset = 'paperswithcode_task_docs'
    # model_name_or_path = 'scibert-scivocab-uncased'
    # fold = 1
    max_token_length = 336  # see pwc_token_stats.ipynb
    nlp_cache_dir = './data/nlp_cache'
    # train_batch_size = 25
    # eval_batch_size = 32
    # override = False
    # output_path = './output/pwc_task_st/1/sci-bert'
    # output_path = os.path.join(output_path, str(fold), model_name_or_path)  # output/1/sci-bert

    if os.path.exists(output_path) and not override:
        logger.error(f'Stop. Output path exists already: {output_path}')
        sys.exit(1)

    # if cuda_device >= 0:
    #     os.environ["CUDA_VISIBLE_DEVICES"] = str(cuda_device)
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Resolve model path from env
    if not os.path.exists(model_name_or_path) and os.path.exists(os.path.join(env['bert_dir'], model_name_or_path)):
        model_name_or_path = os.path.join(env['bert_dir'], model_name_or_path)

    word_embedding_model = Transformer(model_name_or_path, max_seq_length=max_token_length)
    pooling_model = Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # tokenizer = BertTokenizer.from_pretrained(model_name_or_path)

    # dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')
    train_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                            name='relations',
                            cache_dir=nlp_cache_dir,
                            split=get_train_split(aspect, fold))
    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir=nlp_cache_dir,
                           split=get_test_split(aspect, fold))

    # filter for positive labels only
    train_ds = train_ds.filter(lambda row: row['label'] == 'y')
    logger.info(f'After filtering: {len(train_ds):,}')

    # joint training on train and test?
    if train_on_test:
        #
        # import pyarrow
        # from datasets.arrow_dataset import Dataset
        #
        # full_ds_table = pyarrow.concat_tables([train_ds.data, test_ds.data])
        # full_ds = Dataset(arrow_table=full_ds_table)
        raise NotImplementedError('TODO Evaluator')
    else:
        # standard training on the train set only (test set is used for evaluation)
        train_sds = DocumentPairSentencesDataset(docs_ds, train_ds, model,
                                                 max_length=max_token_length,
                                                 forced_length=0)
        train_sds.tokenize_all_docs()

        evaluator = NearestNeighborsEvaluator(model, docs_ds, test_ds,
                                              top_ks=top_ks,
                                              batch_size=eval_batch_size,
                                              show_progress_bar=True)

    if loss == 'cosine':
        train_loss = losses.CosineSimilarityLoss(model)
    elif loss == 'multiple_negatives_ranking':
        # A nice advantage of MultipleNegativesRankingLoss is that it only requires positive pairs.
        # https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/quora_duplicate_questions
        train_loss = losses.MultipleNegativesRankingLoss(model)
    else:
        raise ValueError(f'Unsupported loss function: {loss}')

    train_dl = DataLoader(train_sds, shuffle=True, batch_size=train_batch_size)

    # Training
    model.fit(train_objectives=[(train_dl, train_loss)],
              epochs=train_epochs,  # try 1-4
              warmup_steps=100,
              evaluator=evaluator,
              evaluation_steps=evaluation_steps,  # increase to 5000 (full dataset => 20k steps)
              output_path=output_path,
              output_path_ignore_not_empty=True)

    logger.info('Training done')
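Once fit() has written the model to the output directory, it can be reloaded and queried like any other SentenceTransformer. A minimal sketch; the path matches the CLI example in the docstring above and the document strings are placeholders:

from sentence_transformers import SentenceTransformer

# Reload the fitted model from the training output directory
trained_model = SentenceTransformer('./output/st_scibert/1')
doc_embeddings = trained_model.encode(["first document text", "second document text"],
                                      batch_size=32,
                                      show_progress_bar=True)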