Example #1
    def __init__(
        self,
        backbone: str = "sentence-transformers/all-MiniLM-L6-v2",
        max_length: int = 128,
        tokenizer_backbone: Optional[str] = None,
        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
        enable_ort: bool = False,
    ):
        os.environ["TOKENIZERS_PARALLELISM"] = "TRUE"
        # silence the flood of Hugging Face warnings
        warnings.simplefilter("ignore")
        # propagate the warning filter to worker processes via the environment
        os.environ["PYTHONWARNINGS"] = "ignore"
        super().__init__()

        if tokenizer_backbone is None:
            tokenizer_backbone = backbone
        self.max_length = max_length
        self.collate_fn = TextClassificationCollate(
            backbone=tokenizer_backbone,
            max_length=max_length,
            tokenizer_kwargs=tokenizer_kwargs)
        self.model = self.backbones.get(backbone)()
        self.pooling = Pooling(self.model.config.hidden_size)
        self.enable_ort = enable_ort
Example #2
def getmodel():
    word_embedding_model = Transformer(
        'D:\\greedySchool\\myproject\\sentence-transformers\\sentence_transformers\\bert-base-uncased'
    )
    pooling_model = Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model],
                                device="cpu")
    return model
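A minimal usage sketch for the model returned by getmodel() above; the sentences are placeholders and the 768-dimension assumption comes from bert-base-uncased with mean pooling:

model = getmodel()
# encode() maps each sentence to a fixed-size vector (768-dim for bert-base-uncased)
embeddings = model.encode(["first example sentence", "second example sentence"])
print(len(embeddings), embeddings[0].shape)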
Example #3
    def load(self, path):
        modelhub = self.config.get("modelhub", True)

        # Download model from the model hub (default)
        if modelhub:
            model = Transformer(path)
            pooling = Pooling(model.get_word_embedding_dimension())

            return SentenceTransformer(modules=[model, pooling])

        # Download model directly from sentence transformers if model hub disabled
        return SentenceTransformer(path)
Example #4
    def load(self, path):
        modelhub = self.config.get("modelhub", True)

        # Download model from the model hub (default)
        if modelhub:
            model = Transformer(path)
            pooling = Pooling(model.get_word_embedding_dimension())

            # Detect unbounded tokenizer typically found in older models
            Models.checklength(model.auto_model, model.tokenizer)

            return SentenceTransformer(modules=[model, pooling])

        # Download model directly from sentence transformers if model hub disabled
        return SentenceTransformer(path)
Example #5
 def __init__(
     self,
     lemmatizer_label: str = "spacy-fr",
     embedding_model_label: str = "camembert-base",
     document_tokenizer: str = "only-words",
     mean_tokens: bool = True,
     cls_token: bool = True,
     max_tokens: bool = False,
     context_retrieval: bool = True,
 ):
     self.__lemmatizer_label = lemmatizer_label
     if context_retrieval:
         self.__embedding_model_label = embedding_model_label
         print(f"Loading {embedding_model_label} model...")
         self.__embedding_model = MODELS[
             CAMEMBERT_LABEL_TRANSLATOR[embedding_model_label]](
                 CAMEMBERT_LABEL_TRANSLATOR[embedding_model_label])
         print(f"Finished loading {embedding_model_label} model!")
         print(
             "Creating pooling model...\nMean tokens: {}\nCLS token: {}\nMax tokens: {}"
             .format(mean_tokens, cls_token, max_tokens))
         self.__pooling_model = Pooling(
             self.__embedding_model.get_word_embedding_dimension(),
             pooling_mode_mean_tokens=mean_tokens,
             pooling_mode_cls_token=cls_token,
             pooling_mode_max_tokens=max_tokens,
         )
         print("Pooling model created!")
         self.__sentence_transformer = SentenceTransformer(
             modules=[self.__embedding_model, self.__pooling_model])
         self.__pooling_modes: str = ""
         modes = []
         if mean_tokens:
             modes.append("mean")
         if cls_token:
             modes.append("cls")
         if max_tokens:
             modes.append("max")
         self.__pooling_modes = "_".join(modes)
     else:
         self.__embedding_model_label = ""
         self.__sentence_transformer = None
         self.__pooling_model = None
         self.__pooling_modes = ""
     self.__lemmatizer = LEMMATIZERS[lemmatizer_label]()
     self.__document_tokenizer = RegexpTokenizer(
         TOKENIZERS[document_tokenizer])
     self.context_retrieval = context_retrieval
Example #6
    def load(self, path, blocking):
        model = Transformer(path)
        pooling = Pooling(model.get_word_embedding_dimension())

        return SentenceTransformer(modules=[model, pooling])
Example #7
    def __init__(self,
                 model_name_or_path: str = None,
                 modules: Iterable[nn.Module] = None,
                 device: str = None):
        if model_name_or_path is not None and model_name_or_path != "":
            logging.info("Load pretrained SentenceTransformer: {}".format(
                model_name_or_path))
            model_path = model_name_or_path

            if not os.path.isdir(model_path) and not model_path.startswith(
                    'http://') and not model_path.startswith('https://'):
                logging.info(
                    "Did not find folder {}. Assuming the model should be downloaded from the server."
                    .format(model_path))
                model_path = __DOWNLOAD_SERVER__ + model_path + '.zip'

            if model_path.startswith('http://') or model_path.startswith(
                    'https://'):
                model_url = model_path
                folder_name = model_url.replace("https://", "").replace(
                    "http://", "").replace("/", "_")[:250].rstrip('.zip')

                try:
                    from torch.hub import _get_torch_home
                    torch_cache_home = _get_torch_home()
                except ImportError:
                    torch_cache_home = os.path.expanduser(
                        os.getenv(
                            'TORCH_HOME',
                            os.path.join(
                                os.getenv('XDG_CACHE_HOME', '~/.cache'),
                                'torch')))
                default_cache_path = os.path.join(torch_cache_home,
                                                  'sentence_transformers')
                model_path = os.path.join(default_cache_path, folder_name)
                os.makedirs(model_path, exist_ok=True)

                if not os.listdir(model_path):
                    if model_url[-1] == "/":
                        model_url = model_url[:-1]
                    logging.info(
                        "Downloading sentence transformer model from {} and saving it at {}"
                        .format(model_url, model_path))
                    try:
                        zip_save_path = os.path.join(model_path, 'model.zip')
                        http_get(model_url, zip_save_path)
                        with ZipFile(zip_save_path, 'r') as zip:
                            zip.extractall(model_path)
                        os.remove(zip_save_path)
                    except requests.exceptions.HTTPError as e:
                        shutil.rmtree(model_path)
                        if e.response.status_code == 404:
                            logging.warning(
                                'SentenceTransformer model {} not found. Trying to create it from scratch'
                                .format(model_url))
                            logging.warning(
                                'Trying to create a Transformer model {} with mean pooling'
                                .format(model_name_or_path))

                            model_path = None
                            transformer_model = Transformer(model_name_or_path)
                            pooling_model = Pooling(
                                transformer_model.get_word_embedding_dimension(
                                ))
                            modules = [transformer_model, pooling_model]

                        else:
                            raise e
                    except Exception as e:
                        shutil.rmtree(model_path)
                        raise e

            #### Load from disk
            if model_path is not None:
                logging.info("Load SentenceTransformer from folder: {}".format(
                    model_path))

                if os.path.exists(os.path.join(model_path, 'config.json')):
                    with open(os.path.join(model_path, 'config.json')) as fIn:
                        config = json.load(fIn)
                        #if config['__version__'] > __version__:
                        #    logging.warning("You try to use a model that was created with version {}, however, your version is {}. This might cause unexpected behavior or errors. In that case, try to update to the latest version.\n\n\n".format(config['__version__'], __version__))

                with open(os.path.join(model_path, 'modules.json')) as fIn:
                    contained_modules = json.load(fIn)

                modules = OrderedDict()
                for module_config in contained_modules:
                    module_class = import_from_string(module_config['type'])
                    module = module_class.load(
                        os.path.join(model_path, module_config['path']))
                    modules[module_config['name']] = module

        if modules is not None and not isinstance(modules, OrderedDict):
            modules = OrderedDict([(str(idx), module)
                                   for idx, module in enumerate(modules)])

        super().__init__(modules)
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
            logging.info("Use pytorch device: {}".format(device))

        self._target_device = torch.device(device)
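For context, the constructor above is what a plain one-line load goes through; a minimal usage sketch with a public pretrained name (the checkpoint name is illustrative, not taken from the snippet):

# Download (or load from cache) a pretrained model and embed a sentence
model = SentenceTransformer('bert-base-nli-mean-tokens')
embeddings = model.encode(['An example sentence to embed.'])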
Example #8
from sentence_transformers import SentenceTransformer, SentencesDataset
from sentence_transformers.models import CamemBERT, Pooling
from sentence_transformers.readers import NLIDataReader
from sentence_transformers.losses import SoftmaxLoss
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
import math
import logging
from datetime import datetime

# Use CamemBERT for mapping tokens to embeddings
model_name = 'camembert-base'
word_embedding_model = CamemBERT(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = Pooling(word_embedding_model.get_word_embedding_dimension(),
                        pooling_mode_mean_tokens=True,
                        pooling_mode_cls_token=False,
                        pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

model_save_path = 'output/training_fquad_' + model_name + '-' + \
    datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

fquad_reader = NLIDataReader('datasets/FQuad')
batch_size = 4
train_num_labels = fquad_reader.get_num_labels()

train_data = SentencesDataset(fquad_reader.get_examples('train.gz'),
                              model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# Softmax classification head over the concatenated sentence embeddings
train_loss = SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=train_num_labels)
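The example stops after the loss is constructed. Purely as a sketch of how this older sentence-transformers NLI recipe typically continues (the dev split name and hyperparameters below are assumptions, not part of the original snippet):

# Assumed continuation: build a dev evaluator and fine-tune (values are illustrative)
dev_data = SentencesDataset(fquad_reader.get_examples('dev.gz'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

num_epochs = 1
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)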
Example #9
def train(model_name_or_path: str,
          hf_dataset: str,
          aspect: str,
          fold: Union[int, str],
          output_path: str,
          train_epochs: int = 3,
          train_batch_size: int = 25,
          eval_batch_size: int = 32,
          evaluation_steps: int = 5000,
          train_on_test: bool = False,
          loss: str = 'multiple_negatives_ranking',
          override: bool = False):
    """

    # $MODEL_NAME $HF_DATASET $ASPECT $FOLD $OUTPUT_DIR --train_epochs=3 --train_batch_size=$TRAIN_BATCH_SIZE --eval_batch_size=$EVAL_BATCH_SIZE

    Run with:
    $ export CUDA_VISIBLE_DEVICES=1
    $ ./sentence_transformer_cli.py train scibert-scivocab-uncased paperswithcode_task_docs 1 ./output/st_scibert/1 --train_epochs=3 --train_batch_size=25 --eval_batch_size=32


    :param loss: Training loss function (choices: multiple_negatives_ranking, cosine)
    :param train_on_test: If True, joint training on train and test set (validation disabled)
    :param aspect:
    :param evaluation_steps:
    :param train_epochs:
    :param model_name_or_path:
    :param hf_dataset:
    :param fold:
    :param output_path:
    :param train_batch_size:
    :param eval_batch_size:
    :param override:
    :return:
    """

    top_ks = [5, 10, 25, 50]
    # cuda_device = -1

    # hf_dataset = 'paperswithcode_task_docs'
    # model_name_or_path = 'scibert-scivocab-uncased'
    # fold = 1
    max_token_length = 336  # see pwc_token_stats.ipynb
    nlp_cache_dir = './data/nlp_cache'

    # train_batch_size = 25
    # eval_batch_size = 32
    # override = False

    # output_path = './output/pwc_task_st/1/sci-bert'
    # output_path = os.path.join(output_path, str(fold), model_name_or_path)  # output/1/sci-bert

    if os.path.exists(output_path) and not override:
        logger.error(f'Stop. Output path exists already: {output_path}')
        sys.exit(1)

    # if cuda_device >= 0:
    #     os.environ["CUDA_VISIBLE_DEVICES"] = str(cuda_device)

    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Model path from env
    if not os.path.exists(model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_name_or_path)):
        model_name_or_path = os.path.join(env['bert_dir'], model_name_or_path)

    word_embedding_model = Transformer(model_name_or_path,
                                       max_seq_length=max_token_length)
    pooling_model = Pooling(
        word_embedding_model.get_word_embedding_dimension())

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    # tokenizer = BertTokenizer.from_pretrained(model_name_or_path)

    # dataset
    docs_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='docs',
                           cache_dir=nlp_cache_dir,
                           split='docs')
    train_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                            name='relations',
                            cache_dir=nlp_cache_dir,
                            split=get_train_split(aspect, fold))
    test_ds = load_dataset(get_local_hf_dataset_path(hf_dataset),
                           name='relations',
                           cache_dir=nlp_cache_dir,
                           split=get_test_split(aspect, fold))

    # filter for positive labels only
    train_ds = train_ds.filter(lambda row: row['label'] == 'y')

    logger.info(f'After filtering: {len(train_ds):,}')

    # joint training on train and test?
    if train_on_test:
        #
        # import pyarrow
        # from datasets.arrow_dataset import Dataset
        #
        # full_ds_table = pyarrow.concat_tables([train_ds.data, test_ds.data])
        # full_ds = Dataset(arrow_table=full_ds_table)
        raise NotImplementedError('TODO Evaluator')
    else:
        # standard training: fit on the train split only, evaluate on the test split
        train_sds = DocumentPairSentencesDataset(docs_ds,
                                                 train_ds,
                                                 model,
                                                 max_length=max_token_length,
                                                 forced_length=0)
        train_sds.tokenize_all_docs()

        evaluator = NearestNeighborsEvaluator(model,
                                              docs_ds,
                                              test_ds,
                                              top_ks=top_ks,
                                              batch_size=eval_batch_size,
                                              show_progress_bar=True)

    if loss == 'cosine':
        train_loss = losses.CosineSimilarityLoss(model)
    elif loss == 'multiple_negatives_ranking':
        # A nice advantage of MultipleNegativesRankingLoss is that it only requires positive pairs
        # https://github.com/UKPLab/sentence-transformers/tree/master/examples/training/quora_duplicate_questions
        train_loss = losses.MultipleNegativesRankingLoss(model)
    else:
        raise ValueError(f'Unsupported loss function: {loss}')

    train_dl = DataLoader(train_sds, shuffle=True, batch_size=train_batch_size)

    # Training
    model.fit(
        train_objectives=[(train_dl, train_loss)],
        epochs=train_epochs,  # try 1-4
        warmup_steps=100,
        evaluator=evaluator,
        evaluation_steps=evaluation_steps,  # increase to 5000 (full dataset => 20k steps)
        output_path=output_path,
        output_path_ignore_not_empty=True)

    logger.info('Training done')
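As the comment in the loss selection above notes, MultipleNegativesRankingLoss only needs positive pairs, with the other sentences in a batch acting as negatives. A self-contained sketch of that pattern, using placeholder sentences and an arbitrary public checkpoint (not taken from the example):

from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Each InputExample holds an (anchor, positive) pair; other in-batch pairs serve as negatives
train_examples = [
    InputExample(texts=['How do I reset my password?', 'Steps to reset a forgotten password']),
    InputExample(texts=['Best pizza in town', 'Where to find good pizza nearby']),
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=2)
train_loss = losses.MultipleNegativesRankingLoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=10)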