# Train a BiLSTM sentence encoder on top of GloVe word embeddings for STSbenchmark
import logging
from datetime import datetime

from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, models
from sentence_transformers.readers import STSBenchmarkDataReader

#### Print debug information to stdout
logging.basicConfig(level=logging.INFO, handlers=[LoggingHandler()])

# Read the dataset
batch_size = 32
sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark')
model_save_path = 'output/training_stsbenchmark_bilstm-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Map tokens to traditional word embeddings like GloVe
word_embedding_model = models.WordEmbeddings.from_text_file('glove.6B.300d.txt.gz')

lstm = models.LSTM(word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
                   hidden_dim=1024)

# Apply max pooling over the LSTM outputs to get one fixed-sized sentence vector
pooling_model = models.Pooling(lstm.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=False,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=True)

model = SentenceTransformer(modules=[word_embedding_model, lstm, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model)
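# The snippet above stops after constructing `train_data`. Below is a minimal sketch of how
# this kind of sentence-transformers script typically continues, assuming the same older API
# (SentencesDataset / STSBenchmarkDataReader) used above; the loss choice, dev-set evaluator,
# epoch count and warmup steps are illustrative assumptions, not taken from the original file.
from torch.utils.data import DataLoader
from sentence_transformers import losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

# Evaluate on the STS dev split during training
logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(sts_reader.get_examples('sts-dev.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Train with a cosine-similarity regression objective and save to model_save_path
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=10,
          warmup_steps=1000,
          output_path=model_save_path)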
# Train a PhoBERT + LSTM sentence encoder on a Vietnamese STS dataset
import os
import logging

from sentence_transformers import SentenceTransformer, SentencesDataset, models
from sentence_transformers.readers import STSBenchmarkDataReader

# `args` is assumed to be populated earlier in the script (e.g. via argparse)
os.mkdir(args.ckpt_path)

# Read the dataset
sts_reader = STSBenchmarkDataReader(args.data_path, normalize_scores=True)

# Use PhoBERT (a RoBERTa-style Vietnamese Huggingface/transformers model) for mapping tokens to embeddings
word_embedding_model = models.PhoBERT(args.pre_trained_path,
                                      tokenizer_args={'vncorenlp_path': args.vncorenlp_path,
                                                      'bpe_path': args.bpe_path})

lstm = models.LSTM(word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
                   hidden_dim=384,
                   num_layers=1)

# Apply mean pooling to get one fixed-sized sentence vector
# (the bidirectional LSTM output, 2 * 384 dims, matches PhoBERT-base's 768-dim word embeddings)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, lstm, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train_vi.csv'), model)
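# As above, a hedged sketch of the usual continuation for this PhoBERT variant. The
# `args.batch_size` and `args.epochs` CLI arguments, the dev file name 'sts-dev_vi.csv',
# and the evaluator setup are hypothetical assumptions; only `args.ckpt_path` appears
# in the original snippet.
from torch.utils.data import DataLoader
from sentence_transformers import losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=args.batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(sts_reader.get_examples('sts-dev_vi.csv'), model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=args.batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Checkpoints go to the directory created at the top of the script
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=args.epochs,
          warmup_steps=100,
          output_path=args.ckpt_path)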