import logging
from datetime import datetime

from torch.utils.data import DataLoader

from sentence_transformers import (LoggingHandler, SentenceTransformer,
                                   SentencesDataset, losses, models)
from sentence_transformers.readers import (NLIDataReader, STSDataReader,
                                           TripletReader)
# AGBDataReader (used in Example #3) is project-specific and is assumed to
# come from the surrounding project rather than from sentence-transformers.

# Example #1
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Read the dataset
train_batch_size = 16
num_epochs = 4
model_save_path = ('output/training_stsbenchmark_roberta-' +
                   datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True)

# Use RoBERTa for mapping tokens to embeddings
word_embedding_model = models.RoBERTa('roberta-base')

# Apply mean pooling to get one fixed-size sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
train_dataloader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=train_batch_size)
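
# The snippet breaks off here. Below is a minimal sketch of how the standard
# sentence-transformers STS training script typically continues; the loss,
# evaluator, and fit() wiring are assumptions based on that upstream example,
# not part of the original listing.
import math
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

train_loss = losses.CosineSimilarityLoss(model=model)

# Evaluate on the STS dev split during training
logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(sts_reader.get_examples('sts-dev.csv'), model)
dev_dataloader = DataLoader(dev_data,
                            shuffle=False,
                            batch_size=train_batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Warm up the learning rate over 10% of the training steps
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
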
# Example #2
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Read the dataset
batch_size = 16
nli_reader = NLIDataReader('datasets/AllNLI')
sts_reader = STSDataReader('datasets/stsbenchmark')
train_num_labels = nli_reader.get_num_labels()
model_save_path = 'output/training_nli_roberta-' + datetime.now().strftime(
    "%Y-%m-%d_%H-%M-%S")

# Use RoBERTa for mapping tokens to embeddings
word_embedding_model = models.RoBERTa('roberta-large')

# Apply mean pooling to get one fixed-size sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# Arguments follow the upstream NLI training example
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=train_num_labels)
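
# The listing is cut off after the loss. A sketch of the usual continuation
# from the upstream NLI training example follows; the evaluator, epoch count,
# and fit() call are assumptions (num_epochs is not defined in the surviving
# snippet, so 1 is assumed here).
import math
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Evaluate on the STS dev split while training on NLI
logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(sts_reader.get_examples('sts-dev.csv'),
                            model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

num_epochs = 1  # assumed; not present in the original snippet
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
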
# Example #3
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Read the dataset
model_name = 'roberta-base'
batch_size = 32
agb_reader = AGBDataReader('datasets/AGB_og')
train_num_labels = agb_reader.get_num_labels()
model_save_path = ('output/training_agb_og_' + model_name + '-' +
                   datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

# Use RoBERTa for mapping tokens to embeddings
word_embedding_model = models.RoBERTa(model_name)

# Apply mean pooling to get one fixed-size sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read AGB train dataset")
train_data = SentencesDataset(agb_reader.get_examples('train.tsv'),
                              model=model,
                              shorten=True)
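
# The snippet stops after building the dataset. A minimal sketch of the next
# steps, mirroring the NLI example above; the DataLoader and loss wiring are
# assumptions, and the evaluator used by the original project is not shown
# in the source, so it is omitted here.
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=train_num_labels)
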
# Example #4
import torch

torch.backends.cudnn.benchmark = True

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

output_path = "checkpoints/sentence_transformers/roberta-base_v7_triplet_full_epoch1"
num_epochs = 1
train_batch_size = 16
eval_batch_size = 256

# Use RoBERTa for mapping tokens to embeddings
word_embedding_model = models.RoBERTa('roberta-base', do_lower_case=False)

# Apply mean pooling to get one fixed-size sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Load Old Model
# model = SentenceTransformer('checkpoints/sentence_transformers/roberta-base_v7_triplet_epoch1')

triplet_reader = TripletReader('data/v7')

logging.info("Read Train dataset")
train_data = SentencesDataset(
    examples=triplet_reader.get_examples('triplet_train_full.csv'),
    model=model)
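
# The listing is truncated here. A minimal sketch of the usual triplet
# training continuation; the loss, evaluator, dev-split filename, and fit()
# call are assumptions based on the sentence-transformers triplet example
# ('triplet_dev_full.csv' is a hypothetical name for the dev split).
import math
from sentence_transformers.evaluation import TripletEvaluator

train_dataloader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.TripletLoss(model=model)

logging.info("Read Dev dataset")
dev_data = SentencesDataset(
    examples=triplet_reader.get_examples('triplet_dev_full.csv'),  # hypothetical
    model=model)
dev_dataloader = DataLoader(dev_data,
                            shuffle=False,
                            batch_size=eval_batch_size)
evaluator = TripletEvaluator(dev_dataloader)

warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=output_path)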