def pad_batch_BERT(self, batch, bert_layers, bert_dims):
    """Pad one batch up to the length of its longest item.

    The target length is the largest ``len(item)`` found in ``batch``; the
    padding itself is delegated to ``hF.pad_batch_with_sentences_BERT``.

    Args:
        batch: Collection of sized items (presumably documents made of
            sentence encodings -- confirm against the caller). Must not be
            empty, otherwise ``max`` raises ``ValueError``.
        bert_layers: Forwarded unchanged to the padding helper.
        bert_dims: Forwarded unchanged to the padding helper.

    Returns:
        A 5-tuple ``(padded_batch, max_num_sentences, None,
        no_padding_num_sentences, None)``. The third and fifth slots are
        always ``None``.
    """
    longest = max(len(item) for item in batch)
    padded, unpadded_counts = hF.pad_batch_with_sentences_BERT(
        batch, longest, bert_layers, bert_dims
    )
    return padded, longest, None, unpadded_counts, None
def encode_and_pad_BERT(self, data_batches, Bert_model_Path, device,
                        bert_layers, bert_dims):
    """Encode every batch with a pretrained BERT model, then pad it.

    A tokenizer and model are loaded from ``Bert_model_Path``; the model is
    switched to evaluation mode and moved to ``device``. Each batch is then
    encoded via ``hF.encode_batch_BERT`` and padded to the length of its own
    longest item via ``hF.pad_batch_with_sentences_BERT``.

    Args:
        data_batches: Indexable, mutable sequence of batches. It is modified
            in place: each entry is replaced by its encoded and padded form.
        Bert_model_Path: Location handed to ``from_pretrained`` for both the
            tokenizer and the model (e.g. a local directory holding an
            ``uncased_L-12_H-768_A-12`` checkpoint).
        device: Device the model is moved to and that is forwarded to the
            encoding helper.
        bert_layers: Forwarded unchanged to the encoding and padding helpers.
        bert_dims: Forwarded unchanged to the padding helper.

    Returns:
        A 5-tuple ``(data_batches, max_sentences, None,
        no_padding_sentences, None)`` where ``max_sentences`` and
        ``no_padding_sentences`` hold one entry per batch. The third and
        fifth slots are always ``None``.

    Raises:
        ValueError: If an encoded batch is empty (``max`` of nothing).
    """
    # Function-scope import: the package is only loaded when this runs.
    from pytorch_pretrained_bert import BertTokenizer, BertModel

    tokenizer = BertTokenizer.from_pretrained(Bert_model_Path)
    model = BertModel.from_pretrained(Bert_model_Path)
    model.eval()
    model.to(device)

    print('Encoding Data using BERT...')
    max_sentences = []
    no_padding_sentences = []
    # tqdm wraps the sequence itself (not the enumerate object) so that it
    # can read the length and display a total / ETA.
    for index, batch in enumerate(tqdm(data_batches)):
        batch = hF.encode_batch_BERT(
            batch, model, tokenizer, device, bert_layers)
        max_num_sentences = max(len(x) for x in batch)
        batch, no_padding_num_sentences = hF.pad_batch_with_sentences_BERT(
            batch, max_num_sentences, bert_layers, bert_dims)
        max_sentences.append(max_num_sentences)
        no_padding_sentences.append(no_padding_num_sentences)
        # Replacing the current slot keeps the sequence length unchanged,
        # so doing it while iterating is safe.
        data_batches[index] = batch

    return data_batches, max_sentences, None, no_padding_sentences, None
def pad_BERT(self, data_batches, bert_layers, bert_dims):
    """Pad every batch to the length of its own longest item.

    Each batch is padded independently through
    ``hF.pad_batch_with_sentences_BERT``; no encoding is performed here.

    Args:
        data_batches: Indexable, mutable sequence of batches. It is modified
            in place: each entry is replaced by its padded form.
        bert_layers: Forwarded unchanged to the padding helper.
        bert_dims: Forwarded unchanged to the padding helper.

    Returns:
        A 5-tuple ``(data_batches, max_sentences, None,
        no_padding_sentences, None)`` where ``max_sentences`` and
        ``no_padding_sentences`` hold one entry per batch. The third and
        fifth slots are always ``None``.

    Raises:
        ValueError: If a batch is empty (``max`` of nothing).
    """
    print('Padding Data using BERT...')
    max_sentences = []
    no_padding_sentences = []
    # tqdm wraps the sequence itself (not the enumerate object) so that it
    # can read the length and display a total / ETA.
    for index, batch in enumerate(tqdm(data_batches)):
        max_num_sentences = max(len(x) for x in batch)
        batch, no_padding_num_sentences = hF.pad_batch_with_sentences_BERT(
            batch, max_num_sentences, bert_layers, bert_dims)
        max_sentences.append(max_num_sentences)
        no_padding_sentences.append(no_padding_num_sentences)
        # Replacing the current slot keeps the sequence length unchanged,
        # so doing it while iterating is safe.
        data_batches[index] = batch

    return data_batches, max_sentences, None, no_padding_sentences, None