Example #1
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})

# Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression)
config = BertConfig.from_pretrained("bert-base-cased", num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', config=config)

# Load dataset via TensorFlow Datasets
data, info = tensorflow_datasets.load(f'glue/{TFDS_TASK}', with_info=True)
train_examples = info.splits['train'].num_examples

# MNLI expects either validation_matched or validation_mismatched
valid_examples = info.splits['validation'].num_examples

# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, TASK)

# MNLI expects either validation_matched or validation_mismatched
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, TASK)
train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)

# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
if USE_AMP:
    # loss scaling is currently required when using mixed precision
    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')


if num_labels == 1:
    loss = tf.keras.losses.MeanSquaredError()
else:
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
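Example #1 breaks off before the model is compiled. A minimal sketch of the likely continuation, mirroring Examples #8-#10 further down this page (the epoch count and step sizes are assumptions carried over from those examples):

# Sketch (assumption): compile and fit as in the later examples on this page
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=opt, loss=loss, metrics=[metric])

train_steps = train_examples // BATCH_SIZE
valid_steps = valid_examples // EVAL_BATCH_SIZE
history = model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps,
                    validation_data=valid_dataset, validation_steps=valid_steps)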
Example #2
EVAL_BATCH_SIZE = BATCH_SIZE * 2

with open('../data/lenghts/tweet_info.json') as json_file:
    data_info = json.load(json_file)

train_exs = data_info['train_length']
val_exs = data_info['val_length']
test_exs = data_info['test_length']

print((train_exs, val_exs, test_exs))

train_dataset = glue_convert_examples_to_features(examples=train_parsed,
                                                  tokenizer=tokenizer,
                                                  max_length=128,
                                                  task='sst-2',
                                                  label_list=['0', '-1', '1'])

val_dataset = glue_convert_examples_to_features(examples=val_parsed,
                                                tokenizer=tokenizer,
                                                max_length=128,
                                                task='sst-2',
                                                label_list=['0', '-1', '1'])

test_dataset = glue_convert_examples_to_features(examples=test_parsed,
                                                 tokenizer=tokenizer,
                                                 max_length=128,
                                                 task='sst-2',
                                                 label_list=['0', '-1', '1'])
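The excerpt stops after feature conversion. Before they can be passed to model.fit/predict, the resulting tf.data datasets still need to be batched; a sketch, assuming the same shuffle/batch/repeat pattern used by the other examples here and reusing BATCH_SIZE/EVAL_BATCH_SIZE from the top of the snippet:

# Sketch (assumption): batch the converted datasets as in the other examples
train_dataset = train_dataset.shuffle(train_exs).batch(BATCH_SIZE).repeat(-1)
val_dataset = val_dataset.batch(EVAL_BATCH_SIZE)
test_dataset = test_dataset.batch(EVAL_BATCH_SIZE)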
Example #3
import os
import tensorflow as tf
import tensorflow_datasets
from transformers import BertTokenizer, BertForSequenceClassification, TFBertForSequenceClassification, glue_convert_examples_to_features

BATCH_SIZE = 32
FINE_TUNED_MODEL_DIR = "./data/"

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-cased")

data, info = tensorflow_datasets.load("glue/mrpc", with_info=True)
num_train = info.splits["train"].num_examples
num_valid = info.splits["validation"].num_examples

Xtrain = glue_convert_examples_to_features(data["train"], tokenizer, 128,
                                           "mrpc")
Xtrain = Xtrain.shuffle(128).batch(32).repeat(-1)
Xvalid = glue_convert_examples_to_features(data["validation"], tokenizer, 128,
                                           "mrpc")
Xvalid = Xvalid.batch(32)

opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
model.compile(optimizer=opt, loss=loss, metrics=[metric])

train_steps = num_train // 32
valid_steps = num_valid // 32

history = model.fit(Xtrain,
                    epochs=2,
                    steps_per_epoch=train_steps,
                    validation_data=Xvalid,
                    validation_steps=valid_steps)
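FINE_TUNED_MODEL_DIR is defined but never used in the excerpt; presumably the fine-tuned weights end up there. A sketch using the standard save_pretrained calls (the tokenizer save is an addition, not shown in the original):

# Sketch (assumption): persist the fine-tuned model and tokenizer for later reuse
os.makedirs(FINE_TUNED_MODEL_DIR, exist_ok=True)
model.save_pretrained(FINE_TUNED_MODEL_DIR)
tokenizer.save_pretrained(FINE_TUNED_MODEL_DIR)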
Example #4
        tokenizer = RobertaTokenizer.from_pretrained(
            '/home/rohola/codes/program-r/libs/pretrain_roberta_model')
        model = RobertaForSequenceClassification.from_pretrained(
            '/home/rohola/codes/program-r/libs/pretrain_roberta_model')

        sentence1 = "Dogs are cute."
        sentence2 = "I need an Macbook."
        sentence3 = "Computer technology is awesome."

        example = InputExample(guid=0,
                               text_a=sentence3,
                               text_b=sentence2,
                               label=0)
        feature = glue_convert_examples_to_features(examples=[example],
                                                    tokenizer=tokenizer,
                                                    max_length=128,
                                                    output_mode='regression',
                                                    label_list=[None])

        input_ids = torch.tensor(feature[0].input_ids).unsqueeze(0)
        attention_mask = torch.tensor(feature[0].attention_mask).unsqueeze(0)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            print("model outputs: {}".format(outputs[0].item()))

        semantic_similarity = SemanticClassifer(model, tokenizer)
        s1 = semantic_similarity.similarity_with_concept(
            "The computer technology is awesome", "Intel")
        s2 = semantic_similarity.similarity_with_concept(
            "The computer technology is awesome", "dog")
Example #5
test_examples = data_info['test_length']

USE_XLA = False
USE_AMP = False
tf.config.optimizer.set_jit(USE_XLA)
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
config = BertConfig("bert_config.json")
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased',
                                                        config=config)

#Training Dataset
train_dataset = glue_convert_examples_to_features(examples=tr_clean_ds,
                                                  tokenizer=tokenizer,
                                                  max_length=128,
                                                  task='sst-2',
                                                  label_list=['1', '3'])
train_dataset = train_dataset.shuffle(train_examples).batch(BATCH_SIZE).repeat(
    -1)

#Validation Dataset
valid_dataset = glue_convert_examples_to_features(examples=val_clean_ds,
                                                  tokenizer=tokenizer,
                                                  max_length=128,
                                                  task='sst-2',
                                                  label_list=['1', '3'])
valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)

opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
Example #6
    def __init__(
        self,
        args: GlueMemDataTrainingArguments,
        tokenizer: PreTrainedTokenizer,
        mem_size=20,
        limit_length: Optional[int] = None,
        mode: Union[str, Split] = Split.train,
        cache_dir: Optional[str] = None,
    ):
        self.args = args
        self.processor = glue_processors[args.task_name]()
        self.output_mode = glue_output_modes[args.task_name]
        if isinstance(mode, str):
            try:
                mode = Split[mode]
            except KeyError:
                raise KeyError("mode is not a valid split name")
        # Load data features from cache or dataset file
        cached_features_file = os.path.join(
            cache_dir if cache_dir is not None else args.data_dir,
            "cached_mem_{}_{}_{}_{}".format(
                mode.value,
                tokenizer.__class__.__name__,
                str(args.max_seq_length),
                args.task_name,
            ),
        )
        label_list = self.processor.get_labels()
        if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
            RobertaTokenizer,
            RobertaTokenizerFast,
            XLMRobertaTokenizer,
            BartTokenizer,
            BartTokenizerFast,
        ):
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        self.label_list = label_list

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):

            if os.path.exists(cached_features_file) and not args.overwrite_cache:
                start = time.time()
                self.features = torch.load(cached_features_file)
                logger.info(
                    f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start
                )
            else:
                logger.info(f"Creating features from dataset file at {args.data_dir}")

                if mode == Split.dev:
                    examples = self.processor.get_dev_examples(args.data_dir)
                elif mode == Split.test:
                    examples = self.processor.get_test_examples(args.data_dir)
                else:
                    examples = self.processor.get_train_examples(args.data_dir)
                if limit_length is not None:
                    examples = examples[:limit_length]
                self.features = glue_convert_examples_to_features(
                    examples,
                    tokenizer,
                    max_length=args.max_seq_length - args.mem_size,
                    label_list=label_list,
                    output_mode=self.output_mode,
                )
                # Insert `mem_size` copies of the [mem] token id right after
                # the [CLS] token, and prepend matching entries to the
                # attention mask and token type ids so all sequences stay
                # aligned in length.
                mem_id = tokenizer.added_tokens_encoder['[mem]']
                input_ids = [el.input_ids for el in self.features]
                input_ids = [[el[0]] + args.mem_size*[mem_id] + el[1:] for el in input_ids]
                attention_mask = [el.attention_mask for el in self.features]
                attention_mask = [args.mem_size*[1] + el for el in attention_mask]
                token_type_ids = [el.token_type_ids for el in self.features]
                token_type_ids = [args.mem_size*[0] + el for el in token_type_ids]
                labels = [el.label for el in self.features]
                self.features = [InputFeatures(input_ids=el[0], attention_mask=el[1], token_type_ids=el[2], label=el[3])
                                 for el in zip(input_ids, attention_mask, token_type_ids, labels)]
                start = time.time()
                torch.save(self.features, cached_features_file)
                # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
                logger.info(
                    "Saving features into cached file %s [took %.3f s]", cached_features_file, time.time() - start
                )
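The __init__ above relies on tokenizer.added_tokens_encoder['[mem]'], which only resolves if a '[mem]' token has already been registered on the tokenizer. A minimal sketch of the setup this class appears to assume, run before constructing the dataset (the `model` variable and the embedding resize are assumptions, not part of the original snippet):

# Sketch (assumption): register the [mem] placeholder before building the dataset
num_added = tokenizer.add_tokens(['[mem]'])
if num_added > 0:
    # Any model trained on these features needs its embedding matrix resized
    # so the new token id has an embedding row.
    model.resize_token_embeddings(len(tokenizer))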
Example #7
import time

import tensorflow as tf
import tensorflow_datasets
from transformers import (BertTokenizer, TFBertForSequenceClassification,
                          DistilBertTokenizer, TFDistilBertForSequenceClassification,
                          glue_convert_examples_to_features)

# Load MRPC data
data = tensorflow_datasets.load('glue/mrpc')

# Pick GPU device (only pick 1 GPU)
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

# Load tokenizer, model from pretrained model/vocabulary
bert_tokenizer = BertTokenizer.from_pretrained('mrpc/1')
bert_model = TFBertForSequenceClassification.from_pretrained('mrpc/1')

valid_dataset = glue_convert_examples_to_features(data['validation'], bert_tokenizer, max_length=128, task='mrpc')
valid_dataset = valid_dataset.batch(64)

# Evaluate time for bert_model (bigger model)
start_time = time.time()
results = bert_model.predict(valid_dataset)
execution_time = time.time() - start_time

# Load tokenizer, model from pretrained model/vocabulary
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('mrpc/2')
distilbert_model = TFDistilBertForSequenceClassification.from_pretrained('mrpc/2')

valid_dataset = glue_convert_examples_to_features(data['validation'], distilbert_tokenizer, max_length=128, task='mrpc')
valid_dataset = valid_dataset.batch(64)

# Evaluate time for distilbert_model (smaller model)
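The excerpt cuts off before the DistilBERT timing. A sketch mirroring the timing block used for bert_model above:

# Sketch: same timing pattern as used for bert_model above
start_time = time.time()
distilbert_results = distilbert_model.predict(valid_dataset)
distilbert_execution_time = time.time() - start_time
print("BERT: {:.1f}s, DistilBERT: {:.1f}s".format(execution_time,
                                                  distilbert_execution_time))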
Example #8
tf.config.optimizer.set_jit(USE_XLA)
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})

# Load tokenizer and model from pretrained model/vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')

# Load dataset via TensorFlow Datasets
data_folder = "/content/drive/My Drive/AI/data/glue_data"
(train_raw,
 train_examples), (dev_raw,
                   valid_examples) = load_glue_data(data_folder, "qnli")

# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(train_raw, tokenizer, 128,
                                                  'qnli')
valid_dataset = glue_convert_examples_to_features(dev_raw, tokenizer, 128,
                                                  'qnli')
train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)

# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
if USE_AMP:
    # loss scaling is currently required when using mixed precision
    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
        opt, 'dynamic')
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=opt, loss=loss, metrics=[metric])
Example #9
# Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression)
config = BertConfig.from_pretrained("bert-base-cased", num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-cased",
                                                        config=config)

# Load dataset via TensorFlow Datasets
data, info = tensorflow_datasets.load(f"glue/{TFDS_TASK}", with_info=True)
train_examples = info.splits["train"].num_examples

# MNLI expects either validation_matched or validation_mismatched
valid_examples = info.splits["validation"].num_examples

# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(data["train"],
                                                  tokenizer,
                                                  max_length=128,
                                                  task=TASK)

# MNLI expects either validation_matched or validation_mismatched
valid_dataset = glue_convert_examples_to_features(data["validation"],
                                                  tokenizer,
                                                  max_length=128,
                                                  task=TASK)
train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)

# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
if USE_AMP:
    # loss scaling is currently required when using mixed precision
    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
        opt, 'dynamic')
Example #10
def train(argv=None):
    """
    A function that re-trains BERT for sentiment analysis.
    """
    _set_config()

    num_labels = len(glue_processors[FLAGS.task]().get_labels())

    # Load tokenizer and model from pretrained model/vocabulary.
    # Specify the number of labels to classify (2+: classification, 1: regression)
    config = BertConfig.from_pretrained("bert-base-cased",
                                        num_labels=num_labels,
                                        hidden_size=FLAGS.hidden_size)
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', config=config)

    # Load dataset via TensorFlow Datasets
    data, info = tensorflow_datasets.load(f'glue/{_get_tfds_task(FLAGS.task)}', with_info=True)
    train_examples = info.splits['train'].num_examples

    # MNLI expects either validation_matched or validation_mismatched
    valid_examples = info.splits['validation'].num_examples

    # Prepare dataset for GLUE as a tf.data.Dataset instance
    train_dataset = glue_convert_examples_to_features(data['train'],
                                                      tokenizer,
                                                      FLAGS.max_length,
                                                      FLAGS.task)

    # MNLI expects either validation_matched or validation_mismatched
    valid_dataset = glue_convert_examples_to_features(data['validation'],
                                                      tokenizer,
                                                      FLAGS.max_length,
                                                      FLAGS.task)
    train_dataset = train_dataset.shuffle(FLAGS.buffer_size).batch(FLAGS.batch_size).repeat(-1)
    valid_dataset = valid_dataset.batch(FLAGS.batch_size * 2)

    # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
    opt = tf.keras.optimizers.Adam(learning_rate=FLAGS.learning_rate, epsilon=FLAGS.epsilon)
    if FLAGS.use_amp:
        # loss scaling is currently required when using mixed precision
        opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')

    if num_labels == 1:
        loss = tf.keras.losses.MeanSquaredError()
    else:
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    model_path = f'./{_get_tfds_task(FLAGS.task)}/'

    if os.path.exists(model_path + 'tf_model.h5') and not FLAGS.force_train:
        print(f'Model in {model_path} already exists. Skipping training. ' + \
              'If you would like to force a re-train, set the force_train flag.')
        local_vars = locals()
        for variable in local_vars:
            if not variable.startswith('_'):
                print(f'{variable}:\t{local_vars[variable]}')
        return

    model.compile(optimizer=opt, loss=loss, metrics=[metric])

    # Train and evaluate using tf.keras.Model.fit()
    train_steps = train_examples // FLAGS.batch_size
    valid_steps = valid_examples // (FLAGS.batch_size * 2)

    _ = model.fit(train_dataset, epochs=FLAGS.epochs, steps_per_epoch=train_steps,
                  validation_data=valid_dataset, validation_steps=valid_steps)

    # Save TF2 model
    os.makedirs(model_path, exist_ok=True)
    model.save_pretrained(model_path)
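    # Sketch (assumption, not in the original): reload the saved weights and
    # re-evaluate them to confirm the exported tf_model.h5 is usable.
    reloaded = TFBertForSequenceClassification.from_pretrained(model_path)
    reloaded.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=FLAGS.learning_rate),
        loss=loss,
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])
    reloaded.evaluate(valid_dataset, steps=valid_steps)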