Example 1
 def test_TFDistilBertModel(self):
     from transformers import DistilBertConfig, TFDistilBertModel
     keras.backend.clear_session()
     # pretrained_weights = 'distilbert-base-uncased'
     tokenizer_file = 'distilbert_distilbert-base-uncased.pickle'
     tokenizer = self._get_tokenzier(tokenizer_file)
     text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
     config = DistilBertConfig()
     model = TFDistilBertModel(config)
     predictions = model.predict(inputs)
     onnx_model = keras2onnx.convert_keras(model, model.name)
     self.assertTrue(run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx, predictions, self.model_files))
Example 2
 def _create_sentence_transformer(self, input_shape):
   input_ids = tf.keras.Input(shape=input_shape, name='input_ids', dtype=tf.int32)
   attention_mask = tf.keras.Input(shape=input_shape, name='attention_mask', dtype=tf.int32)
   transformer_model = TFDistilBertModel.from_pretrained(self.model_name, config = self.model_config)
   word_embedding_layer = transformer_model([input_ids, attention_mask])[0]
   sentence_embedding_layer = PoolingLayer(pooling_type="mean")([word_embedding_layer, attention_mask])
   return tf.keras.Model([input_ids, attention_mask], sentence_embedding_layer)
Example 3
 def __init__(self, config={}):
     super(DistilBert, self).__init__()
     self.masking = tf.keras.layers.Masking()
     self.fc1 = Dense(config['base_config']['dim'], activation='relu')
     self.model_config = DistilBertConfig.from_dict(config['base_config'])
     self.base = TFDistilBertModel(self.model_config)
     self.head = HEADS[config['head']['name']](config['head'])
Example 4
def build_model():
    """
    This model is built on top of the DistilBERT model from Hugging Face's Transformers library.
    The model has to be compiled before weight loading.
    """
    pretrained_model = TFDistilBertModel.from_pretrained(
        'distilbert-base-uncased', output_attentions=False
    )
    input_ids = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='input_ids_pl'
    )
    attention_mask = tf.keras.layers.Input(
        shape=(None,), dtype=tf.int32, name='attention_mask_pl'
    )
    
    # get the output of the '[CLS]' token from the last layer
    bert_output = pretrained_model(
        {'input_ids': input_ids, 'attention_mask': attention_mask},
        return_dict=True
    )['last_hidden_state'][:, 0]

    pre_classification = tf.keras.layers.Dense(128, activation='tanh')(bert_output)
    dropout_1 = tf.keras.layers.Dropout(0.3)(pre_classification)

    classification_output = tf.keras.layers.Dense(2, activation='softmax')(dropout_1)

    model = tf.keras.models.Model(
        inputs=[input_ids, attention_mask], outputs=classification_output
    )
    return model
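A minimal usage sketch for build_model() above (not part of the original source; the tokenizer choice and toy batch are assumptions): it compiles the model, as the docstring requires before weight loading, and maps a tokenized batch onto the two named Input layers.

import tensorflow as tf
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = build_model()
model.compile(optimizer=tf.keras.optimizers.Adam(2e-5),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Tokenize a toy batch; the two tensors feed the model's two Input layers in order.
encoded = tokenizer(['an example sentence'], padding=True, return_tensors='tf')
probs = model.predict([encoded['input_ids'], encoded['attention_mask']])
# probs has shape (1, 2): softmax scores over the two classes.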
Example 5

def tl_disbert_model(param={}):
  
  trainable = param['Trainable']
  max_seq_len = param['Max_length']
  inputs = Input(shape=(max_seq_len,), dtype='int64', name='inputs')
  masks = Input(shape=(max_seq_len,), dtype='int64', name='masks')

  disBert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
  disBert_model.trainable = trainable

  disBert_output = disBert_model(inputs, attention_mask=masks)
  disBert_last_hidden = disBert_output.last_hidden_state
  disBert_CLS_output = disBert_last_hidden[:, 0, :]
  x = Flatten()(disBert_CLS_output)
  x = LayerNormalization()(x)
  x = Dense(param['first_layer'], activation='relu')(x)
  x = Dropout(param['dropout'])(x)
  x = LayerNormalization()(x)
  x = Dense(param['second_layer'], activation='relu')(x)
  x = Dropout(param['dropout'])(x)

  probs = Dense(3, activation='softmax')(x)

  model = keras.Model(inputs = [inputs, masks], outputs=probs)
  model.summary()

  return model
Example 6
def distilbert_model(input_shape,
                     transformer_model,
                     output_shape=96,
                     output_activation='softmax',
                     optimizer='Adam',
                     optimizer_params={'lr': 1e-5},
                     loss='categorical_crossentropy',
                     metrics=None):

    input_ids = Input((input_shape, ), dtype=tf.int32)
    input_mask = Input((input_shape, ), dtype=tf.int32)

    transformer_encoder = TFDistilBertModel.from_pretrained(
        transformer_model, from_pt=True, output_hidden_states=True)
    outputs = transformer_encoder.distilbert(input_ids,
                                             attention_mask=input_mask)

    x = outputs[0]
    x = GlobalAveragePooling1D()(x)
    output = Dense(output_shape, activation=output_activation)(x)

    model = Model(inputs=[input_ids, input_mask], outputs=output)
    model.compile(loss=loss,
                  metrics=metrics,
                  optimizer=getattr(optimizers, optimizer)(**optimizer_params))

    return model
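A hedged usage sketch for distilbert_model() (assumed call; since from_pt=True converts a PyTorch checkpoint on the fly, a PyTorch installation is also required):

clf = distilbert_model(input_shape=128,
                       transformer_model='distilbert-base-uncased',
                       metrics=['accuracy'])
clf.summary()
# clf expects two int32 tensors of shape (batch, 128): token ids and the attention mask.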
Example 7
def extract_embeddings_for_other_clf():
    distil_bert = "distilbert-base-uncased"

    config = DistilBertConfig(dropout=0.2, attention_dropout=0.2)
    config.output_hidden_states = False
    transformer_model = TFDistilBertModel.from_pretrained(distil_bert,
                                                          config=config)

    input_ids_in = tf.keras.layers.Input(shape=(25, ),
                                         name="input_token",
                                         dtype="int32")
    input_masks_in = tf.keras.layers.Input(shape=(25, ),
                                           name="masked_token",
                                           dtype="int32")

    embedding_layer = transformer_model(input_ids_in,
                                        attention_mask=input_masks_in)[0]
    cls_token = embedding_layer[:, 0, :]
    X = tf.keras.layers.BatchNormalization()(cls_token)
    X = tf.keras.layers.Dense(192, activation="relu")(X)
    X = tf.keras.layers.Dropout(0.2)(X)
    X = tf.keras.layers.Dense(3, activation="softmax")(X)
    model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=X)

    for layer in model.layers[:3]:
        layer.trainable = False

    return model
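A short usage sketch (assumed, not in the original file): tokenize to the fixed length of 25 expected by the Input layers, then train only the classification head, since the first layers are frozen.

import tensorflow as tf
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = extract_embeddings_for_other_clf()
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

enc = tokenizer(["a sample sentence"], padding="max_length", truncation=True,
                max_length=25, return_tensors="tf")
labels = tf.constant([0])  # hypothetical integer labels in {0, 1, 2}
model.fit([enc["input_ids"], enc["attention_mask"]], labels, epochs=1)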
Example 8
def run_distilibert(strategy: tf.distribute.TPUStrategy, x_train: np.ndarray,
                    x_valid: np.ndarray, _y_train: np.ndarray, y_valid: np.ndarray,
                    train_dataset: tf.data.Dataset,
                    valid_dataset: tf.data.Dataset,
                    test_dataset: tf.data.Dataset, max_len: int, epochs: int,
                    batch_size: int) -> tf.keras.models.Model:
    """
    create and run distilbert on training and testing data
    """
    logger.info('build distilbert')

    with strategy.scope():
        transformer_layer = TFDistilBertModel.from_pretrained(MODEL)
        model = build_model(transformer_layer, max_len=max_len)
    model.summary()

    # train given model
    n_steps = x_train.shape[0] // batch_size
    history = model.fit(train_dataset,
                        steps_per_epoch=n_steps,
                        validation_data=valid_dataset,
                        epochs=epochs)
    plot_train_val_loss(history, 'distilbert')

    n_steps = x_valid.shape[0] // batch_size
    _train_history_2 = model.fit(valid_dataset.repeat(),
                                 steps_per_epoch=n_steps,
                                 epochs=epochs * 2)

    scores = model.predict(test_dataset, verbose=1)
    logger.info(f"AUC: {roc_auc(scores, y_valid):.4f}")

    return model
Example 9
    def __init__(
            self,
            pretrained_model_name_or_path='distilbert-base-uncased',
            reduce_output='sum',
            trainable=True,
            num_tokens=None,
            **kwargs
    ):
        super(DistilBERTEncoder, self).__init__()
        try:
            from transformers import TFDistilBertModel
        except ModuleNotFoundError:
            logger.error(
                ' transformers is not installed. '
                'In order to install all text feature dependencies run '
                'pip install ludwig[text]'
            )
            sys.exit(-1)

        self.transformer = TFDistilBertModel.from_pretrained(
            pretrained_model_name_or_path
        )
        self.reduce_output = reduce_output
        self.reduce_sequence = SequenceReducer(reduce_mode=reduce_output)
        self.transformer.trainable = trainable
        self.transformer.resize_token_embeddings(num_tokens)
Example 10
def create_model(model_config: CommentClassifierConfig,
                 saved_weights_path: str = None,
                 max_seq_length: int = MAX_SEQ_LENGTH) -> tf.keras.Model:
    """
    :param model_config:       CommentClassifierConfig
    :param saved_weights_path: If defined, model weights will be loaded
                               from the provided checkpoint path
    :param max_seq_length:     Maximum length of the tokenized input to BERT
    :return:
        Model for text classification using DistilBert transformers
    """
    # Load pre-trained DistilBERT
    bert_config = DistilBertConfig(
        dropout=model_config.bert_dropout,
        attention_dropout=model_config.bert_attention_dropout,
        num_labels=NUM_CLASSES)
    bert_config.output_hidden_states = False
    transformer_model = TFDistilBertModel.from_pretrained(MODEL_NAME,
                                                          config=bert_config)

    input_ids_in = tf.keras.layers.Input(shape=(max_seq_length, ),
                                         name='input_token',
                                         dtype='int32')
    input_masks_in = tf.keras.layers.Input(shape=(max_seq_length, ),
                                           name='masked_token',
                                           dtype='int32')

    embedding_layer = transformer_model(input_ids_in,
                                        attention_mask=input_masks_in)[0]

    x = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(
            model_config.lstm_units,
            return_sequences=True,
            dropout=model_config.lstm_dropout,
            recurrent_dropout=model_config.lstm_recurrent_dropout))(
                embedding_layer)

    x = tf.keras.layers.GlobalMaxPool1D()(x)
    x = tf.keras.layers.Dense(
        model_config.hidden_layer_dim,
        activation=model_config.hidden_layer_activation)(x)

    x = tf.keras.layers.Dropout(model_config.final_layer_dropout)(x)
    x = tf.keras.layers.Dense(
        NUM_CLASSES, activation=model_config.final_layer_activation)(x)

    model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=x)

    # Use transfer learning only - do not train BERT again
    for layer in model.layers[:3]:
        layer.trainable = False

    # Load weights from a checkpoint, but allow partial matching
    # (e.g. due to a change in the optimizer)
    if saved_weights_path is not None:
        model.load_weights(saved_weights_path).expect_partial()

    return model
Example 11
def run_bert_meta_regression_tfmodel():
    """ Run self defined combined model."""
    timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
    log_dir = os.path.join(os.getenv('OUTPUT_DIR'), timestamp)
    model_plot = f'regression_model_{timestamp}.png'

    tokenizer = AutoTokenizer.from_pretrained(os.getenv('MODEL_NAME'))
    config = AutoConfig.from_pretrained(os.getenv('MODEL_NAME'), num_labels=1)
    distilebert_model = TFDistilBertModel.from_pretrained(
        os.getenv('MODEL_NAME'), config=config)

    print(config, tokenizer, sep='\n')
    # tf.keras.utils.plot_model(distilebert_model, to_file=model_plot, show_shapes=True)

    tc = TopCoder()
    encoded_text = tc.get_bert_encoded_txt_features(tokenizer)
    metadata = tc.get_meta_data_features(encoded_tech=True, softmax_tech=True)
    target = tc.get_target()

    split = int((4 / 5) * len(target))
    dataset = tf.data.Dataset.from_tensor_slices(
        (dict(**encoded_text, meta_input=metadata), target))
    dataset = dataset.shuffle(len(target))
    train_ds, test_ds = dataset.take(split).batch(16), dataset.skip(
        split).batch(8)

    print(train_ds, test_ds, sep='\n')
    # for i in train_ds.take(2):
    #     pprint(i)
    # print()
    # for i in test_ds.take(2):
    #     pprint(i)

    # model = TCPMDistilBertRegression.from_pretrained(os.getenv('MODEL_NAME'), config=config)
    tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                    histogram_freq=1)
    model = build_tcpm_model_distilbert_regression(distilebert_model)
    model.summary()
    model.compile(optimizer=tf.keras.optimizers.Adam(2e-6),
                  loss='mse',
                  metrics=['mae', 'mse', mre])
    history = model.fit(
        train_ds,
        epochs=12,
    )
    result = model.evaluate(
        test_ds,
        return_dict=True,
    )

    pprint(result)

    history_df = pd.DataFrame(history.history)
    history_df.to_json(os.path.join(log_dir, 'train_history.json'),
                       orient='index',
                       indent=4)
    with open(os.path.join(log_dir, 'result.json'), 'w') as f:
        json.dump(result, f, indent=4)
Example 12
 def test_TFDistilBertModel(self):
     from transformers import DistilBertTokenizer, TFDistilBertModel
     pretrained_weights = 'distilbert-base-uncased'
     tokenizer = DistilBertTokenizer.from_pretrained(pretrained_weights)
     text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
     model = TFDistilBertModel.from_pretrained(pretrained_weights)
     predictions = model.predict(inputs)
     onnx_model = keras2onnx.convert_keras(model, model.name)
     self.assertTrue(
         run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                          predictions, self.model_files))
Example 13
    def __init__(self, config={}):
        super(DistilBertNorm, self).__init__()
        self.masking = tf.keras.layers.Masking()
        self.fc1 = Dense(config['base_config']['dim'])
        self.norm1 = tf.keras.layers.LayerNormalization()
        self.gelu1 = tf.keras.layers.ReLU()
        self.model_config = DistilBertConfig.from_dict(config['base_config'])
        self.base = TFDistilBertModel(self.model_config)

        self.fc2 = Dense(1024)
        self.norm2 = tf.keras.layers.LayerNormalization()
        self.gelu2 = tf.keras.layers.ReLU()

        self.head = HEADS[config['head']['name']](config['head'])
Example 14
def get_transformer(bert_model_type, output_hidden_states=False):
    config = get_bert_config(bert_model_type, output_hidden_states)
    if bert_model_type in [
            'bert-base-uncased', 'bert-base-cased', 'bert-large-uncased',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        return TFBertModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                           config=config)
    elif bert_model_type in [
            'prod-bert-base-uncased', 'tune_bert-base-uncased_nsp'
    ]:
        return TFBertModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                           config=config,
                                           from_pt=True)
    elif bert_model_type in [
            'roberta-base', 'roberta-large', 'roberta-large-mnli',
            'distilroberta-base'
    ]:
        return TFRobertaModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                              config=config)
    elif bert_model_type in ['prod-roberta-base-cased']:
        return TFRobertaModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                              config=config,
                                              from_pt=True)
    elif bert_model_type in ['xlnet-base-cased']:
        return TFXLNetModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                            config=config)
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1'
    ]:
        return TFAlbertModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                             config=config)
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        return TFGPT2Model.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                           config=config)
    elif bert_model_type in ['transfo-xl']:
        return TFTransfoXLModel.from_pretrained(
            BERT_MODEL_FILE[bert_model_type], config=config)
    elif bert_model_type in [
            'distilbert-base-uncased',
            'distilbert-base-uncased-distilled-squad'
    ]:
        return TFDistilBertModel.from_pretrained(
            BERT_MODEL_FILE[bert_model_type], config=config)
    else:
        raise ValueError(
            f'`bert_model_type` not understood: {bert_model_type}')
Example 15
def model_arch_multitask():
    num_labels = 2
    bert = TFDistilBertModel.from_pretrained("distilbert-base-cased")
    dropout = tf.keras.layers.Dropout(0.4)
    answer_logits = tf.keras.layers.Dense(
        10,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        name="answer_logits",
        activation="softmax")
    classifier = tf.keras.layers.Dense(
        2,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        name="seq_logits")
    input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int32)
    outputs = bert([input_ids, attention_mask])
    answer_output = answer_logits(outputs[0][:, 0, :])
    sequence_output = outputs[0]
    sequence_output = dropout(sequence_output)
    logits = classifier(sequence_output)
    model = tf.keras.models.Model(inputs=[input_ids, attention_mask],
                                  outputs=[logits, answer_output])
    model.compile(loss={'seq_logits': custom_loss_logits,
                        'answer_logits': "categorical_crossentropy"},
                  optimizer=optimizer,
                  loss_weights={"answer_logits": 1.0, "seq_logits": 1.0},
                  metrics=["accuracy"])
    return model
Example 16
def get_transformer(LM: bool):
    if LM:
        if model_name == 'distilbert-base-cased':
            model = TFDistilBertForMaskedLM.from_pretrained(
                'distilbert-base-cased')
        elif model_name == 'huggingface/CodeBERTa-small-v1':
            model = AutoModelWithLMHead.from_pretrained(
                'huggingface/CodeBERTa-small-v1')
            model = pt_to_tf(model, TFRobertaForMaskedLM)
    else:
        if model_name == 'distilbert-base-cased':
            model = TFDistilBertModel.from_pretrained(
                'distilbert-base-cased')
        elif model_name == 'huggingface/CodeBERTa-small-v1':
            model = AutoModel.from_pretrained('huggingface/CodeBERTa-small-v1')
            model = pt_to_tf(model, TFRobertaModel)
    return model
Example 17
def create_model(max_seq_len, classes):
    config = DistilBertConfig(dropout=0.2, attention_dropout=0.2)
    config.output_hidden_states = False
    tfm = TFDistilBertModel.from_pretrained('./MODEL/uncased/', config=config)
    input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
    bert_output = tfm(input_ids)[0]

    cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
    cls_out = keras.layers.Dropout(0.5)(cls_out)
    logits = keras.layers.Dense(units=768, activation="tanh")(cls_out)
    logits = keras.layers.Dropout(0.5)(logits)
    logits = keras.layers.Dense(units=512, activation="tanh")(logits)
    logits = keras.layers.Dropout(0.5)(logits)
    logits = keras.layers.Dense(units=256, activation="tanh")(logits)
    logits = keras.layers.Dropout(0.5)(logits)
    logits = keras.layers.Dense(units=len(classes), activation="softmax")(logits)

    model = keras.Model(inputs=input_ids, outputs=logits)
    model.build(input_shape=(None, max_seq_len))

    return model
Example 18
def model_arch_tok_classification():
    num_labels = 2
    max_len = 128
    bert = TFDistilBertModel.from_pretrained("distilbert-base-cased")
    dropout = tf.keras.layers.Dropout(0.4)
    classifier = tf.keras.layers.Dense(
        2,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
        name="seq_logits")
    question_input_ids = tf.keras.layers.Input(shape = (max_len,), dtype=tf.int32)
    question_attention_mask = tf.keras.layers.Input(shape = (max_len,), dtype=tf.int32)
    question_output = bert([question_input_ids, question_attention_mask])
    question_output = question_output[0][:, 0, :]
    question_output = tf.keras.layers.RepeatVector(max_len)(question_output)
    context_input_ids = tf.keras.layers.Input(shape = (max_len,), dtype=tf.int32)
    context_attention_mask = tf.keras.layers.Input(shape = (max_len,), dtype=tf.int32)
    outputs = bert([context_input_ids, context_attention_mask])
    sequence_output = outputs[0]
    sequence_output = tf.keras.layers.concatenate(
        [sequence_output, question_output], axis=-1)
    sequence_output = dropout(sequence_output)
    logits = classifier(sequence_output)
    model = tf.keras.models.Model(
        inputs=[question_input_ids, question_attention_mask,
                context_input_ids, context_attention_mask],
        outputs=logits)
    model.compile(loss=custom_loss_logits, optimizer=optimizer)
    return model
Example 19
def initialize_hugface_model(hugging_face_model):
    # if hugging_face_model == "xlnet":
    #     tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    #     model = TFXLNetModel.from_pretrained('xlnet-base-cased')
    # elif hugging_face_model == "roberta":
    #     tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    #     model = TFRobertaModel.from_pretrained('roberta-base')
    # elif hugging_face_model == "ernie":
    #     tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-en")
    #     model = TFAutoModel.from_pretrained("nghuyong/ernie-2.0-en")

    #FAST TOKENIZERS
    if hugging_face_model == "distilbert":
        tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
        model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
    elif hugging_face_model == "bert":
        tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
        model = TFBertModel.from_pretrained('bert-base-cased')
    else:
        raise ValueError('Invalid embedding type')
    return tokenizer, model
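A brief usage sketch (assumed, not from the original source) showing how the returned tokenizer/model pair can be used to pull a sentence-level [CLS] embedding from the bare encoder:

tokenizer, model = initialize_hugface_model("distilbert")
enc = tokenizer("hello world", return_tensors="tf")
outputs = model(enc)                                 # TFBaseModelOutput
cls_embedding = outputs.last_hidden_state[:, 0, :]   # (1, 768) [CLS] vector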
Example 20

import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import re
from transformers import TFDistilBertModel, DistilBertTokenizer

model_name = 'distilbert-base-uncased'
pretrained_model = TFDistilBertModel.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

train_path = 'data\\train.csv'
test_path = 'data\\test.csv'
input_column = 'tweet'
label_column = 'label'

# ============= load dataset ===============
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# --- get classes and convert to categorical ---
label_cols = train_df[label_column].unique()

if type(label_cols) == np.ndarray:
    train_df["target"] = train_df[label_column]
    # train_df.pop(label_column)
    num_classes = label_cols.shape[0]
else:
    num_classes = len(label_column)
Example 21
from train import train, trainLiar, trainPoliti
from tensorflow.keras import regularizers, initializers, optimizers, callbacks
from tensorflow.keras.layers import *
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
from transformers import TFDistilBertModel, DistilBertConfig

# DistilBERT
config = DistilBertConfig(dropout=0.2,
                          attention_dropout=0.2,
                          output_hidden_states=True)
dbert_model = TFDistilBertModel.from_pretrained(
    'distilbert-base-uncased', config=config)


# Define model using Keras functional API
def buildModel(seq_length, md_length, sco_length, his_length, n_output1, n_output2):
    input_ids1 = Input(shape=(seq_length, ),
                       dtype=tf.int32, name="input_ids1")
    attention_mask1 = Input(shape=(seq_length, ),
                            dtype=tf.int32, name="attention_mask1")

    input_ids2 = Input(shape=(md_length, ),
                       dtype=tf.int32, name="input_ids2")
    attention_mask2 = Input(shape=(md_length, ),
                            dtype=tf.int32, name="attention_mask2")

    score = Input(shape=(sco_length,), name="score")
    history = Input(shape=(his_length,), name="history")
Example 22
    def feature_extracter_from_texts(self, mashup_api=None):
        """
        Features need to be extracted from the descriptions of both mashups and services;
        this is the whole text feature-extraction pipeline of the right branch.
        If it is to be shared, it should be wrapped as a new model!
        :param mashup_api: None by default; non-None only for 'HDP'/'Bert'
        :return: a wrapped model, so it can be shared by mashup and api
        """
        if self.args.text_extracter_mode in fixed_vector_modes and mashup_api is not None:

            if self.args.text_extracter_mode == 'Bert':
                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
                bertModel = BertModel.from_pretrained("bert-base-uncased")

                if mashup_api == 'mashup':
                    if self.mashup_text_feature_extracter is None:  # not computed yet
                        mashup_texts = get_iterable_values(
                            data_repository.get_md().mashup_df,
                            'final_description',
                            return_ele_type='str')
                        dense_mashup_features = bertModel(
                            tokenizer(mashup_texts, return_tensors='tf'))
                        self.mashup_text_feature_extracter = vector_feature_extracter_from_texts(
                            'mashup', dense_mashup_features)
                    return self.mashup_text_feature_extracter
                elif mashup_api == 'api':
                    if self.api_text_feature_extracter is None:
                        api_texts = get_iterable_values(
                            data_repository.get_md().api_df,
                            'final_description',
                            return_ele_type='str')
                        dense_api_features = bertModel(
                            tokenizer(api_texts, return_tensors='tf'))
                        self.api_text_feature_extracter = vector_feature_extracter_from_texts(
                            'api', dense_api_features)
                    return self.api_text_feature_extracter
                else:
                    raise TypeError('wrong mashup_api mode!')

            else:
                if self.gd is None:
                    self.gd = get_default_gd(
                        tag_times=0, mashup_only=False,
                        strict_train=True)  # process the text with gensim; tags are not added to the text
                    self.gd.model_pcs(self.args.text_extracter_mode)

                if mashup_api == 'mashup':
                    if self.mashup_text_feature_extracter is None:  # not computed yet
                        self.mashup_text_feature_extracter = vector_feature_extracter_from_texts(
                            'mashup', self.gd.dense_mashup_features)
                    return self.mashup_text_feature_extracter
                elif mashup_api == 'api':
                    if self.api_text_feature_extracter is None:
                        self.api_text_feature_extracter = vector_feature_extracter_from_texts(
                            'api', self.gd.dense_api_features)
                    return self.api_text_feature_extracter
                else:
                    raise TypeError('wrong mashup_api mode!')

        elif self.text_feature_extracter is None:  # not computed yet
            if 'trainable_bert' in self.args.text_extracter_mode.lower():
                self.text_feature_extracter = TFDistilBertModel.from_pretrained(
                    "distilbert-base-uncased")  # layer
                if self.args.frozen_bert:
                    self.text_feature_extracter.trainable = False
            else:
                text_input = Input(shape=(self.args.MAX_SEQUENCE_LENGTH, ),
                                   dtype='int32')
                text_embedding_layer = self.get_text_embedding_layer(
                )  # parameters still need to be supplied as external inputs!
                text_embedded_sequences = text_embedding_layer(
                    text_input)  # converts to 2D

                if self.args.text_extracter_mode in (
                        'inception', 'textCNN'):  # 2D to 3D; the third dimension is the channel
                    # print(text_embedded_sequences.shape)
                    text_embedded_sequences = Lambda(
                        lambda x: tf.expand_dims(x, axis=3))(
                            text_embedded_sequences)  # tf and keras tensors are different!!!
                    print(text_embedded_sequences.shape)

                if self.args.text_extracter_mode == 'inception':
                    x = inception_layer(
                        text_embedded_sequences, self.args.embedding_dim,
                        self.args.inception_channels,
                        self.args.inception_pooling)  # inception processing
                    print('built inception layer, done!')
                elif self.args.text_extracter_mode == 'textCNN':
                    x = textCNN_feature_extracter_from_texts(
                        text_embedded_sequences, self.args)
                elif self.args.text_extracter_mode == 'LSTM':
                    x = LSTM_feature_extracter_from_texts(
                        text_embedded_sequences, self.args)
                else:
                    raise TypeError('wrong extracter!')
                print('text feature after inception/textCNN/LSTM whole_model,',
                      x)  # inspect the module's output features before the MLP transformation

                for FC_unit_num in self.args.inception_fc_unit_nums:
                    x = Dense(FC_unit_num,
                              kernel_regularizer=l2(self.args.l2_reg))(
                                  x)  # , activation='relu'
                    if self.args.inception_MLP_BN:
                        x = BatchNormalization(scale=False)(x)
                    x = PReLU()(x)  #
                    if self.args.inception_MLP_dropout:
                        x = tf.keras.layers.Dropout(0.5)(x)
                self.text_feature_extracter = Model(
                    text_input, x, name='text_feature_extracter')
        return self.text_feature_extracter
Example 23
                        0: 1,
                        1: 2,
                        2: 3,
                        3: 4,
                        4: 5
                    },
                    label2id={
                        1: 0,
                        2: 1,
                        3: 2,
                        4: 3,
                        5: 4
                    },
                )

                transformer_model = TFDistilBertModel.from_pretrained(
                    "distilbert-base-uncased", config=config)

                input_ids = tf.keras.layers.Input(shape=(max_seq_length, ),
                                                  name="input_ids",
                                                  dtype="int32")
                input_mask = tf.keras.layers.Input(shape=(max_seq_length, ),
                                                   name="input_mask",
                                                   dtype="int32")

                embedding_layer = transformer_model.distilbert(
                    input_ids, attention_mask=input_mask)[0]
                X = tf.keras.layers.Bidirectional(
                    tf.keras.layers.LSTM(
                        50,
                        return_sequences=True,
                        dropout=0.1,
Example 24
    def build_custom_model(self, validation_data, validation_label):
        # --------------------------------------------------------------------------------
        # Input layer (token indices and attention masks)
        # --------------------------------------------------------------------------------
        input_ids = tf.keras.layers.Input(shape=(self.max_sequence_length, ),
                                          dtype=tf.int32,
                                          name='input_ids')
        attention_mask = tf.keras.layers.Input((self.max_sequence_length, ),
                                               dtype=tf.int32,
                                               name='attention_mask')

        # --------------------------------------------------------------------------------
        # Base layer
        # --------------------------------------------------------------------------------
        # TFBaseModelOutput.last_hidden_state has shape (batch_size, max_sequence_length, 768)
        # Each sequence has [CLS]...[SEP] structure of shape (max_sequence_length, 768)
        # Extract [CLS] embeddings of shape (batch_size, 768) as last_hidden_state[:, 0, :]
        # --------------------------------------------------------------------------------
        base = TFDistilBertModel.from_pretrained(self.model_name, )
        # Freeze the base model weights.
        if self.freeze_pretrained_base_model:
            for layer in base.layers:
                layer.trainable = False

        base.summary()
        output = base([input_ids, attention_mask]).last_hidden_state[:, 0, :]

        # --------------------------------------------------------------------------------
        # TODO:
        #    Need to verify the effect of regularizers.
        #
        #    [bias regularizer]
        #    It looks like bias_regularizer adjusts the ROC threshold towards 0.5.
        #    Without it, the threshold of the ROC with BinaryCrossEntropy loss was approx 0.02.
        #    With    it, the threshold of the ROC with BinaryCrossEntropy loss was approx 0.6.
        # --------------------------------------------------------------------------------
        activation = "sigmoid" if self.num_labels == 1 else "softmax"
        output = tf.keras.layers.Dense(
            units=self.num_labels,
            kernel_initializer='glorot_uniform',
            # https://huggingface.co/transformers/v4.3.3/main_classes/optimizer_schedules.html#adamweightdecay-tensorflow
            # kernel_regularizer=tf.keras.regularizers.l2(l2=self.l2),
            # bias_regularizer=tf.keras.regularizers.l2(l2=self.l2),
            # activity_regularizer=tf.keras.regularizers.l2(l2=self.l2/10.0),
            activation=activation,
            name=activation)(output)

        # --------------------------------------------------------------------------------
        # Loss layer
        # --------------------------------------------------------------------------------
        if self.num_labels == 1:  # Binary classification
            loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=False)
        else:  # Categorical classification
            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=False)

        # --------------------------------------------------------------------------------
        # Model Metrics
        # --------------------------------------------------------------------------------
        if self.num_labels == 1:
            if self.USE_METRIC_AUC:  # ROC/AUC
                # AUC is for binary classification. Error if used for categorical:
                # "ValueError: Shapes (None, <num_classes>) and (None, 1) are incompatible"
                # because AUC expects shape (None, 1) as binary input into the loss fn.
                self._metric_name = "auc"
                self._monitor_metric = f"val_{self._metric_name}"
                self._monitor_mode = 'max'
                self._metrics = [
                    tf.keras.metrics.AUC(from_logits=False,
                                         name=self._metric_name),
                    tf.keras.metrics.Recall(name="recall"), "accuracy"
                ]
                self._callbacks = self.build_custom_model_auc_callbacks(
                    validation_data, validation_label)
            else:
                self._metric_name = "recall"  # Recall
                self._monitor_metric = f"val_{self._metric_name}"
                self._monitor_mode = 'max'
                self._metrics = [
                    tf.keras.metrics.Recall(name=self._metric_name), "accuracy"
                ]
                self._callbacks = self.build_custom_model_acc_callbacks()

        else:  # Validation loss
            self._metric_name = "accuracy"
            self._monitor_metric = "val_loss"
            self._monitor_mode = 'min'
            # metrics=[tf.keras.metrics.Accuracy(name=metric_name)]
            self._metrics = [self._metric_name]
            self._callbacks = self.build_custom_model_acc_callbacks()

        # --------------------------------------------------------------------------------
        # Build model
        # --------------------------------------------------------------------------------
        # TODO: Replace TIMESTAMP with instance variable
        name = f"{TIMESTAMP}_{self.model_name.upper()}"
        self._model = tf.keras.models.Model(inputs=[input_ids, attention_mask],
                                            outputs=output,
                                            name=name)
        self.model.compile(
            # https://huggingface.co/transformers/v4.3.3/main_classes/optimizer_schedules.html#adamweightdecay-tensorflow
            # optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
            optimizer=transformers.AdamWeightDecay(
                learning_rate=self.learning_rate),
            loss=loss_fn,
            metrics=self._metrics)

        # --------------------------------------------------------------------------------
        # Load model parameters if the saved weight file exits
        # --------------------------------------------------------------------------------
        path_to_h5 = self.model_directory + os.path.sep + "model.h5"
        if os.path.isfile(path_to_h5) and os.access(path_to_h5, os.R_OK):
            print(
                f"\nloading the saved model parameters from {path_to_h5}...\n")
            self.model.load_weights(path_to_h5)
Example 25

dataset_test=tf.data.Dataset.from_tensor_slices((Xids_test,Xmask_test))

def map_func(input_ids,mask):
    return {'input_ids':input_ids,'attention_mask':mask}

dataset_test=dataset_test.map(map_func)
dataset_test=dataset_test.batch(32).prefetch(1000)


#Build the model
from transformers import TFDistilBertModel, DistilBertConfig
distil_bert = 'distilbert-base-uncased'

config = DistilBertConfig(dropout=0.2, attention_dropout=0.2)
config.output_hidden_states = False
transformer_model = TFDistilBertModel.from_pretrained(distil_bert, config = config)

input_ids_in = tf.keras.layers.Input(shape=(SEQ_length,), name='input_ids', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(SEQ_length,), name='attention_mask', dtype='int32') 

embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(50, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(1, activation='sigmoid')(X)
model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs = X)

for layer in model.layers[:3]:
  layer.trainable = False
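A hedged continuation sketch (assumed): with the first three layers frozen above, compile the head and score the already-batched dataset_test built at the top of this example.

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
test_probs = model.predict(dataset_test, verbose=1)  # (n_samples, 1) sigmoid scores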
Example 26
data = pd.read_csv('/content/drive/My Drive/data/train_E6oV3lV.csv').sample(frac=0.3)
X_train, X_test, y_train, y_test = train_test_split(data.tweet, data.label)

model = Pipeline([
    ('vect', TfidfVectorizer(
        stop_words='english',
        # ngram_range=(1, 3)
        )),
    ('clf', SGDClassifier()),
])

model.fit(X_train, y_train)
print('tfidf f1:', f1_score(y_test, model.predict(X_test), average='binary'))


model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased',
                                          # pad_to_max_length=True,
                                          # max_length=100000
                                          )
pipe = pipeline('feature-extraction', model=model,
                tokenizer=tokenizer)
features = pipe(X_train.to_list(),
                # pad_to_max_length=True
                )
print()


# model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')
# tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
# pipe = pipeline('feature-extraction', model=model,
Example 27
        (x_valid, y_valid)).batch(BATCH_SIZE).cache().prefetch(AUTO))

    # test_dataset = (
    #     tf.data.Dataset
    #     .from_tensor_slices(x_test)
    #    .batch(BATCH_SIZE)
    # )

    ###################################################################
    # LOAD MODEL
    ###################################################################
    print("loading model ...")

    with strategy.scope():
        #transformer_layer = TFAutoModel.from_pretrained(MODEL)
        transformer_layer = TFDistilBertModel.from_pretrained(MODEL)
        model = HelperFns.build_model(transformer_layer, max_len=MAX_LEN)
    model.summary()

    ###################################################################
    # TRAINING
    ###################################################################
    print("run training ...")

    n_steps = x_train.shape[0] // BATCH_SIZE

    train_history = model.fit(train_dataset,
                              steps_per_epoch=n_steps,
                              validation_data=valid_dataset,
                              epochs=EPOCHS)