Example #1
def load_transformer(model_type):
    if model_type == "distilbert":
        tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-uncased')
        model = TFDistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=1)
    elif model_type == "bert_x12":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = TFBertForSequenceClassification.from_pretrained(
            "bert-base-uncased", num_labels=1)
    elif model_type == "bert_x24":
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        model = TFBertForSequenceClassification.from_pretrained(
            "bert-large-uncased", num_labels=1)
    elif model_type == "albert_v2_x12":
        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
        model = TFAlbertForSequenceClassification.from_pretrained(
            "albert-base-v2", num_labels=1)
    elif model_type == "longformer_x12":
        tokenizer = LongformerTokenizer.from_pretrained(
            'allenai/longformer-base-4096')
        model = TFLongformerForSequenceClassification.from_pretrained(
            "allenai/longformer-base-4096", num_labels=1)
    elif model_type == "longformer_x24":
        tokenizer = LongformerTokenizer.from_pretrained(
            'allenai/longformer-large-4096')
        model = TFLongformerForSequenceClassification.from_pretrained(
            "allenai/longformer-large-4096", num_labels=1)
    else:
        raise ValueError(model_type + " was invalid")

    return model, tokenizer
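The excerpt above omits its imports; below is a minimal, hedged usage sketch assuming the standard transformers class names used inside the function.

# Hedged usage sketch for load_transformer(); the imports below are assumed,
# not part of the original excerpt.
from transformers import (
    DistilBertTokenizer, TFDistilBertForSequenceClassification,
    BertTokenizer, TFBertForSequenceClassification,
    AlbertTokenizer, TFAlbertForSequenceClassification,
    LongformerTokenizer, TFLongformerForSequenceClassification)

model, tokenizer = load_transformer("distilbert")
inputs = tokenizer("an example sentence", return_tensors="tf")
outputs = model(inputs)  # num_labels=1, so the single logit acts as a regression head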
Example #2
 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     self.bert_layer = TFBertForSequenceClassification.from_pretrained(
         'bert-base-cased')
     self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
     self.cashed_train_dataset = None
     self.cashed_val_dataset = None
Example #3
    def load_model_from_zip(self, zip_archive):
        """a function that loads components of a serialized model from a zip
        given zip file using the python ZipFile interface and returns an
        instance of the model

        Arguments:

        zip_archive: ZipFile
            an instance of the python ZipFile interface that has loaded
            the file path specified by self.resource.disk_target

        Returns:

        model: Any
            any format of machine learning model that will be stored
            in the self.model attribute for later use

        """

        # read the bytes of the nested transformer archive from the zip file
        with zip_archive.open('transformer.zip', "r") as file:
            model_bytes = file.read()  # bytes of a zipped hugging face save directory

        # load the bytes of the hugging face save path from a zip
        with tempfile.TemporaryDirectory() as directory:
            with tempfile.NamedTemporaryFile(suffix=".zip") as archive:
                archive.write(model_bytes)
                shutil.unpack_archive(archive.name, directory)
                return TFBert.from_pretrained(directory)
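A hedged usage sketch for the loader above (the attribute `resource.disk_target` comes from the docstring; `loader` stands in for an instance of the surrounding class, which is assumed, and the method itself also relies on the tempfile and shutil modules).

# Hypothetical usage of load_model_from_zip(); `loader` is an assumed
# instance of the class the method belongs to.
from zipfile import ZipFile

with ZipFile(loader.resource.disk_target, "r") as zip_archive:
    loader.model = loader.load_model_from_zip(zip_archive)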
Example #4
def carga_modelo_BERT(model_path):
    """Loads the pretrained BERT model located at the path `model_path`"""
    # Parameters from the script used by HuggingFace for sentiment analysis on another dataset
    USE_XLA = False
    USE_AMP = False
    #TASK = "sst-2"
    #TFDS_TASK = "sst2"
    num_labels = 2
    tf.config.optimizer.set_jit(USE_XLA)
    tf.config.optimizer.set_experimental_options(
        {"auto_mixed_precision": USE_AMP})

    # Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression)
    config = BertConfig.from_pretrained("bert-base-cased",
                                        num_labels=num_labels)
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    model = TFBertForSequenceClassification.from_pretrained("bert-base-cased",
                                                            config=config)

    opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
    model.compile(optimizer=opt, loss=loss, metrics=[metric])
    model.load_weights(model_path)

    return model, tokenizer, config
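A short, hedged usage sketch of the loader above; the weights path and the sample sentence are illustrative only.

# Hypothetical usage of carga_modelo_BERT(); the weights path is illustrative.
import tensorflow as tf

model, tokenizer, config = carga_modelo_BERT("sentiment_weights.h5")
batch = tokenizer(["this film was wonderful"], padding=True,
                  truncation=True, return_tensors="tf")
outputs = model(batch)
probs = tf.nn.softmax(outputs[0], axis=-1)  # probabilities for the 2 labels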
Example #5
    def _load_remote_model(self, model_name, tokenizer_kwargs, model_kwargs):
        if model_name not in ModelsByFamily.Supported:
            raise ValueError(f'Model {model_name} not supported.')

        do_lower_case = False
        if 'uncased' in model_name.lower():
            do_lower_case = True
        tokenizer_kwargs.update({'do_lower_case': do_lower_case})

        self._tokenizer = None
        self._model = None

        if model_name in ModelsByFamily.Bert:
            self._tokenizer = BertTokenizer.from_pretrained(
                model_name, **tokenizer_kwargs)
            self._model = TFBertForSequenceClassification.from_pretrained(
                model_name, **model_kwargs)
        elif model_name in ModelsByFamily.Roberta:
            self._tokenizer = RobertaTokenizer.from_pretrained(
                model_name, **tokenizer_kwargs)
            self._model = TFRobertaForSequenceClassification.from_pretrained(
                model_name, **model_kwargs)
        elif model_name in ModelsByFamily.XLNet:
            self._tokenizer = XLNetTokenizer.from_pretrained(
                model_name, **tokenizer_kwargs)
            self._model = TFXLNetForSequenceClassification.from_pretrained(
                model_name, **model_kwargs)
        elif model_name in ModelsByFamily.DistilBert:
            self._tokenizer = DistilBertTokenizer.from_pretrained(
                model_name, **tokenizer_kwargs)
            self._model = TFDistilBertForSequenceClassification.from_pretrained(
                model_name, **model_kwargs)

        assert self._tokenizer and self._model
Example #6
 def build_model(self,
                 max_length,
                 train_batch_size,
                 learning_rate,
                 epochs,
                 num_labels,
                 tagset=None,
                 gpu_growth=True,
                 eval_batch_size=32):
     #if gpu_growth:
     #    model_utils.set_tf_memory_growth()
     if self.task == "pos":
         self.model = TFBertForTokenClassification.from_pretrained(
             self.model_name, num_labels=num_labels, from_pt=True)
         self.tokenizer = MBERT_Tokenizer_pos.from_pretrained(
             self.model_name, do_lower_case=False)
     else:
         self.model = TFBertForSequenceClassification.from_pretrained(
             self.model_name, num_labels=num_labels, from_pt=True)
         self.tokenizer = BertTokenizer.from_pretrained(self.model_name,
                                                        do_lower_case=False)
     #self.model, self.tokenizer = model_utils.create_model(self.short_model_name, self.task, num_labels)
     self.model = model_utils.compile_model(self.model, self.task,
                                            learning_rate)
     print("Successfully built", self.model_name)
     self.max_length = max_length
     self.train_batch_size = train_batch_size
     self.learning_rate = learning_rate
     self.epochs = epochs
     self.num_labels = num_labels
     if tagset:
         self.tagset = tagset
         self.label_map = {label: i for i, label in enumerate(tagset)}
     self.eval_batch_size = eval_batch_size
Example #7
    def build(self, **kwargs):
        optimizer = kwargs.get("optimizer", "adam")
        metrics = kwargs.get("metrics", ['accuracy'])
        dropout_rate = kwargs.get('dropout_rate', 0.5)

        ## BUILDING THE GRAPH
        input_ids = tf.keras.layers.Input(shape=(1,50), name='input_ids', dtype=tf.int32)
        input_mask = tf.keras.layers.Input(shape=(1,50), name='input_mask', dtype=tf.int32)
        #bert_layer = Lambda_Bert_Layer(trainable=False, dynamic=True)
        bert_layer = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
        bert_layer.bert.trainable=False
        bert_output = bert_layer(input_ids[:,0,:], attention_mask=input_mask[:,0,:])
        """
        bert_layer.trainable = False
        bert_output=bert_output[0][:,-1,:]
        last_state = tf.reshape(bert_output, shape=(-1,768))
        dense_out_1 = tf.keras.layers.Dense(units=768, activation="relu")(last_state)  # reshape_lambda_layer
        dense_out_1 = tf.keras.layers.Dropout(dropout_rate)(dense_out_1)
        dense_out_2 = tf.keras.layers.Dense(units=200, activation="relu")(dense_out_1)
        dense_out_2 = tf.keras.layers.Dropout(dropout_rate)(dense_out_2)
        logits = tf.keras.layers.Dense(units=2, activation='softmax')(dense_out_2)
        """
        logits = bert_output[0]

        self.model = tf.keras.Model(inputs=(input_ids,input_mask), outputs=logits)
        self.model.compile(optimizer=optimizer,
                           loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                           metrics=metrics,
                           run_eagerly=True)
        self.model.summary()
Example #8
    def train(self,
              train_data,
              train_labels,
              dev_data,
              dev_labels,
              save_model_path=f"models/bert.pkl"):
        ds_train_encoded = self.encode_examples(
            train_data, train_labels).batch(self.batch_size)
        ds_dev_encoded = self.encode_examples(dev_data, dev_labels).batch(
            self.batch_size)

        self.model = TFBertForSequenceClassification.from_pretrained(
            'bert-base-uncased')
        optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate,
                                             epsilon=1e-08)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
        self.model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

        self.model.fit(ds_train_encoded,
                       epochs=self.epochs,
                       validation_data=ds_dev_encoded)

        predictions = self.model.predict(ds_dev_encoded, verbose=1).logits
        self.model.save_weights(save_model_path)
        print('Validation Loss:', log_loss(dev_labels, predictions))
Example #9
def train_model(request):
    if request.method == "POST":
        inputed_batch_size = int(request.POST['batch_size'])
        model = TFBertForSequenceClassification.from_pretrained(
            "bert-base-uncased")
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

        #IMDB movie reviews
        dataset = get_dataset()
        # save the dataset to the filesystem
        clean_dataset(dataset)

        #training and testing datasets here
        train = tf.keras.preprocessing.text_dataset_from_directory(
            'aclImdb/train',
            batch_size=inputed_batch_size,
            validation_split=0.2,
            subset='training',
            seed=123)

        test = tf.keras.preprocessing.text_dataset_from_directory(
            'aclImdb/train',
            batch_size=inputed_batch_size,
            validation_split=0.2,
            subset='validation',
            seed=123)

        #convert to pandas dataframes
        train = convert_dataset_to_dataframe(train)
        test = convert_dataset_to_dataframe(test)

        #convert data to tf datasets
        train_InputExamples, validation_InputExamples = convert_data_to_examples(
            train, test, DATA_COLUMN, LABEL_COLUMN)

        train_data = convert_examples_to_tf_dataset(list(train_InputExamples),
                                                    tokenizer)
        train_data = train_data.shuffle(100).batch(32).repeat(2)

        validation_data = convert_examples_to_tf_dataset(
            list(validation_InputExamples), tokenizer)
        validation_data = validation_data.batch(32)

        #fine tune it!
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5,
                                               epsilon=1e-08,
                                               clipnorm=1.0),
            loss=tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True),
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

        model.fit(train_data, epochs=2, validation_data=validation_data)

        # save the model to disk to be used later
        model.save_pretrained(MOOD_MODEL_DIR)

    context = {}
    return render(request, 'mood_classifier/train_model_button.html', context)
Example #10
    def __init__(self):

        model_name = 'bert-base-cased'
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

        self.model = TFBertForSequenceClassification.from_pretrained(
            'C:/Users/lbbre/Documents/ECAM 5/PRD/content/assets')
        self.max_seq_len = 32
Example #11
    def load(self, path):
        """Loads model from path"""
        self.strategy = self._get_distributed_strategy()

        with self.strategy.scope():
            self.initialise_models()
            self.model = TFBertForSequenceClassification.from_pretrained(path)
        self.trained_ = True
Example #12
 def model_build(self):
     self.bertConfig = BertConfig.from_pretrained(
         os.path.join(self.pretrain_path, "config.json"),
         num_labels=self.num_classes)
     self.model = TFBertForSequenceClassification.from_pretrained(
         os.path.join(self.pretrain_path, "tf_model.h5"),
         config=self.bertConfig)
     self.model.summary()
Example #13
    def build(self):

        # Handle the Meta Data
        ids = tf.keras.Input((self.config.max_length, ),
                             dtype=tf.int32,
                             name='input_ids')
        vn = tf.keras.Input((1, ), dtype=tf.float32, name='version_number')
        pl = tf.keras.Input((1, ), dtype=tf.float32, name='partisan_lean')
        cat = tf.keras.Input((self.config.n_sc_id_classes, ),
                             dtype=tf.float32,
                             name='sc_id')
        meta = tf.concat([vn, pl, cat], axis=-1)

        # Load the initial weights with the ones trained from the DL model without text
        if self.config.load_weights_from_no_text:
            #if 'no_text_dense_layer_initialization_path' in self.config:
            print(
                "Usinging pretrained weights from the no_text model! --------------------"
            )
            model_location = self.config.data_vol + "models/no_text/full_model.h5"
            ntdl = tf.keras.layers.Dense(
                self.config.n_dense_layers,
                activation='relu',
                name="no_text_dense_layer",
                kernel_initializer=noTextKernelInitializer(
                    model_location=model_location),
                bias_initializer=noTextBiasInitializer(
                    model_location=model_location))
        else:
            ntdl = tf.keras.layers.Dense(self.config.n_dense_layers,
                                         activation='relu',
                                         name="no_text_dense_layer")
        meta = ntdl(meta)

        # Handle the Transformer
        self.base_transformer_model = TFBertForSequenceClassification.from_pretrained(
            "bert-base-uncased")
        x = self.base_transformer_model.bert(ids)  # Get the main Layer
        x = x['last_hidden_state'][:, 0, :]
        x = tf.keras.layers.Dropout(0.2)(x)

        # Combine the two and run through another dense layer.
        x = tf.concat([x, meta], axis=-1)
        x = tf.keras.layers.Dense(self.config.n_dense_layers,
                                  activation='relu')(x)
        x = tf.keras.layers.Dropout(0.2)(x)
        x = tf.keras.layers.Dense(1, activation='sigmoid')(x)
        dl_model = tf.keras.Model(inputs={
            "input_ids": ids,
            "version_number": vn,
            "partisan_lean": pl,
            "sc_id": cat
        },
                                  outputs=[x])

        self.deep_legis_model = dl_model
Example #14
 def create_model_2(self):
     model = TFBertForSequenceClassification.from_pretrained(
         'bert-base-uncased')
     model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5,
                                                      epsilon=1e-08),
                   loss=tf.keras.losses.SparseCategoricalCrossentropy(
                       from_logits=True),
                   metrics=["acc"])
     model.summary()
     return model
Example #15
    def __init__(self,
                 n_intents=None,
                 dropout=0.2,
                 model_name="bert-base-uncased"):
        super().__init__(name="intent_classifier")

        self.tokenizer = Tokenizer()
        self.bert = TFBertForSequenceClassification.from_pretrained(model_name)
        self.dropout = Dropout(dropout)
        self.intent_classifier = Dense(n_intents, activation='softmax')
Example #16
def model_compile():
    model = TFBertForSequenceClassification.from_pretrained(
        'bert-base-uncased', num_labels=2)
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = keras.metrics.SparseCategoricalAccuracy('accuracy')
    optimizer = keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)

    model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

    return model
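A hedged sketch of how a model compiled this way might be fine-tuned; the tokenizer and the toy texts/labels are assumptions, not part of the original example.

# Hypothetical fine-tuning sketch for model_compile(); texts and labels are toy data.
import tensorflow as tf
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = model_compile()

enc = tokenizer(["a great movie", "a terrible movie"],
                padding=True, truncation=True, return_tensors="tf")
labels = [1, 0]
ds = tf.data.Dataset.from_tensor_slices((dict(enc), labels)).batch(2)
model.fit(ds, epochs=1)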
Example #17
    def __init__(self, extractor, config, *args, **kwargs):
        super(TFVanillaBert_Class, self).__init__(*args, **kwargs)
        self.extractor = extractor

        # TFBertForSequenceClassification contains both the BERT and the linear classifier layers
        self.bert = TFBertForSequenceClassification.from_pretrained(
            config["pretrained"], hidden_dropout_prob=0.1)

        assert extractor.config[
            "numpassages"] == 1, "numpassages should be 1 for TFVanillaBERT"
        self.config = config
Example #18
def build_model():
    import tensorflow as tf
    from transformers import TFBertForSequenceClassification
    model = TFBertForSequenceClassification.from_pretrained(
        "bert-base-uncased")
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        reduction=tf.keras.losses.Reduction.NONE, from_logits=True)
    opt = tf.keras.optimizers.Adam(learning_rate=3e-5)

    model.compile(optimizer=opt, loss=loss_fn, metrics=['accuracy'])
    return model
Example #19
    def _init_model(self, num_labels=2):
        config = {"name": self.pretrained, "from_pt": self.from_pt}
        if self.pretrained in PRETRAINED_CONFIG:
            config = PRETRAINED_CONFIG[self.pretrained]

        pretrained = config["name"]
        from_pt = config["from_pt"]

        self.tokenizer = BertTokenizer.from_pretrained(pretrained)
        self.model = TFBertForSequenceClassification.from_pretrained(
            pretrained, from_pt=from_pt, num_labels=num_labels)
Example #20
def fine_tune_model(ds, export_dir):
    (train_dataset, test_dataset, val_dataset) = get_test_train_val_datasets(ds)
    learning_rate = 2e-5
    number_of_epochs = 1
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
    bert_history = model.fit(train_dataset, epochs=number_of_epochs, validation_data=val_dataset)
    model.save_pretrained(export_dir)
    return model
Example #21
 def test_TFBertForSequenceClassification(self):
     from transformers import BertTokenizer, TFBertForSequenceClassification
     pretrained_weights = 'bert-base-uncased'
     tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
     text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
     model = TFBertForSequenceClassification.from_pretrained(
         pretrained_weights)
     predictions = model.predict(inputs)
     onnx_model = keras2onnx.convert_keras(model, model.name)
     self.assertTrue(
         run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                          predictions, self.model_files))
Example #22
def run_first_test():
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
    input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
    print(f"input_ids:{input_ids}")
    outputs = model(input_ids)
    logits = outputs[0]
    print(logits)
    print("-" * 30)
    print(f"outputs:{outputs}")
    print("-" * 30)
    model.summary()
Example #23
    def get_model(self):
        """
        Build the BERT model using the original Transformers way of constructing it.

        This approach does not allow custom inputs and outputs, so it cannot be
        adapted to the deepnlp project worker without changing the deepnlp project
        parameters.
        """
        model = TFBertForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=self.model_name,
            from_pt=self.from_pt_word in self.model_name,
            num_labels=len(self.label_list))

        return model
Example #24
    def initialise_models(self):
        if self.pretrained == "bert":
            model_name = "bert-base-cased"
            from_pt = False
        elif self.pretrained == "scibert":
            model_name = "allenai/scibert_scivocab_cased"
            from_pt = True

        self.config = BertConfig.from_pretrained(model_name, num_labels=2)
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = TFBertForSequenceClassification.from_pretrained(
            model_name, config=self.config, from_pt=from_pt)
        return self.model
Example #25
def bert_result(sentence):
    model = TFBertForSequenceClassification.from_pretrained(
        "bert-base-uncased")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model.load_weights("senti_weights.ckpt")
    tf_batch = tokenizer(sentence,
                         max_length=128,
                         padding=True,
                         truncation=True,
                         return_tensors='tf')
    tf_outputs = model(tf_batch)
    tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
    label = tf_predictions.numpy()
    return int(label[0][1] * 100)
Example #26
    def __init__(self, weights_path):
        """
        :param weights_path: specifies where to load/store the weights of the model
        :type weights_path: str
        """
        super().__init__(weights_path)

        # A tensorflow model of Bert large (uncased), pre-trained.
        # More on it in our report.
        self.__model = TFBertForSequenceClassification.from_pretrained(
            'bert-large-uncased')

        # Instantiating a proper tokenizer for Bert
        self.__tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
Example #27
    def __init__(self,
                 model_path="./modelSentiment",
                 tokenizer="bert-base-uncased"):
        """
        Initialize model

        Downloads the model if not present
        """
        if os.path.exists(model_path):
            self.model = TFBertForSequenceClassification.from_pretrained(
                model_path)
        else:
            print("Downloading model...")
            Model.download_file_from_google_drive(
                "1uthnEb7WYnIR6y0VVX4gMPoqG-X0oKRK", "Modelfile.zip")

            with zipfile.ZipFile("Modelfile.zip", "r") as zip_ref:
                zip_ref.extractall("./")

            self.model = TFBertForSequenceClassification.from_pretrained(
                model_path)

        self.tokenizer = BertTokenizer.from_pretrained(tokenizer)
Example #28
def evaluate(dataset, limit_num_sents: bool):
    # Split and tokenize dataset
    split = Split_BERT()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    X_train, y_train = split.get_X_y(dataset['train'] + dataset['oos_train'], limit_num_sents=limit_num_sents,
                                     set_type='train')
    X_val, y_val = split.get_X_y(dataset['val'] + dataset['oos_val'], limit_num_sents=limit_num_sents, set_type='val')
    X_test, y_test = split.get_X_y(dataset['test'] + dataset['oos_test'], limit_num_sents=limit_num_sents,
                                   set_type='test')

    train_ids, train_attention_masks, train_labels = tokenize_BERT(X_train, y_train, tokenizer)
    val_ids, val_attention_masks, val_labels = tokenize_BERT(X_val, y_val, tokenizer)
    test_ids, test_attention_masks, test_labels = tokenize_BERT(X_test, y_test, tokenizer)

    num_labels = len(split.intents_dct.keys())

    # Train model
    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                            num_labels=num_labels)  # we have to adjust the number of labels
    print('\nBert Model')
    model.summary()

    log_dir = 'tensorboard_data/tb_bert'
    model_save_path = './models/bert_model.h5'

    callbacks = [
        tf.keras.callbacks.ModelCheckpoint(filepath=model_save_path, save_weights_only=True, monitor='val_loss',
                                           mode='min',
                                           save_best_only=True), tf.keras.callbacks.TensorBoard(log_dir=log_dir)]

    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    optimizer = tf.keras.optimizers.Adam(learning_rate=4e-5)

    model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

    history = model.fit([train_ids, train_attention_masks],
                        train_labels,
                        batch_size=32,
                        epochs=5,
                        validation_data=([val_ids, val_attention_masks], val_labels),
                        callbacks=callbacks)

    # Test
    testing = Testing(model, {'test_ids': test_ids, 'test_attention_masks': test_attention_masks}, test_labels,
                      'bert', split.intents_dct['oos'])
    results_dct = testing.test_train()

    return results_dct
Example #29
    def __init__(self):
        self.modified = False
        # load models
        self.classifier_tokenizer = BertTokenizer.from_pretrained(
            CLASSIFIER_PATH)
        self.classifier_config = AutoConfig.from_pretrained(CLASSIFIER_PATH)
        self.classifier_model = TFBertForSequenceClassification.from_pretrained(
            CLASSIFIER_PATH)

        self.ner_tokenizer = AutoTokenizer.from_pretrained(PARSBERTNER_PATH)
        self.ner_config = AutoConfig.from_pretrained(PARSBERTNER_PATH)
        self.ner_model = TFAutoModelForTokenClassification.from_pretrained(
            PARSBERTNER_PATH)
        self.weather_api = Weather()
        self.adhan_api = Adhan()
        self.time_api = Time()
        self.calender_api = Calender()
Example #30
    def fit(self,
            messages,
            y,
            epochs=2,
            validation_percent=0.15,
            allow_import=True):
        if self.english:
            messages = (messages['Translation'].values)
        else:
            messages = (messages['Message'].values)

        y = self.Lab_Encoder.fit_transform(
            y.to_numpy().astype(str)).astype(float)

        # preparing the data:
        # split in train and validation

        if os.path.exists(self.path) and allow_import:
            print("Loading Pretrained Model")
            self.model = bert_class.from_pretrained(self.path)
            return (self)
        else:

            X_train, X_val, y_train, y_val = train_test_split(
                messages, y, test_size=validation_percent, random_state=0)

            # tokenizing the data and making tf.dataset
            X_train_input = self.convert_to_input(X_train.astype(str))
            X_val_input = self.convert_to_input(X_val.astype(str))

            train_ds = tf.data.Dataset.from_tensor_slices(
                (X_train_input[0], X_train_input[1], X_train_input[2],
                 y_train)).map(
                     self.example_to_features).shuffle(100).batch(12).repeat(5)
            val_ds = tf.data.Dataset.from_tensor_slices(
                (X_val_input[0], X_val_input[1], X_val_input[2],
                 y_val)).map(self.example_to_features).batch(12)

            self.model.fit(train_ds,
                           epochs=epochs,
                           validation_data=val_ds,
                           verbose=1)
            os.mkdir(self.path)
            self.model.save_pretrained(self.path)
            return self