Code Example #1
    def test_get_max_vector_len(self):
        vectors = [[0, 0, 0], [0]]

        result = Helpers.get_max_vector_len(vectors)

        expected = 3

        self.assertEqual(result, expected)
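
The helper under test is not included in the listing. A minimal sketch consistent with this test (return the length of the longest inner vector) could look like the following; the body is an assumption, not the project's actual code.

# Hypothetical sketch of Helpers.get_max_vector_len: assumed to return
# the length of the longest vector in the list.
def get_max_vector_len(vectors):
    return max(len(v) for v in vectors)  # raises ValueError on an empty list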
Code Example #2
        # Drop stale references so gc.collect() can reclaim the previous
        # TF Hub layer and tokenizer before a fresh BERT layer is loaded.
        vocab_file = None
        do_lower_case = None
        tokenizer = None

        gc.collect()

        bert_layer = hub.KerasLayer(
            f'https://tfhub.dev/tensorflow/{MODEL["BERT"]}/{MODEL["BERT_VERSION"]}',
            trainable=True)
        vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
        do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
        tokenizer = FullTokenizer(vocab_file, do_lower_case)

        gc.collect()

        x_train, INPUT_LENGTH = Helpers.get_bert_input(x_train, tokenizer)
        x_val = Helpers.get_bert_input(x_val,
                                       tokenizer,
                                       input_length=INPUT_LENGTH)
        x_test = Helpers.get_bert_input(x_test,
                                        tokenizer,
                                        input_length=INPUT_LENGTH)

        MODEL['INPUT_LENGTH'] = INPUT_LENGTH
        test_data_callback = None
        gc.collect()

        test_data_callback = TestDataCallback(x_test=x_test, y_test=y_test)

        model = None
        gc.collect()
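
TestDataCallback comes from the project and is not shown in the listing. A plausible minimal sketch, assuming it simply evaluates the model on the test split after every epoch (the class name and constructor arguments are taken from the call above; the body is an assumption):

from tensorflow.keras.callbacks import Callback

class TestDataCallback(Callback):
    # Hypothetical sketch: record test-set loss/accuracy after each epoch.
    # Assumes the model is compiled with a single accuracy metric.
    def __init__(self, x_test, y_test):
        super().__init__()
        self.x_test = x_test
        self.y_test = y_test
        self.loss = []
        self.accuracy = []

    def on_epoch_end(self, epoch, logs=None):
        loss, accuracy = self.model.evaluate(self.x_test, self.y_test, verbose=0)
        self.loss.append(loss)
        self.accuracy.append(accuracy)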
Code Example #3
File: app_keras.py    Project: mplakhtiy/real-or-not
    test_data['preprocessed'] = tweets_preprocessor.preprocess(
        test_data.text,
        PREPROCESSING_ALGORITHM,
        keywords=test_data.keyword,
        locations=test_data.location)

    train_inputs, val_inputs, train_targets, val_targets = train_test_split(
        train_data['preprocessed'],
        train_data['target'],
        test_size=0.3,
        random_state=SEED)

    keras_tokenizer = Tokenizer()

    (x_train, x_val, x_test), input_dim, input_len = Helpers.get_model_inputs(
        (train_inputs, val_inputs, test_data.preprocessed), keras_tokenizer)
    y_train = train_targets
    y_val = val_targets
    y_test = test_data.target.values

    MODEL['EMBEDDING_OPTIONS']['input_dim'] = input_dim
    MODEL['EMBEDDING_OPTIONS']['input_length'] = input_len

    if USE_GLOVE:
        Helpers.with_glove_embedding_options(MODEL, keras_tokenizer,
                                             GLOVE_EMBEDDINGS)

    model = Keras.get_model(MODEL)

    history = Keras.fit(model,
                        (x_train, y_train, x_val, y_val, x_test, y_test))
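
Helpers.get_model_inputs is not reproduced in the listing. Judging from how its return values are used above (vocabulary size for input_dim, padded sequence length for input_length), a sketch along these lines would fit; the exact behavior is an assumption:

from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_model_inputs(texts, tokenizer):
    # Hypothetical sketch: fit the Keras tokenizer on the training split,
    # then turn every split into padded integer sequences of one common length.
    train_texts, val_texts, test_texts = texts
    tokenizer.fit_on_texts(train_texts)

    splits = [tokenizer.texts_to_sequences(t)
              for t in (train_texts, val_texts, test_texts)]
    input_len = max(len(seq) for split in splits for seq in split)
    padded = tuple(pad_sequences(split, maxlen=input_len) for split in splits)

    input_dim = len(tokenizer.word_index) + 1  # +1 for the padding index 0
    return padded, input_dim, input_len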
Code Example #4
        test_data['preprocessed'] = tweets_preprocessor.preprocess(
            test_data.text,
            preprocessing_algorithm,
            keywords=test_data.keyword,
            locations=test_data.location)

        inputs = train_data['preprocessed']
        targets = train_data['target']

        for train, validation in kfold.split(inputs, targets):
            keras_tokenizer = Tokenizer()

            (x_train, x_val,
             x_test), input_dim, input_len = Helpers.get_model_inputs(
                 (inputs[train], inputs[validation], test_data.preprocessed),
                 keras_tokenizer)
            y_train = targets[train]
            y_val = targets[validation]
            y_test = test_data.target.values

            CONFIG['EMBEDDING_OPTIONS']['input_dim'] = input_dim
            CONFIG['EMBEDDING_OPTIONS']['input_length'] = input_len

            if USE_GLOVE:
                Helpers.with_glove_embedding_options(CONFIG, keras_tokenizer,
                                                     GLOVE_EMBEDDINGS)

            model = Keras.get_model(CONFIG)

            history = Keras.fit(
                model, (x_train, y_train, x_val, y_val, x_test, y_test))
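
The kfold object driving the loop is created outside this snippet, presumably something like sklearn's StratifiedKFold with shuffle=True and a fixed random_state. Helpers.with_glove_embedding_options is likewise external; the usual pattern it suggests is building a GloVe weight matrix over the tokenizer's vocabulary and attaching it to the embedding options. A hedged sketch of that pattern, not the project's code:

import numpy as np

def with_glove_embedding_options(config, tokenizer, glove_embeddings):
    # Hypothetical sketch: glove_embeddings is assumed to map word -> vector.
    # The 'output_dim', 'weights' and 'trainable' key names are assumptions;
    # only 'input_dim' and 'input_length' appear in the listing itself.
    embedding_dim = config['EMBEDDING_OPTIONS']['output_dim']
    input_dim = config['EMBEDDING_OPTIONS']['input_dim']

    embedding_matrix = np.zeros((input_dim, embedding_dim))
    for word, index in tokenizer.word_index.items():
        vector = glove_embeddings.get(word)
        if vector is not None and index < input_dim:
            embedding_matrix[index] = vector

    config['EMBEDDING_OPTIONS']['weights'] = [embedding_matrix]
    config['EMBEDDING_OPTIONS']['trainable'] = False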
Code Example #5
        if is_bert:
            x_train = np.asarray(train_inputs)
            x_val = np.asarray(val_inputs)
            x_test = np.asarray(test_data_preprocessed)

            bert_layer = hub.KerasLayer(
                f'https://tfhub.dev/tensorflow/{BERT_MODEL["BERT"]}/{BERT_MODEL["BERT_VERSION"]}',
                trainable=True)

            vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
            do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
            tokenizer = FullTokenizer(vocab_file, do_lower_case)

            x_train, INPUT_LENGTH = Helpers.get_bert_input(x_train, tokenizer)
            x_val = Helpers.get_bert_input(x_val,
                                           tokenizer,
                                           input_length=INPUT_LENGTH)
            x_test = Helpers.get_bert_input(x_test,
                                            tokenizer,
                                            input_length=INPUT_LENGTH)

            model = load_model(model_path,
                               custom_objects={'KerasLayer': hub.KerasLayer})

            print(key)
            # print(f'a - {model.evaluate(x_train, y_train, verbose=0)}')
            # print(f'va - {model.evaluate(x_val, y_val, verbose=0)}')
            # print(f'ta - {model.evaluate(x_test, y_test, verbose=1)}')
            # print('----------------------------')
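
Helpers.get_bert_input is called in both BERT snippets but never shown. A minimal sketch of the usual recipe (tokenize, wrap in [CLS]/[SEP], convert to ids, pad to a common length) could read as follows; treat it as an assumption about the helper, not its actual body:

import numpy as np

def get_bert_input(texts, tokenizer, input_length=None):
    # Hypothetical sketch: tokenizer is the FullTokenizer built above.
    token_ids = [
        tokenizer.convert_tokens_to_ids(
            ['[CLS]'] + tokenizer.tokenize(t) + ['[SEP]'])
        for t in texts
    ]

    if input_length is None:
        # First call (training split): derive the padding length and return it,
        # matching the two-value unpacking used for x_train above.
        input_length = max(len(ids) for ids in token_ids)
        padded = np.array([ids + [0] * (input_length - len(ids))
                           for ids in token_ids])
        return padded, input_length

    # Later calls reuse the training length; longer sequences are truncated.
    padded = np.array([(ids + [0] * input_length)[:input_length]
                       for ids in token_ids])
    return padded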
Code Example #6
# -*- coding: utf-8 -*-
from tweets import Helpers
from sklearn.model_selection import train_test_split
from data import original_train_data as data
import pandas as pd

Helpers.correct_data(data)

# id_t, id_val, keyword, keyword_val, location, location_val, text, text_val, target, target_val = train_test_split(
#     data.id.values,
#     data.keyword.values,
#     data.location.values,
#     data.text.values,
#     data['target_relabeled'].values,
#     test_size=0.2
# )

train = pd.DataFrame({
    'id': data.id,
    'keyword': data.keyword,
    'location': data.location,
    'text': data.text,
    'target': data.target_relabeled
})

# validation = pd.DataFrame({
#     'id': pd.Series(id_val),
#     'keyword': pd.Series(keyword_val),
#     'location': pd.Series(location_val),
#     'text': pd.Series(text_val),
#     'target': pd.Series(target_val)
# })
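
The snippet ends with the train/validation split and the validation frame still commented out, so only the relabeled training frame is built. If that frame is all that is needed, persisting it is a one-liner; the file name below is a placeholder, not one from the project:

# Hypothetical usage: save the relabeled training set.
train.to_csv('train_relabeled.csv', index=False)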