Example #1
# -*- coding: utf-8 -*-
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
from bert_tokenization import FullTokenizer
from data import train_data, test_data
from models import Keras, TestDataCallback
from sklearn.model_selection import StratifiedKFold
from tweets import Helpers, tweets_preprocessor
from utils import log_model
from configs import get_preprocessing_algorithm
import uuid
import gc

PREPROCESSING_ALGORITHMS = get_preprocessing_algorithm(join=True)
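# Also evaluate an empty algorithm as a no-preprocessing baseline.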
PREPROCESSING_ALGORITHMS['None'] = {}

SEED = 7
KFOLD = 10

for algorithm_id, preprocessing_algorithm in PREPROCESSING_ALGORITHMS.items():
    MODEL = {
        'UUID': str(uuid.uuid4()),
        # 'BERT': 'bert_en_uncased_L-12_H-768_A-12',
        'BERT': 'bert_en_uncased_L-24_H-1024_A-16',
        'BERT_VERSION': 1,
        'BATCH_SIZE': 16,
        'EPOCHS': 3,
        'OPTIMIZER': 'adam',
        'LEARNING_RATE': 2e-6,
        'PREPROCESSING_ALGORITHM_UUID': algorithm_id,
        'PREPROCESSING_ALGORITHM': preprocessing_algorithm,
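
The listing cuts off inside the MODEL dict. Given the imports above (tensorflow_hub and FullTokenizer), the loop presumably goes on to load the named TF-Hub BERT layer and build its tokenizer; a minimal sketch of that standard pattern (the tfhub.dev URL layout is an assumption, assembled from the snippet's own 'BERT' and 'BERT_VERSION' fields):

    # Sketch only; URL layout assumed from tfhub.dev conventions.
    bert_layer = hub.KerasLayer(
        f"https://tfhub.dev/tensorflow/{MODEL['BERT']}/{MODEL['BERT_VERSION']}",
        trainable=True)
    # The vocab file and casing flag ship with the hub module.
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = FullTokenizer(vocab_file, do_lower_case)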
Example #2
    'b054e509',
    '2e359f0b',
    '71bd09db',
    'd3cc3c6e',
]

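# Each pair is a (network index, preprocessing-algorithm index): the first
# element indexes NETWORKS_KEYS, the second PREPROCESSING_ALGORITHM_IDS.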
pairs = [[0, 8], [1, 9], [2, 3], [3, 3], [4, 6], [5, 4], [6, 6], [7, 0],
         [8, 6]]

for network_idx, algorithm_idx in pairs:
    NETWORK_KEY = NETWORKS_KEYS[network_idx]
    PREPROCESSING_ALGORITHM_ID = PREPROCESSING_ALGORITHM_IDS[algorithm_idx]

    MODEL = get_model_config(NETWORK_KEY, glove=USE_GLOVE)
    PREPROCESSING_ALGORITHM = get_preprocessing_algorithm(
        PREPROCESSING_ALGORITHM_ID)

    if USE_GLOVE:
        MODEL['GLOVE'] = {'SIZE': 200}
        GLOVE = f'glove.twitter.27B.{MODEL["GLOVE"]["SIZE"]}d.txt'
        GLOVE_FILE_PATH = f'./data/glove/{GLOVE}'
        GLOVE_EMBEDDINGS = get_glove_embeddings(GLOVE_FILE_PATH)

    MODEL['UUID'] = str(uuid.uuid4())
    MODEL['PREPROCESSING_ALGORITHM'] = PREPROCESSING_ALGORITHM
    MODEL['PREPROCESSING_ALGORITHM_UUID'] = PREPROCESSING_ALGORITHM_ID
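    # Note: the save path below is hard-coded to 'glove-false' even when
    # USE_GLOVE is set.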
    MODEL['DIR'] = f'./data-saved-models/glove-false/{NETWORK_KEY}/'
    ensure_path_exists(MODEL['DIR'])
    MODEL['PREFIX'] = f'{NETWORK_KEY}-{PREPROCESSING_ALGORITHM_ID}-SEED-{SEED}'

    train_data['preprocessed'] = tweets_preprocessor.preprocess(
        train_data.text,
        PREPROCESSING_ALGORITHM,
        keywords=train_data.keyword,
        locations=train_data.location)
Example #3

import numpy as np

def to_batch(arr):
    # Hypothetical name and signature; the enclosing 'def' is cut off by the
    # listing. Wraps a single sample into a one-element numpy batch.
    l = [arr]
    print(l)
    return np.array(l)


for key in NETWORKS_KEYS:
    MODEL_CONFIG = HAN_CONFIG.copy()
    MODEL_CONFIG['TRAIN_UUID'] = TRAIN_UUID

    if USE_GLOVE:
        MODEL_CONFIG['GLOVE'] = {'SIZE': 200}
        GLOVE = f'glove.twitter.27B.{MODEL_CONFIG["GLOVE"]["SIZE"]}d.txt'
        GLOVE_FILE_PATH = f'./data/glove/{GLOVE}'
        GLOVE_EMBEDDINGS = get_glove_embeddings(GLOVE_FILE_PATH)

    for algorithm_id, preprocessing_algorithm in get_preprocessing_algorithm(
            join=True).items():
        CONFIG = MODEL_CONFIG.copy()
        CONFIG['UUID'] = str(uuid.uuid4())
        CONFIG['PREPROCESSING_ALGORITHM'] = preprocessing_algorithm
        CONFIG['PREPROCESSING_ALGORITHM_UUID'] = algorithm_id
        CONFIG['KFOLD_HISTORY'] = []

        kfold = StratifiedKFold(n_splits=KFOLD,
                                shuffle=True,
                                random_state=SEED)

        train_data['preprocessed'] = tweets_preprocessor.preprocess(
            train_data.text,
            preprocessing_algorithm,
            keywords=train_data.keyword,
            locations=train_data.location)
Example #4

    # 'FASTTEXT', 'RCNN', 'CNN', 'RNN', 'GRU',
]

PREFIX = NETWORKS_KEYS[0]

for key in NETWORKS_KEYS:
    MODEL_CONFIG = get_model_config(key, USE_GLOVE)
    MODEL_CONFIG['TRAIN_UUID'] = TRAIN_UUID

    if USE_GLOVE:
        MODEL_CONFIG['GLOVE'] = {'SIZE': 200}
        GLOVE = f'glove.twitter.27B.{MODEL_CONFIG["GLOVE"]["SIZE"]}d.txt'
        GLOVE_FILE_PATH = f'./data/glove/{GLOVE}'
        GLOVE_EMBEDDINGS = get_glove_embeddings(GLOVE_FILE_PATH)

    for algorithm_id, preprocessing_algorithm in \
            get_preprocessing_algorithm().items():
        CONFIG = MODEL_CONFIG.copy()
        CONFIG['UUID'] = str(uuid.uuid4())
        CONFIG['PREPROCESSING_ALGORITHM'] = preprocessing_algorithm
        CONFIG['PREPROCESSING_ALGORITHM_UUID'] = algorithm_id
        CONFIG['KFOLD_HISTORY'] = []

        kfold = StratifiedKFold(n_splits=KFOLD,
                                shuffle=True,
                                random_state=SEED)

        train_data['preprocessed'] = tweets_preprocessor.preprocess(
            train_data.text,
            preprocessing_algorithm,
            keywords=train_data.keyword,
            locations=train_data.location)
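
This example and the previous one both stop right after the preprocessing step; the seeded splitter built above is presumably consumed along these lines (the 'target' label column is an assumption, as the snippets never name it):

        # Sketch only; 'target' is an assumed label column.
        for train_idx, val_idx in kfold.split(train_data['preprocessed'],
                                              train_data['target']):
            x_train = train_data['preprocessed'].iloc[train_idx]
            x_val = train_data['preprocessed'].iloc[val_idx]
            # ...train one fold here and append its history to
            # CONFIG['KFOLD_HISTORY']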
Example #5
        keys = BERT_KEY
    else:
        keys = NETWORKS_KEYS

    for key in keys:
        try:
            model_path = [
                x for x in saved_models_pathes
                if f'{folder}/{key}/' in x and f'SEED-{SEED}' in x
            ][0]
        except IndexError:
            # No saved model matches this network key and seed; skip it.
            continue

        data = model_path.split('/')[-1].split('-')
        preprocessing_algorithm_id = data[1]
        preprocessing_algorithm = get_preprocessing_algorithm(
            preprocessing_algorithm_id, join=(is_classifier or is_bert))

        train_data_preprocessed = tweets_preprocessor.preprocess(
            train_data.text,
            preprocessing_algorithm,
            keywords=train_data.keyword,
            locations=train_data.location)

        test_data_preprocessed = tweets_preprocessor.preprocess(
            test_data.text,
            preprocessing_algorithm,
            keywords=test_data.keyword,
            locations=test_data.location)

        train_inputs, val_inputs, train_targets, val_targets = train_test_split(
            train_data_preprocessed,
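
The listing ends mid-call; for reference, a typical completion of this scikit-learn split (the label column and hold-out fraction are assumptions, not shown in the source):

        train_inputs, val_inputs, train_targets, val_targets = train_test_split(
            train_data_preprocessed,
            train_data['target'],  # assumed label column
            test_size=0.2,         # assumed hold-out fraction
            random_state=SEED)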