# -*- coding: utf-8 -*-
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
from bert_tokenization import FullTokenizer
from data import train_data, test_data
from models import Keras, TestDataCallback
from sklearn.model_selection import StratifiedKFold
from tweets import Helpers, tweets_preprocessor
from utils import log_model
from configs import get_preprocessing_algorithm
import uuid
import gc

PREPROCESSING_ALGORITHMS = get_preprocessing_algorithm(join=True)
PREPROCESSING_ALGORITHMS['None'] = {}

SEED = 7
KFOLD = 10

for algorithm_id, preprocessing_algorithm in PREPROCESSING_ALGORITHMS.items():
    MODEL = {
        'UUID': str(uuid.uuid4()),
        # 'BERT': 'bert_en_uncased_L-12_H-768_A-12',
        'BERT': 'bert_en_uncased_L-24_H-1024_A-16',
        'BERT_VERSION': 1,
        'BATCH_SIZE': 16,
        'EPOCHS': 3,
        'OPTIMIZER': 'adam',
        'LEARNING_RATE': 2e-6,
        'PREPROCESSING_ALGORITHM_UUID': algorithm_id,
        'PREPROCESSING_ALGORITHM': preprocessing_algorithm,
    }
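# Hedged sketch (not part of the script above): the usual TF2 pattern for
# turning MODEL['BERT'] / MODEL['BERT_VERSION'] into a hub layer plus a
# matching FullTokenizer. The tfhub.dev URL layout is an assumption; the
# resolved_object accessors follow the standard TF Hub BERT recipe.
import tensorflow_hub as hub
from bert_tokenization import FullTokenizer

bert_layer = hub.KerasLayer(
    'https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1',
    trainable=True
)
# The vocab file and casing flag ship with the TF Hub module itself.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)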
    'b054e509',
    '2e359f0b',
    '71bd09db',
    'd3cc3c6e',
]

# Each pair maps a network key (first index) to the preprocessing
# algorithm (second index) it is trained with.
pairs = [[0, 8], [1, 9], [2, 3], [3, 3], [4, 6], [5, 4], [6, 6], [7, 0], [8, 6]]

for pair in pairs:
    NETWORK_KEY = NETWORKS_KEYS[pair[0]]
    PREPROCESSING_ALGORITHM_ID = PREPROCESSING_ALGORITHM_IDS[pair[1]]

    MODEL = get_model_config(NETWORK_KEY, glove=USE_GLOVE)
    PREPROCESSING_ALGORITHM = get_preprocessing_algorithm(PREPROCESSING_ALGORITHM_ID)

    if USE_GLOVE:
        MODEL['GLOVE'] = {'SIZE': 200}
        GLOVE = f'glove.twitter.27B.{MODEL["GLOVE"]["SIZE"]}d.txt'
        GLOVE_FILE_PATH = f'./data/glove/{GLOVE}'
        GLOVE_EMBEDDINGS = get_glove_embeddings(GLOVE_FILE_PATH)

    MODEL['UUID'] = str(uuid.uuid4())
    MODEL['PREPROCESSING_ALGORITHM'] = PREPROCESSING_ALGORITHM
    MODEL['PREPROCESSING_ALGORITHM_UUID'] = PREPROCESSING_ALGORITHM_ID
    MODEL['DIR'] = f'./data-saved-models/glove-false/{NETWORK_KEY}/'
    ensure_path_exists(MODEL['DIR'])
    MODEL['PREFIX'] = f'{NETWORK_KEY}-{PREPROCESSING_ALGORITHM_ID}-SEED-{SEED}'

    train_data['preprocessed'] = tweets_preprocessor.preprocess(
        train_data.text,
        PREPROCESSING_ALGORITHM,
        keywords=train_data.keyword,
        locations=train_data.location
    )
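# Hedged sketch: get_glove_embeddings is defined elsewhere in the repo; a
# minimal implementation for the standard GloVe text format (one token per
# line, followed by its vector components) would look like this.
import numpy as np

def get_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # First field is the token, the rest are the vector components.
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings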
    l = [arr]
    print(l)
    return np.array(l)

for key in NETWORKS_KEYS:
    MODEL_CONFIG = HAN_CONFIG.copy()
    MODEL_CONFIG['TRAIN_UUID'] = TRAIN_UUID

    if USE_GLOVE:
        MODEL_CONFIG['GLOVE'] = {'SIZE': 200}
        GLOVE = f'glove.twitter.27B.{MODEL_CONFIG["GLOVE"]["SIZE"]}d.txt'
        GLOVE_FILE_PATH = f'./data/glove/{GLOVE}'
        GLOVE_EMBEDDINGS = get_glove_embeddings(GLOVE_FILE_PATH)

    # Loop variable renamed from 'key' to avoid shadowing the outer network key.
    for algorithm_id, preprocessing_algorithm in get_preprocessing_algorithm(join=True).items():
        CONFIG = MODEL_CONFIG.copy()
        CONFIG['UUID'] = str(uuid.uuid4())
        CONFIG['PREPROCESSING_ALGORITHM'] = preprocessing_algorithm
        CONFIG['PREPROCESSING_ALGORITHM_UUID'] = algorithm_id
        CONFIG['KFOLD_HISTORY'] = []

        kfold = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)

        train_data['preprocessed'] = tweets_preprocessor.preprocess(
            train_data.text,
            preprocessing_algorithm,
            keywords=train_data.keyword,
            locations=train_data.location
        )
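# Hedged sketch of how the kfold object above is typically consumed; it
# reuses the variables defined in the fragment above. The 'target' column
# name is an assumption (the label column of the disaster-tweets dataset).
for train_idx, val_idx in kfold.split(train_data['preprocessed'],
                                      train_data['target']):
    x_train = train_data['preprocessed'].iloc[train_idx]
    y_train = train_data['target'].iloc[train_idx]
    x_val = train_data['preprocessed'].iloc[val_idx]
    y_val = train_data['target'].iloc[val_idx]
    # Fit the model on (x_train, y_train), validate on (x_val, y_val),
    # then append the fold's history to CONFIG['KFOLD_HISTORY'].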
    # 'FASTTEXT',
    'RCNN',
    'CNN',
    'RNN',
    'GRU',
]

PREFIX = NETWORKS_KEYS[0]

for key in NETWORKS_KEYS:
    MODEL_CONFIG = get_model_config(key, USE_GLOVE)
    MODEL_CONFIG['TRAIN_UUID'] = TRAIN_UUID

    if USE_GLOVE:
        MODEL_CONFIG['GLOVE'] = {'SIZE': 200}
        GLOVE = f'glove.twitter.27B.{MODEL_CONFIG["GLOVE"]["SIZE"]}d.txt'
        GLOVE_FILE_PATH = f'./data/glove/{GLOVE}'
        GLOVE_EMBEDDINGS = get_glove_embeddings(GLOVE_FILE_PATH)

    # Loop variable renamed from 'key' to avoid shadowing the outer network key.
    for algorithm_id, preprocessing_algorithm in get_preprocessing_algorithm().items():
        CONFIG = MODEL_CONFIG.copy()
        CONFIG['UUID'] = str(uuid.uuid4())
        CONFIG['PREPROCESSING_ALGORITHM'] = preprocessing_algorithm
        CONFIG['PREPROCESSING_ALGORITHM_UUID'] = algorithm_id
        CONFIG['KFOLD_HISTORY'] = []

        kfold = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)

        train_data['preprocessed'] = tweets_preprocessor.preprocess(
            train_data.text,
            preprocessing_algorithm,
            keywords=train_data.keyword,
            locations=train_data.location
        )
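# Hedged sketch: the usual way GLOVE_EMBEDDINGS feeds a Keras Embedding
# layer. 'word_index' would come from a fitted keras Tokenizer; the helper
# name and its signature are assumptions, not code from this repo.
import numpy as np

def get_embedding_matrix(word_index, embeddings, size):
    # Row i holds the GloVe vector of the word with index i; words missing
    # from the pretrained vocabulary keep the all-zeros initialization.
    matrix = np.zeros((len(word_index) + 1, size))
    for word, i in word_index.items():
        vector = embeddings.get(word)
        if vector is not None:
            matrix[i] = vector
    return matrix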
    keys = BERT_KEY
else:
    keys = NETWORKS_KEYS

for key in keys:
    try:
        model_path = [
            x for x in saved_models_pathes
            if f'{folder}/{key}/' in x and f'SEED-{SEED}' in x
        ][0]
    except IndexError:
        # No saved checkpoint for this network/seed combination.
        continue

    # Checkpoint basenames follow '{NETWORK_KEY}-{ALGORITHM_ID}-SEED-{SEED}',
    # so the second '-'-separated field is the preprocessing algorithm id.
    data = model_path.split('/')[-1].split('-')
    preprocessing_algorithm_id = data[1]
    preprocessing_algorithm = get_preprocessing_algorithm(
        preprocessing_algorithm_id,
        join=(is_classifier or is_bert)
    )

    train_data_preprocessed = tweets_preprocessor.preprocess(
        train_data.text,
        preprocessing_algorithm,
        keywords=train_data.keyword,
        locations=train_data.location
    )
    test_data_preprocessed = tweets_preprocessor.preprocess(
        test_data.text,
        preprocessing_algorithm,
        keywords=test_data.keyword,
        locations=test_data.location
    )

    train_inputs, val_inputs, train_targets, val_targets = train_test_split(
        train_data_preprocessed,
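# Worked example of the filename parsing above (hypothetical path: the
# network key 'RNN' and the '.h5' extension are illustrative only; the
# 'b054e509' id and SEED value come from the fragments above).
path = './data-saved-models/glove-false/RNN/RNN-b054e509-SEED-7.h5'
parts = path.split('/')[-1].split('-')  # ['RNN', 'b054e509', 'SEED', '7.h5']
assert parts[1] == 'b054e509'  # the preprocessing algorithm id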