def test_get_max_vector_len(self):
    vectors = [[0, 0, 0], [0]]
    result = Helpers.get_max_vector_len(vectors)
    expected = 3
    self.assertEqual(result, expected)
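# A minimal sketch of the helper under test, assuming get_max_vector_len simply
# returns the length of the longest inner vector (which is what the test above
# implies); the real implementation in tweets.Helpers may differ, and the name
# below is hypothetical.
def _get_max_vector_len_sketch(vectors):
    # The longest tokenized tweet determines the padded input length.
    return max(len(vector) for vector in vectors)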
# Drop references from the previous run so the garbage collector can reclaim
# the old graph before a fresh BERT layer is downloaded.
vocab_file = None
do_lower_case = None
tokenizer = None
gc.collect()

bert_layer = hub.KerasLayer(
    f'https://tfhub.dev/tensorflow/{MODEL["BERT"]}/{MODEL["BERT_VERSION"]}',
    trainable=True
)

# Build a WordPiece tokenizer from the vocabulary shipped with the TF Hub module.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

gc.collect()

# The training split defines the sequence length; validation and test inputs
# are padded to match it.
x_train, INPUT_LENGTH = Helpers.get_bert_input(x_train, tokenizer)
x_val = Helpers.get_bert_input(x_val, tokenizer, input_length=INPUT_LENGTH)
x_test = Helpers.get_bert_input(x_test, tokenizer, input_length=INPUT_LENGTH)

MODEL['INPUT_LENGTH'] = INPUT_LENGTH

test_data_callback = None
gc.collect()
test_data_callback = TestDataCallback(x_test=x_test, y_test=y_test)

model = None
gc.collect()
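# Sketch of what Helpers.get_bert_input presumably does, judging by its call
# sites above (assumption: the real helper may also emit mask and segment
# arrays; this hypothetical version shows only the token-id path). Called
# without input_length it measures the corpus and returns the length; called
# with one it pads/truncates to match.
import numpy as np

def get_bert_input_sketch(texts, tokenizer, input_length=None):
    tokenized = [
        tokenizer.convert_tokens_to_ids(
            ['[CLS]'] + tokenizer.tokenize(text) + ['[SEP]'])
        for text in texts
    ]
    return_length = input_length is None
    if return_length:
        input_length = max(len(tokens) for tokens in tokenized)
    ids = np.zeros((len(tokenized), input_length), dtype='int32')
    for row, tokens in enumerate(tokenized):
        # Zero-pad short sequences; truncate long ones.
        ids[row, :min(len(tokens), input_length)] = tokens[:input_length]
    return (ids, input_length) if return_length else ids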
test_data['preprocessed'] = tweets_preprocessor.preprocess(
    test_data.text,
    PREPROCESSING_ALGORITHM,
    keywords=test_data.keyword,
    locations=test_data.location
)

# Hold out 30% of the training data for validation.
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    train_data['preprocessed'],
    train_data['target'],
    test_size=0.3,
    random_state=SEED
)

keras_tokenizer = Tokenizer()

# Convert all three splits to padded integer sequences and record the
# vocabulary size and sequence length for the embedding layer.
(x_train, x_val, x_test), input_dim, input_len = Helpers.get_model_inputs(
    (train_inputs, val_inputs, test_data.preprocessed),
    keras_tokenizer
)
y_train = train_targets
y_val = val_targets
y_test = test_data.target.values

MODEL['EMBEDDING_OPTIONS']['input_dim'] = input_dim
MODEL['EMBEDDING_OPTIONS']['input_length'] = input_len

if USE_GLOVE:
    Helpers.with_glove_embedding_options(MODEL, keras_tokenizer, GLOVE_EMBEDDINGS)

model = Keras.get_model(MODEL)

history = Keras.fit(
    model,
    (x_train, y_train, x_val, y_val, x_test, y_test),
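# Sketch of Helpers.get_model_inputs as the call above implies (assumption:
# the real helper may fit the tokenizer differently or pad to a configured
# length; the function name here is hypothetical).
from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_model_inputs_sketch(corpora, keras_tokenizer):
    train_texts = corpora[0]
    # Fit on the training split only so validation/test words stay unseen.
    keras_tokenizer.fit_on_texts(train_texts)
    sequences = [keras_tokenizer.texts_to_sequences(texts) for texts in corpora]
    input_len = max(len(sequence) for sequence in sequences[0])
    padded = tuple(pad_sequences(seqs, maxlen=input_len) for seqs in sequences)
    input_dim = len(keras_tokenizer.word_index) + 1  # +1 for the padding index
    return padded, input_dim, input_len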
test_data['preprocessed'] = tweets_preprocessor.preprocess(
    test_data.text,
    preprocessing_algorithm,
    keywords=test_data.keyword,
    locations=test_data.location
)

inputs = train_data['preprocessed']
targets = train_data['target']

# Retrain from scratch on every fold: a fresh tokenizer and model per split.
for train, validation in kfold.split(inputs, targets):
    keras_tokenizer = Tokenizer()

    (x_train, x_val, x_test), input_dim, input_len = Helpers.get_model_inputs(
        (inputs[train], inputs[validation], test_data.preprocessed),
        keras_tokenizer
    )
    y_train = targets[train]
    y_val = targets[validation]
    y_test = test_data.target.values

    CONFIG['EMBEDDING_OPTIONS']['input_dim'] = input_dim
    CONFIG['EMBEDDING_OPTIONS']['input_length'] = input_len

    if USE_GLOVE:
        Helpers.with_glove_embedding_options(CONFIG, keras_tokenizer, GLOVE_EMBEDDINGS)

    model = Keras.get_model(CONFIG)

    history = Keras.fit(
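# Assumption: `kfold` is constructed elsewhere in the script; since split() is
# called with the targets, a stratified splitter is the likely choice. A
# hypothetical setup (KFOLD_SPLITS is a placeholder name; SEED is the script's
# existing constant):
from sklearn.model_selection import StratifiedKFold

KFOLD_SPLITS = 5  # hypothetical value; the script's actual constant may differ
kfold = StratifiedKFold(n_splits=KFOLD_SPLITS, shuffle=True, random_state=SEED)
# Note: inputs[train] indexes a pandas Series by position, which only works
# with the default RangeIndex; inputs.iloc[train] is the index-agnostic spelling.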
if is_bert:
    x_train = np.asarray(train_inputs)
    x_val = np.asarray(val_inputs)
    x_test = np.asarray(test_data_preprocessed)

    bert_layer = hub.KerasLayer(
        f'https://tfhub.dev/tensorflow/{BERT_MODEL["BERT"]}/{BERT_MODEL["BERT_VERSION"]}',
        trainable=True
    )
    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    tokenizer = FullTokenizer(vocab_file, do_lower_case)

    # Inputs must be re-encoded exactly as they were at training time.
    x_train, INPUT_LENGTH = Helpers.get_bert_input(x_train, tokenizer)
    x_val = Helpers.get_bert_input(x_val, tokenizer, input_length=INPUT_LENGTH)
    x_test = Helpers.get_bert_input(x_test, tokenizer, input_length=INPUT_LENGTH)

# hub.KerasLayer is not a stock Keras layer, so it must be passed in through
# custom_objects for deserialization to succeed.
model = load_model(model_path, custom_objects={'KerasLayer': hub.KerasLayer})

print(key)
# print(f'a - {model.evaluate(x_train, y_train, verbose=0)}')
# print(f'va - {model.evaluate(x_val, y_val, verbose=0)}')
# print(f'ta - {model.evaluate(x_test, y_test, verbose=1)}')
# print('----------------------------')
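# Usage sketch: re-enabling the test-set check the commented lines above intend
# (assumes y_test comes from the same data-loading step as x_test and that the
# model was compiled with a single accuracy metric).
test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=0)
print(f'ta - {[test_loss, test_accuracy]}')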
# -*- coding: utf-8 -*-
from tweets import Helpers
from sklearn.model_selection import train_test_split
from data import original_train_data as data
import pandas as pd

Helpers.correct_data(data)

# id_t, id_val, keyword, keyword_val, location, location_val, text, text_val, target, target_val = train_test_split(
#     data.id.values,
#     data.keyword.values,
#     data.location.values,
#     data.text.values,
#     data['target_relabeled'].values,
#     test_size=0.2
# )

train = pd.DataFrame({
    'id': data.id,
    'keyword': data.keyword,
    'location': data.location,
    'text': data.text,
    'target': data.target_relabeled
})

# validation = pd.DataFrame({
#     'id': pd.Series(id_val),
#     'keyword': pd.Series(keyword_val),
#     'location': pd.Series(location_val),
#     'text': pd.Series(text_val),
#     'target': pd.Series(target_val)
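# Assumption: Helpers.correct_data produces the `target_relabeled` column used
# above, presumably by overriding the target for tweets known to be mislabeled.
# A hypothetical minimal version (the name and the id-list parameter are
# illustrative, not the repository's actual API):
def correct_data_sketch(df, mislabeled_ids=()):
    df['target_relabeled'] = df['target'].copy()
    # Flip the label only for rows whose id is in the known-bad list.
    flip = df['id'].isin(mislabeled_ids)
    df.loc[flip, 'target_relabeled'] = 1 - df.loc[flip, 'target_relabeled']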