def main_training():
    lexicon_loader = LexiconLoader()
    scored_lexicon: dict = lexicon_loader.load_all_and_merge()
    tr_tweets_loader = LabeledTweetsLoader(TRAINING_INPUT_FILENAME)
    tr_labeled_tweets = tr_tweets_loader.parse_tokens_and_labels(tr_tweets_loader.load_lines())
    token_summarizer = TokenSummarizer(scored_lexicon)
    feature_extractor = FeatureExtractor(scored_lexicon)
    vu = VocabUtil()
    nn_input_preparer = NNInputPreparer(vu)

    tr_feature_vectors = []  # 2D array of feature vectors
    for labeled_tweet in tr_labeled_tweets:
        known_token_sequence = token_summarizer.get_known_tokens(labeled_tweet[0])
        feature_vector = feature_extractor.compute_feature_vector(known_token_sequence)
        tr_feature_vectors.append(feature_vector)
    tr_network_input = np.array(tr_feature_vectors)
    tr_targets = [labeled_tweet[1] for labeled_tweet in tr_labeled_tweets]
    tr_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(tr_targets)

    dev_tweets_loader = LabeledTweetsLoader(DEV_INPUT_FILENAME)
    dev_labeled_tweets = dev_tweets_loader.parse_tokens_and_labels(dev_tweets_loader.load_lines())
    dev_feature_vectors = []  # 2D array of feature vectors
    for labeled_tweet in dev_labeled_tweets:
        known_token_sequence = token_summarizer.get_known_tokens(labeled_tweet[0])
        feature_vector = feature_extractor.compute_feature_vector(known_token_sequence)
        dev_feature_vectors.append(feature_vector)
    dev_network_input = np.array(dev_feature_vectors)
    dev_targets = [labeled_tweet[1] for labeled_tweet in dev_labeled_tweets]
    dev_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(dev_targets)

    # Every epoch is cheap (< 1 ms), so we don't need the ability to continue training from a previous model.
    print("Commencing new training run")
    model_creator = ModelCreator(vu)
    model = model_creator.create_two_dense_model(hidden_layer_size=HIDDEN_SIZE)

    cp_filepath = BASE_DIR + 'ep_{epoch}_valacc_{val_accuracy:.5f}.h5'
    checkpoint = ModelCheckpoint(cp_filepath, monitor='val_accuracy', verbose=1, save_best_only=False)

    model.fit(tr_network_input, tr_targets_one_hot_encoded, batch_size=32, epochs=MAX_EPOCHS,
              validation_data=(dev_network_input, dev_targets_one_hot_encoded),
              callbacks=[checkpoint])

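# Hedged sketch (an assumption about behaviour, not the repo's actual code): throughout these scripts,
# rectangular_targets_to_one_hot turns a list of labels into a dense matrix with one row per tweet and
# one column per output class. A minimal numpy equivalent, using a hypothetical label-to-index map:
import numpy as np

def targets_to_one_hot_sketch(targets, label_to_int):
    # one row per target, one column per output class
    one_hot = np.zeros((len(targets), len(label_to_int)), dtype=np.float32)
    for row, label in enumerate(targets):
        one_hot[row, label_to_int[label]] = 1.0
    return one_hot

# e.g. targets_to_one_hot_sketch(['neutral', 'positive'],
#                                {'negative': 0, 'neutral': 1, 'positive': 2})
# -> [[0., 1., 0.], [0., 0., 1.]]
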
def prep_validation_set(input_filename: str, nn_input_preparer: NNInputPreparer, vu: VocabUtil,
                        upsample: bool):
    loader = LabeledDataLoader(input_filename)
    labeled_tweets = loader.parse_tokens_and_labels(loader.load_lines())
    labeled_tweets = nn_input_preparer.filter_out_long_tweets(labeled_tweets)
    if upsample:
        labeled_tweets = nn_input_preparer.crude_upsample(labeled_tweets)

    irregular_inputs = [[vu.nn_input_token_to_int[token]
                         if token in vu.nn_input_token_to_int
                         else vu.nn_input_token_to_int['<OOV>']
                         for token in labeled_tweet[0]]
                        for labeled_tweet in labeled_tweets]
    rectangular_inputs = nn_input_preparer.rectangularize_inputs(irregular_inputs)
    rectangular_targets = [tweet[1] for tweet in labeled_tweets]
    targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(rectangular_targets)

    return rectangular_inputs, rectangular_targets, targets_one_hot_encoded

def prep_validation_set(input_filename: str, nn_input_preparer: NNInputPreparer, vu: VocabUtil) \
        -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    loader = LabeledDataLoader(input_filename)
    tweets = loader.parse_tokens_and_labels(loader.load_lines())
    tweets = nn_input_preparer.filter_out_long_sequences(tweets)
    print(f'processing all not-too-long {len(tweets)} tweets from {input_filename}')

    irregular_inputs = [[vu.nn_input_token_to_int[item[0]]
                         if item[0] in vu.nn_input_token_to_int
                         else vu.nn_input_token_to_int['<OOV>']
                         for item in tweet]
                        for tweet in tweets]
    irregular_targets = [[vu.nn_pos_to_int[item[1]] for item in tweet] for tweet in tweets]

    rectangular_inputs = nn_input_preparer.rectangularize_inputs(irregular_inputs)
    rectangular_targets = nn_input_preparer.rectangularize_targets(irregular_targets)
    targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(rectangular_targets)

    return rectangular_inputs, rectangular_targets, targets_one_hot_encoded

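# Hedged sketch (assumed behaviour, not the repo's code): rectangularize_inputs / rectangularize_targets
# presumably pad every irregular sequence with zeros up to the maximum sequence length so that a batch
# becomes a dense 2-D array. Keras ships a helper that does this kind of post-padding:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def pad_to_rectangle(irregular_sequences, max_seq_len):
    # 'post' padding keeps real tokens at the start and fills the tail with pad id 0,
    # matching the convention elsewhere in these scripts of treating index 0 as padding.
    return pad_sequences(irregular_sequences, maxlen=max_seq_len, padding='post', value=0)

# e.g. pad_to_rectangle([[5, 7], [2]], max_seq_len=4) -> [[5, 7, 0, 0], [2, 0, 0, 0]]
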
def main_training():
    print(f'Using TensorFlow version {tf.__version__}')
    vu = create_vocab_util_from_training_set(TRAINING_INPUT_FILENAME)
    nn_input_preparer = NNInputPreparer(vu, max_seq_len=MAX_SEQ_LEN)

    tr_loader = LabeledDataLoader(TRAINING_INPUT_FILENAME)
    tr_tweets = tr_loader.parse_tokens_and_labels(tr_loader.load_lines())
    tr_tweets = nn_input_preparer.filter_out_long_sequences(tr_tweets)
    print(f'Training on {len(tr_tweets)} tweets, each no longer than {MAX_SEQ_LEN} tokens')

    tr_irregular_inputs = [[vu.nn_input_token_to_int[item[0]] for item in tweet] for tweet in tr_tweets]
    tr_irregular_targets = [[vu.nn_pos_to_int[item[1]] for item in tweet] for tweet in tr_tweets]
    tr_rectangular_inputs = nn_input_preparer.rectangularize_inputs(tr_irregular_inputs)
    tr_rectangular_targets = nn_input_preparer.rectangularize_targets(tr_irregular_targets)
    tr_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(tr_rectangular_targets)

    if CONTINUE_TRAINING:
        print('Continuing training from', TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model = load_model(TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model.summary()
    else:
        print("Commencing new training run")
        model_creator = LstmModelCreator(vu, embedding_dim=EMBEDDING_DIM, lstm_dim=LSTM_DIM,
                                         mask_zero=MASK_ZERO)
        model = model_creator.create_bi_lstm_model()

    cp_filepath = BASE_DIR + 'ep_{epoch}_valacc_{val_accuracy:.5f}.h5'
    checkpoint = ModelCheckpoint(cp_filepath, monitor='val_accuracy', verbose=1, save_best_only=False)

    rectangular_inputs, _, targets_one_hot_encoded = \
        prep_validation_set(DEV_INPUT_FILENAME, nn_input_preparer, vu)

    model.fit(x=tr_rectangular_inputs, y=tr_targets_one_hot_encoded, batch_size=32,
              initial_epoch=INITIAL_EPOCH, epochs=MAX_EPOCHS,
              validation_data=(rectangular_inputs, targets_one_hot_encoded),
              callbacks=[checkpoint])

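# Hedged sketch (assumed architecture, not necessarily LstmModelCreator's exact code): a token-level
# tagger of this shape is typically an Embedding (mask_zero so pad id 0 is ignored), a bidirectional
# LSTM that returns one output per time step, and a per-token softmax over the tag vocabulary.
# EMBEDDING_DIM, LSTM_DIM and the vocab sizes come from the surrounding script; the layer stack is an assumption.
from tensorflow.keras import layers, models

def create_bi_lstm_model_sketch(input_vocab_size, output_vocab_size,
                                embedding_dim, lstm_dim, mask_zero=True):
    model = models.Sequential([
        layers.Embedding(input_dim=input_vocab_size, output_dim=embedding_dim, mask_zero=mask_zero),
        layers.Bidirectional(layers.LSTM(lstm_dim, return_sequences=True)),
        layers.TimeDistributed(layers.Dense(output_vocab_size, activation='softmax')),
    ])
    # categorical_crossentropy matches the one-hot targets produced by rectangular_targets_to_one_hot
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
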
def main():
    print('tf:', tf.__version__)
    print('TRAINING_MODEL_FILENAME =', TRAINING_MODEL_FILENAME)

    nn_input_preparer = NNInputPreparer()
    model_creator = NNModelCreator(latent_dim=LATENT_DIM, dense_dim=DENSE_DIM)
    loaded_training_model = load_model(TRAINING_MODEL_FILENAME)
    encoder_model, decoder_model = model_creator.derive_inference_models(loaded_training_model)
    inference_runner = InferenceRunner(encoder_model=encoder_model, decoder_model=decoder_model)

    cleaner = TweetCleaner()
    selector = TweetSelector(min_length=MIN_TWEET_LENGTH, max_length=MAX_TWEET_LENGTH)
    noiser = DisjointNoiser()

    for input_filename in [TRAINING_INPUT_FILENAME, DEV_INPUT_FILENAME]:
        k = 10
        print(f'processing the first {k} selected tweets from {input_filename}')
        raw_tweets = DataLoader(input_filename).load()
        clean_tweets = [cleaner.clean_tweet(t) for t in raw_tweets]
        clean_tweets_as_lists = [list(t) for t in clean_tweets]
        selected_tweets_as_lists = [t for t in clean_tweets_as_lists if selector.select(t)]

        gb_inference = nn_input_preparer.get_batches(selected_tweets_as_lists, noiser, batch_size=1)
        for i in range(k):
            noised_batch, originals_batch, original_delayed_batch = next(gb_inference)
            print('[noised    ]', nn_input_preparer.decode_tweet(noised_batch[0]))
            print('[original  ]', nn_input_preparer.decode_tweet(originals_batch[0]))
            print('[original 2]', ''.join(selected_tweets_as_lists[i]))
            print('[or-delayed]', nn_input_preparer.decode_tweet(original_delayed_batch[0]))
            decoded_tweet = inference_runner.decode_sequence(noised_batch)
            print('[decoded   ]', decoded_tweet)
            print()

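# Hedged sketch (assumed inference loop, not InferenceRunner's actual code): with a separate encoder
# and decoder derived from a trained seq2seq denoiser, decode_sequence is typically a greedy loop that
# feeds the encoder states to the decoder one step at a time. The start/end token ids, the one-hot
# dimension and the maximum output length below are hypothetical placeholders.
import numpy as np

def greedy_decode_sketch(encoder_model, decoder_model, noised_batch,
                         start_token_id, end_token_id, max_output_len, one_hot_dim):
    states = encoder_model.predict(noised_batch)           # encoder summarises the noised input
    target_seq = np.zeros((1, 1, one_hot_dim))
    target_seq[0, 0, start_token_id] = 1.0                 # seed the decoder with the start token
    decoded_ids = []
    for _ in range(max_output_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states)
        next_id = int(np.argmax(output_tokens[0, -1, :]))  # greedy: most probable next character
        if next_id == end_token_id:
            break
        decoded_ids.append(next_id)
        target_seq = np.zeros((1, 1, one_hot_dim))
        target_seq[0, 0, next_id] = 1.0
        states = [h, c]                                    # carry the LSTM state forward
    return decoded_ids
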
def main_inference():
    print(f'Using TensorFlow version {tf.__version__}')
    print(f'Loading model {TRAINING_MODEL_FILENAME}')
    trained_model = load_model(TRAINING_MODEL_FILENAME)
    trained_model.summary()

    lexicon_loader = LexiconLoader()
    scored_lexicon: dict = lexicon_loader.load_all_and_merge()
    token_summarizer = TokenSummarizer(scored_lexicon)
    feature_extractor = FeatureExtractor(scored_lexicon)
    vu = VocabUtil()
    nn_input_preparer = NNInputPreparer(vu)

    for input_filename in [DEV_INPUT_FILENAME]:
        tweets_loader = LabeledTweetsLoader(input_filename)
        labeled_tweets = tweets_loader.parse_tokens_and_labels(tweets_loader.load_lines())

        feature_vectors = []  # 2D array of feature vectors
        for labeled_tweet in labeled_tweets:
            known_token_sequence = token_summarizer.get_known_tokens(labeled_tweet[0])
            feature_vector = feature_extractor.compute_feature_vector(known_token_sequence)
            feature_vectors.append(feature_vector)
        network_input = np.array(feature_vectors)
        print('network_input.shape:', network_input.shape)
        targets = [labeled_tweet[1] for labeled_tweet in labeled_tweets]
        targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(targets)

        trained_model.evaluate(network_input, targets_one_hot_encoded)

        argmax_confusion_matrix = np.zeros(
            (vu.get_output_vocab_size(), vu.get_output_vocab_size()), dtype=int)
        expected_sampling_confusion_matrix = np.zeros(
            (vu.get_output_vocab_size(), vu.get_output_vocab_size()))
        expected_sampling_accuracy_sum = 0.0
        num_correct_argmax_predictions = 0

        for rectangular_input, target_human in tqdm(zip(network_input, targets)):
            rectangular_input.shape = (1, 3)
            target_index = vu.nn_rsl_to_int[target_human]
            predicted_probabilities = trained_model(rectangular_input)[0]
            # the predicted index if we take the class with the largest probability
            argmax_index = np.argmax(predicted_probabilities)
            if argmax_index == target_index:
                num_correct_argmax_predictions += 1
            argmax_confusion_matrix[target_index][argmax_index] += 1
            # rhs is the probability of guessing target_index if we sample according to predicted probabilities
            expected_sampling_accuracy_sum += tf.keras.backend.get_value(
                predicted_probabilities[target_index])
            for i in range(vu.get_output_vocab_size()):
                expected_sampling_confusion_matrix[target_index][i] += predicted_probabilities[i]

        num_tweets_in_dataset = len(targets)
        print(f'Argmax accuracy for {input_filename}:',
              num_correct_argmax_predictions / num_tweets_in_dataset)
        print(f'Expected sampling accuracy for {input_filename}:',
              expected_sampling_accuracy_sum / num_tweets_in_dataset)
        print(f"Argmax confusion matrix of targets vs predicted for {input_filename}:\n"
              f"{vu.raw_sentiment_labels}\n", argmax_confusion_matrix)
        print(f"Expected sampling confusion matrix of targets vs predicted for {input_filename}:\n"
              f"{vu.raw_sentiment_labels}\n", expected_sampling_confusion_matrix)

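# Worked example (illustrative numbers only) of the two metrics accumulated above: suppose the model
# outputs predicted_probabilities = [0.2, 0.5, 0.3] for a tweet whose target_index is 2.
#   - argmax accuracy: the argmax is index 1, which != 2, so this tweet contributes 0 correct predictions;
#   - expected sampling accuracy: sampling a label from that distribution would hit the target with
#     probability 0.3, so 0.3 is added to expected_sampling_accuracy_sum.
# Averaged over the dataset, the two numbers give a hard (argmax) and a soft (sampling) accuracy, and
# the corresponding confusion matrices distribute each tweet over predicted classes in the same way.
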
def main_inference():
    print(f'Using TensorFlow version {tf.__version__}')
    print(f'Loading model {TRAINING_MODEL_FILENAME}')
    trained_model = load_model(TRAINING_MODEL_FILENAME)
    trained_model.summary()

    vu = create_vocab_util_from_training_set(TRAINING_INPUT_FILENAME)
    nn_input_preparer = NNInputPreparer(vu, MAX_SEQ_LEN)

    for input_filename in [DEV_INPUT_FILENAME]:
        rectangular_inputs, rectangular_targets, targets_one_hot_encoded = \
            prep_validation_set(input_filename, nn_input_preparer, vu, UPSAMPLE)

        trained_model.evaluate(rectangular_inputs, targets_one_hot_encoded, batch_size=32)

        argmax_confusion_matrix = np.zeros(
            (vu.get_output_vocab_size(), vu.get_output_vocab_size()), dtype=int)
        expected_sampling_confusion_matrix = np.zeros(
            (vu.get_output_vocab_size(), vu.get_output_vocab_size()))
        expected_sampling_accuracy_sum = 0.0
        num_correct_argmax_predictions = 0
        it = 0

        for rectangular_input, target_human in tqdm(zip(rectangular_inputs, rectangular_targets)):
            it += 1
            target_index = vu.nn_rsl_to_int[target_human]
            rectangular_input_2d = np.array(rectangular_input)
            rectangular_input_2d.shape = (1, MAX_SEQ_LEN)
            predicted_probabilities = trained_model(rectangular_input_2d, training=False)[0]
            if it < 10:
                print(rectangular_input)
                print('target_index:', target_index)
                print(predicted_probabilities)
                print()
            # the predicted index if we take the class with the largest probability
            argmax_index = np.argmax(predicted_probabilities)
            if argmax_index == target_index:
                num_correct_argmax_predictions += 1
            argmax_confusion_matrix[target_index][argmax_index] += 1
            # rhs is the probability of guessing target_index if we sample according to predicted probabilities
            expected_sampling_accuracy_sum += tf.keras.backend.get_value(
                predicted_probabilities[target_index])
            for i in range(vu.get_output_vocab_size()):
                expected_sampling_confusion_matrix[target_index][i] += predicted_probabilities[i]

        num_tweets_in_dataset = len(rectangular_targets)
        print(f'Argmax accuracy for {input_filename}:',
              num_correct_argmax_predictions / num_tweets_in_dataset)
        print(f'Expected sampling accuracy for {input_filename}:',
              expected_sampling_accuracy_sum / num_tweets_in_dataset)
        print(f"Argmax confusion matrix of targets vs predicted for {input_filename}:\n"
              f"{vu.raw_sentiment_labels}\n", argmax_confusion_matrix)
        print(f"Expected sampling confusion matrix of targets vs predicted for {input_filename}:\n"
              f"{vu.raw_sentiment_labels}\n", expected_sampling_confusion_matrix)

def main_inference():
    print(f'Using TensorFlow version {tf.__version__}')
    print(f'Loading model {TRAINING_MODEL_FILENAME}')
    trained_model = load_model(TRAINING_MODEL_FILENAME)

    vu = create_vocab_util_from_training_set(TRAINING_INPUT_FILENAME)
    nn_input_preparer = NNInputPreparer(vu, max_seq_len=MAX_SEQ_LEN)

    for input_filename in [DEV_INPUT_FILENAME]:
        rectangular_inputs, rectangular_targets, targets_one_hot_encoded = \
            prep_validation_set(input_filename, nn_input_preparer, vu)

        trained_model.evaluate(rectangular_inputs, targets_one_hot_encoded, batch_size=32)

        num_tokens_in_dataset = num_token_level_correct_argmax_predictions = \
            num_token_level_correct_argmax_predictions_incl_pads = 0
        tweet_level_argmax_accuracy_sum = tweet_level_expected_sampling_accuracy_sum = \
            token_level_expected_sampling_accuracy_sum = token_level_expected_sampling_accuracy_sum_incl_pads = 0.0

        for rectangular_input, rectangular_target_indices in tqdm(zip(rectangular_inputs, rectangular_targets)):
            num_tokens_in_current_tweet = num_current_tweet_correct_argmax_predictions = 0
            current_tweet_expected_sampling_accuracy_sum = 0.0
            rectangular_input_2d = np.array(rectangular_input)
            rectangular_input_2d.shape = (1, MAX_SEQ_LEN)
            predicted_probabilities_sequence = trained_model(rectangular_input_2d, training=False)[0]

            for predicted_probabilities, target_index in zip(
                    predicted_probabilities_sequence, rectangular_target_indices):
                # the predicted index if we take the class with the largest probability
                argmax_index = np.argmax(predicted_probabilities)
                # probability of guessing target_index if we sample according to predicted probabilities
                prob_sampling_success_on_token = \
                    tf.keras.backend.get_value(predicted_probabilities[target_index])
                if argmax_index == target_index:
                    num_token_level_correct_argmax_predictions_incl_pads += 1
                token_level_expected_sampling_accuracy_sum_incl_pads += prob_sampling_success_on_token

                if target_index != 0:
                    if argmax_index == target_index:
                        num_token_level_correct_argmax_predictions += 1
                        num_current_tweet_correct_argmax_predictions += 1
                    current_tweet_expected_sampling_accuracy_sum += prob_sampling_success_on_token
                    token_level_expected_sampling_accuracy_sum += prob_sampling_success_on_token
                    num_tokens_in_current_tweet += 1
                    num_tokens_in_dataset += 1

            # every tweet has at least one non-padding token, so we don't worry about division by zero
            current_tweet_argmax_accuracy = num_current_tweet_correct_argmax_predictions / num_tokens_in_current_tweet
            current_tweet_expected_sampling_accuracy = \
                current_tweet_expected_sampling_accuracy_sum / num_tokens_in_current_tweet
            tweet_level_argmax_accuracy_sum += current_tweet_argmax_accuracy
            tweet_level_expected_sampling_accuracy_sum += current_tweet_expected_sampling_accuracy

        num_tokens_in_dataset_incl_pads = MAX_SEQ_LEN * len(rectangular_inputs)
        print(f'Argmax accuracy for {input_filename} including padding:',
              num_token_level_correct_argmax_predictions_incl_pads / num_tokens_in_dataset_incl_pads)
        print(f'Expected sampling accuracy for {input_filename} including padding:',
              token_level_expected_sampling_accuracy_sum_incl_pads / num_tokens_in_dataset_incl_pads)
        print(f'Token-level argmax accuracy for {input_filename}:',
              num_token_level_correct_argmax_predictions / num_tokens_in_dataset)
        print(f'Token-level expected sampling accuracy for {input_filename}:',
              token_level_expected_sampling_accuracy_sum / num_tokens_in_dataset)
        num_tweets_in_dataset = len(rectangular_inputs)
        print(f'Tweet-level argmax accuracy for {input_filename}:',
              tweet_level_argmax_accuracy_sum / num_tweets_in_dataset)
        print(f'Tweet-level expected sampling accuracy for {input_filename}:',
              tweet_level_expected_sampling_accuracy_sum / num_tweets_in_dataset)

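# Worked example (illustrative numbers only) of the pad handling and the token- vs tweet-level split:
# with a hypothetical MAX_SEQ_LEN = 4, a 2-token tweet is padded to [t1, t2, 0, 0]. Only the 2 real
# tokens count towards the token-level and tweet-level metrics (target_index != 0 skips pads), while
# the "including padding" metrics divide by all 4 positions. If the tagger gets 1 of the 2 real tokens
# right, the tweet contributes 0.5 to tweet_level_argmax_accuracy_sum; token-level accuracy instead
# pools correct tokens across all tweets before dividing, so short and long tweets are weighted
# differently by the two metrics.
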
def main_training():
    print(f'Using TensorFlow version {tf.__version__}')
    tvu = TargetVocabUtil()
    btc = BertTokenConverter(model_dir=BERT_PRETRAINED_MODEL_DIR, tvu=tvu)
    nn_input_preparer = NNInputPreparer(tvu=tvu, max_seq_len=MAX_SEQ_LEN)

    tr_loader = LabeledDataLoader(TR_INPUT_FILENAME)
    tr_labeled_tweets = tr_loader.parse_tokens_and_labels(tr_loader.load_lines())
    tr_labeled_tweets = btc.convert(tr_labeled_tweets)
    tr_labeled_tweets = btc.convert_to_ids(tr_labeled_tweets)
    tr_labeled_tweets = btc.prepend_cls(tr_labeled_tweets)
    tr_labeled_tweets = nn_input_preparer.filter_out_long_sequences(tr_labeled_tweets)
    print(f'Processing all not-too-long {len(tr_labeled_tweets)} tweets from {TR_INPUT_FILENAME}')

    tr_irregular_inputs = [tweet[0] for tweet in tr_labeled_tweets]
    tr_rectangular_targets = [tweet[1] for tweet in tr_labeled_tweets]
    tr_rectangular_inputs = nn_input_preparer.rectangularize_inputs(tr_irregular_inputs)
    tr_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(tr_rectangular_targets)

    if CONTINUE_TRAINING:
        print('Continuing training from', TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model = load_model(TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model.summary()
    else:
        print("Commencing new training run")
        model_creator = BertModelCreator(model_dir=BERT_PRETRAINED_MODEL_DIR, tvu=tvu,
                                         max_seq_len=MAX_SEQ_LEN)
        model = model_creator.create_model()

    cp_filepath = BASE_DIR + 'ep_{epoch}_valacc_{val_accuracy:.5f}.h5'
    checkpoint = ModelCheckpoint(cp_filepath, monitor='val_accuracy', verbose=1, save_best_only=False)

    dev_loader = LabeledDataLoader(DEV_INPUT_FILENAME)
    dev_labeled_tweets = dev_loader.parse_tokens_and_labels(dev_loader.load_lines())
    dev_labeled_tweets = btc.convert(dev_labeled_tweets)
    dev_labeled_tweets = btc.convert_to_ids(dev_labeled_tweets)
    dev_labeled_tweets = btc.prepend_cls(dev_labeled_tweets)
    dev_labeled_tweets = nn_input_preparer.filter_out_long_sequences(dev_labeled_tweets)
    print(f'Processing all not-too-long {len(dev_labeled_tweets)} tweets from {DEV_INPUT_FILENAME}')

    dev_irregular_inputs = [tweet[0] for tweet in dev_labeled_tweets]
    dev_rectangular_targets = [tweet[1] for tweet in dev_labeled_tweets]
    dev_rectangular_inputs = nn_input_preparer.rectangularize_inputs(dev_irregular_inputs)
    dev_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(dev_rectangular_targets)

    model.fit(tr_rectangular_inputs, tr_targets_one_hot_encoded, batch_size=32,
              initial_epoch=INITIAL_EPOCH, epochs=MAX_EPOCHS,
              validation_data=(dev_rectangular_inputs, dev_targets_one_hot_encoded),
              callbacks=[checkpoint])

    model.save(FINAL_TRAINED_MODEL_FILENAME)

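# Hedged sketch (assumed architecture, not necessarily BertModelCreator's exact code): prepend_cls puts
# the [CLS] id at position 0, so a sentence classifier can run a bert-for-tf2 BertModelLayer over the
# padded ids and attach a softmax to the [CLS] output. Only BERT_PRETRAINED_MODEL_DIR, MAX_SEQ_LEN and
# the target vocab size come from this script; the wiring below is an assumption, and loading the
# pretrained checkpoint weights (e.g. via bert.load_stock_weights) is omitted for brevity.
import bert
from tensorflow import keras

def create_bert_classifier_sketch(model_dir, max_seq_len, output_vocab_size):
    bert_params = bert.params_from_pretrained_ckpt(model_dir)
    l_bert = bert.BertModelLayer.from_params(bert_params, name='bert')

    input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name='input_ids')
    seq_output = l_bert(input_ids)                                           # (batch, max_seq_len, hidden)
    cls_output = keras.layers.Lambda(lambda seq: seq[:, 0, :])(seq_output)   # the [CLS] position
    probabilities = keras.layers.Dense(output_vocab_size, activation='softmax')(cls_output)

    model = keras.Model(inputs=input_ids, outputs=probabilities)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
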
def main_training():
    print(f'Using TensorFlow version {tf.__version__}')

    tr_loader = LabeledDataLoader(TRAINING_INPUT_FILENAME)
    tr_tweets = tr_loader.parse_raw_tokens_and_labels(tr_loader.load_lines())

    tvu = TargetVocabUtil()
    tokenizer = BertTokenConverter(model_dir=BERT_PRETRAINED_MODEL_DIR, tvu=tvu)
    tr_tweets = tokenizer.convert_to_tokens(tr_tweets)
    tr_tweets = tokenizer.convert_to_ids(tr_tweets)

    nn_input_preparer = NNInputPreparer(tvu, max_seq_len=MAX_SEQ_LEN)
    tr_tweets = nn_input_preparer.filter_out_long_sequences(tr_tweets)
    print(f'Training on {len(tr_tweets)} tweets, each no longer than {MAX_SEQ_LEN} tokens')

    tr_irregular_inputs = [[item[0] for item in tweet] for tweet in tr_tweets]
    tr_irregular_targets = [[item[1] for item in tweet] for tweet in tr_tweets]
    tr_rectangular_inputs = nn_input_preparer.rectangularize_inputs(tr_irregular_inputs)
    tr_rectangular_targets = nn_input_preparer.rectangularize_targets(tr_irregular_targets)
    tr_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(tr_rectangular_targets)

    if CONTINUE_TRAINING:
        print('Continuing training from', TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model = load_model(TRAINING_MODEL_FILENAME_TO_CONTINUE,
                           custom_objects={"BertModelLayer": BertModelLayer})
        model.summary()
    else:
        print('Commencing new training run')
        model_creator = BertModelCreator(model_dir=BERT_PRETRAINED_MODEL_DIR, tvu=tvu,
                                         max_seq_len=MAX_SEQ_LEN, freeze_bert_layer=True)
        model = model_creator.create_model()

    cp_filepath = BASE_DIR + 'ep_{epoch}_valacc_{val_accuracy:.5f}.h5'

    dev_loader = LabeledDataLoader(DEV_INPUT_FILENAME)
    dev_tweets = dev_loader.parse_raw_tokens_and_labels(dev_loader.load_lines())
    dev_tweets = tokenizer.convert_to_tokens(dev_tweets)
    dev_tweets = tokenizer.convert_to_ids(dev_tweets)
    dev_tweets = nn_input_preparer.filter_out_long_sequences(dev_tweets)
    print(f'processing all {len(dev_tweets)} not-too-long tweets from {DEV_INPUT_FILENAME}')

    dev_irregular_inputs = [[item[0] for item in tweet] for tweet in dev_tweets]
    # print('proportion of non-pad in dev:', sum(len(tweet) for tweet in tweets) / (MAX_SEQ_LEN * len(tweets)))
    dev_irregular_targets = [[item[1] for item in tweet] for tweet in dev_tweets]
    dev_rectangular_inputs = nn_input_preparer.rectangularize_inputs(dev_irregular_inputs)
    dev_rectangular_targets = nn_input_preparer.rectangularize_targets(dev_irregular_targets)
    dev_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(dev_rectangular_targets)

    checkpoint = ModelCheckpoint(cp_filepath, monitor='val_accuracy', verbose=1, save_best_only=False)

    model.fit(tr_rectangular_inputs, tr_targets_one_hot_encoded, batch_size=32,
              initial_epoch=INITIAL_EPOCH, epochs=MAX_EPOCHS,
              validation_data=(dev_rectangular_inputs, dev_targets_one_hot_encoded),
              callbacks=[checkpoint])

    model.save(FINAL_TRAINED_MODEL_FILENAME)

def main_inference(): print(f'Using TensorFlow version {tf.__version__}') print(f'Loading model {TRAINING_MODEL_FILENAME}') trained_model = load_model( TRAINING_MODEL_FILENAME, custom_objects={"BertModelLayer": BertModelLayer}) trained_model.summary() tvu = TargetVocabUtil() btc = BertTokenConverter(model_dir=BERT_PRETRAINED_MODEL_DIR, tvu=tvu) nn_input_preparer = NNInputPreparer(tvu=tvu, max_seq_len=MAX_SEQ_LEN) for input_filename in [DEV_INPUT_FILENAME]: loader = LabeledDataLoader(input_filename) tweets = loader.parse_tokens_and_labels(loader.load_lines()) tweets = btc.convert(tweets) tweets = btc.convert_to_ids(tweets) tweets = btc.prepend_cls(tweets) tweets = nn_input_preparer.filter_out_long_sequences(tweets) print( f'Processing all not-too-long {len(tweets)} tweets from {input_filename}' ) irregular_inputs = [tweet[0] for tweet in tweets] rectangular_targets = [tweet[1] for tweet in tweets] argmax_confusion_matrix = np.zeros( (tvu.get_output_vocab_size(), tvu.get_output_vocab_size()), dtype=int) expected_sampling_confusion_matrix = np.zeros( (tvu.get_output_vocab_size(), tvu.get_output_vocab_size())) num_correct_argmax_predictions = 0 expected_sampling_accuracy_sum = 0.0 for irregular_input, target_index in tqdm( zip(irregular_inputs, rectangular_targets)): rectangular_input_singleton = nn_input_preparer.rectangularize_inputs( [irregular_input]) predicted_probabilities = trained_model( rectangular_input_singleton)[0] # the predicted index if we take the class with the largest probability argmax_index = np.argmax(predicted_probabilities) if argmax_index == target_index: num_correct_argmax_predictions += 1 argmax_confusion_matrix[target_index][argmax_index] += 1 # rhs is the probability of guessing target if we sample according to predicted probabilities expected_sampling_accuracy_sum += tf.keras.backend.get_value( predicted_probabilities[target_index]) for i in range(tvu.get_output_vocab_size()): expected_sampling_confusion_matrix[target_index][ i] += predicted_probabilities[i] num_tweets_in_dataset = len(rectangular_targets) print(f'Argmax accuracy for {input_filename}:', num_correct_argmax_predictions / num_tweets_in_dataset) print(f'Expected sampling accuracy for {input_filename}:', expected_sampling_accuracy_sum / num_tweets_in_dataset) print( f"Argmax confusion matrix of targets vs predicted for {input_filename}:\n" f"{tvu.raw_sentiment_labels}\n", argmax_confusion_matrix) print( f"Expected sampling confusion matrix of targets vs predicted for {input_filename}:\n" f"{tvu.raw_sentiment_labels}\n", expected_sampling_confusion_matrix)
def main():
    print('tf:', tf.__version__)
    random.seed(42)

    raw_tweets = DataLoader(TRAINING_INPUT_FILENAME).load()
    cleaner = TweetCleaner()
    clean_tweets = [cleaner.clean_tweet(t) for t in raw_tweets]
    clean_tweets_as_lists = [list(t) for t in clean_tweets]
    print('number of clean_tweets_as_lists:', len(clean_tweets_as_lists))
    selector = TweetSelector(min_length=MIN_TWEET_LENGTH, max_length=MAX_TWEET_LENGTH)
    selected_tweets_as_lists = [t for t in clean_tweets_as_lists if selector.select(t)]
    print('number of selected_tweets_as_lists:', len(selected_tweets_as_lists))

    if CONTINUE_TRAINING:
        training_model = load_model(TRAINING_MODEL_FILENAME_TO_CONTINUE)
    else:
        model_creator = NNModelCreator(latent_dim=LATENT_DIM, dense_dim=DENSE_DIM)
        training_model = model_creator.create_training_model()

    nn_input_preparer = NNInputPreparer()
    num_generations_in_run = 0
    print(time.ctime())
    noiser = DisjointNoiser()

    for de_facto_epoch in range(INITIALLY_COMPLETED_DFEPOCH + 1, NUM_DE_FACTO_EPOCHS):
        gb_training = nn_input_preparer.get_batches(selected_tweets_as_lists, noiser,
                                                    GENERATOR_BATCH_SIZE)

        cp_filepath = BASE_DIR + f'dfepoch_{de_facto_epoch}_' + "{val_accuracy:.5f}.h5"
        checkpoint = ModelCheckpoint(cp_filepath, monitor='val_accuracy', verbose=1,
                                     save_best_only=True, mode='max')

        while True:
            try:
                noised_batch, originals_batch, originals_delayed_batch = next(gb_training)
                assert len(noised_batch) == GENERATOR_BATCH_SIZE
                print(noised_batch.shape, originals_batch.shape, originals_delayed_batch.shape)
                validation_split = 0.125
                fit_batch_size = 32
                # We take care here so as not to manifest the "Your input ran out of data" warning
                validation_steps = int(GENERATOR_BATCH_SIZE * validation_split) // fit_batch_size
                training_steps = GENERATOR_BATCH_SIZE // fit_batch_size - validation_steps
                training_model.fit([noised_batch, originals_delayed_batch], originals_batch,
                                   batch_size=fit_batch_size,
                                   steps_per_epoch=training_steps,
                                   epochs=1,
                                   validation_split=validation_split,
                                   validation_steps=validation_steps,
                                   callbacks=[checkpoint])
                # https://keras.io/api/models/model_training_apis/ says:
                # "The validation data is selected from the last samples in the ... data provided"
                # This means the model is never validated on tweets that we train it on.
            except StopIteration:
                break

            num_generations_in_run += 1
            print(f'num_generations: {num_generations_in_run}')
            print(time.ctime())

        print(f'End of de facto epoch {de_facto_epoch} - saving model')
        training_model.save(BASE_DIR + f'dfepoch_{de_facto_epoch}_end.h5')

    training_model.save(FINAL_TRAINED_MODEL_FILENAME)

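# Worked example of the step arithmetic above (GENERATOR_BATCH_SIZE is defined elsewhere in this
# script; 4096 here is a hypothetical value): with validation_split = 0.125 and fit_batch_size = 32,
#   validation_steps = int(4096 * 0.125) // 32 = 512 // 32 = 16
#   training_steps   = 4096 // 32 - 16        = 128 - 16   = 112
# so each generator batch is consumed in exactly 112 + 16 = 128 fit-batches and Keras never asks for
# data beyond what it was given, which is what avoids the "Your input ran out of data" warning.
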
def main_inference(): print(f'Using TensorFlow version {tf.__version__}') print(f'Loading model {TRAINING_MODEL_FILENAME}') trained_model = load_model( TRAINING_MODEL_FILENAME, custom_objects={"BertModelLayer": BertModelLayer}) print('Loaded fine-tuned model:') trained_model.summary() tvu = TargetVocabUtil() btc = BertTokenConverter(BERT_PRETRAINED_MODEL_DIR, tvu) nn_input_preparer = NNInputPreparer(tvu=tvu, max_seq_len=MAX_SEQ_LEN) for input_filename in [DEV_INPUT_FILENAME]: loader = LabeledDataLoader(input_filename) tweets = loader.parse_raw_tokens_and_labels(loader.load_lines()) tweets = btc.convert_to_tokens(tweets) tweets = btc.convert_to_ids(tweets) tweets = nn_input_preparer.filter_out_long_sequences(tweets) print( f'processing all {len(tweets)} not-too-long tweets from {input_filename}' ) irregular_inputs = [[item[0] for item in tweet] for tweet in tweets] irregular_targets = [[item[1] for item in tweet] for tweet in tweets] num_tokens_in_dataset = num_token_level_correct_argmax_predictions = \ num_token_level_correct_argmax_predictions_incl_pads = 0 tweet_level_argmax_accuracy_sum = tweet_level_expected_sampling_accuracy_sum = \ token_level_expected_sampling_accuracy_sum = token_level_expected_sampling_accuracy_sum_incl_pads = 0.0 for (irregular_input, irregular_target_indices) in tqdm( zip(irregular_inputs, irregular_targets)): rectangular_inputs = nn_input_preparer.rectangularize_inputs( [irregular_input]) rectangular_targets = nn_input_preparer.rectangularize_targets( [irregular_target_indices]) num_tokens_in_current_tweet = num_current_tweet_correct_argmax_predictions = 0 current_tweet_expected_sampling_accuracy_sum = 0.0 predicted_probabilities_sequence = trained_model( rectangular_inputs) for predicted_probabilities, target_index in zip( predicted_probabilities_sequence[0], rectangular_targets[0]): # the predicted index if we take the class with the largest probability argmax_index = np.argmax(predicted_probabilities) # probability of guessing target_index if we sample according to predicted probabilities prob_sampling_success_on_token = tf.keras.backend.get_value( predicted_probabilities[target_index]) if argmax_index == target_index: num_token_level_correct_argmax_predictions_incl_pads += 1 token_level_expected_sampling_accuracy_sum_incl_pads += prob_sampling_success_on_token if target_index != 0: if argmax_index == target_index: num_token_level_correct_argmax_predictions += 1 num_current_tweet_correct_argmax_predictions += 1 current_tweet_expected_sampling_accuracy_sum += prob_sampling_success_on_token token_level_expected_sampling_accuracy_sum += prob_sampling_success_on_token num_tokens_in_current_tweet += 1 num_tokens_in_dataset += 1 # every tweet has at least one non-padding token, so we don't worry about division by zero current_tweet_argmax_accuracy = num_current_tweet_correct_argmax_predictions / num_tokens_in_current_tweet current_tweet_expected_sampling_accuracy = \ current_tweet_expected_sampling_accuracy_sum / num_tokens_in_current_tweet tweet_level_argmax_accuracy_sum += current_tweet_argmax_accuracy tweet_level_expected_sampling_accuracy_sum += current_tweet_expected_sampling_accuracy num_tweets_in_dataset = len(tweets) num_tokens_in_dataset_incl_pads = MAX_SEQ_LEN * num_tweets_in_dataset print( f'Argmax accuracy for {input_filename} including padding:', num_token_level_correct_argmax_predictions_incl_pads / num_tokens_in_dataset_incl_pads) print( f'Expected sampling accuracy for {input_filename} including padding:', 
token_level_expected_sampling_accuracy_sum_incl_pads / num_tokens_in_dataset_incl_pads) print( f'Token-level argmax accuracy for {input_filename}:', num_token_level_correct_argmax_predictions / num_tokens_in_dataset) print( f'Token-level expected sampling accuracy for {input_filename}:', token_level_expected_sampling_accuracy_sum / num_tokens_in_dataset) print(f'Tweet-level argmax accuracy for {input_filename}:', tweet_level_argmax_accuracy_sum / num_tweets_in_dataset) print( f'Tweet-level expected sampling accuracy for {input_filename}:', tweet_level_expected_sampling_accuracy_sum / num_tweets_in_dataset)