def create_vocab_util_from_training_set(tr_input_filename: str) -> VocabUtil: """To keep things simple, we use the entire training set without filtering.""" tr_loader = LabeledDataLoader(tr_input_filename) tr_tweets = tr_loader.parse_tokens_and_labels(tr_loader.load_lines()) tr_unique_tokens = set([item[0] for tweet in tr_tweets for item in tweet]) sorted_tr_tokens = sorted(tr_unique_tokens) return VocabUtil(sorted_tr_tokens)
def create_vocab_util_from_training_set(tr_input_filename: str) -> VocabUtil:
    tr_loader = LabeledDataLoader(tr_input_filename)
    tr_labeled_tweets = tr_loader.parse_tokens_and_labels(tr_loader.load_lines())
    tr_unique_tokens = {item for labeled_tweet in tr_labeled_tweets for item in labeled_tweet[0]}
    tr_sorted_tokens = sorted(tr_unique_tokens)
    print(f"Creating VocabUtil from {len(tr_sorted_tokens)} unique tokens.")
    return VocabUtil(tr_sorted_tokens)

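# VocabUtil itself is not shown in these listings. The sketch below is only a guess at
# the mapping it presumably builds from the sorted tokens; the reserved padding index,
# the +1 offset, and the '<OOV>' entry are assumptions inferred from how
# vu.nn_input_token_to_int is used elsewhere, not the project's actual code.
class VocabUtilSketch:
    def __init__(self, sorted_tokens: list):
        # Reserve index 0 for padding; map every known token to a positive integer.
        self.nn_input_token_to_int = {token: i + 1 for i, token in enumerate(sorted_tokens)}
        # Reserve one more index for out-of-vocabulary tokens seen at validation/inference time.
        self.nn_input_token_to_int['<OOV>'] = len(sorted_tokens) + 1
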
def main_training():
    print(f'Using TensorFlow version {tf.__version__}')

    vu = create_vocab_util_from_training_set(TRAINING_INPUT_FILENAME)
    nn_input_preparer = NNInputPreparer(vu, max_seq_len=MAX_SEQ_LEN)

    tr_loader = LabeledDataLoader(TRAINING_INPUT_FILENAME)
    tr_tweets = tr_loader.parse_tokens_and_labels(tr_loader.load_lines())
    tr_tweets = nn_input_preparer.filter_out_long_sequences(tr_tweets)
    print(f'Training on {len(tr_tweets)} tweets, each no longer than {MAX_SEQ_LEN} tokens')

    tr_irregular_inputs = [[vu.nn_input_token_to_int[item[0]] for item in tweet] for tweet in tr_tweets]
    tr_irregular_targets = [[vu.nn_pos_to_int[item[1]] for item in tweet] for tweet in tr_tweets]
    tr_rectangular_inputs = nn_input_preparer.rectangularize_inputs(tr_irregular_inputs)
    tr_rectangular_targets = nn_input_preparer.rectangularize_targets(tr_irregular_targets)
    tr_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(tr_rectangular_targets)

    if CONTINUE_TRAINING:
        print('Continuing training from', TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model = load_model(TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model.summary()
    else:
        print('Commencing new training run')
        model_creator = LstmModelCreator(vu, embedding_dim=EMBEDDING_DIM, lstm_dim=LSTM_DIM, mask_zero=MASK_ZERO)
        model = model_creator.create_bi_lstm_model()

    cp_filepath = BASE_DIR + 'ep_{epoch}_valacc_{val_accuracy:.5f}.h5'
    checkpoint = ModelCheckpoint(cp_filepath, monitor='val_accuracy', verbose=1, save_best_only=False)

    rectangular_inputs, _, targets_one_hot_encoded = \
        prep_validation_set(DEV_INPUT_FILENAME, nn_input_preparer, vu)

    model.fit(x=tr_rectangular_inputs, y=tr_targets_one_hot_encoded, batch_size=32,
              initial_epoch=INITIAL_EPOCH, epochs=MAX_EPOCHS,
              validation_data=(rectangular_inputs, targets_one_hot_encoded),
              callbacks=[checkpoint])

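# The NNInputPreparer methods are not shown in these listings. Below is a minimal sketch
# of what rectangularize_inputs and rectangular_targets_to_one_hot presumably do
# (zero-padding to the fixed sequence length and one-hot encoding); the constants and
# function names are illustrative stand-ins, and the real implementation may differ.
import numpy as np
import tensorflow as tf

MAX_SEQ_LEN_SKETCH = 128   # stand-in for the project's MAX_SEQ_LEN
NUM_CLASSES_SKETCH = 18    # stand-in for the output vocabulary size


def rectangularize_inputs_sketch(irregular_inputs):
    """Zero-pad each token-id sequence to a fixed length, producing a rectangular array."""
    padded = np.zeros((len(irregular_inputs), MAX_SEQ_LEN_SKETCH), dtype=np.int32)
    for row, seq in enumerate(irregular_inputs):
        padded[row, :len(seq)] = seq
    return padded


def rectangular_targets_to_one_hot_sketch(rectangular_targets):
    """One-hot encode integer class indices."""
    return tf.keras.utils.to_categorical(rectangular_targets, num_classes=NUM_CLASSES_SKETCH)
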
def test_loads_expected_number_of_tweets(self):
    # The test data is not labeled, so we tune on this training set only.
    loader = LabeledDataLoader('../data/pos/train.conll')
    train_tweets_with_labeled_tokens = loader.parse_raw_tokens_and_labels(loader.load_lines())
    self.assertEqual(27893, len(train_tweets_with_labeled_tokens))

    # The test data is not labeled, so we use this dev set as the test set.
    loader = LabeledDataLoader('../data/pos/dev.conll')
    dev_tweets_with_labeled_tokens = loader.parse_raw_tokens_and_labels(loader.load_lines())
    self.assertEqual(4298, len(dev_tweets_with_labeled_tokens))

def prep_validation_set(input_filename: str, nn_input_preparer: NNInputPreparer, vu: VocabUtil,
                        upsample: bool):
    loader = LabeledDataLoader(input_filename)
    labeled_tweets = loader.parse_tokens_and_labels(loader.load_lines())
    labeled_tweets = nn_input_preparer.filter_out_long_tweets(labeled_tweets)
    if upsample:
        labeled_tweets = nn_input_preparer.crude_upsample(labeled_tweets)

    irregular_inputs = [[vu.nn_input_token_to_int[token] if token in vu.nn_input_token_to_int
                         else vu.nn_input_token_to_int['<OOV>']
                         for token in labeled_tweet[0]]
                        for labeled_tweet in labeled_tweets]
    rectangular_inputs = nn_input_preparer.rectangularize_inputs(irregular_inputs)
    rectangular_targets = [tweet[1] for tweet in labeled_tweets]
    targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(rectangular_targets)

    return rectangular_inputs, rectangular_targets, targets_one_hot_encoded

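# crude_upsample is not shown either. The sketch below is a plausible, clearly hypothetical
# reading of the name: duplicate tweets of under-represented labels until every label is
# roughly as frequent as the most frequent one. The actual NNInputPreparer.crude_upsample
# may balance the classes differently.
from collections import Counter


def crude_upsample_sketch(labeled_tweets):
    """Duplicate (tokens, label) pairs of rarer labels to roughly balance label counts."""
    label_counts = Counter(label for _, label in labeled_tweets)
    max_count = max(label_counts.values())
    upsampled = []
    for tokens, label in labeled_tweets:
        repeats = max(1, round(max_count / label_counts[label]))
        upsampled.extend([(tokens, label)] * repeats)
    return upsampled
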
def prep_validation_set(input_filename: str, nn_input_preparer: NNInputPreparer, vu: VocabUtil) \
        -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    loader = LabeledDataLoader(input_filename)
    tweets = loader.parse_tokens_and_labels(loader.load_lines())
    tweets = nn_input_preparer.filter_out_long_sequences(tweets)
    print(f'Processing all {len(tweets)} not-too-long tweets from {input_filename}')

    irregular_inputs = [[vu.nn_input_token_to_int[item[0]] if item[0] in vu.nn_input_token_to_int
                         else vu.nn_input_token_to_int['<OOV>']
                         for item in tweet]
                        for tweet in tweets]
    irregular_targets = [[vu.nn_pos_to_int[item[1]] for item in tweet] for tweet in tweets]
    rectangular_inputs = nn_input_preparer.rectangularize_inputs(irregular_inputs)
    rectangular_targets = nn_input_preparer.rectangularize_targets(irregular_targets)
    targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(rectangular_targets)

    return rectangular_inputs, rectangular_targets, targets_one_hot_encoded

def test_has_one_of_three_labels(self):
    # The test data is not labeled, so we tune on this training set only
    # and use the dev set as the test set.
    loader = LabeledDataLoader('../data/sa/train.conll')
    train_tweets_with_labels = loader.parse_tokens_and_labels(loader.load_lines())
    train_labels_set = {labeled_tweet[1] for labeled_tweet in train_tweets_with_labels}
    self.assertEqual({'positive', 'negative', 'neutral'}, train_labels_set)

    loader = LabeledDataLoader('../data/sa/dev.conll')
    dev_tweets_with_labels = loader.parse_tokens_and_labels(loader.load_lines())
    dev_labels_set = {labeled_tweet[1] for labeled_tweet in dev_tweets_with_labels}
    self.assertEqual({'positive', 'negative', 'neutral'}, dev_labels_set)

def test_loads_expected_number_of_tokens(self):
    # The test data is not labeled, so we tune on this training set only.
    loader = LabeledDataLoader('../data/pos/train.conll')
    train_tweets_with_labeled_tokens = loader.parse_tokens_and_labels(loader.load_lines())
    self.assertEqual(217068, sum(len(tweet) for tweet in train_tweets_with_labeled_tokens))

    # The test data is not labeled, so we use this dev set as the test set.
    loader = LabeledDataLoader('../data/pos/dev.conll')
    dev_tweets_with_labeled_tokens = loader.parse_tokens_and_labels(loader.load_lines())
    self.assertEqual(33345, sum(len(tweet) for tweet in dev_tweets_with_labeled_tokens))

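# LabeledDataLoader itself is not shown. The sketch below assumes a common CoNLL-style
# layout for the POS files (one "token<TAB>tag" pair per line, a blank line between
# tweets); the real file format and the project's parsing logic may differ.
from typing import List, Tuple


def parse_pos_conll_sketch(lines: List[str]) -> List[List[Tuple[str, str]]]:
    """Group (token, tag) pairs into tweets, assuming blank-line-separated CoNLL blocks."""
    tweets, current = [], []
    for line in lines:
        line = line.rstrip('\n')
        if not line:                   # a blank line ends the current tweet
            if current:
                tweets.append(current)
                current = []
            continue
        token, tag = line.split('\t')  # assumed two-column layout
        current.append((token, tag))
    if current:
        tweets.append(current)
    return tweets
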
def main_training():
    print(f'Using TensorFlow version {tf.__version__}')

    tvu = TargetVocabUtil()
    btc = BertTokenConverter(model_dir=BERT_PRETRAINED_MODEL_DIR, tvu=tvu)
    nn_input_preparer = NNInputPreparer(tvu=tvu, max_seq_len=MAX_SEQ_LEN)

    tr_loader = LabeledDataLoader(TR_INPUT_FILENAME)
    tr_labeled_tweets = tr_loader.parse_tokens_and_labels(tr_loader.load_lines())
    tr_labeled_tweets = btc.convert(tr_labeled_tweets)
    tr_labeled_tweets = btc.convert_to_ids(tr_labeled_tweets)
    tr_labeled_tweets = btc.prepend_cls(tr_labeled_tweets)
    tr_labeled_tweets = nn_input_preparer.filter_out_long_sequences(tr_labeled_tweets)
    print(f'Processing all {len(tr_labeled_tweets)} not-too-long tweets from {TR_INPUT_FILENAME}')

    tr_irregular_inputs = [tweet[0] for tweet in tr_labeled_tweets]
    tr_rectangular_targets = [tweet[1] for tweet in tr_labeled_tweets]
    tr_rectangular_inputs = nn_input_preparer.rectangularize_inputs(tr_irregular_inputs)
    tr_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(tr_rectangular_targets)

    if CONTINUE_TRAINING:
        print('Continuing training from', TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model = load_model(TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model.summary()
    else:
        print('Commencing new training run')
        model_creator = BertModelCreator(model_dir=BERT_PRETRAINED_MODEL_DIR, tvu=tvu, max_seq_len=MAX_SEQ_LEN)
        model = model_creator.create_model()

    cp_filepath = BASE_DIR + 'ep_{epoch}_valacc_{val_accuracy:.5f}.h5'
    checkpoint = ModelCheckpoint(cp_filepath, monitor='val_accuracy', verbose=1, save_best_only=False)

    dev_loader = LabeledDataLoader(DEV_INPUT_FILENAME)
    dev_labeled_tweets = dev_loader.parse_tokens_and_labels(dev_loader.load_lines())
    dev_labeled_tweets = btc.convert(dev_labeled_tweets)
    dev_labeled_tweets = btc.convert_to_ids(dev_labeled_tweets)
    dev_labeled_tweets = btc.prepend_cls(dev_labeled_tweets)
    dev_labeled_tweets = nn_input_preparer.filter_out_long_sequences(dev_labeled_tweets)
    print(f'Processing all {len(dev_labeled_tweets)} not-too-long tweets from {DEV_INPUT_FILENAME}')

    dev_irregular_inputs = [tweet[0] for tweet in dev_labeled_tweets]
    dev_rectangular_targets = [tweet[1] for tweet in dev_labeled_tweets]
    dev_rectangular_inputs = nn_input_preparer.rectangularize_inputs(dev_irregular_inputs)
    dev_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(dev_rectangular_targets)

    model.fit(tr_rectangular_inputs, tr_targets_one_hot_encoded, batch_size=32,
              initial_epoch=INITIAL_EPOCH, epochs=MAX_EPOCHS,
              validation_data=(dev_rectangular_inputs, dev_targets_one_hot_encoded),
              callbacks=[checkpoint])

    model.save(FINAL_TRAINED_MODEL_FILENAME)

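# prepend_cls is not shown. For single-sequence BERT classification the input
# conventionally starts with the [CLS] token, so the method presumably prepends that
# token's id to every tweet. A minimal sketch under that assumption; cls_token_id is a
# hypothetical stand-in for whatever id BertTokenConverter actually uses.
def prepend_cls_sketch(labeled_tweets, cls_token_id: int):
    """Prepend the [CLS] id to each (token_ids, label) pair."""
    return [([cls_token_id] + token_ids, label) for token_ids, label in labeled_tweets]
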
def main_training():
    print(f'Using TensorFlow version {tf.__version__}')

    tr_loader = LabeledDataLoader(TRAINING_INPUT_FILENAME)
    tr_tweets = tr_loader.parse_raw_tokens_and_labels(tr_loader.load_lines())

    tvu = TargetVocabUtil()
    tokenizer = BertTokenConverter(model_dir=BERT_PRETRAINED_MODEL_DIR, tvu=tvu)
    tr_tweets = tokenizer.convert_to_tokens(tr_tweets)
    tr_tweets = tokenizer.convert_to_ids(tr_tweets)

    nn_input_preparer = NNInputPreparer(tvu, max_seq_len=MAX_SEQ_LEN)
    tr_tweets = nn_input_preparer.filter_out_long_sequences(tr_tweets)
    print(f'Training on {len(tr_tweets)} tweets, each no longer than {MAX_SEQ_LEN} tokens')

    tr_irregular_inputs = [[item[0] for item in tweet] for tweet in tr_tweets]
    tr_irregular_targets = [[item[1] for item in tweet] for tweet in tr_tweets]
    tr_rectangular_inputs = nn_input_preparer.rectangularize_inputs(tr_irregular_inputs)
    tr_rectangular_targets = nn_input_preparer.rectangularize_targets(tr_irregular_targets)
    tr_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(tr_rectangular_targets)

    if CONTINUE_TRAINING:
        print('Continuing training from', TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model = load_model(TRAINING_MODEL_FILENAME_TO_CONTINUE,
                           custom_objects={"BertModelLayer": BertModelLayer})
        model.summary()
    else:
        print('Commencing new training run')
        model_creator = BertModelCreator(model_dir=BERT_PRETRAINED_MODEL_DIR, tvu=tvu,
                                         max_seq_len=MAX_SEQ_LEN, freeze_bert_layer=True)
        model = model_creator.create_model()

    cp_filepath = BASE_DIR + 'ep_{epoch}_valacc_{val_accuracy:.5f}.h5'

    dev_loader = LabeledDataLoader(DEV_INPUT_FILENAME)
    dev_tweets = dev_loader.parse_raw_tokens_and_labels(dev_loader.load_lines())
    dev_tweets = tokenizer.convert_to_tokens(dev_tweets)
    dev_tweets = tokenizer.convert_to_ids(dev_tweets)
    dev_tweets = nn_input_preparer.filter_out_long_sequences(dev_tweets)
    print(f'processing all {len(dev_tweets)} not-too-long tweets from {DEV_INPUT_FILENAME}')

    dev_irregular_inputs = [[item[0] for item in tweet] for tweet in dev_tweets]
    # print('proportion of non-pad in dev:', sum(len(tweet) for tweet in tweets) / (MAX_SEQ_LEN * len(tweets)))
    dev_irregular_targets = [[item[1] for item in tweet] for tweet in dev_tweets]
    dev_rectangular_inputs = nn_input_preparer.rectangularize_inputs(dev_irregular_inputs)
    dev_rectangular_targets = nn_input_preparer.rectangularize_targets(dev_irregular_targets)
    dev_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(dev_rectangular_targets)

    checkpoint = ModelCheckpoint(cp_filepath, monitor='val_accuracy', verbose=1, save_best_only=False)

    model.fit(tr_rectangular_inputs, tr_targets_one_hot_encoded, batch_size=32,
              initial_epoch=INITIAL_EPOCH, epochs=MAX_EPOCHS,
              validation_data=(dev_rectangular_inputs, dev_targets_one_hot_encoded),
              callbacks=[checkpoint])

    model.save(FINAL_TRAINED_MODEL_FILENAME)

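# BertModelCreator is not shown in these listings. freeze_bert_layer=True presumably
# marks the pretrained BERT layer as non-trainable so that only the task head is updated
# during fine-tuning. A minimal, generic Keras sketch of that idea; the name-based match
# is illustrative and not the project's actual mechanism.
import tensorflow as tf


def freeze_layers_by_name_sketch(model: tf.keras.Model, name_fragment: str = 'bert') -> tf.keras.Model:
    """Set matching layers to non-trainable; re-compile the model afterwards for this to take effect."""
    for layer in model.layers:
        if name_fragment in layer.name.lower():
            layer.trainable = False
    return model
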
def main_inference():
    print(f'Using TensorFlow version {tf.__version__}')
    print(f'Loading model {TRAINING_MODEL_FILENAME}')
    trained_model = load_model(TRAINING_MODEL_FILENAME,
                               custom_objects={"BertModelLayer": BertModelLayer})
    trained_model.summary()

    tvu = TargetVocabUtil()
    btc = BertTokenConverter(model_dir=BERT_PRETRAINED_MODEL_DIR, tvu=tvu)
    nn_input_preparer = NNInputPreparer(tvu=tvu, max_seq_len=MAX_SEQ_LEN)

    for input_filename in [DEV_INPUT_FILENAME]:
        loader = LabeledDataLoader(input_filename)
        tweets = loader.parse_tokens_and_labels(loader.load_lines())
        tweets = btc.convert(tweets)
        tweets = btc.convert_to_ids(tweets)
        tweets = btc.prepend_cls(tweets)
        tweets = nn_input_preparer.filter_out_long_sequences(tweets)
        print(f'Processing all {len(tweets)} not-too-long tweets from {input_filename}')

        irregular_inputs = [tweet[0] for tweet in tweets]
        rectangular_targets = [tweet[1] for tweet in tweets]

        argmax_confusion_matrix = np.zeros(
            (tvu.get_output_vocab_size(), tvu.get_output_vocab_size()), dtype=int)
        expected_sampling_confusion_matrix = np.zeros(
            (tvu.get_output_vocab_size(), tvu.get_output_vocab_size()))
        num_correct_argmax_predictions = 0
        expected_sampling_accuracy_sum = 0.0

        for irregular_input, target_index in tqdm(zip(irregular_inputs, rectangular_targets)):
            rectangular_input_singleton = nn_input_preparer.rectangularize_inputs([irregular_input])
            predicted_probabilities = trained_model(rectangular_input_singleton)[0]
            # the predicted index if we take the class with the largest probability
            argmax_index = np.argmax(predicted_probabilities)
            if argmax_index == target_index:
                num_correct_argmax_predictions += 1
            argmax_confusion_matrix[target_index][argmax_index] += 1
            # rhs is the probability of guessing the target if we sample according to predicted probabilities
            expected_sampling_accuracy_sum += tf.keras.backend.get_value(
                predicted_probabilities[target_index])
            for i in range(tvu.get_output_vocab_size()):
                expected_sampling_confusion_matrix[target_index][i] += predicted_probabilities[i]

        num_tweets_in_dataset = len(rectangular_targets)
        print(f'Argmax accuracy for {input_filename}:',
              num_correct_argmax_predictions / num_tweets_in_dataset)
        print(f'Expected sampling accuracy for {input_filename}:',
              expected_sampling_accuracy_sum / num_tweets_in_dataset)
        print(f"Argmax confusion matrix of targets vs predicted for {input_filename}:\n"
              f"{tvu.raw_sentiment_labels}\n", argmax_confusion_matrix)
        print(f"Expected sampling confusion matrix of targets vs predicted for {input_filename}:\n"
              f"{tvu.raw_sentiment_labels}\n", expected_sampling_confusion_matrix)

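# For reference: "expected sampling accuracy" above is the probability of guessing the
# target label when sampling a prediction from the model's output distribution, i.e. the
# predicted probability mass on the true class. A tiny worked example with made-up numbers:
import numpy as np

predicted_probabilities_example = np.array([0.2, 0.7, 0.1])  # hypothetical softmax output
target_index_example = 1
argmax_is_correct = int(np.argmax(predicted_probabilities_example) == target_index_example)  # 1
expected_sampling_accuracy_example = predicted_probabilities_example[target_index_example]   # 0.7
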
def main_inference():
    print(f'Using TensorFlow version {tf.__version__}')
    print(f'Loading model {TRAINING_MODEL_FILENAME}')
    trained_model = load_model(TRAINING_MODEL_FILENAME,
                               custom_objects={"BertModelLayer": BertModelLayer})
    print('Loaded fine-tuned model:')
    trained_model.summary()

    tvu = TargetVocabUtil()
    btc = BertTokenConverter(BERT_PRETRAINED_MODEL_DIR, tvu)
    nn_input_preparer = NNInputPreparer(tvu=tvu, max_seq_len=MAX_SEQ_LEN)

    for input_filename in [DEV_INPUT_FILENAME]:
        loader = LabeledDataLoader(input_filename)
        tweets = loader.parse_raw_tokens_and_labels(loader.load_lines())
        tweets = btc.convert_to_tokens(tweets)
        tweets = btc.convert_to_ids(tweets)
        tweets = nn_input_preparer.filter_out_long_sequences(tweets)
        print(f'processing all {len(tweets)} not-too-long tweets from {input_filename}')

        irregular_inputs = [[item[0] for item in tweet] for tweet in tweets]
        irregular_targets = [[item[1] for item in tweet] for tweet in tweets]

        num_tokens_in_dataset = num_token_level_correct_argmax_predictions = \
            num_token_level_correct_argmax_predictions_incl_pads = 0
        tweet_level_argmax_accuracy_sum = tweet_level_expected_sampling_accuracy_sum = \
            token_level_expected_sampling_accuracy_sum = token_level_expected_sampling_accuracy_sum_incl_pads = 0.0

        for irregular_input, irregular_target_indices in tqdm(zip(irregular_inputs, irregular_targets)):
            rectangular_inputs = nn_input_preparer.rectangularize_inputs([irregular_input])
            rectangular_targets = nn_input_preparer.rectangularize_targets([irregular_target_indices])
            num_tokens_in_current_tweet = num_current_tweet_correct_argmax_predictions = 0
            current_tweet_expected_sampling_accuracy_sum = 0.0

            predicted_probabilities_sequence = trained_model(rectangular_inputs)
            for predicted_probabilities, target_index in zip(predicted_probabilities_sequence[0],
                                                             rectangular_targets[0]):
                # the predicted index if we take the class with the largest probability
                argmax_index = np.argmax(predicted_probabilities)
                # probability of guessing target_index if we sample according to predicted probabilities
                prob_sampling_success_on_token = tf.keras.backend.get_value(
                    predicted_probabilities[target_index])
                if argmax_index == target_index:
                    num_token_level_correct_argmax_predictions_incl_pads += 1
                token_level_expected_sampling_accuracy_sum_incl_pads += prob_sampling_success_on_token

                if target_index != 0:
                    if argmax_index == target_index:
                        num_token_level_correct_argmax_predictions += 1
                        num_current_tweet_correct_argmax_predictions += 1
                    current_tweet_expected_sampling_accuracy_sum += prob_sampling_success_on_token
                    token_level_expected_sampling_accuracy_sum += prob_sampling_success_on_token
                    num_tokens_in_current_tweet += 1
                    num_tokens_in_dataset += 1

            # every tweet has at least one non-padding token, so we don't worry about division by zero
            current_tweet_argmax_accuracy = num_current_tweet_correct_argmax_predictions / num_tokens_in_current_tweet
            current_tweet_expected_sampling_accuracy = \
                current_tweet_expected_sampling_accuracy_sum / num_tokens_in_current_tweet
            tweet_level_argmax_accuracy_sum += current_tweet_argmax_accuracy
            tweet_level_expected_sampling_accuracy_sum += current_tweet_expected_sampling_accuracy

        num_tweets_in_dataset = len(tweets)
        num_tokens_in_dataset_incl_pads = MAX_SEQ_LEN * num_tweets_in_dataset
        print(f'Argmax accuracy for {input_filename} including padding:',
              num_token_level_correct_argmax_predictions_incl_pads / num_tokens_in_dataset_incl_pads)
        print(f'Expected sampling accuracy for {input_filename} including padding:',
              token_level_expected_sampling_accuracy_sum_incl_pads / num_tokens_in_dataset_incl_pads)
        print(f'Token-level argmax accuracy for {input_filename}:',
              num_token_level_correct_argmax_predictions / num_tokens_in_dataset)
        print(f'Token-level expected sampling accuracy for {input_filename}:',
              token_level_expected_sampling_accuracy_sum / num_tokens_in_dataset)
        print(f'Tweet-level argmax accuracy for {input_filename}:',
              tweet_level_argmax_accuracy_sum / num_tweets_in_dataset)
        print(f'Tweet-level expected sampling accuracy for {input_filename}:',
              tweet_level_expected_sampling_accuracy_sum / num_tweets_in_dataset)

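# The metrics above average differently: token-level accuracy pools all non-pad tokens
# (a micro average), while tweet-level accuracy averages the per-tweet accuracies
# (a macro average over tweets). A tiny illustration with made-up counts:
correct_per_tweet = [9, 1]    # correct non-pad predictions in each of two tweets
tokens_per_tweet = [10, 2]    # non-pad tokens in each tweet
token_level_accuracy = sum(correct_per_tweet) / sum(tokens_per_tweet)  # 10/12 ~= 0.833
tweet_level_accuracy = sum(c / t for c, t in zip(correct_per_tweet, tokens_per_tweet)) / len(tokens_per_tweet)  # (0.9 + 0.5) / 2 = 0.7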