Example #1
def main_training():
    lexicon_loader = LexiconLoader()
    scored_lexicon: dict = lexicon_loader.load_all_and_merge()
    tr_tweets_loader = LabeledTweetsLoader(TRAINING_INPUT_FILENAME)
    tr_labeled_tweets = tr_tweets_loader.parse_tokens_and_labels(
        tr_tweets_loader.load_lines())

    token_summarizer = TokenSummarizer(scored_lexicon)
    feature_extractor = FeatureExtractor(scored_lexicon)

    vu = VocabUtil()
    nn_input_preparer = NNInputPreparer(vu)

    tr_feature_vectors = []  # list of feature vectors; converted to a 2D array below
    for labeled_tweet in tr_labeled_tweets:
        known_token_sequence = token_summarizer.get_known_tokens(
            labeled_tweet[0])
        feature_vector = feature_extractor.compute_feature_vector(
            known_token_sequence)
        tr_feature_vectors.append(feature_vector)
    tr_network_input = np.array(tr_feature_vectors)
    tr_targets = [labeled_tweet[1] for labeled_tweet in tr_labeled_tweets]
    tr_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        tr_targets)

    dev_tweets_loader = LabeledTweetsLoader(DEV_INPUT_FILENAME)
    dev_labeled_tweets = dev_tweets_loader.parse_tokens_and_labels(
        dev_tweets_loader.load_lines())
    dev_feature_vectors = []  # list of feature vectors; converted to a 2D array below
    for labeled_tweet in dev_labeled_tweets:
        known_token_sequence = token_summarizer.get_known_tokens(
            labeled_tweet[0])
        feature_vector = feature_extractor.compute_feature_vector(
            known_token_sequence)
        dev_feature_vectors.append(feature_vector)
    dev_network_input = np.array(dev_feature_vectors)
    dev_targets = [labeled_tweet[1] for labeled_tweet in dev_labeled_tweets]
    dev_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        dev_targets)

    # Every epoch is cheap (< 1ms), so we don't need the ability to continue training from a previous model.
    print("Commencing new training run")
    model_creator = ModelCreator(vu)
    model = model_creator.create_two_dense_model(hidden_layer_size=HIDDEN_SIZE)

    cp_filepath = BASE_DIR + 'ep_{epoch}_valacc_{val_accuracy:.5f}.h5'
    checkpoint = ModelCheckpoint(cp_filepath,
                                 monitor='val_accuracy',
                                 verbose=1,
                                 save_best_only=False)

    model.fit(tr_network_input,
              tr_targets_one_hot_encoded,
              batch_size=32,
              epochs=MAX_EPOCHS,
              validation_data=(dev_network_input, dev_targets_one_hot_encoded),
              callbacks=[checkpoint])
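The checkpoint path in this example relies on Keras filling in the filepath template with the current epoch number and the logged val_accuracy after each epoch. Below is a minimal, self-contained sketch of the same callback pattern; the toy two-layer model, the random 3-dimensional inputs, and the three output classes are illustrative stand-ins for ModelCreator.create_two_dense_model and the lexicon features, not the project's actual code.

import numpy as np
import tensorflow as tf

# Toy stand-ins for the 3-dimensional lexicon feature vectors and three sentiment classes.
x = np.random.rand(200, 3).astype('float32')
y = tf.keras.utils.to_categorical(np.random.randint(0, 3, size=200), num_classes=3)

model = tf.keras.Sequential([
    tf.keras.Input(shape=(3,)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Keras substitutes {epoch} and {val_accuracy} when it saves at the end of each epoch.
checkpoint = tf.keras.callbacks.ModelCheckpoint('ep_{epoch}_valacc_{val_accuracy:.5f}.h5',
                                                monitor='val_accuracy',
                                                verbose=1,
                                                save_best_only=False)
model.fit(x, y, batch_size=32, epochs=3, validation_split=0.2, callbacks=[checkpoint])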
Example #2
def prep_validation_set(input_filename: str,
                        nn_input_preparer: NNInputPreparer, vu: VocabUtil,
                        upsample: bool):
    loader = LabeledDataLoader(input_filename)
    labeled_tweets = loader.parse_tokens_and_labels(loader.load_lines())
    labeled_tweets = nn_input_preparer.filter_out_long_tweets(labeled_tweets)
    if upsample:
        labeled_tweets = nn_input_preparer.crude_upsample(labeled_tweets)
    irregular_inputs = [[
        vu.nn_input_token_to_int[token] if token in vu.nn_input_token_to_int
        else vu.nn_input_token_to_int['<OOV>'] for token in labeled_tweet[0]
    ] for labeled_tweet in labeled_tweets]
    rectangular_inputs = nn_input_preparer.rectangularize_inputs(
        irregular_inputs)
    rectangular_targets = [tweet[1] for tweet in labeled_tweets]
    targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        rectangular_targets)

    return rectangular_inputs, rectangular_targets, targets_one_hot_encoded
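The nested comprehension above maps each token to its integer id and falls back to the '<OOV>' id for unseen tokens. The same lookup reads a little more clearly with dict.get; the tiny vocabulary below is made up, while the real mapping comes from VocabUtil.nn_input_token_to_int.

# Hypothetical token-to-id mapping; the real one is built by VocabUtil.
nn_input_token_to_int = {'<PAD>': 0, '<OOV>': 1, 'good': 2, 'movie': 3}

tweet_tokens = ['good', 'movie', 'tbh']  # 'tbh' is out of vocabulary
oov_id = nn_input_token_to_int['<OOV>']
token_ids = [nn_input_token_to_int.get(token, oov_id) for token in tweet_tokens]
assert token_ids == [2, 3, 1]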
Example #3
def prep_validation_set(input_filename: str, nn_input_preparer: NNInputPreparer, vu: VocabUtil) \
        -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    loader = LabeledDataLoader(input_filename)
    tweets = loader.parse_tokens_and_labels(loader.load_lines())
    tweets = nn_input_preparer.filter_out_long_sequences(tweets)
    print(
        f'processing all {len(tweets)} not-too-long tweets from {input_filename}'
    )
    irregular_inputs = [[
        vu.nn_input_token_to_int[item[0]] if item[0]
        in vu.nn_input_token_to_int else vu.nn_input_token_to_int['<OOV>']
        for item in tweet
    ] for tweet in tweets]
    irregular_targets = [[vu.nn_pos_to_int[item[1]] for item in tweet]
                         for tweet in tweets]
    rectangular_inputs = nn_input_preparer.rectangularize_inputs(
        irregular_inputs)
    rectangular_targets = nn_input_preparer.rectangularize_targets(
        irregular_targets)
    targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        rectangular_targets)
    return rectangular_inputs, rectangular_targets, targets_one_hot_encoded
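rectangularize_inputs, rectangularize_targets, and rectangular_targets_to_one_hot are project helpers whose implementations are not shown here. A plausible minimal equivalent, assuming they pad ragged id sequences to MAX_SEQ_LEN with zeros and one-hot encode the padded targets, can be written with stock Keras utilities (the sizes and id values below are illustrative):

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

MAX_SEQ_LEN = 6   # illustrative value
NUM_POS_TAGS = 5  # illustrative output vocabulary size; index 0 reserved for padding

irregular_inputs = [[4, 9, 7], [12, 3]]   # ragged token-id sequences
irregular_targets = [[1, 2, 1], [3, 4]]   # ragged POS-tag-id sequences

rectangular_inputs = pad_sequences(irregular_inputs, maxlen=MAX_SEQ_LEN, padding='post', value=0)
rectangular_targets = pad_sequences(irregular_targets, maxlen=MAX_SEQ_LEN, padding='post', value=0)
targets_one_hot_encoded = to_categorical(rectangular_targets, num_classes=NUM_POS_TAGS)

print(rectangular_inputs.shape)        # (2, 6)
print(targets_one_hot_encoded.shape)   # (2, 6, 5)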
Example #4
def main_training():
    print(f'Using TensorFlow version {tf.__version__}')

    vu = create_vocab_util_from_training_set(TRAINING_INPUT_FILENAME)
    nn_input_preparer = NNInputPreparer(vu, max_seq_len=MAX_SEQ_LEN)

    tr_loader = LabeledDataLoader(TRAINING_INPUT_FILENAME)
    tr_tweets = tr_loader.parse_tokens_and_labels(tr_loader.load_lines())
    tr_tweets = nn_input_preparer.filter_out_long_sequences(tr_tweets)
    print(
        f'Training on {len(tr_tweets)} tweets, each no longer than {MAX_SEQ_LEN} tokens'
    )
    tr_irregular_inputs = [[
        vu.nn_input_token_to_int[item[0]] for item in tweet
    ] for tweet in tr_tweets]
    tr_irregular_targets = [[vu.nn_pos_to_int[item[1]] for item in tweet]
                            for tweet in tr_tweets]
    tr_rectangular_inputs = nn_input_preparer.rectangularize_inputs(
        tr_irregular_inputs)
    tr_rectangular_targets = nn_input_preparer.rectangularize_targets(
        tr_irregular_targets)
    tr_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        tr_rectangular_targets)

    if CONTINUE_TRAINING:
        print('Continuing training from', TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model = load_model(TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model.summary()
    else:
        print("Commencing new training run")
        model_creator = LstmModelCreator(vu,
                                         embedding_dim=EMBEDDING_DIM,
                                         lstm_dim=LSTM_DIM,
                                         mask_zero=MASK_ZERO)
        model = model_creator.create_bi_lstm_model()

    cp_filepath = BASE_DIR + 'ep_{epoch}_valacc_{val_accuracy:.5f}.h5'

    checkpoint = ModelCheckpoint(cp_filepath,
                                 monitor='val_accuracy',
                                 verbose=1,
                                 save_best_only=False)

    rectangular_inputs, _, targets_one_hot_encoded = \
        prep_validation_set(DEV_INPUT_FILENAME, nn_input_preparer, vu)

    model.fit(x=tr_rectangular_inputs,
              y=tr_targets_one_hot_encoded,
              batch_size=32,
              initial_epoch=INITIAL_EPOCH,
              epochs=MAX_EPOCHS,
              validation_data=(rectangular_inputs, targets_one_hot_encoded),
              callbacks=[checkpoint])
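create_bi_lstm_model is likewise not shown. A common Keras architecture that fits the inputs prepared above (padded token ids in, per-token one-hot POS targets out) is an Embedding layer with mask_zero=True feeding a bidirectional LSTM with return_sequences=True and a per-timestep softmax; the sketch below assumes that shape of model and uses made-up sizes, so it is only an approximation of what LstmModelCreator builds.

import tensorflow as tf

MAX_SEQ_LEN = 50    # illustrative
VOCAB_SIZE = 10000  # illustrative input vocabulary size (including padding and <OOV>)
NUM_POS_TAGS = 20   # illustrative output vocabulary size
EMBEDDING_DIM = 128
LSTM_DIM = 256

model = tf.keras.Sequential([
    # mask_zero=True lets downstream layers ignore the 0-padded positions
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(LSTM_DIM, return_sequences=True)),
    tf.keras.layers.Dense(NUM_POS_TAGS, activation='softmax'),  # applied per timestep
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.build(input_shape=(None, MAX_SEQ_LEN))
model.summary()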
Example #5
def main():
    print('tf:', tf.__version__)
    print('TRAINING_MODEL_FILENAME =', TRAINING_MODEL_FILENAME)

    nn_input_preparer = NNInputPreparer()

    model_creator = NNModelCreator(latent_dim=LATENT_DIM, dense_dim=DENSE_DIM)
    loaded_training_model = load_model(TRAINING_MODEL_FILENAME)

    encoder_model, decoder_model = model_creator.derive_inference_models(
        loaded_training_model)
    inference_runner = InferenceRunner(encoder_model=encoder_model,
                                       decoder_model=decoder_model)

    cleaner = TweetCleaner()
    selector = TweetSelector(min_length=MIN_TWEET_LENGTH,
                             max_length=MAX_TWEET_LENGTH)
    noiser = DisjointNoiser()

    for input_filename in [TRAINING_INPUT_FILENAME, DEV_INPUT_FILENAME]:
        k = 10
        print(
            f'processing the first {k} selected tweets from {input_filename}')
        raw_tweets = DataLoader(input_filename).load()
        clean_tweets = [cleaner.clean_tweet(t) for t in raw_tweets]
        clean_tweets_as_lists = [list(t) for t in clean_tweets]
        selected_tweets_as_lists = [
            t for t in clean_tweets_as_lists if selector.select(t)
        ]
        gb_inference = nn_input_preparer.get_batches(selected_tweets_as_lists,
                                                     noiser,
                                                     batch_size=1)
        for i in range(k):
            noised_batch, originals_batch, original_delayed_batch = next(
                gb_inference)
            print('[noised    ]',
                  nn_input_preparer.decode_tweet(noised_batch[0]))
            print('[original  ]',
                  nn_input_preparer.decode_tweet(originals_batch[0]))
            print('[original 2]', ''.join(selected_tweets_as_lists[i]))
            print('[or-delayed]',
                  nn_input_preparer.decode_tweet(original_delayed_batch[0]))
            decoded_tweet = inference_runner.decode_sequence(noised_batch)
            print('[decoded   ]', decoded_tweet)
            print()
Example #6
def main_inference():
    print(f'Using TensorFlow version {tf.__version__}')

    print(f'Loading model {TRAINING_MODEL_FILENAME}')
    trained_model = load_model(TRAINING_MODEL_FILENAME)
    trained_model.summary()

    lexicon_loader = LexiconLoader()
    scored_lexicon: dict = lexicon_loader.load_all_and_merge()
    token_summarizer = TokenSummarizer(scored_lexicon)
    feature_extractor = FeatureExtractor(scored_lexicon)
    vu = VocabUtil()
    nn_input_preparer = NNInputPreparer(vu)

    for input_filename in [DEV_INPUT_FILENAME]:
        tweets_loader = LabeledTweetsLoader(input_filename)
        labeled_tweets = tweets_loader.parse_tokens_and_labels(
            tweets_loader.load_lines())
        feature_vectors = []  # list of feature vectors; converted to a 2D array below
        for labeled_tweet in labeled_tweets:
            known_token_sequence = token_summarizer.get_known_tokens(
                labeled_tweet[0])
            feature_vector = feature_extractor.compute_feature_vector(
                known_token_sequence)
            feature_vectors.append(feature_vector)
        network_input = np.array(feature_vectors)
        print('network_input.shape:', network_input.shape)
        targets = [labeled_tweet[1] for labeled_tweet in labeled_tweets]
        targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
            targets)

        trained_model.evaluate(network_input, targets_one_hot_encoded)

        argmax_confusion_matrix = np.zeros(
            (vu.get_output_vocab_size(), vu.get_output_vocab_size()),
            dtype=int)
        expected_sampling_confusion_matrix = np.zeros(
            (vu.get_output_vocab_size(), vu.get_output_vocab_size()))

        expected_sampling_accuracy_sum = 0.0
        num_correct_argmax_predictions = 0
        for rectangular_input, target_human in tqdm(zip(
                network_input, targets)):
            rectangular_input.shape = (1, 3)
            target_index = vu.nn_rsl_to_int[target_human]
            predicted_probabilities = trained_model(rectangular_input)[0]
            # the predicted index if we take the class with the largest probability
            argmax_index = np.argmax(predicted_probabilities)
            if argmax_index == target_index:
                num_correct_argmax_predictions += 1
            argmax_confusion_matrix[target_index][argmax_index] += 1
            # rhs is the probability of guessing target_index if we sample according to predicted probabilities
            expected_sampling_accuracy_sum += tf.keras.backend.get_value(
                predicted_probabilities[target_index])
            for i in range(vu.get_output_vocab_size()):
                expected_sampling_confusion_matrix[target_index][
                    i] += predicted_probabilities[i]
        num_tweets_in_dataset = len(targets)

        print(f'Argmax accuracy for {input_filename}:',
              num_correct_argmax_predictions / num_tweets_in_dataset)
        print(f'Expected sampling accuracy for {input_filename}:',
              expected_sampling_accuracy_sum / num_tweets_in_dataset)

        print(
            f"Argmax confusion matrix of targets vs predicted for {input_filename}:\n"
            f"{vu.raw_sentiment_labels}\n", argmax_confusion_matrix)
        print(
            f"Expected sampling confusion matrix of targets vs predicted for {input_filename}:\n"
            f"{vu.raw_sentiment_labels}\n", expected_sampling_confusion_matrix)
Example #7
def main_inference():
    print(f'Using TensorFlow version {tf.__version__}')

    print(f'Loading model {TRAINING_MODEL_FILENAME}')
    trained_model = load_model(TRAINING_MODEL_FILENAME)
    trained_model.summary()

    vu = create_vocab_util_from_training_set(TRAINING_INPUT_FILENAME)
    nn_input_preparer = NNInputPreparer(vu, MAX_SEQ_LEN)

    for input_filename in [DEV_INPUT_FILENAME]:
        rectangular_inputs, rectangular_targets, targets_one_hot_encoded = \
            prep_validation_set(input_filename, nn_input_preparer, vu, UPSAMPLE)

        trained_model.evaluate(rectangular_inputs,
                               targets_one_hot_encoded,
                               batch_size=32)

        argmax_confusion_matrix = np.zeros(
            (vu.get_output_vocab_size(), vu.get_output_vocab_size()),
            dtype=int)
        expected_sampling_confusion_matrix = np.zeros(
            (vu.get_output_vocab_size(), vu.get_output_vocab_size()))

        expected_sampling_accuracy_sum = 0.0
        num_correct_argmax_predictions = 0
        it = 0
        for rectangular_input, target_human in tqdm(
                zip(rectangular_inputs, rectangular_targets)):
            it += 1
            target_index = vu.nn_rsl_to_int[target_human]
            rectangular_input_2d = np.array(rectangular_input)
            rectangular_input_2d.shape = (1, MAX_SEQ_LEN)
            predicted_probabilities = trained_model(rectangular_input_2d,
                                                    training=False)[0]
            if it < 10:
                print(rectangular_input)
                print('target_index:', target_index)
                print(predicted_probabilities)
                print()
            # the predicted index if we take the class with the largest probability
            argmax_index = np.argmax(predicted_probabilities)
            if argmax_index == target_index:
                num_correct_argmax_predictions += 1
            argmax_confusion_matrix[target_index][argmax_index] += 1
            # rhs is the probability of guessing target_index if we sample according to predicted probabilities
            expected_sampling_accuracy_sum += tf.keras.backend.get_value(
                predicted_probabilities[target_index])
            for i in range(vu.get_output_vocab_size()):
                expected_sampling_confusion_matrix[target_index][
                    i] += predicted_probabilities[i]
        num_tweets_in_dataset = len(rectangular_targets)

        print(f'Argmax accuracy for {input_filename}:',
              num_correct_argmax_predictions / num_tweets_in_dataset)
        print(f'Expected sampling accuracy for {input_filename}:',
              expected_sampling_accuracy_sum / num_tweets_in_dataset)

        print(
            f"Argmax confusion matrix of targets vs predicted for {input_filename}:\n"
            f"{vu.raw_sentiment_labels}\n", argmax_confusion_matrix)
        print(
            f"Expected sampling confusion matrix of targets vs predicted for {input_filename}:\n"
            f"{vu.raw_sentiment_labels}\n", expected_sampling_confusion_matrix)
Example #8
def main_inference():
    print(f'Using TensorFlow version {tf.__version__}')
    print(f'Loading model {TRAINING_MODEL_FILENAME}')
    trained_model = load_model(TRAINING_MODEL_FILENAME)
    vu = create_vocab_util_from_training_set(TRAINING_INPUT_FILENAME)
    nn_input_preparer = NNInputPreparer(vu, max_seq_len=MAX_SEQ_LEN)

    for input_filename in [DEV_INPUT_FILENAME]:
        rectangular_inputs, rectangular_targets, targets_one_hot_encoded = \
            prep_validation_set(input_filename, nn_input_preparer, vu)
        trained_model.evaluate(rectangular_inputs, targets_one_hot_encoded, batch_size=32)
        num_tokens_in_dataset = num_token_level_correct_argmax_predictions = \
            num_token_level_correct_argmax_predictions_incl_pads = 0
        tweet_level_argmax_accuracy_sum = tweet_level_expected_sampling_accuracy_sum = \
            token_level_expected_sampling_accuracy_sum = token_level_expected_sampling_accuracy_sum_incl_pads = 0.0
        for rectangular_input, rectangular_target_indices in tqdm(zip(rectangular_inputs, rectangular_targets)):
            num_tokens_in_current_tweet = num_current_tweet_correct_argmax_predictions = 0
            current_tweet_expected_sampling_accuracy_sum = 0.0
            rectangular_input_2d = np.array(rectangular_input)
            rectangular_input_2d.shape = (1, MAX_SEQ_LEN)
            predicted_probabilities_sequence = trained_model(rectangular_input_2d, training=False)[0]
            for predicted_probabilities, target_index in zip(
                    predicted_probabilities_sequence, rectangular_target_indices):
                # the predicted index if we take the class with the largest probability
                argmax_index = np.argmax(predicted_probabilities)
                # probability of guessing target_index if we sample according to predicted probabilities
                prob_sampling_success_on_token = \
                    tf.keras.backend.get_value(predicted_probabilities[target_index])
                if argmax_index == target_index:
                    num_token_level_correct_argmax_predictions_incl_pads += 1
                token_level_expected_sampling_accuracy_sum_incl_pads += prob_sampling_success_on_token

                if target_index != 0:
                    if argmax_index == target_index:
                        num_token_level_correct_argmax_predictions += 1
                        num_current_tweet_correct_argmax_predictions += 1
                    current_tweet_expected_sampling_accuracy_sum += prob_sampling_success_on_token
                    token_level_expected_sampling_accuracy_sum += prob_sampling_success_on_token
                    num_tokens_in_current_tweet += 1
                    num_tokens_in_dataset += 1

            # every tweet has at least one non-padding token, so we don't worry about division by zero
            current_tweet_argmax_accuracy = num_current_tweet_correct_argmax_predictions / num_tokens_in_current_tweet
            current_tweet_expected_sampling_accuracy = \
                current_tweet_expected_sampling_accuracy_sum / num_tokens_in_current_tweet

            tweet_level_argmax_accuracy_sum += current_tweet_argmax_accuracy
            tweet_level_expected_sampling_accuracy_sum += current_tweet_expected_sampling_accuracy

        num_tokens_in_dataset_incl_pads = MAX_SEQ_LEN * len(rectangular_inputs)
        print(f'Argmax accuracy for {input_filename} including padding:',
              num_token_level_correct_argmax_predictions_incl_pads / num_tokens_in_dataset_incl_pads)
        print(f'Expected sampling accuracy for {input_filename} including padding:',
              token_level_expected_sampling_accuracy_sum_incl_pads / num_tokens_in_dataset_incl_pads)

        print(f'Token-level argmax accuracy for {input_filename}:',
              num_token_level_correct_argmax_predictions / num_tokens_in_dataset)
        print(f'Token-level expected sampling accuracy for {input_filename}:',
              token_level_expected_sampling_accuracy_sum / num_tokens_in_dataset)

        num_tweets_in_dataset = len(rectangular_inputs)
        print(f'Tweet-level argmax accuracy for {input_filename}:',
              tweet_level_argmax_accuracy_sum / num_tweets_in_dataset)
        print(f'Tweet-level expected sampling accuracy for {input_filename}:',
              tweet_level_expected_sampling_accuracy_sum / num_tweets_in_dataset)
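The bookkeeping above keeps two sets of token-level counts, one over all MAX_SEQ_LEN positions and one restricted to real tokens, treating target index 0 as padding. The same masking idea applied to a single padded sequence looks like this (values are illustrative):

import numpy as np

# One padded target sequence and the model's argmax predictions for it (index 0 = padding).
target_indices = np.array([3, 1, 4, 0, 0, 0])
argmax_indices = np.array([3, 2, 4, 0, 0, 3])

accuracy_incl_pads = np.mean(argmax_indices == target_indices)  # 4/6
non_pad_mask = target_indices != 0
accuracy_excl_pads = np.mean(argmax_indices[non_pad_mask] == target_indices[non_pad_mask])  # 2/3

print(accuracy_incl_pads, accuracy_excl_pads)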
Example #9
def main_training():
    print(f'Using TensorFlow version {tf.__version__}')

    tvu = TargetVocabUtil()
    btc = BertTokenConverter(model_dir=BERT_PRETRAINED_MODEL_DIR, tvu=tvu)
    nn_input_preparer = NNInputPreparer(tvu=tvu, max_seq_len=MAX_SEQ_LEN)

    tr_loader = LabeledDataLoader(TR_INPUT_FILENAME)
    tr_labeled_tweets = tr_loader.parse_tokens_and_labels(
        tr_loader.load_lines())
    tr_labeled_tweets = btc.convert(tr_labeled_tweets)
    tr_labeled_tweets = btc.convert_to_ids(tr_labeled_tweets)
    tr_labeled_tweets = btc.prepend_cls(tr_labeled_tweets)
    tr_labeled_tweets = nn_input_preparer.filter_out_long_sequences(
        tr_labeled_tweets)
    print(
        f'Processing all {len(tr_labeled_tweets)} not-too-long tweets from {TR_INPUT_FILENAME}'
    )
    tr_irregular_inputs = [tweet[0] for tweet in tr_labeled_tweets]
    tr_rectangular_targets = [tweet[1] for tweet in tr_labeled_tweets]
    tr_rectangular_inputs = nn_input_preparer.rectangularize_inputs(
        tr_irregular_inputs)
    tr_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        tr_rectangular_targets)

    if CONTINUE_TRAINING:
        print('Continuing training from', TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model = load_model(TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model.summary()
    else:
        print("Commencing new training run")
        model_creator = BertModelCreator(model_dir=BERT_PRETRAINED_MODEL_DIR,
                                         tvu=tvu,
                                         max_seq_len=MAX_SEQ_LEN)
        model = model_creator.create_model()

    cp_filepath = BASE_DIR + 'ep_{epoch}_valacc_{val_accuracy:.5f}.h5'

    checkpoint = ModelCheckpoint(cp_filepath,
                                 monitor='val_accuracy',
                                 verbose=1,
                                 save_best_only=False)

    dev_loader = LabeledDataLoader(DEV_INPUT_FILENAME)
    dev_labeled_tweets = dev_loader.parse_tokens_and_labels(
        dev_loader.load_lines())
    dev_labeled_tweets = btc.convert(dev_labeled_tweets)
    dev_labeled_tweets = btc.convert_to_ids(dev_labeled_tweets)
    dev_labeled_tweets = btc.prepend_cls(dev_labeled_tweets)
    dev_labeled_tweets = nn_input_preparer.filter_out_long_sequences(
        dev_labeled_tweets)
    print(
        f'Processing all {len(dev_labeled_tweets)} not-too-long tweets from {DEV_INPUT_FILENAME}'
    )
    dev_irregular_inputs = [tweet[0] for tweet in dev_labeled_tweets]
    dev_rectangular_targets = [tweet[1] for tweet in dev_labeled_tweets]
    dev_rectangular_inputs = nn_input_preparer.rectangularize_inputs(
        dev_irregular_inputs)
    dev_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        dev_rectangular_targets)

    model.fit(tr_rectangular_inputs,
              tr_targets_one_hot_encoded,
              batch_size=32,
              initial_epoch=INITIAL_EPOCH,
              epochs=MAX_EPOCHS,
              validation_data=(dev_rectangular_inputs,
                               dev_targets_one_hot_encoded),
              callbacks=[checkpoint])

    model.save(FINAL_TRAINED_MODEL_FILENAME)
Example #10
def main_training():
    print(f'Using TensorFlow version {tf.__version__}')
    tr_loader = LabeledDataLoader(TRAINING_INPUT_FILENAME)
    tr_tweets = tr_loader.parse_raw_tokens_and_labels(tr_loader.load_lines())

    tvu = TargetVocabUtil()

    tokenizer = BertTokenConverter(model_dir=BERT_PRETRAINED_MODEL_DIR,
                                   tvu=tvu)
    tr_tweets = tokenizer.convert_to_tokens(tr_tweets)
    tr_tweets = tokenizer.convert_to_ids(tr_tweets)

    nn_input_preparer = NNInputPreparer(tvu, max_seq_len=MAX_SEQ_LEN)
    tr_tweets = nn_input_preparer.filter_out_long_sequences(tr_tweets)

    print(
        f'Training on {len(tr_tweets)} tweets, each no longer than {MAX_SEQ_LEN} tokens'
    )

    tr_irregular_inputs = [[item[0] for item in tweet] for tweet in tr_tweets]
    tr_irregular_targets = [[item[1] for item in tweet] for tweet in tr_tweets]

    tr_rectangular_inputs = nn_input_preparer.rectangularize_inputs(
        tr_irregular_inputs)
    tr_rectangular_targets = nn_input_preparer.rectangularize_targets(
        tr_irregular_targets)

    tr_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        tr_rectangular_targets)

    if CONTINUE_TRAINING:
        print('Continuing training from', TRAINING_MODEL_FILENAME_TO_CONTINUE)
        model = load_model(TRAINING_MODEL_FILENAME_TO_CONTINUE,
                           custom_objects={"BertModelLayer": BertModelLayer})
        model.summary()
    else:
        print('Commencing new training run')
        model_creator = BertModelCreator(model_dir=BERT_PRETRAINED_MODEL_DIR,
                                         tvu=tvu,
                                         max_seq_len=MAX_SEQ_LEN,
                                         freeze_bert_layer=True)
        model = model_creator.create_model()

    cp_filepath = BASE_DIR + 'ep_{epoch}_valacc_{val_accuracy:.5f}.h5'

    dev_loader = LabeledDataLoader(DEV_INPUT_FILENAME)
    dev_tweets = dev_loader.parse_raw_tokens_and_labels(
        dev_loader.load_lines())
    dev_tweets = tokenizer.convert_to_tokens(dev_tweets)
    dev_tweets = tokenizer.convert_to_ids(dev_tweets)
    dev_tweets = nn_input_preparer.filter_out_long_sequences(dev_tweets)
    print(
        f'processing all {len(dev_tweets)} not-too-long tweets from {DEV_INPUT_FILENAME}'
    )
    dev_irregular_inputs = [[item[0] for item in tweet]
                            for tweet in dev_tweets]
    # print('proportion of non-pad in dev:', sum(len(tweet) for tweet in dev_tweets) / (MAX_SEQ_LEN * len(dev_tweets)))
    dev_irregular_targets = [[item[1] for item in tweet]
                             for tweet in dev_tweets]
    dev_rectangular_inputs = nn_input_preparer.rectangularize_inputs(
        dev_irregular_inputs)
    dev_rectangular_targets = nn_input_preparer.rectangularize_targets(
        dev_irregular_targets)
    dev_targets_one_hot_encoded = nn_input_preparer.rectangular_targets_to_one_hot(
        dev_rectangular_targets)

    checkpoint = ModelCheckpoint(cp_filepath,
                                 monitor='val_accuracy',
                                 verbose=1,
                                 save_best_only=False)

    model.fit(tr_rectangular_inputs,
              tr_targets_one_hot_encoded,
              batch_size=32,
              initial_epoch=INITIAL_EPOCH,
              epochs=MAX_EPOCHS,
              validation_data=(dev_rectangular_inputs,
                               dev_targets_one_hot_encoded),
              callbacks=[checkpoint])

    model.save(FINAL_TRAINED_MODEL_FILENAME)
Example #11
def main_inference():
    print(f'Using TensorFlow version {tf.__version__}')
    print(f'Loading model {TRAINING_MODEL_FILENAME}')
    trained_model = load_model(
        TRAINING_MODEL_FILENAME,
        custom_objects={"BertModelLayer": BertModelLayer})
    trained_model.summary()

    tvu = TargetVocabUtil()
    btc = BertTokenConverter(model_dir=BERT_PRETRAINED_MODEL_DIR, tvu=tvu)
    nn_input_preparer = NNInputPreparer(tvu=tvu, max_seq_len=MAX_SEQ_LEN)

    for input_filename in [DEV_INPUT_FILENAME]:
        loader = LabeledDataLoader(input_filename)
        tweets = loader.parse_tokens_and_labels(loader.load_lines())
        tweets = btc.convert(tweets)
        tweets = btc.convert_to_ids(tweets)
        tweets = btc.prepend_cls(tweets)
        tweets = nn_input_preparer.filter_out_long_sequences(tweets)
        print(
            f'Processing all {len(tweets)} not-too-long tweets from {input_filename}'
        )

        irregular_inputs = [tweet[0] for tweet in tweets]
        rectangular_targets = [tweet[1] for tweet in tweets]

        argmax_confusion_matrix = np.zeros(
            (tvu.get_output_vocab_size(), tvu.get_output_vocab_size()),
            dtype=int)
        expected_sampling_confusion_matrix = np.zeros(
            (tvu.get_output_vocab_size(), tvu.get_output_vocab_size()))

        num_correct_argmax_predictions = 0
        expected_sampling_accuracy_sum = 0.0

        for irregular_input, target_index in tqdm(
                zip(irregular_inputs, rectangular_targets)):
            rectangular_input_singleton = nn_input_preparer.rectangularize_inputs(
                [irregular_input])
            predicted_probabilities = trained_model(
                rectangular_input_singleton)[0]
            # the predicted index if we take the class with the largest probability
            argmax_index = np.argmax(predicted_probabilities)
            if argmax_index == target_index:
                num_correct_argmax_predictions += 1
            argmax_confusion_matrix[target_index][argmax_index] += 1

            # rhs is the probability of guessing target if we sample according to predicted probabilities
            expected_sampling_accuracy_sum += tf.keras.backend.get_value(
                predicted_probabilities[target_index])
            for i in range(tvu.get_output_vocab_size()):
                expected_sampling_confusion_matrix[target_index][
                    i] += predicted_probabilities[i]

        num_tweets_in_dataset = len(rectangular_targets)

        print(f'Argmax accuracy for {input_filename}:',
              num_correct_argmax_predictions / num_tweets_in_dataset)
        print(f'Expected sampling accuracy for {input_filename}:',
              expected_sampling_accuracy_sum / num_tweets_in_dataset)

        print(
            f"Argmax confusion matrix of targets vs predicted for {input_filename}:\n"
            f"{tvu.raw_sentiment_labels}\n", argmax_confusion_matrix)
        print(
            f"Expected sampling confusion matrix of targets vs predicted for {input_filename}:\n"
            f"{tvu.raw_sentiment_labels}\n",
            expected_sampling_confusion_matrix)
Example #12
def main():
    print('tf:', tf.__version__)

    random.seed(42)

    raw_tweets = DataLoader(TRAINING_INPUT_FILENAME).load()
    cleaner = TweetCleaner()
    clean_tweets = [cleaner.clean_tweet(t) for t in raw_tweets]

    clean_tweets_as_lists = [list(t) for t in clean_tweets]
    print('number of clean_tweets_as_lists:', len(clean_tweets_as_lists))
    selector = TweetSelector(min_length=MIN_TWEET_LENGTH,
                             max_length=MAX_TWEET_LENGTH)
    selected_tweets_as_lists = [
        t for t in clean_tweets_as_lists if selector.select(t)
    ]
    print('number of selected_tweets_as_lists:', len(selected_tweets_as_lists))

    if CONTINUE_TRAINING:
        training_model = load_model(TRAINING_MODEL_FILENAME_TO_CONTINUE)
    else:
        model_creator = NNModelCreator(latent_dim=LATENT_DIM,
                                       dense_dim=DENSE_DIM)
        training_model = model_creator.create_training_model()

    nn_input_preparer = NNInputPreparer()

    num_generations_in_run = 0

    print(time.ctime())

    noiser = DisjointNoiser()
    for de_facto_epoch in range(INITIALLY_COMPLETED_DFEPOCH + 1,
                                NUM_DE_FACTO_EPOCHS):
        gb_training = nn_input_preparer.get_batches(selected_tweets_as_lists,
                                                    noiser,
                                                    GENERATOR_BATCH_SIZE)

        cp_filepath = BASE_DIR + f'dfepoch_{de_facto_epoch}_' + "{val_accuracy:.5f}.h5"

        checkpoint = ModelCheckpoint(cp_filepath,
                                     monitor='val_accuracy',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')

        while True:
            try:
                noised_batch, originals_batch, originals_delayed_batch = next(
                    gb_training)
                assert len(noised_batch) == GENERATOR_BATCH_SIZE
                print(noised_batch.shape, originals_batch.shape,
                      originals_delayed_batch.shape)
                validation_split = 0.125
                fit_batch_size = 32
                # We take care here so as not to manifest the "Your input ran out of data" warning
                validation_steps = int(
                    GENERATOR_BATCH_SIZE * validation_split) // fit_batch_size
                training_steps = GENERATOR_BATCH_SIZE // fit_batch_size - validation_steps
                training_model.fit([noised_batch, originals_delayed_batch],
                                   originals_batch,
                                   batch_size=fit_batch_size,
                                   steps_per_epoch=training_steps,
                                   epochs=1,
                                   validation_split=validation_split,
                                   validation_steps=validation_steps,
                                   callbacks=[checkpoint])
                # https://keras.io/api/models/model_training_apis/ says:
                # "The validation data is selected from the last samples in the ... data provided"
                # This means the model is never validated on tweets that we train it on.
            except StopIteration:
                break

            num_generations_in_run += 1
            print(f'num_generations: {num_generations_in_run}')

        print(time.ctime())
        print(f'End of de facto epoch {de_facto_epoch} - saving model')

        training_model.save(BASE_DIR + f'dfepoch_{de_facto_epoch}_end.h5')

    training_model.save(FINAL_TRAINED_MODEL_FILENAME)
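The step arithmetic guarded by the "ran out of data" comment splits each generator batch into a training part and a validation part so that steps_per_epoch and validation_steps never request more 32-sample batches than the batch contains. With an illustrative GENERATOR_BATCH_SIZE of 2048 (the real value is a project constant defined elsewhere), the numbers work out as follows:

GENERATOR_BATCH_SIZE = 2048  # illustrative; the real value is a project constant
validation_split = 0.125
fit_batch_size = 32

validation_samples = int(GENERATOR_BATCH_SIZE * validation_split)           # 256
validation_steps = validation_samples // fit_batch_size                     # 8
training_steps = GENERATOR_BATCH_SIZE // fit_batch_size - validation_steps  # 64 - 8 = 56

# 56 training steps * 32 samples = 1792 = 2048 - 256, so neither split runs out of data.
assert training_steps * fit_batch_size + validation_samples == GENERATOR_BATCH_SIZE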
Example #13
def main_inference():
    print(f'Using TensorFlow version {tf.__version__}')
    print(f'Loading model {TRAINING_MODEL_FILENAME}')
    trained_model = load_model(
        TRAINING_MODEL_FILENAME,
        custom_objects={"BertModelLayer": BertModelLayer})

    print('Loaded fine-tuned model:')
    trained_model.summary()

    tvu = TargetVocabUtil()
    btc = BertTokenConverter(BERT_PRETRAINED_MODEL_DIR, tvu)
    nn_input_preparer = NNInputPreparer(tvu=tvu, max_seq_len=MAX_SEQ_LEN)

    for input_filename in [DEV_INPUT_FILENAME]:
        loader = LabeledDataLoader(input_filename)
        tweets = loader.parse_raw_tokens_and_labels(loader.load_lines())
        tweets = btc.convert_to_tokens(tweets)
        tweets = btc.convert_to_ids(tweets)
        tweets = nn_input_preparer.filter_out_long_sequences(tweets)
        print(
            f'processing all {len(tweets)} not-too-long tweets from {input_filename}'
        )
        irregular_inputs = [[item[0] for item in tweet] for tweet in tweets]
        irregular_targets = [[item[1] for item in tweet] for tweet in tweets]

        num_tokens_in_dataset = num_token_level_correct_argmax_predictions = \
            num_token_level_correct_argmax_predictions_incl_pads = 0
        tweet_level_argmax_accuracy_sum = tweet_level_expected_sampling_accuracy_sum = \
            token_level_expected_sampling_accuracy_sum = token_level_expected_sampling_accuracy_sum_incl_pads = 0.0

        for (irregular_input, irregular_target_indices) in tqdm(
                zip(irregular_inputs, irregular_targets)):
            rectangular_inputs = nn_input_preparer.rectangularize_inputs(
                [irregular_input])
            rectangular_targets = nn_input_preparer.rectangularize_targets(
                [irregular_target_indices])
            num_tokens_in_current_tweet = num_current_tweet_correct_argmax_predictions = 0
            current_tweet_expected_sampling_accuracy_sum = 0.0
            predicted_probabilities_sequence = trained_model(
                rectangular_inputs)
            for predicted_probabilities, target_index in zip(
                    predicted_probabilities_sequence[0],
                    rectangular_targets[0]):
                # the predicted index if we take the class with the largest probability
                argmax_index = np.argmax(predicted_probabilities)
                # probability of guessing target_index if we sample according to predicted probabilities
                prob_sampling_success_on_token = tf.keras.backend.get_value(
                    predicted_probabilities[target_index])
                if argmax_index == target_index:
                    num_token_level_correct_argmax_predictions_incl_pads += 1
                token_level_expected_sampling_accuracy_sum_incl_pads += prob_sampling_success_on_token

                if target_index != 0:
                    if argmax_index == target_index:
                        num_token_level_correct_argmax_predictions += 1
                        num_current_tweet_correct_argmax_predictions += 1
                    current_tweet_expected_sampling_accuracy_sum += prob_sampling_success_on_token
                    token_level_expected_sampling_accuracy_sum += prob_sampling_success_on_token
                    num_tokens_in_current_tweet += 1
                    num_tokens_in_dataset += 1

            # every tweet has at least one non-padding token, so we don't worry about division by zero
            current_tweet_argmax_accuracy = num_current_tweet_correct_argmax_predictions / num_tokens_in_current_tweet
            current_tweet_expected_sampling_accuracy = \
                current_tweet_expected_sampling_accuracy_sum / num_tokens_in_current_tweet

            tweet_level_argmax_accuracy_sum += current_tweet_argmax_accuracy
            tweet_level_expected_sampling_accuracy_sum += current_tweet_expected_sampling_accuracy

        num_tweets_in_dataset = len(tweets)
        num_tokens_in_dataset_incl_pads = MAX_SEQ_LEN * num_tweets_in_dataset

        print(
            f'Argmax accuracy for {input_filename} including padding:',
            num_token_level_correct_argmax_predictions_incl_pads /
            num_tokens_in_dataset_incl_pads)
        print(
            f'Expected sampling accuracy for {input_filename} including padding:',
            token_level_expected_sampling_accuracy_sum_incl_pads /
            num_tokens_in_dataset_incl_pads)

        print(
            f'Token-level argmax accuracy for {input_filename}:',
            num_token_level_correct_argmax_predictions / num_tokens_in_dataset)
        print(
            f'Token-level expected sampling accuracy for {input_filename}:',
            token_level_expected_sampling_accuracy_sum / num_tokens_in_dataset)

        print(f'Tweet-level argmax accuracy for {input_filename}:',
              tweet_level_argmax_accuracy_sum / num_tweets_in_dataset)
        print(
            f'Tweet-level expected sampling accuracy for {input_filename}:',
            tweet_level_expected_sampling_accuracy_sum / num_tweets_in_dataset)