def test_train_and_test_dataset():
    with tempfile.TemporaryDirectory() as temp_dir:
        # Test both the normal cache_dir and the given cache dir
        for data_dir in [None, Path(temp_dir, 'twitter data')]:
            train_data = wang_2017_election_twitter_train(data_dir)
            test_data = wang_2017_election_twitter_test(data_dir)
            
            assert len(train_data) > len(test_data)

            combined_data = TargetTextCollection.combine(train_data, test_data)
            assert 11899 == combined_data.number_targets()
            train_count += 1
    percentage_train_targets = (train_count / len(train_targets)) * 100
    print(
        f'Percentage of targets that have been predicted that are in train: {percentage_train_targets}'
    )

    test_count = 0
    for test_target in test_targets:
        if test_target in pred_targets:
            test_count += 1
    percentage_test_targets = (test_count / len(test_targets)) * 100
    print(
        f'Percentage of targets that have been predicted that are in test: {percentage_test_targets}'
    )

    train_test = TargetTextCollection.combine(train_data, test_data)
    train_test_in_count = 0
    train_test_out_count = 0
    train_test_targets = train_test.target_count(lower=True)
    for train_test_target in train_test_targets:
        if train_test_target in pred_targets:
            train_test_in_count += 1
        else:
            train_test_out_count += 1
    print(
        f'Number of new predicted targets that are in the whole gold datasets: '
        f'{train_test_in_count} compared to that are not: {train_test_out_count}'
    )

    train_and_pred = TargetTextCollection.combine(train_data, all_targets)
    train_and_pred_targets = set(