def test_train_and_test_dataset():
    """Check the Wang 2017 election Twitter train and test loaders together.

    Both loaders are called once with ``None`` and once with an explicit
    path located inside a temporary directory.  For each setting the train
    split must be larger than the test split, and the two splits combined
    must report exactly 11899 targets.
    """
    with tempfile.TemporaryDirectory() as scratch_root:
        explicit_cache = Path(scratch_root, 'twitter data')
        # Exercise both the normal cache directory (``None``) and a
        # caller-supplied cache directory.
        for cache_location in (None, explicit_cache):
            train_split = wang_2017_election_twitter_train(cache_location)
            test_split = wang_2017_election_twitter_test(cache_location)

            train_size = len(train_split)
            test_size = len(test_split)
            assert train_size > test_size

            merged = TargetTextCollection.combine(train_split, test_split)
            total_targets = merged.number_targets()
            assert total_targets == 11899
train_count += 1 percentage_train_targets = (train_count / len(train_targets)) * 100 print( f'Percentage of targets that have been predicted that are in train: {percentage_train_targets}' ) test_count = 0 for test_target in test_targets: if test_target in pred_targets: test_count += 1 percentage_test_targets = (test_count / len(test_targets)) * 100 print( f'Percentage of targets that have been predicted that are in test: {percentage_test_targets}' ) train_test = TargetTextCollection.combine(train_data, test_data) train_test_in_count = 0 train_test_out_count = 0 train_test_targets = train_test.target_count(lower=True) for train_test_target in train_test_targets: if train_test_target in pred_targets: train_test_in_count += 1 else: train_test_out_count += 1 print( f'Number of new predicted targets that are in the whole gold datasets: ' f'{train_test_in_count} compared to that are not: {train_test_out_count}' ) train_and_pred = TargetTextCollection.combine(train_data, all_targets) train_and_pred_targets = set(