def split_dataset_by_cameras(db, dataset, config, args):
    validation_percent = 0.3
    train = Dataset(db, "train", config)
    train.enable_augmentation = True
    validation = Dataset(db, "validation", config)
    train_cameras = []

    cameras = list(dataset.cameras_by_id.values())
    camera_count = len(cameras)
    num_validate_cameras = max(
        MIN_VALIDATE_CAMERAS, round(camera_count * validation_percent)
    )

    wallaby, wallaby_validate = split_wallaby_cameras(dataset, cameras)
    if wallaby:
        train_cameras.append(wallaby)
    # this camera has all the rabbits, so keep it in the training set
    rabbits = dataset.cameras_by_id.get("ruru19w44a-[-36.03915 174.51675]")
    if rabbits:
        cameras.remove(rabbits)
        train_cameras.append(rabbits)

    validate_cameras, cameras = diverse_validation(
        cameras, dataset.labels, num_validate_cameras
    )
    if wallaby_validate:
        validate_cameras.append(wallaby_validate)
    train_cameras.extend(cameras)

    add_camera_tracks(dataset.labels, train, train_cameras)
    add_camera_tracks(dataset.labels, validation, validate_cameras)
    return train, validation
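# A minimal sketch of the validation-camera arithmetic used above, assuming
# MIN_VALIDATE_CAMERAS = 2 (the real constant is defined elsewhere in this
# module); the helper is hypothetical and only illustrates the formula.
def _example_num_validate_cameras(camera_count, validation_percent=0.3, minimum=2):
    # at least `minimum` cameras, otherwise ~30% of all cameras
    return max(minimum, round(camera_count * validation_percent))

# _example_num_validate_cameras(4) == 2   (the floor wins)
# _example_num_validate_cameras(20) == 6  (the percentage wins)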
def split_randomly(db_file, dataset, config, args, test_clips=None, balance_bins=True):
    # split the data randomly such that a clip is only ever in one dataset
    # (many split strategies were tried, i.e. by location and by camera;
    # this is the simplest and the results are the same)
    if test_clips is None:
        # avoid the mutable-default-argument pitfall
        test_clips = []
    train = Dataset(db_file, "train", config)
    train.enable_augmentation = True
    validation = Dataset(db_file, "validation", config)
    test = Dataset(db_file, "test", config)

    test_c = get_test_set_camera(dataset, test_clips, args.date)
    test_cameras = [test_c]
    validate_cameras = []
    train_cameras = []
    for label in dataset.labels:
        existing_test_count = len(test.tracks_by_label.get(label, []))
        train_c, validate_c, test_c = split_label(
            dataset, label, existing_test_count=existing_test_count
        )
        if train_c is not None:
            train_cameras.append(train_c)
        if validate_c is not None:
            validate_cameras.append(validate_c)
        if test_c is not None:
            test_cameras.append(test_c)

    add_camera_tracks(dataset.labels, train, train_cameras, balance_bins)
    add_camera_tracks(dataset.labels, validation, validate_cameras, balance_bins)
    add_camera_tracks(dataset.labels, test, test_cameras, balance_bins)
    return train, validation, test
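# Usage sketch for split_randomly, mirroring the call in main() further down;
# `args` is assumed to be the parsed CLI namespace with a `date` attribute:
#
#     train, validation, test = split_randomly(db_file, dataset, config, args, test_clips)
#
# Because the split is per clip, no clip ever contributes tracks to more than
# one of the three datasets.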
def main():
    init_logging()
    config = load_config()
    build_config = config.build
    db = TrackDatabase(os.path.join(config.tracks_folder, "dataset.hdf5"))
    dataset = Dataset(db, "dataset", config)
    tracks_loaded, total_tracks = dataset.load_tracks()
    print(
        "Loaded {}/{} tracks, found {:.1f}k segments".format(
            tracks_loaded, total_tracks, len(dataset.segments) / 1000
        )
    )
    for key, value in dataset.filtered_stats.items():
        if value != 0:
            print(" {} filtered {}".format(key, value))
    print()
    show_tracks_breakdown(dataset)
    print()
    show_segments_breakdown(dataset)
    print()
    show_cameras_breakdown(dataset)
    print()

    print("Splitting data set into train / validation")
    datasets = split_dataset_by_cameras(db, dataset, build_config)
    # if build_config.use_previous_split:
    #     split = get_previous_validation_bins(build_config.previous_split)
    #     datasets = split_dataset(db, dataset, build_config, split)
    # else:
    #     datasets = split_dataset(db, dataset, build_config)
    pickle.dump(datasets, open(dataset_db_path(config), "wb"))
def test_dataset(db, config, date):
    test = Dataset(db, "test", config)
    tracks_loaded, total_tracks = test.load_tracks(shuffle=True, after_date=date)
    print("Test Loaded {}/{} tracks".format(tracks_loaded, total_tracks))
    for key, value in test.filtered_stats.items():
        if value != 0:
            print("Test {} filtered {}".format(key, value))
    return test
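# Sketch: building a temporally held-out test set from the most recent week of
# recordings, as main() below does when no --date argument is given (datetime
# and pytz are assumed to be imported at module level):
#
#     cutoff = datetime.datetime.now(pytz.utc) - datetime.timedelta(days=7)
#     test = test_dataset(db, config, cutoff)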
def main():
    init_logging()
    args = parse_args()
    config = load_config(args.config_file)
    # return
    # import yaml
    #
    # with open("defualtstest.yml", "w") as f:
    #     yaml.dump(config.as_dict(), f)
    test_clips = config.build.test_clips()
    if test_clips is None:
        test_clips = []
    logging.info("# of test clips: %s", len(test_clips))

    db_file = os.path.join(config.tracks_folder, "dataset.hdf5")
    dataset = Dataset(
        db_file, "dataset", config, consecutive_segments=args.consecutive_segments
    )
    tracks_loaded, total_tracks = dataset.load_tracks()
    dataset.labels.sort()
    print(
        "Loaded {}/{} tracks, found {:.1f}k segments".format(
            tracks_loaded, total_tracks, len(dataset.segments) / 1000
        )
    )
    for key, value in dataset.filtered_stats.items():
        if value != 0:
            print(" {} filtered {}".format(key, value))
    print()
    show_tracks_breakdown(dataset)
    print()
    show_segments_breakdown(dataset)
    print()
    show_sample_frames_breakdown(dataset)
    print()
    show_cameras_breakdown(dataset)
    print()

    print("Splitting data set into train / validation")
    datasets = split_randomly(db_file, dataset, config, args, test_clips)
    validate_datasets(datasets, test_clips, args.date)
    print_counts(dataset, *datasets)

    base_dir = config.tracks_folder
    # use a different loop name so the full `dataset` above is not shadowed
    for ds in datasets:
        ds.saveto_numpy(base_dir)
    for ds in datasets:
        ds.clear_samples()
        ds.db = None
        out_path = f"{os.path.join(base_dir, ds.name)}.dat"
        logging.info("saving to %s", out_path)
        pickle.dump(ds, open(out_path, "wb"))
def main():
    global dataset
    global db
    db = TrackDatabase(os.path.join(DATASET_FOLDER, 'dataset.hdf5'))
    dataset = Dataset(db, 'dataset')
    total_tracks = len(db.get_all_track_ids())
    tracks_loaded = dataset.load_tracks(track_filter)
    print("Loaded {}/{} tracks, found {:.1f}k segments".format(
        tracks_loaded, total_tracks, len(dataset.segments) / 1000))
    for key, value in filtered_stats.items():
        if value != 0:
            print(" {} filtered {}".format(key, value))
    print()

    labels = sorted(set(dataset.tracks_by_label.keys()))
    dataset.labels = labels

    show_tracks_breakdown()
    print()
    show_segments_breakdown()
    print()
    show_cameras_breakdown()
    print()

    print("Splitting data set into train / validation")
    if USE_PREVIOUS_SPLIT:
        split = get_bin_split('template.dat')
        datasets = split_dataset_days(split)
    else:
        datasets = split_dataset_days()

    pickle.dump(datasets, open(os.path.join(DATASET_FOLDER, 'datasets.dat'), 'wb'))
def main():
    init_logging()
    args = parse_args()
    config = load_config(args.config_file)
    db = TrackDatabase(os.path.join(config.tracks_folder, "dataset.hdf5"))
    dataset = Dataset(
        db, "dataset", config, consecutive_segments=args.consecutive_segments
    )
    tracks_loaded, total_tracks = dataset.load_tracks(before_date=args.date)
    print(
        "Loaded {}/{} tracks, found {:.1f}k segments".format(
            tracks_loaded, total_tracks, len(dataset.segments) / 1000
        )
    )
    for key, value in dataset.filtered_stats.items():
        if value != 0:
            print(" {} filtered {}".format(key, value))
    print()
    show_tracks_breakdown(dataset)
    print()
    show_segments_breakdown(dataset)
    print()
    show_important_frames_breakdown(dataset)
    print()
    show_cameras_breakdown(dataset)
    print()

    print("Splitting data set into train / validation")
    datasets = split_dataset_by_cameras(db, dataset, config, args)
    if args.date is None:
        args.date = datetime.datetime.now(pytz.utc) - datetime.timedelta(days=7)
    test = test_dataset(db, config, args.date)
    datasets = (*datasets, test)

    print_counts(dataset, *datasets)
    print_cameras(*datasets)
    pickle.dump(datasets, open(dataset_db_path(config), "wb"))
def split_dataset(db, dataset, build_config, prefill_dataset=None):
    """
    Randomly selects tracks to be used as the train, validation, and test sets.
    :param prefill_dataset: if given, its bins are reused for the test set
    :return: tuple containing train, validation, and test datasets.

    This method assigns tracks into 'label-camera-day' bins and splits the
    bins across the datasets.
    """
    # pick out groups to use for the various sets
    bins_by_label = {}
    used_bins = {}
    for label in dataset.labels:
        bins_by_label[label] = []
        used_bins[label] = []
    counts = []
    for bin_id, tracks in dataset.tracks_by_bin.items():
        label = tracks[0].label
        bins_by_label[label].append(bin_id)
        counts.append(sum(len(track.segments) for track in tracks))

    train = Dataset(db, "train")
    validation = Dataset(db, "validation")
    test = Dataset(db, "test")

    bin_segment_mean = np.mean(counts)
    bin_segment_std = np.std(counts)
    max_bin_segments = bin_segment_mean + bin_segment_std * build_config.cap_bin_weight
    print_bin_segment_stats(bin_segment_mean, bin_segment_std, max_bin_segments)

    max_track_duration = build_config.max_validation_set_track_duration
    if prefill_dataset is not None:
        prefill_bins(
            dataset,
            [validation, test],
            prefill_dataset,
            used_bins,
            max_bin_segments,
            max_track_duration,
        )

    required_samples = build_config.test_set_count
    required_bins = max(MIN_BINS, build_config.test_set_bins)

    # assign bins to the test and validation sets; if we previously added bins
    # from another dataset we are simply filling in the gaps here.
    for label in dataset.labels:
        available_bins = set(bins_by_label[label]) - set(used_bins[label])
        normal_bins, heavy_bins = dataset.split_heavy_bins(
            available_bins, max_bin_segments, max_track_duration
        )
        print_bin_stats(label, normal_bins, heavy_bins, used_bins)
        add_random_samples(
            dataset,
            [validation, test],
            normal_bins,
            used_bins[label],
            label,
            required_samples,
            required_bins,
        )
        normal_bins.extend(heavy_bins)
        for bin_id in normal_bins:
            train.add_tracks(dataset.tracks_by_bin[bin_id])

    # if we have lots of segments on a single day, reduce the weight
    # so we don't overtrain on this specific example.
    train.balance_bins(max_bin_segments)
    validation.balance_bins(max_bin_segments)

    # balance out the classes
    train.balance_weights()
    validation.balance_weights()
    test.balance_resample(required_samples=build_config.test_set_count)

    print_segments(dataset, train, validation, test)
    return train, validation, test
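# Worked example of the heavy-bin cap computed above, as a self-contained
# sketch (np is the module-level numpy alias already used throughout this
# file; the counts and cap_bin_weight value are illustrative only):
def _example_bin_cap(counts, cap_bin_weight=1.5):
    # bins holding more segments than mean + std * cap are treated as heavy
    return np.mean(counts) + np.std(counts) * cap_bin_weight

# _example_bin_cap([10, 12, 11, 90]) ~= 82, so the 90-segment bin is heavy
# and is kept out of the validation/test sampling.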
def split_dataset_by_cameras(db, dataset, build_config):
    train_percent = 0.7
    validation_percent = 0.3
    test_cameras = 1

    train = Dataset(db, "train")
    validation = Dataset(db, "validation")
    test = Dataset(db, "test")

    camera_count = len(dataset.camera_bins)
    remaining_cameras = camera_count - test_cameras
    validation_cameras = max(1, round(remaining_cameras * validation_percent))
    remaining_cameras -= validation_cameras
    train_cameras = remaining_cameras

    camera_data = dataset.camera_bins
    cameras = list(camera_data.values())
    # randomize the order
    np.random.shuffle(cameras)

    # want a test set that covers all labels
    test_i = -1
    test_data = []
    for i, camera in enumerate(cameras):
        if len(camera.label_to_bins.keys()) == len(dataset.labels):
            test_data.append(camera)
            test_i = i
            break
    assert len(test_data) > 0, "No test camera found with all labels"
    del cameras[test_i]

    train_data = cameras[:train_cameras]
    required_samples = build_config.test_set_count
    required_bins = build_config.test_set_bins
    add_camera_data(
        dataset.labels,
        train,
        train_data,
        None,
        None,
        build_config.cap_bin_weight,
        build_config.max_segments_per_track,
    )
    validate_data = cameras[train_cameras : train_cameras + validation_cameras]
    add_camera_data(
        dataset.labels,
        validation,
        validate_data,
        None,
        None,
        build_config.cap_bin_weight,
        build_config.max_segments_per_track,
    )
    add_camera_data(
        dataset.labels,
        test,
        test_data,
        required_samples,
        required_bins,
        build_config.max_segments_per_track,
    )

    # balance out the classes
    train.balance_weights()
    validation.balance_weights()
    test.balance_resample(required_samples=build_config.test_set_count)

    print_segments(dataset, train, validation, test)
    print_cameras(train, validation, test)
    return train, validation, test
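# Minimal sketch of the "find a test camera that covers every label" scan used
# above, with plain dicts standing in for the Camera objects (hypothetical
# data; the real objects expose label_to_bins rather than a "labels" key):
def _example_pick_test_camera(cameras, labels):
    np.random.shuffle(cameras)  # randomize so the choice varies between builds
    for camera in cameras:
        if set(camera["labels"]) >= set(labels):
            return camera
    return None

# _example_pick_test_camera(
#     [{"id": "cam1", "labels": ["possum"]},
#      {"id": "cam2", "labels": ["possum", "rabbit", "false-positive"]}],
#     ["possum", "rabbit", "false-positive"],
# )  # returns cam2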
def split_dataset_days(prefill_bins=None):
    """
    Randomly selects tracks to be used as the train, validation, and test sets.
    :param prefill_bins: if given, these bins will be used for the test set
    :return: tuple containing train, validation, and test datasets.

    This method assigns tracks into 'label-camera-day' bins and splits the
    bins across the datasets.
    """
    # pick out groups to use for the various sets
    bins_by_label = {}
    for label in dataset.labels:
        bins_by_label[label] = []
    for bin_id, tracks in dataset.tracks_by_bin.items():
        label = dataset.track_by_id[tracks[0].track_id].label
        bins_by_label[label].append(bin_id)

    train = Dataset(db, 'train')
    validation = Dataset(db, 'validation')
    test = Dataset(db, 'test')
    train.labels = dataset.labels.copy()
    validation.labels = dataset.labels.copy()
    test.labels = dataset.labels.copy()

    # check the bin distribution
    bin_segments = []
    for bin_id, tracks in dataset.tracks_by_bin.items():
        count = sum(len(track.segments) for track in tracks)
        bin_segments.append((count, bin_id))
    bin_segments.sort()

    counts = [count for count, bin_id in bin_segments]
    bin_segment_mean = np.mean(counts)
    bin_segment_std = np.std(counts)
    max_bin_segments = bin_segment_mean + bin_segment_std * CAP_BIN_WEIGHT

    print()
    print("Bin segment mean:{:.1f} std:{:.1f} auto max segments:{:.1f}".format(
        bin_segment_mean, bin_segment_std, max_bin_segments))
    print()

    used_bins = {}
    for label in dataset.labels:
        used_bins[label] = []

    if prefill_bins is not None:
        print("Reusing bins from previous split:")
        for label in dataset.labels:
            available_bins = set(bins_by_label[label])
            if label not in prefill_bins:
                continue
            for sample in prefill_bins[label]:
                # this happens if we have banned/deleted the clip, but it was previously used.
                if sample not in dataset.tracks_by_bin:
                    continue
                # this happens if we changed what a 'heavy' bin is.
                if is_heavy_bin(sample, max_bin_segments):
                    continue
                validation.add_tracks(dataset.tracks_by_bin[sample])
                test.add_tracks(dataset.tracks_by_bin[sample])
                validation.filter_segments(TEST_MIN_MASS, ignore_labels=['false-positive'])
                test.filter_segments(TEST_MIN_MASS, ignore_labels=['false-positive'])
                available_bins.remove(sample)
                used_bins[label].append(sample)
            for bin_id in available_bins:
                train.add_tracks(dataset.tracks_by_bin[bin_id])
            train.filter_segments(TRAIN_MIN_MASS, ignore_labels=['false-positive'])

    # assign bins to the test and validation sets; if we previously added bins
    # from another dataset we are simply filling in the gaps here.
    for label in dataset.labels:
        available_bins = set(bins_by_label[label])

        # heavy bins are bins with an unusually high number of examples on a
        # day. We exclude these from the test/validation set as they will be
        # sub-filtered down and there is no need to waste that much data.
        heavy_bins = set()
        for bin_id in available_bins:
            if is_heavy_bin(bin_id, max_bin_segments):
                heavy_bins.add(bin_id)

        available_bins -= heavy_bins
        available_bins -= set(used_bins[label])

        # print bin statistics
        print("{}: normal {} heavy {} pre-filled {}".format(
            label, len(available_bins), len(heavy_bins), len(used_bins[label])))

        required_samples = TEST_SET_COUNT * LABEL_WEIGHTS.get(label, 1.0)
        required_bins = TEST_SET_BINS * LABEL_WEIGHTS.get(label, 1.0)
        # make sure there is some diversity
        required_bins = max(4, required_bins)

        # we assign bins to the test and validation sets randomly until we
        # have enough segments + bins; the remaining bins are used for training
        while len(available_bins) > 0 and (
            validation.get_class_segments_count(label) < required_samples
            or len(used_bins[label]) < required_bins
        ):
            # random.choice needs a sequence, not a set
            sample = random.choice(list(available_bins))

            validation.add_tracks(dataset.tracks_by_bin[sample])
            test.add_tracks(dataset.tracks_by_bin[sample])
            validation.filter_segments(TEST_MIN_MASS, ignore_labels=['false-positive'])
            test.filter_segments(TEST_MIN_MASS, ignore_labels=['false-positive'])

            available_bins.remove(sample)
            used_bins[label].append(sample)

            if prefill_bins is not None:
                print(" - required additional sample ", sample)

        available_bins.update(heavy_bins)
        for bin_id in available_bins:
            train.add_tracks(dataset.tracks_by_bin[bin_id])
        train.filter_segments(TRAIN_MIN_MASS, ignore_labels=['false-positive'])

    print("Segments per class:")
    print("-" * 90)
    print("{:<20} {:<21} {:<21} {:<21}".format("Class", "Train", "Validation", "Test"))
    print("-" * 90)

    # if we have lots of segments on a single day, reduce the weight so we
    # don't overtrain on this specific example.
    train.balance_bins(max_bin_segments)
    validation.balance_bins(max_bin_segments)

    # balance out the classes
    train.balance_weights(weight_modifiers=LABEL_WEIGHTS)
    validation.balance_weights(weight_modifiers=LABEL_WEIGHTS)
    test.balance_resample(weight_modifiers=LABEL_WEIGHTS, required_samples=TEST_SET_COUNT)

    # display the dataset summary
    for label in dataset.labels:
        print("{:<20} {:<20} {:<20} {:<20}".format(
            label,
            "{}/{}/{}/{:.1f}".format(*train.get_counts(label)),
            "{}/{}/{}/{:.1f}".format(*validation.get_counts(label)),
            "{}/{}/{}/{:.1f}".format(*test.get_counts(label)),
        ))
    print()

    return train, validation, test
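# Sketch of the 'label-camera-day' binning idea the docstring above describes.
# The id format here is an assumption for illustration; the real Dataset class
# builds its own bin keys.
def _example_bin_id(label, camera, recorded_at):
    # all tracks of one label, from one camera, on one day share a bin, so a
    # whole day's footage can never straddle the train/validation split
    return "{}-{}-{}".format(label, camera, recorded_at.date().isoformat())

# _example_bin_id("possum", "cam12", datetime.datetime(2020, 3, 14, 22, 5))
# == "possum-cam12-2020-03-14"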