def computeSubSet(metadata, frame_per_seq, probTh=None, seed=42): """This function computes a subset of the dataset, it extracts frame_per_seq consecutive frames for each sequence. Parameters ---------- metadata: Pandas.DataFrame metadata of the WilFireDataset frame_per_seq: int frame per sequence to take probTh: float , optional The data set contains many more frames classified 'fire' than 'not fire', this parameter allows to equalize the dataset. For each 'not fire' sequence, we draw a random number if it is greater than probTh we double the number of frames used for this sequence seed : int you can setup a seed for repeatability Example ------- metadataSS = computeSubSet(metadata, 2) wildfireSS = WildFireDataset(metadata=metadataSS, path_to_frames=path_to_frames) """ if not isinstance(metadata, pd.DataFrame): try: metadata = pd.read_csv(metadata) except (ValueError, FileNotFoundError): raise ValueError(f"Invalid path to CSV containing metadata. Please provide one (path={metadata})") cryptogen = SystemRandom() cryptogen.seed(seed) random.seed(seed) metadata.index = np.arange(len(metadata)) imgs = metadata['imgFile'] # Define sequences numbers metadata.index = np.arange(len(metadata)) meta = metadata[['exploitable', 'fire', 'sequence', 'clf_confidence', 'loc_confidence', 'x', 'y', 't', 'stateStart', 'stateEnd', 'fire_id', 'fBase']] meta = meta.drop_duplicates() meta['seq'] = np.arange(len(meta)) metadata = pd.merge(metadata, meta, on=['exploitable', 'fire', 'sequence', 'clf_confidence', 'loc_confidence', 'x', 'y', 't', 'stateStart', 'stateEnd', 'fire_id', 'fBase'], how='inner') # Get unique list of sequences seq = metadata['seq'] my_set = set(seq) uniqueSEQ = list(my_set) random.shuffle(uniqueSEQ) subSetImgs = [] subSetImgsEq = [] for seU in uniqueSEQ: # For each sequence get a subSample of frame_per_seq frames nn = [imgs[i] for i, se in enumerate(seq) if se == seU] if(len(nn) > frame_per_seq): nn = random.sample(nn, frame_per_seq) nb = [float(frame.split("frame", 1)[1].split(".", 1)[0]) for frame in nn] nb, nn = (list(t) for t in zip(*sorted(zip(nb, nn)))) subSetImgs += nn # Equalize the dataset adding not_fire frames if probTh is not None: if(metadata[metadata['seq'] == seU]['fire'].values[0] == 0 and cryptogen.random() < probTh): nn = [imgs[i] for i, se in enumerate(seq) if se == seU] if(len(nn) > frame_per_seq): nn = random.sample(nn, frame_per_seq) nb = [float(frame.split("frame", 1)[1].split(".", 1)[0]) for frame in nn] nb, nn = (list(t) for t in zip(*sorted(zip(nb, nn)))) subSetImgsEq += nn # Insert randomly the extra frames in the dataset if probTh is not None: for i in range(0, len(subSetImgsEq), 2): idx = cryptogen.randint(0, len(subSetImgs) - 2) // 2 * 2 subSetImgs.insert(idx, subSetImgsEq[i + 1]) subSetImgs.insert(idx, subSetImgsEq[i]) # Create metadta Subset index = [i for i, im in enumerate(metadata['imgFile'].values) if im in subSetImgs] return metadata.iloc[index]