Example #1
0
def computeSubSet(metadata, frame_per_seq, probTh=None, seed=42):
    """Compute a subset of the dataset keeping at most frame_per_seq frames per sequence.

    A sequence is a group of rows sharing the same values for the columns
    'exploitable', 'fire', 'sequence', 'clf_confidence', 'loc_confidence',
    'x', 'y', 't', 'stateStart', 'stateEnd', 'fire_id' and 'fBase'. For every
    sequence, frame_per_seq frames are drawn at random (the whole sequence is
    kept when it holds frame_per_seq frames or fewer).

    Parameters
    ----------
    metadata: Pandas.DataFrame or path to a CSV file
        metadata of the WildFireDataset. It must contain an 'imgFile' column and
        the sequence-defining columns listed above. The object passed in is not
        modified.

    frame_per_seq: int
        maximum number of frames to take per sequence

    probTh: float , optional
        The data set contains many more frames classified 'fire' than 'not fire', this
        parameter allows to equalize the dataset. For each 'not fire' sequence
        (fire == 0) a random number in [0, 1) is drawn; if it is lower than probTh a
        second sample of frame_per_seq frames is drawn from that sequence and merged
        with the first one. Rows are selected by 'imgFile' membership, so a frame
        drawn twice still appears once in the result.

    seed : int
        you can setup a seed for repeatability. The global ``random`` generator is
        seeded with it (as a side effect) and drives the per-sequence sampling; the
        equalization draws use a separate generator seeded with the same value.

    Returns
    -------
    Pandas.DataFrame
        The selected rows, in metadata order, with an additional 'seq' column
        holding the sequence number.

    Raises
    ------
    ValueError
        If metadata is not a DataFrame and cannot be read as a CSV file.

    Example
    -------
    metadataSS = computeSubSet(metadata, 2)
    wildfireSS = WildFireDataset(metadata=metadataSS, path_to_frames=path_to_frames)
    """
    if not isinstance(metadata, pd.DataFrame):
        try:
            metadata = pd.read_csv(metadata)
        except (ValueError, FileNotFoundError) as err:
            raise ValueError(
                f"Invalid path to CSV containing metadata. Please provide one (path={metadata})"
            ) from err

    seq_keys = ['exploitable', 'fire', 'sequence', 'clf_confidence', 'loc_confidence',
                'x', 'y', 't', 'stateStart', 'stateEnd', 'fire_id', 'fBase']

    random.seed(seed)
    # SystemRandom ignores seeding, so a seeded generator is used for the
    # equalization draws to keep them repeatable. Keeping it separate from the
    # global generator means the base selection does not depend on probTh.
    eq_rng = random.Random(seed)

    # Work on a re-indexed copy so the caller's DataFrame is left untouched.
    metadata = metadata.reset_index(drop=True)

    # Define sequences numbers: one number per distinct combination of keys.
    meta = metadata[seq_keys].drop_duplicates().copy()
    meta['seq'] = np.arange(len(meta))
    metadata = pd.merge(metadata, meta, on=seq_keys, how='inner')

    # Group frames (and the fire flag of the first row) by sequence in a single
    # pass over the merged frame, so frames and sequence numbers stay aligned.
    frames_by_seq = {}
    fire_by_seq = {}
    for seq_id, img, fire in zip(metadata['seq'].tolist(),
                                 metadata['imgFile'].tolist(),
                                 metadata['fire'].tolist()):
        frames_by_seq.setdefault(seq_id, []).append(img)
        fire_by_seq.setdefault(seq_id, fire)

    # Get unique list of sequences
    uniqueSEQ = list(set(metadata['seq']))
    random.shuffle(uniqueSEQ)

    # Only membership matters for the final selection, so a set is enough.
    selected = set()
    for seU in uniqueSEQ:
        frames = frames_by_seq[seU]
        # For each sequence get a subSample of frame_per_seq frames
        if len(frames) > frame_per_seq:
            selected.update(random.sample(frames, frame_per_seq))
        else:
            selected.update(frames)

        # Equalize the dataset adding not_fire frames
        if probTh is not None and fire_by_seq[seU] == 0 and eq_rng.random() < probTh:
            if len(frames) > frame_per_seq:
                selected.update(eq_rng.sample(frames, frame_per_seq))
            else:
                selected.update(frames)

    # Create metadata Subset
    index = [i for i, im in enumerate(metadata['imgFile'].values) if im in selected]

    return metadata.iloc[index]