Example #1
0
def data_lists(mapdata, verbose = False):
    """Convert stacked data into a list of sequences.

    The stacked array ``mapdata.data`` is cut along its first axis into
    consecutive chunks whose sizes are given by ``mapdata.sequence_lengths``.

    Parameters
    ----------
    mapdata : Map
        Stacked data object. ``mapdata.data`` must expose ``shape`` and
        support slicing; ``mapdata.sequence_lengths`` may be a NumPy array
        or a plain sequence (list/tuple) of integer lengths.
    verbose : bool, optional
        Currently unused; kept for interface compatibility.

    Returns
    -------
    Map or None
        A new Map whose ``data`` is a list with one slice of
        ``mapdata.data`` per sequence. ``bin_width``, ``boundaries``,
        ``boundaries_fs`` and ``sequence_lengths`` are carried over by
        reference. If the input is not in a valid stacked format, a message
        is printed and ``None`` is returned.
    """

    lengths = mapdata.sequence_lengths
    if lengths is None:
        print('Error! Sequences do not appear to be in a valid stacked format!')
        return None

    # Accept plain lists/tuples as well as arrays: only array-likes have .sum().
    total_length = lengths.sum() if hasattr(lengths, 'sum') else sum(lengths)
    if total_length != mapdata.data.shape[0]:
        print('Data object does not appear to be in a valid stacked format!')
        return None

    listmap = Map()
    listmap.bin_width = mapdata.bin_width
    listmap.boundaries = mapdata.boundaries
    listmap.boundaries_fs = mapdata.boundaries_fs
    listmap.sequence_lengths = lengths
    listmap.data = []

    # Walk through the stacked rows, peeling off one sequence at a time.
    start = 0
    for seq_len in lengths:
        stop = start + seq_len
        listmap.data.append(mapdata.data[start:stop])
        start = stop

    return listmap
Example #2
0
def data_split(mapdata, tr=0.3, vl=0.3, ts=0.4, randomseed = None, verbose = False):
    """Split mapdata into train, val, and test sets.

    Whole sequences (not individual rows) are randomly assigned to one of
    the three sets.

    Parameters
    ----------
    mapdata : Map
        Data object, either stacked (``mapdata.data`` is a ``np.ndarray``)
        or not (``mapdata.data`` is a list of sequences).
        ``mapdata.sequence_lengths`` must hold one entry per sequence and
        ``mapdata.boundaries`` must support indexing with an integer array.
    tr, vl, ts : float, optional
        Relative proportions of the train, validation and test sets. They
        are normalized to sum to one, so only their ratios matter.
    randomseed : int or None, optional
        If given, seeds NumPy's *global* random number generator before the
        sequences are shuffled.
    verbose : bool, optional
        Print progress messages; also forwarded to ``data_stack``.

    Returns
    -------
    trmap, vlmap, tsmap : Map
        Train, validation and test maps. Each carries ``bin_width``,
        ``boundaries_fs``, the selected ``data`` and ``boundaries``, and the
        chosen sequence indices (``trmap.tridx``, ``vlmap.vlidx``,
        ``tsmap.tsidx``). If the input was stacked, each output is
        re-stacked with ``data_stack`` and gets ``sequence_lengths``.

    Raises
    ------
    ValueError
        If ``mapdata.data`` is an array but ``data_lists`` reports that it
        is not in a valid stacked format.
    """

    if randomseed is not None:
        np.random.seed(randomseed)

    # normalize tr, vl, and ts proportions:
    total = tr + vl + ts
    tr, vl, ts = tr / total, vl / total, ts / total

    trmap, vlmap, tsmap = Map(), Map(), Map()
    for submap in (trmap, vlmap, tsmap):
        submap.bin_width = mapdata.bin_width
        submap.boundaries_fs = mapdata.boundaries_fs

    num_sequences = np.array(mapdata.sequence_lengths).shape[0]
    if verbose:
        print('Splitting {} sequences into train, validation, and test sets...'.format(num_sequences))

    indices = np.random.permutation(num_sequences)

    # Train and validation sizes are rounded down; the test set receives
    # whatever remains, so every sequence lands in exactly one set.
    n_tr = int(np.floor(tr * num_sequences))
    n_vl = int(np.floor(vl * num_sequences))
    tridx = indices[:n_tr]
    vlidx = indices[n_tr:n_tr + n_vl]
    tsidx = indices[n_tr + n_vl:]

    was_stacked = isinstance(mapdata.data, np.ndarray)
    if was_stacked:
        lstdata = data_lists(mapdata)
        if lstdata is None:
            raise ValueError('mapdata is not in a valid stacked format; cannot split it.')
        sequences = lstdata.data
    else:
        sequences = mapdata.data

    # Pick sequences with plain list indexing. Wrapping a list of
    # variable-length sequences in np.array() builds a ragged array, which
    # recent NumPy versions reject with a ValueError.
    trmap.data = [sequences[i] for i in tridx]
    vlmap.data = [sequences[i] for i in vlidx]
    tsmap.data = [sequences[i] for i in tsidx]

    trmap.boundaries = mapdata.boundaries[tridx]
    vlmap.boundaries = mapdata.boundaries[vlidx]
    tsmap.boundaries = mapdata.boundaries[tsidx]

    # Remember which original sequences ended up in each set.
    trmap.tridx = tridx
    vlmap.vlidx = vlidx
    tsmap.tsidx = tsidx

    if was_stacked:
        # stack data if the original data was stacked:
        for submap in (trmap, vlmap, tsmap):
            restacked = data_stack(submap, verbose = verbose)
            submap.data = restacked.data
            submap.sequence_lengths = restacked.sequence_lengths
        if verbose:
            print('Stacked data split into train ({:.1f} %), validation ({:.1f} %) and test ({:.1f} %) sequences.'.format(tr*100,vl*100,ts*100))
    else:
        if verbose:
            print('List data split into train ({:.1f} %), validation ({:.1f} %) and test ({:.1f} %) sequences.'.format(tr*100,vl*100,ts*100))

    return trmap, vlmap, tsmap