def data_lists(mapdata, verbose = False):
    """Unstack a stacked data object into a list of per-sequence slices.

    ``mapdata.data`` is cut into consecutive row slices whose sizes are
    given by ``mapdata.sequence_lengths``. The ``bin_width``,
    ``boundaries``, ``boundaries_fs`` and ``sequence_lengths`` attributes
    are carried over unchanged onto a new ``Map``.

    Returns the new ``Map`` (its ``data`` is a list of slices), or ``None``
    after printing a message when ``sequence_lengths`` is missing or does
    not add up to the number of rows in ``data``.

    ``verbose`` is accepted but not used by this function.
    """
    seq_lengths = mapdata.sequence_lengths

    # Guard clauses: bail out early on anything that is not validly stacked.
    if seq_lengths is None:
        print('Error! Sequences do not appear to be in a valid stacked format!')
        return None
    if seq_lengths.sum() != mapdata.data.shape[0]:
        print('Data object does not appear to be in a valid stacked format!')
        return None

    listmap = Map()
    listmap.bin_width = mapdata.bin_width
    listmap.boundaries = mapdata.boundaries
    listmap.boundaries_fs = mapdata.boundaries_fs
    listmap.sequence_lengths = mapdata.sequence_lengths

    # Walk through the stacked rows, peeling off one sequence at a time.
    pieces = []
    start = 0
    for length in mapdata.sequence_lengths:
        stop = start + length
        pieces.append(mapdata.data[start:stop])
        start = stop
    listmap.data = pieces

    return listmap
def data_split(mapdata, tr=0.3, vl=0.3, ts=0.4, randomseed=None, verbose=False):
    """Split mapdata into train, validation, and test sets.

    Whole sequences (not individual rows) are randomly assigned to the
    three sets.

    Parameters
    ----------
    mapdata : Map
        Data object, either stacked (``data`` is an ``np.ndarray``, which
        is first unstacked with ``data_lists``) or not (``data`` is an
        indexable collection with one item per sequence).
        ``sequence_lengths`` must have one entry per sequence and
        ``boundaries`` must support indexing with an integer index array.
    tr, vl, ts : float
        Relative sizes of the train, validation and test sets. They are
        normalized by their sum, so they need not add up to 1.
    randomseed : optional
        If not None, passed to ``np.random.seed`` before shuffling. Note
        that this re-seeds NumPy's global random state.
    verbose : bool
        If True, print progress messages.

    Returns
    -------
    trmap, vlmap, tsmap : Map
        The train, validation and test subsets. Each carries the selected
        sequence indices as ``trmap.tridx``, ``vlmap.vlidx`` and
        ``tsmap.tsidx`` respectively. If the input was stacked, the outputs
        are re-stacked with ``data_stack``; otherwise ``data`` is a list
        holding the selected items of ``mapdata.data`` (the same objects,
        not copies).

    Raises
    ------
    ValueError
        If ``mapdata.data`` is an array but ``data_lists`` rejects it as
        not being in a valid stacked format.
    """
    if randomseed is not None:
        np.random.seed(randomseed)

    # normalize tr, vl, and ts proportions:
    total = tr + vl + ts
    tr = tr / total
    vl = vl / total
    ts = ts / total

    trmap = Map()
    vlmap = Map()
    tsmap = Map()
    for submap in (trmap, vlmap, tsmap):
        submap.bin_width = mapdata.bin_width
        submap.boundaries_fs = mapdata.boundaries_fs

    num_sequences = np.array(mapdata.sequence_lengths).shape[0]
    if verbose:
        print('Splitting {} sequences into train, validation, and test sets...'.format(num_sequences))

    indices = np.random.permutation(num_sequences)

    # Train and validation sizes are rounded down; the test set receives
    # whatever is left over, so every sequence lands in exactly one set.
    n_tr = int(np.floor(tr * num_sequences))
    n_vl = int(np.floor(vl * num_sequences))
    tridx = indices[np.arange(0, n_tr)]
    vlidx = indices[np.arange(n_tr, n_tr + n_vl)]
    tsidx = indices[np.arange(n_tr + n_vl, num_sequences)]

    is_stacked = isinstance(mapdata.data, np.ndarray)
    if is_stacked:
        lstdata = data_lists(mapdata)
        if lstdata is None:
            # data_lists has already printed the reason; fail with a clear
            # error instead of an AttributeError on None below.
            raise ValueError('mapdata is not in a valid stacked format and cannot be split')
        sequences = lstdata.data
    else:
        sequences = mapdata.data

    # Select sequences straight from the list instead of wrapping it in
    # np.array(...): sequences usually differ in length, and NumPy cannot
    # build a regular array from ragged input (recent versions raise).
    trmap.data = [sequences[i] for i in tridx]
    vlmap.data = [sequences[i] for i in vlidx]
    tsmap.data = [sequences[i] for i in tsidx]

    trmap.boundaries = mapdata.boundaries[tridx]
    vlmap.boundaries = mapdata.boundaries[vlidx]
    tsmap.boundaries = mapdata.boundaries[tsidx]

    # Remember which of the original sequences ended up in each set.
    trmap.tridx = tridx
    vlmap.vlidx = vlidx
    tsmap.tsidx = tsidx

    if is_stacked:
        # stack data if the original data was stacked:
        for submap in (trmap, vlmap, tsmap):
            stacked = data_stack(submap, verbose=verbose)
            submap.data = stacked.data
            submap.sequence_lengths = stacked.sequence_lengths
        if verbose:
            print('Stacked data split into train ({:.1f} %), validation ({:.1f} %) and test ({:.1f} %) sequences.'.format(tr*100,vl*100,ts*100))
    else:
        # NOTE(review): sequence_lengths is not set on the outputs in this
        # branch, so they keep whatever Map() initializes it to -- confirm
        # whether list-format splits should carry per-set lengths too.
        if verbose:
            print('List data split into train ({:.1f} %), validation ({:.1f} %) and test ({:.1f} %) sequences.'.format(tr*100,vl*100,ts*100))

    return trmap, vlmap, tsmap