def __init__(self, local, stop_ind, mode, bsize, lag=6, stops=5,
        local_split=0.8,  # most recent 0.2 is held out for testing
        meta_path=None, min_stride=1,
        smooth=False, norm=10, diff=None,
        device=None, verbose=True):

    super().__init__(
        local, mode, bsize, lag, stops, local_split,
        meta_path, min_stride, smooth, norm, diff, device, verbose)

    # collect refs for this stop only, split train/test by time index
    self.refs = []
    split_ind = int(13248 * local_split)
    for route in self.meta:
        last_t = 0
        for pair in route['trainable']:
            ti, si = pair
            if si != stop_ind:
                continue
            if (mode == 'train' and ti < split_ind) \
                    or (mode == 'test' and ti >= split_ind):
                # enforce a minimum stride between consecutive samples
                if ti >= last_t + min_stride:
                    self.refs.append([route['name']] + pair)
                    last_t = ti

    if verbose:
        print(' [*] Subset in Stop-%d: %d' % (stop_ind, len(self.refs)))

    if mode == 'train':
        npshuff(self.refs)
    self.ind = 0

    # derive the segment id from consecutive stop codes along the route
    with open('data/stopcodes_sequence/%s.txt' % local) as fl:
        stop_codes = fl.read().split('\n')
    segid = '%s-%s' % (stop_codes[stop_ind], stop_codes[stop_ind+1])

    # load the segment's average-speed series; blank entries become NaN
    with open('data/avgspeeds-full-ts-xclude/%s/%s.csv' % (segid[0], segid)) as fl:
        lines = fl.read().split('\n')[1:]
    lines = filter(lambda ent: ent, lines)
    lines = map(lambda ent: ent.split(',')[1], lines)
    avgspeeds = [float(ln) if ln != '' else np.nan for ln in lines]
    self.avgdata = np.array(avgspeeds)
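# Hedged sketch of the `npshuff` helper used throughout these loaders.
# Assumption: it is a thin wrapper over NumPy's in-place shuffle; the real
# implementation may differ (e.g. it could use a seeded RNG).
import numpy as np

def npshuff(ls):
    # shuffle the refs/chunks list in place, as done when mode == 'train'
    np.random.shuffle(ls)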
def __init__(self, local, mode, bsize, lag=6, stops=5,
        local_split=0.8,  # most recent 0.2 is held out for testing
        meta_path=None, min_stride=1,
        smooth=False, norm=10, diff=None,
        device=None, verbose=False):

    self.device = device
    self.bsize = bsize
    self.mode = mode
    self.smooth = smooth
    self.lag = lag
    self.stops = stops
    self.norm = norm
    self.diff = diff

    # load the per-local metadata (one route per file)
    t0 = time()
    if meta_path is None:
        meta_path = 'metadata/%dh' % int(lag / 6)
    fpath = '%s/%s.json' % (meta_path, local)
    with open(fpath) as fl:
        meta = [json.load(fl)]
    self.meta = meta
    if verbose:
        print('Locals dataset: %s (%s)' % (mode, fpath))
        print(' [*] Loaded routes:', len(meta), '(%.2fs)' % (time() - t0))
        print(' [*] Has trainable inds:', len(meta[0]['trainable']))

    # split train/test by time index, enforcing a minimum stride
    self.refs = []
    split_ind = int(13248 * local_split)
    for route in meta:
        last_t = 0
        for pair in route['trainable']:
            ti, si = pair
            if (mode == 'train' and ti < split_ind) \
                    or (mode == 'test' and ti >= split_ind):
                if ti >= last_t + min_stride:
                    self.refs.append([route['name']] + pair)
                    last_t = ti
    assert len(meta)
    if verbose:
        print(' [*] Subset %s: %d' % (mode, len(self.refs)))

    if mode == 'train':
        npshuff(self.refs)
    self.ind = 0

    self.mat = np.load('data/history/%s.npy' % local)
    self.maxval = 20
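# Illustrative shape of the per-local metadata JSON loaded above. The field
# names come from the accesses in the constructor; the values are made up.
example_meta = {
    'name': 'local-0001',                 # route identifier
    'trainable': [[120, 3], [126, 3]],    # [time_index, stop_index] pairs
}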
def __init__(self, segments, mode, bsize, lag=6,
        data_path=PARSED_PATH,
        norm=(9, 10),
        shuffle=True,
        verbose=True):

    super().__init__(
        segments, mode, bsize,
        lag=None,
        data_path=data_path,  # was hard-coded to PARSED_PATH, ignoring the argument
        ignore_missing=True,
        split=0.8,
        preproc='s', smooth=1.2, res=10,
        post=None,
        clip_hours=8,
        norm=(0, 1),  # parent leaves values unscaled; normalization happens below
        shuffle=False,
        verbose=verbose)

    def congestion(vs, clip=10):
        # compress speeds above `clip` onto a log scale to emphasize congestion
        vs = blur1d(vs, sigma=1.5)
        vs[vs > clip] = np.log(vs[vs > clip] - clip + 1) + clip
        return vs

    # FIXME: using constfill
    ls = []
    for seg in self.data:
        mat = np.array([congestion(seg[:, si]) for si in range(seg.shape[1])])
        ls.append(mat.T)
    self.post_data = ls

    # normalize raw and post-processed series with the same mean/scale
    norm_mean, norm_scale = norm
    for ii in range(len(self.data)):
        self.data[ii] = (self.data[ii] - norm_mean) / norm_scale
        self.post_data[ii] = (self.post_data[ii] - norm_mean) / norm_scale

    self.lag = lag
    if lag is not None:
        # slice full days into fixed-lag chunks
        self.data, nComplete, nTotal = self.chunks(self.data)
        self.post_data, _, _ = self.chunks(self.post_data)
        if verbose:
            print(' [*] No missing: %d/%d' % (nComplete, nTotal))

    if shuffle:
        # apply the same permutation to raw and post-processed chunks
        inds = list(range(len(self.data)))
        npshuff(inds)
        self.data = [self.data[ii] for ii in inds]
        self.post_data = [self.post_data[ii] for ii in inds]
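# Hedged sketch of `blur1d`, which congestion() and the smooth= option rely on.
# Assumption: it wraps scipy's 1-D Gaussian filter; note that NaN gaps in the
# raw series would propagate through this filter.
from scipy.ndimage import gaussian_filter1d

def blur1d(vs, sigma):
    return gaussian_filter1d(vs, sigma=sigma)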
def __init__(self, mode, bsize, minValid=0.7, lag=6,
        index_file='min-data_2h.json',
        reserved='reserved_routes.json',
        overlap=True,
        smooth=False,
        device=None):

    self.device = device
    self.bsize = bsize
    self.mode = mode
    self.smooth = smooth
    self.lag = lag

    t0 = time()
    with open(index_file) as fl:
        meta = json.load(fl)
    print('Routes dataset: %s' % mode)
    print(' [*] Loaded routes:', len(meta), '(%.2fs)' % (time() - t0))

    # filter reserved routes: these are held out entirely for testing
    with open(reserved) as fl:
        res_metas = json.load(fl)
    res_names = [entry['name'] for entry in res_metas]
    if mode == 'train':
        meta = [entry for entry in meta if entry['name'] not in res_names]
    else:
        meta = [entry for entry in meta if entry['name'] in res_names]

    if not overlap:
        # drop windows that share time steps
        for route in meta:
            route['trainable'] = dedupe(route['trainable'])

    assert len(meta)
    print(' [*] Subset %s: %d (%s)' % (mode, len(meta), reserved))
    self.meta = meta

    t0 = time()
    self.refs = []
    for route in meta:
        for pair in route['trainable']:
            self.refs.append([route['name']] + pair)
    print(' [*] Loaded trainable inds:', len(self.refs), '(%.2fs)' % (time() - t0))

    if mode == 'train':
        npshuff(self.refs)
    self.ind = 0
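# Hedged sketch of `dedupe`, used when overlap=False. Assumption: it thins
# overlapping [time_index, stop_index] windows in the same spirit as the
# min_stride filtering in the Locals loaders; the gap size here is a guess.
def dedupe(pairs, min_gap=6):
    kept, last_t = [], None
    for ti, si in sorted(pairs):
        if last_t is None or ti >= last_t + min_gap:
            kept.append([ti, si])
            last_t = ti
    return kept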
def __init__(self, segments, mode, bsize,
        # lag=6,
        res=10,
        data_path=PARSED_PATH,
        preproc='s',
        split=0.8,
        # smooth=1.2,
        ignore_missing=True,
        # post=None,
        clip_hours=8,
        norm=(12, 10),  # raw mean, scale
        shuffle=False,
        verbose=True):

    self.segments = segments
    self.mode = mode
    self.bsize = bsize
    self.shuffle = shuffle
    # self.lag = lag
    self.norm = norm
    self.res = res
    self.clip_hours = clip_hours
    # self.post = post

    # index available day-files by day and by segment
    byday = {}
    byseg = []
    for segname in segments:
        smatch = '%s/%s%02d_%s_*.json' % (data_path, preproc, res, segname)
        dfiles = sorted(glob(smatch))
        if not dfiles:
            raise Exception('Missing: %s' % smatch)
        for dname in dfiles:
            day = dname.split('_')[-1].replace('.json', '')
            if day not in byday:
                byday[day] = []
            byday[day].append(dname)
        byseg.append(dfiles)

    # keep only days where every requested segment has data
    all_avail = []
    for day in sorted(byday.keys()):
        gathered = byday[day]
        if len(gathered) < len(segments):
            continue
        all_avail.append([day, gathered])

    # gather the raw speeds per day
    for ii, (day, gathered) in enumerate(all_avail):
        vlists = []
        for seg_name_day in gathered:
            with open(seg_name_day) as fl:
                ls = json.load(fl)
            vlists.append(ls)
        all_avail[ii].append(vlists)

    # align the speeds across segments onto a shared time range
    vfill = None  # optional alternative fill (disabled; nanfill is used)
    self.rawdata = []
    self.trange = []
    self.calendar = []  # time encoding based on week cycle and day (24hr) cycle
    self.nancount = [[0, 0] for _ in segments]
    for ii, (day, gathered, vlists) in enumerate(all_avail):
        # intersect the time ranges of all segments for this day
        t0 = s2d(vlists[0][0]['time'])
        tf = s2d(vlists[0][-1]['time'])
        for segvs in vlists:
            if s2d(segvs[0]['time']) > t0:
                t0 = s2d(segvs[0]['time'])
            if s2d(segvs[-1]['time']) < tf:
                tf = s2d(segvs[-1]['time'])
        if t0 > tf:
            # skip days with no parallel times
            continue

        dt = tf - t0
        tsteps = dt.seconds // (60 * res) + 1
        vmat = np.zeros((tsteps, len(vlists)))
        for si, segvs in enumerate(vlists):
            # seek until t0 begins
            ind = 0
            while s2d(segvs[ind]['time']) < t0:
                ind += 1
            if vfill is not None:
                vs = np.array(vfill(segvs, res))
            else:
                vs = np.array(nanfill(segvs, res))
            nmean, nscale = norm
            vs = (vs - nmean) / nscale
            vmat[:, si] = vs[ind:ind+tsteps]

        if self.clip_hours is not None:
            # drop the first clip_hours of each day (6 steps/hour assumes res == 10)
            vmat = vmat[self.clip_hours*6:]
            t0 += timedelta(seconds=self.clip_hours*60*60)
        self.trange.append((t0, tf))
        self.rawdata.append(vmat)

        # track NaN counts per segment for reporting below
        for si in range(vmat.shape[1]):
            segvs = vmat[:, si]
            self.nancount[si][0] += np.isnan(segvs).sum()
            self.nancount[si][1] += len(segvs)

        # encode each step's position in the weekly and daily cycles
        midnight = t0.replace(hour=0, minute=0, second=0, microsecond=0)
        seconds_since = (t0 - midnight).total_seconds()
        times = []
        for step_i in range(vmat.shape[0]):
            # NOTE: the 10-minute step here assumes res == 10
            seconds_enc = (seconds_since + step_i * 60 * 10) / (24 * 60 * 60)
            time_encoding = t0.weekday() / 6 + seconds_enc * 0.1
            times.append(time_encoding)
        self.calendar.append(np.array(times))

    # chronological train/test split
    self.data = self.rawdata
    tsplit = int(len(self.data) * split)
    self.data = self.data[:tsplit] if mode == 'train' else self.data[tsplit:]
    self.trange = self.trange[:tsplit] if mode == 'train' else self.trange[tsplit:]
    self.calendar = self.calendar[:tsplit] if mode == 'train' else self.calendar[tsplit:]

    if shuffle:
        npshuff(self.data)

    if verbose:
        print('Full history')
        print(' [*] Segments: %d co-avail' % len(all_avail))
        for si, (segname, ls) in enumerate(zip(segments, byseg)):
            nanperc = self.nancount[si][0] / self.nancount[si][1] * 100
            print(' * [%s]: %d (%.1f%% nan)' % (segname, len(ls), nanperc))
        print(' [*] Examples (%s): %d' % (mode, len(self.data)))
        # if lag is not None:
        #     print(' [*] No missing: %d/%d' % (nComplete, nTotal))
        tsteps = sorted(byday.keys())
        print(' [*] Time range: %s ~ %s' % (tsteps[0], tsteps[-1]))
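# Hedged sketch of `s2d`, used to parse the 'time' strings in the per-day
# JSON records. Assumption: times are ISO-like 'YYYY-MM-DD HH:MM:SS' strings;
# the actual format in the parsed files may differ.
from datetime import datetime

def s2d(tstr):
    return datetime.strptime(tstr, '%Y-%m-%d %H:%M:%S')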
def __init__(self, segments, mode, bsize, lag=6,
        res=10,
        data_path=PARSED_PATH,
        preproc='s',
        split=0.8,
        smooth=1.2,
        ignore_missing=True,
        nanok=None,
        post=None,
        clip_hours=8,
        norm=(12, 10),  # raw mean, scale
        shuffle=True,
        verbose=True):

    self.segments = segments
    self.mode = mode
    self.bsize = bsize
    self.shuffle = shuffle
    self.lag = lag
    self.nanok = nanok
    self.norm = norm
    self.res = res
    self.clip_hours = clip_hours
    self.post = post

    # index available day-files by day and by segment
    byday = {}
    byseg = []
    for segname in segments:
        smatch = '%s/%s%02d_%s_*.json' % (data_path, preproc, res, segname)
        dfiles = sorted(glob(smatch))
        if not dfiles:
            raise Exception('Missing: %s' % smatch)
        for dname in dfiles:
            day = dname.split('_')[-1].replace('.json', '')
            if day not in byday:
                byday[day] = []
            byday[day].append(dname)
        byseg.append(dfiles)

    # keep only days where every requested segment has data
    all_avail = []
    for day in sorted(byday.keys()):
        gathered = byday[day]
        if len(gathered) < len(segments):
            continue
        all_avail.append([day, gathered])

    # gather the raw speeds per day
    for ii, (day, gathered) in enumerate(all_avail):
        vlists = []
        for seg_name_day in gathered:
            with open(seg_name_day) as fl:
                ls = json.load(fl)
            vlists.append(ls)
        all_avail[ii].append(vlists)

    # align the speeds: NaN-fill when chunking can filter incomplete windows,
    # otherwise fall back to a constant fill
    tfill = nanfill if ignore_missing and lag is not None else constfill
    self.rawdata = []
    self.trange = []
    for ii, (day, gathered, vlists) in enumerate(all_avail):
        # intersect the time ranges of all segments for this day
        t0 = s2d(vlists[0][0]['time'])
        tf = s2d(vlists[0][-1]['time'])
        for segvs in vlists:
            if s2d(segvs[0]['time']) > t0:
                t0 = s2d(segvs[0]['time'])
            if s2d(segvs[-1]['time']) < tf:
                tf = s2d(segvs[-1]['time'])
        if t0 > tf:
            # skip days with no parallel times
            continue

        dt = tf - t0
        tsteps = dt.seconds // (60 * res) + 1
        vmat = np.zeros((tsteps, len(vlists)))
        for si, segvs in enumerate(vlists):
            # seek until t0 begins
            ind = 0
            while s2d(segvs[ind]['time']) < t0:
                ind += 1
            vs = np.array(tfill(segvs, res))
            if smooth is not None:
                vs = blur1d(vs, sigma=smooth)
            nmean, nscale = norm
            vs = (vs - nmean) / nscale
            vmat[:, si] = vs[ind:ind+tsteps]

        self.trange.append((t0, tf))
        if self.clip_hours is not None:
            # drop the first clip_hours of each day (6 steps/hour assumes res == 10)
            vmat = vmat[self.clip_hours*6:]
        self.rawdata.append(vmat)

    # chronological train/test split
    self.data = self.rawdata
    tsplit = int(len(self.data) * split)
    self.data = self.data[:tsplit] if mode == 'train' else self.data[tsplit:]
    self.trange = self.trange[:tsplit] if mode == 'train' else self.trange[tsplit:]

    if lag is not None:
        # slice full days into fixed-lag chunks
        self.raw_data = self.data
        self.data, nComplete, nTotal = self.chunks(self.data)

    if shuffle:
        npshuff(self.data)

    if verbose:
        print('Full history' if lag is None else 'Chunks (lag %d)' % lag)
        print(' [*] Segments: %d co-avail' % len(all_avail))
        for segname, ls in zip(segments, byseg):
            print(' * [%s]: %d' % (segname, len(ls)))
        print(' [*] Examples (%s): %d' % (mode, len(self.data)))
        if lag is not None:
            print(' [*] No missing: %d/%d' % (nComplete, nTotal))
        tsteps = sorted(byday.keys())
        print(' [*] Time range: %s ~ %s' % (tsteps[0], tsteps[-1]))
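# Hedged sketch of the fill helpers selected via `tfill` above. Assumptions:
# each record carries a 'speed' value (the key name is a guess), and records
# land on a res-minute grid so the index arithmetic below works out cleanly.
import numpy as np

def nanfill(segvs, res):
    # resample one day's records onto a fixed res-minute grid,
    # leaving NaN where a grid step has no observation
    t0, tf = s2d(segvs[0]['time']), s2d(segvs[-1]['time'])
    steps = int((tf - t0).total_seconds()) // (60 * res) + 1
    vs = [np.nan] * steps
    for ent in segvs:
        ind = int((s2d(ent['time']) - t0).total_seconds()) // (60 * res)
        vs[ind] = ent['speed']
    return vs

def constfill(segvs, res, fill=0.0):
    # same grid, but missing steps get a constant instead of NaN
    return [fill if np.isnan(v) else v for v in nanfill(segvs, res)]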