Example #1
	def __init__(self,
		local, stop_ind,
		mode, bsize,
		lag=6,
		stops=5,
		local_split=0.8, # the most recent 20% is held out for testing
		meta_path=None,
		min_stride=1,
		smooth=False,
		norm=10,
		diff=None,
		device=None, verbose=True):

		super().__init__(
			local,
			mode, bsize,
			lag,
			stops,
			local_split,
			meta_path,
			min_stride,
			smooth,
			norm,
			diff,
			device, verbose)

		self.refs = []
		split_ind = int(13248 * local_split)  # time-index split point; 13248 = total timesteps in the history
		for route in self.meta:
			last_t = 0
			for pair in route['trainable']:
				# split train/test by time index
				ti, si = pair
				if si != stop_ind:
					continue
				if (mode == 'train' and ti < split_ind) \
					or (mode == 'test' and ti >= split_ind):

					# enforce a minimum spacing between consecutive samples
					if ti >= last_t + min_stride:
						self.refs.append([route['name']] + pair)
						last_t = ti

		if verbose: print(' [*] Subset in Stop-%d: %d' % (stop_ind, len(self.refs)))

		if mode == 'train':
			npshuff(self.refs)
		self.ind = 0

		with open('data/stopcodes_sequence/%s.txt' % local) as fl:
			stop_codes = fl.read().split('\n')  # ordered stop codes along the route (renamed to avoid shadowing the `stops` argument)
		segid = '%s-%s' % (stop_codes[stop_ind], stop_codes[stop_ind+1])
		with open('data/avgspeeds-full-ts-xclude/%s/%s.csv' % (segid[0], segid)) as fl:
			lines = fl.read().split('\n')[1:]
		lines = filter(lambda ent: ent, lines)  # drop empty lines
		lines = map(lambda ent: ent.split(',')[1], lines)  # keep the speed column
		avgspeeds = [float(ln) if ln != '' else np.nan for ln in lines]  # NaN for missing entries
		self.avgdata = np.array(avgspeeds)
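
These snippets reference a few helpers that are not shown. As a minimal sketch, npshuff is assumed here to wrap NumPy's in-place shuffle, and split_refs is a hypothetical standalone restatement of the ref-selection loop above:

import numpy as np

def npshuff(ls):
	# Assumed: in-place shuffle; works on lists of refs as well as arrays.
	np.random.shuffle(ls)

def split_refs(trainable, mode, split_ind, min_stride=1, stop_ind=None):
	# Hypothetical restatement of the loop above: keep (ti, si) pairs on
	# the requested side of the time split, at least min_stride steps apart.
	refs, last_t = [], 0
	for ti, si in trainable:
		if stop_ind is not None and si != stop_ind:
			continue
		in_split = ti < split_ind if mode == 'train' else ti >= split_ind
		if in_split and ti >= last_t + min_stride:
			refs.append((ti, si))
			last_t = ti
	return refs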
Example #2
	def __init__(self,
		local,
		mode, bsize,
		lag=6,
		stops=5,
		local_split=0.8, # the most recent 20% is held out for testing
		meta_path=None,
		min_stride=1,
		# index_file='min-data.json',
		smooth=False,
		norm=10,
		diff=None,
		device=None, verbose=False):

		self.device = device
		self.bsize = bsize
		self.mode = mode
		self.smooth = smooth
		self.lag = lag
		self.stops = stops
		self.norm = norm
		self.diff = diff

		t0 = time()
		if meta_path is None:
			meta_path = 'metadata/%dh' % (lag // 6)  # lag counts 10-minute steps; 6 steps per hour
		fpath = '%s/%s.json' % (meta_path, local)
		with open(fpath) as fl:
			meta = [json.load(fl)]
		self.meta = meta

		if verbose: print('Locals dataset: %s (%s)' % (mode, fpath))
		if verbose: print(' [*] Loaded routes:', len(meta), '(%.2fs)' % (time() - t0))
		if verbose: print(' [*] Has trainable inds:', len(meta[0]['trainable']))

		self.refs = []
		split_ind = int(13248 * local_split)  # time-index split point; 13248 = total timesteps in the history
		for route in meta:
			last_t = 0
			for pair in route['trainable']:
				# split train/test by time index
				ti, si = pair
				if (mode == 'train' and ti < split_ind) \
					or (mode == 'test' and ti >= split_ind):

					if ti >= last_t + min_stride:
						self.refs.append([route['name']] + pair)
						last_t = ti

		assert len(meta)
		if verbose: print(' [*] Subset %s: %d' % (mode, len(self.refs)))

		if mode == 'train':
			npshuff(self.refs)
		self.ind = 0
		self.mat = np.load('data/history/%s.npy' % (local))
		self.maxval = 20
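
The refs built above index into self.mat, the full speed history loaded from data/history/<local>.npy. The batch-assembly method is not shown; a hypothetical sketch of how one ref might be cut into a training window:

import numpy as np

def cut_window(mat, ti, si, lag=6, stops=5):
	# Hypothetical: slice a lag-long history window ending at time index
	# ti, across `stops` consecutive segments starting at segment si.
	# mat has shape (timesteps, segments).
	return mat[ti - lag:ti, si:si + stops]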
Example #3
	def __init__(self,
			segments,
			mode, bsize,
			lag=6,
			data_path=PARSED_PATH,
			norm=(9, 10),
			shuffle=True,
			verbose=True,
		):
		super().__init__(
			segments,
			mode, bsize, lag=None,
			data_path=data_path,
			ignore_missing=True,
			split=0.8,
			preproc='s', smooth=1.2, res=10,
			post=None, clip_hours=8, norm=(0, 1),
			shuffle=False, verbose=verbose)

		def congestion(vs, clip=10):
			# smooth, then log-compress speeds above the clip threshold
			vs = blur1d(vs, sigma=1.5)
			vs[vs > clip] = np.log(vs[vs > clip] - clip + 1) + clip
			return vs
		# FIXME: using constfill
		ls = []
		for seg in self.data:
			mat = np.array([congestion(seg[:, si]) for si in range(seg.shape[1])])
			ls.append(mat.T)
		self.post_data = ls

		norm_mean, norm_scale = norm
		for ii in range(len(self.data)):
			self.data[ii] = (self.data[ii] - norm_mean) / norm_scale
			self.post_data[ii] = (self.post_data[ii] - norm_mean) / norm_scale


		self.lag = lag
		if lag is not None:
			self.data, nComplete, nTotal = self.chunks(self.data)
			self.post_data, _, _ = self.chunks(self.post_data)
			if verbose:
				print(' [*] No missing: %d/%d' % (nComplete, nTotal))

		if shuffle:
			inds = list(range(len(self.data)))
			npshuff(inds)
			self.data = [self.data[ii] for ii in inds]
			self.post_data = [self.post_data[ii] for ii in inds]
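
blur1d is not defined in these snippets; it is assumed here to be a 1-D Gaussian blur. A minimal sketch using scipy, with the congestion transform from above restated standalone:

import numpy as np
from scipy.ndimage import gaussian_filter1d

def blur1d(vs, sigma=1.5):
	# Assumed implementation: 1-D Gaussian smoothing.
	return gaussian_filter1d(vs, sigma=sigma)

def congestion(vs, clip=10):
	# Same transform as in Example #3: smooth, then log-compress
	# everything above the clip threshold.
	vs = blur1d(np.asarray(vs, dtype=float), sigma=1.5)
	vs[vs > clip] = np.log(vs[vs > clip] - clip + 1) + clip
	return vs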
Example #4
	def __init__(self, mode, bsize, minValid=0.7,
		lag=6,
		index_file='min-data_2h.json',
		reserved='reserved_routes.json',
		overlap=True,
		smooth=False,
		device=None):

		self.device = device
		self.bsize = bsize
		self.mode = mode
		self.smooth = smooth
		self.lag = lag

		t0 = time()
		with open(index_file) as fl:
			meta = json.load(fl)

		print('Routes dataset: %s' % mode)
		print(' [*] Loaded routes:', len(meta), '(%.2fs)' % (time() - t0))

		# Filter reserved routes
		with open(reserved) as fl:
			res_metas = json.load(fl)
		res_names = [entry['name'] for entry in res_metas]
		if mode == 'train':
			meta = [entry for entry in meta if entry['name'] not in res_names]
		else:
			meta = [entry for entry in meta if entry['name'] in res_names]
			if not overlap:
				# drop duplicate/overlapping windows from the test split
				for route in meta:
					route['trainable'] = dedupe(route['trainable'])
		assert len(meta)
		print(' [*] Subset %s: %d (%s)' % (mode, len(meta), reserved))
		self.meta = meta

		t0 = time()
		self.refs = []
		for route in meta:
			for pair in route['trainable']:
				self.refs.append([route['name']] + pair)
		print(' [*] Loaded trainable inds:', len(self.refs), '(%.2fs)' % (time() - t0))
		t0 = time()

		if mode == 'train':
			npshuff(self.refs)
		self.ind = 0
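
dedupe is not shown. Given that it only runs when overlap is disabled, a plausible sketch drops repeated (ti, si) entries while preserving order; the real helper may additionally thin out windows that overlap in time:

def dedupe(pairs):
	# Hypothetical sketch: remove duplicate (ti, si) entries, keeping
	# the first occurrence of each.
	seen, out = set(), []
	for ti, si in pairs:
		if (ti, si) not in seen:
			seen.add((ti, si))
			out.append([ti, si])
	return out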
Example #5
	def __init__(self,
			segments,
			mode, bsize,
			# lag=6,
			res=10,
			data_path=PARSED_PATH,
			preproc='s',
			split=0.8,

			# smooth=1.2,
			ignore_missing=True,

			# post=None,
			clip_hours=8,
			norm=(12, 10), # raw mean, scale
			shuffle=False,
			verbose=True,
		):

		self.segments = segments
		self.mode = mode
		self.bsize = bsize
		self.shuffle = shuffle
		# self.lag = lag
		self.norm = norm
		self.res = res
		self.clip_hours = clip_hours
		# self.post = post

		byday = {}
		byseg = []
		for segname in segments:
			smatch = '%s/%s%02d_%s_*.json' % (data_path, preproc, res, segname)
			dfiles = sorted(glob(smatch))
			if not dfiles:
				raise Exception('Missing: %s' % smatch)

			for dname in dfiles:
				day = dname.split('_')[-1].replace('.json', '')
				if day not in byday: byday[day] = []
				byday[day].append(dname)
			byseg.append(dfiles)

		all_avail = []
		for day in sorted(list(byday.keys())):
			gathered = byday[day]
			if len(gathered) < len(segments):
				continue
			all_avail.append([day, gathered])

		# gather the raw speeds per day
		for ii, (day, gathered) in enumerate(all_avail):
			vlists = []
			for seg_name_day in gathered:
				with open(seg_name_day) as fl:
					ls = json.load(fl)
				vlists.append(ls)
			all_avail[ii].append(vlists)

		# align the speeds
		vfill = None  # no custom fill is configured, so nanfill is used below
		self.rawdata = []
		self.trange = []
		self.calendar = [] # contains time encoding based on week cycle and day (24hr) cycle
		self.nancount = [[0,0] for _ in segments]
		for ii, (day, gathered, vlists) in enumerate(all_avail):
			t0 = s2d(vlists[0][0]['time'])
			tf = s2d(vlists[0][-1]['time'])

			for segvs in vlists:
				if s2d(segvs[0]['time']) > t0:
					t0 = s2d(segvs[0]['time'])
				if s2d(segvs[-1]['time']) < tf:
					tf = s2d(segvs[-1]['time'])

			if t0 > tf:
				# skip days with no parallel times
				continue

			dt = tf - t0
			tsteps = dt.seconds // (60 * res) + 1
			vmat = np.zeros((tsteps, len(vlists)))
			# print(t0, tf)

			for si, segvs in enumerate(vlists):
				# seek until t0 begins
				ind = 0
				while s2d(segvs[ind]['time']) < t0:
					ind += 1

				if vfill is not None:
					vs = np.array(vfill(segvs, res))
				else:
					vs = np.array(nanfill(segvs, res))
				# print(len(vs), ind, tsteps)

				# if smooth is not None:
				# 	vs = blur1d(vs, sigma=smooth)
				nmean, nscale = norm
				vs = (vs - nmean) / nscale
				vmat[:, si] = vs[ind:ind+tsteps]

			if self.clip_hours is not None:
				vmat = vmat[self.clip_hours*6:]  # 6 steps/hour assumes 10-minute resolution
				t0 += timedelta(seconds=self.clip_hours*60*60)
			self.trange.append((t0, tf))
			self.rawdata.append(vmat)
			for si in range(vmat.shape[1]):
				segvs = vmat[:, si]
				self.nancount[si][0] += np.isnan(segvs).sum()
				self.nancount[si][1] += len(segvs)

			midnight = t0.replace(hour=0, minute=0, second=0, microsecond=0)
			seconds_since = (t0 - midnight).total_seconds()
			times = []
			for step_i in range(vmat.shape[0]):
				seconds_enc = (seconds_since + step_i * 60 * 10) / (24 * 60 * 60)  # fraction of the day elapsed
				time_encoding = t0.weekday() / 6 + seconds_enc * 0.1  # weekday cycle plus scaled time of day
				times.append(time_encoding)
			self.calendar.append(np.array(times))
		self.data = self.rawdata

		tsplit = int(len(self.data) * split)
		self.data = self.data[:tsplit] if mode == 'train' else self.data[tsplit:]
		self.trange = self.trange[:tsplit] if mode == 'train' else self.trange[tsplit:]
		self.calendar = self.calendar[:tsplit] if mode == 'train' else self.calendar[tsplit:]

		if shuffle:
			npshuff(self.data)

		if verbose:
			avglen = lambda series: np.mean([len(seq) for seq in series])
			print('Full history')
			print(' [*] Segments: %d co-avail' % len(all_avail))
			for si, (segname, ls) in enumerate(zip(segments, byseg)):
				nanperc = self.nancount[si][0]/self.nancount[si][1] * 100
				print('    * [%s]: %d (%.1f%% nan)' % (
					segname,
					len(ls),
					nanperc))
			print(' [*] Examples (%s): %d' % (mode, len(self.data)))
			# if lag is not None:
			# 	print(' [*] No missing: %d/%d' % (nComplete,nTotal))
			tsteps = sorted(list(byday.keys()))
			print(' [*] Time range: %s ~ %s' % (tsteps[0], tsteps[-1]))
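
s2d and nanfill are also unshown. They are assumed here to parse record timestamps and to resample records onto a regular res-minute grid with NaN for gaps; the calendar encoding is restated standalone as well. The timestamp format and the 'speed' key are assumptions:

import numpy as np
from datetime import datetime

def s2d(s):
	# Assumed: parse a record timestamp (the format string is a guess).
	return datetime.strptime(s, '%Y-%m-%d %H:%M:%S')

def nanfill(segvs, res):
	# Assumed: place each record on a res-minute grid, NaN where missing.
	t0, tf = s2d(segvs[0]['time']), s2d(segvs[-1]['time'])
	steps = int((tf - t0).total_seconds()) // (60 * res) + 1
	vs = [np.nan] * steps
	for ent in segvs:
		ind = int((s2d(ent['time']) - t0).total_seconds()) // (60 * res)
		vs[ind] = ent['speed']
	return vs

def time_encoding(t0, step_i, res=10):
	# Same arithmetic as the calendar loop above (which hard-codes
	# res=10): weekday cycle plus a scaled fraction of the day.
	midnight = t0.replace(hour=0, minute=0, second=0, microsecond=0)
	seconds_since = (t0 - midnight).total_seconds()
	seconds_enc = (seconds_since + step_i * 60 * res) / (24 * 60 * 60)
	return t0.weekday() / 6 + seconds_enc * 0.1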
Example #6
	def __init__(self,
			segments,
			mode, bsize,
			lag=6,
			res=10,
			data_path=PARSED_PATH,
			preproc='s',
			split=0.8,

			smooth=1.2,
			ignore_missing=True,

			nanok=None,
			post=None,
			clip_hours=8,
			norm=(12, 10), # raw mean, scale
			shuffle=True,
			verbose=True,
		):

		self.segments = segments
		self.mode = mode
		self.bsize = bsize
		self.shuffle = shuffle
		self.lag = lag
		self.nanok = nanok
		self.norm = norm
		self.res = res
		self.clip_hours = clip_hours
		self.post = post

		byday = {}
		byseg = []
		for segname in segments:
			smatch = '%s/%s%02d_%s_*.json' % (data_path, preproc, res, segname)
			dfiles = sorted(glob(smatch))
			if not dfiles:
				raise Exception('Missing: %s' % smatch)

			for dname in dfiles:
				day = dname.split('_')[-1].replace('.json', '')
				if day not in byday: byday[day] = []
				byday[day].append(dname)
			byseg.append(dfiles)

		all_avail = []
		for day in sorted(list(byday.keys())):
			gathered = byday[day]
			if len(gathered) < len(segments):
				continue
			all_avail.append([day, gathered])

		# gather the raw speeds per day
		for ii, (day, gathered) in enumerate(all_avail):
			vlists = []
			for seg_name_day in gathered:
				with open(seg_name_day) as fl:
					ls = json.load(fl)
				vlists.append(ls)
			all_avail[ii].append(vlists)

		# align the speeds
		# keep NaNs when chunking will later filter incomplete windows; otherwise constant-fill
		tfill = nanfill if ignore_missing and lag is not None else constfill
		self.rawdata = []
		self.trange = []
		for ii, (day, gathered, vlists) in enumerate(all_avail):
			t0 = s2d(vlists[0][0]['time'])
			tf = s2d(vlists[0][-1]['time'])

			for segvs in vlists:
				if s2d(segvs[0]['time']) > t0:
					t0 = s2d(segvs[0]['time'])
				if s2d(segvs[-1]['time']) < tf:
					tf = s2d(segvs[-1]['time'])

			if t0 > tf:
				# skip days with no parallel times
				continue

			dt = tf - t0
			tsteps = dt.seconds // (60 * res) + 1
			vmat = np.zeros((tsteps, len(vlists)))
			# print(t0, tf)

			for si, segvs in enumerate(vlists):
				# seek until t0 begins
				ind = 0
				while s2d(segvs[ind]['time']) < t0:
					ind += 1

				vs = np.array(tfill(segvs, res))

				if smooth is not None:
					vs = blur1d(vs, sigma=smooth)
				nmean, nscale = norm
				vs = (vs - nmean) / nscale
				vmat[:, si] = vs[ind:ind+tsteps]
			self.trange.append((t0, tf))

			if self.clip_hours is not None:
				vmat = vmat[self.clip_hours*6:]  # 6 steps/hour assumes 10-minute resolution
			self.rawdata.append(vmat)
		self.data = self.rawdata

		tsplit = int(len(self.data) * split)
		self.data = self.data[:tsplit] if mode == 'train' else self.data[tsplit:]
		self.trange = self.trange[:tsplit] if mode == 'train' else self.trange[tsplit:]

		if lag is not None:
			self.raw_data = self.data
			self.data, nComplete, nTotal = self.chunks(self.data)

		if shuffle:
			npshuff(self.data)

		if verbose:
			avglen = lambda series: np.mean([len(seq) for seq in series])
			print('Full history' if lag is None else 'Chunks (lag %d)' % lag)
			# print(' [*] Files found:', len(dfiles))
			print(' [*] Segments: %d co-avail' % len(all_avail))
			for segname, ls in zip(segments, byseg):
				print('    * [%s]: %d' % (segname, len(ls)))
			print(' [*] Examples (%s): %d' % (mode, len(self.data)))
			if lag is not None:
				print(' [*] No missing: %d/%d' % (nComplete, nTotal))
			tsteps = sorted(list(byday.keys()))
			print(' [*] Time range: %s ~ %s' % (tsteps[0], tsteps[-1]))
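
self.chunks is not shown in any of the examples. Given how nComplete and nTotal are reported, a plausible sketch cuts each day-long (timesteps, segments) matrix into lag-long windows and keeps the ones with no missing values:

import numpy as np

def chunks(series, lag=6):
	# Hypothetical sketch of the unshown chunking step: non-overlapping
	# lag-long windows, counting how many are NaN-free.
	out, n_complete, n_total = [], 0, 0
	for mat in series:
		for t0 in range(0, mat.shape[0] - lag + 1, lag):
			window = mat[t0:t0 + lag]
			n_total += 1
			if not np.isnan(window).any():
				n_complete += 1
				out.append(window)
	return out, n_complete, n_total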