def extract_from_raw(yymmdd, hh=None, taxi_id=None):
    """Copy one taxi's raw log records for a single hour into a test CSV.

    Scans the monthly raw log ``logs-<yymm>-normal.csv`` under
    ``TAXI_RAW_DATA_HOME`` and writes every record of ``taxi_id`` whose local
    timestamp has the same day-of-month and hour as ``yymmdd`` + ``hh`` to
    ``<test_dpath>/log-<yymmdd>-<hh>-<taxi_id>.csv``. Each record is tagged
    with the name of the first polygon from ``get_ap_polygons()`` that
    includes its (lng, lat) point, or ``'X'`` when none does.

    Args:
        yymmdd: Date string in ``%y%m%d`` form.
        hh: Hour string parsed with ``%H``. Despite the ``None`` default a
            string is required, because it is concatenated with ``yymmdd``.
        taxi_id: Vehicle id, an int or anything ``int()`` accepts. Despite
            the ``None`` default a value is required.

    Raises:
        TypeError, ValueError: if ``hh`` / ``taxi_id`` are missing or
            malformed.
    """
    if not isinstance(taxi_id, int):
        taxi_id = int(taxi_id)
    #
    logging_fpath = opath.join(test_dpath, '_test.txt')
    ofpath = opath.join(test_dpath,
                        'log-%s-%s-%s.csv' % (yymmdd, hh, taxi_id))
    logging(logging_fpath, 'handle the file; %s' % ofpath)
    # The output file is opened once and kept open for the whole scan instead
    # of being reopened in append mode for every matching row.
    with open(ofpath, 'w') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['time', 'taxi_id', 'driver_id', 'state',
                         'lng', 'lat', 'apBasePos'])
        #
        target_dt = datetime.strptime(yymmdd + hh, '%y%m%d%H')
        next_dt = target_dt + timedelta(hours=1)
        yymm = yymmdd[:len('yymm')]
        yy, mm = yymm[:len('yy')], yymm[len('yy'):]
        yyyy = '20%s' % yy
        #
        log_fpath = opath.join(TAXI_RAW_DATA_HOME, yyyy, mm, 'logs',
                               'logs-%s-normal.csv' % yymm)
        ap_polygons = get_ap_polygons()
        with open(log_fpath) as r_csvfile:
            for row in csv.DictReader(r_csvfile):
                # NOTE(security): eval() executes arbitrary expressions found
                # in the CSV; only safe while the raw logs are trusted.
                t, vid, did, state = map(
                    eval,
                    [row[cn] for cn in
                     ('time', 'vehicle-id', 'driver-id', 'state')])
                if vid != taxi_id:
                    continue
                cur_dt = datetime.fromtimestamp(t)
                if cur_dt.day != target_dt.day:
                    continue
                if cur_dt.hour == next_dt.hour:
                    # First record of the following hour: stop scanning.
                    logging(logging_fpath, 'next period; %s' % ofpath)
                    break
                if cur_dt.hour != target_dt.hour:
                    continue
                #
                lng, lat = map(
                    eval, [row[cn] for cn in ('longitude', 'latitude')])
                apBasePos = next(
                    (ap_polygon.name for ap_polygon in ap_polygons
                     if ap_polygon.is_including((lng, lat))),
                    'X')
                writer.writerow([t, vid, did, state, lng, lat, apBasePos])
def run(target_months):
    """Split each month's days across worker processes and run them.

    For every ``yymm`` string the days of that month are distributed over
    ``NUM_WORKERS`` contiguous buckets; one ``multiprocessing.Process``
    running ``process_dates(wid, dts, logging_fpath)`` is started per
    non-empty bucket, and all of them are joined before the next month is
    handled.

    Args:
        target_months: Iterable of month strings in ``%y%m`` form.
    """
    for yymm in target_months:
        logging_fpath = opath.join(lf_dpath, 'a1_%s.txt' % yymm)
        logging(logging_fpath, 'Start handling; %s' % yymm)
        #
        yymm_dt = datetime.strptime(yymm, '%y%m')
        _, numDays = monthrange(yymm_dt.year, yymm_dt.month)
        worker_dts = [[] for _ in range(NUM_WORKERS)]
        for day in range(1, numDays + 1):
            # Integer arithmetic: the float form int((day - 1) / numDays * N)
            # collapses to bucket 0 under Python 2 integer division and can
            # round just below an exact bucket boundary.
            bucket = (day - 1) * NUM_WORKERS // numDays
            worker_dts[bucket].append(
                datetime(yymm_dt.year, yymm_dt.month, day))
        #
        ps = []
        for wid, dts in enumerate(worker_dts):
            if not dts:
                # More workers than days: an empty bucket has no work, and
                # process_dates indexes dts[0].
                continue
            p = multiprocessing.Process(target=process_dates,
                                        args=(wid, dts, logging_fpath))
            ps.append(p)
            p.start()
        for p in ps:
            p.join()
def process_dates(wid, dts, logging_fpath):
    """Split the raw monthly log into per-hour CSV files for a range of days.

    Reads ``logs-<yymm>-normal.csv`` for the month of ``dts[0]`` and, for
    every record dated between ``dts[0]`` and ``dts[-1]`` (inclusive), appends
    a row to ``<log_dpath>/log-<YYYYmmddHH>.csv``. A new hourly file is
    started (truncating any existing one) whenever the hour of the current
    record differs from the previous record's hour. Each row is tagged with
    the name of the first polygon from ``get_ap_polygons()`` that includes
    its (lng, lat) point, or ``'X'`` when none does.

    Args:
        wid: Worker index, used only in log messages.
        dts: Ordered dates (objects with ``year``/``month``/``day`` and
            ``strftime``) assigned to this worker; only the first and last
            are used as range bounds.
        logging_fpath: Path of the progress log file.
    """
    logging(logging_fpath, 'Start worker %d' % wid)
    if not dts:
        # No days assigned to this worker; dts[0] below would raise.
        logging(logging_fpath, 'Worker %d: end processing' % wid)
        return
    first_dt, last_dt = dts[0], dts[-1]
    # Compare full (year, month, day) keys: a day-of-month comparison
    # misclassifies records that fall in an adjacent month.
    first_key = (first_dt.year, first_dt.month, first_dt.day)
    last_key = (last_dt.year, last_dt.month, last_dt.day)
    logging(logging_fpath,
            'Worker %d: handling %s' % (wid, str(first_dt)))
    #
    log_fpath = opath.join(TAXI_RAW_DATA_HOME,
                           first_dt.strftime('%Y'), first_dt.strftime('%m'),
                           'logs',
                           'logs-%s-normal.csv' % first_dt.strftime('%y%m'))
    ap_polygons = get_ap_polygons()
    handling_day, handling_hour = -1, -1
    # The current hourly file stays open until the hour changes instead of
    # being reopened in append mode for every row.
    out_file, writer = None, None
    try:
        with open(log_fpath) as r_csvfile:
            for row in csv.DictReader(r_csvfile):
                # NOTE(security): eval() executes arbitrary expressions found
                # in the CSV; only safe while the raw logs are trusted.
                t, vid, did, state = map(
                    eval,
                    [row[cn] for cn in
                     ('time', 'vehicle-id', 'driver-id', 'state')])
                cur_dt = datetime.fromtimestamp(t)
                if cur_dt.day != handling_day:
                    handling_day = cur_dt.day
                    logging(logging_fpath,
                            'Worker %d: handling %dth day'
                            % (wid, cur_dt.day))
                    # Force a new hourly file on the first in-range record
                    # of the new day.
                    handling_hour = -1
                cur_key = (cur_dt.year, cur_dt.month, cur_dt.day)
                if cur_key < first_key:
                    continue
                if cur_key > last_key:
                    logging(logging_fpath,
                            'Worker %d: end processing' % wid)
                    break
                #
                if cur_dt.hour != handling_hour:
                    handling_hour = cur_dt.hour
                    if out_file is not None:
                        out_file.close()
                    ofpath = opath.join(
                        log_dpath,
                        'log-%s.csv' % cur_dt.strftime('%Y%m%d%H'))
                    out_file = open(ofpath, 'w')
                    writer = csv.writer(out_file, lineterminator='\n')
                    writer.writerow(['time', 'taxi_id', 'driver_id', 'state',
                                     'lng', 'lat', 'apBasePos'])
                #
                lng, lat = map(
                    eval, [row[cn] for cn in ('longitude', 'latitude')])
                apBasePos = next(
                    (ap_polygon.name for ap_polygon in ap_polygons
                     if ap_polygon.is_including((lng, lat))),
                    'X')
                writer.writerow([t, vid, did, state, lng, lat, apBasePos])
    finally:
        if out_file is not None:
            out_file.close()
def run(yymm):
    """Build the monthly trip CSV ``trip-<yymm>.csv`` from raw trip and log files.

    Walks ``trips-<yymm>-normal.csv`` and ``trips-<yymm>-normal-ext.csv`` in
    lockstep. For each trip, log records are consumed from
    ``logs-<yymm>-normal.csv`` (at least one per trip) until a record with
    ``time >= start-time`` is read, updating a per-vehicle ``vehicle`` object
    along the way. A trip is written only if its vehicle has been seen in the
    log; its ``time_first_free`` value is that vehicle's
    ``firstFreeStateTime``, after which the vehicle is ``reset()``. Start and
    end points are labelled with the name of the first polygon from
    ``get_ap_polygons()`` including them, or ``'X'``.

    Nothing is done when the output file already exists.

    Args:
        yymm: Month string in ``yymm`` form.

    Returns:
        None.

    Raises:
        Exception: any error raised while processing is logged with its
            traceback and re-raised.
    """
    logging_fpath = opath.join(lf_dpath, 'a2_%s.txt' % yymm)
    ofpath = opath.join(trip_dpath, 'trip-%s.csv' % yymm)
    #
    logging(logging_fpath, 'handle the file; %s' % yymm)
    if opath.exists(ofpath):
        logging(logging_fpath,
                'The file had already been processed; %s' % yymm)
        return None
    yy, mm = yymm[:2], yymm[-2:]
    yyyy = '20%s' % yy
    # Rows are written longitude-first, so the header names follow that
    # order; previously the header labelled these columns latitude-first.
    new_header = [
        'year', 'month', 'day', 'dow', 'hour',
        'taxi_id', 'driver_id', 'fare',
        'start_longitude', 'start_latitude', 'start_time',
        'end_longitude', 'end_latitude', 'end_time',
        'start_loc', 'end_loc', 'time_first_free',
    ]
    trips_dpath = [TAXI_RAW_DATA_HOME, yyyy, mm, 'trips']
    normal_fpath = opath.join(*(trips_dpath
                                + ['trips-%s-normal.csv' % yymm]))
    ext_fpath = opath.join(*(trips_dpath
                             + ['trips-%s-normal-ext.csv' % yymm]))
    log_fpath = opath.join(TAXI_RAW_DATA_HOME, yyyy, mm, 'logs',
                           'logs-%s-normal.csv' % yymm)
    # The output file is opened once instead of once per written row.
    with open(ofpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(new_header)
        #
        handling_day = -1
        try:
            year, month = int(yyyy), int(mm)
            ap_polygons = get_ap_polygons()
            vehicles = {}
            log_exhausted = False
            with open(normal_fpath) as tripFileN, \
                    open(ext_fpath) as tripFileE, \
                    open(log_fpath) as logFile:
                tripReaderN = csv.DictReader(tripFileN)
                tripReaderE = csv.DictReader(tripFileE)
                logReader = csv.DictReader(logFile)
                for rowN in tripReaderN:
                    # The ext file is expected to have one row per trip row.
                    rowE = next(tripReaderE)
                    sDay, sDow, sHour = map(
                        int,
                        [rowN[cn] for cn in
                         ('start-day', 'start-dow', 'start-hour')])
                    if handling_day != sDay:
                        logging(logging_fpath, 'handle day %d' % sDay)
                        handling_day = sDay
                    #
                    # NOTE(security): eval() executes arbitrary expressions
                    # found in the CSV; only safe for trusted raw data.
                    tripTime = eval(rowN['start-time'])
                    # Advance the log up to the trip's start time. At least
                    # one log record is consumed per trip.
                    while not log_exhausted:
                        rowL = next(logReader, None)
                        if rowL is None:
                            # The log ended before the trips did; keep
                            # processing the remaining trips with the state
                            # seen so far instead of raising StopIteration.
                            log_exhausted = True
                            logging(logging_fpath,
                                    'log file exhausted; %s' % yymm)
                            break
                        logTime = eval(rowL['time'])
                        vidL, state = map(
                            int,
                            [rowL[cn] for cn in ('vehicle-id', 'state')])
                        if vidL not in vehicles:
                            vehicles[vidL] = vehicle(vidL, logTime, state)
                        else:
                            vehicles[vidL].update(logTime, state)
                        if tripTime <= logTime:
                            break
                    taxiID = int(rowN['vehicle-id'])
                    if taxiID not in vehicles:
                        continue
                    driverID = int(rowE['driver-id'])
                    startTime, endTime = map(
                        eval,
                        [rowN[cn] for cn in ('start-time', 'end-time')])
                    timeFirstFree = vehicles[taxiID].firstFreeStateTime
                    vehicles[taxiID].reset()
                    #
                    startLng, startLat = map(
                        eval,
                        [rowN[cn] for cn in ('start-long', 'start-lat')])
                    endLng, endLat = map(
                        eval,
                        [rowN[cn] for cn in ('end-long', 'end-lat')])
                    startLoc, endLoc = 'X', 'X'
                    for ap_polygon in ap_polygons:
                        if startLoc == 'X' and ap_polygon.is_including(
                                (startLng, startLat)):
                            startLoc = ap_polygon.name
                        if endLoc == 'X' and ap_polygon.is_including(
                                (endLng, endLat)):
                            endLoc = ap_polygon.name
                        if startLoc != 'X' and endLoc != 'X':
                            break
                    #
                    writer.writerow([
                        year, month, sDay, sDow, sHour,
                        taxiID, driverID, rowN['fare'],
                        startLng, startLat, startTime,
                        endLng, endLat, endTime,
                        startLoc, endLoc, timeFirstFree])
        except Exception:
            logging(logging_fpath, format_exc())
            raise
def run(yymm):
    """Split a month's raw trip records into per-hour trip CSV files.

    Walks ``trips-<yymm>-normal.csv`` and ``trips-<yymm>-normal-ext.csv`` in
    lockstep and writes each trip to
    ``<trip_dpath>/trip-<YYYYmmddHH>.csv``, keyed by the local time of its
    ``start-time``. A new hourly file is started (truncating any existing
    one) whenever the day or hour of the current trip differs from the
    previous trip's. Start and end points are labelled with the name of the
    first polygon from ``get_ap_polygons()`` including them, or ``'X'``.

    Args:
        yymm: Month string in ``%y%m`` form.
    """
    logging_fpath = opath.join(lf_dpath, 'a2_%s.txt' % yymm)
    logging(logging_fpath, 'Start handling; %s' % yymm)
    #
    yymm_dt = datetime.strptime(yymm, '%y%m')
    yyyy, mm = yymm_dt.strftime('%Y'), yymm_dt.strftime('%m')
    #
    normal_fpath = opath.join(TAXI_RAW_DATA_HOME, yyyy, mm, 'trips',
                              'trips-%s-normal.csv' % yymm)
    ext_fpath = opath.join(TAXI_RAW_DATA_HOME, yyyy, mm, 'trips',
                           'trips-%s-normal-ext.csv' % yymm)
    #
    year, month = yymm_dt.year, yymm_dt.month
    ap_polygons = get_ap_polygons()
    new_header = [
        'year', 'month', 'day', 'dow', 'hour',
        'taxi_id', 'driver_id', 'fare',
        'start_longitude', 'start_latitude', 'start_time',
        'end_longitude', 'end_latitude', 'end_time',
        'start_loc', 'end_loc',
    ]
    handling_day, handling_hour = -1, -1
    # The current hourly file stays open until the hour changes instead of
    # being reopened in append mode for every trip.
    out_file, writer = None, None
    try:
        with open(normal_fpath) as tripFileN, open(ext_fpath) as tripFileE:
            tripReaderN = csv.DictReader(tripFileN)
            tripReaderE = csv.DictReader(tripFileE)
            for rowN in tripReaderN:
                # The ext file is expected to have one row per trip row.
                rowE = next(tripReaderE)
                #
                # NOTE(security): eval() executes arbitrary expressions found
                # in the CSV; only safe for trusted raw data.
                startTime, endTime = map(
                    eval, [rowN[cn] for cn in ('start-time', 'end-time')])
                cur_dt = datetime.fromtimestamp(startTime)
                if cur_dt.day != handling_day:
                    handling_day = cur_dt.day
                    logging(logging_fpath, 'handle day %d' % handling_day)
                    # Force a new hourly file on a day change.
                    handling_hour = -1
                if cur_dt.hour != handling_hour:
                    handling_hour = cur_dt.hour
                    if out_file is not None:
                        out_file.close()
                    ofpath = opath.join(
                        trip_dpath,
                        'trip-%s.csv' % cur_dt.strftime('%Y%m%d%H'))
                    out_file = open(ofpath, 'wt')
                    writer = csv.writer(out_file, lineterminator='\n')
                    writer.writerow(new_header)
                sDay, sDow, sHour = map(
                    int,
                    [rowN[cn] for cn in
                     ('start-day', 'start-dow', 'start-hour')])
                taxiID = int(rowN['vehicle-id'])
                driverID = int(rowE['driver-id'])
                #
                startLng, startLat = map(
                    eval, [rowN[cn] for cn in ('start-long', 'start-lat')])
                endLng, endLat = map(
                    eval, [rowN[cn] for cn in ('end-long', 'end-lat')])
                startLoc, endLoc = 'X', 'X'
                for ap_polygon in ap_polygons:
                    if startLoc == 'X' and ap_polygon.is_including(
                            (startLng, startLat)):
                        startLoc = ap_polygon.name
                    if endLoc == 'X' and ap_polygon.is_including(
                            (endLng, endLat)):
                        endLoc = ap_polygon.name
                    if startLoc != 'X' and endLoc != 'X':
                        break
                #
                writer.writerow([
                    year, month, sDay, sDow, sHour,
                    taxiID, driverID, rowN['fare'],
                    startLng, startLat, startTime,
                    endLng, endLat, endTime,
                    startLoc, endLoc])
    finally:
        if out_file is not None:
            out_file.close()
def run(prefix=None):
    """Write one ``apDayTrip-<YYYYmmdd>.csv`` per target date.

    For every date from ``get_target_dates(prefix)``, reads
    ``dayTrip-<YYYYmmdd>.csv`` and, for each trip whose taxi has a trajectory
    from ``get_trajectory(dt)``, derives:

    * ``time_enter_airport`` / ``time_exit_airport`` from the trajectory
      points between the previous drop-off, the trip start and the trip end
      (a location label of ``'X'`` is treated as "not at an airport" --
      presumably the label assigned when no airport polygon matches);
    * ``time_first_free``: the timestamp of the first point between the
      previous drop-off and the trip start whose state equals ``FREE``.

    Trips for which any of these cannot be determined are skipped; most skips
    are reported in the log file.

    Args:
        prefix: Passed through to ``get_target_dates``.
    """
    logging_fpath = opath.join(lf_dpath, 'a3.txt')
    logging(logging_fpath, 'Start handling')
    # Columns copied verbatim from the input row. The same list drives both
    # the header and the row, so labels and values cannot drift apart
    # (previously the header was longitude-first while the row was built
    # latitude-first).
    copied_cols = [
        'year', 'month', 'day', 'dow', 'hour',
        'taxi_id', 'driver_id', 'fare',
        'previous_dropoff_longitude', 'previous_dropoff_latitude',
        'start_longitude', 'start_latitude',
        'end_longitude', 'end_latitude',
        'previous_dropoff_loc', 'start_loc', 'end_loc']
    time_cols = [
        'time_previous_dropoff', 'time_enter_airport', 'start_time',
        'time_exit_airport', 'end_time', 'time_first_free']
    #
    for dt in get_target_dates(prefix):
        ymd = dt.strftime('%Y%m%d')
        ofpath = opath.join(adt_dpath, 'apDayTrip-%s.csv' % ymd)
        # The output file is opened once per date instead of once per row.
        with open(ofpath, 'w') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(copied_cols + time_cols)
            #
            vid_ts, vid_traj0 = get_trajectory(dt)
            ifpath = opath.join(dt_dpath, 'dayTrip-%s.csv' % ymd)
            with open(ifpath) as r_csvfile:
                for row in csv.DictReader(r_csvfile):
                    vid = int(row['taxi_id'])
                    if vid not in vid_ts:
                        continue
                    prevLoc, sLoc, eLoc = [
                        row[cn] for cn in
                        ('previous_dropoff_loc', 'start_loc', 'end_loc')]
                    # NOTE(security): eval() executes arbitrary expressions
                    # found in the CSV; only safe for trusted input files.
                    tPrev, tStart, tEnd = [
                        eval(row[cn]) for cn in
                        ('time_previous_dropoff', 'start_time', 'end_time')]
                    stamps = vid_ts[vid]
                    i = bisect(stamps, tPrev)
                    j = bisect(stamps, tStart)
                    k = bisect(stamps, tEnd)
                    # Trajectory points between previous drop-off and pickup,
                    # and between pickup and drop-off.
                    before_pickup = vid_traj0[vid][i:j]
                    during_trip = vid_traj0[vid][j:k]
                    #
                    tEnter, tExit = -1, -1
                    if prevLoc == 'X' and sLoc == 'X':
                        # Both ends are outside: look for a complete
                        # enter-then-exit visit before the pickup.
                        for ts, _state, loc in before_pickup:
                            if loc != 'X':
                                if tEnter == -1:
                                    tEnter = ts
                            elif tEnter != -1:
                                tExit = ts
                                break
                        else:
                            # No complete visit: skip without logging.
                            continue
                    elif prevLoc == 'X':
                        tEnter = next(
                            (ts for ts, _state, loc in before_pickup
                             if loc != 'X'), -1)
                        if eLoc == 'X':
                            tExit = next(
                                (ts for ts, _state, loc in during_trip
                                 if loc == 'X'), -1)
                        else:
                            tExit = tEnd
                    else:
                        # Previous drop-off was not at 'X', so the entry time
                        # is taken to be the previous drop-off time.
                        tEnter = tPrev
                        if sLoc == 'X':
                            tExit = next(
                                (ts for ts, _state, loc in before_pickup
                                 if loc == 'X'), -1)
                        elif eLoc == 'X':
                            tExit = next(
                                (ts for ts, _state, loc in during_trip
                                 if loc == 'X'), -1)
                        else:
                            tExit = tEnd
                    if tEnter == -1 or tExit == -1:
                        logging(logging_fpath,
                                'Cannot find tEnter or tExit; %d, %d\n%s'
                                % (tEnter, tExit, str(row)))
                        continue
                    #
                    time_ff = next(
                        (ts for ts, state, _loc in before_pickup
                         if state == FREE), -1)
                    if time_ff == -1:
                        logging(logging_fpath,
                                'Cannot find time_ff; \n%s' % str(row))
                        continue
                    #
                    new_row = [row[cn] for cn in copied_cols]
                    new_row += [tPrev, tEnter, tStart, tExit, tEnd, time_ff]
                    writer.writerow(new_row)
def run(yymm):
    """Build per-day ``ap-dayTrip`` CSV files from the monthly trip file.

    Reads ``trip-<yymm>.csv`` row by row. Rows with ``driver_id == -1``, rows
    of day 1 up to hour ``AM5``, and rows with an hour in ``[AM2, AM5]`` are
    ignored. The first row of a new day whose hour is ``AM5 + 1`` starts that
    day: per-vehicle trajectories are loaded from
    ``ap-dayLog-<yymm><dd>.csv`` and a fresh ``ap-dayTrip-<yymm><dd>.csv`` is
    created with its header.

    For each subsequent trip of a vehicle that has a trajectory and a
    previously recorded drop-off, enter/exit times are obtained from the
    vehicle object and the trip is appended through ``add_row``; when both
    the previous drop-off and the trip start are labelled ``'X'`` the trip is
    appended only if ``find_eeTime_XAP`` reports a visit.

    Args:
        yymm: Month string, used to build input and output file names.

    Raises:
        Exception: any error raised while processing is logged with its
            traceback and re-raised.
    """
    logging_fpath = opath.join(lf_dpath, 'a4_%s.txt' % yymm)
    #
    logging(logging_fpath, 'handle the file; %s' % yymm)
    trip_fpath = opath.join(trip_dpath, 'trip-%s.csv' % yymm)
    handling_day = 0
    vid_lastLocTime, vehicles = {}, {}
    try:
        with open(trip_fpath) as tripFile:
            for row in csv.DictReader(tripFile):
                if int(row['driver_id']) == -1:
                    continue
                day, hour = int(row['day']), int(row['hour'])
                if day == 1 and hour <= AM5:
                    continue
                if AM2 <= hour <= AM5:
                    continue
                if day != handling_day and hour == AM5 + 1:
                    # Start of a new day: reset state, load that day's
                    # trajectories and create the day's output file.
                    handling_day = day
                    logging(logging_fpath,
                            'handling %dth day' % handling_day)
                    vid_lastLocTime, vehicles = {}, {}
                    log_fpath = opath.join(
                        apDL_dpath,
                        'ap-dayLog-%s%02d.csv' % (yymm, handling_day))
                    with open(log_fpath) as logFile:
                        for rowL in csv.DictReader(logFile):
                            # Key by taxi_id: trips below look vehicles up by
                            # their 'taxi_id', so keying by 'driver_id' (as
                            # before) mixed two different id spaces.
                            # NOTE(review): assumes the ap-dayLog files have
                            # a 'taxi_id' column -- confirm.
                            log_vid = int(rowL['taxi_id'])
                            if log_vid not in vehicles:
                                vehicles[log_vid] = vehicle(log_vid)
                            # NOTE(security): eval() on file content; only
                            # safe for trusted input files.
                            vehicles[log_vid].add_trajectory(
                                eval(rowL['time']), rowL['apBasePos'])
                    #
                    ofpath = opath.join(
                        apDT_dpath,
                        'ap-dayTrip-%s%02d.csv' % (yymm, handling_day))
                    with open(ofpath, 'wt') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        writer.writerow([
                            'year', 'month', 'day', 'dow', 'hour',
                            'taxi_id', 'driver_id', 'fare',
                            'start_latitude', 'start_longitude',
                            'start_time',
                            'end_latitude', 'end_longitude', 'end_time',
                            'start_loc', 'end_loc', 'time_first_free',
                            'time_previous_dropoff',
                            'time_enter_airport', 'time_exit_airport',
                            'previous_dropoff_latitude',
                            'previous_dropoff_longitude',
                            'previous_dropoff_loc',
                        ])
                vid = int(row['taxi_id'])
                if vid not in vehicles:
                    # Also covers rows seen before any day was started, when
                    # no output file exists yet.
                    continue
                sLoc, eLoc = row['start_loc'], row['end_loc']
                sTime, eTime = map(
                    eval, [row[cn] for cn in ('start_time', 'end_time')])
                eLat, eLng = map(
                    eval,
                    [row[cn] for cn in ('end_latitude', 'end_longitude')])
                if vid not in vid_lastLocTime:
                    # First trip of the day for this vehicle: only remember
                    # its drop-off.
                    vid_lastLocTime[vid] = (eLat, eLng, eLoc, eTime)
                    continue
                (latPrevDropoff, lngPrevDropoff,
                 locPrevDropoff, tPrevDropoff) = vid_lastLocTime[vid]
                if locPrevDropoff == 'X' and sLoc == 'X':
                    visitAP, tEnter, tExit = vehicles[vid].find_eeTime_XAP(
                        tPrevDropoff, sTime)
                else:
                    tEnter, tExit = vehicles[vid].find_eeTime_AP(sTime, sLoc)
                    visitAP = True
                if visitAP:
                    newInfo = [tPrevDropoff, tEnter, tExit,
                               latPrevDropoff, lngPrevDropoff,
                               locPrevDropoff]
                    add_row(ofpath, row, newInfo)
                vid_lastLocTime[vid] = (eLat, eLng, eLoc, eTime)
    except Exception:
        logging(logging_fpath, format_exc())
        raise