def process_file(yymm):
    # Build the AP/NS crossing-time pickles for one month (yymm, e.g. '0902').
    # Skipped entirely when both output pickles already exist.
    ap_pkl_fpath = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix, yymm)
    ns_pkl_fpath = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix, yymm)
    if check_path_exist(ap_pkl_fpath) and check_path_exist(ns_pkl_fpath):
        return None
    print 'handle the file; %s' % yymm
    veh_ap_crossing_time, veh_last_log_ap_or_not = {}, {}
    veh_ns_crossing_time, veh_last_log_ns_or_not = {}, {}
    if yymm not in ['0901', '1001', '1011']:
        # Seed state from the previous month's last-day log so crossings that
        # span a month boundary are not lost.  '0901', '1001' and '1011' have
        # no usable previous-month file and start from empty state.
        path_to_last_day_csv_file = None
        temp_csv_files = get_all_files(logs_last_day_dir, log_last_day_prefix, '.csv')
        prev_fn = None
        y, m = int(yymm[:2]), int(yymm[2:])
        # NOTE(review): m == 1 would yield prev month '00'; safe only because
        # the January months are excluded above -- confirm if months change.
        prev_m = m - 1
        prev_yymm = '%02d%02d' % (y, prev_m)
        for temp_fn in temp_csv_files:
            if temp_fn.startswith('%s%s' % (log_last_day_prefix, prev_yymm)):
                prev_fn = temp_fn
                break
        # The previous month's last-day file must exist.
        assert prev_fn, yymm
        path_to_last_day_csv_file = '%s/%s' % (logs_last_day_dir, prev_fn)
        # if (time.time() - get_created_time(path_to_last_day_csv_file)) < HOUR1:
        #     return None
        veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not = \
            record_crossing_time(path_to_last_day_csv_file,
                                 veh_ap_crossing_time, veh_last_log_ap_or_not,
                                 veh_ns_crossing_time, veh_last_log_ns_or_not)
    path_to_csv_file = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm)
    veh_ap_crossing_time, _, veh_ns_crossing_time, _ = \
        record_crossing_time(path_to_csv_file,
                             veh_ap_crossing_time, veh_last_log_ap_or_not,
                             veh_ns_crossing_time, veh_last_log_ns_or_not)
    #
    save_pickle_file(ap_pkl_fpath, veh_ap_crossing_time)
    save_pickle_file(ns_pkl_fpath, veh_ns_crossing_time)
    print 'end the file; %s' % yymm
def process_files(yymm): print 'handle the file; %s' % yymm # for dn, fn_prefix, Y09, Y10, both in _package: target_file = Y09 if yymm.startswith('09') else Y10 with open('%s/%s%s.csv' % (dn, fn_prefix, yymm), 'rb') as r_csvfile: reader = csv.reader(r_csvfile) headers = reader.next() for row in reader: if not check_path_exist(both): with open(both, 'wt') as csvFile: writer = csv.writer(csvFile) writer.writerow(headers) with open(both, 'a') as csvFile: writer = csv.writer(csvFile) writer.writerow(row) # if not check_path_exist(target_file): with open(target_file, 'wt') as csvFile: writer = csv.writer(csvFile) writer.writerow(headers) with open(target_file, 'a') as csvFile: writer = csv.writer(csvFile) writer.writerow(row) print 'end the file; %s' % yymm
def process_file(yymm):
    # Derive per-trip queueing information for one month from the trip
    # records and the AP/NS crossing-time pickles produced earlier.
    ap_pkl_file_path = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix, yymm)
    ns_pkl_file_path = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix, yymm)
    # Both input pickles must exist before this month can be processed.
    if not (check_path_exist(ap_pkl_file_path) and check_path_exist(ns_pkl_file_path)):
        return None
    #
    # Load pickle files
    #
    ap_crossing_time, ns_crossing_time = load_pickle_file(ap_pkl_file_path), load_pickle_file(ns_pkl_file_path)
    #
    # Initiate csv files
    #
    ap_trip_fpath = '%s/%s%s.csv' % (ap_trips_dir, ap_trip_prefix, yymm)
    ns_trip_fpath = '%s/%s%s.csv' % (ns_trips_dir, ns_trip_prefix, yymm)
    if check_path_exist(ap_trip_fpath) and check_path_exist(ns_trip_fpath):
        return None
    print 'handle the file; %s' % yymm
    for fpath in [ap_trip_fpath, ns_trip_fpath]:
        with open(fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            # NOTE(review): 'queue—join-time' contains an em dash, not a
            # hyphen; any consumer must use the same spelling -- confirm
            # this is intentional.
            new_headers = ['tid', 'vid', 'did', 'start-time', 'end-time', 'duration', 'fare',
                           'prev-trip-end-time', 'trip-mode', 'queue—join-time', 'queueing-time']
            writer.writerow(new_headers)
    #
    with open('%s/%s%s.csv' % (trips_dpath, trip_prefix, yymm), 'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h : i for i, h in enumerate(headers)}
        for row in reader:
            tid, did = row[hid['tid']], row[hid['did']]
            et, duration = row[hid['end-time']], row[hid['duration']]
            fare = row[hid['fare']]
            #
            ap_tm, ns_tm = int(row[hid['ap-trip-mode']]), int(row[hid['ns-trip-mode']])
            vid, st, prev_tet = row[hid['vid']], eval(row[hid['start-time']]), eval(row[hid['prev-trip-end-time']])
            #
            # The same trip is scored twice: once against the AP queue, once
            # against the NS queue.
            for tm, crossing_time, fpath in [(ap_tm, ap_crossing_time, ap_trip_fpath),
                                             (ns_tm, ns_crossing_time, ns_trip_fpath)]:
                if tm == DIn_POut or tm == DOut_POut:
                    # Pick-up happened outside the queue area: no queue join.
                    continue
                if tm == DIn_PIn:
                    queue_join_time = prev_tet
                elif tm == DOut_PIn:
                    # Queue join time = last recorded crossing before the
                    # trip's start time (binary search over sorted times).
                    try:
                        i = bisect(crossing_time[vid], st)
                    except KeyError:
                        # No crossing record for this vehicle; log and skip.
                        print '%s-tid-%s' % (yymm, row[hid['tid']])
                        continue
                    queue_join_time = crossing_time[vid][i - 1] if i != 0 else crossing_time[vid][0]
                with open(fpath, 'a') as w_csvfile:
                    writer = csv.writer(w_csvfile, lineterminator='\n')
                    queueing_time = st - queue_join_time
                    # Clamp to the configured minimum queueing time.
                    if queueing_time < Q_LIMIT_MIN:
                        queueing_time = Q_LIMIT_MIN
                    new_row = [tid, vid, did, st, et, duration, fare,
                               prev_tet, tm, queue_join_time, queueing_time]
                    writer.writerow(new_row)
    print 'end the file; %s' % yymm
def process_files(yymm): print 'handle the file; %s' % yymm # for dn, fn_prefix, Y09, Y10, both in _package: target_file = Y09 if yymm.startswith('09') else Y10 with open('%s/%s%s.csv' % (dn, fn_prefix, yymm), 'rb') as r_csvfile: reader = csv.reader(r_csvfile) headers = reader.next() for row in reader: if not check_path_exist(both): with open(both, 'wt') as csvFile: writer = csv.writer(csvFile) writer.writerow(headers) with open(both, 'a') as csvFile: writer = csv.writer(csvFile) writer.writerow(row) # if not check_path_exist(target_file): with open(target_file, 'wt') as csvFile: writer = csv.writer(csvFile) writer.writerow(headers) with open(target_file, 'a') as csvFile: writer = csv.writer(csvFile) writer.writerow(row) print 'end the file; %s' % yymm
def process_files(yymm): print 'handle the file; %s' % yymm # ap_target_file = Y09_ap_trips if yymm.startswith('09') else Y10_ap_trips with open('%s/%s%s.csv' % (ap_ep_dir, ap_ep_prefix, yymm), 'rb') as r_csvfile: reader = csv.reader(r_csvfile) headers = reader.next() if not check_path_exist(ap_target_file): with open(ap_target_file, 'wt') as csvFile: writer = csv.writer(csvFile) writer.writerow(headers) with open(ap_target_file, 'a') as csvFile: writer = csv.writer(csvFile) for row in reader: writer.writerow(row) # ns_target_file = Y09_ns_trips if yymm.startswith('09') else Y10_ns_trips with open('%s/%s%s.csv' % (ns_ep_dir, ns_ep_prefix, yymm), 'rb') as r_csvfile: reader = csv.reader(r_csvfile) headers = reader.next() if not check_path_exist(ns_target_file): with open(ns_target_file, 'wt') as csvFile: writer = csv.writer(csvFile) writer.writerow(headers) with open(ns_target_file, 'a') as csvFile: writer = csv.writer(csvFile) for row in reader: writer.writerow(row) # print 'end the file; %s' % yymm
def process_file(yymm):
    # Build the AP/NS crossing-time pickles for one month (yymm, e.g. '0902').
    # Skipped entirely when both output pickles already exist.
    ap_pkl_fpath = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix, yymm)
    ns_pkl_fpath = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix, yymm)
    if check_path_exist(ap_pkl_fpath) and check_path_exist(ns_pkl_fpath):
        return None
    print 'handle the file; %s' % yymm
    veh_ap_crossing_time, veh_last_log_ap_or_not = {}, {}
    veh_ns_crossing_time, veh_last_log_ns_or_not = {}, {}
    if yymm not in ['0901', '1001', '1011']:
        # Seed state from the previous month's last-day log so crossings that
        # span a month boundary are not lost.  '0901', '1001' and '1011' have
        # no usable previous-month file and start from empty state.
        path_to_last_day_csv_file = None
        temp_csv_files = get_all_files(logs_last_day_dir, log_last_day_prefix, '.csv')
        prev_fn = None
        y, m = int(yymm[:2]), int(yymm[2:])
        # NOTE(review): m == 1 would yield prev month '00'; safe only because
        # the January months are excluded above -- confirm if months change.
        prev_m = m - 1
        prev_yymm = '%02d%02d' % (y, prev_m)
        for temp_fn in temp_csv_files:
            if temp_fn.startswith('%s%s' % (log_last_day_prefix, prev_yymm)):
                prev_fn = temp_fn
                break
        # The previous month's last-day file must exist.
        assert prev_fn, yymm
        path_to_last_day_csv_file = '%s/%s' % (logs_last_day_dir, prev_fn)
        # if (time.time() - get_created_time(path_to_last_day_csv_file)) < HOUR1:
        #     return None
        veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not = \
            record_crossing_time(path_to_last_day_csv_file,
                                 veh_ap_crossing_time, veh_last_log_ap_or_not,
                                 veh_ns_crossing_time, veh_last_log_ns_or_not)
    path_to_csv_file = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm)
    veh_ap_crossing_time, _, veh_ns_crossing_time, _ = \
        record_crossing_time(path_to_csv_file,
                             veh_ap_crossing_time, veh_last_log_ap_or_not,
                             veh_ns_crossing_time, veh_last_log_ns_or_not)
    #
    save_pickle_file(ap_pkl_fpath, veh_ap_crossing_time)
    save_pickle_file(ns_pkl_fpath, veh_ns_crossing_time)
    print 'end the file; %s' % yymm
def process_files(yymm): print 'handle the file; %s' % yymm # ap_target_file = Y09_ap_trips if yymm.startswith('09') else Y10_ap_trips with open('%s/%s%s.csv' % (ap_ep_dir, ap_ep_prefix, yymm), 'rb') as r_csvfile: reader = csv.reader(r_csvfile) headers = reader.next() if not check_path_exist(ap_target_file): with open(ap_target_file, 'wt') as csvFile: writer = csv.writer(csvFile) writer.writerow(headers) with open(ap_target_file, 'a') as csvFile: writer = csv.writer(csvFile) for row in reader: writer.writerow(row) # ns_target_file = Y09_ns_trips if yymm.startswith('09') else Y10_ns_trips with open('%s/%s%s.csv' % (ns_ep_dir, ns_ep_prefix, yymm), 'rb') as r_csvfile: reader = csv.reader(r_csvfile) headers = reader.next() if not check_path_exist(ns_target_file): with open(ns_target_file, 'wt') as csvFile: writer = csv.writer(csvFile) writer.writerow(headers) with open(ns_target_file, 'a') as csvFile: writer = csv.writer(csvFile) for row in reader: writer.writerow(row) # print 'end the file; %s' % yymm
def process_month(yymm):
    # Append a 'prevDrivers' value to each ss-trips record of one month: the
    # drivers previously seen at the record's zone.  Failures are dumped to a
    # per-month text file and re-raised.
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        ss_trips_fpath = '%s/%s%s.csv' % (ss_trips_dpath, ss_trips_prefix, yymm)
        if not check_path_exist(ss_trips_fpath):
            logger.info('The file X exists; %s' % yymm)
            return None
        prevDriversDefined_fpath = '%s/%s%s.csv' % (
            prevDriversDefined_dpath, prevDriversDefined_prefix, yymm)
        if check_path_exist(prevDriversDefined_fpath):
            # Output already produced for this month.
            logger.info('The processed; %s' % yymm)
            return None
        drivers = {}          # did -> ca_driver_withPrevDrivers instance
        zones = generate_zones()
        handling_day = 0      # last day for which progress was logged
        with open(prevDriversDefined_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                'did', 'timeFrame', 'zi', 'zj', 'time', 'day', 'month',
                'start-long', 'start-lat', 'distance', 'duration', 'fare',
                'spendingTime', 'prevDrivers'
            ])
            with open(ss_trips_fpath, 'rb') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    t = eval(row[hid['time']])
                    cur_dt = datetime.datetime.fromtimestamp(t)
                    if handling_day != cur_dt.day:
                        # Progress log once per day of data.
                        logger.info('Processing %s %dth day (month %d)' % (yymm, cur_dt.day, cur_dt.month))
                        handling_day = cur_dt.day
                    did = int(row[hid['did']])
                    zi, zj = int(row[hid['zi']]), int(row[hid['zj']])
                    try:
                        z = zones[(zi, zj)]
                    except KeyError:
                        # Position outside the zone grid; skip the record.
                        continue
                    if not drivers.has_key(did):
                        drivers[did] = ca_driver_withPrevDrivers(did)
                    prevDrivers = drivers[did].find_prevDriver(t, z)
                    writer.writerow(row + ['&'.join(map(str, prevDrivers))])
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def process_file(yymm): y, m = int('20' + yymm[:2]), int(yymm[2:]) # find the next month's first day if m == 12: next_y, next_m = y + 1, 1 else: next_y, next_m = y, m + 1 next_m_first_day = datetime.datetime(next_y, next_m, 1, 0) cur_m_last_day = next_m_first_day - datetime.timedelta(days=1) dd = '%02d' % cur_m_last_day.day ll_fpath = '%s/%s%s%s.csv' % (logs_last_day_dir, log_last_day_prefix, yymm, dd) if check_path_exist(ll_fpath): return None print 'handle the file; %s' % yymm # last_day_timestamp = time.mktime(cur_m_last_day.timetuple()) log_fpath = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm) # if (time.time() - get_created_time(log_fpath)) < HOUR1: # return None with open(log_fpath, 'rb') as r_csvfile: reader = csv.reader(r_csvfile) headers = reader.next() hid = {h: i for i, h in enumerate(headers)} with open(ll_fpath, 'wb') as w_csvfile: writer = csv.writer(w_csvfile, lineterminator='\n') writer.writerow(headers) for row in reader: t = eval(row[hid['time']]) if t <= last_day_timestamp: continue writer.writerow(row) print 'end the file; %s' % yymm
def process_file(yymm): fpath = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm) if check_path_exist(fpath): return None print 'handle the file; %s' % yymm yy, mm = yymm[:2], yymm[-2:] # with open('%s/20%s/%s/logs/logs-%s-normal.csv' % (taxi_home, yy, mm, yymm), 'rb') as r_csvfile: reader = csv.reader(r_csvfile) headers = reader.next() hid = {h: i for i, h in enumerate(headers)} with open(fpath, 'wt') as w_csvfile: writer = csv.writer(w_csvfile, lineterminator='\n') new_headers = ['time', 'vid', 'did', 'ap-or-not', 'ns-or-not'] writer.writerow(new_headers) # for row in reader: ap_or_not = ap_poly.is_including( (eval(row[hid['longitude']]), eval(row[hid['latitude']]))) np_or_not = ns_poly.is_including( (eval(row[hid['longitude']]), eval(row[hid['latitude']]))) new_row = [ row[hid['time']], row[hid['vehicle-id']], row[hid['driver-id']], ap_or_not, np_or_not ] writer.writerow(new_row) print 'end the file; %s' % yymm
def process_file(yymm): y, m = int('20' + yymm[:2]), int(yymm[2:]) # find the next month's first day if m == 12: next_y, next_m = y + 1, 1 else: next_y, next_m = y, m + 1 next_m_first_day = datetime.datetime(next_y, next_m, 1, 0) cur_m_last_day = next_m_first_day - datetime.timedelta(days=1) dd = '%02d' % cur_m_last_day.day ll_fpath = '%s/%s%s%s.csv' % (logs_last_day_dir, log_last_day_prefix, yymm, dd) if check_path_exist(ll_fpath): return None print 'handle the file; %s' % yymm # last_day_timestamp = time.mktime(cur_m_last_day.timetuple()) log_fpath = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm) # if (time.time() - get_created_time(log_fpath)) < HOUR1: # return None with open(log_fpath, 'rb') as r_csvfile: reader = csv.reader(r_csvfile) headers = reader.next() hid = {h: i for i, h in enumerate(headers)} with open(ll_fpath, 'wb') as w_csvfile: writer = csv.writer(w_csvfile, lineterminator='\n') writer.writerow(headers) for row in reader: t = eval(row[hid['time']]) if t <= last_day_timestamp: continue writer.writerow(row) print 'end the file; %s' % yymm
def get_driver_trajectory(did):
    # Return driver `did`'s trajectory as a list of (datetime, x, y, state)
    # tuples, assembled from the per-day CSV logs and cached as a pickle.
    ofpath = '%s%d.pkl' % (if_prefix, did)
    if check_path_exist(ofpath):
        dt_xy_state = load_pickle_file(ofpath)
    else:
        # Collect every date for which this driver has a log file
        # (file names look like <prefix><yymmdd>-<did>.csv).
        dates = []
        for fn in get_all_files(if_dpath, '%s*.csv' % if_prefix):
            _, _date, _did = fn[:-len('.csv')].split('-')
            if int(_did) != did:
                continue
            year = 2000 + int(_date[:2])
            month, day = map(int, [_date[2:4], _date[4:6]])
            dt = datetime.datetime(year, month, day)
            dates += [dt]
        dates.sort()
        dt_xy_state = []
        for dt in dates:
            yy = '%02d' % (dt.year - 2000)
            mm, dd = '%02d' % dt.month, '%02d' % dt.day
            yymmdd = yy + mm + dd
            ifpath = '%s/%s%s-%d.csv' % (if_dpath, if_prefix, yymmdd, did)
            with open(ifpath, 'rb') as logFile:
                reader = csv.reader(logFile)
                header = reader.next()
                # header: time,vehicle-id,driver-id,longitude,latitude,speed,state
                hid = {h: i for i, h in enumerate(header)}
                for row in reader:
                    # NOTE: `dt` (the outer loop variable) is rebound here;
                    # harmless because the for-loop resets it each iteration.
                    dt = datetime.datetime.fromtimestamp(eval(row[hid['time']]))
                    lon, lat = map(eval, [row[hid[cn]] for cn in ['longitude', 'latitude']])
                    x, y = GPS_xyDrawing.convert_GPS2xy(lon, lat)
                    dt_xy_state += [(dt, x, y, int(row[hid['state']]))]
        save_pickle_file(ofpath, dt_xy_state)
    return dt_xy_state
def log_location_labeling(yymm):
    # Label each raw log row of month `yymm` with AP/NS polygon membership;
    # any failure is dumped to a per-month text file and re-raised.
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        log_fpath = '%s/%s%s.csv' % (log_dpath, log_prefix, yymm)
        if check_path_exist(log_fpath):
            logger.info('The file had already been processed; %s' % log_fpath)
            return
        yy, mm = yymm[:2], yymm[-2:]
        #
        ap_poly, ns_poly = read_generate_polygon(ap_poly_fn), read_generate_polygon(ns_poly_fn)
        with open('%s/20%s/%s/logs/logs-%s-normal.csv' % (taxi_home, yy, mm, yymm), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            with open(log_fpath, 'wt') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                new_headers = ['time', 'vid', 'did', 'ap-or-not', 'ns-or-not']
                writer.writerow(new_headers)
                #
                for row in reader:
                    # Membership of the GPS point in each polygon.
                    ap_or_not = ap_poly.is_including((eval(row[hid['longitude']]), eval(row[hid['latitude']])))
                    np_or_not = ns_poly.is_including((eval(row[hid['longitude']]), eval(row[hid['latitude']])))
                    new_row = [row[hid['time']], row[hid['vehicle-id']], row[hid['driver-id']], ap_or_not, np_or_not]
                    writer.writerow(new_row)
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def InitUI(self):
    # One-time UI setup: translation offsets derived from the SG border,
    # paint binding, stock pen/font, optional background bitmap, and
    # grid/zone drawing state.
    self.SetDoubleBuffered(True)
    #
    self.sgBorder_xy = GPS_xyDrawing.get_sgBoarder_xy()
    min_x, min_y = float('inf'), float('inf')
    for bx, by in self.sgBorder_xy:
        min_x = min(min_x, bx)
        min_y = min(min_y, by)
    # Shift drawing so the border's minimum corner sits at (10, 10).
    self.translate_x, self.translate_y = -min_x + 10, -min_y + 10
    #
    self.Bind(wx.EVT_PAINT, self.OnPaint)
    # Prepare stock objects.
    self.default_pen = self.create_pen(wx.BLACK, 1)
    self.default_font = self.create_font(8, wx.SWISS, wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_NORMAL)
    if check_path_exist(bg_img_fpath):
        # Background image with its alpha channel scaled to 0.4.
        faded = wx.Image(bg_img_fpath).AdjustChannels(1.0, 1.0, 1.0, 0.4)
        bmp = wx.BitmapFromImage(faded)
        self.bg_bmp = (bmp, bmp.GetWidth(), bmp.GetHeight())
    else:
        self.bg_bmp = None
    self.sgGrid_xy = GPS_xyDrawing.get_sgGrid_xy()
    self.encountered_zones = set()
    self.marked_zone = None
def process_tripBased():
    # Build the yearly trip-based statistics file for 2009 and 2010 from the
    # per-month AP economic-profit files.
    for y in range(9, 11):
        yyyy = '20%02d' % y
        logger.info('handle the file; %s' % yyyy)
        #
        statistics_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ap_dpath,
                                            statisticsAllDriversTrip_ap_prefix, yyyy)
        if check_path_exist(statistics_fpath):
            logger.info('The file had already been processed; %s' % yyyy)
            # BUG FIX: was `return`, which aborted the whole loop and skipped
            # 2010 whenever the 2009 output already existed.
            continue
        yy = yyyy[2:]
        holidays = HOLIDAYS2009 if yyyy == '2009' else HOLIDAYS2010
        with open(statistics_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = ['year', 'month', 'day', 'hour', 'weekEnd', 'driverID',
                      'locQTime', 'locEP', 'locDuration', 'locFare',
                      'locProductivity', 'locIn']
            # One indicator column per (drop-off, pick-up) terminal pair.
            drop_pick_cns = []
            for l0 in locations:
                for l1 in locations:
                    cn = 'D%s#P%s' % (l0, l1)
                    drop_pick_cns.append(cn)
                    header.append(cn)
            writer.writerow(header)
            for fn in get_all_files(economicProfit_ap_dpath, '%s%s*' % (economicProfit_ap_prefix, yy)):
                with open('%s/%s' % (economicProfit_ap_dpath, fn), 'rt') as r_csvfile:
                    reader = csv.reader(r_csvfile)
                    headers = reader.next()
                    hid = {h: i for i, h in enumerate(headers)}
                    for row in reader:
                        year, month, day, hour = map(int, [row[hid[cn]] for cn in ['year', 'month', 'day', 'hour']])
                        did = int(row[hid['did']])
                        # Normalize units: seconds -> minutes, cents -> dollars.
                        locQTime = float(row[hid['queueingTime']]) / SEC60
                        locEP = float(row[hid['economicProfit']]) / CENT
                        locDuration = float(row[hid['duration']]) / SEC60
                        locFare = float(row[hid['fare']]) / CENT
                        locProductivity = (locFare / (locQTime + locDuration)) * SEC60
                        locIn = 1 if int(row[hid['tripMode']]) == DIn_PIn else 0
                        # Holidays are treated as weekend days.
                        weekEnd = 0
                        if (year, month, day) in holidays:
                            weekEnd = 1
                        if datetime.datetime(year, month, day).weekday() in WEEKENDS:
                            weekEnd = 1
                        l0, l1 = row[hid['prevEndTerminalAP']], row[hid['pickUpTerminalAP']]
                        drop_pick = 'D%s#P%s' % (l0, l1)
                        new_row = [year, month, day, hour, weekEnd, did,
                                   locQTime, locEP, locDuration, locFare,
                                   locProductivity, locIn]
                        # One-hot encode the drop/pick terminal pair.
                        for dp_candidate in drop_pick_cns:
                            new_row.append(1 if dp_candidate == drop_pick else 0)
                        writer.writerow(new_row)
def process_file(fpath):
    # Regress one driver's dependent variable (depVar) on each previous
    # driver's presence and record the statistically significant
    # relationships (CSV summary + pickle of coefficients).
    # BUG FIX: format_exc was referenced in the except-handler without being
    # imported, turning any failure into a NameError that masked the real
    # traceback (the sibling functions import it the same way).
    from traceback import format_exc
    logger.info('Start handling; %s' % fpath)
    _, _, _, _did1 = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        ofpath = '%s/%s%s-%s.csv' % (of_dpath, of_prefix, year, _did1)
        sig_fpath = '%s/%ssigRelation-%s-%s.pkl' % (of_dpath, of_prefix, year, _did1)
        if check_path_exist(ofpath):
            return None
        with open(ofpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = ['did', 'numObservations', 'numPrevDrivers', 'numSigRelationship',
                      'numPosCoef', 'numNegCoef', 'sigPosRelation', 'sigNegRelation']
            writer.writerow(header)
        #
        logger.info('Start loading; %s-%s' % (year, _did1))
        df = pd.read_csv(fpath)
        numObservations = len(df)
        did1_df = df.drop(['month', 'day', 'hour', 'zi', 'zj', 'did'], axis=1)
        # Drop the driver's own indicator column, if present.
        if _did1 in did1_df.columns:
            did1_df = did1_df.drop([_did1], axis=1)
        prevDrivers = [cn for cn in did1_df.columns if cn != depVar]
        numPrevDrivers = len(prevDrivers)
        #
        sigRelatioin = {k: [] for k in ['pos', 'neg']}
        for _did0 in prevDrivers:
            # Require a minimum co-occurrence rate before fitting.
            num_encouters = sum(did1_df[_did0])
            if num_encouters < numObservations * MIN_PICKUP_RATIO:
                continue
            y = did1_df[depVar]
            X = did1_df[[_did0]]
            X = sm.add_constant(X)
            res = sm.OLS(y, X, missing='drop').fit()
            pv = res.pvalues[_did0]
            coef = res.params[_did0]
            if pv < SIGINIFICANCE_LEVEL:
                if coef < 0:
                    sigRelatioin['neg'] += [(_did0, coef)]
                elif coef > 0:
                    sigRelatioin['pos'] += [(_did0, coef)]
        with open(ofpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_row = [_did1, numObservations, numPrevDrivers,
                       len(sigRelatioin['pos']) + len(sigRelatioin['neg']),
                       len(sigRelatioin['pos']), len(sigRelatioin['neg']),
                       '&'.join([_did0 for _did0, _ in sigRelatioin['pos']]),
                       '&'.join([_did0 for _did0, _ in sigRelatioin['neg']])]
            writer.writerow(new_row)
        save_pickle_file(sig_fpath, sigRelatioin)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, _did1)), 'w') as f:
            f.write(format_exc())
        raise
    logger.info('End handling; %s' % fpath)
def run():
    # Phase 1: build the per-trip (apQTime, apIn, did) sample file for 2010
    # full-time drivers, only if it does not exist yet.  Phase 2: per-driver
    # OLS of queueing time on airport pick-up, written to the sensitivity file.
    if not check_path_exist(ssd_apIn_fpath):
        with open(ssd_apIn_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            headers = ['apQTime', 'apIn', 'did']
            writer.writerow(headers)
            for m in xrange(1, 13):
                yymm = '10%02d' % m
                if yymm in ['1010']:
                    # NOTE(review): Oct 2010 is skipped -- presumably missing
                    # data; confirm the reason.
                    continue
                logger.info('Start handling; %s' % yymm)
                # NOTE(review): ft_drivers is a list, so the membership test
                # below is O(n) per row -- a set would be faster.
                ft_drivers = map(int, load_pickle_file('%s/%s%s.pkl' % (full_time_driver_dir, ft_drivers_prefix, yymm)))
                ap_ep_fpath = '%s/%s%s.csv' % (ap_ep_dir, ap_ep_prefix, yymm)
                with open(ap_ep_fpath, 'rb') as r_csvfile:
                    reader = csv.reader(r_csvfile)
                    headers = reader.next()
                    hid = {h: i for i, h in enumerate(headers)}
                    handling_day = 0
                    for row in reader:
                        did = int(row[hid['did']])
                        if did not in ft_drivers:
                            continue
                        t = eval(row[hid['start-time']])
                        cur_dt = datetime.datetime.fromtimestamp(t)
                        if handling_day != cur_dt.day:
                            # Progress log once per day of data.
                            logger.info('...ing; %s(%dth)' % (yymm, handling_day))
                            handling_day = cur_dt.day
                        apIn = 1 if int(row[hid['trip-mode']]) == DIn_PIn else 0
                        # Queueing time converted from seconds to minutes.
                        apQTime = eval(row[hid['queueing-time']]) / float(SEC60)
                        new_row = [apQTime, apIn, did]
                        writer.writerow(new_row)
    #
    df = pd.read_csv(ssd_apIn_fpath)
    # Drop 3-sigma outliers in queueing time.
    df = df[~(np.abs(df['apQTime'] - df['apQTime'].mean()) > (3 * df['apQTime'].std()))]
    minNumSample = 40
    with open(ssd_sensitivity_fpath, 'wb') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        headers = ['did', 'F_pValue', 'rSqure', 'rSqureAdj',
                   'coef_apIn', 'pValue_apIn', 'coef_const', 'pValue_const']
        writer.writerow(headers)
        for did in set(df['did']):
            did_df = df[(df['did'] == did)]
            # Require enough samples overall and enough non-airport trips.
            if len(did_df) < minNumSample:
                continue
            if len(did_df[(did_df['apIn'] == 0)]) < 4:
                continue
            y = did_df['apQTime']
            X = did_df['apIn']
            X = sm.add_constant(X)
            res = sm.OLS(y, X).fit()
            if np.isnan(res.f_pvalue):
                continue
            try:
                writer.writerow([did, res.f_pvalue, res.rsquared, res.rsquared_adj,
                                 res.params['apIn'], res.pvalues['apIn'],
                                 res.params['const'], res.pvalues['const']])
            except Exception as _:
                # Best-effort: skip drivers whose fit result lacks a key.
                pass
def get_sgBoarder_xy():
    # Singapore border in x/y drawing coordinates, cached in a local pickle.
    fpath = 'sgBorder_xy.pkl'
    if check_path_exist(fpath):
        return load_pickle_file(fpath)
    sgBorder_xy = [convert_GPS2xy(lon, lat) for lon, lat in sg_border]
    save_pickle_file(fpath, sgBorder_xy)
    return sgBorder_xy
def get_sgZones():
    # Singapore zones with x/y drawing coordinates attached, cached in a
    # local pickle.
    ofpath = 'sgZone.pkl'
    if not check_path_exist(ofpath):
        sgZones = get_sg_zones()
        for zone in sgZones.values():
            zone.cCoor_xy = convert_GPS2xy(*zone.cCoor_gps)
            zone.polyPoints_xy = [convert_GPS2xy(*gps) for gps in zone.polyPoints_gps]
            zone.marked = False
        save_pickle_file(ofpath, sgZones)
    else:
        sgZones = load_pickle_file(ofpath)
    return sgZones
def get_sgRoards_xy():
    # Road polylines converted to x/y drawing coordinates, cached in a
    # local pickle.
    ofpath = 'sgRoards_xy.pkl'
    if not check_path_exist(ofpath):
        sgRoards_xy = []
        for _, coords in get_SG_roads():
            sgRoards_xy.append([convert_GPS2xy(lon, lat) for lon, lat in coords])
        save_pickle_file(ofpath, sgRoards_xy)
        return sgRoards_xy
    return load_pickle_file(ofpath)
def run(time_from, time_to):
    # Pipeline driver: zone split, then log preprocessing, then trip
    # preprocessing.  Each step is skipped when its output already exists.
    #
    # Step 1. Split Singapore into zones.
    if check_path_exist(grid_info_fn):
        hl_points, vl_points, zones = load_pickle_file(grid_info_fn)
    else:
        from taxi_common.sg_grid_zone import run as run_split_into_zones  # @UnresolvedImport
        hl_points, vl_points, zones = run_split_into_zones(rp_zone)
    #
    # Step 2. Preprocess logs.
    if not check_path_exist(get_processed_log_fn(time_from, time_to)):
        from preprocess_logs import run as run_preprocess_logs
        run_preprocess_logs(hl_points, vl_points, time_from, time_to)
    #
    # Step 3. Preprocess trips.
    if not check_path_exist(get_processed_trip_fn(time_from, time_to)):
        from preprocess_trips import run as run_preprocess_trips
        run_preprocess_trips(hl_points, vl_points, time_from, time_to)
def run(time_from, time_to):
    # Pipeline driver: zone split, then log preprocessing, then trip
    # preprocessing.  Each step is skipped when its output already exists.
    #
    # Step 1. Split Singapore into zones.
    if check_path_exist(grid_info_fn):
        hl_points, vl_points, zones = load_pickle_file(grid_info_fn)
    else:
        from taxi_common.sg_grid_zone import run as run_split_into_zones  # @UnresolvedImport
        hl_points, vl_points, zones = run_split_into_zones(rp_zone)
    #
    # Step 2. Preprocess logs.
    if not check_path_exist(get_processed_log_fn(time_from, time_to)):
        from preprocess_logs import run as run_preprocess_logs
        run_preprocess_logs(hl_points, vl_points, time_from, time_to)
    #
    # Step 3. Preprocess trips.
    if not check_path_exist(get_processed_trip_fn(time_from, time_to)):
        from preprocess_trips import run as run_preprocess_trips
        run_preprocess_trips(hl_points, vl_points, time_from, time_to)
def process_month(yymm):
    # Append a 'prevDrivers' column to one month's zone-trip records: for
    # each record, the drivers previously seen at the record's zone.
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        ifpath = '%s/%s%s.csv' % (if_dpath, if_prefix, yymm)
        if not check_path_exist(ifpath):
            logger.info('The file X exists; %s' % yymm)
            return None
        ofpath = '%s/prevDrivers-%s%s.csv' % (if_dpath, if_prefix, yymm)
        # if check_path_exist(ofpath):
        #     logger.info('The processed; %s' % yymm)
        #     return None
        drivers = {}          # did -> ca_driver_withPrevDrivers instance
        zones = generate_zones()
        handling_day = 0      # last day for which progress was logged
        with open(ifpath, 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            header = reader.next()
            hid = {h: i for i, h in enumerate(header)}
            with open(ofpath, 'wt') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                new_header = header + ['prevDrivers']
                writer.writerow(new_header)
                for row in reader:
                    t = eval(row[hid['time']])
                    cur_dt = datetime.datetime.fromtimestamp(t)
                    if handling_day != cur_dt.day:
                        logger.info('Processing %s %dth day (month %d)' % (yymm, cur_dt.day, cur_dt.month))
                        handling_day = cur_dt.day
                    did = int(row[hid['did']])
                    zi, zj = int(row[hid['zi']]), int(row[hid['zj']])
                    try:
                        z = zones[(zi, zj)]
                    except KeyError:
                        # Position outside the zone grid; skip the record.
                        continue
                    # Idiom fix: `not in` replaces Py2-only dict.has_key().
                    if did not in drivers:
                        drivers[did] = ca_driver_withPrevDrivers(did)
                    # Removed debug leftover:
                    #   if did == 1 and t == eval('1233723600'): print 'hi'
                    prevDrivers = drivers[did].find_prevDriver(t, z)
                    writer.writerow(row + ['&'.join(map(str, prevDrivers))])
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def process_month(yymm):
    # Append a 'prevDrivers' column to one month's zone-trip records: for
    # each record, the drivers previously seen at the record's zone.
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        ifpath = '%s/%s%s.csv' % (if_dpath, if_prefix, yymm)
        if not check_path_exist(ifpath):
            logger.info('The file X exists; %s' % yymm)
            return None
        ofpath = '%s/prevDrivers-%s%s.csv' % (if_dpath, if_prefix, yymm)
        # if check_path_exist(ofpath):
        #     logger.info('The processed; %s' % yymm)
        #     return None
        drivers = {}          # did -> ca_driver_withPrevDrivers instance
        zones = generate_zones()
        handling_day = 0      # last day for which progress was logged
        with open(ifpath, 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            header = reader.next()
            hid = {h: i for i, h in enumerate(header)}
            with open(ofpath, 'wt') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                new_header = header + ['prevDrivers']
                writer.writerow(new_header)
                for row in reader:
                    t = eval(row[hid['time']])
                    cur_dt = datetime.datetime.fromtimestamp(t)
                    if handling_day != cur_dt.day:
                        logger.info('Processing %s %dth day (month %d)' % (yymm, cur_dt.day, cur_dt.month))
                        handling_day = cur_dt.day
                    did = int(row[hid['did']])
                    zi, zj = int(row[hid['zi']]), int(row[hid['zj']])
                    try:
                        z = zones[(zi, zj)]
                    except KeyError:
                        # Position outside the zone grid; skip the record.
                        continue
                    # Idiom fix: `not in` replaces Py2-only dict.has_key().
                    if did not in drivers:
                        drivers[did] = ca_driver_withPrevDrivers(did)
                    # Removed debug leftover:
                    #   if did == 1 and t == eval('1233723600'): print 'hi'
                    prevDrivers = drivers[did].find_prevDriver(t, z)
                    writer.writerow(row + ['&'.join(map(str, prevDrivers))])
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def get_sgGrid_xy():
    # Grid lines over Singapore in x/y drawing coordinates, cached in a local
    # pickle.  Each entry is a [(sx, sy), (ex, ey)] segment.
    ofpath = 'sgGrid_xy.pkl'
    if check_path_exist(ofpath):
        return load_pickle_file(ofpath)
    sgGrid_xy = []
    lons, lats = generate_sg_grid()
    for lon in lons:
        # Vertical segment spanning the full latitude range.
        sgGrid_xy.append([convert_GPS2xy(lon, lats[0]), convert_GPS2xy(lon, lats[-1])])
    for lat in lats:
        # Horizontal segment spanning the full longitude range.
        sgGrid_xy.append([convert_GPS2xy(lons[0], lat), convert_GPS2xy(lons[-1], lat)])
    save_pickle_file(ofpath, sgGrid_xy)
    return sgGrid_xy
def log_last_day(yymm):
    # Extract the last calendar day of month `yymm` from its log file into a
    # separate CSV; failures are dumped to a per-month text file and re-raised.
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        y, m = int('20' + yymm[:2]), int(yymm[2:])
        # Last day of this month = day before the next month's first day.
        if m == 12:
            next_y, next_m = y + 1, 1
        else:
            next_y, next_m = y, m + 1
        next_m_first_day = datetime.datetime(next_y, next_m, 1, 0)
        cur_m_last_day = next_m_first_day - datetime.timedelta(days=1)
        dd = '%02d' % cur_m_last_day.day
        ll_fpath = '%s/%s%s%s.csv' % (log_last_day_dpath, log_last_day_prefix, yymm, dd)
        if check_path_exist(ll_fpath):
            logger.info('The file had already been processed; %s' % ll_fpath)
            return
        #
        last_day_timestamp = time.mktime(cur_m_last_day.timetuple())
        log_fpath = '%s/%s%s.csv' % (log_dpath, log_prefix, yymm)
        with open(log_fpath, 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            with open(ll_fpath, 'wb') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                writer.writerow(headers)
                for row in reader:
                    t = eval(row[hid['time']])
                    # Keep only rows after midnight of the last day.
                    if t <= last_day_timestamp:
                        continue
                    writer.writerow(row)
        # CONSISTENCY FIX: use the module logger like the rest of this
        # function (was a bare `print` statement).
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def run():
    # Build one trajectory pickle per driver from the per-day log CSVs
    # (file names look like <prefix><yymmdd>-<did>.csv).
    drivers_dates = {}
    for fn in get_all_files(if_dpath, '%s*.csv' % if_prefix):
        _, _date, _did = fn[:-len('.csv')].split('-')
        year = 2000 + int(_date[:2])
        month, day = map(int, [_date[2:4], _date[4:6]])
        dt = datetime.datetime(year, month, day)
        k = int(_did)
        # Idiom fix: setdefault replaces the Py2-only dict.has_key() dance.
        drivers_dates.setdefault(k, []).append(dt)
    #
    for did, dates in drivers_dates.iteritems():
        ofpath = '%s%d.pkl' % (if_prefix, did)
        if check_path_exist(ofpath):
            continue
        dates.sort()
        dt_xy_state = []
        for dt in dates:
            yy = '%02d' % (dt.year - 2000)
            mm, dd = '%02d' % dt.month, '%02d' % dt.day
            yymmdd = yy + mm + dd
            ifpath = '%s/%s%s-%d.csv' % (if_dpath, if_prefix, yymmdd, did)
            with open(ifpath, 'rb') as logFile:
                reader = csv.reader(logFile)
                header = reader.next()
                # header: time,vehicle-id,driver-id,longitude,latitude,speed,state
                hid = {h: i for i, h in enumerate(header)}
                for row in reader:
                    dt = datetime.datetime.fromtimestamp(eval(row[hid['time']]))
                    lon, lat = map(eval, [row[hid[cn]] for cn in ['longitude', 'latitude']])
                    x, y = GPS_xyDrawing.convert_GPS2xy(lon, lat)
                    # BUG FIX: was `+= [dt, x, y, state]`, which appended four
                    # flat elements; the pickle's consumer
                    # (get_driver_trajectory) expects (dt, x, y, state) tuples.
                    dt_xy_state += [(dt, x, y, int(row[hid['state']]))]
        save_pickle_file(ofpath, dt_xy_state)
def process_file(yymm): fpath = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm) if check_path_exist(fpath): return None print 'handle the file; %s' % yymm yy, mm = yymm[:2], yymm[-2:] # with open('%s/20%s/%s/logs/logs-%s-normal.csv' % (taxi_home, yy, mm, yymm), 'rb') as r_csvfile: reader = csv.reader(r_csvfile) headers = reader.next() hid = {h: i for i, h in enumerate(headers)} with open(fpath, 'wt') as w_csvfile: writer = csv.writer(w_csvfile, lineterminator='\n') new_headers = ['time', 'vid', 'did', 'ap-or-not', 'ns-or-not'] writer.writerow(new_headers) # for row in reader: ap_or_not = ap_poly.is_including((eval(row[hid['longitude']]), eval(row[hid['latitude']]))) np_or_not = ns_poly.is_including((eval(row[hid['longitude']]), eval(row[hid['latitude']]))) new_row = [row[hid['time']], row[hid['vehicle-id']], row[hid['driver-id']], ap_or_not, np_or_not] writer.writerow(new_row) print 'end the file; %s' % yymm
def process_file(fpath):
    # For one driver (_did1), regress the dependent variable (depVar, module
    # global) on each previous driver's encounter dummy separately (simple OLS
    # per pair), and record which previous drivers have a statistically
    # significant positive/negative coefficient.
    logger.info('Start handling; %s' % fpath)
    _, _, _, _did1 = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        ofpath = '%s/%s%s-%s.csv' % (of_dpath, of_prefix, year, _did1)
        sig_fpath = '%s/%ssigRelation-%s-%s.pkl' % (of_dpath, of_prefix, year,
                                                    _did1)
        if check_path_exist(ofpath):
            return None  # already processed
        with open(ofpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = [
                'did', 'numObservations', 'numPrevDrivers',
                'numSigRelationship', 'numPosCoef', 'numNegCoef',
                'sigPosRelation', 'sigNegRelation'
            ]
            writer.writerow(header)
        #
        logger.info('Start loading; %s-%s' % (year, _did1))
        df = pd.read_csv(fpath)
        numObservations = len(df)
        # drop index-like columns; remaining columns are depVar + one dummy
        # column per previous driver
        did1_df = df.drop(['month', 'day', 'hour', 'zi', 'zj', 'did'], axis=1)
        if _did1 in did1_df.columns:
            # a driver cannot be his own "previous driver"
            did1_df = did1_df.drop([_did1], axis=1)
        prevDrivers = [cn for cn in did1_df.columns if cn != depVar]
        numPrevDrivers = len(prevDrivers)
        #
        # sic: 'sigRelatioin' typo is kept — it is consistent within the block
        sigRelatioin = {k: [] for k in ['pos', 'neg']}
        for _did0 in prevDrivers:
            num_encouters = sum(did1_df[_did0])
            # skip pairs with too few encounters to be meaningful
            if num_encouters < numObservations * MIN_PICKUP_RATIO:
                continue
            # simple regression: depVar ~ const + dummy(_did0)
            y = did1_df[depVar]
            X = did1_df[[_did0]]
            X = sm.add_constant(X)
            res = sm.OLS(y, X, missing='drop').fit()
            pv = res.pvalues[_did0]
            coef = res.params[_did0]
            if pv < SIGINIFICANCE_LEVEL:
                if coef < 0:
                    sigRelatioin['neg'] += [(_did0, coef)]
                elif coef > 0:
                    sigRelatioin['pos'] += [(_did0, coef)]
        # append the one summary row for this driver
        with open(ofpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_row = [
                _did1, numObservations, numPrevDrivers,
                len(sigRelatioin['pos']) + len(sigRelatioin['neg']),
                len(sigRelatioin['pos']),
                len(sigRelatioin['neg']),
                '&'.join([_did0 for _did0, _ in sigRelatioin['pos']]),
                '&'.join([_did0 for _did0, _ in sigRelatioin['neg']])
            ]
            writer.writerow(new_row)
        save_pickle_file(sig_fpath, sigRelatioin)
    except Exception as _:
        # dump the traceback to a per-input file, then re-raise so the
        # driver process still sees the failure
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, _did1)),
                  'w') as f:
            f.write(format_exc())
        raise
    logger.info('End handling; %s' % fpath)
def process_month(yymm):
    # Build the short-selected-drivers trip csv for one month: walk the trip
    # file and the log file in timestamp lockstep, maintain per-driver state
    # via driver objects, and emit one row per qualifying weekday-daytime trip
    # with its estimated queueing time.
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        yy, mm = yymm[:2], yymm[2:]
        trip_normal_fpath = '%s/20%s/%s/trips/trips-%s-normal.csv' % (
            taxi_home, yy, mm, yymm)
        trip_ext_fpath = '%s/20%s/%s/trips/trips-%s-normal-ext.csv' % (
            taxi_home, yy, mm, yymm)
        log_fpath = '%s/20%s/%s/logs/logs-%s-normal.csv' % (taxi_home, yy, mm,
                                                            yymm)
        if not check_path_exist(trip_normal_fpath):
            # NOTE(review): message says "exists" but this is the NOT-exists
            # branch — log text is misleading, behavior is a skip
            logger.info('The file X exists; %s' % yymm)
            return None
        ss_drivers_fpath = '%s/%s%s.pkl' % (ss_drivers_dpath,
                                            ss_drivers_prefix, yymm)
        if not check_path_exist(ss_drivers_fpath):
            logger.info('The file X exists; %s' % ss_drivers_fpath)
            return None
        ss_drivers = load_pickle_file(ss_drivers_fpath)
        x_points, y_points = get_sg_grid_xy_points()
        #
        ss_trips_fpath = '%s/%s%s.csv' % (ss_trips_dpath, ss_trips_prefix,
                                          yymm)
        if check_path_exist(ss_trips_fpath):
            logger.info('The file had already been processed; %s' % yymm)
            return None
        with open(ss_trips_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                'did', 'hour', 'zi', 'zj', 'time', 'day', 'month',
                'start-long', 'start-lat', 'distance', 'duration', 'fare',
                'queueingTime'
            ])
        with open(trip_normal_fpath, 'rb') as tripFileN:
            tripReaderN = csv.reader(tripFileN)
            tripHeaderN = tripReaderN.next()
            # {'trip-id': 0, 'job-id': 1, 'start-time': 2, 'end-time': 3,
            #  'start-long': 4, 'start-lat': 5, 'end-long': 6, 'end-lat': 7,
            #  'vehicle-id': 8, 'distance': 9, 'fare': 10, 'duration': 11,
            #  'start-dow': 12, 'start-day': 13, 'start-hour': 14, 'start-minute': 15,
            #  'end-dow': 16, 'end-day': 17, 'end-hour': 18, 'end-minute': 19}
            hidN = {h: i for i, h in enumerate(tripHeaderN)}
            with open(trip_ext_fpath, 'rb') as tripFileE:
                tripReaderE = csv.reader(tripFileE)
                tripHeaderE = tripReaderE.next()
                # {'start-zone': 0, 'end-zone': 1, 'start-postal': 2,
                #  'driver-id': 4, 'end-postal': 3}
                hidE = {h: i for i, h in enumerate(tripHeaderE)}
                with open(log_fpath, 'rb') as logFile:
                    logReader = csv.reader(logFile)
                    logHeader = logReader.next()
                    hidL = {h: i for i, h in enumerate(logHeader)}
                    handling_day = 0
                    drivers = {}
                    # ext rows are consumed in lockstep: assumed 1:1 with the
                    # normal trip rows
                    for rowN in tripReaderN:
                        rowE = tripReaderE.next()
                        didT = int(rowE[hidE['driver-id']])
                        if didT not in ss_drivers:
                            continue
                        tripTime = eval(rowN[hidN['start-time']])
                        cur_dtT = datetime.datetime.fromtimestamp(tripTime)
                        if handling_day != cur_dtT.day:
                            handling_day = cur_dtT.day
                            logger.info('Processing %s %dth day' %
                                        (yymm, cur_dtT.day))
                        # keep only weekday trips within [AM10, PM8)
                        if cur_dtT.weekday() in [FRI, SAT, SUN]:
                            continue
                        if cur_dtT.hour < AM10:
                            continue
                        if PM8 <= cur_dtT.hour:
                            continue
                        # advance the log reader up to this trip's start time,
                        # updating per-driver state along the way.
                        # NOTE(review): the `continue`s below skip the
                        # `tripTime <= logTime` break check for filtered rows,
                        # so filtered log rows never terminate the loop —
                        # relies on an eventually-qualifying row; a month's
                        # last trips may raise StopIteration (caught by the
                        # outer except) — confirm this is intended.
                        while True:
                            rowL = logReader.next()
                            logTime = eval(rowL[hidL['time']])
                            didL = int(rowL[hidL['driver-id']])
                            if didL not in ss_drivers:
                                continue
                            t = eval(rowL[hidL['time']])
                            cur_dtL = datetime.datetime.fromtimestamp(t)
                            if cur_dtL.weekday() in [FRI, SAT, SUN]:
                                continue
                            if cur_dtL.hour < AM10:
                                continue
                            if PM8 <= cur_dtL.hour:
                                continue
                            longitude, latitude = eval(
                                rowL[hidL['longitude']]), eval(
                                    rowL[hidL['latitude']])
                            # map GPS to grid cell; bisect gives the insertion
                            # point, minus one gives the cell index
                            zi, zj = bisect(x_points, longitude) - 1, bisect(
                                y_points, latitude) - 1
                            if zi < 0 or zj < 0:
                                continue
                            t, s = eval(rowL[hidL['time']]), eval(
                                rowL[hidL['state']])
                            z = (zi, zj)
                            cur_dt = datetime.datetime.fromtimestamp(t)
                            if handling_day != cur_dt.day:
                                handling_day = cur_dt.day
                                logger.info('Processing %s %dth day' %
                                            (yymm, cur_dt.day))
                            if not drivers.has_key(didL):
                                drivers[didL] = driver(didL, t, z, s)
                            else:
                                drivers[didL].update(t, z, s)
                            if tripTime <= logTime:
                                break
                        s_long, s_lat = eval(rowN[hidN['start-long']]), eval(
                            rowN[hidN['start-lat']])
                        zi, zj = bisect(x_points, s_long) - 1, bisect(
                            y_points, s_lat) - 1
                        if zi < 0 or zj < 0:
                            continue
                        if not drivers.has_key(didT):
                            continue
                        if drivers[didT].firstFreeStateTime == -1:
                            continue  # driver never observed in a free state yet
                        # queueing time = wait between entering the zone and
                        # picking up this trip
                        queueingTime = tripTime - drivers[didT].zoneEnteredTime
                        if queueingTime < 0:
                            continue
                        with open(ss_trips_fpath, 'a') as w_csvfile:
                            writer = csv.writer(w_csvfile, lineterminator='\n')
                            writer.writerow([
                                didT, cur_dtT.hour, zi, zj, tripTime,
                                cur_dtT.day,
                                cur_dtT.month, s_long, s_lat,
                                rowN[hidN['distance']],
                                rowN[hidN['duration']], rowN[hidN['fare']],
                                queueingTime
                            ])
    except Exception as _:
        # persist the traceback next to the script, then re-raise
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def run():
    """Build the 2009 baseline influence graph from the per-file count
    graphs, partition it into driver groups with Louvain community
    detection, and write per-group artifacts: a pickled subgraph, a summary
    csv row, a coefficient csv and a pdf plot per group, plus a pickle
    mapping group name -> driver list.
    """
    cg_dpath = dpaths['baseline', '2009', 'countGraph']
    cg_prefix = prefixs['baseline', '2009', 'countGraph']
    gp_dpath = dpaths['baseline', '2009', 'groupPartition']
    gp_prefix = prefixs['baseline', '2009', 'groupPartition']
    #
    check_dir_create(gp_dpath)
    #
    gp_summary_fpath = '%s/%ssummary.csv' % (gp_dpath, gp_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (gp_dpath, gp_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
    #
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow([
            'groupName', 'numDrivers', 'numRelations', 'graphComplexity',
            'tieStrength', 'contribution', 'benCon'
        ])
    #
    logger.info('Start handling SP_group_dpath')
    if not check_path_exist(gp_original_fpath):
        # merge all per-file count graphs into one (did0, did1) -> weight dict
        original_graph = {}
        for fn in get_all_files(cg_dpath, '%s*' % cg_prefix):
            count_graph = load_pickle_file('%s/%s' % (cg_dpath, fn))
            logger.info('Start handling; %s' % fn)
            numEdges = len(count_graph)
            # BUG FIX: guard against numEdges < 10, which made the modulo
            # divisor 0 (ZeroDivisionError on `i % 0`)
            moduloNumber = max(numEdges / 10, 1)
            for i, ((did0, did1), w) in enumerate(count_graph.iteritems()):
                if i % moduloNumber == 0:
                    logger.info('Handling; %.2f' % (i / float(numEdges)))
                original_graph[did0, did1] = w
        save_pickle_file(gp_original_fpath, original_graph)
    else:
        original_graph = load_pickle_file(gp_original_fpath)
    #
    # convert the edge dict into a directed igraph graph; did_igid maps
    # driver id -> igraph vertex id
    logger.info('igraph converting')
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    numEdges = len(original_graph)
    moduloNumber = max(numEdges / 10, 1)  # same guard as above
    for i, ((did0, did1), w) in enumerate(original_graph.iteritems()):
        if i % moduloNumber == 0:
            # BUG FIX: '%' binds tighter than '/', so the original
            # `'%.2f' % i / float(numEdges)` divided the *formatted string*
            # by a float -> TypeError; the ratio must be parenthesized.
            logger.info('Handling; %.2f' % (i / float(numEdges)))
        if not did_igid.has_key(did0):
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if not did_igid.has_key(did1):
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    #
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (gp_dpath, gp_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        # per-group statistics; all ratios are over the group's own
        # vertex/edge counts
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        # NOTE(review): a group with no internal edges would divide by zero
        # here — unchanged from the original; confirm Louvain groups always
        # have at least one edge.
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                gn, len(drivers), len(weights), graphComplexity,
                tie_strength, contribution, benCon
            ])
        gl_img_fpath = '%s/%simg-%s.pdf' % (gp_dpath, gp_prefix, gn)
        layout = sg.layout("kk")
        if len(drivers) < 100:
            # labels are only legible on small groups
            ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        else:
            ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (gp_dpath, gp_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
def process_file(yymm):
    # For one month, join the normal trip file with its ext file row-by-row,
    # classify each trip's airport (AP) and Night Safari (NS) "trip mode"
    # from where the vehicle's previous trip ended and where this one starts,
    # and write a full csv plus a filtered csv (daytime, no known error hours).
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        trip_fpath = '%s/%s%s.csv' % (trip_dpath, trip_prefix, yymm)
        trip_filtered_fpath = '%s/Filtered-%s%s.csv' % (trip_dpath,
                                                        trip_prefix, yymm)
        if check_path_exist(trip_fpath):
            logger.info('The file had already been processed; %s' %
                        trip_fpath)
            return
        # write the header row into both outputs
        for fpath in [trip_fpath, trip_filtered_fpath]:
            with open(fpath, 'wt') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                new_headers = [
                    'vid', 'did', 'startTime', 'endTime', 'duration', 'fare',
                    'tripModeAP', 'tripModeNS', 'prevTripEndTime', 'year',
                    'month', 'day', 'hour', 'pickUpTerminalAP',
                    'prevEndTerminalAP'
                ]
                writer.writerow(new_headers)
        yy, mm = yymm[:2], yymm[-2:]
        yyyy = str(2000 + int(yy))
        normal_file = taxi_home + '/%s/%s/trips/trips-%s-normal.csv' % (
            yyyy, mm, yymm)
        ext_file = taxi_home + '/%s/%s/trips/trips-%s-normal-ext.csv' % (
            yyyy, mm, yymm)
        #
        year, month = int(yyyy), int(mm)
        ap_polygons, ns_polygon = get_ap_polygons(), get_ns_polygon()
        # vid -> (prev end terminal, prev end in AP?, prev end in NS?, prev end time)
        vehicle_prev_trip_position_time = {}
        #
        with open(normal_file, 'rb') as r_csvfile1:
            reader1 = csv.reader(r_csvfile1)
            headers1 = reader1.next()
            # {'trip-id': 0, 'job-id': 1, 'start-time': 2, 'end-time': 3,
            #  'start-long': 4, 'start-lat': 5, 'end-long': 6, 'end-lat': 7,
            #  'vehicle-id': 8, 'distance': 9, 'fare': 10, 'duration': 11,
            #  'start-dow': 12, 'start-day': 13, 'start-hour': 14, 'start-minute': 15,
            #  'end-dow': 16, 'end-day': 17, 'end-hour': 18, 'end-minute': 19}
            hid1 = {h: i for i, h in enumerate(headers1)}
            with open(ext_file, 'rb') as r_csvfile2:
                reader2 = csv.reader(r_csvfile2)
                headers2 = reader2.next()
                # {'start-zone': 0, 'end-zone': 1, 'start-postal': 2,
                #  'driver-id': 4, 'end-postal': 3}
                hid2 = {h: i for i, h in enumerate(headers2)}
                for row1 in reader1:
                    # ext rows are assumed 1:1 with normal rows
                    row2 = reader2.next()
                    #
                    vid = row1[hid1['vehicle-id']]
                    st_ts, et_ts = row1[hid1['start-time']], row1[
                        hid1['end-time']]
                    dur, fare = row1[hid1['duration']], row1[hid1['fare']]
                    day, hour = int(row1[hid1['start-day']]), int(
                        row1[hid1['start-hour']])
                    s_long, s_lat = eval(row1[hid1['start-long']]), eval(
                        row1[hid1['start-lat']])
                    e_long, e_lat = eval(row1[hid1['end-long']]), eval(
                        row1[hid1['end-lat']])
                    # find which airport terminal polygon (if any) contains
                    # the trip's start / end points
                    c_sl_ap, c_el_ap = False, False
                    c_sl_ter, c_el_ter = 'X', 'X'
                    for ap_polygon in ap_polygons:
                        if not c_sl_ap:
                            res = ap_polygon.is_including((s_long, s_lat))
                            if res:
                                c_sl_ap = res
                                c_sl_ter = ap_polygon.name
                        if not c_el_ap:
                            res = ap_polygon.is_including((e_long, e_lat))
                            if res:
                                c_el_ap = res
                                c_el_ter = ap_polygon.name
                    c_sl_ns, c_el_ns = ns_polygon.is_including(
                        (s_long, s_lat)), ns_polygon.is_including(
                            (e_long, e_lat))
                    did = row2[hid2['driver-id']]
                    #
                    if not vehicle_prev_trip_position_time.has_key(vid):
                        # ASSUMPTION
                        # If this trip is the driver's first trip in a month,
                        # let's assume that the previous trip occurred at
                        # outside of the airport and Night safari, and also
                        # assume that the previous trip's end time is the
                        # current trip's start time.
                        # False means the trip occurred outside of the airport
                        # or Night safari.
                        vehicle_prev_trip_position_time[vid] = ('X', OUT, OUT,
                                                                st_ts)
                    pt_el_ter, pt_el_ap, pt_el_ns, pt_time = vehicle_prev_trip_position_time[
                        vid]
                    ap_trip_mode, ns_trip_mode = None, None
                    #
                    # trip mode encodes (prev Drop-off in/out, current Pick-up
                    # in/out) of the AP zone; IN/OUT are presumably True/False
                    # matching is_including's return — confirm.
                    if pt_el_ap == IN and c_sl_ap == IN:
                        ap_trip_mode = DIn_PIn
                    elif pt_el_ap == IN and c_sl_ap == OUT:
                        ap_trip_mode = DIn_POut
                    elif pt_el_ap == OUT and c_sl_ap == IN:
                        ap_trip_mode = DOut_PIn
                    elif pt_el_ap == OUT and c_sl_ap == OUT:
                        ap_trip_mode = DOut_POut
                    else:
                        assert False
                    #
                    # same encoding for the NS zone
                    if pt_el_ns == IN and c_sl_ns == IN:
                        ns_trip_mode = DIn_PIn
                    elif pt_el_ns == IN and c_sl_ns == OUT:
                        ns_trip_mode = DIn_POut
                    elif pt_el_ns == OUT and c_sl_ns == IN:
                        ns_trip_mode = DOut_PIn
                    elif pt_el_ns == OUT and c_sl_ns == OUT:
                        ns_trip_mode = DOut_POut
                    else:
                        assert False
                    #
                    vehicle_prev_trip_position_time[vid] = (c_el_ter, c_el_ap,
                                                            c_el_ns, et_ts)
                    #
                    with open(trip_fpath, 'a') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        new_row = [
                            vid, did, st_ts, et_ts, dur, fare, ap_trip_mode,
                            ns_trip_mode, pt_time, year, month, day, hour,
                            c_sl_ter, pt_el_ter
                        ]
                        writer.writerow(new_row)
                    #
                    # For filtered version
                    # Only consider trips whose start time is before 2 AM and
                    # after 6 AM
                    if AM2 <= hour and hour <= AM5:
                        continue
                    # also skip hours known to contain bad data (error_hours)
                    need2skip = False
                    for ys, ms, ds, hs in error_hours:
                        yyyy0 = 2000 + int(ys)
                        mm0, dd0, hh0 = map(int, [ms, ds, hs])
                        if (year == yyyy0) and (month == mm0) and (
                                day == dd0) and (hour == hh0):
                            need2skip = True
                    if need2skip:
                        continue
                    #
                    with open(trip_filtered_fpath, 'a') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        new_row = [
                            vid, did, st_ts, et_ts, dur, fare, ap_trip_mode,
                            ns_trip_mode, pt_time, year, month, day, hour,
                            c_sl_ter, pt_el_ter
                        ]
                        writer.writerow(new_row)
    except Exception as _:
        # persist the traceback next to the script, then re-raise
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def process_file(yymm):
    """Derive airport (AP) and Night Safari (NS) queueing times for one
    month (yymm): read the filtered trip csv, and for every trip whose
    pick-up happened inside a zone, compute how long the vehicle had been
    queueing (since the previous drop-off, or since the last recorded zone
    crossing), appending qualifying rows to the two queueingTime csvs.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        queueingTime_ap_fpath = '%s/%s%s.csv' % (queueingTime_ap_dpath,
                                                 queueingTime_ap_prefix, yymm)
        queueingTime_ns_fpath = '%s/%s%s.csv' % (queueingTime_ns_dpath,
                                                 queueingTime_ns_prefix, yymm)
        if check_path_exist(queueingTime_ap_fpath) and check_path_exist(queueingTime_ns_fpath):
            logger.info('The file had already been processed; %s' % yymm)
            return
        #
        logger.info('load pickle files; %s' % yymm)
        ap_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ap_dpath,
                                        crossingTime_ap_prefix, yymm)
        ns_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ns_dpath,
                                        crossingTime_ns_prefix, yymm)
        # vid -> sorted list of zone-entry timestamps (built upstream)
        crossingTime_ap, crossingTime_ns = load_pickle_file(ap_pkl_fpath), load_pickle_file(ns_pkl_fpath)
        #
        logger.info('initiate csv files; %s' % yymm)
        with open(queueingTime_ap_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = ['did', 'startTime', 'endTime', 'duration', 'fare',
                           'tripMode', 'queueJoinTime', 'queueingTime',
                           'year', 'month', 'day', 'hour',
                           'pickUpTerminalAP', 'prevEndTerminalAP']
            writer.writerow(new_headers)
        with open(queueingTime_ns_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = ['did', 'startTime', 'endTime', 'duration', 'fare',
                           'tripMode', 'queueJoinTime', 'queueingTime',
                           'year', 'month', 'day', 'hour']
            writer.writerow(new_headers)
        #
        logger.info('start recording; %s' % yymm)
        with open('%s/Filtered-%s%s.csv' % (trip_dpath, trip_prefix, yymm),
                  'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                did = row[hid['did']]
                et, duration = row[hid['endTime']], row[hid['duration']]
                fare = row[hid['fare']]
                year, month = row[hid['year']], row[hid['month']]
                day, hour = row[hid['day']], row[hid['hour']]
                pickUpTerminalAP, prevEndTerminalAP = row[hid['pickUpTerminalAP']], row[hid['prevEndTerminalAP']]
                #
                ap_tm, ns_tm = int(row[hid['tripModeAP']]), int(row[hid['tripModeNS']])
                vid, st, prev_tet = row[hid['vid']], eval(row[hid['startTime']]), eval(row[hid['prevTripEndTime']])
                #
                # Airport trip: only pick-up-inside-zone modes can have a
                # queueing time.
                # BUG FIX: the original condition was
                #   `ap_tm != DIn_POut or ap_tm != DOut_POut`
                # which is a tautology (a value cannot equal two distinct
                # constants at once), so the block always ran; `and` expresses
                # the intended "not a P-Out mode" filter. Output is unchanged
                # (P-Out modes left queueing_time as None), but the dead work
                # is skipped.
                if ap_tm != DIn_POut and ap_tm != DOut_POut:
                    queueing_time = None
                    if ap_tm == DIn_PIn:
                        # dropped off inside the zone: queue starts when the
                        # previous trip ended
                        queue_join_time = prev_tet
                        queueing_time = st - queue_join_time
                    elif ap_tm == DOut_PIn:
                        # entered from outside: queue starts at the last
                        # recorded crossing before the trip start
                        try:
                            i = bisect(crossingTime_ap[vid], st)
                            queue_join_time = crossingTime_ap[vid][i - 1] if i != 0 else crossingTime_ap[vid][0]
                            queueing_time = st - queue_join_time
                        except KeyError:
                            pass  # no crossing record for this vehicle
                    if queueing_time is not None and Q_LIMIT_MIN <= queueing_time:
                        new_row = [did, st, et, duration, fare, ap_tm,
                                   queue_join_time, queueing_time,
                                   year, month, day, hour,
                                   pickUpTerminalAP, prevEndTerminalAP]
                        append_record(queueingTime_ap_fpath, new_row)
                #
                # Night Safari trip: same logic against the NS crossing times
                # (same BUG FIX: `or` -> `and`).
                if ns_tm != DIn_POut and ns_tm != DOut_POut:
                    queueing_time = None
                    if ns_tm == DIn_PIn:
                        queue_join_time = prev_tet
                        queueing_time = st - queue_join_time
                    elif ns_tm == DOut_PIn:
                        try:
                            i = bisect(crossingTime_ns[vid], st)
                            queue_join_time = crossingTime_ns[vid][i - 1] if i != 0 else crossingTime_ns[vid][0]
                            queueing_time = st - queue_join_time
                        except KeyError:
                            pass
                    if queueing_time is not None and Q_LIMIT_MIN <= queueing_time:
                        new_row = [did, st, et, duration, fare, ns_tm,
                                   queue_join_time, queueing_time,
                                   year, month, day, hour]
                        append_record(queueingTime_ns_fpath, new_row)
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        # persist the traceback next to the script, then re-raise
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def process_files(yymm):
    # Aggregate one month's productive-duration, fare and queueing-time data
    # into hourly buckets spanning 2009-01-01 .. 2011-01-31, then write one
    # csv row per hour.
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        productivity_fpath = '%s/%s%s.csv' % (productivity_dpath,
                                              productivity_prefix, yymm)
        if check_path_exist(productivity_fpath):
            logger.info('Already handled; %s' % yymm)
            return
        # pre-create a zeroed stats list for EVERY hour in the full study
        # window; ALL_DUR..NS_NUM are index constants into that list
        begin_datetime = datetime.datetime(2009, 1, 1, 0)
        last_datetime = datetime.datetime(2011, 2, 1, 0)
        hourly_stats, time_period_order = {}, []
        while begin_datetime < last_datetime:
            year, month, day, hour = begin_datetime.year, begin_datetime.month, begin_datetime.day, begin_datetime.hour
            k = (year, month, day, hour)
            hourly_stats[k] = [
                0 for _ in range(
                    len([
                        ALL_DUR, ALL_FARE, ALL_NUM, AP_DUR, AP_FARE, AP_QUEUE,
                        AP_NUM, NS_DUR, NS_FARE, NS_QUEUE, NS_NUM
                    ]))
            ]
            time_period_order.append(k)
            begin_datetime += datetime.timedelta(hours=1)
        st_label, et_label, dur_label, fare_label = 'startTime', 'endTime', 'duration', 'fare'
        qt_label = 'queueingTime'
        #
        # 1) accumulate shift productive durations
        logger.info('Productive duration; %s' % yymm)
        with open(
                '%s/%s%s.csv' % (shiftProDur_dpath, shiftProDur_prefix, yymm),
                'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                year, month = int(row[hid['year']]), int(row[hid['month']])
                day, hour = int(row[hid['day']]), int(row[hid['hour']])
                hourly_stats[(year, month, day, hour)][ALL_DUR] += eval(row[
                    hid['pro-dur']]) * SEC60  # unit change; Minute -> Second
        #
        # 2) accumulate total fares over all filtered trips, proportionally
        #    split across the hours each trip spans
        logger.info('Total fare; %s' % yymm)
        with open('%s/Filtered-%s%s.csv' % (trip_dpath, trip_prefix, yymm),
                  'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                st_ts, et_ts = eval(row[hid[st_label]]), eval(
                    row[hid[et_label]])
                dur, fare = eval(row[hid[dur_label]]), eval(
                    row[hid[fare_label]])
                sum_prop_fare_dur(hourly_stats, st_ts, et_ts, dur, fare,
                                  ALL_FARE, ALL_NUM, None)
        #
        # 3) accumulate AP and NS fare/duration/queueing-time columns
        logger.info('Sum up fare, duration and queue time; %s' % yymm)
        for dir_path, file_prefix, id_DUR, id_FARE, id_QUEUE, id_NUM in [
            (queueingTime_ap_dpath, queueingTime_ap_prefix, AP_DUR, AP_FARE,
             AP_QUEUE, AP_NUM),
            (queueingTime_ns_dpath, queueingTime_ns_prefix, NS_DUR, NS_FARE,
             NS_QUEUE, NS_NUM)
        ]:
            with open('%s/%s%s.csv' % (dir_path, file_prefix, yymm),
                      'rb') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    st_ts, et_ts = eval(row[hid[st_label]]), eval(
                        row[hid[et_label]])
                    dur, fare = eval(row[hid[dur_label]]), eval(
                        row[hid[fare_label]])
                    qt = eval(row[hid[qt_label]])
                    #
                    sum_prop_fare_dur(hourly_stats, st_ts, et_ts, dur, fare,
                                      id_FARE, id_NUM, id_DUR)
                    sum_queueing_time(hourly_stats, st_ts, qt, id_QUEUE)
        #
        # 4) write the aggregated stats in chronological order
        logger.info('Generate .csv file; %s' % yymm)
        with open(productivity_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = [
                'year', 'month', 'day', 'hour', 'allDuration', 'allFare',
                'allNum', 'apDuration', 'apFare', 'apQueueingTime', 'apNum',
                'nsDuration', 'nsFare', 'nsQueueingTime', 'nsNum'
            ]
            writer.writerow(header)
            for year, month, day, hour in time_period_order:
                all_dur, all_fare, all_num, \
                ap_dur, ap_fare, ap_qt, ap_num, \
                ns_dur, ns_fare, ns_qt, ns_num = hourly_stats[(year, month, day, hour)]
                #
                writer.writerow([
                    year, month, day, hour, all_dur, all_fare, all_num,
                    ap_dur, ap_fare, ap_qt, ap_num, ns_dur, ns_fare, ns_qt,
                    ns_num
                ])
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        # persist the traceback next to the script, then re-raise
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def process_file(yymm):
    # Classify each trip's airport (AP) and Night Safari (NS) "trip mode"
    # from where the vehicle's previous trip ended and where this one starts,
    # writing a full csv plus a filtered csv.
    # NOTE(review): this is a near-duplicate of the earlier trip-mode builder
    # in this file — consider deduplicating into a shared helper.
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        trip_fpath = '%s/%s%s.csv' % (trip_dpath, trip_prefix, yymm)
        trip_filtered_fpath = '%s/Filtered-%s%s.csv' % (trip_dpath,
                                                        trip_prefix, yymm)
        if check_path_exist(trip_fpath):
            logger.info('The file had already been processed; %s' %
                        trip_fpath)
            return
        # write the header row into both outputs
        for fpath in [trip_fpath, trip_filtered_fpath]:
            with open(fpath, 'wt') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                new_headers = ['vid', 'did', 'startTime', 'endTime',
                               'duration', 'fare', 'tripModeAP', 'tripModeNS',
                               'prevTripEndTime', 'year', 'month', 'day',
                               'hour', 'pickUpTerminalAP',
                               'prevEndTerminalAP']
                writer.writerow(new_headers)
        yy, mm = yymm[:2], yymm[-2:]
        yyyy = str(2000 + int(yy))
        normal_file = taxi_home + '/%s/%s/trips/trips-%s-normal.csv' % (yyyy, mm, yymm)
        ext_file = taxi_home + '/%s/%s/trips/trips-%s-normal-ext.csv' % (yyyy, mm, yymm)
        #
        year, month = int(yyyy), int(mm)
        ap_polygons, ns_polygon = get_ap_polygons(), get_ns_polygon()
        # vid -> (prev end terminal, prev end in AP?, prev end in NS?, prev end time)
        vehicle_prev_trip_position_time = {}
        #
        with open(normal_file, 'rb') as r_csvfile1:
            reader1 = csv.reader(r_csvfile1)
            headers1 = reader1.next()
            # {'trip-id': 0, 'job-id': 1, 'start-time': 2, 'end-time': 3,
            #  'start-long': 4, 'start-lat': 5, 'end-long': 6, 'end-lat': 7,
            #  'vehicle-id': 8, 'distance': 9, 'fare': 10, 'duration': 11,
            #  'start-dow': 12, 'start-day': 13, 'start-hour': 14, 'start-minute': 15,
            #  'end-dow': 16, 'end-day': 17, 'end-hour': 18, 'end-minute': 19}
            hid1 = {h : i for i, h in enumerate(headers1)}
            with open(ext_file, 'rb') as r_csvfile2:
                reader2 = csv.reader(r_csvfile2)
                headers2 = reader2.next()
                # {'start-zone': 0, 'end-zone': 1, 'start-postal': 2,
                #  'driver-id': 4, 'end-postal': 3}
                hid2 = {h : i for i, h in enumerate(headers2)}
                for row1 in reader1:
                    # ext rows are assumed 1:1 with normal rows
                    row2 = reader2.next()
                    #
                    vid = row1[hid1['vehicle-id']]
                    st_ts, et_ts = row1[hid1['start-time']], row1[hid1['end-time']]
                    dur, fare = row1[hid1['duration']], row1[hid1['fare']]
                    day, hour = int(row1[hid1['start-day']]), int(row1[hid1['start-hour']])
                    s_long, s_lat = eval(row1[hid1['start-long']]), eval(row1[hid1['start-lat']])
                    e_long, e_lat = eval(row1[hid1['end-long']]), eval(row1[hid1['end-lat']])
                    # find which airport terminal polygon (if any) contains
                    # the trip's start / end points
                    c_sl_ap, c_el_ap = False, False
                    c_sl_ter, c_el_ter = 'X', 'X'
                    for ap_polygon in ap_polygons:
                        if not c_sl_ap:
                            res = ap_polygon.is_including((s_long, s_lat))
                            if res:
                                c_sl_ap = res
                                c_sl_ter = ap_polygon.name
                        if not c_el_ap:
                            res = ap_polygon.is_including((e_long, e_lat))
                            if res:
                                c_el_ap = res
                                c_el_ter = ap_polygon.name
                    c_sl_ns, c_el_ns = ns_polygon.is_including((s_long, s_lat)), ns_polygon.is_including((e_long, e_lat))
                    did = row2[hid2['driver-id']]
                    #
                    if not vehicle_prev_trip_position_time.has_key(vid):
                        # ASSUMPTION
                        # If this trip is the driver's first trip in a month,
                        # let's assume that the previous trip occurred at
                        # outside of the airport and Night safari, and also
                        # assume that the previous trip's end time is the
                        # current trip's start time.
                        # False means the trip occurred outside of the airport
                        # or Night safari.
                        vehicle_prev_trip_position_time[vid] = ('X', OUT, OUT, st_ts)
                    pt_el_ter, pt_el_ap, pt_el_ns, pt_time = vehicle_prev_trip_position_time[vid]
                    ap_trip_mode, ns_trip_mode = None, None
                    #
                    # mode encodes (prev Drop-off in/out, current Pick-up
                    # in/out) of the AP zone; IN/OUT presumably mirror the
                    # booleans returned by is_including — confirm.
                    if pt_el_ap == IN and c_sl_ap == IN:
                        ap_trip_mode = DIn_PIn
                    elif pt_el_ap == IN and c_sl_ap == OUT:
                        ap_trip_mode = DIn_POut
                    elif pt_el_ap == OUT and c_sl_ap == IN:
                        ap_trip_mode = DOut_PIn
                    elif pt_el_ap == OUT and c_sl_ap == OUT:
                        ap_trip_mode = DOut_POut
                    else:
                        assert False
                    #
                    # same encoding for the NS zone
                    if pt_el_ns == IN and c_sl_ns == IN:
                        ns_trip_mode = DIn_PIn
                    elif pt_el_ns == IN and c_sl_ns == OUT:
                        ns_trip_mode = DIn_POut
                    elif pt_el_ns == OUT and c_sl_ns == IN:
                        ns_trip_mode = DOut_PIn
                    elif pt_el_ns == OUT and c_sl_ns == OUT:
                        ns_trip_mode = DOut_POut
                    else:
                        assert False
                    #
                    vehicle_prev_trip_position_time[vid] = (c_el_ter, c_el_ap, c_el_ns, et_ts)
                    #
                    with open(trip_fpath, 'a') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        new_row = [vid, did, st_ts, et_ts, dur, fare,
                                   ap_trip_mode,
                                   ns_trip_mode, pt_time, year, month, day,
                                   hour, c_sl_ter, pt_el_ter]
                        writer.writerow(new_row)
                    #
                    # For filtered version
                    # Only consider trips whose start time is before 2 AM and
                    # after 6 AM
                    if AM2 <= hour and hour <= AM5:
                        continue
                    # also skip hours known to contain bad data (error_hours)
                    need2skip = False
                    for ys, ms, ds, hs in error_hours:
                        yyyy0 = 2000 + int(ys)
                        mm0, dd0, hh0 = map(int, [ms, ds, hs])
                        if (year == yyyy0) and (month == mm0) and (day == dd0) and (hour == hh0):
                            need2skip = True
                    if need2skip:
                        continue
                    #
                    with open(trip_filtered_fpath, 'a') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        new_row = [vid, did, st_ts, et_ts, dur, fare,
                                   ap_trip_mode, ns_trip_mode, pt_time, year,
                                   month, day, hour, c_sl_ter, pt_el_ter]
                        writer.writerow(new_row)
    except Exception as _:
        # persist the traceback next to the script, then re-raise
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def process_file(fpath):
    # For each driver in one reducer output file, regress spendingTime on the
    # encounter-dummy columns of previous drivers (multiple OLS), and record
    # edges (prev driver -> driver) whose coefficient is significant and
    # non-positive into SP_graph, then pickle the graph.
    logger.info('Start handling; %s' % fpath)
    _, year, reducerID = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        tm = 'spendingTime'
        st_graph_dpath = dpaths[tm, year, 'influenceGraph']
        st_graph_prefix = prefixs[tm, year, 'influenceGraph']
        SP_graph_fpath = '%s/%s%s.pkl' % (st_graph_dpath, st_graph_prefix,
                                          reducerID)
        if check_path_exist(SP_graph_fpath):
            return None  # already processed
        #
        logger.info('Start loading; %s-%s' % (year, reducerID))
        df = pd.read_csv(fpath)
        # NOTE(review): RP_graph is created but never populated or saved here
        SP_graph, RP_graph = {}, {}
        num_drivers = len(set(df['did']))
        for i, did1 in enumerate(set(df['did'])):
            if i % 10 == 0:
                logger.info('Doing regression %.2f; %s-%s' %
                            (i / float(num_drivers), year, reducerID))
            did1_df = df[(df['did'] == did1)].copy(deep=True)
            numObservations = len(did1_df)
            # minimum residual degrees of freedom to allow a fit
            minDFResiduals = numObservations * MIN_RATIO_RESIDUAL
            did1_df = did1_df.drop([
                'month', 'day', 'timeFrame', 'zi', 'zj', 'tfZ', 'did',
                'roamingTime'
            ],
                                   axis=1)
            if '%d' % did1 in did1_df.columns:
                # a driver cannot influence himself
                did1_df = did1_df.drop(['%d' % did1], axis=1)
            #
            # choose dummy columns with enough pickups; each retry raises the
            # threshold (num_iter) until the design matrix is small enough
            candi_dummies = []
            num_iter = 1
            while True:
                # NOTE: the loop variable i here shadows the outer enumerate
                # index; harmless since the outer i is rebound next iteration
                for i, vs in enumerate(zip(*did1_df.values)):
                    if did1_df.columns[i] == tm:
                        continue  # never use the dependent variable itself
                    if sum(vs) > numObservations * MIN_PICKUP_RATIO * num_iter:
                        candi_dummies.append(did1_df.columns[i])
                numIndepVariables = len(candi_dummies)
                if numIndepVariables == 0:
                    break
                if numObservations < numIndepVariables + minDFResiduals:
                    candi_dummies = []
                    num_iter += 1
                else:
                    break
            if not candi_dummies:
                continue  # no usable regressors for this driver
            y = did1_df[tm]
            X = did1_df[candi_dummies]
            X = sm.add_constant(X)
            SP_res = sm.OLS(y, X, missing='drop').fit()
            #
            # keep only models that are significant overall, then keep the
            # drivers whose individual coefficient is significant and NOT
            # positive (time-saving influence)
            if SP_res.f_pvalue < SIGINIFICANCE_LEVEL:
                significant_drivers = set()
                for _did0, pv in SP_res.pvalues.iteritems():
                    if _did0 == 'const':
                        continue
                    if pv < SIGINIFICANCE_LEVEL:
                        significant_drivers.add(_did0)
                positive_ef_drivers = set()
                for _did0, cof in SP_res.params.iteritems():
                    if _did0 == 'const':
                        continue
                    if cof > 0:
                        positive_ef_drivers.add(_did0)
                for _did0 in significant_drivers.difference(
                        positive_ef_drivers):
                    SP_graph[int(_did0), did1] = SP_res.params[_did0]
        #
        logger.info('Start pickling; %s-%s' % (year, reducerID))
        save_pickle_file(SP_graph_fpath, SP_graph)
    except Exception as _:
        # persist the traceback next to the script, then re-raise
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, reducerID)),
                  'w') as f:
            f.write(format_exc())
        raise
def process_file(fpath):
    # Variant of the influence-graph builder with the OLS fitting factored
    # into a local helper; only the spendingTime graph is produced (the
    # roamingTime branch is kept commented out below).
    def regression(dv, df):
        # Fit OLS of dv on the qualifying dummy columns of df; the other
        # time column is dropped so it never enters the design matrix.
        oc_dv = 'roamingTime' if dv == 'spendingTime' else 'spendingTime'
        rdf = df.copy(deep=True).drop([oc_dv], axis=1)
        # choose dummies with enough pickups; each retry raises the
        # threshold (num_iter) until the design matrix fits the sample size
        candi_dummies = []
        num_iter = 1
        while True:
            for i, vs in enumerate(zip(*rdf.values)):
                if rdf.columns[i] == dv:
                    continue  # never regress dv on itself
                if sum(vs) > len(rdf) * MIN_PICKUP_RATIO * num_iter:
                    candi_dummies.append(rdf.columns[i])
            if len(rdf) <= len(candi_dummies):
                candi_dummies = []
                num_iter += 1
            else:
                break
        y = rdf[dv]
        X = rdf[candi_dummies]
        X = sm.add_constant(X)
        return sm.OLS(y, X, missing='drop').fit()

    logger.info('Start handling; %s' % fpath)
    _, year, reducerID = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        st_graph_dpath = dpaths['spendingTime', year, 'influenceGraph']
        st_graph_prefix = prefixs['spendingTime', year, 'influenceGraph']
        SP_graph_fpath = '%s/%s%s.pkl' % (st_graph_dpath, st_graph_prefix,
                                          reducerID)
        rt_graph_dpath = dpaths['roamingTime', year, 'influenceGraph']
        rt_graph_prefix = prefixs['roamingTime', year, 'influenceGraph']
        RP_graph_fpath = '%s/%s%s.pkl' % (rt_graph_dpath, rt_graph_prefix,
                                          reducerID)
        if check_path_exist(SP_graph_fpath):
            return None  # already processed
        #
        logger.info('Start loading; %s-%s' % (year, reducerID))
        df = pd.read_csv(fpath)
        SP_graph, RP_graph = {}, {}
        num_drivers = len(set(df['did']))
        for i, did1 in enumerate(set(df['did'])):
            if i % 10 == 0:
                logger.info('Doing regression %.2f; %s-%s' %
                            (i / float(num_drivers), year, reducerID))
            did1_df = df[(df['did'] == did1)].copy(deep=True)
            did1_df = did1_df.drop(
                ['month', 'day', 'timeFrame', 'zi', 'zj', 'tfZ', 'did'],
                axis=1)
            if '%d' % did1 in did1_df.columns:
                # a driver cannot influence himself
                did1_df = did1_df.drop(['%d' % did1], axis=1)
            #
            SP_res = regression('spendingTime', did1_df)
            # keep only overall-significant models, then record edges for
            # drivers with a significant, non-positive coefficient
            if SP_res.f_pvalue < SIGINIFICANCE_LEVEL:
                significant_drivers = set()
                for _did0, pv in SP_res.pvalues.iteritems():
                    if _did0 == 'const':
                        continue
                    if pv < SIGINIFICANCE_LEVEL:
                        significant_drivers.add(_did0)
                positive_ef_drivers = set()
                for _did0, cof in SP_res.params.iteritems():
                    if _did0 == 'const':
                        continue
                    if cof > 0:
                        positive_ef_drivers.add(_did0)
                for _did0 in significant_drivers.difference(
                        positive_ef_drivers):
                    SP_graph[int(_did0), did1] = SP_res.params[_did0]
            #
            # roamingTime counterpart, currently disabled:
            # RP_res = regression('roamingTime', did1_df)
            # if RP_res.f_pvalue < SIGINIFICANCE_LEVEL:
            #     significant_drivers = set()
            #     for _did0, pv in RP_res.pvalues.iteritems():
            #         if _did0 == 'const':
            #             continue
            #         if pv < SIGINIFICANCE_LEVEL:
            #             significant_drivers.add(_did0)
            #     positive_ef_drivers = set()
            #     for _did0, cof in RP_res.params.iteritems():
            #         if _did0 == 'const':
            #             continue
            #         if cof > 0:
            #             positive_ef_drivers.add(_did0)
            #     for _did0 in significant_drivers.difference(positive_ef_drivers):
            #         RP_graph[int(_did0), did1] = RP_res.params[_did0]
        logger.info('Start pickling; %s-%s' % (year, reducerID))
        save_pickle_file(SP_graph_fpath, SP_graph)
        # save_pickle_file(RP_graph_fpath, RP_graph)
    except Exception as _:
        # persist the traceback next to the script, then re-raise
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, reducerID)),
                  'w') as f:
            f.write(format_exc())
        raise
def process_file(yymm):
    """Extract, for month `yymm` ('YYMM'), the timestamps at which each vehicle
    crossed INTO the airport (AP) and Night Safari (NS) zones, and pickle the
    two {vid: [crossing timestamps]} maps.

    The previous month's last-day log is replayed first (except for the months
    listed below) so that a vehicle already inside a zone at month start gets a
    correct first crossing time.
    """
    def record_crossing_time(path_to_csv_file, veh_ap_crossing_time, veh_last_log_ap_or_not,
                             veh_ns_crossing_time, veh_last_log_ns_or_not):
        # Scan one log CSV in time order, appending a crossing timestamp each
        # time a vehicle transitions OUT -> IN for either zone.  The four
        # dicts are mutated in place and also returned.
        with open(path_to_csv_file, 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                # NOTE(review): eval() on CSV fields — assumes the files are
                # trusted, locally generated data.
                t, vid = eval(row[hid['time']]), row[hid['vid']]
                ap_or_not, ns_or_not = eval(row[hid['ap-or-not']]), eval(row[hid['ns-or-not']])
                #
                if not veh_last_log_ap_or_not.has_key(vid):
                    if ap_or_not == IN:
                        # the first log's position was occurred in the AP zone
                        assert not veh_ap_crossing_time.has_key(vid)
                        veh_ap_crossing_time[vid] = [t]
                else:
                    assert veh_last_log_ap_or_not.has_key(vid)
                    if veh_last_log_ap_or_not[vid] == OUT and ap_or_not == IN:
                        # NOTE(review): setdefault(vid, [t]).append(t) stores t
                        # TWICE when vid is new (first OUT->IN with no prior
                        # entry) — presumably harmless for bisect lookups, but
                        # worth confirming.
                        veh_ap_crossing_time.setdefault(vid, [t]).append(t)
                #
                if not veh_last_log_ns_or_not.has_key(vid):
                    if ns_or_not == IN:
                        # the first log's position was occurred in the NS zone
                        assert not veh_ns_crossing_time.has_key(vid)
                        veh_ns_crossing_time[vid] = [t]
                else:
                    assert veh_last_log_ns_or_not.has_key(vid)
                    if veh_last_log_ns_or_not[vid] == OUT and ns_or_not == IN:
                        veh_ns_crossing_time.setdefault(vid, [t]).append(t)
                #
                # Remember this log's zone flags for the next transition test.
                veh_last_log_ap_or_not[vid] = ap_or_not
                veh_last_log_ns_or_not[vid] = ns_or_not
        return veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not
    #
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        ap_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ap_dpath, crossingTime_ap_prefix, yymm)
        ns_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ns_dpath, crossingTime_ns_prefix, yymm)
        if check_path_exist(ap_pkl_fpath) and check_path_exist(ns_pkl_fpath):
            # Both outputs already exist; nothing to do.
            return None
        print 'handle the file; %s' % yymm
        veh_ap_crossing_time, veh_last_log_ap_or_not = {}, {}
        veh_ns_crossing_time, veh_last_log_ns_or_not = {}, {}
        # Months with no usable previous-month last-day log are skipped here;
        # presumably '0901'/'1001' are January starts and '1011' has a data
        # gap — TODO confirm against the data set.
        if yymm not in ['0901', '1001', '1011']:
            y, m = int(yymm[:2]), int(yymm[2:])
            prev_m = m - 1
            prev_yymm = '%02d%02d' % (y, prev_m)
            # NOTE(review): for m == 1 this yields 'yy00'; January months are
            # expected to be excluded by the list above — verify '1101' too.
            prev_fn = get_all_files(log_last_day_dpath,
                                    '%s%s*.csv' % (log_last_day_prefix, prev_yymm))[0]
            path_to_last_day_csv_file = '%s/%s' % (log_last_day_dpath, prev_fn)
            # Seed the state dicts from the previous month's final day.
            veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not = \
                record_crossing_time(path_to_last_day_csv_file,
                                     veh_ap_crossing_time, veh_last_log_ap_or_not,
                                     veh_ns_crossing_time, veh_last_log_ns_or_not)
        path_to_csv_file = '%s/%s%s.csv' % (log_dpath, log_prefix, yymm)
        veh_ap_crossing_time, _, veh_ns_crossing_time, _ = \
            record_crossing_time(path_to_csv_file,
                                 veh_ap_crossing_time, veh_last_log_ap_or_not,
                                 veh_ns_crossing_time, veh_last_log_ns_or_not)
        #
        save_pickle_file(ap_pkl_fpath, veh_ap_crossing_time)
        save_pickle_file(ns_pkl_fpath, veh_ns_crossing_time)
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        # Persist the traceback for post-mortem, then re-raise.
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def process_month(yymm):
    """Produce the steady-state (ss) trips CSV for month `yymm` ('YYMM').

    Iterates the normal-trip file and its extension file in lockstep, replays
    the driver-log file up to each trip's start time to maintain per-driver
    zone state (via the `driver` class), and appends one row per qualifying
    trip with the driver's queueing time in the pickup zone.  Weekends
    (FRI/SAT/SUN) and hours outside [AM10, PM8) are excluded.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        yy, mm = yymm[:2], yymm[2:]
        trip_normal_fpath = '%s/20%s/%s/trips/trips-%s-normal.csv' % (taxi_home, yy, mm, yymm)
        trip_ext_fpath = '%s/20%s/%s/trips/trips-%s-normal-ext.csv' % (taxi_home, yy, mm, yymm)
        log_fpath = '%s/20%s/%s/logs/logs-%s-normal.csv' % (taxi_home, yy, mm, yymm)
        if not check_path_exist(trip_normal_fpath):
            logger.info('The file X exists; %s' % yymm)
            return None
        ss_drivers_fpath = '%s/%s%s.pkl' % (ss_drivers_dpath, ss_drivers_prefix, yymm)
        if not check_path_exist(ss_drivers_fpath):
            logger.info('The file X exists; %s' % ss_drivers_fpath)
            return None
        # Whitelist of steady-state driver ids for this month.
        ss_drivers = load_pickle_file(ss_drivers_fpath)
        # Grid boundaries used to map lon/lat to zone indices via bisect.
        x_points, y_points = get_sg_grid_xy_points()
        #
        ss_trips_fpath = '%s/%s%s.csv' % (ss_trips_dpath, ss_trips_prefix, yymm)
        if check_path_exist(ss_trips_fpath):
            logger.info('The file had already been processed; %s' % yymm)
            return None
        # Create the output file with its header row only; data rows are
        # appended one at a time inside the loop below.
        with open(ss_trips_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['did', 'hour', 'zi', 'zj',
                             'time', 'day', 'month',
                             'start-long', 'start-lat',
                             'distance', 'duration', 'fare', 'queueingTime'])
        with open(trip_normal_fpath, 'rb') as tripFileN:
            tripReaderN = csv.reader(tripFileN)
            tripHeaderN = tripReaderN.next()
            # {'trip-id': 0, 'job-id': 1, 'start-time': 2, 'end-time': 3,
            #  'start-long': 4, 'start-lat': 5, 'end-long': 6, 'end-lat': 7,
            #  'vehicle-id': 8, 'distance': 9, 'fare': 10, 'duration': 11,
            #  'start-dow': 12, 'start-day': 13, 'start-hour': 14, 'start-minute': 15,
            #  'end-dow': 16, 'end-day': 17, 'end-hour': 18, 'end-minute': 19}
            hidN = {h: i for i, h in enumerate(tripHeaderN)}
            with open(trip_ext_fpath, 'rb') as tripFileE:
                tripReaderE = csv.reader(tripFileE)
                tripHeaderE = tripReaderE.next()
                #
                # {'start-zone': 0, 'end-zone': 1, 'start-postal': 2, 'driver-id': 4, 'end-postal': 3}
                #
                hidE = {h: i for i, h in enumerate(tripHeaderE)}
                with open(log_fpath, 'rb') as logFile:
                    logReader = csv.reader(logFile)
                    logHeader = logReader.next()
                    hidL = {h: i for i, h in enumerate(logHeader)}
                    handling_day = 0
                    drivers = {}
                    # The ext file is read in lockstep with the normal file:
                    # row i of each describes the same trip.
                    for rowN in tripReaderN:
                        rowE = tripReaderE.next()
                        didT = int(rowE[hidE['driver-id']])
                        if didT not in ss_drivers:
                            continue
                        tripTime = eval(rowN[hidN['start-time']])
                        cur_dtT = datetime.datetime.fromtimestamp(tripTime)
                        if handling_day != cur_dtT.day:
                            handling_day = cur_dtT.day
                            logger.info('Processing %s %dth day' % (yymm, cur_dtT.day))
                        if cur_dtT.weekday() in [FRI, SAT, SUN]:
                            continue
                        if cur_dtT.hour < AM10:
                            continue
                        if PM8 <= cur_dtT.hour:
                            continue
                        # Advance the log reader until we reach the trip's
                        # start time, updating each driver's zone state.
                        # NOTE(review): the `continue`s below also skip the
                        # `tripTime <= logTime` break test, and logReader.next()
                        # will raise StopIteration at EOF — presumably the log
                        # file always extends past the last trip; confirm.
                        while True:
                            rowL = logReader.next()
                            logTime = eval(rowL[hidL['time']])
                            didL = int(rowL[hidL['driver-id']])
                            if didL not in ss_drivers:
                                continue
                            t = eval(rowL[hidL['time']])
                            cur_dtL = datetime.datetime.fromtimestamp(t)
                            if cur_dtL.weekday() in [FRI, SAT, SUN]:
                                continue
                            if cur_dtL.hour < AM10:
                                continue
                            if PM8 <= cur_dtL.hour:
                                continue
                            longitude, latitude = eval(rowL[hidL['longitude']]), eval(rowL[hidL['latitude']])
                            # bisect(...) - 1 maps a coordinate to its grid
                            # cell; negative means outside the grid.
                            zi, zj = bisect(x_points, longitude) - 1, bisect(y_points, latitude) - 1
                            if zi < 0 or zj < 0:
                                continue
                            t, s = eval(rowL[hidL['time']]), eval(rowL[hidL['state']])
                            z = (zi, zj)
                            cur_dt = datetime.datetime.fromtimestamp(t)
                            if handling_day != cur_dt.day:
                                handling_day = cur_dt.day
                                logger.info('Processing %s %dth day' % (yymm, cur_dt.day))
                            if not drivers.has_key(didL):
                                drivers[didL] = driver(didL, t, z, s)
                            else:
                                drivers[didL].update(t, z, s)
                            if tripTime <= logTime:
                                break
                        s_long, s_lat = eval(rowN[hidN['start-long']]), eval(rowN[hidN['start-lat']])
                        zi, zj = bisect(x_points, s_long) - 1, bisect(y_points, s_lat) - 1
                        if zi < 0 or zj < 0:
                            continue
                        if not drivers.has_key(didT):
                            continue
                        # Skip drivers never observed in a free state
                        # (sentinel -1 set by the `driver` class, presumably).
                        if drivers[didT].firstFreeStateTime == -1:
                            continue
                        queueingTime = tripTime - drivers[didT].zoneEnteredTime
                        if queueingTime < 0:
                            continue
                        with open(ss_trips_fpath, 'a') as w_csvfile:
                            writer = csv.writer(w_csvfile, lineterminator='\n')
                            writer.writerow([didT, cur_dtT.hour, zi, zj,
                                             tripTime, cur_dtT.day, cur_dtT.month,
                                             s_long, s_lat,
                                             rowN[hidN['distance']], rowN[hidN['duration']], rowN[hidN['fare']],
                                             queueingTime])
    except Exception as _:
        # Persist the traceback for post-mortem, then re-raise.
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def run():
    """Merge the 2009 baseline count graphs into one weighted digraph,
    partition it into driver groups with the Louvain method, and write per
    group: a pickled subgraph, a coefficient CSV, a plot, and a summary row.

    Side effects: creates the groupPartition directory, (re)writes the summary
    CSV header, and pickles the merged graph and the {groupName: drivers} map.
    """
    cg_dpath = dpaths['baseline', '2009', 'countGraph']
    cg_prefix = prefixs['baseline', '2009', 'countGraph']
    gp_dpath = dpaths['baseline', '2009', 'groupPartition']
    gp_prefix = prefixs['baseline', '2009', 'groupPartition']
    #
    check_dir_create(gp_dpath)
    #
    gp_summary_fpath = '%s/%ssummary.csv' % (gp_dpath, gp_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (gp_dpath, gp_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
    #
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['groupName', 'numDrivers', 'numRelations', 'graphComplexity',
                         'tieStrength', 'contribution', 'benCon'])
    #
    logger.info('Start handling SP_group_dpath')
    if not check_path_exist(gp_original_fpath):
        # Merge every reducer's count graph into one {(did0, did1): w} map.
        original_graph = {}
        for fn in get_all_files(cg_dpath, '%s*' % cg_prefix):
            count_graph = load_pickle_file('%s/%s' % (cg_dpath, fn))
            logger.info('Start handling; %s' % fn)
            numEdges = len(count_graph)
            # max(..., 1) prevents ZeroDivisionError on graphs with < 10 edges.
            moduloNumber = max(numEdges / 10, 1)
            for i, ((did0, did1), w) in enumerate(count_graph.iteritems()):
                if i % moduloNumber == 0:
                    logger.info('Handling; %.2f' % (i / float(numEdges)))
                original_graph[did0, did1] = w
        save_pickle_file(gp_original_fpath, original_graph)
    else:
        original_graph = load_pickle_file(gp_original_fpath)
    #
    logger.info('igraph converting')
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    numEdges = len(original_graph)
    moduloNumber = max(numEdges / 10, 1)  # guard against modulo-by-zero
    for i, ((did0, did1), w) in enumerate(original_graph.iteritems()):
        if i % moduloNumber == 0:
            # BUG FIX: was "'%.2f' % i / float(numEdges)", which formats the
            # string first and then divides a str by a float -> TypeError.
            logger.info('Handling; %.2f' % (i / float(numEdges)))
        # Assign each driver id a dense igraph vertex index on first sight.
        if not did_igid.has_key(did0):
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if not did_igid.has_key(did1):
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        # Louvain needs non-negative weights; coefficients may be negative.
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    #
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (gp_dpath, gp_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        # Robustness: a singleton group has no edges; avoid dividing by zero.
        contribution = sum(weights) / float(len(weights)) if weights else 0.0
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([gn, len(drivers), len(weights),
                             graphComplexity, tie_strength, contribution, benCon])
        gl_img_fpath = '%s/%simg-%s.pdf' % (gp_dpath, gp_prefix, gn)
        layout = sg.layout("kk")
        # Vertex labels clutter large plots; only draw them for small groups.
        if len(drivers) < 100:
            ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        else:
            ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (gp_dpath, gp_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
def process_file(yymm):
    """Compute per-trip queueing times for airport (AP) and Night Safari (NS)
    pickups in month `yymm` ('YYMM') and append them to the two queueingTime
    CSVs.

    Queue-join time is the previous trip's end time when the driver stayed in
    the zone (DIn_PIn), or the latest recorded zone-crossing time before the
    trip start (DOut_PIn).  Rows are kept only when queueing time is known and
    at least Q_LIMIT_MIN.  Exceptions are dumped to a .txt file and re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        queueingTime_ap_fpath = '%s/%s%s.csv' % (queueingTime_ap_dpath, queueingTime_ap_prefix, yymm)
        queueingTime_ns_fpath = '%s/%s%s.csv' % (queueingTime_ns_dpath, queueingTime_ns_prefix, yymm)
        if check_path_exist(queueingTime_ap_fpath) and check_path_exist(queueingTime_ns_fpath):
            logger.info('The file had already been processed; %s' % yymm)
            return
        #
        logger.info('load pickle files; %s' % yymm)
        ap_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ap_dpath, crossingTime_ap_prefix, yymm)
        ns_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ns_dpath, crossingTime_ns_prefix, yymm)
        # {vid: sorted [zone-entry timestamps]} maps produced upstream.
        crossingTime_ap, crossingTime_ns = load_pickle_file(ap_pkl_fpath), load_pickle_file(ns_pkl_fpath)
        #
        logger.info('initiate csv files; %s' % yymm)
        with open(queueingTime_ap_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = [
                'did', 'startTime', 'endTime', 'duration', 'fare',
                'tripMode', 'queueJoinTime', 'queueingTime',
                'year', 'month', 'day', 'hour',
                'pickUpTerminalAP', 'prevEndTerminalAP'
            ]
            writer.writerow(new_headers)
        with open(queueingTime_ns_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = [
                'did', 'startTime', 'endTime', 'duration', 'fare',
                'tripMode', 'queueJoinTime', 'queueingTime',
                'year', 'month', 'day', 'hour'
            ]
            writer.writerow(new_headers)
        #
        logger.info('start recording; %s' % yymm)
        with open('%s/Filtered-%s%s.csv' % (trip_dpath, trip_prefix, yymm), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                did = row[hid['did']]
                et, duration = row[hid['endTime']], row[hid['duration']]
                fare = row[hid['fare']]
                year, month = row[hid['year']], row[hid['month']]
                day, hour = row[hid['day']], row[hid['hour']]
                pickUpTerminalAP, prevEndTerminalAP = row[hid['pickUpTerminalAP']], row[hid['prevEndTerminalAP']]
                #
                ap_tm, ns_tm = int(row[hid['tripModeAP']]), int(row[hid['tripModeNS']])
                vid, st, prev_tet = row[hid['vid']], eval(row[hid['startTime']]), eval(row[hid['prevTripEndTime']])
                #
                # Airport trip: skip pure drop-off modes (passenger picked up
                # outside the zone).
                # BUG FIX: was "ap_tm != DIn_POut or ap_tm != DOut_POut",
                # which is a tautology (always true); the intent — mirrored by
                # the sibling trip-mode filter elsewhere in this module — is
                # to exclude BOTH *_POut modes, hence `and`.
                if ap_tm != DIn_POut and ap_tm != DOut_POut:
                    queueing_time = None
                    if ap_tm == DIn_PIn:
                        # Driver never left the zone: queued since last drop-off.
                        queue_join_time = prev_tet
                        queueing_time = st - queue_join_time
                    elif ap_tm == DOut_PIn:
                        try:
                            # Latest crossing before the trip start.
                            i = bisect(crossingTime_ap[vid], st)
                            queue_join_time = crossingTime_ap[vid][i - 1] if i != 0 else crossingTime_ap[vid][0]
                            queueing_time = st - queue_join_time
                        except KeyError:
                            # No crossing record for this vehicle; skip quietly.
                            pass
                    if queueing_time is not None and Q_LIMIT_MIN <= queueing_time:
                        new_row = [
                            did, st, et, duration, fare,
                            ap_tm, queue_join_time, queueing_time,
                            year, month, day, hour,
                            pickUpTerminalAP, prevEndTerminalAP
                        ]
                        append_record(queueingTime_ap_fpath, new_row)
                #
                # Night Safari trip: same logic with the NS crossing map.
                # BUG FIX: same `or` -> `and` tautology fix as above.
                if ns_tm != DIn_POut and ns_tm != DOut_POut:
                    queueing_time = None
                    if ns_tm == DIn_PIn:
                        queue_join_time = prev_tet
                        queueing_time = st - queue_join_time
                    elif ns_tm == DOut_PIn:
                        try:
                            i = bisect(crossingTime_ns[vid], st)
                            queue_join_time = crossingTime_ns[vid][i - 1] if i != 0 else crossingTime_ns[vid][0]
                            queueing_time = st - queue_join_time
                        except KeyError:
                            pass
                    if queueing_time is not None and Q_LIMIT_MIN <= queueing_time:
                        new_row = [
                            did, st, et, duration, fare,
                            ns_tm, queue_join_time, queueing_time,
                            year, month, day, hour
                        ]
                        append_record(queueingTime_ns_fpath, new_row)
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        # Persist the traceback for post-mortem, then re-raise.
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def process_files(yymm):
    """Aggregate month `yymm` ('YYMM') into hourly productivity statistics and
    write them as one CSV row per hour over the whole 2009-01 .. 2011-01 span.

    Sums productive shift duration, total fares/trip counts, and AP/NS fares,
    durations, trip counts and queueing times into `hourly_stats`, keyed by
    (year, month, day, hour).  Hours outside this month remain zero; the
    output therefore always covers the full fixed period.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        productivity_fpath = '%s/%s%s.csv' % (productivity_dpath, productivity_prefix, yymm)
        if check_path_exist(productivity_fpath):
            logger.info('Already handled; %s' % yymm)
            return
        # Pre-build a zeroed stats vector for every hour of the fixed period.
        begin_datetime = datetime.datetime(2009, 1, 1, 0)
        last_datetime = datetime.datetime(2011, 2, 1, 0)
        hourly_stats, time_period_order = {}, []
        while begin_datetime < last_datetime:
            year, month, day, hour = begin_datetime.year, begin_datetime.month, begin_datetime.day, begin_datetime.hour
            k = (year, month, day, hour)
            # One slot per statistic; the *_DUR/*_FARE/... constants are the
            # indices into this list.
            hourly_stats[k] = [0 for _ in range(len([ALL_DUR, ALL_FARE, ALL_NUM,
                                                     AP_DUR, AP_FARE, AP_QUEUE, AP_NUM,
                                                     NS_DUR, NS_FARE, NS_QUEUE, NS_NUM]))]
            time_period_order.append(k)
            begin_datetime += datetime.timedelta(hours=1)
        st_label, et_label, dur_label, fare_label = 'startTime', 'endTime', 'duration', 'fare'
        qt_label = 'queueingTime'
        #
        logger.info('Productive duration; %s' % yymm)
        with open('%s/%s%s.csv' % (shiftProDur_dpath, shiftProDur_prefix, yymm), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                year, month = int(row[hid['year']]), int(row[hid['month']])
                day, hour = int(row[hid['day']]), int(row[hid['hour']])
                hourly_stats[(year, month, day, hour)][ALL_DUR] += eval(row[hid['pro-dur']]) * SEC60  # unit change; Minute -> Second
        #
        logger.info('Total fare; %s' % yymm)
        with open('%s/Filtered-%s%s.csv' % (trip_dpath, trip_prefix, yymm), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                st_ts, et_ts = eval(row[hid[st_label]]), eval(row[hid[et_label]])
                dur, fare = eval(row[hid[dur_label]]), eval(row[hid[fare_label]])
                # None: trip duration is not accumulated for the ALL bucket
                # (it comes from the shift file above instead).
                sum_prop_fare_dur(hourly_stats, st_ts, et_ts, dur, fare, ALL_FARE, ALL_NUM, None)
        #
        logger.info('Sum up fare, duration and queue time; %s' % yymm)
        # Same accumulation for the AP and NS queueing-time files.
        for dir_path, file_prefix, id_DUR, id_FARE, id_QUEUE, id_NUM in [
                (queueingTime_ap_dpath, queueingTime_ap_prefix, AP_DUR, AP_FARE, AP_QUEUE, AP_NUM),
                (queueingTime_ns_dpath, queueingTime_ns_prefix, NS_DUR, NS_FARE, NS_QUEUE, NS_NUM)]:
            with open('%s/%s%s.csv' % (dir_path, file_prefix, yymm), 'rb') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                for row in reader:
                    st_ts, et_ts = eval(row[hid[st_label]]), eval(row[hid[et_label]])
                    dur, fare = eval(row[hid[dur_label]]), eval(row[hid[fare_label]])
                    qt = eval(row[hid[qt_label]])
                    #
                    sum_prop_fare_dur(hourly_stats, st_ts, et_ts, dur, fare, id_FARE, id_NUM, id_DUR)
                    sum_queueing_time(hourly_stats, st_ts, qt, id_QUEUE)
        #
        logger.info('Generate .csv file; %s' % yymm)
        with open(productivity_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = ['year', 'month', 'day', 'hour',
                      'allDuration', 'allFare', 'allNum',
                      'apDuration', 'apFare', 'apQueueingTime', 'apNum',
                      'nsDuration', 'nsFare', 'nsQueueingTime', 'nsNum']
            writer.writerow(header)
            for year, month, day, hour in time_period_order:
                all_dur, all_fare, all_num, \
                ap_dur, ap_fare, ap_qt, ap_num, \
                ns_dur, ns_fare, ns_qt, ns_num = hourly_stats[(year, month, day, hour)]
                #
                writer.writerow([year, month, day, hour,
                                 all_dur, all_fare, all_num,
                                 ap_dur, ap_fare, ap_qt, ap_num,
                                 ns_dur, ns_fare, ns_qt, ns_num])
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        # Persist the traceback for post-mortem, then re-raise.
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def process_file(fpath):
    """Build the spendingTime influence graph for one reducer output file.

    Refined variant of the regression pipeline: dummy-column selection is done
    inline with a degrees-of-freedom check (MIN_RATIO_RESIDUAL) instead of the
    simple row-count check, and drivers with no usable regressors are skipped
    rather than fitted on an empty design matrix.
    """
    logger.info('Start handling; %s' % fpath)
    # File name convention assumed: '<something>-<year>-<reducerID>.csv'
    _, year, reducerID = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        tm = 'spendingTime'
        st_graph_dpath = dpaths[tm, year, 'influenceGraph']
        st_graph_prefix = prefixs[tm, year, 'influenceGraph']
        SP_graph_fpath = '%s/%s%s.pkl' % (st_graph_dpath, st_graph_prefix, reducerID)
        if check_path_exist(SP_graph_fpath):
            # Already processed; skip.
            return None
        #
        logger.info('Start loading; %s-%s' % (year, reducerID))
        df = pd.read_csv(fpath)
        # NOTE(review): RP_graph is never written to in this variant.
        SP_graph, RP_graph = {}, {}
        num_drivers = len(set(df['did']))
        for i, did1 in enumerate(set(df['did'])):
            if i % 10 == 0:
                logger.info('Doing regression %.2f; %s-%s' % (i / float(num_drivers), year, reducerID))
            did1_df = df[(df['did'] == did1)].copy(deep=True)
            numObservations = len(did1_df)
            # Minimum residual degrees of freedom required for the fit.
            minDFResiduals = numObservations * MIN_RATIO_RESIDUAL
            # Drop bookkeeping columns, the other candidate DV, and the
            # driver's own dummy column.
            did1_df = did1_df.drop(['month', 'day', 'timeFrame', 'zi', 'zj', 'tfZ', 'did',
                                    'roamingTime'], axis=1)
            if '%d' % did1 in did1_df.columns:
                did1_df = did1_df.drop(['%d' % did1], axis=1)
            #
            candi_dummies = []
            num_iter = 1
            while True:
                # Keep dummy columns active in enough rows; the threshold
                # grows with num_iter until enough residual DF remain.
                for i, vs in enumerate(zip(*did1_df.values)):
                    if did1_df.columns[i] == tm:
                        continue
                    if sum(vs) > numObservations * MIN_PICKUP_RATIO * num_iter:
                        candi_dummies.append(did1_df.columns[i])
                numIndepVariables = len(candi_dummies)
                if numIndepVariables == 0:
                    break
                if numObservations < numIndepVariables + minDFResiduals:
                    # Too many regressors; retry with a stricter threshold.
                    candi_dummies = []
                    num_iter += 1
                else:
                    break
            if not candi_dummies:
                # No usable regressors for this driver.
                continue
            y = did1_df[tm]
            X = did1_df[candi_dummies]
            X = sm.add_constant(X)
            # missing='drop': rows with NaN in y or X are excluded.
            SP_res = sm.OLS(y, X, missing='drop').fit()
            #
            if SP_res.f_pvalue < SIGINIFICANCE_LEVEL:
                # Only consider the model if it is jointly significant.
                significant_drivers = set()
                for _did0, pv in SP_res.pvalues.iteritems():
                    if _did0 == 'const':
                        continue
                    if pv < SIGINIFICANCE_LEVEL:
                        significant_drivers.add(_did0)
                positive_ef_drivers = set()
                for _did0, cof in SP_res.params.iteritems():
                    if _did0 == 'const':
                        continue
                    if cof > 0:
                        positive_ef_drivers.add(_did0)
                # Edge kept only for significant drivers with a non-positive
                # coefficient (presence reduces spending time).
                for _did0 in significant_drivers.difference(positive_ef_drivers):
                    SP_graph[int(_did0), did1] = SP_res.params[_did0]
        #
        logger.info('Start pickling; %s-%s' % (year, reducerID))
        save_pickle_file(SP_graph_fpath, SP_graph)
    except Exception as _:
        # Persist the traceback for post-mortem, then re-raise.
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, reducerID)), 'w') as f:
            f.write(format_exc())
        raise
def process_file(yymm):
    """Write per-trip queueing-time rows for airport (AP) and Night Safari (NS)
    pickups in month `yymm` ('YYMM'), one output CSV per zone.

    Older variant of the queueing-time builder: requires both crossing-time
    pickles to exist, clamps queueing time up to Q_LIMIT_MIN, and appends each
    row by reopening the output file.
    """
    ap_pkl_file_path = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix, yymm)
    ns_pkl_file_path = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix, yymm)
    if not (check_path_exist(ap_pkl_file_path) and check_path_exist(ns_pkl_file_path)):
        # Upstream crossing-time data missing; nothing to do.
        return None
    #
    # Load pickle files
    #
    # {vid: sorted [zone-entry timestamps]} maps produced upstream.
    ap_crossing_time, ns_crossing_time = load_pickle_file(ap_pkl_file_path), load_pickle_file(ns_pkl_file_path)
    #
    # Initiate csv files
    #
    ap_trip_fpath = '%s/%s%s.csv' % (ap_trips_dir, ap_trip_prefix, yymm)
    ns_trip_fpath = '%s/%s%s.csv' % (ns_trips_dir, ns_trip_prefix, yymm)
    if check_path_exist(ap_trip_fpath) and check_path_exist(ns_trip_fpath):
        # Both outputs already exist; skip.
        return None
    print 'handle the file; %s' % yymm
    for fpath in [ap_trip_fpath, ns_trip_fpath]:
        with open(fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            # NOTE(review): 'queue—join-time' contains an em dash (U+2014),
            # unlike the hyphenated names around it — looks like a typo, but
            # downstream readers may depend on the exact header; confirm
            # before changing.
            new_headers = [
                'tid', 'vid', 'did',
                'start-time', 'end-time', 'duration', 'fare',
                'prev-trip-end-time', 'trip-mode',
                'queue—join-time', 'queueing-time'
            ]
            writer.writerow(new_headers)
    #
    with open('%s/%s%s.csv' % (trips_dpath, trip_prefix, yymm), 'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h: i for i, h in enumerate(headers)}
        for row in reader:
            tid, did = row[hid['tid']], row[hid['did']]
            et, duration = row[hid['end-time']], row[hid['duration']]
            fare = row[hid['fare']]
            #
            ap_tm, ns_tm = int(row[hid['ap-trip-mode']]), int(row[hid['ns-trip-mode']])
            vid, st, prev_tet = row[hid['vid']], eval(row[hid['start-time']]), eval(row[hid['prev-trip-end-time']])
            #
            for tm, crossing_time, fpath in [
                    (ap_tm, ap_crossing_time, ap_trip_fpath),
                    (ns_tm, ns_crossing_time, ns_trip_fpath)]:
                # Skip pure drop-off modes (passenger picked up outside zone).
                if tm == DIn_POut or tm == DOut_POut:
                    continue
                if tm == DIn_PIn:
                    # Driver never left the zone: queued since last drop-off.
                    queue_join_time = prev_tet
                elif tm == DOut_PIn:
                    try:
                        # Latest zone crossing before the trip start.
                        i = bisect(crossing_time[vid], st)
                    except KeyError:
                        print '%s-tid-%s' % (yymm, row[hid['tid']])
                        continue
                    queue_join_time = crossing_time[vid][i - 1] if i != 0 else crossing_time[vid][0]
                with open(fpath, 'a') as w_csvfile:
                    writer = csv.writer(w_csvfile, lineterminator='\n')
                    queueing_time = st - queue_join_time
                    # Clamp UP to the floor value rather than dropping the
                    # row (unlike the newer variant, which filters instead).
                    if queueing_time < Q_LIMIT_MIN:
                        queueing_time = Q_LIMIT_MIN
                    new_row = [
                        tid, vid, did,
                        st, et, duration, fare,
                        prev_tet, tm,
                        queue_join_time, queueing_time
                    ]
                    writer.writerow(new_row)
    print 'end the file; %s' % yymm
def process_files(yymm): productivity_fpath = '%s/%s%s.csv' % (productivity_dir, productivity_prefix, yymm) if check_path_exist(productivity_fpath): return None print 'handle the file; %s' % yymm begin_datetime = datetime.datetime(2009, 1, 1, 0) last_datetime = datetime.datetime(2011, 2, 1, 0) hourly_stats, time_period_order = {}, [] while begin_datetime < last_datetime: yyyy, mm, dd, hh = begin_datetime.year, begin_datetime.month, begin_datetime.day, begin_datetime.hour k = (yyyy, mm, dd, hh) hourly_stats[k] = [ 0 for _ in range( len([ ALL_DUR, ALL_FARE, ALL_NUM, AP_DUR, AP_FARE, AP_QUEUE, AP_NUM, NS_DUR, NS_FARE, NS_QUEUE, NS_NUM ])) ] time_period_order.append(k) begin_datetime += datetime.timedelta(hours=1) # st_label, et_label, dur_label, fare_label = 'start-time', 'end-time', 'duration', 'fare' qt_label = 'queueing-time' # Productive duration print yymm, 'Productive duration' yyyy, mm = 2000 + int(yymm[:2]), int(yymm[2:]) with open('%s/%s%s.csv' % (shift_pro_dur_dir, shift_pro_dur_prefix, yymm), 'rb') as r_csvfile: reader = csv.reader(r_csvfile) headers = reader.next() hid = {h: i for i, h in enumerate(headers)} for row in reader: dd, hh = eval(row[hid['dd']]), eval(row[hid['hh']]) hourly_stats[(yyyy, mm, dd, hh)][ALL_DUR] += eval( row[hid['pro-dur']]) * SEC60 # unit change; Minute -> Second # Total fare print yymm, 'Total fare' with open('%s/%s%s.csv' % (trips_dpath, trip_prefix, yymm), 'rb') as r_csvfile: reader = csv.reader(r_csvfile) headers = reader.next() hid = {h: i for i, h in enumerate(headers)} for row in reader: st_ts, et_ts = eval(row[hid[st_label]]), eval(row[hid[et_label]]) dur, fare = eval(row[hid[dur_label]]), eval(row[hid[fare_label]]) sum_prop_fare_dur(hourly_stats, st_ts, et_ts, dur, fare, ALL_FARE, ALL_NUM, None) # Sum up fare, duration and queue time print yymm, 'Sum up fare, duration and queue time' for dir_path, file_prefix, id_DUR, id_FARE, id_QUEUE, id_NUM in [ (ap_trips_dir, ap_trip_prefix, AP_DUR, AP_FARE, AP_QUEUE, AP_NUM), 
(ns_trips_dir, ns_trip_prefix, NS_DUR, NS_FARE, NS_QUEUE, NS_NUM) ]: with open('%s/%s%s.csv' % (dir_path, file_prefix, yymm), 'rb') as r_csvfile: reader = csv.reader(r_csvfile) headers = reader.next() hid = {h: i for i, h in enumerate(headers)} for row in reader: st_ts, et_ts = eval(row[hid[st_label]]), eval( row[hid[et_label]]) dur, fare = eval(row[hid[dur_label]]), eval( row[hid[fare_label]]) qt = eval(row[hid[qt_label]]) # sum_prop_fare_dur(hourly_stats, st_ts, et_ts, dur, fare, id_FARE, id_NUM, id_DUR) sum_queueing_time(hourly_stats, st_ts, qt, id_QUEUE) # Generate .csv file print yymm, 'Generate .csv file' with open(productivity_fpath, 'wb') as w_csvfile: writer = csv.writer(w_csvfile, lineterminator='\n') header = [ 'yy', 'mm', 'dd', 'hh', 'all-duration', 'all-fare', 'all-num', 'ap-duration', 'ap-fare', 'ap-queueing-time', 'ap-num', 'ns-duration', 'ns-fare', 'ns-queueing-time', 'ns-num' ] writer.writerow(header) for yyyy, mm, dd, hh in time_period_order: all_dur, all_fare, all_num, \ ap_dur, ap_fare, ap_qt, ap_num, \ ns_dur, ns_fare, ns_qt, ns_num = hourly_stats[(yyyy, mm, dd, hh)] # writer.writerow([ yyyy - 2000, mm, dd, hh, all_dur, all_fare, all_num, ap_dur, ap_fare, ap_qt, ap_num, ns_dur, ns_fare, ns_qt, ns_num ]) print 'end the file; %s' % yymm