def gen_summary():
    """Write a CSV comparing each driver's 2009 vs 2010 intellect coefficients.

    Only drivers present in BOTH years, with numeric (non-'X') coefficients
    that are strictly negative in both years, are written.
    """
    driverIntellect2009 = load_pickle_file(
        '%s/%s%s.pkl' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, '2009'))
    driverIntellect2010 = load_pickle_file(
        '%s/%s%s.pkl' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, '2010'))
    # Drivers appearing in both years; the year-only sets are computed but unused below.
    driverSet2009, driverSet2010 = set(driverIntellect2009.keys()), set(driverIntellect2010.keys())
    driverSetBoth = driverSet2009.intersection(driverSet2010)
    onlySet2009 = driverSet2009.difference(driverSetBoth)
    onlySet2010 = driverSet2010.difference(driverSetBoth)
    #
    # fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, 'all')
    fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, 'all-negativeOnly')
    with open(fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = ['did', 'numY2009', 'numY2010', 'coefY2009', 'coefY2010', 'coefDiff']
        writer.writerow(header)
        for did in driverSetBoth:
            num2009, coef2009 = driverIntellect2009[did]
            num2010, coef2010 = driverIntellect2010[did]
            # if coef2009 == 'X' or coef2010 == 'X':
            #     writer.writerow([did, num2009, num2010, coef2009, coef2010, 'X'])
            # else:
            #     writer.writerow([did, num2009, num2010, coef2009, coef2010, coef2009 - coef2010])
            if coef2009 == 'X' or coef2010 == 'X':
                continue  # 'X' marks a driver with no usable coefficient
            if coef2009 < 0 and coef2010 < 0:
                writer.writerow([did, num2009, num2010, coef2009, coef2010, coef2009 - coef2010])
def process_file(yymm):
    # Build per-trip queueing-time CSVs (airport and night-safari) for one
    # month, using precomputed vehicle crossing times to estimate when a
    # driver joined the queue.
    ap_pkl_file_path = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix, yymm)
    ns_pkl_file_path = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix, yymm)
    if not (check_path_exist(ap_pkl_file_path) and check_path_exist(ns_pkl_file_path)):
        return None  # inputs missing for this month
    #
    # Load pickle files
    #
    ap_crossing_time, ns_crossing_time = load_pickle_file(ap_pkl_file_path), load_pickle_file(ns_pkl_file_path)
    #
    # Initiate csv files
    #
    ap_trip_fpath = '%s/%s%s.csv' % (ap_trips_dir, ap_trip_prefix, yymm)
    ns_trip_fpath = '%s/%s%s.csv' % (ns_trips_dir, ns_trip_prefix, yymm)
    if check_path_exist(ap_trip_fpath) and check_path_exist(ns_trip_fpath):
        return None  # already processed
    print 'handle the file; %s' % yymm
    for fpath in [ap_trip_fpath, ns_trip_fpath]:
        with open(fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            # NOTE(review): 'queue—join-time' contains an em-dash (U+2014), not a
            # hyphen — looks like a typo; confirm downstream readers before fixing.
            new_headers = ['tid', 'vid', 'did', 'start-time', 'end-time', 'duration', 'fare',
                           'prev-trip-end-time', 'trip-mode', 'queue—join-time', 'queueing-time']
            writer.writerow(new_headers)
    #
    with open('%s/%s%s.csv' % (trips_dpath, trip_prefix, yymm), 'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h : i for i, h in enumerate(headers)}
        for row in reader:
            tid, did = row[hid['tid']], row[hid['did']]
            et, duration = row[hid['end-time']], row[hid['duration']]
            fare = row[hid['fare']]
            #
            ap_tm, ns_tm = int(row[hid['ap-trip-mode']]), int(row[hid['ns-trip-mode']])
            # NOTE(review): eval() parses numeric CSV fields; unsafe on untrusted input.
            vid, st, prev_tet = row[hid['vid']], eval(row[hid['start-time']]), eval(row[hid['prev-trip-end-time']])
            #
            for tm, crossing_time, fpath in [(ap_tm, ap_crossing_time, ap_trip_fpath),
                                             (ns_tm, ns_crossing_time, ns_trip_fpath)]:
                if tm == DIn_POut or tm == DOut_POut:
                    continue  # passenger picked up outside the location: no queueing
                if tm == DIn_PIn:
                    # Driver was already inside: queue started when the previous trip ended.
                    queue_join_time = prev_tet
                elif tm == DOut_PIn:
                    try:
                        # Last recorded crossing before the trip start.
                        i = bisect(crossing_time[vid], st)
                    except KeyError:
                        print '%s-tid-%s' % (yymm, row[hid['tid']])
                        continue  # no crossing record for this vehicle
                    queue_join_time = crossing_time[vid][i - 1] if i != 0 else crossing_time[vid][0]
                with open(fpath, 'a') as w_csvfile:
                    writer = csv.writer(w_csvfile, lineterminator='\n')
                    queueing_time = st - queue_join_time
                    # Clamp to the configured minimum (also hides negative values
                    # when the fallback crossing time is after the trip start).
                    if queueing_time < Q_LIMIT_MIN:
                        queueing_time = Q_LIMIT_MIN
                    new_row = [tid, vid, did, st, et, duration, fare, prev_tet, tm, queue_join_time, queueing_time]
                    writer.writerow(new_row)
    print 'end the file; %s' % yymm
def run():
    """Trace each ss-driver's group membership across 2009-2012 into one CSV.

    A driver with no group partition entry for a year is marked 'X'.
    """
    yearDriver_gn = {}          # (year, did) -> group number
    whole_ss_drivers = set()    # union of ss drivers seen in any month
    tm = 'spendingTime'
    for year in ['2009', '2010', '2011', '2012']:
        gp_dpath = dpaths[tm, year, 'groupPartition']
        gp_prefix = prefixs[tm, year, 'groupPartition']
        gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
        gp_drivers = load_pickle_file(gp_drivers_fpath)
        for gn, drivers in gp_drivers.iteritems():
            for did in drivers:
                yearDriver_gn[year, did] = gn
        yy = year[2:]
        for fn in get_all_files(ss_drivers_dpath, '%s%s*.pkl' % (ss_drivers_prefix, yy)):
            ss_drivers_fpath = '%s/%s' % (ss_drivers_dpath, fn)
            ss_drivers = load_pickle_file(ss_drivers_fpath)
            for did in ss_drivers:
                whole_ss_drivers.add(did)
    with open(groupEvolution_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = ['did', 'G2009', 'G2010', 'G2011', 'G2012']
        writer.writerow(header)
        for did in whole_ss_drivers:
            new_row = [did]
            for year in ['2009', '2010', '2011', '2012']:
                k = (year, did)
                if yearDriver_gn.has_key(k):
                    gn = yearDriver_gn[k]
                else:
                    gn = 'X'
                new_row += [gn]
            writer.writerow(new_row)
def run():
    """Build a per-driver group-evolution CSV over years 2009-2012.

    Writes one row per ss-driver with their group number per year ('X' when
    the driver has no partition entry that year).
    """
    yearDriver_gn = {}          # maps (year, did) to the driver's group number
    whole_ss_drivers = set()
    tm = 'spendingTime'
    for year in ['2009', '2010', '2011', '2012']:
        gp_dpath = dpaths[tm, year, 'groupPartition']
        gp_prefix = prefixs[tm, year, 'groupPartition']
        gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
        gp_drivers = load_pickle_file(gp_drivers_fpath)
        for gn, drivers in gp_drivers.iteritems():
            for did in drivers:
                yearDriver_gn[year, did] = gn
        yy = year[2:]
        # Collect every ss driver appearing in any monthly pickle of this year.
        for fn in get_all_files(ss_drivers_dpath, '%s%s*.pkl' % (ss_drivers_prefix, yy)):
            ss_drivers_fpath = '%s/%s' % (ss_drivers_dpath, fn)
            ss_drivers = load_pickle_file(ss_drivers_fpath)
            for did in ss_drivers:
                whole_ss_drivers.add(did)
    with open(groupEvolution_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = ['did', 'G2009', 'G2010', 'G2011', 'G2012']
        writer.writerow(header)
        for did in whole_ss_drivers:
            new_row = [did]
            for year in ['2009', '2010', '2011', '2012']:
                k = (year, did)
                if yearDriver_gn.has_key(k):
                    gn = yearDriver_gn[k]
                else:
                    gn = 'X'
                new_row += [gn]
            writer.writerow(new_row)
def get_driver_trajectory(did):
    """Return [(datetime, x, y, state), ...] for one driver, pickle-cached.

    On a cache miss, scans this driver's per-day log CSVs in date order and
    converts each GPS fix to drawing xy coordinates.
    """
    ofpath = '%s%d.pkl' % (if_prefix, did)
    if check_path_exist(ofpath):
        dt_xy_state = load_pickle_file(ofpath)
    else:
        # Collect the dates for which this driver has a log file.
        dates = []
        for fn in get_all_files(if_dpath, '%s*.csv' % if_prefix):
            _, _date, _did = fn[:-len('.csv')].split('-')
            if int(_did) != did:
                continue
            year = 2000 + int(_date[:2])
            month, day = map(int, [_date[2:4], _date[4:6]])
            dt = datetime.datetime(year, month, day)
            dates += [dt]
        dates.sort()
        dt_xy_state = []
        for dt in dates:
            yy = '%02d' % (dt.year - 2000)
            mm, dd = '%02d' % dt.month, '%02d' % dt.day
            yymmdd = yy + mm + dd
            ifpath = '%s/%s%s-%d.csv' % (if_dpath, if_prefix, yymmdd, did)
            with open(ifpath, 'rb') as logFile:
                reader = csv.reader(logFile)
                header = reader.next()
                # header: time,vehicle-id,driver-id,longitude,latitude,speed,state
                hid = {h: i for i, h in enumerate(header)}
                for row in reader:
                    # NOTE(review): eval() on CSV fields parses numerics but is
                    # unsafe on untrusted input; float() would be safer.
                    dt = datetime.datetime.fromtimestamp(eval(row[hid['time']]))
                    lon, lat = map(eval, [row[hid[cn]] for cn in ['longitude', 'latitude']])
                    x, y = GPS_xyDrawing.convert_GPS2xy(lon, lat)
                    dt_xy_state += [(dt, x, y, int(row[hid['state']]))]
        save_pickle_file(ofpath, dt_xy_state)
    return dt_xy_state
def process_files(yymm):
    # Aggregate per-driver, per-day trip/shift statistics for one month and
    # append one row per (day, driver) to the ftd_ap_daily_stat CSV.
    print 'handle the file; %s' % yymm
    #
    shift_df = pd.read_csv('%s/%s%s.csv' % (ftd_shift_dir, ftd_shift_prefix, yymm))
    all_trip_df = pd.read_csv('%s/%s%s.csv' % (ftd_trips_dir, ftd_trips_prefix, yymm))
    loc_trip_df = pd.read_csv('%s/%s%s.csv' % (ap_ep_dir, ap_ep_prefix, yymm))
    ft_drivers = map(
        int, load_pickle_file('%s/%s%s.pkl' % (full_time_driver_dir, ft_drivers_prefix, yymm)))
    days = set(loc_trip_df['dd'])
    #
    yy, mm = int(yymm[:2]), int(yymm[2:])
    for dd in days:
        day_all_trip_df = all_trip_df[(all_trip_df['dd'] == dd)]
        day_loc_trip_df = loc_trip_df[(loc_trip_df['dd'] == dd)]
        day_shift_df = shift_df[(shift_df['dd'] == dd)]
        for did in ft_drivers:
            #
            # All
            #
            d_all_trip = day_all_trip_df[(day_all_trip_df['did'] == did)]
            d_shift = day_shift_df[(day_shift_df['did'] == did)]
            all_num = len(d_all_trip['fare'])
            # pro-dur scaled by SEC60; presumably minutes -> seconds — TODO confirm unit.
            pro_dur = sum(d_shift['pro-dur']) * SEC60
            all_fare = sum(d_all_trip['fare'])
            #
            # Specific location
            #
            d_loc_trip = day_loc_trip_df[(day_loc_trip_df['did'] == did)]
            loc_num = len(d_loc_trip['fare'])
            loc_dur = sum(d_loc_trip['duration'])
            loc_fare = sum(d_loc_trip['fare'])
            loc_ep = sum(d_loc_trip['economic-profit'])
            loc_qtime = sum(d_loc_trip['queueing-time'])
            #
            # Trips that started inside the location (driver in, passenger in).
            d_loc_trip_in = d_loc_trip[(d_loc_trip['trip-mode'] == DIn_PIn)]
            locIn_num = len(d_loc_trip_in['fare'])
            locIn_dur = sum(d_loc_trip_in['duration'])
            locIn_fare = sum(d_loc_trip_in['fare'])
            locIn_ep = sum(d_loc_trip_in['economic-profit'])
            locIn_qtime = sum(d_loc_trip_in['queueing-time'])
            #
            # Trips where the driver came from outside to pick up inside.
            d_loc_trip_out = d_loc_trip[(d_loc_trip['trip-mode'] == DOut_PIn)]
            locOut_num = len(d_loc_trip_out['fare'])
            locOut_dur = sum(d_loc_trip_out['duration'])
            locOut_fare = sum(d_loc_trip_out['fare'])
            locOut_ep = sum(d_loc_trip_out['economic-profit'])
            locOut_qtime = sum(d_loc_trip_out['queueing-time'])
            #
            with open(ftd_ap_daily_stat_fpath, 'a') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                writer.writerow([
                    yy, mm, dd, did,
                    all_num, pro_dur, all_fare,
                    loc_num, loc_dur, loc_fare, loc_ep, loc_qtime,
                    locIn_num, locIn_dur, locIn_fare, locIn_ep, locIn_qtime,
                    locOut_num, locOut_dur, locOut_fare, locOut_ep, locOut_qtime
                ])
def run():
    # Fit a per-driver OLS of airport queueing time on the DIn_PIn indicator
    # over year 2010 full-time drivers, writing one result row per driver.
    if not check_path_exist(ssd_apIn_fpath):
        # First pass: build the (apQTime, apIn, did) observation CSV.
        with open(ssd_apIn_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            headers = ['apQTime', 'apIn', 'did']
            writer.writerow(headers)
            for m in xrange(1, 13):
                yymm = '10%02d' % m
                if yymm in ['1010']:
                    continue  # month skipped (data issue)
                logger.info('Start handling; %s' % yymm)
                ft_drivers = map(int, load_pickle_file('%s/%s%s.pkl' % (full_time_driver_dir, ft_drivers_prefix, yymm)))
                ap_ep_fpath = '%s/%s%s.csv' % (ap_ep_dir, ap_ep_prefix, yymm)
                with open(ap_ep_fpath, 'rb') as r_csvfile:
                    reader = csv.reader(r_csvfile)
                    headers = reader.next()
                    hid = {h: i for i, h in enumerate(headers)}
                    handling_day = 0
                    for row in reader:
                        did = int(row[hid['did']])
                        # NOTE(review): list membership is O(n) per row; a set would be faster.
                        if did not in ft_drivers:
                            continue
                        t = eval(row[hid['start-time']])
                        cur_dt = datetime.datetime.fromtimestamp(t)
                        if handling_day != cur_dt.day:
                            # Progress log once per calendar day.
                            logger.info('...ing; %s(%dth)' % (yymm, handling_day))
                            handling_day = cur_dt.day
                        apIn = 1 if int(row[hid['trip-mode']]) == DIn_PIn else 0
                        apQTime = eval(row[hid['queueing-time']]) / float(SEC60)
                        new_row = [apQTime, apIn, did]
                        writer.writerow(new_row)
    #
    df = pd.read_csv(ssd_apIn_fpath)
    # Drop 3-sigma outliers in queueing time before fitting.
    df = df[~(np.abs(df['apQTime'] - df['apQTime'].mean()) > (3 * df['apQTime'].std()))]
    minNumSample = 40
    with open(ssd_sensitivity_fpath, 'wb') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        headers = ['did', 'F_pValue', 'rSqure', 'rSqureAdj',
                   'coef_apIn', 'pValue_apIn', 'coef_const', 'pValue_const']
        writer.writerow(headers)
        for did in set(df['did']) :
            did_df = df[(df['did'] == did)]
            if len(did_df) < minNumSample:
                continue  # too few observations for a stable fit
            if len(did_df[(did_df['apIn'] == 0)]) < 4:
                continue  # regressor needs variation (some apIn == 0 rows)
            y = did_df['apQTime']
            X = did_df['apIn']
            X = sm.add_constant(X)
            res = sm.OLS(y, X).fit()
            if np.isnan(res.f_pvalue):
                continue
            try:
                writer.writerow([did, res.f_pvalue, res.rsquared, res.rsquared_adj,
                                 res.params['apIn'], res.pvalues['apIn'],
                                 res.params['const'], res.pvalues['const']])
            except Exception as _:
                # NOTE(review): silently skips drivers whose fit lacks the
                # expected parameter labels; consider logging.
                pass
def gen_summary2010():
    # Join 2010 per-driver intellect coefficients with aggregated yearly
    # statistics and write one CSV row per driver with a usable coefficient.
    intellect2010_fpath = '%s/%s%s.csv' % (
        statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, '2010')
    with open(intellect2010_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = [
            'driverID', 'locInCoef', 'wleTripNumber', 'wleOperatingHour',
            'wleFare', 'locTripNumber', 'locInNumber', 'locOutNumber',
            'locQTime', 'locEP', 'locDuration', 'locFare', 'wleProductivity',
            'QTime/locTrip', 'EP/locTrip', 'locProductivity', 'locInRatio'
        ]
        writer.writerow(header)
    #
    driverIntellect2010 = load_pickle_file(
        '%s/%s%s.pkl' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, '2010'))
    df = pd.read_csv('%s/Filtered-%s%s.csv' % (statisticsAllDrivers_ap_dpath,
                                               statisticsAllDriversDay_ap_prefix, '2010'))
    # Sum the daily stats per driver; keep only the additive columns.
    agg_df = df.groupby(['driverID']).sum().reset_index()
    candi_drop_cn = []
    for cn in agg_df.columns:
        if cn not in [
                'driverID', 'wleTripNumber', 'wleOperatingHour', 'wleFare',
                'locTripNumber', 'locInNumber', 'locOutNumber', 'locQTime',
                'locEP', 'locDuration', 'locFare'
        ]:
            candi_drop_cn.append(cn)
    agg_df = agg_df.drop(candi_drop_cn, axis=1)
    #
    # Derived ratios. SEC60 rescales the duration denominator — unit
    # conversion presumably; TODO confirm.
    agg_df['wleProductivity'] = agg_df['wleFare'] / agg_df['wleOperatingHour']
    agg_df['QTime/locTrip'] = agg_df['locQTime'] / agg_df['locTripNumber']
    agg_df['EP/locTrip'] = agg_df['locEP'] / agg_df['locTripNumber']
    agg_df['locProductivity'] = agg_df['locFare'] / (
        agg_df['locQTime'] + agg_df['locDuration']) * SEC60
    agg_df['locInRatio'] = agg_df['locInNumber'] / agg_df['locTripNumber']
    allDrivers = set(agg_df['driverID'])
    for did, (_, coef) in driverIntellect2010.iteritems():
        if coef == 'X':
            continue  # no usable coefficient for this driver
        if did not in allDrivers:
            continue
        # NOTE(review): re-opens the output in append mode for every driver;
        # one open outside the loop would be cheaper.
        with open(intellect2010_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_row = [did, coef]
            for cn in [
                    'wleTripNumber', 'wleOperatingHour', 'wleFare',
                    'locTripNumber', 'locInNumber', 'locOutNumber', 'locQTime',
                    'locEP', 'locDuration', 'locFare', 'wleProductivity',
                    'QTime/locTrip', 'EP/locTrip',
                    'locProductivity', 'locInRatio'
            ]:
                new_row += agg_df.loc[agg_df['driverID'] == did][cn].tolist()
            writer.writerow(new_row)
def process_files(yymm):
    # For each day of the month and each full-time driver, compute whole-day
    # and location-specific trip aggregates and append them to the daily-stat CSV.
    print 'handle the file; %s' % yymm
    #
    shift_df = pd.read_csv('%s/%s%s.csv' % (ftd_shift_dir, ftd_shift_prefix, yymm))
    all_trip_df = pd.read_csv('%s/%s%s.csv' % (ftd_trips_dir, ftd_trips_prefix, yymm))
    loc_trip_df = pd.read_csv('%s/%s%s.csv' % (ap_ep_dir, ap_ep_prefix, yymm))
    ft_drivers = map(int, load_pickle_file('%s/%s%s.pkl' % (full_time_driver_dir, ft_drivers_prefix, yymm)))
    days = set(loc_trip_df['dd'])
    #
    yy, mm = int(yymm[:2]), int(yymm[2:])
    for dd in days:
        day_all_trip_df = all_trip_df[(all_trip_df['dd'] == dd)]
        day_loc_trip_df = loc_trip_df[(loc_trip_df['dd'] == dd)]
        day_shift_df = shift_df[(shift_df['dd'] == dd)]
        for did in ft_drivers:
            #
            # All
            #
            d_all_trip = day_all_trip_df[(day_all_trip_df['did'] == did)]
            d_shift = day_shift_df[(day_shift_df['did'] == did)]
            all_num = len(d_all_trip['fare'])
            # pro-dur scaled by SEC60; presumably a minutes -> seconds
            # conversion — TODO confirm unit.
            pro_dur = sum(d_shift['pro-dur']) * SEC60
            all_fare = sum(d_all_trip['fare'])
            #
            # Specific location
            #
            d_loc_trip = day_loc_trip_df[(day_loc_trip_df['did'] == did)]
            loc_num = len(d_loc_trip['fare'])
            loc_dur = sum(d_loc_trip['duration'])
            loc_fare = sum(d_loc_trip['fare'])
            loc_ep = sum(d_loc_trip['economic-profit'])
            loc_qtime = sum(d_loc_trip['queueing-time'])
            #
            # Subset: driver already inside when picking up (DIn_PIn).
            d_loc_trip_in = d_loc_trip[(d_loc_trip['trip-mode'] == DIn_PIn)]
            locIn_num = len(d_loc_trip_in['fare'])
            locIn_dur = sum(d_loc_trip_in['duration'])
            locIn_fare = sum(d_loc_trip_in['fare'])
            locIn_ep = sum(d_loc_trip_in['economic-profit'])
            locIn_qtime = sum(d_loc_trip_in['queueing-time'])
            #
            # Subset: driver arrived from outside to pick up (DOut_PIn).
            d_loc_trip_out = d_loc_trip[(d_loc_trip['trip-mode'] == DOut_PIn)]
            locOut_num = len(d_loc_trip_out['fare'])
            locOut_dur = sum(d_loc_trip_out['duration'])
            locOut_fare = sum(d_loc_trip_out['fare'])
            locOut_ep = sum(d_loc_trip_out['economic-profit'])
            locOut_qtime = sum(d_loc_trip_out['queueing-time'])
            #
            with open(ftd_ap_daily_stat_fpath, 'a') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                writer.writerow([yy, mm, dd, did,
                                 all_num, pro_dur, all_fare,
                                 loc_num, loc_dur, loc_fare, loc_ep, loc_qtime,
                                 locIn_num, locIn_dur, locIn_fare, locIn_ep, locIn_qtime,
                                 locOut_num, locOut_dur, locOut_fare, locOut_ep, locOut_qtime])
def get_sgBoarder_xy():
    """Return the Singapore border as a list of (x, y) points, pickle-cached."""
    fpath = 'sgBorder_xy.pkl'
    if check_path_exist(fpath):
        # Cache hit: reuse the previously projected border.
        return load_pickle_file(fpath)
    # Cache miss: project every border GPS coordinate once and persist.
    sgBorder_xy = [(x, y) for x, y in
                   (convert_GPS2xy(lon, lat) for lon, lat in sg_border)]
    save_pickle_file(fpath, sgBorder_xy)
    return sgBorder_xy
def run():
    # Plot 2009 vs 2010 monthly-fare histograms and print a Welch t-test.
    a1_dir = charts_dir + '/b_aggregated_a1 monthly fare'
    check_dir_create(a1_dir)
    #
    Y09, Y10 = load_pickle_file(driver_monthly_fare_fn)
    num_bin = 50
    #
    # equal_var=False: Welch's t-test (unequal variances).
    print 't statistics %.3f, p-value %.3f' % (stats.ttest_ind(Y09, Y10, equal_var=False))
    #
    one_histogram((8, 6), '', 'Fare (S$)', 'Probability', num_bin, Y09, a1_dir + '/Y2009_monthly_fares')
    one_histogram((8, 6), '', 'Fare (S$)', 'Probability', num_bin, Y10, a1_dir + '/Y2010_monthly_fares')
def get_sgZones():
    """Return the Singapore zone mapping with xy coordinates attached, cached."""
    ofpath = 'sgZone.pkl'
    if check_path_exist(ofpath):
        return load_pickle_file(ofpath)
    zones = get_sg_zones()
    for zone in zones.values():
        # Project the centroid and the polygon outline from GPS to drawing xy,
        # and reset the drawing flag.
        zone.cCoor_xy = convert_GPS2xy(*zone.cCoor_gps)
        zone.polyPoints_xy = [convert_GPS2xy(*p) for p in zone.polyPoints_gps]
        zone.marked = False
    save_pickle_file(ofpath, zones)
    return zones
def get_sgRoards_xy():
    """Return road polylines as lists of projected xy points, pickle-cached."""
    ofpath = 'sgRoards_xy.pkl'
    if check_path_exist(ofpath):
        return load_pickle_file(ofpath)
    # Convert every road's GPS polyline into xy space, one list per road.
    roads = []
    for _, coords in get_SG_roads():
        roads.append([convert_GPS2xy(lon, lat) for lon, lat in coords])
    save_pickle_file(ofpath, roads)
    return roads
def run():
    # Generate the aggregated monthly-fare histogram charts for 2009 and 2010
    # and report a t-test of the two fare distributions.
    a1_dir = charts_dir + '/b_aggregated_a1 monthly fare'
    check_dir_create(a1_dir)
    #
    Y09, Y10 = load_pickle_file(driver_monthly_fare_fn)
    num_bin = 50
    #
    # Welch's t-test (equal_var=False) between the two years' fares.
    print 't statistics %.3f, p-value %.3f' % (stats.ttest_ind(
        Y09, Y10, equal_var=False))
    #
    one_histogram((8, 6), '', 'Fare (S$)', 'Probability', num_bin, Y09,
                  a1_dir + '/Y2009_monthly_fares')
    one_histogram((8, 6), '', 'Fare (S$)', 'Probability', num_bin, Y10,
                  a1_dir + '/Y2010_monthly_fares')
def gen_summary():
    """Summarize 2009-vs-2010 intellect coefficients into a CSV.

    Writes only drivers seen in both years whose coefficients are numeric
    (not 'X') and strictly negative in both years.
    """
    driverIntellect2009 = load_pickle_file(
        '%s/%s%s.pkl' % (statisticsAllDrivers_ap_dpath,
                         statisticsAllDriversIntellect_ap_prefix, '2009'))
    driverIntellect2010 = load_pickle_file(
        '%s/%s%s.pkl' % (statisticsAllDrivers_ap_dpath,
                         statisticsAllDriversIntellect_ap_prefix, '2010'))
    driverSet2009, driverSet2010 = set(driverIntellect2009.keys()), set(
        driverIntellect2010.keys())
    driverSetBoth = driverSet2009.intersection(driverSet2010)
    # Year-exclusive sets: computed but not used further down.
    onlySet2009 = driverSet2009.difference(driverSetBoth)
    onlySet2010 = driverSet2010.difference(driverSetBoth)
    #
    # fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, 'all')
    fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ap_dpath,
                             statisticsAllDriversIntellect_ap_prefix,
                             'all-negativeOnly')
    with open(fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = [
            'did', 'numY2009', 'numY2010', 'coefY2009', 'coefY2010', 'coefDiff'
        ]
        writer.writerow(header)
        for did in driverSetBoth:
            num2009, coef2009 = driverIntellect2009[did]
            num2010, coef2010 = driverIntellect2010[did]
            # if coef2009 == 'X' or coef2010 == 'X':
            #     writer.writerow([did, num2009, num2010, coef2009, coef2010, 'X'])
            # else:
            #     writer.writerow([did, num2009, num2010, coef2009, coef2010, coef2009 - coef2010])
            if coef2009 == 'X' or coef2010 == 'X':
                continue  # 'X' means no usable coefficient for that year
            if coef2009 < 0 and coef2010 < 0:
                writer.writerow([
                    did, num2009, num2010, coef2009, coef2010, coef2009 - coef2010
                ])
def run(moduloIndex):
    """Process the reducer buckets owned by this worker (selected by modulo)."""
    logger.info('loading driversRelations %s; %s' % (year, depVar))
    superSet_fpath = '%s/%sFiltered-superSet-%s%s.pkl' % (if_dpath, depVar, if_prefixs, year)
    driversRelations = load_pickle_file(superSet_fpath)
    # Deal driver ids round-robin into numReducers buckets.
    driver_subsets = [[] for _ in range(numReducers)]
    for idx, driver_id in enumerate(driversRelations.keys()):
        driver_subsets[idx % numReducers].append(driver_id)
    for idx, subset in enumerate(driver_subsets):
        if idx % numWorker != moduloIndex:
            continue  # another worker owns this bucket
        # Union of all related pick-up drivers for this bucket.
        pickUp_drivers = set()
        for driver_id in subset:
            pickUp_drivers.update(driversRelations[driver_id])
        process_files(idx, subset, pickUp_drivers)
def process_files(yymm):
    # Filter the month's shift and trip CSVs down to full-time drivers only,
    # writing the reduced copies into the ftd_* directories.
    print 'handle the file; %s' % yymm
    ft_drivers = load_pickle_file(
        '%s/%s%s.pkl' % (full_time_driver_dir, ft_drivers_prefix, yymm))
    with open('%s/%s%s.csv' % (shift_pro_dur_dir, shift_pro_dur_prefix, yymm), 'rt') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h: i for i, h in enumerate(headers)}
        with open('%s/%s%s.csv' % (ftd_shift_dir, ftd_shift_prefix, yymm), 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile)
            writer.writerow(headers)
            for row in reader:
                did = row[hid['did']]
                # NOTE(review): 'did' here is the raw CSV string; other call
                # sites wrap ft_drivers in map(int, ...) — confirm the pickle
                # stores strings, otherwise this filter drops everything.
                if did not in ft_drivers:
                    continue
                writer.writerow(row)
    #
    with open('%s/%s%s.csv' % (trips_dpath, trip_prefix, yymm), 'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h: i for i, h in enumerate(headers)}
        with open('%s/%s%s.csv' % (ftd_trips_dir, ftd_trips_prefix, yymm), 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = [
                'did', 'start-time', 'duration', 'fare', 'yy', 'mm', 'dd', 'hh'
            ]
            writer.writerow(new_headers)
            #
            # filter out trips data based on two factors;
            # 1. full time driver
            #
            for row in reader:
                # NOTE(review): eval() parses the numeric timestamp; unsafe on
                # untrusted input — float() would be safer.
                st_ts = eval(row[hid['start-time']])
                st_dt = datetime.datetime.fromtimestamp(st_ts)
                did = row[hid['did']]
                if did not in ft_drivers:
                    continue
                writer.writerow([
                    row[hid['did']], row[hid['start-time']], row[hid['duration']],
                    row[hid['fare']], st_dt.year - 2000, st_dt.month, st_dt.day,
                    st_dt.hour
                ])
    #
    print 'end the file; %s' % yymm
def run(moduloIndex):
    # Worker entry point: handle only the reducer buckets whose index
    # satisfies i % numWorker == moduloIndex.
    logger.info('loading driversRelations %s; %s' % (year, depVar))
    superSet_fpath = '%s/%sFiltered-superSet-%s%s.pkl' % (if_dpath, depVar, if_prefixs, year)
    driversRelations = load_pickle_file(superSet_fpath)
    whole_drivers = driversRelations.keys()
    # Round-robin split of all drivers into numReducers subsets.
    driver_subsets = [[] for _ in range(numReducers)]
    for i, did in enumerate(whole_drivers):
        driver_subsets[i % numReducers].append(did)
    for i, driver_subset in enumerate(driver_subsets):
        if i % numWorker != moduloIndex:
            continue
        # Union of related pick-up drivers across the subset.
        pickUp_drivers = set()
        for did1 in driver_subset:
            pickUp_drivers = pickUp_drivers.union(driversRelations[did1])
        process_files(i, driver_subset, pickUp_drivers)
def run():
    # Filter the all-driver statistics CSVs down to the self-selected (ss)
    # driver subset, mirroring the directory/prefix layout. The ap (airport)
    # pairs are currently commented out; only ns (night-safari) runs.
    for dpath in [
            # statisticsSsDrivers_ap_dpath,
            statisticsSsDrivers_ns_dpath]:
        check_dir_create(dpath)
    #
    # Union of ss drivers across all usable months of 2009-2010.
    ssDrivers = set()
    for y in xrange(9, 11):
        for m in xrange(1, 13):
            yymm = '%02d%02d' % (y, m)
            if yymm in ['0912', '1010']:
                # both years data are corrupted
                continue
            ssDrivers = ssDrivers.union(load_pickle_file('%s/%s%s.pkl' % (ss_drivers_dpath, ss_drivers_prefix, yymm)))
    #
    for all_dpath, ss_dpath in [
            # (statisticsAllDrivers_ap_dpath, statisticsSsDrivers_ap_dpath),
            (statisticsAllDrivers_ns_dpath, statisticsSsDrivers_ns_dpath)]:
        for all_prefix, ss_prefix in [
                # (statisticsAllDriversDay_ap_prefix, statisticsSsDriversDay_ap_prefix),
                (statisticsAllDriversDay_ns1517_prefix, statisticsSsDriversDay_ns1517_prefix),
                (statisticsAllDriversDay_ns2023_prefix, statisticsSsDriversDay_ns2023_prefix),
                #
                # (statisticsAllDriversMonth_ap_prefix, statisticsSsDriversMonth_ap_prefix),
                (statisticsAllDriversMonth_ns1517_prefix, statisticsSsDriversMonth_ns1517_prefix),
                (statisticsAllDriversMonth_ns2023_prefix, statisticsSsDriversMonth_ns2023_prefix),
                #
                # (statisticsAllDriversTrip_ap_prefix, statisticsSsDriversTrip_ap_prefix),
                (statisticsAllDriversTrip_ns1517_prefix, statisticsSsDriversTrip_ns1517_prefix),
                (statisticsAllDriversTrip_ns2023_prefix, statisticsSsDriversTrip_ns2023_prefix),
                ]:
            for fn in get_all_files(all_dpath, '%s*' % all_prefix):
                # Period (e.g. yymm) is the third dash-separated token of the file name.
                period = fn[:-len('.csv')].split('-')[2]
                with open('%s/%s' % (all_dpath, fn), 'rt') as r_csvfile:
                    reader = csv.reader(r_csvfile)
                    header = reader.next()
                    hid = {h: i for i, h in enumerate(header)}
                    with open('%s/%s%s.csv' % (ss_dpath, ss_prefix, period), 'wt') as w_csvfile:
                        writer = csv.writer(w_csvfile)
                        writer.writerow(header)
                        for row in reader:
                            did = int(row[hid['driverID']])
                            if did not in ssDrivers:
                                continue
                            writer.writerow(row)
def get_sgGrid_xy():
    """Return grid lines as [start_xy, end_xy] segment pairs, pickle-cached."""
    ofpath = 'sgGrid_xy.pkl'
    if check_path_exist(ofpath):
        return load_pickle_file(ofpath)
    lons, lats = generate_sg_grid()
    segments = []
    # Vertical lines: one segment per longitude, spanning the latitude range.
    for lon in lons:
        sx, sy = convert_GPS2xy(lon, lats[0])
        ex, ey = convert_GPS2xy(lon, lats[-1])
        segments.append([(sx, sy), (ex, ey)])
    # Horizontal lines: one segment per latitude, spanning the longitude range.
    for lat in lats:
        sx, sy = convert_GPS2xy(lons[0], lat)
        ex, ey = convert_GPS2xy(lons[-1], lat)
        segments.append([(sx, sy), (ex, ey)])
    save_pickle_file(ofpath, segments)
    return segments
def gen_summary2010():
    # Merge 2010 intellect coefficients with per-driver aggregated statistics
    # and emit one summary CSV row per driver whose coefficient is usable.
    intellect2010_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ap_dpath,
                                           statisticsAllDriversIntellect_ap_prefix, '2010')
    with open(intellect2010_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = ['driverID', 'locInCoef',
                  'wleTripNumber', 'wleOperatingHour', 'wleFare',
                  'locTripNumber', 'locInNumber', 'locOutNumber',
                  'locQTime', 'locEP', 'locDuration', 'locFare',
                  'wleProductivity', 'QTime/locTrip', 'EP/locTrip',
                  'locProductivity', 'locInRatio']
        writer.writerow(header)
    #
    driverIntellect2010 = load_pickle_file('%s/%s%s.pkl' % (statisticsAllDrivers_ap_dpath,
                                                            statisticsAllDriversIntellect_ap_prefix, '2010'))
    df = pd.read_csv('%s/Filtered-%s%s.csv' % (statisticsAllDrivers_ap_dpath,
                                               statisticsAllDriversDay_ap_prefix, '2010'))
    # Aggregate the daily statistics per driver; drop non-additive columns.
    agg_df = df.groupby(['driverID']).sum().reset_index()
    candi_drop_cn = []
    for cn in agg_df.columns:
        if cn not in ['driverID', 'wleTripNumber', 'wleOperatingHour', 'wleFare',
                      'locTripNumber', 'locInNumber', 'locOutNumber',
                      'locQTime', 'locEP', 'locDuration', 'locFare']:
            candi_drop_cn.append(cn)
    agg_df = agg_df.drop(candi_drop_cn, axis=1)
    #
    # Derived ratio columns; SEC60 rescales the time denominator (unit
    # conversion, presumably) — TODO confirm.
    agg_df['wleProductivity'] = agg_df['wleFare'] / agg_df['wleOperatingHour']
    agg_df['QTime/locTrip'] = agg_df['locQTime'] / agg_df['locTripNumber']
    agg_df['EP/locTrip'] = agg_df['locEP'] / agg_df['locTripNumber']
    agg_df['locProductivity'] = agg_df['locFare'] / (agg_df['locQTime'] + agg_df['locDuration']) * SEC60
    agg_df['locInRatio'] = agg_df['locInNumber'] / agg_df['locTripNumber']
    allDrivers = set(agg_df['driverID'])
    for did, (_, coef) in driverIntellect2010.iteritems():
        if coef == 'X':
            continue  # no usable coefficient
        if did not in allDrivers:
            continue
        # NOTE(review): file is re-opened in append mode per driver; hoisting
        # the open above the loop would be cheaper.
        with open(intellect2010_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_row = [did, coef]
            for cn in ['wleTripNumber', 'wleOperatingHour', 'wleFare',
                       'locTripNumber', 'locInNumber', 'locOutNumber',
                       'locQTime', 'locEP', 'locDuration', 'locFare',
                       'wleProductivity', 'QTime/locTrip', 'EP/locTrip',
                       'locProductivity', 'locInRatio']:
                new_row += agg_df.loc[agg_df['driverID'] == did][cn].tolist()
            writer.writerow(new_row)
def run():
    """Fan out 2012 trip-pattern processing across a multiprocessing pool."""
    check_dir_create(tfZ_TP_dpath)
    numWorker = 6
    init_multiprocessor(numWorker)
    count_num_jobs = 0
    numReducers = numWorker * 10
    #
    yyyy = '20%02d' % 12
    logger.info('loading driversRelations %s' % yyyy)
    driversRelations = load_pickle_file(driversRelations_fpaths[yyyy])
    # Deal driver ids round-robin into numReducers buckets.
    driver_subsets = [[] for _ in range(numReducers)]
    for idx, driver_id in enumerate(driversRelations.keys()):
        driver_subsets[idx % numReducers].append(driver_id)
    for idx, subset in enumerate(driver_subsets):
        # process_files(yyyy, idx, subset, driversRelations)
        # Union of related pick-up drivers for this bucket, then enqueue.
        pickUp_drivers = set()
        for driver_id in subset:
            pickUp_drivers.update(driversRelations[driver_id])
        put_task(process_files, [yyyy, idx, subset, pickUp_drivers])
        count_num_jobs += 1
    end_multiprocessor(count_num_jobs)
def run():
    # Dispatch trip-pattern processing jobs for year 2012 to a pool of
    # worker processes, one job per reducer bucket of drivers.
    check_dir_create(tfZ_TP_dpath)
    numWorker = 6
    init_multiprocessor(numWorker)
    count_num_jobs = 0
    numReducers = numWorker * 10
    #
    yyyy = '20%02d' % 12
    logger.info('loading driversRelations %s' % yyyy)
    driversRelations = load_pickle_file(driversRelations_fpaths[yyyy])
    whole_drivers = driversRelations.keys()
    # Round-robin split of drivers into numReducers subsets.
    driver_subsets = [[] for _ in range(numReducers)]
    for i, did in enumerate(whole_drivers):
        driver_subsets[i % numReducers].append(did)
    for i, driver_subset in enumerate(driver_subsets):
        # process_files(yyyy, i, driver_subset, driversRelations)
        pickUp_drivers = set()
        for did1 in driver_subset:
            pickUp_drivers = pickUp_drivers.union(driversRelations[did1])
        put_task(process_files, [yyyy, i, driver_subset, pickUp_drivers])
        count_num_jobs += 1
    end_multiprocessor(count_num_jobs)
def run(time_from, time_to):
    # Pipeline driver: zone split -> log preprocessing -> trip preprocessing.
    # Each step is skipped when its output file already exists.
    #
    # Step 1. Split Singapore into zones
    #
    if not check_path_exist(grid_info_fn):
        from taxi_common.sg_grid_zone import run as run_split_into_zones  # @UnresolvedImport
        hl_points, vl_points, zones = run_split_into_zones(rp_zone)
    else:
        hl_points, vl_points, zones = load_pickle_file(grid_info_fn)
    #
    # Step 2. Preprocess logs
    #
    processed_log_fn = get_processed_log_fn(time_from, time_to)
    if not check_path_exist(processed_log_fn):
        from preprocess_logs import run as run_preprocess_logs
        run_preprocess_logs(hl_points, vl_points, time_from, time_to)
    #
    # Step 3. Preprocess trips
    #
    processed_trip_fn = get_processed_trip_fn(time_from, time_to)
    if not check_path_exist(processed_trip_fn):
        from preprocess_trips import run as run_preprocess_trips
        run_preprocess_trips(hl_points, vl_points, time_from, time_to)
def run(time_from, time_to):
    # Top-level preprocessing pipeline for a time window. Steps run only when
    # their cached output is absent; zone grid points feed the later steps.
    #
    # Step 1. Split Singapore into zones
    #
    if not check_path_exist(grid_info_fn):
        from taxi_common.sg_grid_zone import run as run_split_into_zones  # @UnresolvedImport
        hl_points, vl_points, zones = run_split_into_zones(rp_zone)
    else:
        hl_points, vl_points, zones = load_pickle_file(grid_info_fn)
    #
    # Step 2. Preprocess logs
    #
    processed_log_fn = get_processed_log_fn(time_from, time_to)
    if not check_path_exist(processed_log_fn):
        from preprocess_logs import run as run_preprocess_logs
        run_preprocess_logs(hl_points, vl_points, time_from, time_to)
    #
    # Step 3. Preprocess trips
    #
    processed_trip_fn = get_processed_trip_fn(time_from, time_to)
    if not check_path_exist(processed_trip_fn):
        from preprocess_trips import run as run_preprocess_trips
        run_preprocess_trips(hl_points, vl_points, time_from, time_to)
def get_driver_trajectory(did):
    """Load (or build and cache) one driver's trajectory.

    Returns a list of (datetime, x, y, state) tuples in chronological file
    order; results are cached as '<if_prefix><did>.pkl'.
    """
    ofpath = '%s%d.pkl' % (if_prefix, did)
    if check_path_exist(ofpath):
        dt_xy_state = load_pickle_file(ofpath)
    else:
        # Find every per-day log file belonging to this driver.
        dates = []
        for fn in get_all_files(if_dpath, '%s*.csv' % if_prefix):
            _, _date, _did = fn[:-len('.csv')].split('-')
            if int(_did) != did:
                continue
            year = 2000 + int(_date[:2])
            month, day = map(int, [_date[2:4], _date[4:6]])
            dt = datetime.datetime(year, month, day)
            dates += [dt]
        dates.sort()
        dt_xy_state = []
        for dt in dates:
            yy = '%02d' % (dt.year - 2000)
            mm, dd = '%02d' % dt.month, '%02d' % dt.day
            yymmdd = yy + mm + dd
            ifpath = '%s/%s%s-%d.csv' % (if_dpath, if_prefix, yymmdd, did)
            with open(ifpath, 'rb') as logFile:
                reader = csv.reader(logFile)
                header = reader.next()
                # header: time,vehicle-id,driver-id,longitude,latitude,speed,state
                hid = {h: i for i, h in enumerate(header)}
                for row in reader:
                    # NOTE(review): eval() on numeric CSV fields is unsafe on
                    # untrusted input; float() would be safer.
                    dt = datetime.datetime.fromtimestamp(eval(
                        row[hid['time']]))
                    lon, lat = map(
                        eval,
                        [row[hid[cn]] for cn in ['longitude', 'latitude']])
                    x, y = GPS_xyDrawing.convert_GPS2xy(lon, lat)
                    dt_xy_state += [(dt, x, y, int(row[hid['state']]))]
        save_pickle_file(ofpath, dt_xy_state)
    return dt_xy_state
def process_month(yymm):
    """Build the per-month self-selected-driver trips CSV with queueing times.

    Reads three aligned inputs (normal trips, their ext rows, and the raw logs)
    in lock-step, replays each driver's log up to every trip's start time to
    track zone occupancy, and appends one output row per qualifying trip.
    Any exception is dumped to '<script>_<yymm>.txt' and re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        yy, mm = yymm[:2], yymm[2:]
        trip_normal_fpath = '%s/20%s/%s/trips/trips-%s-normal.csv' % (
            taxi_home, yy, mm, yymm)
        trip_ext_fpath = '%s/20%s/%s/trips/trips-%s-normal-ext.csv' % (
            taxi_home, yy, mm, yymm)
        log_fpath = '%s/20%s/%s/logs/logs-%s-normal.csv' % (taxi_home, yy, mm,
                                                            yymm)
        if not check_path_exist(trip_normal_fpath):
            logger.info('The file X exists; %s' % yymm)
            return None
        ss_drivers_fpath = '%s/%s%s.pkl' % (ss_drivers_dpath,
                                            ss_drivers_prefix, yymm)
        if not check_path_exist(ss_drivers_fpath):
            logger.info('The file X exists; %s' % ss_drivers_fpath)
            return None
        ss_drivers = load_pickle_file(ss_drivers_fpath)
        x_points, y_points = get_sg_grid_xy_points()
        #
        ss_trips_fpath = '%s/%s%s.csv' % (ss_trips_dpath, ss_trips_prefix,
                                          yymm)
        if check_path_exist(ss_trips_fpath):
            logger.info('The file had already been processed; %s' % yymm)
            return None
        # Write the header once; data rows are appended per trip below.
        with open(ss_trips_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                'did', 'hour', 'zi', 'zj', 'time', 'day', 'month',
                'start-long', 'start-lat', 'distance', 'duration', 'fare',
                'queueingTime'
            ])
        with open(trip_normal_fpath, 'rb') as tripFileN:
            tripReaderN = csv.reader(tripFileN)
            tripHeaderN = tripReaderN.next()
            # {'trip-id': 0, 'job-id': 1, 'start-time': 2, 'end-time': 3,
            #  'start-long': 4, 'start-lat': 5, 'end-long': 6, 'end-lat': 7,
            #  'vehicle-id': 8, 'distance': 9, 'fare': 10, 'duration': 11,
            #  'start-dow': 12, 'start-day': 13, 'start-hour': 14, 'start-minute': 15,
            #  'end-dow': 16, 'end-day': 17, 'end-hour': 18, 'end-minute': 19}
            hidN = {h: i for i, h in enumerate(tripHeaderN)}
            with open(trip_ext_fpath, 'rb') as tripFileE:
                tripReaderE = csv.reader(tripFileE)
                tripHeaderE = tripReaderE.next()
                #
                # {'start-zone': 0, 'end-zone': 1, 'start-postal': 2, 'driver-id': 4, 'end-postal': 3}
                #
                hidE = {h: i for i, h in enumerate(tripHeaderE)}
                with open(log_fpath, 'rb') as logFile:
                    logReader = csv.reader(logFile)
                    logHeader = logReader.next()
                    hidL = {h: i for i, h in enumerate(logHeader)}
                    handling_day = 0
                    drivers = {}
                    # Trip rows and ext rows are assumed 1:1 and in the same
                    # order — TODO confirm against the producer of these files.
                    for rowN in tripReaderN:
                        rowE = tripReaderE.next()
                        didT = int(rowE[hidE['driver-id']])
                        if didT not in ss_drivers:
                            continue
                        tripTime = eval(rowN[hidN['start-time']])
                        cur_dtT = datetime.datetime.fromtimestamp(tripTime)
                        if handling_day != cur_dtT.day:
                            handling_day = cur_dtT.day
                            logger.info('Processing %s %dth day' %
                                        (yymm, cur_dtT.day))
                        # Keep only weekday trips in the [AM10, PM8) window.
                        if cur_dtT.weekday() in [FRI, SAT, SUN]:
                            continue
                        if cur_dtT.hour < AM10:
                            continue
                        if PM8 <= cur_dtT.hour:
                            continue
                        # Replay log rows up to this trip's start time.
                        # NOTE(review): `continue` here skips the break check,
                        # and logReader.next() raises StopIteration at EOF —
                        # caught by the outer except; confirm this is intended.
                        while True:
                            rowL = logReader.next()
                            logTime = eval(rowL[hidL['time']])
                            didL = int(rowL[hidL['driver-id']])
                            if didL not in ss_drivers:
                                continue
                            t = eval(rowL[hidL['time']])
                            cur_dtL = datetime.datetime.fromtimestamp(t)
                            if cur_dtL.weekday() in [FRI, SAT, SUN]:
                                continue
                            if cur_dtL.hour < AM10:
                                continue
                            if PM8 <= cur_dtL.hour:
                                continue
                            longitude, latitude = eval(
                                rowL[hidL['longitude']]), eval(
                                    rowL[hidL['latitude']])
                            # Map GPS to grid cell; -1 means left of the grid.
                            zi, zj = bisect(x_points, longitude) - 1, bisect(
                                y_points, latitude) - 1
                            if zi < 0 or zj < 0:
                                continue
                            t, s = eval(rowL[hidL['time']]), eval(
                                rowL[hidL['state']])
                            z = (zi, zj)
                            cur_dt = datetime.datetime.fromtimestamp(t)
                            if handling_day != cur_dt.day:
                                handling_day = cur_dt.day
                                logger.info('Processing %s %dth day' %
                                            (yymm, cur_dt.day))
                            if not drivers.has_key(didL):
                                drivers[didL] = driver(didL, t, z, s)
                            else:
                                drivers[didL].update(t, z, s)
                            if tripTime <= logTime:
                                break
                        s_long, s_lat = eval(rowN[hidN['start-long']]), eval(
                            rowN[hidN['start-lat']])
                        zi, zj = bisect(x_points, s_long) - 1, bisect(
                            y_points, s_lat) - 1
                        if zi < 0 or zj < 0:
                            continue
                        if not drivers.has_key(didT):
                            continue
                        if drivers[didT].firstFreeStateTime == -1:
                            continue
                        # Queueing time = trip start minus when the driver
                        # entered the current zone (tracked by `driver`).
                        queueingTime = tripTime - drivers[didT].zoneEnteredTime
                        if queueingTime < 0:
                            continue
                        with open(ss_trips_fpath, 'a') as w_csvfile:
                            writer = csv.writer(w_csvfile, lineterminator='\n')
                            writer.writerow([
                                didT, cur_dtT.hour, zi, zj, tripTime,
                                cur_dtT.day, cur_dtT.month, s_long, s_lat,
                                rowN[hidN['distance']],
                                rowN[hidN['duration']], rowN[hidN['fare']],
                                queueingTime
                            ])
    except Exception as _:
        # Persist the traceback for this month, then let the caller see it.
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def run():
    """Merge the 2009 baseline count graphs, partition with Louvain, and dump
    per-group pickles, coefficient CSVs, plots, and a summary CSV.

    Fixes over the previous revision:
    - progress logging used `'%.2f' % i / float(numEdges)`: `%` binds tighter
      than `/`, so a *string* was divided by a float -> TypeError. Parenthesized.
    - `moduloNumber = numEdges / 10` is 0 when numEdges < 10, making
      `i % moduloNumber` raise ZeroDivisionError. Clamped to at least 1.
    """
    cg_dpath = dpaths['baseline', '2009', 'countGraph']
    cg_prefix = prefixs['baseline', '2009', 'countGraph']
    gp_dpath = dpaths['baseline', '2009', 'groupPartition']
    gp_prefix = prefixs['baseline', '2009', 'groupPartition']
    #
    check_dir_create(gp_dpath)
    #
    gp_summary_fpath = '%s/%ssummary.csv' % (gp_dpath, gp_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (gp_dpath, gp_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
    #
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow([
            'groupName', 'numDrivers', 'numRelations', 'graphComplexity',
            'tieStrength', 'contribution', 'benCon'
        ])
    #
    logger.info('Start handling SP_group_dpath')
    if not check_path_exist(gp_original_fpath):
        # Merge all monthly count graphs into one edge->weight dict.
        original_graph = {}
        for fn in get_all_files(cg_dpath, '%s*' % cg_prefix):
            count_graph = load_pickle_file('%s/%s' % (cg_dpath, fn))
            logger.info('Start handling; %s' % fn)
            numEdges = len(count_graph)
            moduloNumber = max(numEdges / 10, 1)  # avoid modulo-by-zero
            for i, ((did0, did1), w) in enumerate(count_graph.iteritems()):
                if i % moduloNumber == 0:
                    logger.info('Handling; %.2f' % (i / float(numEdges)))
                original_graph[did0, did1] = w
        save_pickle_file(gp_original_fpath, original_graph)
    else:
        original_graph = load_pickle_file(gp_original_fpath)
    #
    # Convert the merged dict into a directed igraph graph; edge weights are
    # absolute values because Louvain needs non-negative weights.
    logger.info('igraph converting')
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    numEdges = len(original_graph)
    moduloNumber = max(numEdges / 10, 1)  # avoid modulo-by-zero
    for i, ((did0, did1), w) in enumerate(original_graph.iteritems()):
        if i % moduloNumber == 0:
            # FIX: parenthesize so '%' formats the ratio, not the bare index.
            logger.info('Handling; %.2f' % (i / float(numEdges)))
        if not did_igid.has_key(did0):
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if not did_igid.has_key(did1):
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    #
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (gp_dpath, gp_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                gn, len(drivers), len(weights), graphComplexity, tie_strength,
                contribution, benCon
            ])
        # Only label vertices on small groups; labels are unreadable beyond ~100.
        gl_img_fpath = '%s/%simg-%s.pdf' % (gp_dpath, gp_prefix, gn)
        layout = sg.layout("kk")
        if len(drivers) < 100:
            ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        else:
            ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (gp_dpath, gp_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
def run():
    """Build the positive significant-relation graph for `year`, partition it
    with Louvain, and dump per-group pickles, coefficient CSVs, and a summary.

    Fixes over the previous revision:
    - filenames matched by '%ssigRelation-%s-*.pkl' were stripped with
      `fn[:-len('.csv')]` — it only worked because '.csv' and '.pkl' happen to
      be the same length; now strips '.pkl' explicitly.
    - local typos corrected: orignal_graph -> original_graph,
      sigRelatioin -> sigRelation (locals only; interface unchanged).
    """
    gp_summary_fpath = '%s/%ssummary.csv' % (of_dpath, of_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (of_dpath, of_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (of_dpath, of_prefix)
    #
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow([
            'groupName', 'numDrivers', 'numRelations', 'graphComplexity',
            'tieStrength', 'contribution', 'benCon'
        ])
    logger.info('Start handling SP_group_dpath')
    # Collect positive-coefficient relations; the influenced driver id is the
    # last dash-separated token of the (extension-stripped) filename.
    original_graph = {}
    for fn in get_all_files(if_dpath,
                            '%ssigRelation-%s-*.pkl' % (if_prefix, year)):
        _, _, _, _, _did1 = fn[:-len('.pkl')].split('-')
        sigRelation = load_pickle_file('%s/%s' % (if_dpath, fn))
        for _did0, coef in sigRelation['pos']:
            did0, did1 = map(int, [_did0, _did1])
            original_graph[did0, did1] = coef
    save_pickle_file(gp_original_fpath, original_graph)
    #
    # Convert to a directed igraph graph; weights are absolute coefficients.
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    for i, ((did0, did1), w) in enumerate(original_graph.iteritems()):
        if not did_igid.has_key(did0):
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if not did_igid.has_key(did1):
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (of_dpath, of_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                gn, len(drivers), len(weights), graphComplexity, tie_strength,
                contribution, benCon
            ])
        gl_img_fpath = '%s/%simg-%s.pdf' % (of_dpath, of_prefix, gn)
        # Plotting is disabled in this variant; kept for reference.
        # layout = sg.layout("kk")
        # if len(drivers) < 100:
        #     ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        # else:
        #     ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (of_dpath, of_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
def run():
    """Threshold the 2009 pair-encounter counts at several day counts, detect
    communities with Louvain (python-louvain on a networkx graph), and persist
    whole-graph/per-community pickles plus a summary CSV per threshold.

    For graphs under 1000 nodes a 3D igraph 'kk' layout is precomputed and
    pickled for later plotting; larger graphs get an empty layout pickle.
    """
    print 'start'
    check_dir_create(com_dir)
    #
    yyyy = '2009'
    # Filename encodes CD (presumably an encounter-distance parameter), node
    # and edge counts — TODO confirm CD's meaning with the producer.
    la_fn = '2009-CD(184)-N(7003)-E(5717371).pkl'
    la_fpath = '%s/%s' % (la_dir, la_fn)
    _, str_CD, _, _ = la_fn[:-len('.pkl')].split('-')
    CD = int(str_CD[len('CD('):-len(')')])
    print 'pick file loading...'
    pairs_day_counting = load_pickle_file(la_fpath)
    print 'finished'
    for thD in [18, 36, 55, 73, 82, 92]:
        thD_dpath = '%s/%s' % (com_dir, '2009-CD(%d)-thD(%d)' % (CD, thD))
        check_dir_create(thD_dpath)
        summary_fpath = '%s/%s-CD(%d)-thD(%d)-community-summary.csv' % (
            thD_dpath, yyyy, CD, thD)
        glayout_fpath = '%s/%s-CD(%d)-thD(%d)-glayout.pkl' % (thD_dpath, yyyy,
                                                              CD, thD)
        # 'wb' is the correct csv open mode on Python 2.
        with open(summary_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = [
                'com-name', 'num-nodes', 'num-edges',
                'tie-strength(# of days encounter / # of drivers)'
            ]
            writer.writerow(new_headers)
        #
        # Keep only pairs that met on at least thD days.
        nxG = nx.Graph()
        for (k0, k1), num_days in pairs_day_counting.iteritems():
            if num_days < thD:
                continue
            nxG.add_edge(k0, k1, weight=num_days)
        print 'Whole graph pickling ...', yyyy, CD, thD
        nx.write_gpickle(
            nxG, '%s/%s-CD(%d)-thD(%d)-whole-N(%d)-E(%d).pkl' %
            (thD_dpath, yyyy, CD, thD, len(nxG.nodes()), len(nxG.edges())))
        n_label, n_comId = [], []
        nxId_igId = {}
        ig_nid = 0
        print 'Partitioning ...'
        partition = community.best_partition(nxG)
        for i, com in enumerate(set(partition.values())):
            list_nodes = [
                nodes for nodes in partition.keys()
                if partition[nodes] == com
            ]
            print i, 'Saving sub-graph ...'
            sub_nxG = nxG.subgraph(list_nodes)
            com_name = 'COM(%d)' % i
            com_fpath = '%s/%s-CD(%d)-thD(%d)-%s-N(%d)-E(%d).pkl' % (
                thD_dpath, yyyy, CD, thD, com_name, len(
                    sub_nxG.nodes()), len(sub_nxG.edges()))
            nx.write_gpickle(sub_nxG, com_fpath)
            _, _, weight = zip(
                *list(sub_nxG.edges_iter(data='weight', default=1)))
            num_nodes, num_edges = len(sub_nxG), len(weight)
            with open(summary_fpath, 'a') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                writer.writerow([
                    com_name, num_nodes, num_edges,
                    sum(weight) / float(num_nodes)
                ])
            #
            print i, 'labeling...'
            # Assign sequential igraph ids per node for the layout step below.
            for n in sub_nxG.nodes():
                n_label.append(n)
                n_comId.append(i)
                nxId_igId[n] = ig_nid
                ig_nid += 1
        #
        if len(nxG.nodes()) < 1000:
            print 'Layout calculating...'
            print datetime.datetime.now()
            Edges = [(nxId_igId[n0], nxId_igId[n1])
                     for (n0, n1) in nxG.edges()]
            print 'finish edge converting', len(Edges)
            print datetime.datetime.now()
            igG = ig.Graph(Edges, directed=False)
            layt = igG.layout('kk', dim=3)
            print 'finish layout calculation'
            print datetime.datetime.now()
            #
            save_pickle_file(glayout_fpath, [n_label, n_comId, layt, Edges])
        else:
            # Too large to lay out; record an empty placeholder.
            save_pickle_file(glayout_fpath, [])
def _filter_rows_by_driver(ifpath, ofpath, ssDrivers):
    """Copy `ifpath` to `ofpath`, keeping the header and only rows whose
    'did' column is in `ssDrivers` (a set/collection of int driver ids)."""
    with open(ifpath, 'rt') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h: i for i, h in enumerate(headers)}
        with open(ofpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile)
            writer.writerow(headers)
            for row in reader:
                if int(row[hid['did']]) not in ssDrivers:
                    continue
                writer.writerow(row)


def process_files(yymm):
    """Restrict one month's shift, trip, and economic-profit CSVs to the
    self-selected drivers (ssDrivers pickle) for month `yymm` ('yymm').

    The three pass-through filters (shift/pro-dur, AP economic profit,
    NS economic profit) share `_filter_rows_by_driver`; the trip file is
    handled inline because it projects a different column subset and adds
    year/month columns. Exceptions are dumped to '<script>_<yymm>.txt'
    and re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle %s' % yymm)
        ssDrivers = load_pickle_file(
            '%s/%s%s.pkl' % (ss_drivers_dpath, ss_drivers_prefix, yymm))
        #
        # Shift / productive-duration rows for the selected drivers.
        _filter_rows_by_driver(
            '%s/%s%s.csv' % (shiftProDur_dpath, shiftProDur_prefix, yymm),
            '%s/%s%s.csv' % (ssDriverShiftProDur_dpath,
                             ssDriverShiftProDur_prefix, yymm), ssDrivers)
        #
        # Trip rows: project a fixed column subset and add year/month.
        yy, mm = yymm[:2], yymm[-2:]
        year, month = 2000 + int(yy), int(mm)
        with open('%s/%s%s.csv' % (trip_dpath, trip_prefix, yymm),
                  'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            with open(
                    '%s/%s%s.csv' % (ssDriverTrip_dpath, ssDriverTrip_prefix,
                                     yymm), 'wt') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                new_headers = [
                    'did', 'startTime', 'duration', 'fare', 'year', 'month',
                    'day', 'hour'
                ]
                writer.writerow(new_headers)
                for row in reader:
                    did = int(row[hid['did']])
                    if did not in ssDrivers:
                        continue
                    writer.writerow([
                        row[hid['did']], row[hid['startTime']],
                        row[hid['duration']], row[hid['fare']], year, month,
                        row[hid['day']], row[hid['hour']]
                    ])
        #
        # Airport economic-profit rows.
        _filter_rows_by_driver(
            '%s/%s%s.csv' % (economicProfit_ap_dpath, economicProfit_ap_prefix,
                             yymm),
            '%s/%s%s.csv' % (ssDriverEP_ap_dpath, ssDriverEP_ap_prefix, yymm),
            ssDrivers)
        #
        # Night-safari economic-profit rows.
        _filter_rows_by_driver(
            '%s/%s%s.csv' % (economicProfit_ns_dpath, economicProfit_ns_prefix,
                             yymm),
            '%s/%s%s.csv' % (ssDriverEP_ns_dpath, ssDriverEP_ns_prefix, yymm),
            ssDrivers)
        logger.info('end %s' % yymm)
    except Exception as _:
        # Persist the traceback for this month, then let the caller see it.
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
'H', # hexagon2 'd', # thin_diamond '|', # vline '_', # hline '.', # point ',', # pixel 'D', # diamond '8', # octagon ) # # The number of trips depending on hour # from taxi_common.file_handling_functions import load_pickle_file hour_tripNum = load_pickle_file('_hour_tripNum.pkl') # _figsize = (8, 6) _fontsize = 14 _data = hour_tripNum.values() xTickMarks = hour_tripNum.keys() _xlabel = 'Hour' _ylabel = 'The number of trips' # fig = plt.figure(figsize=_figsize) ax = fig.add_subplot(111) ind = np.arange(len(_data)) width = 0.5 # the width of the bars # ax.bar(ind, _data, color='blue') # axes and labels
def process_month(yymm):
    """Produce '<ss_trips_prefix><yymm>.csv' with queueing time per trip.

    Walks the normal-trip and ext-trip CSVs in lock-step (assumed row-aligned
    — TODO confirm), replaying the raw log up to each trip's start time so a
    per-driver `driver` object tracks zone occupancy; writes one row per
    qualifying weekday [AM10, PM8) trip. Errors are dumped to
    '<script>_<yymm>.txt' and re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        yy, mm = yymm[:2], yymm[2:]
        trip_normal_fpath = '%s/20%s/%s/trips/trips-%s-normal.csv' % (taxi_home, yy, mm, yymm)
        trip_ext_fpath = '%s/20%s/%s/trips/trips-%s-normal-ext.csv' % (taxi_home, yy, mm, yymm)
        log_fpath = '%s/20%s/%s/logs/logs-%s-normal.csv' % (taxi_home, yy, mm, yymm)
        if not check_path_exist(trip_normal_fpath):
            logger.info('The file X exists; %s' % yymm)
            return None
        ss_drivers_fpath = '%s/%s%s.pkl' % (ss_drivers_dpath, ss_drivers_prefix, yymm)
        if not check_path_exist(ss_drivers_fpath):
            logger.info('The file X exists; %s' % ss_drivers_fpath)
            return None
        ss_drivers = load_pickle_file(ss_drivers_fpath)
        x_points, y_points = get_sg_grid_xy_points()
        #
        ss_trips_fpath = '%s/%s%s.csv' % (ss_trips_dpath, ss_trips_prefix, yymm)
        if check_path_exist(ss_trips_fpath):
            logger.info('The file had already been processed; %s' % yymm)
            return None
        # Header is written once; trip rows are appended one-by-one below.
        with open(ss_trips_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['did', 'hour', 'zi', 'zj', 'time', 'day', 'month',
                             'start-long', 'start-lat',
                             'distance', 'duration', 'fare', 'queueingTime'])
        with open(trip_normal_fpath, 'rb') as tripFileN:
            tripReaderN = csv.reader(tripFileN)
            tripHeaderN = tripReaderN.next()
            # {'trip-id': 0, 'job-id': 1, 'start-time': 2, 'end-time': 3,
            #  'start-long': 4, 'start-lat': 5, 'end-long': 6, 'end-lat': 7,
            #  'vehicle-id': 8, 'distance': 9, 'fare': 10, 'duration': 11,
            #  'start-dow': 12, 'start-day': 13, 'start-hour': 14, 'start-minute': 15,
            #  'end-dow': 16, 'end-day': 17, 'end-hour': 18, 'end-minute': 19}
            hidN = {h: i for i, h in enumerate(tripHeaderN)}
            with open(trip_ext_fpath, 'rb') as tripFileE:
                tripReaderE = csv.reader(tripFileE)
                tripHeaderE = tripReaderE.next()
                #
                # {'start-zone': 0, 'end-zone': 1, 'start-postal': 2, 'driver-id': 4, 'end-postal': 3}
                #
                hidE = {h: i for i, h in enumerate(tripHeaderE)}
                with open(log_fpath, 'rb') as logFile:
                    logReader = csv.reader(logFile)
                    logHeader = logReader.next()
                    hidL = {h: i for i, h in enumerate(logHeader)}
                    handling_day = 0
                    drivers = {}
                    for rowN in tripReaderN:
                        # Ext rows are consumed in lock-step with trip rows.
                        rowE = tripReaderE.next()
                        didT = int(rowE[hidE['driver-id']])
                        if didT not in ss_drivers:
                            continue
                        tripTime = eval(rowN[hidN['start-time']])
                        cur_dtT = datetime.datetime.fromtimestamp(tripTime)
                        if handling_day != cur_dtT.day:
                            handling_day = cur_dtT.day
                            logger.info('Processing %s %dth day' % (yymm, cur_dtT.day))
                        # Restrict to weekday trips inside [AM10, PM8).
                        if cur_dtT.weekday() in [FRI, SAT, SUN]:
                            continue
                        if cur_dtT.hour < AM10:
                            continue
                        if PM8 <= cur_dtT.hour:
                            continue
                        # Replay log rows up to tripTime, updating driver state.
                        # NOTE(review): `continue` skips the break check, and
                        # logReader.next() raises StopIteration at EOF (caught
                        # by the outer except) — confirm this is intended.
                        while True:
                            rowL = logReader.next()
                            logTime = eval(rowL[hidL['time']])
                            didL = int(rowL[hidL['driver-id']])
                            if didL not in ss_drivers:
                                continue
                            t = eval(rowL[hidL['time']])
                            cur_dtL = datetime.datetime.fromtimestamp(t)
                            if cur_dtL.weekday() in [FRI, SAT, SUN]:
                                continue
                            if cur_dtL.hour < AM10:
                                continue
                            if PM8 <= cur_dtL.hour:
                                continue
                            longitude, latitude = eval(rowL[hidL['longitude']]), eval(rowL[hidL['latitude']])
                            # Grid cell of this log point; -1 means off-grid.
                            zi, zj = bisect(x_points, longitude) - 1, bisect(y_points, latitude) - 1
                            if zi < 0 or zj < 0:
                                continue
                            t, s = eval(rowL[hidL['time']]), eval(rowL[hidL['state']])
                            z = (zi, zj)
                            cur_dt = datetime.datetime.fromtimestamp(t)
                            if handling_day != cur_dt.day:
                                handling_day = cur_dt.day
                                logger.info('Processing %s %dth day' % (yymm, cur_dt.day))
                            if not drivers.has_key(didL):
                                drivers[didL] = driver(didL, t, z, s)
                            else:
                                drivers[didL].update(t, z, s)
                            if tripTime <= logTime:
                                break
                        s_long, s_lat = eval(rowN[hidN['start-long']]), eval(rowN[hidN['start-lat']])
                        zi, zj = bisect(x_points, s_long) - 1, bisect(y_points, s_lat) - 1
                        if zi < 0 or zj < 0:
                            continue
                        if not drivers.has_key(didT):
                            continue
                        if drivers[didT].firstFreeStateTime == -1:
                            continue
                        # Time waited = trip start minus zone entry time.
                        queueingTime = tripTime - drivers[didT].zoneEnteredTime
                        if queueingTime < 0:
                            continue
                        with open(ss_trips_fpath, 'a') as w_csvfile:
                            writer = csv.writer(w_csvfile, lineterminator='\n')
                            writer.writerow([didT, cur_dtT.hour, zi, zj, tripTime, cur_dtT.day, cur_dtT.month,
                                             s_long, s_lat,
                                             rowN[hidN['distance']], rowN[hidN['duration']], rowN[hidN['fare']],
                                             queueingTime])
    except Exception as _:
        # Persist the traceback for this month before re-raising.
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def run(): print 'start' check_dir_create(com_dir) # yyyy = '2009' la_fn = '2009-CD(184)-N(7003)-E(5717371).pkl' la_fpath = '%s/%s' % (la_dir, la_fn) _, str_CD, _, _ = la_fn[:-len('.pkl')].split('-') CD = int(str_CD[len('CD('):-len(')')]) print 'pick file loading...' pairs_day_counting = load_pickle_file(la_fpath) print 'finished' for thD in [18, 36, 55, 73, 82, 92]: thD_dpath = '%s/%s' % (com_dir, '2009-CD(%d)-thD(%d)' % (CD, thD)) check_dir_create(thD_dpath) summary_fpath = '%s/%s-CD(%d)-thD(%d)-community-summary.csv' % (thD_dpath, yyyy, CD, thD) glayout_fpath = '%s/%s-CD(%d)-thD(%d)-glayout.pkl' % (thD_dpath, yyyy, CD, thD) with open(summary_fpath, 'wb') as w_csvfile: writer = csv.writer(w_csvfile, lineterminator='\n') new_headers = ['com-name', 'num-nodes', 'num-edges', 'tie-strength(# of days encounter / # of drivers)'] writer.writerow(new_headers) # nxG = nx.Graph() for (k0, k1), num_days in pairs_day_counting.iteritems(): if num_days < thD: continue nxG.add_edge(k0, k1, weight=num_days) print 'Whole graph pickling ...', yyyy, CD, thD nx.write_gpickle(nxG, '%s/%s-CD(%d)-thD(%d)-whole-N(%d)-E(%d).pkl' % (thD_dpath, yyyy, CD, thD, len(nxG.nodes()), len(nxG.edges()))) n_label, n_comId = [], [] nxId_igId = {} ig_nid = 0 print 'Partitioning ...' partition = community.best_partition(nxG) for i, com in enumerate(set(partition.values())): list_nodes = [nodes for nodes in partition.keys() if partition[nodes] == com] print i, 'Saving sub-graph ...' 
sub_nxG = nxG.subgraph(list_nodes) com_name = 'COM(%d)' % i com_fpath = '%s/%s-CD(%d)-thD(%d)-%s-N(%d)-E(%d).pkl' % (thD_dpath, yyyy, CD, thD, com_name, len(sub_nxG.nodes()), len(sub_nxG.edges())) nx.write_gpickle(sub_nxG, com_fpath) _, _, weight = zip(*list(sub_nxG.edges_iter(data='weight', default=1))) num_nodes, num_edges = len(sub_nxG), len(weight) with open(summary_fpath, 'a') as w_csvfile: writer = csv.writer(w_csvfile, lineterminator='\n') writer.writerow([com_name, num_nodes, num_edges, sum(weight) / float(num_nodes)]) # print i, 'labeling...' for n in sub_nxG.nodes(): n_label.append(n) n_comId.append(i) nxId_igId[n] = ig_nid ig_nid += 1 # if len(nxG.nodes()) < 1000: print 'Layout calculating...' print datetime.datetime.now() Edges = [(nxId_igId[n0], nxId_igId[n1]) for (n0, n1) in nxG.edges()] print 'finish edge converting', len(Edges) print datetime.datetime.now() igG = ig.Graph(Edges, directed=False) layt = igG.layout('kk', dim=3) print 'finish layout calculation' print datetime.datetime.now() # save_pickle_file(glayout_fpath, [n_label, n_comId, layt, Edges]) else: save_pickle_file(glayout_fpath, [])
def process_file(tm, year):
    """Merge the (tm, year) influence graphs, partition with Louvain, and dump
    per-group pickles, coefficient CSVs, plots, and a summary CSV.
    """
    ig_dpath = dpaths[tm, year, 'influenceGraph']
    ig_prefix = prefixs[tm, year, 'influenceGraph']
    gp_dpath = dpaths[tm, year, 'groupPartition']
    gp_prefix = prefixs[tm, year, 'groupPartition']
    #
    check_dir_create(gp_dpath)
    #
    gp_summary_fpath = '%s/%ssummary.csv' % (gp_dpath, gp_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (gp_dpath, gp_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
    #
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['groupName', 'numDrivers', 'numRelations',
                         'graphComplexity', 'tieStrength', 'contribution', 'benCon'])
    #
    logger.info('Start handling SP_group_dpath')
    # Merge all regression graphs into one (did0, did1) -> weight dict.
    # NOTE(review): 'orignal' is a long-standing local typo; kept as-is here.
    orignal_graph = {}
    for fn in get_all_files(ig_dpath, '%s*' % ig_prefix):
        regression_graph = load_pickle_file('%s/%s' % (ig_dpath, fn))
        for i, ((did0, did1), w) in enumerate(regression_graph.iteritems()):
            orignal_graph[did0, did1] = w
    save_pickle_file(gp_original_fpath, orignal_graph)
    #
    # Convert to a directed igraph graph; Louvain needs non-negative weights,
    # hence abs(w).
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    for i, ((did0, did1), w) in enumerate(orignal_graph.iteritems()):
        if not did_igid.has_key(did0):
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if not did_igid.has_key(did1):
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    #
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (gp_dpath, gp_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        # Per-group summary statistics (ratios of edge weights to group size).
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([gn, len(drivers), len(weights), graphComplexity,
                             tie_strength, contribution, benCon])
        # Label vertices only for small groups; labels clutter larger plots.
        gl_img_fpath = '%s/%simg-%s.pdf' % (gp_dpath, gp_prefix, gn)
        layout = sg.layout("kk")
        if len(drivers) < 100:
            ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        else:
            ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (gp_dpath, gp_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
def process_files(yymm):
    """Compute per-driver monthly productivity stats for month `yymm`.

    Writes a general productivity CSV (fare per productive second) plus four
    airport/night-safari CSVs split by previous-trip mode, delegating the
    location-specific rows to `calc_drivers_monthly_eco_profit`.
    """
    print 'handle the file; %s' % yymm
    #
    # initialize csv_files
    #
    # general productivities
    with open('%s/%s%s.csv' % (ftd_gen_stat_dir, ftd_gen_stat_prefix, yymm),
              'wt') as w_csvfile:
        writer = csv.writer(w_csvfile)
        headers = ['yy', 'mm', 'did', 'prod']
        writer.writerow(headers)
    # airport and night safari productivities
    for dn, fn_prefix in [(ftd_prev_in_ap_stat_dir, ftd_prev_in_ap_stat_prefix),
                          (ftd_prev_out_ap_stat_dir, ftd_prev_out_ap_stat_prefix),
                          (ftd_prev_in_ns_stat_dir, ftd_prev_in_ns_stat_prefix),
                          (ftd_prev_out_ns_stat_dir, ftd_prev_out_ns_stat_prefix)]:
        with open('%s/%s%s.csv' % (dn, fn_prefix, yymm), 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile)
            headers = ['yy', 'mm', 'did', 'prod', 'eco-profit']
            writer.writerow(headers)
    #
    full_dids = sorted(load_pickle_file('%s/%s%s.pkl' % (ftd_list_dir, ftd_list_prefix, yymm)))
    s_df = pd.read_csv('%s/%s%s.csv' % (ftd_shift_dir, ftd_shift_prefix, yymm))
    trip_df = pd.read_csv('%s/%s%s.csv' % (ftd_trips_dir, ftd_trips_prefix, yymm))
    ap_trip_df = pd.read_csv('%s/%s%s.csv' % (ap_ep_dir, ap_ep_prefix, yymm))
    ns_trip_df = pd.read_csv('%s/%s%s.csv' % (ns_ep_dir, ns_ep_prefix, yymm))
    #
    yy, mm = int(yymm[:2]), int(yymm[2:])
    for did in full_dids:
        #
        # General
        #
        did_sh = s_df[(s_df['did'] == did)]
        # 'pro-dur' is presumably minutes; SEC60 converts to seconds — TODO confirm.
        pro_dur = sum(did_sh['pro-dur']) * SEC60
        did_wt = trip_df[(trip_df['did'] == did)]
        total_fare = sum(did_wt['fare'])
        if pro_dur > 0 and total_fare != 0:
            # Productivity = fare earned per productive second.
            total_prod = total_fare / float(pro_dur)
            with open('%s/%s%s.csv' % (ftd_gen_stat_dir, ftd_gen_stat_prefix, yymm),
                      'a') as w_csvfile:
                writer = csv.writer(w_csvfile)
                writer.writerow([yy, mm, did, total_prod])
        #
        # airport trips
        #
        did_ap = ap_trip_df[(ap_trip_df['did'] == did)]
        prev_in_ap_trip = did_ap[(did_ap['trip-mode'] == DIn_PIn)]
        prev_out_ap_trip = did_ap[(did_ap['trip-mode'] == DOut_PIn)]
        #
        if len(did_ap) != 0:
            df_dir_path_prefix = [(prev_in_ap_trip, ftd_prev_in_ap_stat_dir,
                                   ftd_prev_in_ap_stat_prefix),
                                  (prev_out_ap_trip, ftd_prev_out_ap_stat_dir,
                                   ftd_prev_out_ap_stat_prefix)]
            calc_drivers_monthly_eco_profit(yymm, yy, mm, did, df_dir_path_prefix)
        #
        # night safari trips
        #
        did_ns = ns_trip_df[(ns_trip_df['did'] == did)]
        prev_in_ns_trip = did_ns[(did_ns['trip-mode'] == DIn_PIn)]
        prev_out_ns_trip = did_ns[(did_ns['trip-mode'] == DOut_PIn)]
        #
        if len(did_ns) != 0:
            df_dir_path_prefix = [(prev_in_ns_trip, ftd_prev_in_ns_stat_dir,
                                   ftd_prev_in_ns_stat_prefix),
                                  (prev_out_ns_trip, ftd_prev_out_ns_stat_dir,
                                   ftd_prev_out_ns_stat_prefix)]
            calc_drivers_monthly_eco_profit(yymm, yy, mm, did, df_dir_path_prefix)
    print 'End the file; %s' % yymm
def process_file(yymm):
    """Write the airport / night-safari queueing-time CSVs for month `yymm`.

    For each filtered trip row, compute the queue-join time from the previous
    trip's end (DIn_PIn) or the vehicle's last gantry crossing before the trip
    start (DOut_PIn), and append a row when queueing time >= Q_LIMIT_MIN.

    Fix over the previous revision: the mode guards read
    `ap_tm != DIn_POut or ap_tm != DOut_POut` (and the ns twin) — tautologically
    true because a value cannot equal both constants. Output was unchanged only
    because the inner branches never set `queueing_time` for POut modes; the
    guard now expresses the intent: skip passenger-out trip modes.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        queueingTime_ap_fpath = '%s/%s%s.csv' % (queueingTime_ap_dpath,
                                                 queueingTime_ap_prefix, yymm)
        queueingTime_ns_fpath = '%s/%s%s.csv' % (queueingTime_ns_dpath,
                                                 queueingTime_ns_prefix, yymm)
        if check_path_exist(queueingTime_ap_fpath) and check_path_exist(
                queueingTime_ns_fpath):
            logger.info('The file had already been processed; %s' % yymm)
            return
        #
        logger.info('load pickle files; %s' % yymm)
        ap_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ap_dpath,
                                        crossingTime_ap_prefix, yymm)
        ns_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ns_dpath,
                                        crossingTime_ns_prefix, yymm)
        crossingTime_ap, crossingTime_ns = load_pickle_file(
            ap_pkl_fpath), load_pickle_file(ns_pkl_fpath)
        #
        logger.info('initiate csv files; %s' % yymm)
        with open(queueingTime_ap_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = [
                'did', 'startTime', 'endTime', 'duration', 'fare', 'tripMode',
                'queueJoinTime', 'queueingTime', 'year', 'month', 'day',
                'hour', 'pickUpTerminalAP', 'prevEndTerminalAP'
            ]
            writer.writerow(new_headers)
        with open(queueingTime_ns_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = [
                'did', 'startTime', 'endTime', 'duration', 'fare', 'tripMode',
                'queueJoinTime', 'queueingTime', 'year', 'month', 'day', 'hour'
            ]
            writer.writerow(new_headers)
        #
        logger.info('start recording; %s' % yymm)
        with open('%s/Filtered-%s%s.csv' % (trip_dpath, trip_prefix, yymm),
                  'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                did = row[hid['did']]
                et, duration = row[hid['endTime']], row[hid['duration']]
                fare = row[hid['fare']]
                year, month = row[hid['year']], row[hid['month']]
                day, hour = row[hid['day']], row[hid['hour']]
                pickUpTerminalAP, prevEndTerminalAP = row[
                    hid['pickUpTerminalAP']], row[hid['prevEndTerminalAP']]
                #
                ap_tm, ns_tm = int(row[hid['tripModeAP']]), int(
                    row[hid['tripModeNS']])
                vid, st, prev_tet = row[hid['vid']], eval(
                    row[hid['startTime']]), eval(row[hid['prevTripEndTime']])
                #
                # Airport trip: only passenger-in modes are relevant.
                #
                if ap_tm not in (DIn_POut, DOut_POut):  # FIX: was `!= ... or != ...` (always true)
                    queue_join_time, queueing_time = None, None
                    if ap_tm == DIn_PIn:
                        # Joined the queue when the previous trip ended.
                        queue_join_time = prev_tet
                        queueing_time = st - queue_join_time
                    elif ap_tm == DOut_PIn:
                        # Joined at the last gantry crossing before trip start.
                        try:
                            i = bisect(crossingTime_ap[vid], st)
                            queue_join_time = crossingTime_ap[vid][
                                i - 1] if i != 0 else crossingTime_ap[vid][0]
                            queueing_time = st - queue_join_time
                        except KeyError:
                            # Vehicle has no recorded crossings: skip silently.
                            pass
                    if queueing_time != None and Q_LIMIT_MIN <= queueing_time:
                        new_row = [
                            did, st, et, duration, fare, ap_tm,
                            queue_join_time, queueing_time, year, month, day,
                            hour, pickUpTerminalAP, prevEndTerminalAP
                        ]
                        append_record(queueingTime_ap_fpath, new_row)
                #
                # Night Safari: same logic, no terminal columns.
                #
                if ns_tm not in (DIn_POut, DOut_POut):  # FIX: was `!= ... or != ...` (always true)
                    queue_join_time, queueing_time = None, None
                    if ns_tm == DIn_PIn:
                        queue_join_time = prev_tet
                        queueing_time = st - queue_join_time
                    elif ns_tm == DOut_PIn:
                        try:
                            i = bisect(crossingTime_ns[vid], st)
                            queue_join_time = crossingTime_ns[vid][
                                i - 1] if i != 0 else crossingTime_ns[vid][0]
                            queueing_time = st - queue_join_time
                        except KeyError:
                            pass
                    if queueing_time != None and Q_LIMIT_MIN <= queueing_time:
                        new_row = [
                            did, st, et, duration, fare, ns_tm,
                            queue_join_time, queueing_time, year, month, day,
                            hour
                        ]
                        append_record(queueingTime_ns_fpath, new_row)
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        # Persist the traceback for this month, then re-raise for the caller.
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
import __init__ ''' ''' from community_analysis import dpaths, prefixs # from taxi_common.file_handling_functions import get_all_files, load_pickle_file year = '20%02d' % 9 # depVar = 'roamingTime' depVar = 'interTravelTime' # # of_dpath = dpaths[depVar, 'influenceGraph'] of_prefixs = prefixs[depVar, 'influenceGraph'] countRelationWhole = {k: 0 for k in ['sigPos', 'sigNeg', 'XsigPos', 'XsigNeg']} for fn in get_all_files(of_dpath, '%scount-*' % of_prefixs): print fn fpath = '%s/%s' % (of_dpath, fn) countRelation = load_pickle_file(fpath) for n in ['sigPos', 'sigNeg', 'XsigPos', 'XsigNeg']: countRelationWhole[n] += countRelation[n] print countRelationWhole
def aggregate_dayBased(yymm):
    """Aggregate per-driver, per-day trip statistics for month `yymm`.

    Runs once per Night Safari evening window (hours 15-19 and 20-00) and
    appends one row per (day, driver) to that window's statistics CSV
    (`fpath`).  Drivers with no location trip on a given day are skipped.
    """
    print 'handle the file; %s' % yymm
    #
    for hours, fpath in [([15, 16, 17, 18, 19], ssDriversStatistics_ns1519_fpath),
                         ([20, 21, 22, 23, 0], ssDriversStatistics_ns2000_fpath)]:
        # Month-level inputs, restricted to the current hour window.
        shift_df = pd.read_csv('%s/%s%s.csv' % (ssDriverShiftProDur_dpath, ssDriverShiftProDur_prefix, yymm))
        shift_df = shift_df[shift_df['hh'].isin(hours)]
        all_trip_df = pd.read_csv('%s/%s%s.csv' % (ssDriverTrip_dpath, ssDriverTrip_prefix, yymm))
        all_trip_df = all_trip_df[all_trip_df['hour'].isin(hours)]
        EP_df = pd.read_csv('%s/%s%s.csv' % (ssDriverEP_ns_dpath, ssDriverEP_ns_prefix, yymm))
        EP_df = EP_df[EP_df['hour'].isin(hours)]
        # Selected-study drivers for this month (pickle values coerced to int).
        ssDrivers = map(int, load_pickle_file('%s/%s%s.pkl' % (ss_drivers_dpath, ss_drivers_prefix, yymm)))
        days = set(EP_df['day'])
        #
        yy, mm = int(yymm[:2]), int(yymm[2:])
        for dd in days:
            day_all_trip_df = all_trip_df[(all_trip_df['day'] == dd)]
            day_loc_trip_df = EP_df[(EP_df['day'] == dd)]
            day_shift_df = shift_df[(shift_df['dd'] == dd)]
            for did in ssDrivers:
                #
                # Specific location
                #
                d_loc_trip = day_loc_trip_df[(day_loc_trip_df['did'] == did)]
                if len(d_loc_trip) == 0:
                    # No location trip for this driver on this day: emit nothing.
                    continue
                loc_num = len(d_loc_trip['fare'])
                loc_dur = sum(d_loc_trip['duration'])
                loc_fare = sum(d_loc_trip['fare'])
                loc_ep = sum(d_loc_trip['economicProfit'])
                loc_qtime = sum(d_loc_trip['queueingTime'])
                #
                # All
                #
                d_all_trip = day_all_trip_df[(day_all_trip_df['did'] == did)]
                d_shift = day_shift_df[(day_shift_df['did'] == did)]
                all_num = len(d_all_trip['fare'])
                # NOTE(review): presumably converts minutes to seconds via SEC60 -- confirm its value.
                pro_dur = sum(d_shift['pro-dur']) * SEC60
                all_fare = sum(d_all_trip['fare'])
                #
                # Location trips split by trip mode: driver already inside (DIn_PIn)
                # vs. driver arriving from outside (DOut_PIn).
                d_loc_trip_in = d_loc_trip[(d_loc_trip['tripMode'] == DIn_PIn)]
                locIn_num = len(d_loc_trip_in['fare'])
                locIn_dur = sum(d_loc_trip_in['duration'])
                locIn_fare = sum(d_loc_trip_in['fare'])
                locIn_ep = sum(d_loc_trip_in['economicProfit'])
                locIn_qtime = sum(d_loc_trip_in['queueingTime'])
                #
                d_loc_trip_out = d_loc_trip[(d_loc_trip['tripMode'] == DOut_PIn)]
                locOut_num = len(d_loc_trip_out['fare'])
                locOut_dur = sum(d_loc_trip_out['duration'])
                locOut_fare = sum(d_loc_trip_out['fare'])
                locOut_ep = sum(d_loc_trip_out['economicProfit'])
                locOut_qtime = sum(d_loc_trip_out['queueingTime'])
                #
                # Append one aggregate row for this (day, driver).
                with open(fpath, 'a') as w_csvfile:
                    writer = csv.writer(w_csvfile, lineterminator='\n')
                    writer.writerow([yy, mm, dd, did,
                                     all_num, pro_dur, all_fare,
                                     loc_num, loc_dur, loc_fare, loc_ep, loc_qtime,
                                     locIn_num, locIn_dur, locIn_fare, locIn_ep, locIn_qtime,
                                     locOut_num, locOut_dur, locOut_fare, locOut_ep, locOut_qtime])
def process_file(yymm):
    """Build the per-trip queueing-time CSVs for the airport (AP) and Night Safari (NS).

    For month `yymm`:
      * returns early when both output CSVs already exist,
      * loads the per-vehicle fence-crossing-time pickles,
      * appends one AP row (with terminal columns) and/or one NS row per
        qualifying trip, keeping only queueing times >= Q_LIMIT_MIN.
    On any exception the traceback is dumped to '<script>_<yymm>.txt' and re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        queueingTime_ap_fpath = '%s/%s%s.csv' % (queueingTime_ap_dpath, queueingTime_ap_prefix, yymm)
        queueingTime_ns_fpath = '%s/%s%s.csv' % (queueingTime_ns_dpath, queueingTime_ns_prefix, yymm)
        if check_path_exist(queueingTime_ap_fpath) and check_path_exist(queueingTime_ns_fpath):
            logger.info('The file had already been processed; %s' % yymm)
            return
        #
        logger.info('load pickle files; %s' % yymm)
        ap_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ap_dpath, crossingTime_ap_prefix, yymm)
        ns_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ns_dpath, crossingTime_ns_prefix, yymm)
        crossingTime_ap, crossingTime_ns = load_pickle_file(ap_pkl_fpath), load_pickle_file(ns_pkl_fpath)
        #
        logger.info('initiate csv files; %s' % yymm)
        with open(queueingTime_ap_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = ['did', 'startTime', 'endTime', 'duration', 'fare', 'tripMode',
                           'queueJoinTime', 'queueingTime',
                           'year', 'month', 'day', 'hour', 'pickUpTerminalAP', 'prevEndTerminalAP']
            writer.writerow(new_headers)
        with open(queueingTime_ns_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = ['did', 'startTime', 'endTime', 'duration', 'fare', 'tripMode',
                           'queueJoinTime', 'queueingTime',
                           'year', 'month', 'day', 'hour']
            writer.writerow(new_headers)

        def _queueing(tm, crossing_times, vid, st, prev_tet):
            # Return (queue_join_time, queueing_time) for a pick-up trip mode,
            # or (None, None) when none can be derived.
            #
            # BUGFIX: the original guarded the whole computation with
            # `tm != DIn_POut or tm != DOut_POut`, which is always true
            # (a tautology; `and` was intended). The inner branches already
            # ignored the drop-off-only modes, so handling them here fixes
            # the condition while preserving the original behavior.
            if tm == DIn_PIn:
                # Driver was already inside: queueing started when the
                # previous trip ended.
                queue_join_time = prev_tet
                return queue_join_time, st - queue_join_time
            if tm == DOut_PIn:
                # Driver arrived from outside: queueing started at the last
                # recorded fence crossing before the trip's start time.
                try:
                    vehicle_crossings = crossing_times[vid]
                except KeyError:
                    # No crossing record for this vehicle; best-effort skip,
                    # exactly as the original `except KeyError: pass` did.
                    return None, None
                i = bisect(vehicle_crossings, st)
                queue_join_time = vehicle_crossings[i - 1] if i != 0 else vehicle_crossings[0]
                return queue_join_time, st - queue_join_time
            return None, None

        #
        logger.info('start recording; %s' % yymm)
        with open('%s/Filtered-%s%s.csv' % (trip_dpath, trip_prefix, yymm), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                did = row[hid['did']]
                et, duration = row[hid['endTime']], row[hid['duration']]
                fare = row[hid['fare']]
                year, month = row[hid['year']], row[hid['month']]
                day, hour = row[hid['day']], row[hid['hour']]
                pickUpTerminalAP, prevEndTerminalAP = row[hid['pickUpTerminalAP']], row[hid['prevEndTerminalAP']]
                #
                ap_tm, ns_tm = int(row[hid['tripModeAP']]), int(row[hid['tripModeNS']])
                # NOTE(review): eval() on CSV fields assumes trusted numeric input;
                # int()/float() would be safer -- kept for exact compatibility.
                vid, st, prev_tet = row[hid['vid']], eval(row[hid['startTime']]), eval(row[hid['prevTripEndTime']])
                #
                # Same computation for both study areas; only the AP output
                # carries the two terminal columns.
                for tm, crossing_times, out_fpath, extra_cols in [
                        (ap_tm, crossingTime_ap, queueingTime_ap_fpath, [pickUpTerminalAP, prevEndTerminalAP]),
                        (ns_tm, crossingTime_ns, queueingTime_ns_fpath, [])]:
                    queue_join_time, queueing_time = _queueing(tm, crossing_times, vid, st, prev_tet)
                    if queueing_time is not None and Q_LIMIT_MIN <= queueing_time:
                        new_row = [did, st, et, duration, fare, tm,
                                   queue_join_time, queueing_time,
                                   year, month, day, hour] + extra_cols
                        append_record(out_fpath, new_row)
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        # Dump the traceback next to the script for post-mortem, then re-raise.
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def process_file(yymm):
    """Create per-month AP/NS trip CSVs annotated with queue-join and queueing times.

    Returns None early when the input crossing-time pickles are missing or
    when both output trip CSVs already exist.  Queueing times shorter than
    Q_LIMIT_MIN are clamped up to Q_LIMIT_MIN.
    """
    ap_pkl_file_path = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix, yymm)
    ns_pkl_file_path = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix, yymm)
    if not (check_path_exist(ap_pkl_file_path) and check_path_exist(ns_pkl_file_path)):
        return None
    #
    # Load pickle files
    #
    ap_crossing_time, ns_crossing_time = load_pickle_file(ap_pkl_file_path), load_pickle_file(ns_pkl_file_path)
    #
    # Initiate csv files
    #
    ap_trip_fpath = '%s/%s%s.csv' % (ap_trips_dir, ap_trip_prefix, yymm)
    ns_trip_fpath = '%s/%s%s.csv' % (ns_trips_dir, ns_trip_prefix, yymm)
    if check_path_exist(ap_trip_fpath) and check_path_exist(ns_trip_fpath):
        return None
    print 'handle the file; %s' % yymm
    for fpath in [ap_trip_fpath, ns_trip_fpath]:
        with open(fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            # NOTE(review): 'queue—join-time' contains an em dash (not a hyphen);
            # downstream readers may rely on it, so it is left untouched.
            new_headers = ['tid', 'vid', 'did', 'start-time', 'end-time', 'duration', 'fare',
                           'prev-trip-end-time', 'trip-mode', 'queue—join-time', 'queueing-time']
            writer.writerow(new_headers)
    #
    with open('%s/%s%s.csv' % (trips_dpath, trip_prefix, yymm), 'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        # Map header name -> column index for positional row access.
        hid = {h: i for i, h in enumerate(headers)}
        for row in reader:
            tid, did = row[hid['tid']], row[hid['did']]
            et, duration = row[hid['end-time']], row[hid['duration']]
            fare = row[hid['fare']]
            #
            ap_tm, ns_tm = int(row[hid['ap-trip-mode']]), int(row[hid['ns-trip-mode']])
            # NOTE(review): eval() on CSV fields assumes trusted numeric input.
            vid, st, prev_tet = row[hid['vid']], eval(row[hid['start-time']]), eval(row[hid['prev-trip-end-time']])
            #
            # Process the same trip against both study areas.
            for tm, crossing_time, fpath in [(ap_tm, ap_crossing_time, ap_trip_fpath),
                                             (ns_tm, ns_crossing_time, ns_trip_fpath)]:
                if tm == DIn_POut or tm == DOut_POut:
                    # Drop-off-only modes: no queueing at this location.
                    continue
                if tm == DIn_PIn:
                    # Driver already inside: queue started at previous trip's end.
                    queue_join_time = prev_tet
                elif tm == DOut_PIn:
                    # Driver came from outside: queue started at the last
                    # fence crossing before the trip start.
                    try:
                        i = bisect(crossing_time[vid], st)
                    except KeyError:
                        # No crossing record for this vehicle: log and skip.
                        print '%s-tid-%s' % (yymm, row[hid['tid']])
                        continue
                    queue_join_time = crossing_time[vid][i - 1] if i != 0 else crossing_time[vid][0]
                # Append the annotated trip row (file reopened per row).
                with open(fpath, 'a') as w_csvfile:
                    writer = csv.writer(w_csvfile, lineterminator='\n')
                    queueing_time = st - queue_join_time
                    if queueing_time < Q_LIMIT_MIN:
                        # Clamp implausibly small (e.g. negative) values.
                        queueing_time = Q_LIMIT_MIN
                    new_row = [tid, vid, did, st, et, duration, fare,
                               prev_tet, tm, queue_join_time, queueing_time]
                    writer.writerow(new_row)
    print 'end the file; %s' % yymm
def process_files(yymm):
    """Filter the month-`yymm` input CSVs down to the selected-study drivers.

    Produces four CSVs: shift/productive-duration, trips (projected to a
    reduced column set with year/month derived from `yymm`), and the AP and
    NS economic-profit records.  On any exception the traceback is written
    to '<script>_<yymm>.txt' and the exception is re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle %s' % yymm)
        # A set gives O(1) membership tests in the per-row filters below
        # (the pickle may hold a list).
        # NOTE(review): rows are matched via int(row['did']); confirm the
        # pickled driver ids are ints as well, as done elsewhere with map(int, ...).
        ssDrivers = set(load_pickle_file('%s/%s%s.pkl' % (ss_drivers_dpath, ss_drivers_prefix, yymm)))

        def _filter_by_driver(src_fpath, dst_fpath):
            # Copy src -> dst verbatim, keeping the header row and only the
            # data rows whose integer 'did' belongs to ssDrivers.
            with open(src_fpath, 'rt') as r_csvfile:
                reader = csv.reader(r_csvfile)
                headers = reader.next()
                hid = {h: i for i, h in enumerate(headers)}
                with open(dst_fpath, 'wt') as w_csvfile:
                    writer = csv.writer(w_csvfile)
                    writer.writerow(headers)
                    for row in reader:
                        if int(row[hid['did']]) not in ssDrivers:
                            continue
                        writer.writerow(row)

        #
        # Shift / productive-duration records
        _filter_by_driver('%s/%s%s.csv' % (shiftProDur_dpath, shiftProDur_prefix, yymm),
                          '%s/%s%s.csv' % (ssDriverShiftProDur_dpath, ssDriverShiftProDur_prefix, yymm))
        #
        # Trip records: projected to a reduced column set, with year/month
        # derived from yymm (e.g. '0901' -> 2009, 1).
        yy, mm = yymm[:2], yymm[-2:]
        year, month = 2000 + int(yy), int(mm)
        with open('%s/%s%s.csv' % (trip_dpath, trip_prefix, yymm), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            with open('%s/%s%s.csv' % (ssDriverTrip_dpath, ssDriverTrip_prefix, yymm), 'wt') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                new_headers = ['did', 'startTime', 'duration', 'fare', 'year', 'month', 'day', 'hour']
                writer.writerow(new_headers)
                for row in reader:
                    if int(row[hid['did']]) not in ssDrivers:
                        continue
                    writer.writerow([row[hid['did']], row[hid['startTime']], row[hid['duration']],
                                     row[hid['fare']], year, month, row[hid['day']], row[hid['hour']]])
        #
        # Economic-profit records (airport, then Night Safari)
        _filter_by_driver('%s/%s%s.csv' % (economicProfit_ap_dpath, economicProfit_ap_prefix, yymm),
                          '%s/%s%s.csv' % (ssDriverEP_ap_dpath, ssDriverEP_ap_prefix, yymm))
        _filter_by_driver('%s/%s%s.csv' % (economicProfit_ns_dpath, economicProfit_ns_prefix, yymm),
                          '%s/%s%s.csv' % (ssDriverEP_ns_dpath, ssDriverEP_ns_prefix, yymm))
        logger.info('end %s' % yymm)
    except Exception as _:
        # Dump the traceback next to the script for post-mortem, then re-raise.
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise