def process_file(fpath):
    logger.info('Start handling; %s' % fpath)
    _, year, reducerID = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        # NOTE: the output keys are fixed to the 2009 baseline even though
        # `year` is parsed from the file name above
        count_graph_dpath = dpaths['baseline', '2009', 'countGraph']
        count_graph_prefix = prefixs['baseline', '2009', 'countGraph']
        count_graph_fpath = '%s/%s%s.pkl' % (count_graph_dpath, count_graph_prefix, reducerID)
        #
        logger.info('Start loading; %s-%s' % (year, reducerID))
        df = pd.read_csv(fpath)
        count_graph = {}
        num_drivers = len(set(df['did']))
        for i, did1 in enumerate(set(df['did'])):
            if i % 10 == 0:
                logger.info('Doing regression %.2f; %s-%s' % (i / float(num_drivers), year, reducerID))
            did1_df = df[(df['did'] == did1)].copy(deep=True)
            did1_df = did1_df.drop(['month', 'day', 'timeFrame', 'zi', 'zj', 'tfZ', 'did', 'spendingTime'], axis=1)
            if '%d' % did1 in did1_df.columns:
                did1_df = did1_df.drop(['%d' % did1], axis=1)
            #
            for _did0, numPriorPresence in did1_df.sum().iteritems():
                if numPriorPresence == 0:
                    continue
                count_graph[int(_did0), did1] = numPriorPresence
        #
        logger.info('Start pickling; %s-%s' % (year, reducerID))
        save_pickle_file(count_graph_fpath, count_graph)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, reducerID)), 'w') as f:
            f.write(format_exc())
        raise
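# A minimal, self-contained sketch of the co-presence counting above: for each
# driver's rows, sum the per-driver indicator columns and keep the non-zero
# totals as weighted edges. The column names here are illustrative, not the
# real schema.
import pandas as pd

def _demo_count_graph():
    df = pd.DataFrame({'did': [1, 1, 2],
                       '100': [1, 0, 1],   # prior presence of driver 100
                       '200': [0, 0, 0]})  # driver 200 never seen before
    count_graph = {}
    for did1 in set(df['did']):
        did1_df = df[df['did'] == did1].drop(['did'], axis=1)
        for _did0, n in did1_df.sum().iteritems():
            if n == 0:
                continue
            count_graph[int(_did0), did1] = n
    return count_graph  # {(100, 1): 1, (100, 2): 1}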
def get_driver_trajectory(did):
    ofpath = '%s%d.pkl' % (if_prefix, did)
    if check_path_exist(ofpath):
        dt_xy_state = load_pickle_file(ofpath)
    else:
        dates = []
        for fn in get_all_files(if_dpath, '%s*.csv' % if_prefix):
            _, _date, _did = fn[:-len('.csv')].split('-')
            if int(_did) != did:
                continue
            year = 2000 + int(_date[:2])
            month, day = map(int, [_date[2:4], _date[4:6]])
            dt = datetime.datetime(year, month, day)
            dates += [dt]
        dates.sort()
        dt_xy_state = []
        for dt in dates:
            yy = '%02d' % (dt.year - 2000)
            mm, dd = '%02d' % dt.month, '%02d' % dt.day
            yymmdd = yy + mm + dd
            ifpath = '%s/%s%s-%d.csv' % (if_dpath, if_prefix, yymmdd, did)
            with open(ifpath, 'rb') as logFile:
                reader = csv.reader(logFile)
                header = reader.next()
                # header: time,vehicle-id,driver-id,longitude,latitude,speed,state
                hid = {h: i for i, h in enumerate(header)}
                for row in reader:
                    dt = datetime.datetime.fromtimestamp(eval(row[hid['time']]))
                    lon, lat = map(eval, [row[hid[cn]] for cn in ['longitude', 'latitude']])
                    x, y = GPS_xyDrawing.convert_GPS2xy(lon, lat)
                    dt_xy_state += [(dt, x, y, int(row[hid['state']]))]
        save_pickle_file(ofpath, dt_xy_state)
    return dt_xy_state
def process_file(yymm):
    ap_pkl_fpath = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix, yymm)
    ns_pkl_fpath = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix, yymm)
    if check_path_exist(ap_pkl_fpath) and check_path_exist(ns_pkl_fpath):
        return None
    print 'handle the file; %s' % yymm
    veh_ap_crossing_time, veh_last_log_ap_or_not = {}, {}
    veh_ns_crossing_time, veh_last_log_ns_or_not = {}, {}
    if yymm not in ['0901', '1001', '1011']:
        path_to_last_day_csv_file = None
        temp_csv_files = get_all_files(logs_last_day_dir, log_last_day_prefix, '.csv')
        prev_fn = None
        y, m = int(yymm[:2]), int(yymm[2:])
        prev_m = m - 1
        prev_yymm = '%02d%02d' % (y, prev_m)
        for temp_fn in temp_csv_files:
            if temp_fn.startswith('%s%s' % (log_last_day_prefix, prev_yymm)):
                prev_fn = temp_fn
                break
        assert prev_fn, yymm
        path_to_last_day_csv_file = '%s/%s' % (logs_last_day_dir, prev_fn)
        # if (time.time() - get_created_time(path_to_last_day_csv_file)) < HOUR1:
        #     return None
        veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not = \
            record_crossing_time(path_to_last_day_csv_file,
                                 veh_ap_crossing_time, veh_last_log_ap_or_not,
                                 veh_ns_crossing_time, veh_last_log_ns_or_not)
    path_to_csv_file = '%s/%s%s.csv' % (logs_dir, log_prefix, yymm)
    veh_ap_crossing_time, _, veh_ns_crossing_time, _ = \
        record_crossing_time(path_to_csv_file,
                             veh_ap_crossing_time, veh_last_log_ap_or_not,
                             veh_ns_crossing_time, veh_last_log_ns_or_not)
    #
    save_pickle_file(ap_pkl_fpath, veh_ap_crossing_time)
    save_pickle_file(ns_pkl_fpath, veh_ns_crossing_time)
    print 'end the file; %s' % yymm
def only_1001():
    yymm = '1001'
    id_fpath = '%s/%s%s.pkl' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, yymm)
    trip_fpath = '%s/Filtered-%s%s.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversTrip_ap_prefix, '2010')
    df = pd.read_csv(trip_fpath)
    drivers = set(df['driverID'])
    intelDrivers = {}
    for did in drivers:
        didm_df = df[(df['driverID'] == did) & (df['month'] == 1)].copy(deep=True)
        hours = set(didm_df['hour'])
        dummiesH = []
        for h in hours:
            hour_str = 'H%02d' % h
            didm_df[hour_str] = np.where(didm_df['hour'] == h, 1, 0)
            dummiesH.append(hour_str)
        df_residual = len(didm_df) - (len(dummiesH) + 1)
        if df_residual / float(len(didm_df)) < min_df_residual_ratio:
            intelDrivers[did] = (len(didm_df), 'X')
            continue
        y = didm_df['locQTime']
        X = didm_df[dummiesH[:-1] + ['locIn']]
        X = sm.add_constant(X)
        res = sm.OLS(y, X, missing='drop').fit()
        if res.pvalues['locIn'] < sig_level:
            intelDrivers[did] = (len(didm_df), res.params['locIn'])
        else:
            intelDrivers[did] = (len(didm_df), 'X')
    save_pickle_file(id_fpath, intelDrivers)
def find_intelligentDrivers():
    idb_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, 'both')
    with open(idb_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        header = ['', 'Y2009', 'Y2010',
                  'significance level %.2f' % sig_level,
                  'minDfResidualRatio %.2f' % min_df_residual_ratio]
        writer.writerow(header)
    regressionClassification = {}
    for y in range(9, 11):
        year = '20%02d' % y
        id_fpath = '%s/%s%s.pkl' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, year)
        trip_fpath = '%s/Filtered-%s%s.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversTrip_ap_prefix, year)
        df = pd.read_csv(trip_fpath)
        drivers = set(df['driverID'])
        intelDrivers = {}
        for mes in ['smallObs', 'sigPos', 'sigNeg', 'XsigPos', 'XsigNeg']:
            regressionClassification[year, mes] = 0
        for did in drivers:
            did_df = df[(df['driverID'] == did)].copy(deep=True)
            months = set(did_df['month'])
            hours = set(did_df['hour'])
            dummiesM = []
            for m in months:
                month_str = 'M%02d' % m
                did_df[month_str] = np.where(did_df['month'] == m, 1, 0)
                dummiesM.append(month_str)
            dummiesH = []
            for h in hours:
                hour_str = 'H%02d' % h
                did_df[hour_str] = np.where(did_df['hour'] == h, 1, 0)
                dummiesH.append(hour_str)
            df_residual = len(did_df) - (len(dummiesM) + len(dummiesH) + 1)
            if df_residual / float(len(did_df)) < min_df_residual_ratio:
                intelDrivers[did] = (len(did_df), 'X')
                regressionClassification[year, 'smallObs'] += 1
                continue
            y_dep = did_df['locQTime']  # `y_dep` avoids shadowing the year index `y`
            X = did_df[dummiesM[:-1] + dummiesH[:-1] + ['locIn']]
            X = sm.add_constant(X)
            res = sm.OLS(y_dep, X, missing='drop').fit()
            if res.pvalues['locIn'] < sig_level:
                intelDrivers[did] = (len(did_df), res.params['locIn'])
                if res.params['locIn'] > 0:
                    regressionClassification[year, 'sigPos'] += 1
                else:
                    regressionClassification[year, 'sigNeg'] += 1
            else:
                intelDrivers[did] = (len(did_df), 'X')
                if res.params['locIn'] > 0:
                    regressionClassification[year, 'XsigPos'] += 1
                else:
                    regressionClassification[year, 'XsigNeg'] += 1
        save_pickle_file(id_fpath, intelDrivers)
    #
    with open(idb_fpath, 'a') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        for mes in ['smallObs', 'sigPos', 'sigNeg', 'XsigPos', 'XsigNeg']:
            new_row = [mes,
                       regressionClassification['2009', mes],
                       regressionClassification['2010', mes]]
            writer.writerow(new_row)
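# A minimal self-contained sketch of the dummy-variable OLS test used above,
# on synthetic data: regress a duration on month dummies plus a binary
# regressor, then read off the coefficient and its p-value. Column names
# `locQTime`/`locIn` mirror the real ones; the data itself is made up.
import numpy as np
import pandas as pd
import statsmodels.api as sm

def _demo_dummy_ols(seed=0):
    rng = np.random.RandomState(seed)
    n = 200
    df = pd.DataFrame({'month': rng.randint(1, 13, n),
                       'locIn': rng.randint(0, 2, n)})
    # planted effect: being 'in' the zone shortens the queueing time by 2.0
    df['locQTime'] = 10 + 1.5 * df['month'] - 2.0 * df['locIn'] + rng.randn(n)
    dummiesM = []
    for m in set(df['month']):
        month_str = 'M%02d' % m
        df[month_str] = np.where(df['month'] == m, 1, 0)
        dummiesM.append(month_str)
    X = sm.add_constant(df[dummiesM[:-1] + ['locIn']])  # drop one dummy
    res = sm.OLS(df['locQTime'], X, missing='drop').fit()
    return res.params['locIn'], res.pvalues['locIn']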
def process_file(fpath):
    logger.info('Start handling; %s' % fpath)
    _, _, _, _did1 = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        ofpath = '%s/%s%s-%s.csv' % (of_dpath, of_prefix, year, _did1)
        sig_fpath = '%s/%ssigRelation-%s-%s.pkl' % (of_dpath, of_prefix, year, _did1)
        if check_path_exist(ofpath):
            return None
        with open(ofpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            header = ['did', 'numObservations', 'numPrevDrivers', 'numSigRelationship',
                      'numPosCoef', 'numNegCoef', 'sigPosRelation', 'sigNegRelation']
            writer.writerow(header)
        #
        logger.info('Start loading; %s-%s' % (year, _did1))
        df = pd.read_csv(fpath)
        numObservations = len(df)
        did1_df = df.drop(['month', 'day', 'hour', 'zi', 'zj', 'did'], axis=1)
        if _did1 in did1_df.columns:
            did1_df = did1_df.drop([_did1], axis=1)
        prevDrivers = [cn for cn in did1_df.columns if cn != depVar]
        numPrevDrivers = len(prevDrivers)
        #
        sigRelation = {k: [] for k in ['pos', 'neg']}
        for _did0 in prevDrivers:
            num_encounters = sum(did1_df[_did0])
            if num_encounters < numObservations * MIN_PICKUP_RATIO:
                continue
            # if len(did1_df) - 1 == sum(did1_df[_did0]) or sum(did1_df[_did0]) == 0:
            #     continue
            y = did1_df[depVar]
            X = did1_df[[_did0]]
            X = sm.add_constant(X)
            res = sm.OLS(y, X, missing='drop').fit()
            pv = res.pvalues[_did0]
            coef = res.params[_did0]
            if pv < SIGINIFICANCE_LEVEL:
                if coef < 0:
                    sigRelation['neg'] += [(_did0, coef)]
                elif coef > 0:
                    sigRelation['pos'] += [(_did0, coef)]
        with open(ofpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_row = [_did1, numObservations, numPrevDrivers,
                       len(sigRelation['pos']) + len(sigRelation['neg']),
                       len(sigRelation['pos']), len(sigRelation['neg']),
                       '&'.join([_did0 for _did0, _ in sigRelation['pos']]),
                       '&'.join([_did0 for _did0, _ in sigRelation['neg']])]
            writer.writerow(new_row)
        save_pickle_file(sig_fpath, sigRelation)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, _did1)), 'w') as f:
            f.write(format_exc())
        raise
    logger.info('End handling; %s' % fpath)
def get_sgBoarder_xy():
    fpath = 'sgBorder_xy.pkl'
    if not check_path_exist(fpath):
        sgBorder_xy = []
        for lon, lat in sg_border:
            x, y = convert_GPS2xy(lon, lat)
            sgBorder_xy += [(x, y)]
        save_pickle_file(fpath, sgBorder_xy)
    else:
        sgBorder_xy = load_pickle_file(fpath)
    return sgBorder_xy
def get_sgZones():
    ofpath = 'sgZone.pkl'
    if check_path_exist(ofpath):
        sgZones = load_pickle_file(ofpath)
    else:
        sgZones = get_sg_zones()
        for z in sgZones.values():
            z.cCoor_xy = convert_GPS2xy(*z.cCoor_gps)
            z.polyPoints_xy = [convert_GPS2xy(*gps_coord) for gps_coord in z.polyPoints_gps]
            z.marked = False
        save_pickle_file(ofpath, sgZones)
    return sgZones
def get_sgRoards_xy():
    ofpath = 'sgRoards_xy.pkl'
    if check_path_exist(ofpath):
        sgRoards_xy = load_pickle_file(ofpath)
    else:
        sgRoards_xy = []
        for _, coords in get_SG_roads():
            road_fd = []
            for lon, lat in coords:
                road_fd += [convert_GPS2xy(lon, lat)]
            sgRoards_xy += [road_fd]
        save_pickle_file(ofpath, sgRoards_xy)
    return sgRoards_xy
def ns_productivity_economical_profit():
    #
    # drivers who operate taxi in both years
    #
    df = dfs[Y09_PINS]
    df = df[((df['prod'] - df['prod'].mean()) / df['prod'].std()).abs() < 3]
    df = df[((df['eco-profit'] - df['eco-profit'].mean()) / df['eco-profit'].std()).abs() < 3]
    ns_full_drivers = set(df['did'])
    for i in [Y10_PINS, Y09_PONS, Y10_PONS]:
        df = dfs[i]
        df = df[((df['prod'] - df['prod'].mean()) / df['prod'].std()).abs() < 3]
        df = df[((df['eco-profit'] - df['eco-profit'].mean()) / df['eco-profit'].std()).abs() < 3]
        ns_full_drivers = ns_full_drivers.intersection(set(df['did']))
    #
    save_pickle_file(ftd_gen_prod_db_for_ns, general_productivities(ns_full_drivers))
    save_pickle_file(ftd_ns_prod_eco_prof_db,
                     get_driver_average(ns_full_drivers, [Y09_PINS, Y10_PINS, Y09_PONS, Y10_PONS]))
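# A minimal sketch of the 3-sigma outlier filter applied above, on synthetic
# data ('prod' mirrors the real column name; everything else is made up).
import numpy as np
import pandas as pd

def _demo_three_sigma_filter(seed=0):
    rng = np.random.RandomState(seed)
    df = pd.DataFrame({'did': range(1000), 'prod': rng.randn(1000)})
    df.loc[0, 'prod'] = 25.0  # plant one extreme outlier
    z = ((df['prod'] - df['prod'].mean()) / df['prod'].std()).abs()
    return df[z < 3]  # the planted outlier is dropped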
def run():
    Y09_monthly_fare, Y10_monthly_fare = [], []
    for y in xrange(9, 11):
        for m in xrange(1, 13):
            yymm = '%02d%02d' % (y, m)
            if yymm in ['0912', '1010']:
                continue
            trip_df = pd.read_csv('%s/%s%s.csv' % (trips_dir, trip_prefix, yymm))
            trip_df = trip_df[(trip_df['did'] != -1)]
            #
            fares = [x / float(CENT) for x in list(trip_df.groupby(['did']).sum()['fare'])]
            if y == 9:
                Y09_monthly_fare += fares
            else:
                Y10_monthly_fare += fares
    save_pickle_file(driver_monthly_fare_fn, [Y09_monthly_fare, Y10_monthly_fare])
def get_sgGrid_xy():
    ofpath = 'sgGrid_xy.pkl'
    if check_path_exist(ofpath):
        sgGrid_xy = load_pickle_file(ofpath)
    else:
        sgGrid_xy = []
        lons, lats = generate_sg_grid()
        for lon in lons:
            sx, sy = convert_GPS2xy(lon, lats[0])
            ex, ey = convert_GPS2xy(lon, lats[-1])
            sgGrid_xy += [[(sx, sy), (ex, ey)]]
        for lat in lats:
            sx, sy = convert_GPS2xy(lons[0], lat)
            ex, ey = convert_GPS2xy(lons[-1], lat)
            sgGrid_xy += [[(sx, sy), (ex, ey)]]
        save_pickle_file(ofpath, sgGrid_xy)
    return sgGrid_xy
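# get_sgBoarder_xy, get_sgZones, get_sgRoards_xy and get_sgGrid_xy all share
# a load-or-build-then-pickle pattern. A hedged sketch of that pattern as a
# reusable helper (`build_fn` and the path are placeholders; the originals
# keep their explicit versions):
import os
try:
    import cPickle as pickle  # Python 2, as used throughout these scripts
except ImportError:
    import pickle

def cached_pickle(fpath, build_fn):
    if os.path.exists(fpath):
        with open(fpath, 'rb') as f:
            return pickle.load(f)
    obj = build_fn()
    with open(fpath, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
    return obj

# e.g. sgBorder_xy = cached_pickle('sgBorder_xy.pkl',
#                                  lambda: [convert_GPS2xy(lon, lat) for lon, lat in sg_border])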
def find_driversRelations(year):
    yy = year[2:]
    driversRelations = {}
    for fn in get_all_files(prevDriversDefined_dpath, 'Filtered-%s%s*' % (prevDriversDefined_prefix, yy)):
        logger.info('handle the file; %s' % fn)
        with open('%s/%s' % (prevDriversDefined_dpath, fn), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                did1 = int(row[hid['did']])
                prevDrivers = row[hid['prevDrivers']].split('&')
                if len(prevDrivers) == 1 and prevDrivers[0] == '':
                    continue
                if did1 not in driversRelations:
                    driversRelations[did1] = set()
                for did0 in map(int, prevDrivers):
                    driversRelations[did1].add(did0)
    save_pickle_file(driversRelations_fpaths[year], driversRelations)
def process_file(tm, year, gt_fpath):
    gz_dpath = dpaths[tm, year, 'groupZones']
    gz_prefix = prefixs[tm, year, 'groupZones']
    df = pd.read_csv(gt_fpath)
    assert len(set(df['groupName'])) == 1
    gn = df['groupName'].iloc[0]
    gz_fpath = '%s/%s%s.pkl' % (gz_dpath, gz_prefix, gn)
    # df = df[~(np.abs(df[tm] - df[tm].mean()) > (3 * df[tm].std()))]
    groupZones = {}
    for zizj, pp_num in df.groupby(['zizj']).sum()['priorPresence'].iteritems():
        if pp_num < 2:
            continue
        zizj_df = df[(df['zizj'] == zizj)]
        y = zizj_df[tm]
        X = zizj_df['priorPresence']
        X = sm.add_constant(X)
        res = sm.OLS(y, X, missing='drop').fit()
        if res.params['priorPresence'] < 0 and res.pvalues['priorPresence'] < sig_level:
            groupZones[zizj] = res.params['priorPresence']
    save_pickle_file(gz_fpath, groupZones)
def run():
    drivers_dates = {}
    for fn in get_all_files(if_dpath, '%s*.csv' % if_prefix):
        _, _date, _did = fn[:-len('.csv')].split('-')
        year = 2000 + int(_date[:2])
        month, day = map(int, [_date[2:4], _date[4:6]])
        dt = datetime.datetime(year, month, day)
        k = int(_did)
        if k not in drivers_dates:
            drivers_dates[k] = []
        drivers_dates[k] += [dt]
    #
    for did, dates in drivers_dates.iteritems():
        ofpath = '%s%d.pkl' % (if_prefix, did)
        if check_path_exist(ofpath):
            continue
        dates.sort()
        dt_xy_state = []
        for dt in dates:
            yy = '%02d' % (dt.year - 2000)
            mm, dd = '%02d' % dt.month, '%02d' % dt.day
            yymmdd = yy + mm + dd
            ifpath = '%s/%s%s-%d.csv' % (if_dpath, if_prefix, yymmdd, did)
            with open(ifpath, 'rb') as logFile:
                reader = csv.reader(logFile)
                header = reader.next()
                # header: time,vehicle-id,driver-id,longitude,latitude,speed,state
                hid = {h: i for i, h in enumerate(header)}
                for row in reader:
                    dt = datetime.datetime.fromtimestamp(eval(row[hid['time']]))
                    lon, lat = map(eval, [row[hid[cn]] for cn in ['longitude', 'latitude']])
                    x, y = GPS_xyDrawing.convert_GPS2xy(lon, lat)
                    # append one 4-tuple per log row, as in get_driver_trajectory
                    dt_xy_state += [(dt, x, y, int(row[hid['state']]))]
        save_pickle_file(ofpath, dt_xy_state)
def run():
    for path in [ftd_general_prod_mb, ftd_ap_prod_eco_prof_mb, ftd_ns_prod_eco_prof_mb]:
        remove_file(path)
    #
    save_pickle_file(ftd_general_prod_mb, general_productivity())
    save_pickle_file(ftd_ap_prod_eco_prof_mb, ap_productivity_economical_profit())
    save_pickle_file(ftd_ns_prod_eco_prof_mb, ns_productivity_economical_profit())
def process_file(fpath):
    def regression(dv, df):
        oc_dv = 'roamingTime' if dv == 'spendingTime' else 'spendingTime'
        rdf = df.copy(deep=True).drop([oc_dv], axis=1)
        candi_dummies = []
        num_iter = 1
        while True:
            for i, vs in enumerate(zip(*rdf.values)):
                if rdf.columns[i] == dv:
                    continue
                if sum(vs) > len(rdf) * MIN_PICKUP_RATIO * num_iter:
                    candi_dummies.append(rdf.columns[i])
            if len(rdf) <= len(candi_dummies):
                candi_dummies = []
                num_iter += 1
            else:
                break
        y = rdf[dv]
        X = rdf[candi_dummies]
        X = sm.add_constant(X)
        return sm.OLS(y, X, missing='drop').fit()

    logger.info('Start handling; %s' % fpath)
    _, year, reducerID = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        st_graph_dpath = dpaths['spendingTime', year, 'influenceGraph']
        st_graph_prefix = prefixs['spendingTime', year, 'influenceGraph']
        SP_graph_fpath = '%s/%s%s.pkl' % (st_graph_dpath, st_graph_prefix, reducerID)
        rt_graph_dpath = dpaths['roamingTime', year, 'influenceGraph']
        rt_graph_prefix = prefixs['roamingTime', year, 'influenceGraph']
        RP_graph_fpath = '%s/%s%s.pkl' % (rt_graph_dpath, rt_graph_prefix, reducerID)
        if check_path_exist(SP_graph_fpath):
            return None
        #
        logger.info('Start loading; %s-%s' % (year, reducerID))
        df = pd.read_csv(fpath)
        SP_graph, RP_graph = {}, {}
        num_drivers = len(set(df['did']))
        for i, did1 in enumerate(set(df['did'])):
            if i % 10 == 0:
                logger.info('Doing regression %.2f; %s-%s' % (i / float(num_drivers), year, reducerID))
            did1_df = df[(df['did'] == did1)].copy(deep=True)
            did1_df = did1_df.drop(['month', 'day', 'timeFrame', 'zi', 'zj', 'tfZ', 'did'], axis=1)
            if '%d' % did1 in did1_df.columns:
                did1_df = did1_df.drop(['%d' % did1], axis=1)
            #
            SP_res = regression('spendingTime', did1_df)
            if SP_res.f_pvalue < SIGINIFICANCE_LEVEL:
                significant_drivers = set()
                for _did0, pv in SP_res.pvalues.iteritems():
                    if _did0 == 'const':
                        continue
                    if pv < SIGINIFICANCE_LEVEL:
                        significant_drivers.add(_did0)
                positive_ef_drivers = set()
                for _did0, cof in SP_res.params.iteritems():
                    if _did0 == 'const':
                        continue
                    if cof > 0:
                        positive_ef_drivers.add(_did0)
                for _did0 in significant_drivers.difference(positive_ef_drivers):
                    SP_graph[int(_did0), did1] = SP_res.params[_did0]
            #
            # RP_res = regression('roamingTime', did1_df)
            # if RP_res.f_pvalue < SIGINIFICANCE_LEVEL:
            #     significant_drivers = set()
            #     for _did0, pv in RP_res.pvalues.iteritems():
            #         if _did0 == 'const':
            #             continue
            #         if pv < SIGINIFICANCE_LEVEL:
            #             significant_drivers.add(_did0)
            #     positive_ef_drivers = set()
            #     for _did0, cof in RP_res.params.iteritems():
            #         if _did0 == 'const':
            #             continue
            #         if cof > 0:
            #             positive_ef_drivers.add(_did0)
            #     for _did0 in significant_drivers.difference(positive_ef_drivers):
            #         RP_graph[int(_did0), did1] = RP_res.params[_did0]
        logger.info('Start pickling; %s-%s' % (year, reducerID))
        save_pickle_file(SP_graph_fpath, SP_graph)
        # save_pickle_file(RP_graph_fpath, RP_graph)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, reducerID)), 'w') as f:
            f.write(format_exc())
        raise
def process_file(fpath):
    logger.info('Start handling; %s' % fpath)
    _, year, reducerID = get_fn_only(fpath)[:-len('.csv')].split('-')
    try:
        tm = 'spendingTime'
        st_graph_dpath = dpaths[tm, year, 'influenceGraph']
        st_graph_prefix = prefixs[tm, year, 'influenceGraph']
        SP_graph_fpath = '%s/%s%s.pkl' % (st_graph_dpath, st_graph_prefix, reducerID)
        if check_path_exist(SP_graph_fpath):
            return None
        #
        logger.info('Start loading; %s-%s' % (year, reducerID))
        df = pd.read_csv(fpath)
        SP_graph, RP_graph = {}, {}
        num_drivers = len(set(df['did']))
        for i, did1 in enumerate(set(df['did'])):
            if i % 10 == 0:
                logger.info('Doing regression %.2f; %s-%s' % (i / float(num_drivers), year, reducerID))
            did1_df = df[(df['did'] == did1)].copy(deep=True)
            numObservations = len(did1_df)
            minDFResiduals = numObservations * MIN_RATIO_RESIDUAL
            did1_df = did1_df.drop(['month', 'day', 'timeFrame', 'zi', 'zj', 'tfZ', 'did', 'roamingTime'], axis=1)
            if '%d' % did1 in did1_df.columns:
                did1_df = did1_df.drop(['%d' % did1], axis=1)
            #
            candi_dummies = []
            num_iter = 1
            while True:
                # `j` avoids clobbering the driver-loop index `i`
                for j, vs in enumerate(zip(*did1_df.values)):
                    if did1_df.columns[j] == tm:
                        continue
                    if sum(vs) > numObservations * MIN_PICKUP_RATIO * num_iter:
                        candi_dummies.append(did1_df.columns[j])
                numIndepVariables = len(candi_dummies)
                if numIndepVariables == 0:
                    break
                if numObservations < numIndepVariables + minDFResiduals:
                    candi_dummies = []
                    num_iter += 1
                else:
                    break
            if not candi_dummies:
                continue
            y = did1_df[tm]
            X = did1_df[candi_dummies]
            X = sm.add_constant(X)
            SP_res = sm.OLS(y, X, missing='drop').fit()
            #
            if SP_res.f_pvalue < SIGINIFICANCE_LEVEL:
                significant_drivers = set()
                for _did0, pv in SP_res.pvalues.iteritems():
                    if _did0 == 'const':
                        continue
                    if pv < SIGINIFICANCE_LEVEL:
                        significant_drivers.add(_did0)
                positive_ef_drivers = set()
                for _did0, cof in SP_res.params.iteritems():
                    if _did0 == 'const':
                        continue
                    if cof > 0:
                        positive_ef_drivers.add(_did0)
                for _did0 in significant_drivers.difference(positive_ef_drivers):
                    SP_graph[int(_did0), did1] = SP_res.params[_did0]
        #
        logger.info('Start pickling; %s-%s' % (year, reducerID))
        save_pickle_file(SP_graph_fpath, SP_graph)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], '%s-%s' % (year, reducerID)), 'w') as f:
            f.write(format_exc())
        raise
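# Hedged sketch of the candidate-dummy selection loop above: keep only the
# co-presence columns with enough non-zero observations, tightening the
# threshold until enough residual degrees of freedom remain. The default
# constants are placeholders standing in for MIN_PICKUP_RATIO and
# MIN_RATIO_RESIDUAL, whose real values live in the original module.
import pandas as pd

def select_dummies(df, dep_var, min_pickup_ratio=0.05, min_ratio_residual=0.2):
    num_obs = len(df)
    min_df_residuals = num_obs * min_ratio_residual
    num_iter = 1
    while True:
        candi = [cn for cn in df.columns
                 if cn != dep_var and df[cn].sum() > num_obs * min_pickup_ratio * num_iter]
        if not candi:
            return []
        if num_obs < len(candi) + min_df_residuals:
            num_iter += 1  # tighten the activity threshold and retry
        else:
            return candi

# e.g. select_dummies(pd.DataFrame({'st': [1.0] * 10, 'd7': [1] * 9 + [0]}), 'st') -> ['d7']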
def process_file(yymm):
    def record_crossing_time(path_to_csv_file,
                             veh_ap_crossing_time, veh_last_log_ap_or_not,
                             veh_ns_crossing_time, veh_last_log_ns_or_not):
        with open(path_to_csv_file, 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                t, vid = eval(row[hid['time']]), row[hid['vid']]
                ap_or_not, ns_or_not = eval(row[hid['ap-or-not']]), eval(row[hid['ns-or-not']])
                #
                if vid not in veh_last_log_ap_or_not:
                    if ap_or_not == IN:
                        # the first log's position occurred in the AP zone
                        assert vid not in veh_ap_crossing_time
                        veh_ap_crossing_time[vid] = [t]
                else:
                    assert vid in veh_last_log_ap_or_not
                    if veh_last_log_ap_or_not[vid] == OUT and ap_or_not == IN:
                        # default to [] so a first OUT->IN transition records t once
                        veh_ap_crossing_time.setdefault(vid, []).append(t)
                #
                if vid not in veh_last_log_ns_or_not:
                    if ns_or_not == IN:
                        # the first log's position occurred in the NS zone
                        assert vid not in veh_ns_crossing_time
                        veh_ns_crossing_time[vid] = [t]
                else:
                    assert vid in veh_last_log_ns_or_not
                    if veh_last_log_ns_or_not[vid] == OUT and ns_or_not == IN:
                        veh_ns_crossing_time.setdefault(vid, []).append(t)
                #
                veh_last_log_ap_or_not[vid] = ap_or_not
                veh_last_log_ns_or_not[vid] = ns_or_not
        return veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not
    #
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        ap_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ap_dpath, crossingTime_ap_prefix, yymm)
        ns_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ns_dpath, crossingTime_ns_prefix, yymm)
        if check_path_exist(ap_pkl_fpath) and check_path_exist(ns_pkl_fpath):
            return None
        print 'handle the file; %s' % yymm
        veh_ap_crossing_time, veh_last_log_ap_or_not = {}, {}
        veh_ns_crossing_time, veh_last_log_ns_or_not = {}, {}
        if yymm not in ['0901', '1001', '1011']:
            y, m = int(yymm[:2]), int(yymm[2:])
            prev_m = m - 1
            prev_yymm = '%02d%02d' % (y, prev_m)
            prev_fn = get_all_files(log_last_day_dpath, '%s%s*.csv' % (log_last_day_prefix, prev_yymm))[0]
            path_to_last_day_csv_file = '%s/%s' % (log_last_day_dpath, prev_fn)
            veh_ap_crossing_time, veh_last_log_ap_or_not, veh_ns_crossing_time, veh_last_log_ns_or_not = \
                record_crossing_time(path_to_last_day_csv_file,
                                     veh_ap_crossing_time, veh_last_log_ap_or_not,
                                     veh_ns_crossing_time, veh_last_log_ns_or_not)
        path_to_csv_file = '%s/%s%s.csv' % (log_dpath, log_prefix, yymm)
        veh_ap_crossing_time, _, veh_ns_crossing_time, _ = \
            record_crossing_time(path_to_csv_file,
                                 veh_ap_crossing_time, veh_last_log_ap_or_not,
                                 veh_ns_crossing_time, veh_last_log_ns_or_not)
        #
        save_pickle_file(ap_pkl_fpath, veh_ap_crossing_time)
        save_pickle_file(ns_pkl_fpath, veh_ns_crossing_time)
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        import sys
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
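# A small self-contained sketch of the OUT->IN edge detection implemented by
# record_crossing_time above, on a synthetic log stream. The IN/OUT values
# mirror the module-level constants but are assumed here.
IN, OUT = 1, 0

def detect_crossings(log_stream):
    """log_stream: iterable of (t, vid, state); returns vid -> [zone entry times]."""
    crossing_time, last_state = {}, {}
    for t, vid, state in log_stream:
        if vid not in last_state:
            if state == IN:  # first observation already inside the zone
                crossing_time[vid] = [t]
        elif last_state[vid] == OUT and state == IN:
            crossing_time.setdefault(vid, []).append(t)
        last_state[vid] = state
    return crossing_time

# detect_crossings([(0, 'v1', OUT), (5, 'v1', IN), (9, 'v1', IN),
#                   (12, 'v1', OUT), (20, 'v1', IN)]) -> {'v1': [5, 20]}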
def run():
    gp_summary_fpath = '%s/%ssummary.csv' % (of_dpath, of_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (of_dpath, of_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (of_dpath, of_prefix)
    #
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['groupName', 'numDrivers', 'numRelations', 'graphComplexity',
                         'tieStrength', 'contribution', 'benCon'])
    logger.info('Start handling SP_group_dpath')
    original_graph = {}
    for fn in get_all_files(if_dpath, '%ssigRelation-%s-*.pkl' % (if_prefix, year)):
        _, _, _, _, _did1 = fn[:-len('.pkl')].split('-')
        sigRelation = load_pickle_file('%s/%s' % (if_dpath, fn))
        for _did0, coef in sigRelation['pos']:
            did0, did1 = map(int, [_did0, _did1])
            original_graph[did0, did1] = coef
    save_pickle_file(gp_original_fpath, original_graph)
    #
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    for (did0, did1), w in original_graph.iteritems():
        if did0 not in did_igid:
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if did1 not in did_igid:
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (of_dpath, of_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([gn, len(drivers), len(weights), graphComplexity,
                             tie_strength, contribution, benCon])
        gl_img_fpath = '%s/%simg-%s.pdf' % (of_dpath, of_prefix, gn)
        # layout = sg.layout("kk")
        # if len(drivers) < 100:
        #     ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        # else:
        #     ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (of_dpath, of_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
def run():
    cg_dpath = dpaths['baseline', '2009', 'countGraph']
    cg_prefix = prefixs['baseline', '2009', 'countGraph']
    gp_dpath = dpaths['baseline', '2009', 'groupPartition']
    gp_prefix = prefixs['baseline', '2009', 'groupPartition']
    #
    check_dir_create(gp_dpath)
    #
    gp_summary_fpath = '%s/%ssummary.csv' % (gp_dpath, gp_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (gp_dpath, gp_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
    #
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['groupName', 'numDrivers', 'numRelations', 'graphComplexity',
                         'tieStrength', 'contribution', 'benCon'])
    #
    logger.info('Start handling SP_group_dpath')
    if not check_path_exist(gp_original_fpath):
        original_graph = {}
        for fn in get_all_files(cg_dpath, '%s*' % cg_prefix):
            count_graph = load_pickle_file('%s/%s' % (cg_dpath, fn))
            logger.info('Start handling; %s' % fn)
            numEdges = len(count_graph)
            moduloNumber = max(numEdges / 10, 1)  # guard against numEdges < 10
            for i, ((did0, did1), w) in enumerate(count_graph.iteritems()):
                if i % moduloNumber == 0:
                    logger.info('Handling; %.2f' % (i / float(numEdges)))
                original_graph[did0, did1] = w
        save_pickle_file(gp_original_fpath, original_graph)
    else:
        original_graph = load_pickle_file(gp_original_fpath)
    #
    logger.info('igraph converting')
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    numEdges = len(original_graph)
    moduloNumber = max(numEdges / 10, 1)
    for i, ((did0, did1), w) in enumerate(original_graph.iteritems()):
        if i % moduloNumber == 0:
            logger.info('Handling; %.2f' % (i / float(numEdges)))
        if did0 not in did_igid:
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if did1 not in did_igid:
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    #
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (gp_dpath, gp_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([gn, len(drivers), len(weights), graphComplexity,
                             tie_strength, contribution, benCon])
        gl_img_fpath = '%s/%simg-%s.pdf' % (gp_dpath, gp_prefix, gn)
        layout = sg.layout("kk")
        if len(drivers) < 100:
            ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        else:
            ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (gp_dpath, gp_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
def run():
    print 'start'
    check_dir_create(com_dir)
    #
    yyyy = '2009'
    la_fn = '2009-CD(184)-N(7003)-E(5717371).pkl'
    la_fpath = '%s/%s' % (la_dir, la_fn)
    _, str_CD, _, _ = la_fn[:-len('.pkl')].split('-')
    CD = int(str_CD[len('CD('):-len(')')])
    print 'pickle file loading...'
    pairs_day_counting = load_pickle_file(la_fpath)
    print 'finished'
    for thD in [18, 36, 55, 73, 82, 92]:
        thD_dpath = '%s/%s' % (com_dir, '2009-CD(%d)-thD(%d)' % (CD, thD))
        check_dir_create(thD_dpath)
        summary_fpath = '%s/%s-CD(%d)-thD(%d)-community-summary.csv' % (thD_dpath, yyyy, CD, thD)
        glayout_fpath = '%s/%s-CD(%d)-thD(%d)-glayout.pkl' % (thD_dpath, yyyy, CD, thD)
        with open(summary_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = ['com-name', 'num-nodes', 'num-edges',
                           'tie-strength(# of days encounter / # of drivers)']
            writer.writerow(new_headers)
        #
        nxG = nx.Graph()
        for (k0, k1), num_days in pairs_day_counting.iteritems():
            if num_days < thD:
                continue
            nxG.add_edge(k0, k1, weight=num_days)
        print 'Whole graph pickling ...', yyyy, CD, thD
        nx.write_gpickle(nxG, '%s/%s-CD(%d)-thD(%d)-whole-N(%d)-E(%d).pkl' %
                         (thD_dpath, yyyy, CD, thD, len(nxG.nodes()), len(nxG.edges())))
        n_label, n_comId = [], []
        nxId_igId = {}
        ig_nid = 0
        print 'Partitioning ...'
        partition = community.best_partition(nxG)
        for i, com in enumerate(set(partition.values())):
            list_nodes = [nodes for nodes in partition.keys() if partition[nodes] == com]
            print i, 'Saving sub-graph ...'
            sub_nxG = nxG.subgraph(list_nodes)
            com_name = 'COM(%d)' % i
            com_fpath = '%s/%s-CD(%d)-thD(%d)-%s-N(%d)-E(%d).pkl' % \
                (thD_dpath, yyyy, CD, thD, com_name, len(sub_nxG.nodes()), len(sub_nxG.edges()))
            nx.write_gpickle(sub_nxG, com_fpath)
            _, _, weight = zip(*list(sub_nxG.edges_iter(data='weight', default=1)))
            num_nodes, num_edges = len(sub_nxG), len(weight)
            with open(summary_fpath, 'a') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                writer.writerow([com_name, num_nodes, num_edges, sum(weight) / float(num_nodes)])
            #
            print i, 'labeling...'
            for n in sub_nxG.nodes():
                n_label.append(n)
                n_comId.append(i)
                nxId_igId[n] = ig_nid
                ig_nid += 1
        #
        if len(nxG.nodes()) < 1000:
            print 'Layout calculating...'
            print datetime.datetime.now()
            Edges = [(nxId_igId[n0], nxId_igId[n1]) for (n0, n1) in nxG.edges()]
            print 'finish edge converting', len(Edges)
            print datetime.datetime.now()
            igG = ig.Graph(Edges, directed=False)
            layt = igG.layout('kk', dim=3)
            print 'finish layout calculation'
            print datetime.datetime.now()
            #
            save_pickle_file(glayout_fpath, [n_label, n_comId, layt, Edges])
        else:
            save_pickle_file(glayout_fpath, [])
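# Toy end-to-end sketch of the community-detection step above, using the same
# python-louvain `community.best_partition` call on a synthetic weighted graph;
# the real input is the thresholded encounter-count graph.
import networkx as nx
import community  # python-louvain

def _demo_best_partition():
    nxG = nx.Graph()
    # two dense triangles joined by one weak edge
    for a, b in [(0, 1), (1, 2), (0, 2), (3, 4), (4, 5), (3, 5)]:
        nxG.add_edge(a, b, weight=10)
    nxG.add_edge(2, 3, weight=1)
    partition = community.best_partition(nxG)  # node -> community id
    return partition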
def process_file(tm, year):
    ig_dpath = dpaths[tm, year, 'influenceGraph']
    ig_prefix = prefixs[tm, year, 'influenceGraph']
    gp_dpath = dpaths[tm, year, 'groupPartition']
    gp_prefix = prefixs[tm, year, 'groupPartition']
    #
    check_dir_create(gp_dpath)
    #
    gp_summary_fpath = '%s/%ssummary.csv' % (gp_dpath, gp_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (gp_dpath, gp_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
    #
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['groupName', 'numDrivers', 'numRelations', 'graphComplexity',
                         'tieStrength', 'contribution', 'benCon'])
    #
    logger.info('Start handling SP_group_dpath')
    original_graph = {}
    for fn in get_all_files(ig_dpath, '%s*' % ig_prefix):
        regression_graph = load_pickle_file('%s/%s' % (ig_dpath, fn))
        for (did0, did1), w in regression_graph.iteritems():
            original_graph[did0, did1] = w
    save_pickle_file(gp_original_fpath, original_graph)
    #
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    for (did0, did1), w in original_graph.iteritems():
        if did0 not in did_igid:
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if did1 not in did_igid:
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    #
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (gp_dpath, gp_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([gn, len(drivers), len(weights), graphComplexity,
                             tie_strength, contribution, benCon])
        gl_img_fpath = '%s/%simg-%s.pdf' % (gp_dpath, gp_prefix, gn)
        layout = sg.layout("kk")
        if len(drivers) < 100:
            ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        else:
            ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (gp_dpath, gp_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
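# Hedged toy sketch of the igraph/louvain pipeline shared by the partition
# routines above, assuming the older louvain-igraph API these scripts call
# (louvain.find_partition(graph, method='Modularity', weight=...)): build a
# weighted directed graph keyed by driver id, run Louvain, read the subgraphs.
import igraph as ig
import louvain

def _demo_find_partition():
    igG = ig.Graph(directed=True)
    igid, did_igid = 0, {}
    edges = {(1, 2): 0.5, (2, 1): 0.7, (3, 4): 0.9, (4, 3): 0.2, (2, 3): 0.1}
    for (did0, did1), w in edges.items():
        for did in (did0, did1):
            if did not in did_igid:
                igG.add_vertex(did)  # sets the vertex 'name' attribute
                did_igid[did] = igid
                igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    return [[v['name'] for v in sg.vs] for sg in part.subgraphs()]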