def gen_summary():
    """Write a CSV comparing each driver's 2009 vs 2010 intellect
    coefficients, keeping only drivers whose coefficient is negative in
    BOTH years (drivers flagged 'X' in either year are skipped).
    """
    driverIntellect2009 = load_pickle_file(
        '%s/%s%s.pkl' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, '2009'))
    driverIntellect2010 = load_pickle_file(
        '%s/%s%s.pkl' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, '2010'))
    # Only drivers present in both years can be compared
    driverSetBoth = set(driverIntellect2009.keys()).intersection(set(driverIntellect2010.keys()))
    fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, 'all-negativeOnly')
    with open(fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['did', 'numY2009', 'numY2010', 'coefY2009', 'coefY2010', 'coefDiff'])
        for did in driverSetBoth:
            num2009, coef2009 = driverIntellect2009[did]
            num2010, coef2010 = driverIntellect2010[did]
            # 'X' marks a driver whose coefficient could not be estimated
            if coef2009 == 'X' or coef2010 == 'X':
                continue
            if coef2009 < 0 and coef2010 < 0:
                writer.writerow([did, num2009, num2010, coef2009, coef2010, coef2009 - coef2010])
def process_file(yymm):
    ap_pkl_file_path = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix, yymm)
    ns_pkl_file_path = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix, yymm)
    if not (check_path_exist(ap_pkl_file_path) and check_path_exist(ns_pkl_file_path)):
        return None
    #
    # Load pickle files
    #
    ap_crossing_time, ns_crossing_time = load_pickle_file(ap_pkl_file_path), load_pickle_file(ns_pkl_file_path)
    #
    # Initiate csv files
    #
    ap_trip_fpath = '%s/%s%s.csv' % (ap_trips_dir, ap_trip_prefix, yymm)
    ns_trip_fpath = '%s/%s%s.csv' % (ns_trips_dir, ns_trip_prefix, yymm)
    if check_path_exist(ap_trip_fpath) and check_path_exist(ns_trip_fpath):
        return None
    print 'handle the file; %s' % yymm
    for fpath in [ap_trip_fpath, ns_trip_fpath]:
        with open(fpath, 'wt') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                new_headers = ['tid', 'vid', 'did',
                               'start-time', 'end-time', 'duration',
                               'fare', 'prev-trip-end-time',
                               'trip-mode', 'queue—join-time', 'queueing-time']
                writer.writerow(new_headers)
    #
    with open('%s/%s%s.csv' % (trips_dpath, trip_prefix, yymm), 'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h : i for i, h in enumerate(headers)}
        for row in reader:
            tid, did = row[hid['tid']], row[hid['did']]
            et, duration = row[hid['end-time']], row[hid['duration']]
            fare = row[hid['fare']]
            #
            ap_tm, ns_tm = int(row[hid['ap-trip-mode']]), int(row[hid['ns-trip-mode']]) 
            vid, st, prev_tet = row[hid['vid']], eval(row[hid['start-time']]), eval(row[hid['prev-trip-end-time']])
            #
            for tm, crossing_time, fpath in [(ap_tm, ap_crossing_time, ap_trip_fpath),
                                                             (ns_tm, ns_crossing_time, ns_trip_fpath)]:
                if tm == DIn_POut or tm == DOut_POut:
                    continue
                if tm == DIn_PIn:
                    queue_join_time = prev_tet
                elif tm == DOut_PIn:
                    try:
                        i = bisect(crossing_time[vid], st)
                    except KeyError:
                        print '%s-tid-%s' % (yymm, row[hid['tid']])
                        continue
                    queue_join_time = crossing_time[vid][i - 1] if i != 0 else crossing_time[vid][0]
                with open(fpath, 'a') as w_csvfile:
                    writer = csv.writer(w_csvfile, lineterminator='\n')
                    queueing_time = st - queue_join_time
                    if queueing_time < Q_LIMIT_MIN:
                        queueing_time = Q_LIMIT_MIN
                    new_row = [tid, vid, did, st, et, duration, fare, prev_tet,
                                tm, queue_join_time, queueing_time]
                    writer.writerow(new_row)
    print 'end the file; %s' % yymm 
def run():
    """Trace each ss driver's group membership across 2009-2012.

    Writes one CSV row per driver with its group number for every year,
    using 'X' for years in which the driver had no group assignment.
    """
    yearDriver_gn = {}
    whole_ss_drivers = set()
    tm = 'spendingTime'
    for year in ['2009', '2010', '2011', '2012']:
        gp_dpath = dpaths[tm, year, 'groupPartition']
        gp_prefix = prefixs[tm, year, 'groupPartition']
        gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
        gp_drivers = load_pickle_file(gp_drivers_fpath)
        for gn, drivers in gp_drivers.iteritems():
            for did in drivers:
                yearDriver_gn[year, did] = gn
        yy = year[2:]
        for fn in get_all_files(ss_drivers_dpath, '%s%s*.pkl' % (ss_drivers_prefix, yy)):
            ss_drivers_fpath = '%s/%s' % (ss_drivers_dpath, fn)
            ss_drivers = load_pickle_file(ss_drivers_fpath)
            for did in ss_drivers:
                whole_ss_drivers.add(did)
    with open(groupEvolution_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['did', 'G2009', 'G2010', 'G2011', 'G2012'])
        for did in whole_ss_drivers:
            # dict.get replaces the deprecated dict.has_key idiom
            new_row = [did] + [yearDriver_gn.get((year, did), 'X')
                               for year in ['2009', '2010', '2011', '2012']]
            writer.writerow(new_row)
def run():
    """Write each ss driver's yearly group assignment (2009-2012) to CSV;
    'X' marks a year without an assignment.
    """
    YEARS = ['2009', '2010', '2011', '2012']
    tm = 'spendingTime'
    group_by_year_driver = {}
    ss_driver_pool = set()
    for year in YEARS:
        gp_dpath = dpaths[tm, year, 'groupPartition']
        gp_prefix = prefixs[tm, year, 'groupPartition']
        partition = load_pickle_file('%s/%sdrivers.pkl' % (gp_dpath, gp_prefix))
        for group_num, members in partition.iteritems():
            for member in members:
                group_by_year_driver[year, member] = group_num
        short_year = year[2:]
        for fn in get_all_files(ss_drivers_dpath,
                                '%s%s*.pkl' % (ss_drivers_prefix, short_year)):
            for member in load_pickle_file('%s/%s' % (ss_drivers_dpath, fn)):
                ss_driver_pool.add(member)
    with open(groupEvolution_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['did', 'G2009', 'G2010', 'G2011', 'G2012'])
        for member in ss_driver_pool:
            record = [member]
            for year in YEARS:
                key = (year, member)
                record.append(group_by_year_driver[key] if key in group_by_year_driver else 'X')
            writer.writerow(record)
# Example #5
def get_driver_trajectory(did):
    """Return driver *did*'s trajectory as a date-ordered list of
    (datetime, x, y, state) tuples, building and caching a pickle on
    first use.
    """
    ofpath = '%s%d.pkl' % (if_prefix, did)
    if check_path_exist(ofpath):
        return load_pickle_file(ofpath)
    # Collect the dates on which this driver has a log file
    dates = []
    for fn in get_all_files(if_dpath, '%s*.csv' % if_prefix):
        _, _date, _did = fn[:-len('.csv')].split('-')
        if int(_did) != did:
            continue
        year = 2000 + int(_date[:2])
        month, day = int(_date[2:4]), int(_date[4:6])
        dates.append(datetime.datetime(year, month, day))
    dates.sort()
    dt_xy_state = []
    for day_dt in dates:  # renamed from dt: the inner loop used to shadow it
        yymmdd = '%02d%02d%02d' % (day_dt.year - 2000, day_dt.month, day_dt.day)
        ifpath = '%s/%s%s-%d.csv' % (if_dpath, if_prefix, yymmdd, did)
        with open(ifpath, 'rb') as logFile:
            reader = csv.reader(logFile)
            header = reader.next()
            # header: time,vehicle-id,driver-id,longitude,latitude,speed,state
            hid = {h: i for i, h in enumerate(header)}
            for row in reader:
                # float()/int() instead of eval(): no code execution on file data
                rec_dt = datetime.datetime.fromtimestamp(float(row[hid['time']]))
                lon, lat = float(row[hid['longitude']]), float(row[hid['latitude']])
                x, y = GPS_xyDrawing.convert_GPS2xy(lon, lat)
                dt_xy_state.append((rec_dt, x, y, int(row[hid['state']])))
    save_pickle_file(ofpath, dt_xy_state)
    return dt_xy_state
def process_files(yymm):
    print 'handle the file; %s' % yymm
    #
    shift_df = pd.read_csv('%s/%s%s.csv' %
                           (ftd_shift_dir, ftd_shift_prefix, yymm))
    all_trip_df = pd.read_csv('%s/%s%s.csv' %
                              (ftd_trips_dir, ftd_trips_prefix, yymm))
    loc_trip_df = pd.read_csv('%s/%s%s.csv' % (ap_ep_dir, ap_ep_prefix, yymm))
    ft_drivers = map(
        int,
        load_pickle_file('%s/%s%s.pkl' %
                         (full_time_driver_dir, ft_drivers_prefix, yymm)))
    days = set(loc_trip_df['dd'])
    #
    yy, mm = int(yymm[:2]), int(yymm[2:])
    for dd in days:
        day_all_trip_df = all_trip_df[(all_trip_df['dd'] == dd)]
        day_loc_trip_df = loc_trip_df[(loc_trip_df['dd'] == dd)]
        day_shift_df = shift_df[(shift_df['dd'] == dd)]
        for did in ft_drivers:
            #
            # All
            #
            d_all_trip = day_all_trip_df[(day_all_trip_df['did'] == did)]
            d_shift = day_shift_df[(day_shift_df['did'] == did)]
            all_num = len(d_all_trip['fare'])
            pro_dur = sum(d_shift['pro-dur']) * SEC60
            all_fare = sum(d_all_trip['fare'])
            #
            # Specific location
            #
            d_loc_trip = day_loc_trip_df[(day_loc_trip_df['did'] == did)]
            loc_num = len(d_loc_trip['fare'])
            loc_dur = sum(d_loc_trip['duration'])
            loc_fare = sum(d_loc_trip['fare'])
            loc_ep = sum(d_loc_trip['economic-profit'])
            loc_qtime = sum(d_loc_trip['queueing-time'])
            #
            d_loc_trip_in = d_loc_trip[(d_loc_trip['trip-mode'] == DIn_PIn)]
            locIn_num = len(d_loc_trip_in['fare'])
            locIn_dur = sum(d_loc_trip_in['duration'])
            locIn_fare = sum(d_loc_trip_in['fare'])
            locIn_ep = sum(d_loc_trip_in['economic-profit'])
            locIn_qtime = sum(d_loc_trip_in['queueing-time'])
            #
            d_loc_trip_out = d_loc_trip[(d_loc_trip['trip-mode'] == DOut_PIn)]
            locOut_num = len(d_loc_trip_out['fare'])
            locOut_dur = sum(d_loc_trip_out['duration'])
            locOut_fare = sum(d_loc_trip_out['fare'])
            locOut_ep = sum(d_loc_trip_out['economic-profit'])
            locOut_qtime = sum(d_loc_trip_out['queueing-time'])
            #
            with open(ftd_ap_daily_stat_fpath, 'a') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                writer.writerow([
                    yy, mm, dd, did, all_num, pro_dur, all_fare, loc_num,
                    loc_dur, loc_fare, loc_ep, loc_qtime, locIn_num, locIn_dur,
                    locIn_fare, locIn_ep, locIn_qtime, locOut_num, locOut_dur,
                    locOut_fare, locOut_ep, locOut_qtime
                ])
# Example #7
def run():
    """Regress per-driver airport queueing time on the DIn_PIn indicator.

    First materialises ssd_apIn_fpath (apQTime/apIn/did per 2010 trip of a
    full-time driver) if it does not exist yet, then fits a per-driver OLS
    of apQTime on apIn and records the fit statistics in
    ssd_sensitivity_fpath.
    """
    if not check_path_exist(ssd_apIn_fpath):
        with open(ssd_apIn_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['apQTime', 'apIn', 'did'])
            for m in xrange(1, 13):
                yymm = '10%02d' % m
                if yymm in ['1010']:
                    # month with corrupted data
                    continue
                logger.info('Start handling; %s' % yymm)
                # set, not list: O(1) membership tests in the per-row filter below
                ft_drivers = set(map(int, load_pickle_file('%s/%s%s.pkl' % (full_time_driver_dir, ft_drivers_prefix, yymm))))
                ap_ep_fpath = '%s/%s%s.csv' % (ap_ep_dir, ap_ep_prefix, yymm)
                with open(ap_ep_fpath, 'rb') as r_csvfile:
                    reader = csv.reader(r_csvfile)
                    headers = reader.next()
                    hid = {h: i for i, h in enumerate(headers)}
                    handling_day = 0
                    for row in reader:
                        did = int(row[hid['did']])
                        if did not in ft_drivers:
                            continue
                        # float() instead of eval(): no code execution on file data
                        t = float(row[hid['start-time']])
                        cur_dt = datetime.datetime.fromtimestamp(t)
                        if handling_day != cur_dt.day:
                            logger.info('...ing; %s(%dth)' % (yymm, handling_day))
                            handling_day = cur_dt.day
                        apIn = 1 if int(row[hid['trip-mode']]) == DIn_PIn else 0
                        apQTime = float(row[hid['queueing-time']]) / float(SEC60)
                        writer.writerow([apQTime, apIn, did])
    #
    df = pd.read_csv(ssd_apIn_fpath)
    # drop queueing-time outliers beyond three standard deviations
    df = df[~(np.abs(df['apQTime'] - df['apQTime'].mean()) > (3 * df['apQTime'].std()))]
    minNumSample = 40
    with open(ssd_sensitivity_fpath, 'wb') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['did', 'F_pValue', 'rSqure', 'rSqureAdj',
                         'coef_apIn', 'pValue_apIn', 'coef_const', 'pValue_const'])
        for did in set(df['did']):
            did_df = df[(df['did'] == did)]
            if len(did_df) < minNumSample:
                continue
            # the regressor must vary: require a few non-queueing observations
            if len(did_df[(did_df['apIn'] == 0)]) < 4:
                continue
            y = did_df['apQTime']
            X = sm.add_constant(did_df['apIn'])
            res = sm.OLS(y, X).fit()
            if np.isnan(res.f_pvalue):
                continue
            try:
                writer.writerow([did, res.f_pvalue, res.rsquared, res.rsquared_adj,
                                 res.params['apIn'], res.pvalues['apIn'],
                                 res.params['const'], res.pvalues['const']])
            except Exception:
                # deliberate best-effort: skip fits whose results lack a
                # named coefficient rather than abort the whole run
                pass
# Example #8
def gen_summary2010():
    """Build the 2010 per-driver summary CSV joining each driver's locIn
    coefficient with aggregated yearly statistics and derived ratios.
    """
    intellect2010_fpath = '%s/%s%s.csv' % (
        statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix,
        '2010')
    # statistic columns, in output order (after 'driverID', 'locInCoef')
    stat_columns = [
        'wleTripNumber', 'wleOperatingHour', 'wleFare',
        'locTripNumber', 'locInNumber', 'locOutNumber', 'locQTime',
        'locEP', 'locDuration', 'locFare', 'wleProductivity',
        'QTime/locTrip', 'EP/locTrip', 'locProductivity', 'locInRatio'
    ]
    #
    driverIntellect2010 = load_pickle_file(
        '%s/%s%s.pkl' % (statisticsAllDrivers_ap_dpath,
                         statisticsAllDriversIntellect_ap_prefix, '2010'))
    df = pd.read_csv('%s/Filtered-%s%s.csv' %
                     (statisticsAllDrivers_ap_dpath,
                      statisticsAllDriversDay_ap_prefix, '2010'))
    agg_df = df.groupby(['driverID']).sum().reset_index()
    # keep only the raw aggregate columns; everything else is dropped
    keep_cn = [
        'driverID', 'wleTripNumber', 'wleOperatingHour', 'wleFare',
        'locTripNumber', 'locInNumber', 'locOutNumber', 'locQTime',
        'locEP', 'locDuration', 'locFare'
    ]
    agg_df = agg_df.drop([cn for cn in agg_df.columns if cn not in keep_cn],
                         axis=1)
    #
    # derived productivity / ratio columns
    agg_df['wleProductivity'] = agg_df['wleFare'] / agg_df['wleOperatingHour']
    agg_df['QTime/locTrip'] = agg_df['locQTime'] / agg_df['locTripNumber']
    agg_df['EP/locTrip'] = agg_df['locEP'] / agg_df['locTripNumber']
    agg_df['locProductivity'] = agg_df['locFare'] / (
        agg_df['locQTime'] + agg_df['locDuration']) * SEC60
    agg_df['locInRatio'] = agg_df['locInNumber'] / agg_df['locTripNumber']
    allDrivers = set(agg_df['driverID'])
    # PERF: write header and rows in one pass instead of reopening the
    # file in append mode once per driver
    with open(intellect2010_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['driverID', 'locInCoef'] + stat_columns)
        for did, (_, coef) in driverIntellect2010.iteritems():
            # 'X' flags drivers without an estimated coefficient
            if coef == 'X':
                continue
            if did not in allDrivers:
                continue
            new_row = [did, coef]
            for cn in stat_columns:
                new_row += agg_df.loc[agg_df['driverID'] == did][cn].tolist()
            writer.writerow(new_row)
def process_files(yymm):
    print 'handle the file; %s' % yymm
    #
    shift_df = pd.read_csv('%s/%s%s.csv' % (ftd_shift_dir, ftd_shift_prefix, yymm))
    all_trip_df = pd.read_csv('%s/%s%s.csv' % (ftd_trips_dir, ftd_trips_prefix, yymm))
    loc_trip_df = pd.read_csv('%s/%s%s.csv' % (ap_ep_dir, ap_ep_prefix, yymm))
    ft_drivers = map(int, load_pickle_file('%s/%s%s.pkl' % (full_time_driver_dir, ft_drivers_prefix, yymm)))
    days = set(loc_trip_df['dd'])
    #
    yy, mm = int(yymm[:2]), int(yymm[2:])
    for dd in days:
        day_all_trip_df = all_trip_df[(all_trip_df['dd'] == dd)]
        day_loc_trip_df = loc_trip_df[(loc_trip_df['dd'] == dd)]
        day_shift_df = shift_df[(shift_df['dd'] == dd)]
        for did in ft_drivers:
            #
            # All
            #
            d_all_trip = day_all_trip_df[(day_all_trip_df['did'] == did)]
            d_shift = day_shift_df[(day_shift_df['did'] == did)]
            all_num = len(d_all_trip['fare'])
            pro_dur = sum(d_shift['pro-dur']) * SEC60
            all_fare = sum(d_all_trip['fare'])
            #
            # Specific location
            #
            d_loc_trip = day_loc_trip_df[(day_loc_trip_df['did'] == did)]
            loc_num = len(d_loc_trip['fare'])
            loc_dur = sum(d_loc_trip['duration'])
            loc_fare = sum(d_loc_trip['fare'])
            loc_ep = sum(d_loc_trip['economic-profit'])
            loc_qtime = sum(d_loc_trip['queueing-time'])
            #
            d_loc_trip_in = d_loc_trip[(d_loc_trip['trip-mode'] == DIn_PIn)]
            locIn_num = len(d_loc_trip_in['fare'])
            locIn_dur = sum(d_loc_trip_in['duration'])
            locIn_fare = sum(d_loc_trip_in['fare'])
            locIn_ep = sum(d_loc_trip_in['economic-profit'])
            locIn_qtime = sum(d_loc_trip_in['queueing-time'])
            #
            d_loc_trip_out = d_loc_trip[(d_loc_trip['trip-mode'] == DOut_PIn)]
            locOut_num = len(d_loc_trip_out['fare'])
            locOut_dur = sum(d_loc_trip_out['duration'])
            locOut_fare = sum(d_loc_trip_out['fare'])
            locOut_ep = sum(d_loc_trip_out['economic-profit'])
            locOut_qtime = sum(d_loc_trip_out['queueing-time'])
            #
            with open(ftd_ap_daily_stat_fpath, 'a') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                writer.writerow([yy, mm, dd, did,
                                 all_num, pro_dur, all_fare,
                                 loc_num, loc_dur, loc_fare, loc_ep, loc_qtime,
                                 locIn_num, locIn_dur, locIn_fare, locIn_ep, locIn_qtime,
                                 locOut_num, locOut_dur, locOut_fare, locOut_ep, locOut_qtime])
# Example #10
def get_sgBoarder_xy():
    """Return the Singapore border as xy points, cached in a pickle."""
    fpath = 'sgBorder_xy.pkl'
    if check_path_exist(fpath):
        return load_pickle_file(fpath)
    sgBorder_xy = [convert_GPS2xy(lon, lat) for lon, lat in sg_border]
    save_pickle_file(fpath, sgBorder_xy)
    return sgBorder_xy
def run():
    a1_dir = charts_dir + '/b_aggregated_a1 monthly fare'
    check_dir_create(a1_dir)
    #
    Y09, Y10 = load_pickle_file(driver_monthly_fare_fn)
    num_bin = 50
    #
    print 't statistics %.3f, p-value %.3f' % (stats.ttest_ind(Y09, Y10, equal_var=False))
    #
    one_histogram((8,6), '', 'Fare (S$)', 'Probability', num_bin, Y09, a1_dir + '/Y2009_monthly_fares')
    one_histogram((8,6), '', 'Fare (S$)', 'Probability', num_bin, Y10, a1_dir + '/Y2010_monthly_fares')
# Example #12
def get_sgZones():
    """Return SG zones with xy coordinates attached, cached in a pickle."""
    ofpath = 'sgZone.pkl'
    if not check_path_exist(ofpath):
        sgZones = get_sg_zones()
        for zone in sgZones.values():
            zone.cCoor_xy = convert_GPS2xy(*zone.cCoor_gps)
            zone.polyPoints_xy = [convert_GPS2xy(*p) for p in zone.polyPoints_gps]
            zone.marked = False
        save_pickle_file(ofpath, sgZones)
        return sgZones
    return load_pickle_file(ofpath)
# Example #13
def get_sgRoards_xy():
    """Return SG roads as lists of xy points, cached in a pickle."""
    ofpath = 'sgRoards_xy.pkl'
    if not check_path_exist(ofpath):
        sgRoards_xy = [[convert_GPS2xy(lon, lat) for lon, lat in coords]
                       for _, coords in get_SG_roads()]
        save_pickle_file(ofpath, sgRoards_xy)
        return sgRoards_xy
    return load_pickle_file(ofpath)
def run():
    a1_dir = charts_dir + '/b_aggregated_a1 monthly fare'
    check_dir_create(a1_dir)
    #
    Y09, Y10 = load_pickle_file(driver_monthly_fare_fn)
    num_bin = 50
    #
    print 't statistics %.3f, p-value %.3f' % (stats.ttest_ind(
        Y09, Y10, equal_var=False))
    #
    one_histogram((8, 6), '', 'Fare (S$)', 'Probability', num_bin, Y09,
                  a1_dir + '/Y2009_monthly_fares')
    one_histogram((8, 6), '', 'Fare (S$)', 'Probability', num_bin, Y10,
                  a1_dir + '/Y2010_monthly_fares')
# Example #15
def gen_summary():
    """Write coefficient differences for drivers whose intellect
    coefficient is negative in both 2009 and 2010.
    """
    yearly = {}
    for year in ('2009', '2010'):
        yearly[year] = load_pickle_file(
            '%s/%s%s.pkl' % (statisticsAllDrivers_ap_dpath,
                             statisticsAllDriversIntellect_ap_prefix, year))
    # only drivers observed in both years are comparable
    shared_drivers = set(yearly['2009'].keys()) & set(yearly['2010'].keys())
    fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ap_dpath,
                             statisticsAllDriversIntellect_ap_prefix,
                             'all-negativeOnly')
    with open(fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['did', 'numY2009', 'numY2010',
                         'coefY2009', 'coefY2010', 'coefDiff'])
        for did in shared_drivers:
            num2009, coef2009 = yearly['2009'][did]
            num2010, coef2010 = yearly['2010'][did]
            # 'X' flags drivers without a fitted coefficient
            if coef2009 == 'X' or coef2010 == 'X':
                continue
            if coef2009 < 0 and coef2010 < 0:
                writer.writerow([did, num2009, num2010,
                                 coef2009, coef2010, coef2009 - coef2010])
def run(moduloIndex):
    """Round-robin drivers into numReducers subsets and process those
    subsets assigned to this worker (i % numWorker == moduloIndex).
    """
    logger.info('loading driversRelations %s; %s' % (year, depVar))
    superSet_fpath = '%s/%sFiltered-superSet-%s%s.pkl' % (if_dpath, depVar, if_prefixs, year)
    driversRelations = load_pickle_file(superSet_fpath)
    driver_subsets = [[] for _ in range(numReducers)]
    for i, did in enumerate(driversRelations.keys()):
        driver_subsets[i % numReducers].append(did)
    for i, subset in enumerate(driver_subsets):
        if i % numWorker != moduloIndex:
            continue
        pickUp_drivers = set()
        for did in subset:
            pickUp_drivers = pickUp_drivers.union(driversRelations[did])
        process_files(i, subset, pickUp_drivers)
# Example #17
def process_files(yymm):
    print 'handle the file; %s' % yymm
    ft_drivers = load_pickle_file(
        '%s/%s%s.pkl' % (full_time_driver_dir, ft_drivers_prefix, yymm))
    with open('%s/%s%s.csv' % (shift_pro_dur_dir, shift_pro_dur_prefix, yymm),
              'rt') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h: i for i, h in enumerate(headers)}
        with open('%s/%s%s.csv' % (ftd_shift_dir, ftd_shift_prefix, yymm),
                  'wt') as w_csvfile:
            writer = csv.writer(w_csvfile)
            writer.writerow(headers)
            for row in reader:
                did = row[hid['did']]
                if did not in ft_drivers:
                    continue
                writer.writerow(row)
    #
    with open('%s/%s%s.csv' % (trips_dpath, trip_prefix, yymm),
              'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h: i for i, h in enumerate(headers)}
        with open('%s/%s%s.csv' % (ftd_trips_dir, ftd_trips_prefix, yymm),
                  'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = [
                'did', 'start-time', 'duration', 'fare', 'yy', 'mm', 'dd', 'hh'
            ]
            writer.writerow(new_headers)
            #
            # filter out trips data based on two factors;
            #   1. full time driver
            #
            for row in reader:
                st_ts = eval(row[hid['start-time']])
                st_dt = datetime.datetime.fromtimestamp(st_ts)
                did = row[hid['did']]
                if did not in ft_drivers:
                    continue
                writer.writerow([
                    row[hid['did']], row[hid['start-time']],
                    row[hid['duration']], row[hid['fare']], st_dt.year - 2000,
                    st_dt.month, st_dt.day, st_dt.hour
                ])
    #
    print 'end the file; %s' % yymm
# Example #18
def run(moduloIndex):
    """Dispatch this worker's share of driver buckets to process_files."""
    logger.info('loading driversRelations %s; %s' % (year, depVar))
    superSet_fpath = '%s/%sFiltered-superSet-%s%s.pkl' % (if_dpath, depVar,
                                                          if_prefixs, year)
    driversRelations = load_pickle_file(superSet_fpath)
    # spread every driver across numReducers buckets, round-robin
    buckets = [[] for _ in range(numReducers)]
    for idx, driver_id in enumerate(driversRelations.keys()):
        buckets[idx % numReducers].append(driver_id)
    for bucket_id, bucket in enumerate(buckets):
        if bucket_id % numWorker == moduloIndex:
            related_pickups = set()
            for driver_id in bucket:
                related_pickups = related_pickups.union(driversRelations[driver_id])
            process_files(bucket_id, bucket, related_pickups)
def run():
    """Filter all-driver statistics CSVs down to the ss-driver subset.

    NOTE: the airport (ap) counterparts are currently disabled; only the
    night-safari (ns) statistics are processed.
    """
    for dpath in [statisticsSsDrivers_ns_dpath]:
        check_dir_create(dpath)
    #
    # union of ss drivers over every usable month of 2009-2010
    ssDrivers = set()
    for y in xrange(9, 11):
        for m in xrange(1, 13):
            yymm = '%02d%02d' % (y, m)
            if yymm in ['0912', '1010']:
                # both years data are corrupted
                continue
            ssDrivers = ssDrivers.union(load_pickle_file('%s/%s%s.pkl' % (ss_drivers_dpath, ss_drivers_prefix, yymm)))
    #
    prefix_pairs = [
        (statisticsAllDriversDay_ns1517_prefix, statisticsSsDriversDay_ns1517_prefix),
        (statisticsAllDriversDay_ns2023_prefix, statisticsSsDriversDay_ns2023_prefix),
        (statisticsAllDriversMonth_ns1517_prefix, statisticsSsDriversMonth_ns1517_prefix),
        (statisticsAllDriversMonth_ns2023_prefix, statisticsSsDriversMonth_ns2023_prefix),
        (statisticsAllDriversTrip_ns1517_prefix, statisticsSsDriversTrip_ns1517_prefix),
        (statisticsAllDriversTrip_ns2023_prefix, statisticsSsDriversTrip_ns2023_prefix),
    ]
    for all_dpath, ss_dpath in [(statisticsAllDrivers_ns_dpath, statisticsSsDrivers_ns_dpath)]:
        for all_prefix, ss_prefix in prefix_pairs:
            for fn in get_all_files(all_dpath, '%s*' % all_prefix):
                # file names look like <prefix>-...-<period>.csv
                period = fn[:-len('.csv')].split('-')[2]
                with open('%s/%s' % (all_dpath, fn), 'rt') as r_csvfile:
                    reader = csv.reader(r_csvfile)
                    header = reader.next()
                    hid = {h: i for i, h in enumerate(header)}
                    with open('%s/%s%s.csv' % (ss_dpath, ss_prefix, period), 'wt') as w_csvfile:
                        writer = csv.writer(w_csvfile)
                        writer.writerow(header)
                        for row in reader:
                            if int(row[hid['driverID']]) not in ssDrivers:
                                continue
                            writer.writerow(row)
# Example #20
def get_sgGrid_xy():
    """Return SG grid line segments in xy coordinates, cached in a pickle."""
    ofpath = 'sgGrid_xy.pkl'
    if check_path_exist(ofpath):
        return load_pickle_file(ofpath)
    sgGrid_xy = []
    lons, lats = generate_sg_grid()
    # vertical segments (one per longitude), then horizontal (one per latitude)
    for lon in lons:
        sgGrid_xy.append([convert_GPS2xy(lon, lats[0]),
                          convert_GPS2xy(lon, lats[-1])])
    for lat in lats:
        sgGrid_xy.append([convert_GPS2xy(lons[0], lat),
                          convert_GPS2xy(lons[-1], lat)])
    save_pickle_file(ofpath, sgGrid_xy)
    return sgGrid_xy
def gen_summary2010():
    """Join 2010 per-driver coefficients with aggregated yearly statistics.

    Loads the per-driver intellect coefficient pickle and the filtered daily
    statistics CSV, sums the daily records per driver, derives productivity
    ratios, and writes one CSV row per driver whose coefficient is valid
    (not the 'X' sentinel) and who appears in the aggregated statistics.
    """
    intellect2010_fpath = '%s/%s%s.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, '2010')
    base_columns = ['wleTripNumber', 'wleOperatingHour', 'wleFare',
                    'locTripNumber', 'locInNumber', 'locOutNumber', 'locQTime', 'locEP', 'locDuration', 'locFare']
    stat_columns = base_columns + ['wleProductivity', 'QTime/locTrip', 'EP/locTrip', 'locProductivity', 'locInRatio']
    #
    driverIntellect2010 = load_pickle_file('%s/%s%s.pkl' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversIntellect_ap_prefix, '2010'))
    df = pd.read_csv('%s/Filtered-%s%s.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversDay_ap_prefix, '2010'))
    # Aggregate daily records into per-driver yearly totals, keeping only
    # the statistic columns of interest.
    agg_df = df.groupby(['driverID']).sum().reset_index()
    keep_cn = ['driverID'] + base_columns
    agg_df = agg_df.drop([cn for cn in agg_df.columns if cn not in keep_cn], axis=1)
    #
    # Derived productivity measures.
    agg_df['wleProductivity'] = agg_df['wleFare'] / agg_df['wleOperatingHour']
    agg_df['QTime/locTrip'] = agg_df['locQTime'] / agg_df['locTripNumber']
    agg_df['EP/locTrip'] = agg_df['locEP'] / agg_df['locTripNumber']
    agg_df['locProductivity'] = agg_df['locFare'] / (agg_df['locQTime'] + agg_df['locDuration']) * SEC60
    agg_df['locInRatio'] = agg_df['locInNumber'] / agg_df['locTripNumber']
    allDrivers = set(agg_df['driverID'])
    # FIX: open the output once instead of re-opening it in append mode for
    # every driver (the original opened the file inside the loop body).
    with open(intellect2010_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['driverID', 'locInCoef'] + stat_columns)
        for did, (_, coef) in driverIntellect2010.iteritems():
            if coef == 'X':
                continue
            if did not in allDrivers:
                continue
            # Select this driver's aggregated row once, not once per column.
            driver_row = agg_df.loc[agg_df['driverID'] == did]
            new_row = [did, coef]
            for cn in stat_columns:
                new_row += driver_row[cn].tolist()
            writer.writerow(new_row)
def run():
    """Distribute 2012's drivers over parallel reducer tasks.

    Round-robins all drivers into numWorker*10 subsets and submits one
    process_files task per subset, passing along the union of each
    subset's related (pick-up) drivers.
    """
    check_dir_create(tfZ_TP_dpath)
    numWorker = 6
    init_multiprocessor(numWorker)
    numReducers = numWorker * 10
    #
    yyyy = '20%02d' % 12
    logger.info('loading driversRelations %s' % yyyy)
    driversRelations = load_pickle_file(driversRelations_fpaths[yyyy])
    # Deal the drivers round-robin into numReducers buckets.
    driver_subsets = [[] for _ in range(numReducers)]
    for i, did in enumerate(driversRelations.keys()):
        driver_subsets[i % numReducers].append(did)
    count_num_jobs = 0
    for i, driver_subset in enumerate(driver_subsets):
        # Union of every related driver across this subset.
        pickUp_drivers = set()
        for did1 in driver_subset:
            pickUp_drivers.update(driversRelations[did1])
        put_task(process_files, [yyyy, i, driver_subset, pickUp_drivers])
        count_num_jobs += 1
    end_multiprocessor(count_num_jobs)
Example #23
0
def run():
    """Fan 2012's drivers out over parallel reducer tasks.

    Splits the driver population into numWorker*10 subsets and queues a
    process_files job per subset together with the set of drivers related
    to any member of that subset.
    """
    check_dir_create(tfZ_TP_dpath)
    numWorker = 6
    numReducers = numWorker * 10
    init_multiprocessor(numWorker)
    #
    yyyy = '20%02d' % 12
    logger.info('loading driversRelations %s' % yyyy)
    driversRelations = load_pickle_file(driversRelations_fpaths[yyyy])
    # Assign drivers to buckets cyclically so subsets stay balanced.
    driver_subsets = [[] for _ in range(numReducers)]
    bucket = 0
    for did in driversRelations.keys():
        driver_subsets[bucket].append(did)
        bucket = (bucket + 1) % numReducers
    num_jobs = 0
    for i, driver_subset in enumerate(driver_subsets):
        # Collect every pick-up-related driver for this bucket.
        pickUp_drivers = set()
        for member_did in driver_subset:
            pickUp_drivers.update(driversRelations[member_did])
        put_task(process_files, [yyyy, i, driver_subset, pickUp_drivers])
        num_jobs += 1
    end_multiprocessor(num_jobs)
Example #24
0
def run(time_from, time_to):
    """Run the preprocessing pipeline: zoning, then logs, then trips.

    Each stage is skipped when its cached output file already exists.
    """
    # Stage 1: Singapore zone grid (reuse the pickle when present).
    if check_path_exist(grid_info_fn):
        hl_points, vl_points, zones = load_pickle_file(grid_info_fn)
    else:
        from taxi_common.sg_grid_zone import run as run_split_into_zones  # @UnresolvedImport
        hl_points, vl_points, zones = run_split_into_zones(rp_zone)
    # Stage 2: preprocess raw logs for the window unless already done.
    processed_log_fn = get_processed_log_fn(time_from, time_to)
    if not check_path_exist(processed_log_fn):
        from preprocess_logs import run as run_preprocess_logs
        run_preprocess_logs(hl_points, vl_points, time_from, time_to)
    # Stage 3: preprocess trips for the window unless already done.
    processed_trip_fn = get_processed_trip_fn(time_from, time_to)
    if not check_path_exist(processed_trip_fn):
        from preprocess_trips import run as run_preprocess_trips
        run_preprocess_trips(hl_points, vl_points, time_from, time_to)
Example #25
0
def run(time_from, time_to):
    """Drive the preprocessing pipeline for the [time_from, time_to] window.

    Ensures the Singapore zone grid exists, then preprocesses logs and
    trips, skipping any stage whose output file is already on disk.
    """
    # Stage 1: zone grid — build it only when the cached pickle is missing.
    grid_missing = not check_path_exist(grid_info_fn)
    if grid_missing:
        from taxi_common.sg_grid_zone import run as run_split_into_zones  # @UnresolvedImport
        hl_points, vl_points, zones = run_split_into_zones(rp_zone)
    else:
        hl_points, vl_points, zones = load_pickle_file(grid_info_fn)
    # Stage 2: log preprocessing.
    if not check_path_exist(get_processed_log_fn(time_from, time_to)):
        from preprocess_logs import run as run_preprocess_logs
        run_preprocess_logs(hl_points, vl_points, time_from, time_to)
    # Stage 3: trip preprocessing.
    if not check_path_exist(get_processed_trip_fn(time_from, time_to)):
        from preprocess_trips import run as run_preprocess_trips
        run_preprocess_trips(hl_points, vl_points, time_from, time_to)
Example #26
0
def get_driver_trajectory(did):
    """Return driver `did`'s trajectory as a list of (datetime, x, y, state).

    Scans the per-day log CSVs under `if_dpath`, keeps only files whose
    name ends with this driver's id, reads them in date order, and caches
    the assembled trajectory as a pickle so later calls are a single load.
    """
    ofpath = '%s%d.pkl' % (if_prefix, did)
    if check_path_exist(ofpath):
        dt_xy_state = load_pickle_file(ofpath)
    else:
        # Collect the dates for which this driver has a log file.
        # File names look like '<prefix><yymmdd>-<did>.csv'.
        dates = []
        for fn in get_all_files(if_dpath, '%s*.csv' % if_prefix):
            _, _date, _did = fn[:-len('.csv')].split('-')
            if int(_did) != did:
                continue
            year = 2000 + int(_date[:2])
            month, day = map(int, [_date[2:4], _date[4:6]])
            dt = datetime.datetime(year, month, day)
            dates += [dt]
        dates.sort()
        dt_xy_state = []
        for dt in dates:
            # Rebuild the yymmdd token of the file name from the date.
            yy = '%02d' % (dt.year - 2000)
            mm, dd = '%02d' % dt.month, '%02d' % dt.day
            yymmdd = yy + mm + dd
            ifpath = '%s/%s%s-%d.csv' % (if_dpath, if_prefix, yymmdd, did)
            with open(ifpath, 'rb') as logFile:
                reader = csv.reader(logFile)
                header = reader.next()
                # header: time,vehicle-id,driver-id,longitude,latitude,speed,state
                hid = {h: i for i, h in enumerate(header)}
                for row in reader:
                    # NOTE(review): `dt` is rebound here, shadowing the date
                    # loop variable — harmless, since the date is only used
                    # to build `ifpath` above.
                    dt = datetime.datetime.fromtimestamp(eval(
                        row[hid['time']]))
                    lon, lat = map(
                        eval,
                        [row[hid[cn]] for cn in ['longitude', 'latitude']])
                    x, y = GPS_xyDrawing.convert_GPS2xy(lon, lat)
                    dt_xy_state += [(dt, x, y, int(row[hid['state']]))]
        save_pickle_file(ofpath, dt_xy_state)
    return dt_xy_state
def process_month(yymm):
    """Extract queueing-time-annotated trips of the ss drivers for month `yymm`.

    Reads the month's normal trip file, its row-aligned extension file
    (driver ids, zones) and the raw log file in lockstep.  For each
    qualifying trip it replays the log stream up to the trip's start time to
    keep every driver's zone/state up to date, then writes one row to the
    ss trips CSV.  Trips are kept only when the driver is in ss_drivers, the
    day is not FRI/SAT/SUN, the hour lies in [AM10, PM8), and the start
    point falls inside the grid.  Any exception is dumped to a per-month
    traceback file and re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        yy, mm = yymm[:2], yymm[2:]
        trip_normal_fpath = '%s/20%s/%s/trips/trips-%s-normal.csv' % (
            taxi_home, yy, mm, yymm)
        trip_ext_fpath = '%s/20%s/%s/trips/trips-%s-normal-ext.csv' % (
            taxi_home, yy, mm, yymm)
        log_fpath = '%s/20%s/%s/logs/logs-%s-normal.csv' % (taxi_home, yy, mm,
                                                            yymm)
        if not check_path_exist(trip_normal_fpath):
            logger.info('The file X exists; %s' % yymm)
            return None
        ss_drivers_fpath = '%s/%s%s.pkl' % (ss_drivers_dpath,
                                            ss_drivers_prefix, yymm)
        if not check_path_exist(ss_drivers_fpath):
            logger.info('The file X exists; %s' % ss_drivers_fpath)
            return None
        ss_drivers = load_pickle_file(ss_drivers_fpath)
        x_points, y_points = get_sg_grid_xy_points()
        #
        # Skip months whose output already exists (idempotent reruns).
        ss_trips_fpath = '%s/%s%s.csv' % (ss_trips_dpath, ss_trips_prefix,
                                          yymm)
        if check_path_exist(ss_trips_fpath):
            logger.info('The file had already been processed; %s' % yymm)
            return None
        with open(ss_trips_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                'did', 'hour', 'zi', 'zj', 'time', 'day', 'month',
                'start-long', 'start-lat', 'distance', 'duration', 'fare',
                'queueingTime'
            ])
        with open(trip_normal_fpath, 'rb') as tripFileN:
            tripReaderN = csv.reader(tripFileN)
            tripHeaderN = tripReaderN.next()
            # {'trip-id': 0, 'job-id': 1, 'start-time': 2, 'end-time': 3,
            #  'start-long': 4, 'start-lat': 5, 'end-long': 6, 'end-lat': 7,
            #  'vehicle-id': 8, 'distance': 9, 'fare': 10, 'duration': 11,
            #  'start-dow': 12, 'start-day': 13, 'start-hour': 14, 'start-minute': 15,
            #  'end-dow': 16, 'end-day': 17, 'end-hour': 18, 'end-minute': 19}
            hidN = {h: i for i, h in enumerate(tripHeaderN)}
            with open(trip_ext_fpath, 'rb') as tripFileE:
                tripReaderE = csv.reader(tripFileE)
                tripHeaderE = tripReaderE.next()
                #
                # {'start-zone': 0, 'end-zone': 1, 'start-postal': 2, 'driver-id': 4, 'end-postal': 3}
                #
                hidE = {h: i for i, h in enumerate(tripHeaderE)}
                with open(log_fpath, 'rb') as logFile:
                    logReader = csv.reader(logFile)
                    logHeader = logReader.next()
                    hidL = {h: i for i, h in enumerate(logHeader)}
                    handling_day = 0
                    drivers = {}
                    # The normal and ext trip files are row-aligned: advance
                    # both readers together.
                    for rowN in tripReaderN:
                        rowE = tripReaderE.next()
                        didT = int(rowE[hidE['driver-id']])
                        if didT not in ss_drivers:
                            continue
                        tripTime = eval(rowN[hidN['start-time']])
                        cur_dtT = datetime.datetime.fromtimestamp(tripTime)
                        if handling_day != cur_dtT.day:
                            handling_day = cur_dtT.day
                            logger.info('Processing %s %dth day' %
                                        (yymm, cur_dtT.day))
                        # Keep only weekday trips inside the hour window.
                        if cur_dtT.weekday() in [FRI, SAT, SUN]:
                            continue
                        if cur_dtT.hour < AM10:
                            continue
                        if PM8 <= cur_dtT.hour:
                            continue
                        # Replay log records until the log stream catches up
                        # with the trip's start time, updating each driver's
                        # zone/state along the way.
                        # NOTE(review): logReader.next() raises StopIteration
                        # if the log ends before the trips do — verify the
                        # input files always overlap.
                        while True:
                            rowL = logReader.next()
                            logTime = eval(rowL[hidL['time']])
                            didL = int(rowL[hidL['driver-id']])
                            if didL not in ss_drivers:
                                continue
                            t = eval(rowL[hidL['time']])
                            cur_dtL = datetime.datetime.fromtimestamp(t)
                            if cur_dtL.weekday() in [FRI, SAT, SUN]:
                                continue
                            if cur_dtL.hour < AM10:
                                continue
                            if PM8 <= cur_dtL.hour:
                                continue
                            longitude, latitude = eval(
                                rowL[hidL['longitude']]), eval(
                                    rowL[hidL['latitude']])
                            # Map the GPS point to grid indices; points left
                            # of / below the grid yield a negative index.
                            zi, zj = bisect(x_points, longitude) - 1, bisect(
                                y_points, latitude) - 1
                            if zi < 0 or zj < 0:
                                continue
                            t, s = eval(rowL[hidL['time']]), eval(
                                rowL[hidL['state']])
                            z = (zi, zj)
                            cur_dt = datetime.datetime.fromtimestamp(t)
                            if handling_day != cur_dt.day:
                                handling_day = cur_dt.day
                                logger.info('Processing %s %dth day' %
                                            (yymm, cur_dt.day))
                            # Create or update this driver's tracker.
                            if not drivers.has_key(didL):
                                drivers[didL] = driver(didL, t, z, s)
                            else:
                                drivers[didL].update(t, z, s)
                            if tripTime <= logTime:
                                break
                        s_long, s_lat = eval(rowN[hidN['start-long']]), eval(
                            rowN[hidN['start-lat']])
                        zi, zj = bisect(x_points, s_long) - 1, bisect(
                            y_points, s_lat) - 1
                        if zi < 0 or zj < 0:
                            continue
                        # Only emit trips for drivers with known log state.
                        if not drivers.has_key(didT):
                            continue
                        if drivers[didT].firstFreeStateTime == -1:
                            continue
                        # Queueing time: trip start minus the time the driver
                        # entered the current zone; negative values are
                        # discarded as inconsistent.
                        queueingTime = tripTime - drivers[didT].zoneEnteredTime
                        if queueingTime < 0:
                            continue
                        with open(ss_trips_fpath, 'a') as w_csvfile:
                            writer = csv.writer(w_csvfile, lineterminator='\n')
                            writer.writerow([
                                didT, cur_dtT.hour, zi, zj, tripTime,
                                cur_dtT.day, cur_dtT.month, s_long, s_lat,
                                rowN[hidN['distance']], rowN[hidN['duration']],
                                rowN[hidN['fare']], queueingTime
                            ])
    except Exception as _:
        import sys
        # Dump the traceback to a per-script/month file, then re-raise so
        # the caller still sees the failure.
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
Example #28
0
def run():
    """Build the 2009 baseline count graph, partition it into driver groups,
    and write per-group summaries, coefficient CSVs, and graph images.

    Merges all per-file count graphs into one edge-weight mapping (cached as
    a pickle), converts it to a directed igraph graph, partitions it with
    louvain.find_partition, and emits one pickle, one coefficient CSV, and
    one plot per group plus a global summary CSV.
    """
    cg_dpath = dpaths['baseline', '2009', 'countGraph']
    cg_prefix = prefixs['baseline', '2009', 'countGraph']
    gp_dpath = dpaths['baseline', '2009', 'groupPartition']
    gp_prefix = prefixs['baseline', '2009', 'groupPartition']
    #
    check_dir_create(gp_dpath)
    #
    gp_summary_fpath = '%s/%ssummary.csv' % (gp_dpath, gp_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (gp_dpath, gp_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
    #
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow([
            'groupName', 'numDrivers', 'numRelations', 'graphComplexity',
            'tieStrength', 'contribution', 'benCon'
        ])
    #
    logger.info('Start handling SP_group_dpath')
    if not check_path_exist(gp_original_fpath):
        # Merge all per-file count graphs into one edge-weight mapping.
        original_graph = {}
        for fn in get_all_files(cg_dpath, '%s*' % cg_prefix):
            count_graph = load_pickle_file('%s/%s' % (cg_dpath, fn))
            logger.info('Start handling; %s' % fn)
            numEdges = len(count_graph)
            # FIX: guard against numEdges < 10, which made the modulo 0 and
            # raised ZeroDivisionError on `i % moduloNumber`.
            moduloNumber = max(numEdges / 10, 1)
            for i, ((did0, did1), w) in enumerate(count_graph.iteritems()):
                if i % moduloNumber == 0:
                    logger.info('Handling; %.2f' % (i / float(numEdges)))
                original_graph[did0, did1] = w
        save_pickle_file(gp_original_fpath, original_graph)
    else:
        original_graph = load_pickle_file(gp_original_fpath)
    #
    logger.info('igraph converting')
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    numEdges = len(original_graph)
    moduloNumber = max(numEdges / 10, 1)
    for i, ((did0, did1), w) in enumerate(original_graph.iteritems()):
        if i % moduloNumber == 0:
            # BUG FIX: '%' binds tighter than '/', so the original
            # ('...' % i / float(numEdges)) divided the formatted STRING by a
            # float and raised TypeError; the division must be parenthesized.
            logger.info('Handling; %.2f' % (i / float(numEdges)))
        # Lazily assign each driver an igraph vertex id on first sight.
        if did0 not in did_igid:
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if did1 not in did_igid:
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    #
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (gp_dpath, gp_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        # Per-group aggregate statistics for the summary CSV.
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                gn,
                len(drivers),
                len(weights), graphComplexity, tie_strength, contribution,
                benCon
            ])
        gl_img_fpath = '%s/%simg-%s.pdf' % (gp_dpath, gp_prefix, gn)
        layout = sg.layout("kk")
        # Label vertices only on small groups; larger plots stay unlabeled.
        if len(drivers) < 100:
            ig.plot(sg, gl_img_fpath, layout=layout, vertex_label=drivers)
        else:
            ig.plot(sg, gl_img_fpath, layout=layout)
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (gp_dpath, gp_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
Example #29
0
def run():
    """Build a directed graph from the year's positive significant relations,
    partition it into driver groups, and write per-group outputs.

    Produces: a summary CSV (one row per group), the merged relation-graph
    pickle, one igraph pickle and one coefficient CSV per group, and a
    pickle mapping group names to their driver lists.
    """
    gp_summary_fpath = '%s/%ssummary.csv' % (of_dpath, of_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (of_dpath, of_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (of_dpath, of_prefix)
    #
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow([
            'groupName', 'numDrivers', 'numRelations', 'graphComplexity',
            'tieStrength', 'contribution', 'benCon'
        ])
    logger.info('Start handling SP_group_dpath')
    # Merge every driver's positive significant relations into one mapping:
    # (did0, did1) -> coefficient.
    original_graph = {}
    for fn in get_all_files(if_dpath,
                            '%ssigRelation-%s-*.pkl' % (if_prefix, year)):
        # FIX: these are pickle files, so strip '.pkl'; the original stripped
        # '.csv', which only worked because both suffixes are 4 chars long.
        _, _, _, _, _did1 = fn[:-len('.pkl')].split('-')
        sigRelation = load_pickle_file('%s/%s' % (if_dpath, fn))
        for _did0, coef in sigRelation['pos']:
            did0, did1 = map(int, [_did0, _did1])
            original_graph[did0, did1] = coef
    save_pickle_file(gp_original_fpath, original_graph)
    #
    # Convert to an igraph graph, assigning vertex ids on first sight.
    igid, did_igid = 0, {}
    igG = ig.Graph(directed=True)
    for (did0, did1), w in original_graph.iteritems():
        if did0 not in did_igid:
            igG.add_vertex(did0)
            did_igid[did0] = igid
            igid += 1
        if did1 not in did_igid:
            igG.add_vertex(did1)
            did_igid[did1] = igid
            igid += 1
        igG.add_edge(did_igid[did0], did_igid[did1], weight=abs(w))
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, sg in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        group_fpath = '%s/%s%s.pkl' % (of_dpath, of_prefix, gn)
        sg.write_pickle(group_fpath)
        #
        drivers = [v['name'] for v in sg.vs]
        weights = [e['weight'] for e in sg.es]
        # Per-group aggregate statistics for the summary CSV.
        graphComplexity = len(weights) / float(len(drivers))
        tie_strength = sum(weights) / float(len(drivers))
        contribution = sum(weights) / float(len(weights))
        benCon = tie_strength / float(len(drivers))
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                gn,
                len(drivers),
                len(weights), graphComplexity, tie_strength, contribution,
                benCon
            ])
        gn_drivers[gn] = drivers
        gc_fpath = '%s/%scoef-%s.csv' % (of_dpath, of_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in sg.es:
                did0, did1 = [sg.vs[nIndex]['name'] for nIndex in e.tuple]
                coef = e['weight']
                writer.writerow([gn, did0, did1, coef])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
Example #30
0
def run():
    """Build encounter graphs from 2009 pairwise day counts, partition them
    into communities at several day thresholds, and save summaries/layouts.

    For each threshold thD, keeps only driver pairs that met on at least
    thD days, partitions the resulting graph with community.best_partition,
    pickles the whole graph and each community sub-graph, writes a summary
    CSV, and pre-computes a 3-D layout for graphs small enough to draw.
    """
    print 'start'
    check_dir_create(com_dir)
    #
    yyyy = '2009'
    la_fn = '2009-CD(184)-N(7003)-E(5717371).pkl'
    la_fpath = '%s/%s' % (la_dir, la_fn)
    # Recover the CD number encoded in the file name: 'CD(184)' -> 184.
    _, str_CD, _, _ = la_fn[:-len('.pkl')].split('-')
    CD = int(str_CD[len('CD('):-len(')')])
    print 'pick file loading...'
    pairs_day_counting = load_pickle_file(la_fpath)
    print 'finished'
    for thD in [18, 36, 55, 73, 82, 92]:
        thD_dpath = '%s/%s' % (com_dir, '2009-CD(%d)-thD(%d)' % (CD, thD))
        check_dir_create(thD_dpath)
        summary_fpath = '%s/%s-CD(%d)-thD(%d)-community-summary.csv' % (
            thD_dpath, yyyy, CD, thD)
        glayout_fpath = '%s/%s-CD(%d)-thD(%d)-glayout.pkl' % (thD_dpath, yyyy,
                                                              CD, thD)
        with open(summary_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = [
                'com-name', 'num-nodes', 'num-edges',
                'tie-strength(# of days encounter / # of drivers)'
            ]
            writer.writerow(new_headers)
        #
        # Keep only pairs meeting this day-count threshold.
        nxG = nx.Graph()
        for (k0, k1), num_days in pairs_day_counting.iteritems():
            if num_days < thD:
                continue
            nxG.add_edge(k0, k1, weight=num_days)

        print 'Whole graph pickling ...', yyyy, CD, thD
        nx.write_gpickle(
            nxG, '%s/%s-CD(%d)-thD(%d)-whole-N(%d)-E(%d).pkl' %
            (thD_dpath, yyyy, CD, thD, len(nxG.nodes()), len(nxG.edges())))
        n_label, n_comId = [], []
        nxId_igId = {}
        ig_nid = 0
        print 'Partitioning ...'
        partition = community.best_partition(nxG)
        for i, com in enumerate(set(partition.values())):
            # All nodes assigned to this community.
            list_nodes = [
                nodes for nodes in partition.keys() if partition[nodes] == com
            ]
            print i, 'Saving sub-graph ...'
            sub_nxG = nxG.subgraph(list_nodes)
            com_name = 'COM(%d)' % i
            com_fpath = '%s/%s-CD(%d)-thD(%d)-%s-N(%d)-E(%d).pkl' % (
                thD_dpath, yyyy, CD, thD, com_name, len(
                    sub_nxG.nodes()), len(sub_nxG.edges()))
            nx.write_gpickle(sub_nxG, com_fpath)

            # Tie strength: total encounter days over the number of drivers.
            _, _, weight = zip(
                *list(sub_nxG.edges_iter(data='weight', default=1)))
            num_nodes, num_edges = len(sub_nxG), len(weight)
            with open(summary_fpath, 'a') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                writer.writerow([
                    com_name, num_nodes, num_edges,
                    sum(weight) / float(num_nodes)
                ])
            #
            print i, 'labeling...'
            # Record each node's community id and its igraph vertex id for
            # the layout step below.
            for n in sub_nxG.nodes():
                n_label.append(n)
                n_comId.append(i)
                nxId_igId[n] = ig_nid
                ig_nid += 1
        #
        # 3-D layout only for graphs under 1000 nodes — presumably to bound
        # the cost of the 'kk' layout on large graphs.
        if len(nxG.nodes()) < 1000:
            print 'Layout calculating...'
            print datetime.datetime.now()
            Edges = [(nxId_igId[n0], nxId_igId[n1])
                     for (n0, n1) in nxG.edges()]
            print 'finish edge converting', len(Edges)
            print datetime.datetime.now()
            igG = ig.Graph(Edges, directed=False)
            layt = igG.layout('kk', dim=3)
            print 'finish layout calculation'
            print datetime.datetime.now()
            #
            save_pickle_file(glayout_fpath, [n_label, n_comId, layt, Edges])
        else:
            # Keep the file present but empty so downstream readers can tell
            # the layout was deliberately skipped.
            save_pickle_file(glayout_fpath, [])
Example #31
0
def process_files(yymm):
    """Filter month `yymm`'s statistics files down to the ss drivers.

    Loads the month's ss-driver set, then produces filtered copies of the
    shift/productivity-duration, trip, and economic-profit (ap and ns
    variants) CSVs that contain only rows belonging to those drivers.  The
    trip file is additionally reshaped to a fixed column set with the year
    and month appended.  Any exception is dumped to a per-month traceback
    file and re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle %s' % yymm)
        ssDrivers = load_pickle_file(
            '%s/%s%s.pkl' % (ss_drivers_dpath, ss_drivers_prefix, yymm))
        #
        # Shift/productivity-duration: straight row filter.
        _filter_csv_by_did(
            '%s/%s%s.csv' % (shiftProDur_dpath, shiftProDur_prefix, yymm),
            '%s/%s%s.csv' % (ssDriverShiftProDur_dpath,
                             ssDriverShiftProDur_prefix, yymm),
            ssDrivers)
        #
        # Trips: filter AND reshape, appending the year/month columns.
        yy, mm = yymm[:2], yymm[-2:]
        year, month = 2000 + int(yy), int(mm)
        with open('%s/%s%s.csv' % (trip_dpath, trip_prefix, yymm),
                  'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            with open('%s/%s%s.csv' % (ssDriverTrip_dpath,
                                       ssDriverTrip_prefix, yymm),
                      'wt') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                writer.writerow(['did', 'startTime', 'duration', 'fare',
                                 'year', 'month', 'day', 'hour'])
                for row in reader:
                    if int(row[hid['did']]) not in ssDrivers:
                        continue
                    writer.writerow([
                        row[hid['did']], row[hid['startTime']],
                        row[hid['duration']], row[hid['fare']], year, month,
                        row[hid['day']], row[hid['hour']]
                    ])
        #
        # Economic profit, ap and ns variants: straight row filters.
        _filter_csv_by_did(
            '%s/%s%s.csv' % (economicProfit_ap_dpath,
                             economicProfit_ap_prefix, yymm),
            '%s/%s%s.csv' % (ssDriverEP_ap_dpath, ssDriverEP_ap_prefix, yymm),
            ssDrivers)
        _filter_csv_by_did(
            '%s/%s%s.csv' % (economicProfit_ns_dpath,
                             economicProfit_ns_prefix, yymm),
            '%s/%s%s.csv' % (ssDriverEP_ns_dpath, ssDriverEP_ns_prefix, yymm),
            ssDrivers)
        logger.info('end %s' % yymm)
    except Exception as _:
        import sys
        # Dump the traceback to a per-script/month file, then re-raise so
        # the caller still sees the failure.
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise


def _filter_csv_by_did(in_fpath, out_fpath, ssDrivers):
    """Copy header plus the rows of in_fpath whose 'did' column is in ssDrivers."""
    with open(in_fpath, 'rt') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h: i for i, h in enumerate(headers)}
        with open(out_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile)
            writer.writerow(headers)
            for row in reader:
                if int(row[hid['did']]) not in ssDrivers:
                    continue
                writer.writerow(row)
Example #32
0
    'H',  #    hexagon2
    'd',  #    thin_diamond
    '|',  #    vline
    '_',  #    hline
    '.',  #    point
    ',',  #    pixel

    'D',  #    diamond
    '8',  #    octagon
          )

#
# The number of trips depending on hour
#
from taxi_common.file_handling_functions import load_pickle_file
# Pre-computed hour -> trip-count data.
hour_tripNum = load_pickle_file('_hour_tripNum.pkl')
#
# Figure configuration.
_figsize = (8, 6)
_fontsize = 14
# NOTE(review): assumes hour_tripNum is a dict; .values() and .keys() pair up
# only because an unmodified dict iterates both in the same order.
_data = hour_tripNum.values()
xTickMarks = hour_tripNum.keys()
_xlabel = 'Hour'
_ylabel = 'The number of trips'
#
fig = plt.figure(figsize=_figsize)
ax = fig.add_subplot(111)
ind = np.arange(len(_data))
width = 0.5  # the width of the bars
#
# Bar chart of trip counts per hour.
ax.bar(ind, _data, color='blue')
# axes and labels
def process_month(yymm):
    """Extract weekday daytime trips of the selected drivers for one month.

    Walks the normal-trip CSV in lockstep with its extension CSV (row i of
    each describes the same trip) while consuming the driver-log CSV up to
    each trip's start time to maintain per-driver zone state, then appends
    one row per qualifying trip -- including the driver's queueing time in
    the pick-up zone -- to the month's ss_trips CSV.  On any error the
    traceback is written to '<script>_<yymm>.txt' and the exception re-raised.

    NOTE(review): assumes the log file is ordered by time and that the two
    trip files are row-aligned -- confirm against the producers of the files.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        yy, mm = yymm[:2], yymm[2:]
        trip_normal_fpath = '%s/20%s/%s/trips/trips-%s-normal.csv' % (taxi_home, yy, mm, yymm)
        trip_ext_fpath = '%s/20%s/%s/trips/trips-%s-normal-ext.csv' % (taxi_home, yy, mm, yymm)
        log_fpath = '%s/20%s/%s/logs/logs-%s-normal.csv' % (taxi_home, yy, mm, yymm)
        if not check_path_exist(trip_normal_fpath):
            logger.info('The file X exists; %s' % yymm)
            return None
        ss_drivers_fpath = '%s/%s%s.pkl' % (ss_drivers_dpath, ss_drivers_prefix, yymm)
        if not check_path_exist(ss_drivers_fpath):
            logger.info('The file X exists; %s' % ss_drivers_fpath)
            return None
        ss_drivers = load_pickle_file(ss_drivers_fpath)
        # Grid boundaries used to map a (longitude, latitude) to a zone (zi, zj).
        x_points, y_points = get_sg_grid_xy_points()
        #
        ss_trips_fpath = '%s/%s%s.csv' % (ss_trips_dpath, ss_trips_prefix, yymm)
        if check_path_exist(ss_trips_fpath):
            logger.info('The file had already been processed; %s' % yymm)
            return None
        # Create the output file with only its header; data rows are appended later.
        with open(ss_trips_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['did',
                             'hour', 'zi', 'zj',
                             'time', 'day', 'month',
                             'start-long', 'start-lat',
                             'distance', 'duration', 'fare',
                             'queueingTime'])
        with open(trip_normal_fpath, 'rb') as tripFileN:
            tripReaderN = csv.reader(tripFileN)
            tripHeaderN = tripReaderN.next()
            # {'trip-id': 0, 'job-id': 1, 'start-time': 2, 'end-time': 3,
            #  'start-long': 4, 'start-lat': 5, 'end-long': 6, 'end-lat': 7,
            #  'vehicle-id': 8, 'distance': 9, 'fare': 10, 'duration': 11,
            #  'start-dow': 12, 'start-day': 13, 'start-hour': 14, 'start-minute': 15,
            #  'end-dow': 16, 'end-day': 17, 'end-hour': 18, 'end-minute': 19}
            hidN = {h: i for i, h in enumerate(tripHeaderN)}
            with open(trip_ext_fpath, 'rb') as tripFileE:
                tripReaderE = csv.reader(tripFileE)
                tripHeaderE = tripReaderE.next()
                #
                # {'start-zone': 0, 'end-zone': 1, 'start-postal': 2, 'driver-id': 4, 'end-postal': 3}
                #
                hidE = {h: i for i, h in enumerate(tripHeaderE)}
                with open(log_fpath, 'rb') as logFile:
                    logReader = csv.reader(logFile)
                    logHeader = logReader.next()
                    hidL = {h: i for i, h in enumerate(logHeader)}
                    handling_day = 0
                    drivers = {}
                    # rowN and rowE describe the same trip (files are row-aligned).
                    for rowN in tripReaderN:
                        rowE = tripReaderE.next()
                        didT = int(rowE[hidE['driver-id']])
                        if didT not in ss_drivers:
                            continue
                        # NOTE(review): eval() on CSV fields -- acceptable only
                        # because these files are produced by this pipeline.
                        tripTime = eval(rowN[hidN['start-time']])
                        cur_dtT = datetime.datetime.fromtimestamp(tripTime)
                        if handling_day != cur_dtT.day:
                            handling_day = cur_dtT.day
                            logger.info('Processing %s %dth day' % (yymm, cur_dtT.day))
                        # Keep only weekday trips between 10:00 (AM10) and 20:00 (PM8).
                        if cur_dtT.weekday() in [FRI, SAT, SUN]:
                            continue
                        if cur_dtT.hour < AM10:
                            continue
                        if PM8 <= cur_dtT.hour:
                            continue
                        # Consume log records up to this trip's start time, updating
                        # each selected driver's zone state as we go.
                        while True:
                            rowL = logReader.next()
                            logTime = eval(rowL[hidL['time']])
                            didL = int(rowL[hidL['driver-id']])
                            # NOTE(review): every 'continue' below skips the
                            # 'tripTime <= logTime' break check, so filtered log
                            # rows past the trip start keep being consumed --
                            # verify this is intended.
                            if didL not in ss_drivers:
                                continue
                            t = eval(rowL[hidL['time']])
                            cur_dtL = datetime.datetime.fromtimestamp(t)
                            if cur_dtL.weekday() in [FRI, SAT, SUN]:
                                continue
                            if cur_dtL.hour < AM10:
                                continue
                            if PM8 <= cur_dtL.hour:
                                continue
                            longitude, latitude = eval(rowL[hidL['longitude']]), eval(rowL[hidL['latitude']])
                            # bisect - 1 maps the position into the grid cell index;
                            # negative index means the point lies outside the grid.
                            zi, zj = bisect(x_points, longitude) - 1, bisect(y_points, latitude) - 1
                            if zi < 0 or zj < 0:
                                continue
                            t, s = eval(rowL[hidL['time']]), eval(rowL[hidL['state']])
                            z = (zi, zj)
                            cur_dt = datetime.datetime.fromtimestamp(t)
                            if handling_day != cur_dt.day:
                                handling_day = cur_dt.day
                                logger.info('Processing %s %dth day' % (yymm, cur_dt.day))
                            # First sighting creates the state object; later rows update it.
                            if not drivers.has_key(didL):
                                drivers[didL] = driver(didL, t, z, s)
                            else:
                                drivers[didL].update(t, z, s)
                            if tripTime <= logTime:
                                break
                        s_long, s_lat = eval(rowN[hidN['start-long']]), eval(rowN[hidN['start-lat']])
                        zi, zj = bisect(x_points, s_long) - 1, bisect(y_points, s_lat) - 1
                        if zi < 0 or zj < 0:
                            continue
                        if not drivers.has_key(didT):
                            continue
                        if drivers[didT].firstFreeStateTime == -1:
                            continue
                        # Time waited in the pick-up zone since the driver entered it.
                        queueingTime = tripTime - drivers[didT].zoneEnteredTime
                        if queueingTime < 0:
                            continue
                        with open(ss_trips_fpath, 'a') as w_csvfile:
                            writer = csv.writer(w_csvfile, lineterminator='\n')
                            writer.writerow([didT,
                                             cur_dtT.hour, zi, zj,
                                             tripTime, cur_dtT.day, cur_dtT.month,
                                             s_long, s_lat,
                                             rowN[hidN['distance']], rowN[hidN['duration']], rowN[hidN['fare']],
                                             queueingTime])
    except Exception as _:
        import sys
        # Dump the traceback to '<script>_<yymm>.txt' before re-raising.
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
Example #34
0
def run():
    """Build 2009 encounter-network communities for several day thresholds.

    For each threshold thD, keeps driver pairs that encountered each other on
    at least thD days, detects communities with Louvain partitioning, pickles
    the whole graph and every community sub-graph, writes a per-community
    summary CSV, and (for graphs under 1000 nodes) computes and pickles a 3-D
    Kamada-Kawai layout.
    """
    print 'start'
    check_dir_create(com_dir)
    #
    yyyy = '2009'
    # The link-analysis pickle encodes CD (counted days), node count N and
    # edge count E in its file name; CD is parsed back out of the name below.
    la_fn = '2009-CD(184)-N(7003)-E(5717371).pkl'
    la_fpath = '%s/%s' % (la_dir, la_fn)
    _, str_CD, _, _ = la_fn[:-len('.pkl')].split('-')
    CD = int(str_CD[len('CD('):-len(')')])
    print 'pick file loading...'
    # {(driver0, driver1): number of days the pair met}
    pairs_day_counting = load_pickle_file(la_fpath)
    print 'finished'
    # NOTE(review): thresholds look like ~10/20/30/40/45/50% of CD=184 -- confirm.
    for thD in [18, 36, 55, 73, 82, 92]:
        thD_dpath = '%s/%s' % (com_dir, '2009-CD(%d)-thD(%d)' % (CD, thD))
        check_dir_create(thD_dpath)
        summary_fpath = '%s/%s-CD(%d)-thD(%d)-community-summary.csv' % (thD_dpath, yyyy, CD, thD)
        glayout_fpath = '%s/%s-CD(%d)-thD(%d)-glayout.pkl' % (thD_dpath, yyyy, CD, thD)
        # Start the summary CSV with its header row only; rows are appended per community.
        with open(summary_fpath, 'wb') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = ['com-name', 'num-nodes', 'num-edges', 'tie-strength(# of days encounter / # of drivers)']
            writer.writerow(new_headers)
        #
        # Keep only pairs that met on at least thD days.
        nxG = nx.Graph()
        for (k0, k1), num_days in pairs_day_counting.iteritems():
            if num_days < thD:
                continue
            nxG.add_edge(k0, k1, weight=num_days)

        print 'Whole graph pickling ...', yyyy, CD, thD
        nx.write_gpickle(nxG, '%s/%s-CD(%d)-thD(%d)-whole-N(%d)-E(%d).pkl' % (thD_dpath, yyyy, CD, thD,
                                                                              len(nxG.nodes()), len(nxG.edges())))
        n_label, n_comId = [], []
        nxId_igId = {}
        ig_nid = 0
        print 'Partitioning ...'
        # Louvain community detection: maps node -> community id.
        partition = community.best_partition(nxG)
        for i, com in enumerate(set(partition.values())):
            list_nodes = [nodes for nodes in partition.keys() if partition[nodes] == com]
            print i, 'Saving sub-graph ...'
            sub_nxG = nxG.subgraph(list_nodes)
            com_name = 'COM(%d)' % i
            com_fpath = '%s/%s-CD(%d)-thD(%d)-%s-N(%d)-E(%d).pkl' % (thD_dpath, yyyy, CD, thD,
                                                               com_name, len(sub_nxG.nodes()), len(sub_nxG.edges()))
            nx.write_gpickle(sub_nxG, com_fpath)

            _, _, weight = zip(*list(sub_nxG.edges_iter(data='weight', default=1)))
            num_nodes, num_edges = len(sub_nxG), len(weight)
            # Tie strength = total encounter days / number of drivers (see header).
            with open(summary_fpath, 'a') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                writer.writerow([com_name, num_nodes, num_edges, sum(weight) / float(num_nodes)])
            #
            print i, 'labeling...'
            # Record each node's community id and assign it a dense igraph index.
            for n in sub_nxG.nodes():
                n_label.append(n)
                n_comId.append(i)
                nxId_igId[n] = ig_nid
                ig_nid += 1
        #
        # 3-D Kamada-Kawai layout only for small graphs; otherwise pickle an empty list.
        if len(nxG.nodes()) < 1000:
            print 'Layout calculating...'
            print datetime.datetime.now()
            Edges = [(nxId_igId[n0], nxId_igId[n1]) for (n0, n1) in nxG.edges()]
            print 'finish edge converting', len(Edges)
            print datetime.datetime.now()
            igG = ig.Graph(Edges, directed=False)
            layt = igG.layout('kk', dim=3)
            print 'finish layout calculation'
            print datetime.datetime.now()
            #
            save_pickle_file(glayout_fpath, [n_label, n_comId, layt, Edges])
        else:
            save_pickle_file(glayout_fpath, [])
def process_file(tm, year):
    """Partition the influence graph for (tm, year) into driver groups.

    Merges every influence-graph pickle into one directed weighted graph,
    runs Louvain community detection, and for each detected group writes a
    graph pickle, a plot, a coefficient CSV and a summary row; finally the
    group->drivers mapping is pickled.
    """
    ig_dpath = dpaths[tm, year, 'influenceGraph']
    ig_prefix = prefixs[tm, year, 'influenceGraph']
    gp_dpath = dpaths[tm, year, 'groupPartition']
    gp_prefix = prefixs[tm, year, 'groupPartition']
    check_dir_create(gp_dpath)
    #
    gp_summary_fpath = '%s/%ssummary.csv' % (gp_dpath, gp_prefix)
    gp_original_fpath = '%s/%soriginal.pkl' % (gp_dpath, gp_prefix)
    gp_drivers_fpath = '%s/%sdrivers.pkl' % (gp_dpath, gp_prefix)
    # Start the summary CSV with its header row only; group rows are appended.
    with open(gp_summary_fpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        writer.writerow(['groupName', 'numDrivers', 'numRelations', 'graphComplexity', 'tieStrength', 'contribution', 'benCon'])
    #
    # Merge every per-file regression graph into a single (did0, did1) -> w map.
    logger.info('Start handling SP_group_dpath')
    merged_graph = {}
    for fn in get_all_files(ig_dpath, '%s*' % ig_prefix):
        partial_graph = load_pickle_file('%s/%s' % (ig_dpath, fn))
        for (did0, did1), w in partial_graph.iteritems():
            merged_graph[(did0, did1)] = w
    save_pickle_file(gp_original_fpath, merged_graph)
    #
    # Build the directed igraph; vertex_index maps driver id -> igraph vertex id.
    vertex_index = {}
    igG = ig.Graph(directed=True)
    for (did0, did1), w in merged_graph.iteritems():
        for did in (did0, did1):
            if did not in vertex_index:
                igG.add_vertex(did)
                vertex_index[did] = len(vertex_index)
        # Edge weight is the magnitude of the regression coefficient.
        igG.add_edge(vertex_index[did0], vertex_index[did1], weight=abs(w))
    #
    logger.info('Partitioning')
    part = louvain.find_partition(igG, method='Modularity', weight='weight')
    logger.info('Each group pickling and summary')
    gn_drivers = {}
    for i, subgraph in enumerate(part.subgraphs()):
        gn = 'G(%d)' % i
        subgraph.write_pickle('%s/%s%s.pkl' % (gp_dpath, gp_prefix, gn))
        #
        member_ids = [v['name'] for v in subgraph.vs]
        edge_weights = [e['weight'] for e in subgraph.es]
        num_members = float(len(member_ids))
        graph_complexity = len(edge_weights) / num_members
        tie_strength = sum(edge_weights) / num_members
        contribution = sum(edge_weights) / float(len(edge_weights))
        ben_con = tie_strength / num_members
        with open(gp_summary_fpath, 'a') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([gn, len(member_ids), len(edge_weights), graph_complexity, tie_strength, contribution, ben_con])
        # Plot the group; label vertices only when the graph is small enough to read.
        gl_img_fpath = '%s/%simg-%s.pdf' % (gp_dpath, gp_prefix, gn)
        layout = subgraph.layout("kk")
        if len(member_ids) < 100:
            ig.plot(subgraph, gl_img_fpath, layout=layout, vertex_label=member_ids)
        else:
            ig.plot(subgraph, gl_img_fpath, layout=layout)
        gn_drivers[gn] = member_ids
        # One coefficient row per directed relation inside the group.
        gc_fpath = '%s/%scoef-%s.csv' % (gp_dpath, gp_prefix, gn)
        with open(gc_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['groupName', 'did0', 'did1', 'coef'])
            for e in subgraph.es:
                src_did, dst_did = [subgraph.vs[vIdx]['name'] for vIdx in e.tuple]
                writer.writerow([gn, src_did, dst_did, e['weight']])
    save_pickle_file(gp_drivers_fpath, gn_drivers)
Example #36
0
def process_files(yymm):
    print 'handle the file; %s' % yymm
    #
    # initialize csv_files
    #
    # general productivities
    with open('%s/%s%s.csv' % (ftd_gen_stat_dir, ftd_gen_stat_prefix, yymm), 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile)
        headers = ['yy', 'mm', 'did', 'prod']
        writer.writerow(headers)
    # airport and night safari productivities
    for dn, fn_prefix in [(ftd_prev_in_ap_stat_dir, ftd_prev_in_ap_stat_prefix),
                          (ftd_prev_out_ap_stat_dir, ftd_prev_out_ap_stat_prefix),
                          (ftd_prev_in_ns_stat_dir, ftd_prev_in_ns_stat_prefix),
                          (ftd_prev_out_ns_stat_dir, ftd_prev_out_ns_stat_prefix)]:
        with open('%s/%s%s.csv' % (dn, fn_prefix, yymm), 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile)
            headers = ['yy', 'mm', 'did', 'prod', 'eco-profit']
            writer.writerow(headers)
    #
    full_dids = sorted(load_pickle_file('%s/%s%s.pkl' % (ftd_list_dir, ftd_list_prefix, yymm)))
    s_df = pd.read_csv('%s/%s%s.csv' % (ftd_shift_dir, ftd_shift_prefix, yymm))
    trip_df = pd.read_csv('%s/%s%s.csv' % (ftd_trips_dir, ftd_trips_prefix, yymm))
    ap_trip_df = pd.read_csv('%s/%s%s.csv' % (ap_ep_dir, ap_ep_prefix, yymm))
    ns_trip_df = pd.read_csv('%s/%s%s.csv' % (ns_ep_dir, ns_ep_prefix, yymm))
    #
    yy, mm = int(yymm[:2]), int(yymm[2:])
    for did in full_dids:
        #
        # General
        #
        did_sh = s_df[(s_df['did'] == did)]
        pro_dur = sum(did_sh['pro-dur']) * SEC60
        did_wt = trip_df[(trip_df['did'] == did)]
        total_fare = sum(did_wt['fare'])
        if pro_dur > 0 and total_fare != 0:
            total_prod = total_fare / float(pro_dur)
            with open('%s/%s%s.csv' % (ftd_gen_stat_dir, ftd_gen_stat_prefix, yymm), 'a') as w_csvfile:
                writer = csv.writer(w_csvfile)
                writer.writerow([yy, mm, did, total_prod])
        #
        # airport trips
        #
        did_ap = ap_trip_df[(ap_trip_df['did'] == did)]
        prev_in_ap_trip = did_ap[(did_ap['trip-mode'] == DIn_PIn)]
        prev_out_ap_trip = did_ap[(did_ap['trip-mode'] == DOut_PIn)]
        #
        if len(did_ap) != 0:
            df_dir_path_prefix = [(prev_in_ap_trip, ftd_prev_in_ap_stat_dir, ftd_prev_in_ap_stat_prefix),
                                  (prev_out_ap_trip, ftd_prev_out_ap_stat_dir, ftd_prev_out_ap_stat_prefix)]
            calc_drivers_monthly_eco_profit(yymm, yy, mm, did, df_dir_path_prefix)
        #
        # night safari trips
        #
        did_ns = ns_trip_df[(ns_trip_df['did'] == did)]
        prev_in_ns_trip = did_ns[(did_ns['trip-mode'] == DIn_PIn)]
        prev_out_ns_trip = did_ns[(did_ns['trip-mode'] == DOut_PIn)]
        #
        if len(did_ns) != 0:
            df_dir_path_prefix = [(prev_in_ns_trip, ftd_prev_in_ns_stat_dir, ftd_prev_in_ns_stat_prefix),
                                  (prev_out_ns_trip, ftd_prev_out_ns_stat_dir, ftd_prev_out_ns_stat_prefix)]
            calc_drivers_monthly_eco_profit(yymm, yy, mm, did, df_dir_path_prefix)
    print 'End the file; %s' % yymm
Example #37
0
def process_file(yymm):
    """Compute queueing times for airport (AP) and Night Safari (NS) pick-ups.

    For each trip in the month's filtered trip file, the queue-join time is
    the previous trip's end time (mode DIn_PIn: the driver was already inside
    the zone) or the last recorded zone-crossing before the trip start (mode
    DOut_PIn: the driver drove in).  Rows whose queueing time is at least
    Q_LIMIT_MIN are appended to the AP / NS queueing-time CSVs.  On any error
    the traceback is written to '<script>_<yymm>.txt' and re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        queueingTime_ap_fpath = '%s/%s%s.csv' % (queueingTime_ap_dpath,
                                                 queueingTime_ap_prefix, yymm)
        queueingTime_ns_fpath = '%s/%s%s.csv' % (queueingTime_ns_dpath,
                                                 queueingTime_ns_prefix, yymm)
        if check_path_exist(queueingTime_ap_fpath) and check_path_exist(
                queueingTime_ns_fpath):
            logger.info('The file had already been processed; %s' % yymm)
            return
        #
        logger.info('load pickle files; %s' % yymm)
        ap_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ap_dpath,
                                        crossingTime_ap_prefix, yymm)
        ns_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ns_dpath,
                                        crossingTime_ns_prefix, yymm)
        # {vid: sorted crossing timestamps} for each location
        crossingTime_ap = load_pickle_file(ap_pkl_fpath)
        crossingTime_ns = load_pickle_file(ns_pkl_fpath)
        #
        # Initialise both outputs with a header row only; data is appended later.
        logger.info('initiate csv files; %s' % yymm)
        with open(queueingTime_ap_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                'did', 'startTime', 'endTime', 'duration', 'fare', 'tripMode',
                'queueJoinTime', 'queueingTime', 'year', 'month', 'day',
                'hour', 'pickUpTerminalAP', 'prevEndTerminalAP'
            ])
        with open(queueingTime_ns_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow([
                'did', 'startTime', 'endTime', 'duration', 'fare', 'tripMode',
                'queueJoinTime', 'queueingTime', 'year', 'month', 'day', 'hour'
            ])
        #
        logger.info('start recording; %s' % yymm)
        with open('%s/Filtered-%s%s.csv' % (trip_dpath, trip_prefix, yymm),
                  'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = next(reader)
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                did = row[hid['did']]
                et, duration = row[hid['endTime']], row[hid['duration']]
                fare = row[hid['fare']]
                year, month = row[hid['year']], row[hid['month']]
                day, hour = row[hid['day']], row[hid['hour']]
                pickUpTerminalAP = row[hid['pickUpTerminalAP']]
                prevEndTerminalAP = row[hid['prevEndTerminalAP']]
                #
                ap_tm = int(row[hid['tripModeAP']])
                ns_tm = int(row[hid['tripModeNS']])
                # NOTE(review): eval() on CSV fields -- acceptable only because
                # the trip files are produced by this pipeline (trusted input).
                vid = row[hid['vid']]
                st = eval(row[hid['startTime']])
                prev_tet = eval(row[hid['prevTripEndTime']])
                #
                # Airport trip
                #
                # BUGFIX: the original test used 'or'
                # ('ap_tm != DIn_POut or ap_tm != DOut_POut'), which is true for
                # every value of ap_tm; 'and' restricts the branch to the
                # pick-up-inside modes as intended.
                if ap_tm != DIn_POut and ap_tm != DOut_POut:
                    queueing_time, queue_join_time = None, None
                    if ap_tm == DIn_PIn:
                        # Driver stayed inside: queue began when the previous trip ended.
                        queue_join_time = prev_tet
                        queueing_time = st - queue_join_time
                    elif ap_tm == DOut_PIn:
                        # Driver drove in: queue began at the last crossing before st.
                        try:
                            i = bisect(crossingTime_ap[vid], st)
                            queue_join_time = crossingTime_ap[vid][
                                i - 1] if i != 0 else crossingTime_ap[vid][0]
                            queueing_time = st - queue_join_time
                        except KeyError:
                            pass  # no crossing record for this vehicle
                    if queueing_time is not None and Q_LIMIT_MIN <= queueing_time:
                        append_record(queueingTime_ap_fpath, [
                            did, st, et, duration, fare, ap_tm,
                            queue_join_time, queueing_time, year, month, day,
                            hour, pickUpTerminalAP, prevEndTerminalAP
                        ])
                #
                # Night Safari trip (same logic with NS crossings; shorter row)
                #
                # BUGFIX: 'or' -> 'and', as above.
                if ns_tm != DIn_POut and ns_tm != DOut_POut:
                    queueing_time, queue_join_time = None, None
                    if ns_tm == DIn_PIn:
                        queue_join_time = prev_tet
                        queueing_time = st - queue_join_time
                    elif ns_tm == DOut_PIn:
                        try:
                            i = bisect(crossingTime_ns[vid], st)
                            queue_join_time = crossingTime_ns[vid][
                                i - 1] if i != 0 else crossingTime_ns[vid][0]
                            queueing_time = st - queue_join_time
                        except KeyError:
                            pass
                    if queueing_time is not None and Q_LIMIT_MIN <= queueing_time:
                        append_record(queueingTime_ns_fpath, [
                            did, st, et, duration, fare, ns_tm,
                            queue_join_time, queueing_time, year, month, day,
                            hour
                        ])
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        import sys
        # Dump the traceback to '<script>_<yymm>.txt' before re-raising.
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
Example #38
0
import __init__
'''
Sum the relation-sign counters (significant / insignificant, positive /
negative coefficients) over every influence-graph count pickle for one
dependent variable, then print the combined totals.
'''

from community_analysis import dpaths, prefixs
#
from taxi_common.file_handling_functions import get_all_files, load_pickle_file

# NOTE(review): 'year' is not used below -- possibly consumed elsewhere; verify.
year = '20%02d' % 9
# depVar = 'roamingTime'
depVar = 'interTravelTime'
#
#
of_dpath = dpaths[depVar, 'influenceGraph']
of_prefixs = prefixs[depVar, 'influenceGraph']

# Running totals over all count files, one bucket per relation category.
countRelationWhole = {k: 0 for k in ['sigPos', 'sigNeg', 'XsigPos', 'XsigNeg']}

for fn in get_all_files(of_dpath, '%scount-*' % of_prefixs):
    print fn
    fpath = '%s/%s' % (of_dpath, fn)
    countRelation = load_pickle_file(fpath)
    for n in ['sigPos', 'sigNeg', 'XsigPos', 'XsigNeg']:
        countRelationWhole[n] += countRelation[n]

print countRelationWhole
Example #39
0
def aggregate_dayBased(yymm):
    print 'handle the file; %s' % yymm
    #
    for hours, fpath in [([15, 16, 17, 18, 19], ssDriversStatistics_ns1519_fpath),
                         ([20, 21, 22, 23, 0], ssDriversStatistics_ns2000_fpath)]:
        shift_df = pd.read_csv('%s/%s%s.csv' % (ssDriverShiftProDur_dpath, ssDriverShiftProDur_prefix, yymm))
        shift_df = shift_df[shift_df['hh'].isin(hours)]
        all_trip_df = pd.read_csv('%s/%s%s.csv' % (ssDriverTrip_dpath, ssDriverTrip_prefix, yymm))
        all_trip_df = all_trip_df[all_trip_df['hour'].isin(hours)]
        EP_df = pd.read_csv('%s/%s%s.csv' % (ssDriverEP_ns_dpath, ssDriverEP_ns_prefix, yymm))
        EP_df = EP_df[EP_df['hour'].isin(hours)]
        ssDrivers = map(int, load_pickle_file('%s/%s%s.pkl' % (ss_drivers_dpath, ss_drivers_prefix, yymm)))
        days = set(EP_df['day'])
        #
        yy, mm = int(yymm[:2]), int(yymm[2:])
        for dd in days:
            day_all_trip_df = all_trip_df[(all_trip_df['day'] == dd)]
            day_loc_trip_df = EP_df[(EP_df['day'] == dd)]
            day_shift_df = shift_df[(shift_df['dd'] == dd)]
            for did in ssDrivers:
                #
                # Specific location
                #
                d_loc_trip = day_loc_trip_df[(day_loc_trip_df['did'] == did)]
                if len(d_loc_trip) == 0:
                    continue
                loc_num = len(d_loc_trip['fare'])
                loc_dur = sum(d_loc_trip['duration'])
                loc_fare = sum(d_loc_trip['fare'])
                loc_ep = sum(d_loc_trip['economicProfit'])
                loc_qtime = sum(d_loc_trip['queueingTime'])
                #
                # All
                #
                d_all_trip = day_all_trip_df[(day_all_trip_df['did'] == did)]
                d_shift = day_shift_df[(day_shift_df['did'] == did)]
                all_num = len(d_all_trip['fare'])
                pro_dur = sum(d_shift['pro-dur']) * SEC60
                all_fare = sum(d_all_trip['fare'])
                #
                d_loc_trip_in = d_loc_trip[(d_loc_trip['tripMode'] == DIn_PIn)]
                locIn_num = len(d_loc_trip_in['fare'])
                locIn_dur = sum(d_loc_trip_in['duration'])
                locIn_fare = sum(d_loc_trip_in['fare'])
                locIn_ep = sum(d_loc_trip_in['economicProfit'])
                locIn_qtime = sum(d_loc_trip_in['queueingTime'])
                #
                d_loc_trip_out = d_loc_trip[(d_loc_trip['tripMode'] == DOut_PIn)]
                locOut_num = len(d_loc_trip_out['fare'])
                locOut_dur = sum(d_loc_trip_out['duration'])
                locOut_fare = sum(d_loc_trip_out['fare'])
                locOut_ep = sum(d_loc_trip_out['economicProfit'])
                locOut_qtime = sum(d_loc_trip_out['queueingTime'])
                #
                with open(fpath, 'a') as w_csvfile:
                    writer = csv.writer(w_csvfile, lineterminator='\n')
                    writer.writerow([yy, mm, dd, did,
                                     all_num, pro_dur, all_fare,
                                     loc_num, loc_dur, loc_fare, loc_ep, loc_qtime,
                                     locIn_num, locIn_dur, locIn_fare, locIn_ep, locIn_qtime,
                                     locOut_num, locOut_dur, locOut_fare, locOut_ep, locOut_qtime])
def process_file(yymm):
    """Record airport (AP) and Night Safari (NS) queueing times for one month.

    A trip's queue-join time is the previous trip's end time (mode DIn_PIn:
    the driver was already inside the zone) or the latest recorded zone
    crossing before the trip start (mode DOut_PIn: the driver drove in).
    Rows with queueing time of at least Q_LIMIT_MIN are appended to the
    AP / NS queueing-time CSVs.  On failure the traceback is saved to
    '<script>_<yymm>.txt' and the exception re-raised.
    """
    from traceback import format_exc
    try:
        logger.info('handle the file; %s' % yymm)
        queueingTime_ap_fpath = '%s/%s%s.csv' % (queueingTime_ap_dpath, queueingTime_ap_prefix, yymm)
        queueingTime_ns_fpath = '%s/%s%s.csv' % (queueingTime_ns_dpath, queueingTime_ns_prefix, yymm)
        if check_path_exist(queueingTime_ap_fpath) and check_path_exist(queueingTime_ns_fpath):
            logger.info('The file had already been processed; %s' % yymm)
            return
        #
        logger.info('load pickle files; %s' % yymm)
        ap_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ap_dpath, crossingTime_ap_prefix, yymm)
        ns_pkl_fpath = '%s/%s%s.pkl' % (crossingTime_ns_dpath, crossingTime_ns_prefix, yymm)
        # {vid: sorted crossing timestamps} for each location
        crossingTime_ap, crossingTime_ns = load_pickle_file(ap_pkl_fpath), load_pickle_file(ns_pkl_fpath)
        #
        # Initialise both outputs with a header row only; rows are appended later.
        logger.info('initiate csv files; %s' % yymm)
        with open(queueingTime_ap_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['did',
                             'startTime', 'endTime', 'duration', 'fare',
                             'tripMode', 'queueJoinTime', 'queueingTime',
                             'year', 'month', 'day', 'hour',
                             'pickUpTerminalAP', 'prevEndTerminalAP'])
        with open(queueingTime_ns_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(['did',
                             'startTime', 'endTime', 'duration', 'fare',
                             'tripMode', 'queueJoinTime', 'queueingTime',
                             'year', 'month', 'day', 'hour'])
        #
        logger.info('start recording; %s' % yymm)
        with open('%s/Filtered-%s%s.csv' % (trip_dpath, trip_prefix, yymm), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = next(reader)
            hid = {h: i for i, h in enumerate(headers)}
            for row in reader:
                did = row[hid['did']]
                et, duration = row[hid['endTime']], row[hid['duration']]
                fare = row[hid['fare']]
                year, month = row[hid['year']], row[hid['month']]
                day, hour = row[hid['day']], row[hid['hour']]
                pickUpTerminalAP, prevEndTerminalAP = row[hid['pickUpTerminalAP']], row[hid['prevEndTerminalAP']]
                #
                ap_tm, ns_tm = int(row[hid['tripModeAP']]), int(row[hid['tripModeNS']])
                # NOTE(review): eval() on CSV fields -- acceptable only because
                # the trip files are produced by this pipeline (trusted input).
                vid, st, prev_tet = row[hid['vid']], eval(row[hid['startTime']]), eval(row[hid['prevTripEndTime']])
                #
                # Airport trip
                #
                # BUGFIX: the original test used 'or' ('ap_tm != DIn_POut or
                # ap_tm != DOut_POut'), which is always true; 'and' restricts
                # the branch to the pick-up-inside modes as intended.
                if ap_tm != DIn_POut and ap_tm != DOut_POut:
                    queueing_time, queue_join_time = None, None
                    if ap_tm == DIn_PIn:
                        # Driver stayed inside: queue began when the previous trip ended.
                        queue_join_time = prev_tet
                        queueing_time = st - queue_join_time
                    elif ap_tm == DOut_PIn:
                        # Driver drove in: queue began at the last crossing before st.
                        try:
                            i = bisect(crossingTime_ap[vid], st)
                            queue_join_time = crossingTime_ap[vid][i - 1] if i != 0 else crossingTime_ap[vid][0]
                            queueing_time = st - queue_join_time
                        except KeyError:
                            pass  # no crossing record for this vehicle
                    if queueing_time is not None and Q_LIMIT_MIN <= queueing_time:
                        append_record(queueingTime_ap_fpath,
                                      [did,
                                       st, et, duration, fare,
                                       ap_tm, queue_join_time, queueing_time,
                                       year, month, day, hour,
                                       pickUpTerminalAP, prevEndTerminalAP])
                #
                # Night Safari trip (same logic with NS crossings; shorter row)
                #
                # BUGFIX: 'or' -> 'and', as above.
                if ns_tm != DIn_POut and ns_tm != DOut_POut:
                    queueing_time, queue_join_time = None, None
                    if ns_tm == DIn_PIn:
                        queue_join_time = prev_tet
                        queueing_time = st - queue_join_time
                    elif ns_tm == DOut_PIn:
                        try:
                            i = bisect(crossingTime_ns[vid], st)
                            queue_join_time = crossingTime_ns[vid][i - 1] if i != 0 else crossingTime_ns[vid][0]
                            queueing_time = st - queue_join_time
                        except KeyError:
                            pass
                    if queueing_time is not None and Q_LIMIT_MIN <= queueing_time:
                        append_record(queueingTime_ns_fpath,
                                      [did,
                                       st, et, duration, fare,
                                       ns_tm, queue_join_time, queueing_time,
                                       year, month, day, hour])
        logger.info('end the file; %s' % yymm)
    except Exception as _:
        import sys
        # Dump the traceback to '<script>_<yymm>.txt' before re-raising.
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise
def process_file(yymm):
    ap_pkl_file_path = '%s/%s%s.pkl' % (ap_crossing_dir, ap_crossing_prefix,
                                        yymm)
    ns_pkl_file_path = '%s/%s%s.pkl' % (ns_crossing_dir, ns_crossing_prefix,
                                        yymm)
    if not (check_path_exist(ap_pkl_file_path)
            and check_path_exist(ns_pkl_file_path)):
        return None
    #
    # Load pickle files
    #
    ap_crossing_time, ns_crossing_time = load_pickle_file(
        ap_pkl_file_path), load_pickle_file(ns_pkl_file_path)
    #
    # Initiate csv files
    #
    ap_trip_fpath = '%s/%s%s.csv' % (ap_trips_dir, ap_trip_prefix, yymm)
    ns_trip_fpath = '%s/%s%s.csv' % (ns_trips_dir, ns_trip_prefix, yymm)
    if check_path_exist(ap_trip_fpath) and check_path_exist(ns_trip_fpath):
        return None
    print 'handle the file; %s' % yymm
    for fpath in [ap_trip_fpath, ns_trip_fpath]:
        with open(fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            new_headers = [
                'tid', 'vid', 'did', 'start-time', 'end-time', 'duration',
                'fare', 'prev-trip-end-time', 'trip-mode', 'queue—join-time',
                'queueing-time'
            ]
            writer.writerow(new_headers)
    #
    with open('%s/%s%s.csv' % (trips_dpath, trip_prefix, yymm),
              'rb') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h: i for i, h in enumerate(headers)}
        for row in reader:
            tid, did = row[hid['tid']], row[hid['did']]
            et, duration = row[hid['end-time']], row[hid['duration']]
            fare = row[hid['fare']]
            #
            ap_tm, ns_tm = int(row[hid['ap-trip-mode']]), int(
                row[hid['ns-trip-mode']])
            vid, st, prev_tet = row[hid['vid']], eval(
                row[hid['start-time']]), eval(row[hid['prev-trip-end-time']])
            #
            for tm, crossing_time, fpath in [
                (ap_tm, ap_crossing_time, ap_trip_fpath),
                (ns_tm, ns_crossing_time, ns_trip_fpath)
            ]:
                if tm == DIn_POut or tm == DOut_POut:
                    continue
                if tm == DIn_PIn:
                    queue_join_time = prev_tet
                elif tm == DOut_PIn:
                    try:
                        i = bisect(crossing_time[vid], st)
                    except KeyError:
                        print '%s-tid-%s' % (yymm, row[hid['tid']])
                        continue
                    queue_join_time = crossing_time[vid][
                        i - 1] if i != 0 else crossing_time[vid][0]
                with open(fpath, 'a') as w_csvfile:
                    writer = csv.writer(w_csvfile, lineterminator='\n')
                    queueing_time = st - queue_join_time
                    if queueing_time < Q_LIMIT_MIN:
                        queueing_time = Q_LIMIT_MIN
                    new_row = [
                        tid, vid, did, st, et, duration, fare, prev_tet, tm,
                        queue_join_time, queueing_time
                    ]
                    writer.writerow(new_row)
    print 'end the file; %s' % yymm
def _filter_rows_by_driver(src_fpath, dst_fpath, ssDrivers):
    """Copy src_fpath to dst_fpath, keeping the header row and only the
    data rows whose integer 'did' column is a member of ssDrivers."""
    with open(src_fpath, 'rt') as r_csvfile:
        reader = csv.reader(r_csvfile)
        headers = reader.next()
        hid = {h: i for i, h in enumerate(headers)}
        with open(dst_fpath, 'wt') as w_csvfile:
            writer = csv.writer(w_csvfile)
            writer.writerow(headers)
            for row in reader:
                if int(row[hid['did']]) in ssDrivers:
                    writer.writerow(row)


def process_files(yymm):
    """Restrict one month's shift, trip, and economic-profit CSVs to the
    selected-subset drivers (ssDrivers).

    yymm -- 'YYMM' month label used in every input/output file name.

    Side effects: writes four filtered CSVs; on any failure, dumps the
    traceback to '<script>_<yymm>.txt' and re-raises.
    """
    from traceback import format_exc
    try:
        logger.info('handle %s' % yymm)
        ssDrivers = load_pickle_file('%s/%s%s.pkl' % (ss_drivers_dpath, ss_drivers_prefix, yymm))
        #
        # Shift/productive-duration records: plain row filter.
        _filter_rows_by_driver(
            '%s/%s%s.csv' % (shiftProDur_dpath, shiftProDur_prefix, yymm),
            '%s/%s%s.csv' % (ssDriverShiftProDur_dpath, ssDriverShiftProDur_prefix, yymm),
            ssDrivers)
        #
        # Trip records: filter AND re-shape rows, adding year/month columns.
        yy, mm = yymm[:2], yymm[-2:]
        year, month = 2000 + int(yy), int(mm)
        with open('%s/%s%s.csv' % (trip_dpath, trip_prefix, yymm), 'rb') as r_csvfile:
            reader = csv.reader(r_csvfile)
            headers = reader.next()
            hid = {h: i for i, h in enumerate(headers)}
            with open('%s/%s%s.csv' % (ssDriverTrip_dpath, ssDriverTrip_prefix, yymm), 'wt') as w_csvfile:
                writer = csv.writer(w_csvfile, lineterminator='\n')
                new_headers = ['did', 'startTime', 'duration', 'fare', 'year', 'month', 'day', 'hour']
                writer.writerow(new_headers)
                for row in reader:
                    if int(row[hid['did']]) not in ssDrivers:
                        continue
                    writer.writerow([row[hid['did']],
                                     row[hid['startTime']],
                                     row[hid['duration']],
                                     row[hid['fare']],
                                     year, month, row[hid['day']], row[hid['hour']]])
        #
        # Economic-profit records (airport, then Night Safari): plain filters.
        _filter_rows_by_driver(
            '%s/%s%s.csv' % (economicProfit_ap_dpath, economicProfit_ap_prefix, yymm),
            '%s/%s%s.csv' % (ssDriverEP_ap_dpath, ssDriverEP_ap_prefix, yymm),
            ssDrivers)
        _filter_rows_by_driver(
            '%s/%s%s.csv' % (economicProfit_ns_dpath, economicProfit_ns_prefix, yymm),
            '%s/%s%s.csv' % (ssDriverEP_ns_dpath, ssDriverEP_ns_prefix, yymm),
            ssDrivers)
        logger.info('end %s' % yymm)
    except Exception as _:
        import sys
        # Persist the traceback so failures in worker processes are
        # diagnosable, then re-raise to fail loudly.
        with open('%s_%s.txt' % (sys.argv[0], yymm), 'w') as f:
            f.write(format_exc())
        raise