def extract_from_raw(yymmdd, hh=None, taxi_id=None):
    """Extract one taxi's raw log rows for a single hour into a test CSV.

    Scans the monthly raw log ``logs-<yymm>-normal.csv`` under
    ``TAXI_RAW_DATA_HOME`` and copies every row whose 'vehicle-id' equals
    ``taxi_id`` and whose timestamp (converted with
    ``datetime.fromtimestamp``) has the same day-of-month and hour as
    ``yymmdd`` + ``hh``.  The result is written to
    ``log-<yymmdd>-<hh>-<taxi_id>.csv`` inside ``test_dpath`` with the columns
    time, taxi_id, driver_id, state, lng, lat, apBasePos.

    Args:
        yymmdd: date as a '%y%m%d' string.
        hh: hour as a '%H' string.  The ``None`` default is not usable: the
            value is concatenated to ``yymmdd`` before parsing.
        taxi_id: vehicle id; converted with ``int()``.
    """
    # int() returns a plain int unchanged, so no type check is needed.
    taxi_id = int(taxi_id)
    #
    logging_fpath = opath.join(test_dpath, '_test.txt')
    ofpath = opath.join(test_dpath, 'log-%s-%s-%s.csv' % (yymmdd, hh, taxi_id))
    logging(logging_fpath, 'handle the file; %s' % ofpath)
    with open(ofpath, 'w') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        new_header = [
            'time', 'taxi_id', 'driver_id', 'state', 'lng', 'lat', 'apBasePos'
        ]
        writer.writerow(new_header)
    #
    target_dt = datetime.strptime(yymmdd + hh, '%y%m%d%H')
    next_dt = target_dt + timedelta(hours=1)
    yymm = yymmdd[:len('yymm')]
    yy, mm = yymm[:len('yy')], yymm[len('yy'):]
    yyyy = '20%s' % yy
    #
    log_fpath = reduce(
        opath.join,
        [TAXI_RAW_DATA_HOME, yyyy, mm, 'logs',
         'logs-%s-normal.csv' % yymm])
    ap_polygons = get_ap_polygons()
    # The output file is opened once for the whole scan instead of being
    # re-opened in append mode for every matching row.
    with open(log_fpath) as r_csvfile, open(ofpath, 'a') as w_csvfile:
        reader = csv.DictReader(r_csvfile)
        writer = csv.writer(w_csvfile, lineterminator='\n')
        for row in reader:
            # NOTE(review): eval() executes whatever text is in the CSV cell;
            # consider a strict numeric parser if the raw files are not
            # fully trusted.
            t, vid, did, state = map(eval, [
                row[cn] for cn in ['time', 'vehicle-id', 'driver-id', 'state']
            ])
            if vid != taxi_id:
                continue
            cur_dt = datetime.fromtimestamp(t)
            if cur_dt.day != target_dt.day:
                continue
            if cur_dt.hour == next_dt.hour:
                # First record of the following hour on the same day: stop.
                logging(logging_fpath, 'next period; %s' % ofpath)
                break
            elif cur_dt.hour != target_dt.hour:
                continue
            #
            lng, lat = map(eval, [row[cn] for cn in ['longitude', 'latitude']])
            # 'X' marks a position that lies inside none of the polygons.
            apBasePos = 'X'
            for ap_polygon in ap_polygons:
                if ap_polygon.is_including((lng, lat)):
                    apBasePos = ap_polygon.name
                    break
            writer.writerow([t, vid, did, state, lng, lat, apBasePos])
def run(target_months):
    """Process each month by fanning its days out to worker processes.

    For every 'yymm' string in ``target_months`` the days of that month are
    split into ``NUM_WORKERS`` contiguous groups, and one
    ``multiprocessing.Process`` running ``process_dates`` is started per
    non-empty group.  All processes of a month are joined before the next
    month starts.

    Args:
        target_months: iterable of month strings in '%y%m' form.
    """
    for yymm in target_months:
        logging_fpath = opath.join(lf_dpath, 'a1_%s.txt' % yymm)
        logging(logging_fpath, 'Start handling; %s' % yymm)
        #
        yymm_dt = datetime.strptime(yymm, '%y%m')
        _, numDays = monthrange(yymm_dt.year, yymm_dt.month)
        worker_dts = [[] for _ in range(NUM_WORKERS)]
        for day in range(1, numDays + 1):
            # Exact integer arithmetic: avoids float rounding and gives the
            # same result whether '/' is true or floor division.
            wid = (day - 1) * NUM_WORKERS // numDays
            worker_dts[wid].append(
                datetime(yymm_dt.year, yymm_dt.month, day))
        #
        ps = []
        for wid, dts in enumerate(worker_dts):
            if not dts:
                # More workers than days: nothing to hand to this worker.
                continue
            p = multiprocessing.Process(target=process_dates,
                                        args=(wid, dts, logging_fpath))
            ps.append(p)
            p.start()
        for p in ps:
            p.join()
def process_dates(wid, dts, logging_fpath):
    """Split the monthly raw log into hourly CSV files for a range of days.

    Reads ``logs-<yymm>-normal.csv`` for the month of ``dts[0]`` and, for
    every row whose day-of-month lies between ``dts[0].day`` and
    ``dts[-1].day`` (inclusive), writes the row to
    ``log-<YYYYmmddHH>.csv`` in ``log_dpath``.  Each output row carries
    time, taxi_id, driver_id, state, lng, lat and apBasePos (name of the
    first polygon containing the position, or 'X').  Reading stops at the
    first row whose day is past the last target day.

    Args:
        wid: worker index, used only in log messages.
        dts: list of datetimes assigned to this worker; only the first and
            last element are consulted.
        logging_fpath: path of the log file passed to ``logging``.
    """
    logging(logging_fpath, 'Start worker %d' % wid)
    if not dts:
        # An empty assignment previously crashed on dts[0].
        logging(logging_fpath, 'Worker %d: no dates to handle' % wid)
        return
    first_day, last_day = dts[0].day, dts[-1].day
    ymd_dt = dts[0]
    logging(logging_fpath, 'Worker %d: handling %s' % (wid, str(ymd_dt)))
    mm = ymd_dt.strftime('%m')
    yyyy = ymd_dt.strftime('%Y')
    #
    log_fpath = reduce(opath.join, [TAXI_RAW_DATA_HOME,
                                    yyyy, mm, 'logs', 'logs-%s-normal.csv' % ymd_dt.strftime('%y%m')])
    ap_polygons = get_ap_polygons()
    new_header = ['time', 'taxi_id', 'driver_id', 'state', 'lng', 'lat', 'apBasePos']
    handling_day, handling_hour = -1, -1
    # The current hourly output file stays open until the hour changes,
    # instead of being re-opened for every single row.
    w_csvfile, writer = None, None
    try:
        with open(log_fpath) as r_csvfile:
            reader = csv.DictReader(r_csvfile)
            for row in reader:
                # NOTE(review): eval() executes the cell text; consider a
                # strict numeric parser if the raw files are not trusted.
                t, vid, did, state = map(eval, [row[cn] for cn in ['time', 'vehicle-id', 'driver-id', 'state']])
                cur_dt = datetime.fromtimestamp(t)
                if cur_dt.day != handling_day:
                    handling_day = cur_dt.day
                    logging(logging_fpath, 'Worker %d: handling %dth day' % (wid, cur_dt.day))
                    # Force a fresh hourly file on the first row of a day.
                    handling_hour = -1
                if cur_dt.day < first_day:
                    continue
                if cur_dt.day > last_day:
                    logging(logging_fpath, 'Worker %d: end processing' % wid)
                    break
                #
                if cur_dt.hour != handling_hour:
                    handling_hour = cur_dt.hour
                    if w_csvfile is not None:
                        w_csvfile.close()
                    ofpath = opath.join(log_dpath, 'log-%s.csv' % cur_dt.strftime('%Y%m%d%H'))
                    w_csvfile = open(ofpath, 'w')
                    writer = csv.writer(w_csvfile, lineterminator='\n')
                    writer.writerow(new_header)
                #
                lng, lat = map(eval, [row[cn] for cn in ['longitude', 'latitude']])
                # 'X' marks a position inside none of the polygons.
                apBasePos = 'X'
                for ap_polygon in ap_polygons:
                    if ap_polygon.is_including((lng, lat)):
                        apBasePos = ap_polygon.name
                        break
                writer.writerow([t, vid, did, state, lng, lat, apBasePos])
    finally:
        if w_csvfile is not None:
            w_csvfile.close()
def run(yymm):
    """Build the monthly trip file ``trip-<yymm>.csv`` from raw trips and logs.

    Rows of ``trips-<yymm>-normal.csv`` and ``trips-<yymm>-normal-ext.csv``
    are read in lockstep.  For each trip the raw log is advanced (at least
    one row, until a log time not earlier than the trip start) while the
    per-vehicle state objects are updated; the vehicle's
    ``firstFreeStateTime`` is then written together with the trip and the
    vehicle is reset.  Start/end locations are the name of the first polygon
    containing the point, or 'X'.

    Nothing is done when the output file already exists.  Any exception is
    logged with its traceback and re-raised.

    Args:
        yymm: month string; the first two characters are the year and the
            last two the month.

    Returns:
        None.
    """
    logging_fpath = opath.join(lf_dpath, 'a2_%s.txt' % yymm)
    ofpath = opath.join(trip_dpath, 'trip-%s.csv' % yymm)
    #
    logging(logging_fpath, 'handle the file; %s' % yymm)
    if opath.exists(ofpath):
        logging(logging_fpath, 'The file had already been processed; %s' % yymm)
        return None
    yy, mm = yymm[:2], yymm[-2:]
    yyyy = '20%s' % yy
    with open(ofpath, 'wt') as w_csvfile:
        writer = csv.writer(w_csvfile, lineterminator='\n')
        # Coordinates are written longitude first (see the row below); the
        # header previously labelled them latitude first.
        new_header = [
            'year', 'month', 'day', 'dow', 'hour',
            'taxi_id', 'driver_id', 'fare',
            'start_longitude', 'start_latitude', 'start_time',
            'end_longitude', 'end_latitude', 'end_time',
            'start_loc', 'end_loc',
            'time_first_free',
        ]
        writer.writerow(new_header)
    #
    normal_fpath = reduce(opath.join, [TAXI_RAW_DATA_HOME,
                                       yyyy, mm, 'trips', 'trips-%s-normal.csv' % yymm])
    ext_fpath = reduce(opath.join, [TAXI_RAW_DATA_HOME,
                                    yyyy, mm, 'trips', 'trips-%s-normal-ext.csv' % yymm])
    log_fpath = reduce(opath.join, [TAXI_RAW_DATA_HOME,
                                    yyyy, mm, 'logs', 'logs-%s-normal.csv' % yymm])
    handling_day = -1
    try:
        year, month = map(int, [yyyy, mm])
        ap_polygons = get_ap_polygons()
        vehicles = {}
        # The output file is opened once here rather than once per trip.
        with open(normal_fpath) as tripFileN, \
                open(ext_fpath) as tripFileE, \
                open(log_fpath) as logFile, \
                open(ofpath, 'a') as w_csvfile:
            tripReaderN = csv.DictReader(tripFileN)
            tripReaderE = csv.DictReader(tripFileE)
            logReader = csv.DictReader(logFile)
            writer = csv.writer(w_csvfile, lineterminator='\n')
            for rowN in tripReaderN:
                # NOTE(review): next() raises StopIteration when the ext or
                # log file runs out before the trip file does.
                rowE = next(tripReaderE)
                sDay, sDow, sHour = map(int, [rowN[cn] for cn in ['start-day', 'start-dow', 'start-hour']])
                if handling_day != sDay:
                    logging(logging_fpath, 'handle day %d' % sDay)
                    handling_day = sDay
                #
                # NOTE(review): eval() executes the cell text; consider a
                # strict numeric parser if the raw files are not trusted.
                startTime = eval(rowN['start-time'])
                while True:
                    rowL = next(logReader)
                    logTime = eval(rowL['time'])
                    vidL, state = map(int, [rowL[cn] for cn in ['vehicle-id', 'state']])
                    if vidL not in vehicles:
                        vehicles[vidL] = vehicle(vidL, logTime, state)
                    else:
                        vehicles[vidL].update(logTime, state)
                    if startTime <= logTime:
                        break
                taxiID = int(rowN['vehicle-id'])
                if taxiID not in vehicles:
                    continue
                driverID = int(rowE['driver-id'])
                endTime = eval(rowN['end-time'])
                timeFirstFree = vehicles[taxiID].firstFreeStateTime
                vehicles[taxiID].reset()
                #
                startLng, startLat = map(eval, [rowN[cn] for cn in ['start-long', 'start-lat']])
                endLng, endLat = map(eval, [rowN[cn] for cn in ['end-long', 'end-lat']])
                startLoc, endLoc = 'X', 'X'
                for ap_polygon in ap_polygons:
                    if startLoc == 'X':
                        if ap_polygon.is_including((startLng, startLat)):
                            startLoc = ap_polygon.name
                    if endLoc == 'X':
                        if ap_polygon.is_including((endLng, endLat)):
                            endLoc = ap_polygon.name
                    if startLoc != 'X' and endLoc != 'X':
                        break
                #
                writer.writerow([year, month, sDay, sDow, sHour,
                                 taxiID, driverID, rowN['fare'],
                                 startLng, startLat, startTime,
                                 endLng, endLat, endTime,
                                 startLoc, endLoc,
                                 timeFirstFree])
    except Exception as _:
        logging(logging_fpath, format_exc())
        raise
def run(yymm):
    """Split a month of raw trips into hourly ``trip-<YYYYmmddHH>.csv`` files.

    Rows of ``trips-<yymm>-normal.csv`` and ``trips-<yymm>-normal-ext.csv``
    are read in lockstep.  Whenever the day or hour of a trip's start time
    (via ``datetime.fromtimestamp``) differs from the previous trip's, a new
    output file in ``trip_dpath`` is created (truncating any existing one)
    and given a header.  Start/end locations are the name of the first
    polygon containing the point, or 'X'.

    Args:
        yymm: month string in '%y%m' form.
    """
    logging_fpath = opath.join(lf_dpath, 'a2_%s.txt' % yymm)
    logging(logging_fpath, 'Start handling; %s' % yymm)
    #
    yymm_dt = datetime.strptime(yymm, '%y%m')
    mm = yymm_dt.strftime('%m')
    yyyy = yymm_dt.strftime('%Y')
    #
    normal_fpath = reduce(opath.join, [TAXI_RAW_DATA_HOME,
                                       yyyy, mm, 'trips', 'trips-%s-normal.csv' % yymm])
    ext_fpath = reduce(opath.join, [TAXI_RAW_DATA_HOME,
                                    yyyy, mm, 'trips', 'trips-%s-normal-ext.csv' % yymm])
    #
    year, month = yymm_dt.year, yymm_dt.month
    ap_polygons = get_ap_polygons()
    new_header = [
        'year', 'month', 'day', 'dow', 'hour',
        'taxi_id', 'driver_id', 'fare',
        'start_longitude', 'start_latitude', 'start_time',
        'end_longitude', 'end_latitude', 'end_time',
        'start_loc', 'end_loc',
    ]
    handling_day, handling_hour = -1, -1
    # The current hourly output file stays open until the hour changes,
    # instead of being re-opened for every trip.
    w_csvfile, writer = None, None
    try:
        with open(normal_fpath) as tripFileN, open(ext_fpath) as tripFileE:
            tripReaderN = csv.DictReader(tripFileN)
            tripReaderE = csv.DictReader(tripFileE)
            for rowN in tripReaderN:
                # NOTE(review): raises StopIteration if the ext file is
                # shorter than the normal file.
                rowE = next(tripReaderE)
                #
                # NOTE(review): eval() executes the cell text; consider a
                # strict numeric parser if the raw files are not trusted.
                startTime, endTime = map(eval, [rowN[cn] for cn in ['start-time', 'end-time']])
                cur_dt = datetime.fromtimestamp(startTime)
                if cur_dt.day != handling_day:
                    handling_day = cur_dt.day
                    logging(logging_fpath, 'handle day %d' % handling_day)
                    # Force a fresh hourly file on the first trip of a day.
                    handling_hour = -1
                if cur_dt.hour != handling_hour:
                    handling_hour = cur_dt.hour
                    if w_csvfile is not None:
                        w_csvfile.close()
                    ofpath = opath.join(trip_dpath, 'trip-%s.csv' % cur_dt.strftime('%Y%m%d%H'))
                    w_csvfile = open(ofpath, 'wt')
                    writer = csv.writer(w_csvfile, lineterminator='\n')
                    writer.writerow(new_header)

                sDay, sDow, sHour = map(int, [rowN[cn] for cn in ['start-day', 'start-dow', 'start-hour']])
                taxiID = int(rowN['vehicle-id'])
                driverID = int(rowE['driver-id'])
                #
                startLng, startLat = map(eval, [rowN[cn] for cn in ['start-long', 'start-lat']])
                endLng, endLat = map(eval, [rowN[cn] for cn in ['end-long', 'end-lat']])
                startLoc, endLoc = 'X', 'X'
                for ap_polygon in ap_polygons:
                    if startLoc == 'X':
                        if ap_polygon.is_including((startLng, startLat)):
                            startLoc = ap_polygon.name
                    if endLoc == 'X':
                        if ap_polygon.is_including((endLng, endLat)):
                            endLoc = ap_polygon.name
                    if startLoc != 'X' and endLoc != 'X':
                        break
                #
                writer.writerow([year, month, sDay, sDow, sHour,
                                 taxiID, driverID, rowN['fare'],
                                 startLng, startLat, startTime,
                                 endLng, endLat, endTime,
                                 startLoc, endLoc])
    finally:
        if w_csvfile is not None:
            w_csvfile.close()
# Example #6
# 0
def run(prefix=None):
    """Write ``apDayTrip-<YYYYmmdd>.csv`` for every target date.

    For each date returned by ``get_target_dates(prefix)`` the matching
    ``dayTrip-<YYYYmmdd>.csv`` is read.  For every trip whose taxi has a
    trajectory, the trajectory (tuples of timestamp, state, location) is
    searched for the time the taxi entered a non-'X' location, the time it
    left again, and the first time its state was ``FREE`` between the
    previous drop-off and the pick-up.  Trips for which any of these cannot
    be determined are skipped (logged, except for the no-exit case when
    both the previous drop-off and the pick-up are at 'X').

    Args:
        prefix: forwarded unchanged to ``get_target_dates``.
    """
    logging_fpath = opath.join(lf_dpath, 'a3.txt')
    logging(logging_fpath, 'Start handling')
    target_dates = get_target_dates(prefix)
    # Columns copied verbatim from the input row.  The same list drives both
    # the header and the row, so they cannot disagree on column order
    # (previously the header was longitude-first but rows latitude-first).
    copied_cols = [
        'year', 'month', 'day', 'dow', 'hour',
        'taxi_id', 'driver_id', 'fare',
        'previous_dropoff_longitude', 'previous_dropoff_latitude',
        'start_longitude', 'start_latitude',
        'end_longitude', 'end_latitude',
        'previous_dropoff_loc', 'start_loc', 'end_loc']
    time_cols = [
        'time_previous_dropoff', 'time_enter_airport', 'start_time', 'time_exit_airport', 'end_time',
        'time_first_free']
    #
    for dt in target_dates:
        ofpath = opath.join(adt_dpath, 'apDayTrip-%s.csv' % dt.strftime('%Y%m%d'))
        with open(ofpath, 'w') as w_csvfile:
            writer = csv.writer(w_csvfile, lineterminator='\n')
            writer.writerow(copied_cols + time_cols)
        #
        vid_ts, vid_traj0 = get_trajectory(dt)
        ifpath = opath.join(dt_dpath, 'dayTrip-%s.csv' % dt.strftime('%Y%m%d'))
        # The output file is opened once per date rather than once per row.
        with open(ifpath) as r_csvfile, open(ofpath, 'a') as w_csvfile:
            reader = csv.DictReader(r_csvfile)
            writer = csv.writer(w_csvfile, lineterminator='\n')
            for row in reader:
                vid = int(row['taxi_id'])
                if vid not in vid_ts:
                    continue
                prevLoc, sLoc, eLoc = [row[cn] for cn in ['previous_dropoff_loc', 'start_loc', 'end_loc']]
                # NOTE(review): eval() executes the cell text; consider a
                # strict numeric parser if the input is not trusted.
                tPrev, tStart, tEnd = [eval(row[cn]) for cn in ['time_previous_dropoff', 'start_time', 'end_time']]
                # Trajectory indices bracketing previous drop-off, pick-up
                # and drop-off.
                i, j, k = bisect(vid_ts[vid], tPrev), bisect(vid_ts[vid], tStart), bisect(vid_ts[vid], tEnd)
                #
                tEnter, tExit = -1, -1
                if prevLoc == 'X':
                    if sLoc == 'X':
                        # Outside at both ends: look for an enter followed
                        # by an exit between drop-off and pick-up.
                        for ts, state, loc in vid_traj0[vid][i:j]:
                            if tEnter == -1 and loc != 'X':
                                tEnter = ts
                            if tEnter != -1 and loc == 'X':
                                tExit = ts
                                break
                        else:
                            # No complete enter/exit pair: skip this trip.
                            continue
                    else:
                        for ts, state, loc in vid_traj0[vid][i:j]:
                            if tEnter == -1 and loc != 'X':
                                tEnter = ts
                                break
                        if eLoc == 'X':
                            for ts, state, loc in vid_traj0[vid][j:k]:
                                if tExit == -1 and loc == 'X':
                                    tExit = ts
                                    break
                        else:
                            tExit = tEnd
                else:
                    # Previous drop-off was already at a non-'X' location.
                    tEnter = tPrev
                    if sLoc == 'X':
                        for ts, state, loc in vid_traj0[vid][i:j]:
                            if tExit == -1 and loc == 'X':
                                tExit = ts
                                break
                    else:
                        if eLoc == 'X':
                            for ts, state, loc in vid_traj0[vid][j:k]:
                                if tExit == -1 and loc == 'X':
                                    tExit = ts
                                    break
                        else:
                            tExit = tEnd
                if tEnter == -1 or tExit == -1:
                    logging(logging_fpath, 'Cannot find tEnter or tExit; %d, %d\n%s' % (tEnter, tExit, str(row)))
                    continue
                #
                time_ff = -1
                for ts, state, loc in vid_traj0[vid][i:j]:
                    if state == FREE:
                        time_ff = ts
                        break
                if time_ff == -1:
                    logging(logging_fpath, 'Cannot find time_ff; \n%s' % str(row))
                    continue
                #
                new_row = [row[cn] for cn in copied_cols]
                new_row += [tPrev, tEnter, tStart, tExit, tEnd,
                            time_ff]
                writer.writerow(new_row)
# Example #7
# 0
def run(yymm):
    """Derive per-day airport trip files ``ap-dayTrip-<yymm><dd>.csv``.

    Reads ``trip-<yymm>.csv`` row by row.  Rows with driver id -1, rows of
    day 1 with hour <= ``AM5`` and rows with ``AM2`` <= hour <= ``AM5`` are
    ignored.  On the first row with hour ``AM5 + 1`` of a new day, that
    day's ``ap-dayLog`` file is loaded into per-vehicle trajectory objects
    and a new output file with a header is created.  For each later trip of
    a known vehicle, the previous drop-off of the same vehicle is combined
    with enter/exit times obtained from the vehicle object and appended via
    ``add_row``.

    Any exception is logged with its traceback and re-raised.

    Args:
        yymm: month string used in the input and output file names.
    """
    logging_fpath = opath.join(lf_dpath, 'a4_%s.txt' % yymm)
    #
    logging(logging_fpath, 'handle the file; %s' % yymm)
    trip_fpath = opath.join(trip_dpath, 'trip-%s.csv' % yymm)
    handling_day = 0
    vid_lastLocTime, vehicles = {}, {}
    try:
        with open(trip_fpath) as tripFile:
            tripReader = csv.DictReader(tripFile)
            for row in tripReader:
                did = int(row['driver_id'])
                if did == -1:
                    continue
                day, hour = map(int, [row[cn] for cn in ['day', 'hour']])
                if day == 1 and hour <= AM5:
                    continue
                if AM2 <= hour and hour <= AM5:
                    continue
                if day != handling_day and hour == AM5 + 1:
                    # Start of a new processing day: reload trajectories and
                    # start a new output file.
                    handling_day = day
                    logging(logging_fpath, 'handling %dth day' % handling_day)
                    vid_lastLocTime, vehicles = {}, {}
                    log_fpath = opath.join(
                        apDL_dpath,
                        'ap-dayLog-%s%02d.csv' % (yymm, handling_day))
                    with open(log_fpath) as logFile:
                        logReader = csv.DictReader(logFile)
                        for rowL in logReader:
                            # Key by taxi id: trips below look vehicles up
                            # by row['taxi_id'], so keying by 'driver_id'
                            # (as before) could not match.
                            # NOTE(review): assumes the day log has a
                            # 'taxi_id' column -- confirm against its writer.
                            vid = int(rowL['taxi_id'])
                            if vid not in vehicles:
                                vehicles[vid] = vehicle(vid)
                            # NOTE(review): eval() executes the cell text.
                            vehicles[vid].add_trajectory(
                                eval(rowL['time']), rowL['apBasePos'])
                    #
                    ofpath = opath.join(
                        apDT_dpath,
                        'ap-dayTrip-%s%02d.csv' % (yymm, handling_day))
                    with open(ofpath, 'wt') as w_csvfile:
                        writer = csv.writer(w_csvfile, lineterminator='\n')
                        new_header = [
                            'year',
                            'month',
                            'day',
                            'dow',
                            'hour',
                            'taxi_id',
                            'driver_id',
                            'fare',
                            'start_latitude',
                            'start_longitude',
                            'start_time',
                            'end_latitude',
                            'end_longitude',
                            'end_time',
                            'start_loc',
                            'end_loc',
                            'time_first_free',
                            'time_previous_dropoff',
                            'time_enter_airport',
                            'time_exit_airport',
                            'previous_dropoff_latitude',
                            'previous_dropoff_longitude',
                            'previous_dropoff_loc',
                        ]
                        writer.writerow(new_header)
                vid = int(row['taxi_id'])
                # Also skips every trip seen before the first day log was
                # loaded, because 'vehicles' is still empty then.
                if vid not in vehicles:
                    continue
                sLoc, eLoc = [row[cn] for cn in ['start_loc', 'end_loc']]
                sTime, eTime = map(
                    eval, [row[cn] for cn in ['start_time', 'end_time']])
                eLat, eLng = map(
                    eval,
                    [row[cn] for cn in ['end_latitude', 'end_longitude']])
                if vid not in vid_lastLocTime:
                    # First trip of this vehicle today: only remember its
                    # drop-off for the next trip.
                    vid_lastLocTime[vid] = (eLat, eLng, eLoc, eTime)
                    continue
                latPrevDropoff, lngPrevDropoff, locPrevDropoff, tPrevDropoff = vid_lastLocTime[
                    vid]
                if not (locPrevDropoff == 'X' and sLoc == 'X'):
                    tEnter, tExit = vehicles[vid].find_eeTime_AP(sTime, sLoc)
                    newInfo = [
                        tPrevDropoff, tEnter, tExit, latPrevDropoff,
                        lngPrevDropoff, locPrevDropoff
                    ]
                    add_row(ofpath, row, newInfo)
                else:
                    visitAP, tEnter, tExit = vehicles[vid].find_eeTime_XAP(
                        tPrevDropoff, sTime)
                    if visitAP:
                        newInfo = [
                            tPrevDropoff, tEnter, tExit, latPrevDropoff,
                            lngPrevDropoff, locPrevDropoff
                        ]
                        add_row(ofpath, row, newInfo)
                vid_lastLocTime[vid] = (eLat, eLng, eLoc, eTime)
    except Exception as _:
        logging(logging_fpath, format_exc())
        raise