Ejemplo n.º 1
0
def getSubwayDate(filepath2):
    """Merge the morning feed snapshots found in a directory into two frames.

    Every entry of ``filepath2`` whose name starts with ``'gtfs-2'`` is
    considered.  The text after the first five characters of the name is
    parsed as a timestamp, converted to America/New_York time, and the file
    is kept only when its local time of day lies between 06:00 and 09:30
    (both inclusive).  Each kept file is passed to ``getTrains`` together
    with the two accumulator frames.

    Parameters:
        filepath2: path of the directory holding the snapshot files.
            NOTE: the directory is deleted (``shutil.rmtree``) once
            processing has finished successfully.

    Returns:
        A tuple ``(df_allTrains, df_trips)``:

        * ``df_allTrains`` has one row per distinct combination of
          start_date, route_id, trip_id, train_id, stop_id, stop_name and
          current_status, with ``minTS1``/``maxTS1`` (smallest/largest
          ``timestamp``) and ``maxTS2`` (largest ``timestamp1``), sorted by
          ``trip_id`` then ``minTS1``.
        * ``df_trips`` is the accumulated trips frame sorted by ``trip_id``,
          ``stop_id`` and ``timestamp1``.
    """
    allFiles = DataFrame({'FileNames': os.listdir(filepath2)})
    # Take an explicit copy so the column assignments below operate on an
    # independent frame rather than on a slice of the directory listing.
    allFiles = allFiles[allFiles['FileNames'].str.startswith('gtfs-2')].copy()

    # Assumes the file name embeds a timezone-aware timestamp after the
    # 'gtfs-' prefix (tz_convert requires aware values) -- TODO confirm.
    allFiles['datetime1'] = allFiles['FileNames'].str.slice(5).map(parser.parse)
    allFiles['datetime1'] = pd.DatetimeIndex(
        allFiles['datetime1']).tz_convert('America/New_York')

    # Assign a plain list, not a Series: a new Series would carry a 0..n-1
    # index and be aligned against the filtered frame's original labels,
    # attaching times to the wrong rows (or NaN) when files were dropped.
    allFiles['time1'] = [stamp.time() for stamp in allFiles['datetime1']]

    in_window = ((allFiles['time1'] >= time(6)) &
                 (allFiles['time1'] <= time(9, 30)))
    allFiles = allFiles[in_window]

    # Empty object-typed seed frames handed to getTrains as accumulators.
    df_allTrains = DataFrame(np.zeros(0, dtype=[('current_status', 'O')]))
    df_trips = DataFrame(np.zeros(0, dtype=[('route_id', 'O')]))
    for fileName0 in allFiles['FileNames']:
        fileName1 = os.path.join(filepath2, fileName0)
        try:
            # presumably getTrains returns both frames extended with this
            # file's records; verify against its definition.
            df_allTrains, df_trips = getTrains(fileName1, df_allTrains, df_trips)
        except Exception as exc:
            # Best effort: skip unreadable snapshots, but report why, and do
            # not swallow KeyboardInterrupt/SystemExit as a bare except would.
            print('file ' + fileName0 + ' did not work: ' + repr(exc))

    group_keys = ['start_date', 'route_id', 'trip_id', 'train_id',
                  'stop_id', 'stop_name', 'current_status']

    df_allTrains = df_allTrains.sort_values(['trip_id', 'timestamp', 'timestamp1'])
    # Within identical (keys, timestamp) rows keep the one sorted last,
    # i.e. the one with the largest timestamp1.
    df_allTrains = df_allTrains.drop_duplicates(
        subset=group_keys + ['timestamp'], keep='last')

    df_grouped = df_allTrains.groupby(group_keys)
    # List-of-functions plus rename replaces the dict-renaming form of
    # SeriesGroupBy.agg, which newer pandas releases reject.
    df_grouped1 = df_grouped['timestamp'].agg(['min', 'max']).rename(
        columns={'min': 'minTS1', 'max': 'maxTS1'})
    df_grouped1['maxTS2'] = df_grouped['timestamp1'].max()
    df_allTrains = df_grouped1.reset_index()
    df_allTrains = df_allTrains.sort_values(['trip_id', 'minTS1'])

    df_trips = df_trips.sort_values(['trip_id', 'stop_id', 'timestamp1'])

    # Destructive: the input directory is removed after a successful run.
    shutil.rmtree(filepath2)
    return (df_allTrains, df_trips)