def getSubwayDate(filepath2):
    """Build per-stop train and trip tables from the feed files in a folder.

    The directory listing of ``filepath2`` is reduced to the entries whose
    name starts with ``'gtfs-2'``. Everything after the first five
    characters of each name is parsed as a timestamp, converted to
    ``America/New_York`` time, and only files whose local time of day lies
    between 06:00 and 09:30 (both inclusive) are kept. Each remaining file
    is handed to ``getTrains`` together with the two accumulated frames;
    a file for which ``getTrains`` raises is reported and skipped.

    The accumulated train rows are then de-duplicated and collapsed to one
    row per (start_date, route_id, trip_id, train_id, stop_id, stop_name,
    current_status) carrying:

    * ``minTS1`` / ``maxTS1`` -- smallest / largest ``timestamp``
    * ``maxTS2`` -- largest ``timestamp1``

    WARNING: the directory ``filepath2`` is deleted (``shutil.rmtree``)
    before returning.

    Parameters
    ----------
    filepath2 : str
        Directory containing the feed files. It is removed on success.

    Returns
    -------
    tuple
        ``(df_allTrains, df_trips)`` -- the aggregated train table sorted by
        ``trip_id`` and ``minTS1``, and the trips table sorted by
        ``trip_id``, ``stop_id`` and ``timestamp1``.

    Raises
    ------
    KeyError
        If no file contributed the expected columns (for example because
        no file matched, or every file failed in ``getTrains``). In that
        case the directory is not removed.

    Notes
    -----
    ``tz_convert`` requires timezone-aware values, so the timestamps
    embedded in the file names are assumed to carry a UTC offset or zone
    designator -- TODO confirm against the feed downloader.
    """
    allFiles = DataFrame({'FileNames': os.listdir(filepath2)})
    # .copy() so the column assignments below act on an independent frame
    # instead of a slice of the full listing.
    allFiles = allFiles[allFiles.FileNames.str.startswith('gtfs-2')].copy()

    # File names look like 'gtfs-<timestamp>'; drop the 5-character prefix.
    parsed = allFiles.FileNames.str.slice(5).map(parser.parse)
    allFiles['datetime1'] = pd.DatetimeIndex(parsed).tz_convert(
        'America/New_York')

    # Assign a plain list, not a Series: a Series would be aligned on index
    # labels, and after the prefix filter above the frame's index is no
    # longer 0..n-1, which would attach times to the wrong rows (or NaN).
    allFiles['time1'] = [stamp.time() for stamp in allFiles.datetime1]

    in_window = (allFiles.time1 >= time(6)) & (allFiles.time1 <= time(9, 30))
    allFiles = allFiles[in_window]

    # Empty seed frames that getTrains extends file by file.
    df_allTrains = DataFrame(np.zeros(0, dtype=[('current_status', 'O')]))
    df_trips = DataFrame(np.zeros(0, dtype=[('route_id', 'O')]))

    for fileName0 in allFiles.FileNames:
        fileName1 = os.path.join(filepath2, fileName0)
        try:
            df_allTrains, df_trips = getTrains(
                fileName1, df_allTrains, df_trips)
        except Exception:
            # Best effort: one unreadable file must not abort the batch.
            # Ctrl-C / SystemExit are deliberately NOT swallowed.
            # Single-argument parenthesised print emits the same text on
            # Python 2 and Python 3.
            print('file ' + fileName0 + ' did not work')

    group_keys = ['start_date', 'route_id', 'trip_id', 'train_id',
                  'stop_id', 'stop_name', 'current_status']

    # Sort first so that keep='last' retains the row with the greatest
    # timestamp1 among otherwise identical observations.
    df_allTrains = df_allTrains.sort_values(
        ['trip_id', 'timestamp', 'timestamp1'])
    df_allTrains = df_allTrains.drop_duplicates(
        subset=group_keys + ['timestamp'], keep='last')

    stop_groups = df_allTrains.groupby(group_keys)
    # Passing a renaming dict to SeriesGroupBy.agg is rejected by current
    # pandas; aggregate by name and rename the columns afterwards.
    df_summary = stop_groups['timestamp'].agg(['min', 'max']).rename(
        columns={'min': 'minTS1', 'max': 'maxTS1'})
    df_summary['maxTS2'] = stop_groups['timestamp1'].max()

    df_allTrains = df_summary.reset_index().sort_values(['trip_id', 'minTS1'])
    df_trips = df_trips.sort_values(['trip_id', 'stop_id', 'timestamp1'])

    # The input folder is treated as scratch space and removed once its
    # contents have been consumed.
    shutil.rmtree(filepath2)
    return df_allTrains, df_trips