def mainKyle():
    """
    Loads the booking data, keeps cabin "Y" records whose origin and
    destination are each either Dubai or London, encodes the flights,
    builds a 3-fold split and prints the result of sequential forward
    feature selection using KNeighborsRegressor.

    returns: None (the selection result is only printed)
    """
    # Parameters for loading the data
    num_records = 'all'
    csvfile = "Data/BKGDAT_ZeroTOTALBKD.txt"

    # Parameters for filtering the data
    market = AirportCodes.London
    orgs = [AirportCodes.Dubai, market]
    dests = list(orgs)
    cabins = ["Y"]

    # Parameters for encoding the flights
    interp_params = (-90, 0, 31)  # (start, stop, num_points)
    cat_encoding = (3, 0)         # (bin_size, date_reduction)
    num_folds = 3

    # Load, filter and group by flight
    print("Loading " + csvfile)
    f = FeatureFilter(num_records, csvfile)

    print("Filtering")
    data = f.getDrillDown(orgs=orgs, dests=dests, cabins=cabins)

    print("Grouping by flight")
    unique_flights = f.getUniqueFlights(data)

    # Encode the flights
    print("Encoding flight data")
    X, y, ids = encodeFlights(unique_flights, interp_params, cat_encoding)
    # The second dimension of the first encoded entry is the feature count
    _num_rows, num_features = X[0].shape

    print('Generating k-fold')
    kf = kFoldSplit(X, y, ids, num_folds)

    print('Selecting features')
    # The regressor class itself (not an instance) is handed to the selector
    selection = sequentialForwardFeatureSelection(
        KNeighborsRegressor, kf, num_features)
    print(selection)
def mainRyan():
    """
    Loads the booking data, keeps cabin "Y" records for the configured
    market (Dubai <-> Frankfurt), encodes the flights and splits them into
    train and test sets.

    returns: tuple of (X_train, y_train, X_test, y_test, ids_train,
             ids_test, interp_params, cat_encoding)
    """
    # Parameters for loading the data
    num_records = 'all'
    csvfile = "Data/BKGDAT_ZeroTOTALBKD.txt"

    # Parameters for filtering the data
    market = AirportCodes.Frankfurt
    cabins = ["Y"]

    # Parameters for encoding the flights
    interp_params = (-90, 0, 31)  # (start, stop, num_points)
    cat_encoding = (3, 0)         # (bin_size, date_reduction)

    print("Loading " + csvfile)
    f = FeatureFilter(num_records, csvfile)

    print("Filtering")
    if market is not None:
        # Single market: traffic between Dubai and the chosen airport
        orgs = [AirportCodes.Dubai, market]
        dests = [AirportCodes.Dubai, market]
    else:
        # No market selected: use every airport in the fixed list below
        orgs = [AirportCodes.Dubai, AirportCodes.London,
                AirportCodes.Bahrain, AirportCodes.Frankfurt,
                AirportCodes.Bangkok]
        dests = [AirportCodes.Dubai, AirportCodes.London,
                 AirportCodes.Bahrain, AirportCodes.Frankfurt,
                 AirportCodes.Bangkok]
    data = f.getDrillDown(orgs=orgs, dests=dests, cabins=cabins)

    print("Grouping by flight")
    unique_flights = f.getUniqueFlights(data)

    print("Encoding flight data")
    X, y, ids = encodeFlights(unique_flights, interp_params, cat_encoding)

    # 0.75 is passed straight through to aggregateTrainTestSplit
    # (presumably the training fraction -- confirm against that helper)
    X_train, y_train, X_test, y_test, ids_train, ids_test = \
        aggregateTrainTestSplit(X, y, ids, 0.75)

    return (X_train, y_train, X_test, y_test, ids_train, ids_test,
            interp_params, cat_encoding)
def __init__(self, nrows, csvfile='Data/BKGDAT_Filtered.txt'):
    """
    Builds a FeatureFilter from the given arguments and stores it on self.f

    nrows: passed through as the first argument to FeatureFilter
    csvfile: passed through as the second argument to FeatureFilter
             (defaults to 'Data/BKGDAT_Filtered.txt')
    """
    # NOTE(review): this definition appears before `class Network` and is
    # identical to Network.__init__ -- it looks like a stray copy; confirm
    # whether it is still needed.
    self.f = FeatureFilter(nrows, csvfile)
class Network():
    """
    Network consumes data frames from FeatureFilter and calculates
    interesting statistics about the flight network
    """

    def __init__(self, nrows, csvfile='Data/BKGDAT_Filtered.txt'):
        """
        Builds the FeatureFilter that every statistic below reads from

        nrows: passed through as the first argument to FeatureFilter
        csvfile: passed through as the second argument to FeatureFilter
        """
        self.f = FeatureFilter(nrows, csvfile)

    def countFlightsBetweenCities(self):
        """
        Counts the total number of flights between unique org-des pairs.
        Similar to timeseries but it doesn't index the counts by date.

        returns: dictionary of {(org, des), number of flights from org to des}
        """
        num_flights = {}
        for flight, _group in self.f.getFilterUniqueFlights():
            # Everything after the first two key fields identifies the edge
            edge = flight[2:]
            num_flights[edge] = num_flights.get(edge, 0) + 1
        return num_flights

    def _meanColumnPerFlightCabin(self, column):
        """
        Shared worker for the per-flight/per-cabin statistics: stores the
        mean of `column` for every (flight, cabin) seen in the bookings.

        column: name of the column to average (e.g. 'CAP', 'TOTALBKD')

        returns: dictionary of {flight, dictionary of {cabin, mean of column}}
        """
        per_flight = {}
        for booking_group, data in self.f.getUniqueFlightsAndBookings():
            # First four key fields identify the flight, the fifth is the
            # booking class
            flight = booking_group[0:4]
            cabin, _rank = Utils.mapBookingClassToCabinHierarchy(
                booking_group[4])
            # NOTE(review): if several booking classes map to the same cabin
            # the value is overwritten, not accumulated -- confirm intended.
            per_flight.setdefault(flight, {})[cabin] = data[column].mean()
        return per_flight

    def countCabinCapacityPerFlight(self):
        """
        Counts the total capacity of a flight in every cabin on the plane

        returns: dictionary of {flight, dictionary of {cabin, cabin capacity}}
        """
        return self._meanColumnPerFlightCabin('CAP')

    def countTotalBookedPerFlight(self):
        """
        Counts the total number of passengers on a flight in every cabin on
        the plane

        returns: dictionary of {flight, dictionary of {cabin, total booked}}
        """
        return self._meanColumnPerFlightCabin('TOTALBKD')

    def countFinalCabinLoadFactor(self):
        """
        Computes what fraction of each flight is filled, i.e. the sum of
        the per-cabin TOTALBKD values divided by the sum of the per-cabin
        CAP values.

        returns: dictionary of {flight, cabin load factor}
        """
        capacities = self.countCabinCapacityPerFlight()
        total_bookings = self.countTotalBookedPerFlight()

        cabin_load_factors = {}
        for flight, cabin_caps in capacities.items():
            total_cap = sum(cabin_caps.values())
            total_booked = sum(total_bookings[flight].values())
            # float() keeps this a true division, matching
            # countOverbookedAndCabinLoadFactor
            cabin_load_factors[flight] = float(total_booked) / total_cap
        return cabin_load_factors

    def countOverbookedAndCabinLoadFactor(self):
        """
        Determines which flights overbooking occurs; calculates the
        overbooking ratio and the cabin load factor.

        returns: list of tuples (cabin_load_factor, percent_overbooked),
                 one per booking group whose mean AUTH exceeds its mean CAP.
                 percent_overbooked is the ratio AUTH / CAP (so > 1), and
                 cabin_load_factor is mean TOTALBKD / CAP.
        """
        ans = []
        for _booking_group, data in self.f.getUniqueFlightsAndBookings():
            auth = data['AUTH'].mean()
            cap = data['CAP'].mean()
            if auth > cap:  # Overbooking occurs when AUTH > CAP
                percent_overbooked = float(auth) / cap
                cabin_load_factor = float(data['TOTALBKD'].mean()) / cap
                ans.append((cabin_load_factor, percent_overbooked))
        return ans

    def interp(self, xvals, x, y):
        """
        Linearly interpolates the samples (x, y) at the points xvals.

        xvals: points at which to evaluate the interpolant
        x, y: sample coordinates and values, in any order; they are sorted
              by x before interpolating

        returns: array of interpolated values; points left of the smallest
                 x evaluate to 0
        """
        x, y = zip(*sorted(zip(x, y), key=lambda tup: tup[0]))
        return np.interp(xvals, x, y, left=0)

    def residuals(self, x):
        """
        returns: differences between consecutive elements of x
        """
        # np.diff keeps this consistent with interp's use of the np namespace
        return np.diff(x)

    def timeseries(self):
        """
        Counts the number of flights that occur along a directed edge
        (unique org-des pairs) and indexes the counts by the first field of
        the flight key (presumably the date -- confirm against
        FeatureFilter)

        returns: dictionary of {directed_edge, dictionary of {time, count}}
        """
        # Uses the same accessor as countFlightsBetweenCities; this class
        # never sets self.entities, so the old call could not work as written
        time_series = {}
        for flight, _group in self.f.getFilterUniqueFlights():
            edge = flight[2:]
            when = flight[0]
            counts = time_series.setdefault(edge, {})
            counts[when] = counts.get(when, 0) + 1
        return time_series