from __future__ import division

from datetime import datetime

import pandas as pd
import matplotlib.pyplot as plt

from utility import printHeader

###########################
# Loading the data
###########################
printHeader("Loading the data")

# We load the csv into a dataframe. We supply the date parser to pandas, as otherwise pandas assumes American
# date formats.
weather = pd.read_csv(
    "data/Weather_extract_2018.csv",
    sep=",",
    index_col=0,
    parse_dates=True,
    date_parser=lambda x: datetime.strptime(x, '%d.%m.%Y %H:%M'))

# We replace 'Trace of precipitation' with a very low value as discussed in the report, just so we can plot it.
precipitation_clean = weather["RRR"].dropna().replace("Trace of precipitation", 0.00001).astype(float)

print "Weather Data Loaded."

###########################
# Plot the two weather features
###########################
printHeader("Plotting the two weather features")

# Create subplots
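# A minimal sketch of the plotting step that follows, assuming the two features are the temperature
# column "T" and the cleaned precipitation series; the figure size, labels, and the log scale for
# precipitation are assumptions for illustration, not the original figure code.
fig, (ax_temp, ax_precip) = plt.subplots(2, 1, figsize=(10, 6), sharex=True)

# Temperature is a straightforward time series
weather["T"].plot(ax=ax_temp)
ax_temp.set_ylabel("Temperature (C)")

# A log scale keeps the tiny 'trace of precipitation' placeholder value visible
precipitation_clean.plot(ax=ax_precip)
ax_precip.set_yscale("log")
ax_precip.set_ylabel("Precipitation (mm)")

plt.tight_layout()
plt.show()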
from __future__ import division

import urllib
import zipfile

from utility import printHeader

# Define the local target directory for files to be downloaded to
targetDir = "data/"

# Define the proportion of each file we want to include in the dataset
sample_proportion = 1 / 120

# Define the path for the output file for tabular data
output = "data/combined_pre_cleaning.pickle"

# End of configuration parameters
################################################

###########################
# Spatial Data
###########################
printHeader("Spatial data")

# Download the file from the TLC website
urllib.urlretrieve("https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip", "taxi_zones.zip")

# Unzip the file
print "Unzipping spatial data"
taxi_zones_zip = zipfile.ZipFile("taxi_zones.zip", 'r')
taxi_zones_zip.extractall("spatialref")
taxi_zones_zip.close()
print "Unzipping spatial data complete"

###########################
# Tabular Data
###########################
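# A minimal sketch of the tabular download step, assuming the trip files follow the public TLC naming
# pattern (e.g. yellow_tripdata_2018-01.csv) and that each month is sampled down by sample_proportion;
# the month range and yellow-only loop are assumptions for illustration, not the original download code.
import pandas as pd

frames = []
for month in range(1, 13):
    filename = "yellow_tripdata_2018-%02d.csv" % month
    urllib.urlretrieve("https://s3.amazonaws.com/nyc-tlc/trip+data/" + filename, targetDir + filename)
    monthly = pd.read_csv(targetDir + filename)
    # Keep a random fraction of each month so the combined dataset stays manageable
    frames.append(monthly.sample(frac=sample_proportion))

combined = pd.concat(frames)
combined.to_pickle(output)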
from __future__ import division

import pandas as pd

from utility import printHeader

###########################
# Loading cleaned data
###########################
printHeader("Loading cleaned data")

# We load the three pickles we created during cleaning
combined = pd.read_pickle("data/combined_post_cleaning_trips.pickle")
weather = pd.read_pickle("data/weather_2018.pickle").sort_index()
spatial_ref_data = pd.read_pickle("data/spatial.pickle")
print "Loaded."

###########################
# Creating new features
###########################
printHeader("Creating new features")


# This function classifies examples: 0 for a tip percentage of at least 0.1%, 1 for a non-negative tip
# percentage below 0.1%, and 2 for invalid (negative) values
def classifyTipPercentage(tip_percentage):
    if 0 <= tip_percentage < 0.1:
        return 1
    elif tip_percentage >= 0.1:
        return 0
    else:
        return 2
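# A minimal sketch of how the classifier might be applied to the dataframe; the column name
# "tip_class" is an assumption, not necessarily the name used later in the pipeline.
combined["tip_class"] = combined["tip_percentage"].apply(classifyTipPercentage)
print combined["tip_class"].value_counts()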
import math

import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from sklearn.mixture import GaussianMixture

from utility import printHeader

# Our plots often refer to yellow/green, I have chosen hex-codes which match the official colours of NYC taxis.
colorscheme = ["#fce300", "#8db600"]

###########################
# Loading cleaned data with new features
###########################
printHeader("Loading cleaned data with new features")

# Load all of the data and combine it into a single data frame
combined = pd.read_pickle("data/combined_post_new_features.pickle")
print "Tabular Data Loaded."

# Read the shape files into a geopandas dataframe
spatial = gpd.read_file("spatialref/taxi_zones.shp").set_index("LocationID")
print "Spatial Data Loaded."

###########################
# Figure 18 - Mixture Model with Distribution of Tip Percentage
###########################
printHeader("Figure 18 - Mixture Model with Distribution of Tip Percentage")

# We take a sample of the dataset for the mixture model to speed up processing
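# A minimal sketch of the sampling and fitting step this comment introduces; the sample size and the
# choice of two mixture components are assumptions based on the figure title, not the original parameters.
sample = combined["tip_percentage"].sample(n=100000).values.reshape(-1, 1)
gmm = GaussianMixture(n_components=2).fit(sample)
print "Mixture means: %s" % gmm.means_.ravel()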
import pandas as pd
from sklearn.model_selection import train_test_split

# Utilities
from utility import printHeader
from utility import printPrettyCM

# Keras by default is extremely verbose about the processes/threads it uses so we switch this off.
import os
os.environ['KMP_WARNINGS'] = 'off'

# Our plots often refer to yellow/green, I have chosen hex-codes which match the official colours of NYC taxis.
colorscheme = ["#8db600", "#fce300"]

###########################
# Loading Data
###########################
printHeader("Loading Data")

# We load our cleaned tabular data file from the pickle file we saved.
# To speed up training, you can limit the sample size of this by sampling a percentage
combined = pd.read_pickle("data/combined_post_new_features.pickle")
print "%s records loaded from dataset." % len(combined)

###########################
# Features
###########################
printHeader("Features")

# To help manage all the new features we've created, we construct a list of all the features we want to include in
# our model
features_general = [
    "pickup_x",
    "pickup_y",
    "dropoff_x",
    "dropoff_y",
    "passenger_count",
    "trip_distance",
    "duration"
]
features_ratecode = [
    feature for feature in combined.columns if feature.startswith("Ratecode_")
]

features_colour = [
    feature for feature in combined.columns if feature.startswith("colour_")
]

features_temporal = [
    feature for feature in combined.columns if feature.startswith("temporal_")
]

features_weather = ["T", "RRR"]

# We create our big list of features and print it out
printHeader("Features")
features = features_general + features_dropoff + features_pickup + \
    features_colour + features_temporal + features_weather
for feature in features:
    print "* %s" % feature

###########################
# Train/Test Split
###########################
printHeader("Train/Test Split")
X_train, X_test, y_train, y_test = train_test_split(combined[features],
                                                    combined["tip_percentage"],
                                                    test_size=0.33)
print "X_train: %s" % len(X_train)
print "X_test: %s" % len(X_test)
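# A minimal sketch of a model that could follow this split; the architecture, optimizer, epochs, and
# batch size are assumptions for illustration, not the original model definition.
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(64, activation="relu", input_dim=len(features)))
model.add(Dense(64, activation="relu"))
model.add(Dense(1))
model.compile(optimizer="adam", loss="mean_squared_error")
model.fit(X_train.values, y_train.values, epochs=5, batch_size=256)
print "Test MSE: %s" % model.evaluate(X_test.values, y_test.values)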
from __future__ import division

import geopandas as gpd
from shapely.geometry import Polygon

from utility import printHeader

###########################
# Loading the data and printing out the CRS and features
###########################
printHeader("Loading the data and printing out the CRS and features")

# Load and join the spatial data
gdf = gpd.read_file("spatialref/taxi_zones.shp")

# Print out the coordinate reference system
print "Coordinate system: %s\n" % gdf.crs["init"]

# Print out the features of the spatial data
print "Features:"
for feature in gdf.columns:
    print "* %s" % feature

###########################
# Check whether we should use the OBJECTID or the LocationID as our index
###########################
printHeader(
    "Check whether we should use the OBJECTID or the LocationID as our index")

print "The following entries exist in the dataframe where LocationID is not the same as OBJECTID"
print gdf[(gdf["OBJECTID"] != gdf["LocationID"])]

# Index by OBJECTID
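# A minimal sketch of the indexing step the comment above introduces; choosing OBJECTID here follows
# that comment, though note that elsewhere the project indexes the same shapefile by LocationID.
gdf = gdf.set_index("OBJECTID")
print gdf.head()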
import pandas as pd

from utility import printHeader


# This generator automatically gives IDs to axes, useful for the large number of subplots in Figures 3 and 4.
def generateAxes(rows, cols):
    id = 0
    while id < rows * cols:
        yield_x = id // cols
        yield_y = id % cols
        yield (yield_x, yield_y)
        id += 1


################################################
# Loading the dataset and creating new features
################################################
printHeader("Loading the dataset and creating new features")

# Load the data from the pickle file we created
combined = pd.read_pickle("data/combined_pre_cleaning.pickle")
print "Loaded"

# Calculate a new feature - the tip percentage
combined["tip_percentage"] = 100 * combined["tip_amount"] / combined["total_amount"]

# Create a duration feature, first converting the datetimes to pandas datetime format
combined["dropoff_datetime"] = pd.to_datetime(combined["dropoff_datetime"])
combined["pickup_datetime"] = pd.to_datetime(combined["pickup_datetime"])
combined["duration"] = (combined["dropoff_datetime"] - combined["pickup_datetime"]) \
    .apply(lambda td: td.total_seconds())
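# A minimal sketch of how generateAxes might drive a grid of subplots; the 2x2 grid shape and the
# placeholder titles are assumptions for illustration, not the original figure code.
import matplotlib.pyplot as plt

rows, cols = 2, 2
fig, axes = plt.subplots(rows, cols)
for (x, y) in generateAxes(rows, cols):
    # The generator walks the grid row by row, so each axis gets a unique (row, column) ID
    axes[x, y].set_title("Subplot (%s, %s)" % (x, y))
plt.tight_layout()
plt.show()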