from __future__ import division
import pandas as pd
import matplotlib.pyplot as plt

from utility import printHeader

###########################
# Loading the data
###########################
printHeader("Loading the data")
# We load the csv in to a dataframe. We supply an explicit date format, as
# otherwise pandas assumes American date formats.
# Fix: the original called pd.datetime.strptime — the pd.datetime alias was
# deprecated and removed in pandas 2.0. pd.to_datetime with an explicit
# format parses the same '%d.%m.%Y %H:%M' strings identically.
# NOTE(review): date_parser itself is deprecated in recent pandas in favour
# of date_format — confirm the pinned pandas version before upgrading.
weather = pd.read_csv(
    "data/Weather_extract_2018.csv",
    sep=",",
    index_col=0,
    parse_dates=True,
    date_parser=lambda x: pd.to_datetime(x, format='%d.%m.%Y %H:%M'))

# We replace 'Trace of precipitation' with a very low value as discussed in the report, just so we can plot it.
# The column holds strings until the replacement, hence the trailing astype(float).
precipitation_clean = weather["RRR"].dropna().replace("Trace of precipitation",
                                                      0.00001).astype(float)

# Parenthesised print produces the same output under Python 2 and Python 3.
print("Weather Data Loaded.")

###########################
# Plot the two weather features
###########################
# Announce the plotting section on stdout (printHeader is a project helper).
printHeader("Plotting the two weather features")

# Create subplots
# NOTE(review): the subplot-creation code that followed this comment is
# missing from this fragment of the file.
# --- "Beispiel #2" / "0": separator left over from the code-listing paste; not executable code ---
# Define the local target directory for files to be downloaded to
targetDir = "data/"

# Define the proportion of each file we want to include in the dataset
# NOTE(review): this fragment's imports are not visible here — 1 / 120 is a
# non-zero fraction only if `from __future__ import division` (or Python 3)
# is in effect for this file; under plain Python 2 it would truncate to 0.
sample_proportion = 1 / 120

# Define the path for the output file for tabular data
output = "data/combined_pre_cleaning.pickle"

# End of configuration parameters
################################################

###########################
# Spatial Data
###########################
printHeader("Spatial data")

# Download the taxi-zone shapefile archive from the TLC website.
# NOTE(review): urllib.urlretrieve is the Python 2 spelling; under Python 3
# this would be urllib.request.urlretrieve — confirm the target interpreter.
urllib.urlretrieve("https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip",
                   "taxi_zones.zip")

# Unzip the archive. The context manager guarantees the zip handle is closed
# even if extractall raises (the original leaked it on error).
print("Unzipping spatial data")
with zipfile.ZipFile("taxi_zones.zip", 'r') as taxi_zones_zip:
    taxi_zones_zip.extractall("spatialref")
print("Unzipping spatial data complete")

###########################
# Tabular Data
###########################
# --- "Beispiel #3" / "0": separator left over from the code-listing paste; not executable code ---
from __future__ import division
import pandas as pd
from utility import printHeader

###########################
# Loading cleaned data
###########################
printHeader("Loading cleaned data")

# We load the three pickles we created during cleaning
combined = pd.read_pickle("data/combined_post_cleaning_trips.pickle")
# sort_index() orders the weather observations by their index (presumably the
# observation datetime — confirm against the cleaning script) so later
# time-based lookups behave predictably.
weather = pd.read_pickle("data/weather_2018.pickle").sort_index()
spatial_ref_data = pd.read_pickle("data/spatial.pickle")
print "Loaded."

###########################
# Creating new features
###########################
printHeader("Creating new features")


# Label a trip's tip percentage for classification:
#   1 -> low/no tip   (0 <= tip_percentage < 0.1)
#   0 -> tipped       (tip_percentage >= 0.1)
#   2 -> anything else (negative values, and NaN, which fails every comparison)
# (The original comment had the 1/0 labels described backwards and omitted
# the catch-all branch.)
def classifyTipPercentage(tip_percentage):
    """Return class label 1 (low tip), 0 (tipped) or 2 (invalid/other)."""
    # Chained comparison replaces the original `< 0.1 and >= 0` pair.
    if 0 <= tip_percentage < 0.1:
        return 1
    if tip_percentage >= 0.1:
        return 0
    return 2
# --- "Beispiel #4" / "0": separator left over from the code-listing paste; not executable code ---
import geopandas as gpd
from sklearn.mixture import GaussianMixture
from utility import printHeader

import math
import scipy.stats as st

import matplotlib.pyplot as plt

# Our plots often refer to yellow/green, I have chosen hex-codes which match the official colours of NYC taxis.
colorscheme = ["#fce300", "#8db600"]

###########################
# Loading cleaned data with new features
###########################
printHeader("Loading cleaned data with new features")

# Load all of the data and combine it in to a single data frame
# NOTE(review): pd (pandas) is not imported in this fragment's visible
# import block — confirm the full file imports it.
combined = pd.read_pickle("data/combined_post_new_features.pickle")
print "Tabular Data Loaded."

# Read the shape files in to a geopandas dataframe, keyed by the TLC zone id
spatial = gpd.read_file("spatialref/taxi_zones.shp").set_index("LocationID")
print "Spatial Data Loaded"

###########################
# Figure 18 - Mixture Model with Distribution of Tip Percentage
###########################
printHeader("Figure 18 - Mixture Model with Distribution of Tip Percentage")

# We take a sample of the dataset for the mixture model to speed up processing
# Utilties
from utility import printHeader
from utility import printPrettyCM

# Keras by default is extremely verbose about the processes/threads it uses so we switch this off.
# NOTE(review): KMP_WARNINGS is read by the Intel OpenMP runtime (used by
# MKL-backed builds) — presumably what the verbosity comes from; must be set
# before the library initialises.
import os
os.environ['KMP_WARNINGS'] = 'off'

# Our plots often refer to yellow/green, I have chosen hex-codes which match the official colours of NYC taxis.
# NOTE(review): this ordering (green first) is the reverse of the colorscheme
# used in another fragment of this file — confirm which ordering each plot expects.
colorscheme = ["#8db600", "#fce300"]

###########################
# Loading Data
###########################
printHeader("Loading Data")
# We load our cleaned tabular data file from the pickle file we saved.
# To speed up training, you can limit the sample size of this by sampling a percentage
combined = pd.read_pickle("data/combined_post_new_features.pickle")
print "%s records loaded from dataset." % len(combined)

###########################
# Features
###########################
printHeader("Features")

# To help manage all the new features we've created, we construct a list of all the features we want to include in
# our model
# Fix: the closing bracket of this list had been replaced by a pasted
# "Beispiel #6 / 0" listing-separator; restored here.
features_general = [
    "pickup_x", "pickup_y", "dropoff_x", "dropoff_y", "passenger_count",
    "trip_distance", "duration"
]

# One-hot encoded rate-code columns all share the "Ratecode_" prefix;
# startswith replaces the original manual prefix-slice comparison.
features_ratecode = [
    feature for feature in combined.columns
    if feature.startswith("Ratecode_")
]
# One-hot encoded vehicle-colour columns share the "colour_" prefix;
# startswith replaces the original manual prefix-slice comparison.
features_colour = [
    feature for feature in combined.columns
    if feature.startswith("colour_")
]
# One-hot encoded temporal columns share the "temporal_" prefix.
features_temporal = [
    feature for feature in combined.columns
    if feature.startswith("temporal_")
]
# Weather features: temperature (T) and precipitation (RRR).
features_weather = ["T", "RRR"]

# We create our big list of features and print it out
# NOTE(review): features_dropoff / features_pickup are defined in a part of
# the file not shown in this fragment — confirm they exist before this point.
printHeader("Features")
features = features_general + features_dropoff + features_pickup + \
           features_colour + features_temporal + features_weather
for feature in features:
    # Parenthesised print produces the same output under Python 2 and 3.
    print("* %s" % feature)

###########################
# Train/Test Split
###########################
printHeader("Train/Test Split")
# Hold out a third of the examples for evaluation; the target is the
# tip_percentage column.
# NOTE(review): train_test_split (sklearn) is imported in a part of the file
# not shown here — confirm. No random_state is fixed, so the split differs
# between runs.
X_train, X_test, y_train, y_test = train_test_split(combined[features],
                                                    combined["tip_percentage"],
                                                    test_size=0.33)

print "X_train: %s" % len(X_train)
print "X_test: %s" % len(X_test)
from __future__ import division
import geopandas as gpd
from shapely.geometry import Polygon

from utility import printHeader

###########################
# Loading the data and print out the CRS and features
###########################
printHeader("Loading the data and print out the CRS and features")
# Load and join the spatial data
gdf = gpd.read_file("spatialref/taxi_zones.shp")

# Print out the coordinate reference system
# NOTE(review): gdf.crs["init"] assumes the legacy proj4-dict CRS interface
# (geopandas < 0.7 / pyproj < 2). Newer geopandas returns a pyproj.CRS object
# where this subscript fails — confirm the pinned library versions.
print "Coordinate system: %s\n" % gdf.crs["init"]

# Print out the features of the spatial data
print "Features:"
for feature in gdf.columns:
    print "* %s" % feature

###########################
# Check whether we should use the OBJECT ID or the LocationID as our index
###########################
printHeader(
    "Check whether we should use the OBJECT ID or the LocationID as our index")

# Show every row where the two candidate keys disagree; an empty frame means
# the two columns are interchangeable as an index.
print "The following entries exist in the dataframe where LocationID is not the same as OBJECTID"
print gdf[(gdf["OBJECTID"] != gdf["LocationID"])]

# Index by OBJECTID
# --- "Beispiel #8" / "0": separator left over from the code-listing paste; not executable code ---
# This generator automatically gives IDs to axes, useful for the large number of subplots in Figures 3 and 4.
def generateAxes(rows, cols):
    """Yield (row, col) subplot coordinates in row-major order for a rows x cols grid.

    Produces (0, 0), (0, 1), ..., (rows-1, cols-1); yields nothing when
    rows * cols == 0.
    """
    # divmod(i, cols) == (i // cols, i % cols), i.e. the row-major coordinate
    # pair the original computed by hand with a while-loop counter (which was
    # named `id`, shadowing the builtin).
    for index in range(rows * cols):
        yield divmod(index, cols)


################################################
# Loading the dataset and creating new features
################################################

printHeader("Loading the dataset and creating new features")

# Load the data from the pickle file we created
combined = pd.read_pickle("data/combined_pre_cleaning.pickle")
print ("Loaded")

# Calculate a new feature - the tip percentage
# NOTE(review): the denominator is total_amount, which for TLC trip data
# presumably already includes the tip itself — confirm this is the intended
# base. Rows where total_amount == 0 presumably yield inf/NaN (pandas
# elementwise division does not raise) and are left for later cleaning.
combined["tip_percentage"] = 100 * combined["tip_amount"] / (combined["total_amount"])

# Create a duration feature, first converting the datetimes to pandas datetime format
combined["dropoff_datetime"] = pd.to_datetime(combined["dropoff_datetime"])
combined["pickup_datetime"] = pd.to_datetime(combined["pickup_datetime"])
# Trip duration in seconds (timedelta converted via total_seconds()).
combined["duration"] = (combined["dropoff_datetime"] - combined["pickup_datetime"]) \
    .apply(lambda td: td.total_seconds())