def predict():
    """Query the local prediction API for each station and write a CSV report.

    For every station in ``STATIONS`` this calls the prediction service on
    localhost:4502 and writes one CSV row per station (number, timestamp,
    predicted available stands, predicted bikes) to a report file named after
    the current local time.  Rows for failed API calls are left incomplete
    (number and timestamp only), matching the original behavior.

    Side effects: creates the "report" folder, performs HTTP GETs, writes one
    report file, and appends the report timestamp to the module-level
    ``time_arr`` once per successful station.
    """
    now = round(time.time())
    now_str = time.strftime(Common.REPORT_FILE_NAME_FORMAT, time.localtime(now))
    Common.create_folder("report")
    with open(FILE_NAME_FORMAT.format(now_str), "w") as f:
        # Collect the report in a list and join once at the end instead of
        # repeated quadratic string concatenation.
        parts = [
            "Number,Time,Pred Available Bike Stands,Pred Bikes,Actual Time,Actual Available Bike Stands,Actual Bikes\n"
        ]
        for station in STATIONS:
            parts.append(str(station) + "," + str(now) + ",")
            url = f"http://localhost:4502/api/search?station={station}&minutes={TIME_DELAY}"
            response = requests.get(url)
            json_data = response.json()
            status = int(json_data["status"])
            print(f"Getting URL {url} responses {status} => {json_data}")
            if status == 200:
                # hoist the repeated "data" lookup
                data = json_data["data"]
                parts.append(str(data["available_bike_stands"]) + ",")
                parts.append(str(data["available_bikes"]) + "\n")
                time_arr.append(now_str)
            else:
                print(
                    f"An error occurs while calling the API to predict - {json_data['status']}"
                )
                # leave the row incomplete, as before
                parts.append("\n")
            #time.sleep(20)
        f.write("".join(parts))
def count_check_out(diff):
    """Return abs(diff) when *diff* is negative, otherwise 0.

    NOTE(review): presumably a negative change in available stands means
    bikes were checked out -- confirm against the caller.
    """
    if (diff < 0):
        return abs(diff)
    return 0


start = time.time()

#########################################################
##################### READ RAW DATA #####################
#########################################################

# get the current working directory
working_dir = os.getcwd()
# ensure the temp and clean-data output folders exist before processing
Common.create_folder(f"{working_dir}/temp")
Common.create_folder(Common.CLEAN_DATA_DIR)

# change to raw-data directory to fetch JSON files
data_dir = os.path.join(working_dir, "new-data")
os.chdir(data_dir)
print(f"Change current directory to {data_dir}")

# get number of JSON files underneath a directory
files = fnmatch.filter(os.listdir(data_dir), '*.json')
n_files = len(files)
print(f"Total JSON files is {n_files}")

# Expected file-name shape, e.g. "Oct-14-10-2017".
# NOTE(review): groups look like (month name)-(day)-(month number)-(year),
# and the pattern is not a raw string, so "\d" relies on lenient escape
# handling -- confirm, and prefer r"..." when this line may be changed.
JSON_FILE_NAME_PATTERN = "([A-Z]{1}[a-z]{2})-([0-2][0-9]|3[0-1])-(0[0-9]|1[0-2])-\d{4}"

file_dictionaries = {}  # 424 items

# loop body continues beyond this excerpt
for f in files:
"number": "Number", "name": "Name", "address": "Address", "date": "Date", "time": "Time", "weekday": "Weekday", "bike_stands": "Bike Stands", "diff": "Diff", "available_bike_stands": "Available Stands", "check_in": "Check In", "check_out": "Check Out" }) ############################################################### ############### SAVE PREPROCESSING DATA TO FILE ############### ############################################################### print("Saving data to CSV file") # make the saved data preparation directory Common.create_folder(Common.CLEAN_DATA_DIR) # delete db_all_data.csv file if it exists there Common.delete_file(Common.CLEAN_DATA_FILE_FULL_PATH) # save the data preparation for using later Common.save_csv(df, Common.CLEAN_DATA_FILE_FULL_PATH) # print result out #print(df) #print(df[["Address","Date","Time","Bike Stands","Available Stands","Check In","Check Out"]]) end = time.time() print("Done preparation after {} seconds".format((end - start)))
import pandas as pd
import time
from common import Common
import sys
import math
import fnmatch
import re
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.externals import joblib  # for saving and loading model
from sklearn import preprocessing  # label encoder
from sklearn.metrics import mean_squared_error  # calculate MSE

start = time.time()

# ensure the plot output folders exist
Common.create_folder(Common.EVALUATION_PLOTS_DIR)
Common.create_folder(Common.UNSEEN_PREDICTING_PLOTS_DIR)

# station number -> total bike stands; any station not listed has 23 stands
_STATION_TOTAL_STANDS = {79: 27, 5: 40, 100: 25, 66: 40}


def setBikeStands(number):
    """Return the total number of bike stands for station *number*.

    Known station capacities come from the ``_STATION_TOTAL_STANDS`` lookup
    table; any other station defaults to 23 stands (identical to the
    original if/elif chain).
    """
    return _STATION_TOTAL_STANDS.get(number, 23)
Description: Plotting distribution of activity throughout the week
Finding the 10 busiest and 10 least busy stations
'''
import os
import numpy as np
import pandas as pd
import calendar
import time
from common import Common
import matplotlib.pyplot as plt

start = time.time()

# ensure the plots output folder exists
Common.create_folder(Common.PLOTS_DIR)

# get the relative path of preparation data file
rel_path = os.path.relpath(Common.CLEAN_DATA_FILE_FULL_PATH)
# read CSV files using Pandas
df = pd.read_csv(rel_path, delimiter=",", parse_dates=["Date"])

# see how many occurrence of data for date, the date which has minor values (<10) means the data is somehow missing
#print(df.groupby([df["Date"].dt.date])["Date"].count())
# after viewing, notice that July 2016 has minor values
#print(df[df["Date"].dt.month == 7].reset_index(drop=True))

# aggregation call continues beyond this excerpt
top_check_ins = pd.DataFrame(
    df.groupby(
import pandas as pd
import time
from common import Common
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import math
from sklearn import preprocessing  # label encoder
from sklearn import ensemble  # library of Gradient Boosting
from sklearn.model_selection import train_test_split  # split data to training set and testing set
from sklearn.metrics import mean_squared_error  # calculate MSE
from sklearn.externals import joblib  # for saving and loading model
import sys

start = time.time()

# ensure the predicting-plots output folder exists
Common.create_folder(Common.PREDICTING_PLOTS_DIR)

# get clusters dataframe
clusters = Common.get_dataframe_from_file(Common.CLUSTERED_DATA_FILE_FULL_PATH,
                                          True)

# get all data dataframe
all_df = Common.get_dataframe_from_file(Common.CLEAN_DATA_FILE_FULL_PATH, True)
# keep exactly one year of data (both endpoint dates inclusive)
all_df = all_df[(all_df["Date"] >= "2016-10-14")
                & (all_df["Date"] <= "2017-10-14")].reset_index(drop=True)

# left merge these two dataframes together based on Number, Date and Time
# NOTE(review): the actual merge keys are only Number and Time (Date is not
# a key) -- confirm whether the comment or the key list is what's intended.
merged_df = pd.merge(all_df,
                     clusters[["Number", "Time", "Cluster"]],
                     on=["Number", "Time"],
                     how="left")
import os
import numpy as np
import pandas as pd
import time
from common import Common
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import datetime as dt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import sys

start = time.time()

# ensure the clustering-plots output folder exists
Common.create_folder(Common.CLUSTERING_PLOTS_DIR)


def fill_na(df):
    """Fill NaN cells of *df* in place using neighbouring non-NaN values.

    NOTE(review): uses ``row.iteritems()`` (removed in modern pandas, renamed
    ``items``) and ``np.isnan`` (fails on non-numeric dtypes) -- appears to
    assume an all-numeric DataFrame with a default RangeIndex; confirm.
    """
    # iterate through rows
    for i, row in df.iterrows():
        # iterate through columns of the current row
        for j, column in row.iteritems():
            # if the current row is the first row and it has N/A value, fill in with the next non-N/A value
            if (i == 0 and np.isnan(df.loc[i, j])):
                k = i + 1
                # iterate to find the next non-N/A value
                while (np.isnan(df.loc[k, j])):
                    k = k + 1
                df.at[i, j] = df.at[k, j]
            elif (  # condition continues beyond this excerpt