def __init__(self, series=None, n_preds=672, n_weeks=5, slen=672,
             alpha=0.816, beta=0.0001, gamma=0.993,
             data_file="access_Point_1_incoming.csv"):
    """
    Store the model defaults and load the input data from a CSV file.

    :param series: initial value kept in ``default_series``
    :param n_preds: default number of datapoints to predict
    :param n_weeks: default number of training weeks
    :param slen: default stride length
    :param alpha: default alpha value
    :param beta: default beta value
    :param gamma: default gamma value
    :param data_file: file name, joined onto RESOURCES_DIR
    """
    # Smoothing factors.
    self.default_alpha, self.default_beta, self.default_gamma = alpha, beta, gamma

    # Training / forecasting sizes.
    self.default_num_predictions = n_preds
    self.default_num_train_weeks = n_weeks
    self.default_stride_length = slen

    self.default_series = series
    self.data_column_name = ""

    # The writer is built with blank connection settings; in this method it is
    # only used to read the CSV file into a dataframe.
    blank_settings = dict(host="", port=0, username="", password="",
                          database="", new_measurement="",
                          new_cvs_file_name="")
    self.csvWriter = CsvWriter(**blank_settings)

    source_path = path.join(RESOURCES_DIR, data_file)
    self.returned_data_frame = self.csvWriter.csv_file_to_dataframe(
        new_filepath=source_path, new_row_start=0)
def __init__(self, database="predicted_data"):
    """
    Set the default stride and series count and create the data writer.

    :param database: database name handed to the CsvWriter; host, port and
        credentials are taken from ``db_config``
    """
    self._default_stride = Stride.WEEKLY
    self._num_of_series = 8
    # No model has been chosen yet.
    self._selected_model = None

    connection_settings = {
        "host": db_config.host,
        "port": db_config.port,
        "username": db_config.username,
        "password": db_config.password,
        "database": database,
    }
    self._data_writer = CsvWriter(**connection_settings)
def __init__(self, predicted, data_file="access_Point_1_incoming.csv"):
    """
    Keep the predicted values and load the actual values for the same period.

    :param predicted: array indexed as ``predicted[row, column]``; column 0
        of the first and last rows is converted with ``pd.Timestamp`` and
        used as the date-selection bounds
    :param data_file: file name, joined onto RESOURCES_DIR
    """
    self.predictedValues = predicted

    # Writer built with blank connection settings; here it only reads the CSV.
    blank_settings = dict(host="", port=0, username="", password="",
                          database="", new_measurement="",
                          new_cvs_file_name="")
    self.csvWriter = CsvWriter(**blank_settings)

    source_path = path.join(RESOURCES_DIR, data_file)
    first_stamp = pd.Timestamp(predicted[0, 0])
    last_stamp = pd.Timestamp(predicted[-1, 0])
    self.actualValues = self.csvWriter.csv_file_to_dataframe_date_selection(
        source_path, first_stamp, last_stamp)
def __init__(self, config_object=None):
    """
    Pick the configuration, create the data writer and generate the data.

    :param config_object: a GeneratorConfig instance; anything else
        (including None) is replaced by a fresh ``GeneratorConfig()``
    """
    has_valid_config = isinstance(config_object, GeneratorConfig)
    self._Config = config_object if has_valid_config else GeneratorConfig()

    self._Columns = 'avg_hrcrx_max_byt'

    connection_settings = {
        "host": db_config.host,
        "port": db_config.port,
        "username": db_config.username,
        "password": db_config.password,
        "database": self._Config.Database,
    }
    self._data_writer = CsvWriter(**connection_settings)

    # The configured function type produces the array stored in Dist_Array.
    self.Dist_Array = self._Config.Func_Type.generate()
class ErrorAnalysis:
    """
    Compares predicted values against the actual values stored in a CSV file.

    The actual values are loaded for the period delimited by the first and
    last timestamps found in column 0 of the predicted array.
    """

    def __init__(self, predicted, data_file="access_Point_1_incoming.csv"):
        """
        :param predicted: array indexed as ``predicted[row, column]`` with
            timestamps in column 0 and predicted values in column 1
        :param data_file: file name, joined onto RESOURCES_DIR
        """
        # Writer built with blank connection settings; here it only reads the CSV.
        self.csvWriter = CsvWriter(host="", port=0, username="", password="",
                                   database="", new_measurement="",
                                   new_cvs_file_name="")
        self.predictedValues = predicted
        self.actualValues = self.csvWriter.csv_file_to_dataframe_date_selection(
            path.join(RESOURCES_DIR, data_file),
            pd.Timestamp(predicted[0, 0]),
            pd.Timestamp(predicted[-1, 0]))
        # Filled in by compute_error(); defined here so that reading them
        # before the first computation yields None instead of AttributeError.
        self.meanSquaredError = None
        self.meanAbsoluteError = None

    def compute_error(self):
        """
        Compute and print the mean squared and mean absolute error.

        The results are also stored in ``meanSquaredError`` and
        ``meanAbsoluteError``.
        """
        actual = self.actualValues["avg_hrcrx_max_byt"].tolist()
        predicted = self.predictedValues[:, 1].tolist()
        self.meanSquaredError = metrics.mean_squared_error(actual, predicted)
        self.meanAbsoluteError = metrics.mean_absolute_error(actual, predicted)
        print("Mean Squared Error: " + str(self.meanSquaredError))
        print("Mean Absolute Error: " + str(self.meanAbsoluteError))

    def plot_predicted_vs_actual(self):
        """Plot the predicted and the actual values on the same figure."""
        pyplot.plot(self.predictedValues[:, 0], self.predictedValues[:, 1])
        # The x values of the actual series come from the column named "".
        pyplot.plot(np.array(self.actualValues[""].tolist()),
                    np.array(self.actualValues["avg_hrcrx_max_byt"].tolist()))
        pyplot.legend(['Predicted Values', 'Actual Values'])
        pyplot.show()
def __init__(self, default_stride=Stride.WEEKLY, window_length=8,
             data_file="access_Point_1_incoming.csv"):
    """
    Store the stride and window settings and load the matching rows.

    :param default_stride: Stride member; its ``value`` is multiplied by
        ``window_length`` to get the last row to read
    :param window_length: number of series in the window
    :param data_file: file name, joined onto RESOURCES_DIR
    """
    self.defaultStride = default_stride
    self.windowLength = window_length

    # Writer built with blank connection settings; here it only reads the CSV.
    blank_settings = dict(host="", port=0, username="", password="",
                          database="", new_measurement="",
                          new_cvs_file_name="")
    self.csvWriter = CsvWriter(**blank_settings)

    source_path = path.join(RESOURCES_DIR, data_file)
    last_row = self.defaultStride.value * self.windowLength
    # Two columns come back: one with the timeseries, one with bytecount values.
    self.returned_data_frame = self.csvWriter.csv_file_to_dataframe(
        new_filepath=source_path, new_row_start=0, new_row_end=last_row)
def __init__(self, database='generated_data'):
    """
    Create the default generator configuration and the data writer.

    :param database: database name handed to the CsvWriter; host, port and
        credentials are taken from ``db_config``
    """
    self.gen_config = GeneratorConfig()
    # The generator is deliberately not created (and so not run) on init.
    self.generator = None
    # NOTE(review): the original author questioned whether this attribute is needed.
    self._selected_model = None

    connection_settings = {
        "host": db_config.host,
        "port": db_config.port,
        "username": db_config.username,
        "password": db_config.password,
        "database": database,
    }
    self._data_writer = CsvWriter(**connection_settings)
def call_model(self):
    """
    Smooth the configured CSV series and pair the result with new timestamps.

    Reads the first two columns of ``self.default_csv_filename`` (up to row
    ``self.default_rtu``), stores the second column in ``default_series``,
    runs ``self.exponential_smoothing`` over it and records the name of the
    second column in ``data_column_name``.

    :return: numpy array built from [timestamps, smoothed values], transposed
    """
    csv_reader = CsvWriter(host=db_config.host,
                           port=db_config.port,
                           username=db_config.username,
                           password=db_config.password,
                           database='predicted_data')
    frame = csv_reader.csv_file_to_dataframe(
        new_filepath=self.default_csv_filename,
        new_row_end=self.default_rtu,
        usecols=[0, 1])

    # Flattening the two-column frame interleaves the columns: even positions
    # hold column 0 (used as timestamps), odd positions hold column 1.
    flat_values = list(frame.values.flatten())

    # Position of the timestamp that anchors the generated date range.
    if self.default_rtu is not None:
        anchor_position = (self.default_rtu - 1) * 2
    else:
        anchor_position = -2
    last_time_stamp = flat_values[anchor_position]

    self.default_series = flat_values[1::2][:self.default_rtu]
    smoothed_values = self.exponential_smoothing(self.default_series,
                                                 self.default_alpha)

    # One extra period is generated and the first entry dropped, so the range
    # starts one 15-minute step after the anchor timestamp.
    future_stamps = pd.date_range(last_time_stamp,
                                  periods=672 + 1,
                                  freq='15min')[1:]
    print(len(future_stamps))

    paired = np.array([future_stamps, smoothed_values]).transpose()
    self.data_column_name = frame.columns[1]
    return paired
class Generator:
    """
    Generates data from a GeneratorConfig and writes it to CSV or database.

    The generated array is kept in ``Dist_Array``; column 0 is parsed as
    '%Y-%m-%d %H:%M:%S' timestamps and the remaining column becomes the
    'avg_hrcrx_max_byt' data column.
    """

    def __init__(self, config_object=None):
        """
        :param config_object: a GeneratorConfig instance; anything else
            (including None) is replaced by a fresh ``GeneratorConfig()``
        """
        if isinstance(config_object, GeneratorConfig):
            self._Config = config_object
        else:
            self._Config = GeneratorConfig()
        self._Columns = 'avg_hrcrx_max_byt'
        self._data_writer = CsvWriter(host=db_config.host,
                                      port=db_config.port,
                                      username=db_config.username,
                                      password=db_config.password,
                                      database=self._Config.Database)
        self.Dist_Array = self._Config.Func_Type.generate()

    def _generated_csv_path(self):
        """Return the path of the '<model name>_generated.csv' file."""
        model_name = self._Config.Func_Type.Name
        return path.join(RESOURCES_DIR, model_name + "_generated.csv")

    def nparray_to_dataframe(self):
        """
        Convert ``Dist_Array`` into a dataframe indexed by timestamp.

        :return: dataframe with a single data column named ``self._Columns``
        """
        indexes = pd.DataFrame(self.Dist_Array[:, 0])
        indexes[0] = pd.to_datetime(indexes[0], format='%Y-%m-%d %H:%M:%S')
        cols = [self._Columns]
        df = pd.DataFrame(data=self.Dist_Array[0:, 1:],
                          index=indexes[0],
                          columns=cols)
        return df

    def write_data_to_csv(self):
        """Write the generated data to '<model name>_generated.csv'."""
        df = self.nparray_to_dataframe()
        if not isinstance(df, pd.DataFrame):
            print("Error reading the data from database.")
            # Nothing can be written; previously execution fell through and
            # failed on df.to_csv right after reporting the error.
            return
        df.to_csv(self._generated_csv_path())

    def write_data_to_database(self):
        """
        Upload the generated data to the '<model name>_generated' measurement.

        The data goes through a temporary CSV file which is removed
        afterwards, even when the upload raises.
        """
        df = self.nparray_to_dataframe()
        model_name = self._Config.Func_Type.Name
        csv_path = self._generated_csv_path()
        df.to_csv(csv_path)
        try:
            self._data_writer.csv_file_to_db(
                measurement_to_use=model_name + '_generated',
                new_csv_file_name=csv_path)
        finally:
            remove(csv_path)
def test_csv_file_to_data(self):
    """
    Round-trip integration test: CSV -> database -> CSV.

    Loads the default CSV into the database, queries it back into
    'temp2.csv' and checks the result against 'temp.csv'. The output file is
    removed afterwards, whether or not the comparison passed.
    """
    data_return_path = os.path.join(RESOURCES_DIR, 'temp2.csv')
    initial_data_path = os.path.join(RESOURCES_DIR, 'temp.csv')
    try:
        test_csv_write3 = CsvWriter(self.host, self.port, self.username,
                                    self.password, self.database)
        test_csv_write3.csv_file_to_db()
        test_csv_write3.data_to_csv_file('select * from per15min',
                                         new_csv_file_name=data_return_path,
                                         fillGaps=False)
        self.assertTrue(
            self.compare_csv_files(data_return_path, initial_data_path),
            "Integration test failed, file is not the same")
    except ConnectionError as error:
        # NOTE(review): a connection failure is only reported and the test
        # still passes - consider self.skipTest() if that is not intended.
        print("Test: Failed - {0}\n".format(error))
    finally:
        # Clean up even when the assertion fails; the file may not exist if
        # the failure happened before it was written.
        if os.path.exists(data_return_path):
            os.remove(data_return_path)
class TestCsvDf(TestCase):
    """Tests for the CSV-to-dataframe helpers, run against 'temp.csv'."""

    filepath = os.path.join(RESOURCES_DIR, 'temp.csv')
    start = 0
    end = 4
    dlt = False
    # Writer built with blank connection settings; only its CSV helpers are used.
    csvWriter = CsvWriter(host="", port=0, username="", password="",
                          database="", new_measurement="",
                          new_cvs_file_name="")

    def test_csv_to_df(self):
        """The first data row survives a CSV -> dataframe -> CSV round trip."""
        df = self.csvWriter.csv_file_to_dataframe(new_filepath=self.filepath,
                                                  new_row_start=self.start,
                                                  new_row_end=self.end,
                                                  delete=self.dlt,
                                                  usecols=[0, 1])
        compare_file = os.path.join(RESOURCES_DIR, 'compare.csv')
        df.to_csv(compare_file)
        try:
            with open(self.filepath) as f1:
                next(f1)  # skip the header line
                sone = next(f1)
            with open(compare_file) as f2:
                next(f2)  # skip the header line
                # Strip the leading index written by to_csv ('0' and ',').
                stwo = next(f2).lstrip('0,/')
        finally:
            # Remove the scratch file even if reading it failed.
            os.remove(compare_file)
        self.assertEqual(sone, stwo,
                         "The specified function test failed, not equal")

    def test_invalid_parameters(self):
        """A missing file and an out-of-range start row both raise."""
        # NOTE(review): these calls use the free function csv_to_dataframe,
        # not self.csvWriter.csv_file_to_dataframe - confirm it is imported.
        with self.assertRaises(FileNotFoundError):
            csv_to_dataframe(filepath="aqqa",
                             row_start=self.start,
                             row_end=self.end,
                             dlt=self.dlt,
                             usecols=[0, 1])
        with self.assertRaises(StopIteration):
            csv_to_dataframe(filepath=self.filepath,
                             row_start=100000000,
                             row_end=self.end,
                             dlt=self.dlt,
                             usecols=[0, 1])

    def test_csv_data_frame(self):
        """Date selection returns exactly the two rows inside the range."""
        a = csv_to_dataframe_date_selection(
            file_path=self.filepath,
            usecols=[0, 1],
            start_date=pd.Timestamp("2017-03-18 00:15:00"),
            end_date=pd.Timestamp("2017-03-18 00:30:00"))
        self.assertIsNotNone(a)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(2, len(a))
        self.assertEqual(875, (np.array(a)[0, 1]))
        self.assertEqual(894, np.array(a)[1, 1])
class HoltWinters:
    """
    Holt-Winters (triple exponential smoothing) forecasting model.

    The input data is read from a CSV file at construction time.
    ``set_parameters`` lets the user override the defaults interactively and
    ``call_model`` produces the forecast together with its timestamps.
    """

    def __init__(self, series=None, n_preds=672, n_weeks=5, slen=672,
                 alpha=0.816, beta=0.0001, gamma=0.993,
                 data_file="access_Point_1_incoming.csv"):
        """
        :param series: initial value kept in ``default_series``
        :param n_preds: default number of datapoints to predict
        :param n_weeks: default number of training weeks
        :param slen: default seasonal stride length
        :param alpha: default alpha value
        :param beta: default beta value
        :param gamma: default gamma value
        :param data_file: file name, joined onto RESOURCES_DIR
        """
        self.default_series = series
        self.default_stride_length = slen
        self.default_alpha = alpha
        self.default_beta = beta
        self.default_gamma = gamma
        self.default_num_predictions = n_preds
        self.default_num_train_weeks = n_weeks
        self.data_column_name = ""
        # Writer built with blank connection settings; here it only reads the CSV.
        self.csvWriter = CsvWriter(host="", port=0, username="", password="",
                                   database="", new_measurement="",
                                   new_cvs_file_name="")
        self.returned_data_frame = self.csvWriter.csv_file_to_dataframe(
            new_filepath=path.join(RESOURCES_DIR, data_file), new_row_start=0)

    # This method is what is called by the PFramework to display the
    # parameters and allow them to be set to something other than the default.
    def set_parameters(self):
        """
        Asking user to change parameters specific to the model, if needed

        :return: None
        """
        print("The default number of datapoints to predict: {}".format(
            self.default_num_predictions))
        print("The default number of training weeks: {}".format(
            self.default_num_train_weeks))
        print("The default seasonal stride length: {}".format(
            self.default_stride_length))
        print("The default alpha value: {}".format(self.default_alpha))
        print("The default beta value: {}".format(self.default_beta))
        print("The default gamma value: {}".format(self.default_gamma))
        print(
            "Would you like to set the parameters for Holt-Winters first? [y]/[n]"
        )
        selection = input("Prompt: ")
        if selection.lower() == 'y':
            print("Choose the number of datapoints to predict")
            selection = input("Prompt: ")
            self.default_num_predictions = int(selection)
            print("Choose the number of training weeks")
            selection = input("Prompt: ")
            self.default_num_train_weeks = int(selection)
            print("Choose the seasonal stride length")
            selection = input("Prompt: ")
            self.default_stride_length = int(selection)
            print("Choose the desired alpha value")
            selection = input("Prompt: ")
            self.default_alpha = float(selection)
            print("Choose the desired beta value")
            selection = input("Prompt: ")
            self.default_beta = float(selection)
            print("Choose the desired gamma value")
            selection = input("Prompt: ")
            self.default_gamma = float(selection)

    def initial_trend(self, series, slen):
        """
        Average trend between the first two seasons, used as initial trend.

        :param series: observed values; needs at least ``2 * slen`` entries
        :param slen: season length
        :return: mean per-step difference between season 0 and season 1
        """
        total = 0.0
        for i in range(slen):
            total += float(series[i + slen] - series[i]) / slen
        return total / slen

    # An explanation of the reasoning for this can be found here:
    # http://www.itl.nist.gov/div898/handbook/pmc/section4/pmc435.htm
    def initial_seasonal_components(self, series, slen):
        """
        Initial seasonal value for every position inside a season.

        :param series: observed values
        :param slen: season length
        :return: dict mapping position (0..slen-1) to the average deviation
            of that position from its season's mean, over all full seasons
        """
        seasonals = {}
        season_averages = []
        n_seasons = int(len(series) / slen)
        # compute season averages
        for j in range(n_seasons):
            season_averages.append(
                sum(series[slen * j:slen * j + slen]) / float(slen))
        # compute initial values
        for i in range(slen):
            sum_of_vals_over_avg = 0.0
            for j in range(n_seasons):
                sum_of_vals_over_avg += series[slen * j + i] - season_averages[j]
            seasonals[i] = sum_of_vals_over_avg / n_seasons
        return seasonals

    def triple_exponential_smoothing(self, series, slen, alpha, beta, gamma,
                                     n_preds):
        """
        Run the additive Holt-Winters recursion and forecast ``n_preds`` points.

        :param series: observed values used for training
        :param slen: season length
        :param alpha: level smoothing factor
        :param beta: trend smoothing factor
        :param gamma: seasonal smoothing factor
        :param n_preds: number of points to forecast past the end of series
        :return: list with the forecast values only; negatives are clamped to 0
        """
        result = list()
        seasonals = self.initial_seasonal_components(series, slen)
        for i in range(len(series) + n_preds):
            if i == 0:  # initial values
                smooth = series[0]
                trend = self.initial_trend(series, slen)
                continue
            if i >= len(series):  # we are forecasting
                m = i - len(series) + 1
                new_level = (smooth + m * trend) + float(seasonals[i % slen])
                new_level = float(0) if new_level < 0 else new_level
                result.append(new_level)
            else:
                val = series[i]
                last_smooth, smooth = smooth, alpha * (
                    val - seasonals[i % slen]) + (1 - alpha) * (smooth + trend)
                trend = beta * (smooth - last_smooth) + (1 - beta) * trend
                seasonals[i % slen] = gamma * (val - smooth) + (
                    1 - gamma) * seasonals[i % slen]
        return result

    # This is the method called by the PFramework to initiate the generation
    # of data using the Holt-Winters algorithm.
    def call_model(self):
        """
        Fill the gaps in the loaded data, train and return the forecast.

        :return: numpy array built from [timestamps, forecast values],
            transposed; timestamps are 15 minutes apart
        """
        # build dataframe
        df = fg.fill_data_gaps(self.returned_data_frame.shape[0],
                               init_data=self.returned_data_frame)
        self.data_column_name = df.columns[1]
        # Fill the column that is actually fed to the model (column 1)
        # instead of a hard-coded column name.
        df[self.data_column_name] = df[self.data_column_name].fillna(0)

        # Flattening the two-column frame interleaves the columns; the odd
        # positions hold column 1, i.e. the values to smooth.
        tmp_series = list(df.values.flatten())
        tmp_default_series = tmp_series[1::2]

        # Build training set based on specified number of training weeks.
        # The slice end is exclusive, so no "- 1": dropping the last point
        # removed a whole season from the initial seasonal estimate and
        # shifted the forecast by one step relative to start_date below.
        tmp_training_count = (self.default_num_train_weeks *
                              self.default_stride_length)
        self.default_series = tmp_default_series[:tmp_training_count]

        smooth_series = self.triple_exponential_smoothing(
            self.default_series, self.default_stride_length,
            self.default_alpha, self.default_beta, self.default_gamma,
            self.default_num_predictions)

        # Sequential timestamps (per 15 min) from the start of the prediction
        # period, i.e. the first row after the training set. The timestamps
        # live in the column named ''.
        start_date = df[''][tmp_training_count]
        result_datetimes = pd.date_range(start_date,
                                         periods=len(smooth_series),
                                         freq='15min')

        # assign new timestamps to datapoints
        nparray_data = np.array([result_datetimes, smooth_series]).transpose()
        return nparray_data

    def get_data_column_name(self):
        """Return the name of the data column seen by the last call_model()."""
        return self.data_column_name
def setUp(self):
    """Check before every test that a CsvWriter can be built from the settings."""
    test_csv_write = CsvWriter(self.host, self.port, self.username,
                               self.password, self.database)
    # The msg argument is only displayed when the assertion FAILS, so it has
    # to describe the failure rather than the success.
    self.assertNotEqual(0, test_csv_write._client,
                        "Class was not generated properly: _client is 0")
class TrafficPredictor:
    """
    Interactive front-end that runs a prediction model and stores its output.

    The model is located by name under ``PModules``; its result can be sent
    to error analysis and is written to the database or a local CSV file.
    """

    _default_stride = None
    _num_of_series = None
    _selected_model = None
    _data_writer = None

    def __init__(self, database="predicted_data"):
        """
        :param database: database name handed to the CsvWriter; host, port
            and credentials are taken from ``db_config``
        """
        self._default_stride = Stride.WEEKLY
        self._num_of_series = 8
        self._selected_model = None
        self._data_writer = CsvWriter(host=db_config.host,
                                      port=db_config.port,
                                      username=db_config.username,
                                      password=db_config.password,
                                      database=database)

    def main(self):
        """Run the interactive menu: choose a model, predict, store the result."""
        print("Welcome to the Traffic Predictor!")
        print("Please choose your model (enter its index):")
        for x, model in enumerate(models):
            print("{0}: {1}".format(x, model))
        print("-: Exit")
        selection = input("Prompt: ")
        if selection == '-':
            return

        # Parse the index on its own so that non-numeric input is reported
        # instead of crashing with ValueError.
        try:
            model_index = int(selection)
        except ValueError:
            print("There's no model under index: {}".format(selection))
            return

        try:
            model = models[model_index]
            print("Please, wait...")
            # Named 'predictions' rather than 'np' so the numpy alias is not
            # shadowed inside this method.
            predictions = self.call_model(model)
            df = self.nparray_to_dataframe(predictions)
            print("Finished prediction")
            print(
                "Would you like to run Error analysis on the predicted data? [y]/[n]"
            )
            selection = input("Prompt: ")
            if selection.lower() == 'y':
                err_analysis = ErrorAnalysis(predictions)
                err_analysis.compute_error()
            print(
                "Would you like to write predicted data to database? [y]/[n]"
                "\nIf selected [n] the data will be written to local csv file"
            )
            selection = input("Prompt: ")
            if selection.lower() == 'y':
                self.write_data_to_database(model, df)
            else:
                self.write_data_to_csv(model, df)
        except IndexError:
            print("There's no model under index: {}".format(selection))
        except TypeError:
            print(
                "ERROR: The model import failed. Please make sure to properly add/choose your model."
            )
            # Bare raise keeps the original TypeError and its traceback.
            raise

    def call_model(self, model_name):
        """
        Locate, instantiate, configure and run the model called ``model_name``.

        :param model_name: name used for the package, module and class under
            ``PModules``
        :return: whatever the model's ``call_model()`` returns
        """
        model_root = 'PModules.' + model_name + "." + model_name + "." + model_name
        model = locate(model_root)
        self._selected_model = model()  # Your model class instance
        self._selected_model.set_parameters()
        result = self._selected_model.call_model()
        return result

    def write_data_to_csv(self, model_name, df):
        """
        Write ``df`` to '<model_name>_predicted.csv' in RESOURCES_DIR.

        :param model_name: prefix of the output file name
        :param df: dataframe to write; anything else is reported and skipped
        """
        if not isinstance(df, pd.DataFrame):
            print(
                "Error reading the data from database. Please test this query in Chronograf/Grafana."
            )
            # Nothing can be written; previously execution fell through and
            # failed on df.to_csv right after reporting the error.
            return
        df.to_csv(path.join(RESOURCES_DIR, model_name + "_predicted.csv"))

    def write_data_to_database(self, model_name, df):
        """
        Upload ``df`` to the '<model_name>_predicted' measurement.

        The data goes through a temporary CSV file which is removed
        afterwards, even when the upload raises.
        """
        csv_path = path.join(RESOURCES_DIR, model_name + "_predicted.csv")
        df.to_csv(csv_path)
        try:
            self._data_writer.csv_file_to_db(
                measurement_to_use=model_name + '_predicted',
                new_csv_file_name=csv_path)
        finally:
            remove(csv_path)

    def nparray_to_dataframe(self, nparray_data):
        """
        Convert the model output into a dataframe indexed by timestamp.

        :param nparray_data: 2-D array with timestamps in column 0 and the
            predicted values in the remaining column
        :return: dataframe whose single column is named after the selected
            model's data column
        """
        indexes = pd.DataFrame(nparray_data[:, 0])
        indexes[0] = pd.to_datetime(indexes[0], format='%Y-%m-%d %H:%M:%S')
        cols = [self._selected_model.get_data_column_name()]
        df = pd.DataFrame(data=nparray_data[0:, 1:],
                          index=indexes[0],
                          columns=cols)
        return df
class SimpleMovingAverage:
    """
    Calculates the Simple Moving Average on a daily/weekly basis

    This class makes N different series depending on the selected stride,
    96(4*24) in the case of daily and 672(4*24*7) in the case of weekly.
    This is under the assumption that there is a periodic relationship in the
    data. For example, for a daily stride, it is being assumed that there is
    a correlation between all the 9AM values that occur, and the prediction
    for the next 9AM value is a moving average of all the selected days
    before it.

    :param default_stride: represents the stride for calculating the moving average (DAILY/WEEKLY)
    :param window_length: Number of days in a single series
    :param data_file: Name of the file that exists in the predictor_resources folder
    :return: numpy array object with two columns, a timeseries object(in epoch format) and the predicted bytecount
    """
    formattedInput = []
    lastDate = ""
    data_column_name = ""

    def __init__(self, default_stride=Stride.WEEKLY, window_length=8,
                 data_file="access_Point_1_incoming.csv"):
        self.defaultStride = default_stride
        self.windowLength = window_length
        # Writer built with blank connection settings; here it only reads the CSV.
        self.csvWriter = CsvWriter(host="", port=0, username="", password="",
                                   database="", new_measurement="",
                                   new_cvs_file_name="")
        # Kept so the data can be reloaded when the parameters change.
        self._data_file_path = path.join(RESOURCES_DIR, data_file)
        # Data returned as two columns. One with timeseries and other with bytecount values
        self.returned_data_frame = self._load_training_frame()

    def _load_training_frame(self):
        """Read stride * window rows of the data file for the current settings."""
        return self.csvWriter.csv_file_to_dataframe(
            new_filepath=self._data_file_path,
            new_row_start=0,
            new_row_end=self.defaultStride.value * self.windowLength)

    def set_parameters(self):
        """
        Asking user to change parameters specific to the model, if needed

        :return: None
        """
        print("The default stride: {}".format(self.defaultStride.name))
        print("The default number of series: {}".format(self.windowLength))
        print(
            "Would you like to set the parameters for Simple Moving Average first? [y]/[n]"
        )
        selection = input("Prompt: ")
        if selection.lower() == 'y':
            previous_settings = (self.defaultStride, self.windowLength)

            print("Choose the stride (WEEKLY/DAILY): [W]/[D]")
            selection = input("Prompt: ")
            if selection.upper() == 'W':
                self.defaultStride = Stride.WEEKLY
            if selection.upper() == 'D':
                self.defaultStride = Stride.DAILY

            print("Choose the number of series.")
            selection = input("Prompt: ")
            requested_length = int(selection)
            # One if/elif/else chain: previously the DAILY rejection was
            # followed by an independent if/else whose else branch applied
            # the rejected value anyway.
            if self.defaultStride == Stride.DAILY and requested_length < 7:
                print(
                    "You cannot use training set less than 7 days. It will be left as a default"
                )
            elif self.defaultStride == Stride.WEEKLY and requested_length > 52:
                print(
                    "The number of series cannot exceed one year. It will be left as a default"
                )
            else:
                self.windowLength = requested_length

            # The frame loaded in __init__ was sized for the old settings;
            # reload it so the reshape and the averaging kernel in
            # call_model() match the data.
            if (self.defaultStride, self.windowLength) != previous_settings:
                self.returned_data_frame = self._load_training_frame()

    def initialize_dataframe_output(self):
        """Reshape the loaded values into one row per slot of the stride."""
        # Input formatting for future calculation: column 1 holds the values.
        numpy_array = np.array(self.returned_data_frame)[:, 1]
        numpy_array = numpy_array.reshape(
            (numpy_array.size // self.defaultStride.value,
             self.defaultStride.value)).transpose()
        self.formattedInput = numpy_array
        # Getting the last day in the "training" data. Used to generate the
        # output timeseries later (last row, first column).
        self.lastDate = np.array(self.returned_data_frame)[-1:, :-1][0][0]

    def call_model(self):
        """
        Predict one week of values with a simple moving average.

        :return: numpy array built from [timestamps, predictions], transposed
        """
        self.initialize_dataframe_output()
        numpy_array = self.formattedInput

        # makes a numpy array of length(windowLength) and divides each with
        # the scalar value of windowLength
        x = np.ones(self.windowLength) / self.windowLength

        if self.defaultStride == Stride.DAILY:
            loop_count = 7
        elif self.defaultStride == Stride.WEEKLY:
            loop_count = 1

        # Calculating moving average here: every pass appends the new average
        # as a column and drops the oldest one, so the window keeps its width.
        for i in range(loop_count):
            y = signal.convolve(numpy_array, [x], mode="valid")
            numpy_array = np.concatenate((numpy_array, y), axis=1)
            numpy_array = numpy_array[:, 1:]

        if self.defaultStride == Stride.DAILY:
            # The last 7 columns are the 7 predicted days; lay them out
            # day after day.
            predictions = numpy_array[:, -7:]
            predictions = predictions.transpose().reshape(1, predictions.size)[0]
        elif self.defaultStride == Stride.WEEKLY:
            predictions = numpy_array[:, -1]

        # Creates a numpy array(One week long), because the function is
        # inclusive getting rid of the first element
        result_datetimes = pd.date_range(self.lastDate,
                                         periods=Stride.WEEKLY.value + 1,
                                         freq='15min')[1:]
        nparray_data = np.array([result_datetimes, predictions]).transpose()
        self.data_column_name = self.returned_data_frame.columns[1]
        return nparray_data

    def get_data_column_name(self):
        """Return the name of the data column seen by the last call_model()."""
        return self.data_column_name
""" from predictor_resources.config import RESOURCES_DIR from predictor_resources import db_config import sys from os import path, remove from root import ROOT_DIR sys.path.append(path.join(ROOT_DIR, 'CPacket-Common-Modules')) from io_framework.csv_writer import CsvWriter from io_framework.db_connector.db_connector import InfluxDBConnector from io_framework.csv_fill_data_gaps import fill_data_gaps database = 'AccessPoints' # choose this if you want to use different DB data_processor = CsvWriter(host=db_config.host, port=db_config.port, username=db_config.username, password=db_config.password, database=database) connector = InfluxDBConnector(host=db_config.host, port=db_config.port, database=database) def data_with_filled_gaps_to_db(file_path=None, new_measurement=None): df = data_processor.csv_file_to_dataframe( new_filepath=file_path) # Change usecols here if you need dr = fill_data_gaps(init_data=df) dr.set_index('', inplace=True) dr.to_csv(path_or_buf=path.join(RESOURCES_DIR, "temp.csv")) data_processor.csv_file_to_db(measurement_to_use=new_measurement, new_csv_file_name=path.join(