def testNetwork(config):
    '''
    Tests all data using a session saved at config.path_savedSession
    '''
    pl_input, pl_output, nn, saver, graph, _ = setupNet(config)
    with tf.Session(graph=graph) as sess:
        saver.restore(sess, config.path_savedSession)
        debugInfo(__name__, "Restored session")
        prediction = test_nonRandomizedPrediction(nn, sess, pl_input, pl_output, config)

        # table layout : first sz_o columns are the true values, next sz_o the predictions
        sz_o = config.data.getNumberOutputs()
        output = pd.DataFrame(np.empty((config.data.getNumberTestPoints(), 2 * sz_o)))
        y = dsh.denormalizeData(config.data.test.outputData, config.data.max_value)
        y_ = dsh.denormalizeData(prediction, config.data.max_value)
        output.iloc[:, 0:sz_o] = y
        output.iloc[:, sz_o:2 * sz_o] = y_
        output.index = config.data.test.rowNames

        debugInfo(__name__, "Printing prediction output to %s" % config.path_outputFile)
        output.to_csv(config.path_outputFile,
                      header=["i_%d" % to for to in config.timeOffsets] +
                             ["o_%d" % to for to in config.timeOffsets])

        # mean absolute error per output column
        mae = np.mean(np.abs(y - y_), 0)
        print(mae)
        return mae
def trainNetwork(config):
    pl_input, pl_output, nn, saver, graph, summary_op = setupNet(config)
    with tf.Session(graph=graph) as sess:
        summary_writer = tf.train.SummaryWriter(config.path_TFoutput, sess.graph)
        sess.run(tf.initialize_all_variables())
        for step in range(config.max_steps):
            # NOTE: batches are drawn from the test split here; if a strict
            # train/test separation is intended, this should use config.data.train
            myFeedDict = config.data.test.fill_feed_dict(pl_input, pl_output,
                                                         Configuration.batch_size)
            loss_value, predicted = sess.run([nn.optimize, nn.prediction],
                                             feed_dict=myFeedDict)
            if step % Configuration.test_step == 0:
                if args.trackPredictions is not None:
                    test_allDataAppendToDf(nn, sess, pl_input, pl_output, config_track,
                                           int(step / config.test_step) + 1)
                mean = sess.run(nn.evaluation, feed_dict=myFeedDict)
                debugInfo(__name__, "Training step : %d of %d" % (step, config.max_steps))
                debugInfo(__name__, "Mean test error is %f"
                          % dsh.denormalizeData(mean, config.data.max_value))
        path_savedSession = saver.save(sess, config.path_savedSession)
def testNetwork(config):
    '''
    Tests all data using a session saved at config.path_savedSession
    '''
    pl_input, pl_output, nn, saver, graph, _ = setupNet(config)
    with tf.Session(graph=graph) as sess:
        saver.restore(sess, config.path_savedSession)
        debugInfo(__name__, "Restored session")
        prediction = test_nonRandomizedPrediction(nn, sess, pl_input, pl_output, config)

        # table layout : first block of columns holds true values, second block predictions
        output = pd.DataFrame(
            np.empty((config.data.getNumberTrainingPoints(), 2 * config.number_target_neurons)))
        y = dsh.denormalizeData(
            config.data.train.outputData.reshape([-1, config.number_target_neurons]),
            config.data.max_value)
        y_ = dsh.denormalizeData(
            prediction.reshape([-1, config.number_target_neurons]),
            config.data.max_value)
        output.iloc[:, 0:config.number_target_neurons] = y
        output.iloc[:, config.number_target_neurons:2 * config.number_target_neurons] = y_
        output.to_csv(config.path_outputFile)
        debugInfo(__name__, "Printing prediction output to %s" % config.path_outputFile)

        # mean absolute error per output column
        mae = np.mean(np.abs(y - y_), 0)
        print(mae)
        return mae
def read_csv_and_pivot_with_rollingAvg(inputFile, specifiedSensors=None,
                                       sql_headers=['S_IDX', 'ZEIT', 'wert'],
                                       window=15):
    data_wide_all = pivot_simple(inputFile, specifiedSensors, sql_headers)
    # smooth each sensor column with a trailing rolling mean;
    # min_periods=1 keeps the first rows instead of producing NaN
    data_wide_all = data_wide_all.rolling(window, min_periods=1).mean()
    debugInfo(__name__, "Calculated the rolling average using window %d : (%d, %d)"
              % (window, data_wide_all.shape[0], data_wide_all.shape[1]))
    return data_wide_all
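# Usage sketch (toy data, not from the repo): shows how the trailing rolling
# mean with min_periods=1 behaves at the start of the series.
import pandas as pd

toy = pd.DataFrame({'sensor_1': [10.0, 20.0, 30.0, 40.0]})
smoothed = toy.rolling(3, min_periods=1).mean()
print(smoothed)
#    sensor_1
# 0      10.0   -> mean of [10]
# 1      15.0   -> mean of [10, 20]
# 2      20.0   -> mean of [10, 20, 30]
# 3      30.0   -> mean of [20, 30, 40]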
def evaluate(self):
    debugInfo(__name__, "Adding Evaluation nodes to the graph")
    predictions = self.prediction
    rounded = tf.round(predictions)
    # a prediction counts as correct when it rounds to the exact target value
    correct_prediction = tf.equal(rounded, self.targets)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    return accuracy, correct_prediction, predictions, rounded
def removeInefficientSensors(data_wide_all, sensorEfficiency):
    # count the number of times each column has an 'na' value
    counts = np.zeros((data_wide_all.shape[1], 1))
    for i in range(0, data_wide_all.shape[1]):
        counts[i] = len(np.where(np.isnan(data_wide_all.iloc[:, i]))[0])

    # calculate the efficiency of each sensor : row 0 holds the sensor id,
    # row 1 the NaN count, row 2 the fraction of non-NaN values
    sensorsToEfficiency = pd.DataFrame(np.zeros((3, counts.shape[0])))
    sensorsToEfficiency.iloc[0, :] = data_wide_all.columns.values.reshape(1, -1)
    sensorsToEfficiency.iloc[1, :] = counts.reshape(1, -1)
    sensorsToEfficiency.iloc[2, :] = 1 - counts.reshape(1, -1) / data_wide_all.shape[0]

    # keep only columns whose fraction of valid values exceeds sensorEfficiency
    efficientSensorIndices = np.where(
        sensorsToEfficiency.iloc[2, :].values > sensorEfficiency)
    data_wide = data_wide_all.iloc[:, efficientSensorIndices[0]]
    debugInfo(__name__, "Data where sensors have efficiency > %.2f : (%d, %d)"
              % (sensorEfficiency, data_wide.shape[0], data_wide.shape[1]))
    debugInfo(__name__, "There are %d sensors in total, but only %d have efficiency > %.2f"
              % (data_wide_all.shape[1], data_wide.shape[1], sensorEfficiency))
    return data_wide
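# Usage sketch (toy data, hypothetical sensor ids): sensor 102 is only 50%
# efficient, so with sensorEfficiency=0.75 only sensor 101 would survive the
# filter above.
import numpy as np
import pandas as pd

toy = pd.DataFrame({101: [1.0, 2.0, 3.0, 4.0],
                    102: [1.0, np.nan, np.nan, 4.0]})
efficiency = 1 - toy.isna().sum().values / toy.shape[0]
print(efficiency)                               # [1.  0.5]
# kept = removeInefficientSensors(toy, 0.75)    # -> only column 101 remains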
def error(self):
    debugInfo(__name__, "Adding MNIST Error nodes to the graph")
    # signed difference between target and prediction (not a squared error)
    final_error = tf.sub(self.target, self.prediction, name="myError")
    tf.histogram_summary("final_error", final_error)
    mean = tf.reduce_mean(final_error, 0)
    tf.histogram_summary("mean_error", mean)
    return final_error
def evaluation(self):
    debugInfo(__name__, "Adding Evaluation nodes to the graph")
    # mean absolute error between target and prediction
    final_error = tf.abs(tf.sub(self.target, self.prediction, name="myEvaluationError"))
    mean = tf.reduce_mean(final_error)
    return mean
def toString(self):
    debugInfo(
        __name__,
        "FullDataSet Object : [ Train : input (%d, %d) output (%d, %d) ]\t"
        "[ Test : input (%d, %d) output (%d, %d) ]" %
        (self.train.inputData.shape[0], self.train.inputData.shape[1],
         self.train.outputData.shape[0], self.train.outputData.shape[1],
         self.test.inputData.shape[0], self.test.inputData.shape[1],
         self.test.outputData.shape[0], self.test.outputData.shape[1]))
def prepareData(data_wide, indexOutputSensor, inputFunction, config=None, adjacency=None):
    '''
    Creates the input/output arrays used to train the recurrent network

    Args:
        data_wide         : numpy array of all data (eg pivoted and smoothed sqlToNumpy output)
        indexOutputSensor : the sensor to be predicted
        inputFunction     : pd_ function (1 of 8) that formats input data in the desired manner
        config            : holds rnn_input_time_sequence / rnn_target_time_sequence
        adjacency         : optional : a single numpy vector

        target : input :
        t_5      t_0
        t_6      t_1
        .        .
        .        .
        t_15     t_10
    '''
    # input data is moved vertically down relative to the output by
    # max(input sequence) + min(target sequence) rows
    index_output_begin = max(config.rnn_input_time_sequence) + min(config.rnn_target_time_sequence)
    i = inputFunction(data_wide, indexOutputSensor,
                      s=config.rnn_input_time_sequence,
                      a=adjacency,
                      max_output=0)[max(config.rnn_input_time_sequence):-index_output_begin, :]
    # reshape to (batch, time steps, features) as expected by the LSTM
    i = i.reshape([i.shape[0], len(config.rnn_input_time_sequence), -1])
    debugInfo(__name__, "Preparing data : %d inputs %d" % (i.shape[1], i.shape[0]))

    # create 'output' data, shifted forward in time relative to the input
    o = data_wide[index_output_begin:, indexOutputSensor]
    o = o.reshape([o.shape[0], len(config.rnn_target_time_sequence), 1])
    debugInfo(__name__, "Preparing data : %d outputs %d" % (o.shape[1], o.shape[0]))

    return i, o, i.shape[1]
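# Alignment sketch (toy numpy data, hypothetical offsets): with an input
# sequence [0..2] and a target offset of 1, row t of the input block pairs
# with the value at t + max(input) + min(target) in the output column. This
# only illustrates the idea; the inputFunction above handles the real layout.
import numpy as np

series = np.arange(10, dtype=float)          # one sensor, 10 time points
input_seq, target_seq = [0, 1, 2], [1]
start = max(input_seq) + min(target_seq)     # = 3
inputs = np.stack([series[s:len(series) - start + s] for s in input_seq], axis=1)
targets = series[start:]
print(inputs[0], targets[0])                 # [0. 1. 2.] 3.0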
def error(self):
    debugInfo(__name__, "Adding Error nodes to the graph")
    # cross-entropy between the target distribution and the prediction
    cross_entropy = tf.reduce_mean(
        -tf.reduce_sum(self.targets * tf.log(self.prediction), reduction_indices=[1]))
    tf.histogram_summary("error", cross_entropy)
    return cross_entropy
def evaluation(self):
    debugInfo(__name__, "Adding MNIST Evaluation nodes to the graph")
    # classification accuracy : compare the argmax of prediction and target
    prediction = self.prediction
    predictions = tf.argmax(prediction, 1)
    targets = tf.argmax(self.target, 1)
    counts = tf.to_float(tf.equal(predictions, targets, "Check_Equal"))
    mean = tf.reduce_mean(counts)
    return mean, counts, predictions, targets
def prediction(self):
    debugInfo(__name__, "Adding Prediction nodes to the graph")
    with tf.name_scope('layer1'):
        weights = tf.Variable(tf.truncated_normal((self.n_input, self.n_hidden), stddev=0.1),
                              name="lay1_weights")
        bias = tf.Variable(tf.constant(0.1, shape=[self.n_hidden]), name="lay1_bias")
        out_layer1 = tf.nn.sigmoid(tf.matmul(self.data, weights) + bias, name="lay1_output")
    with tf.name_scope('layer2'):
        weights = tf.Variable(tf.truncated_normal((self.n_hidden, self.n_output), stddev=0.1),
                              name="lay2_weights")
        bias = tf.Variable(tf.constant(0.1, shape=[self.n_output]), name="lay2_bias")
        out_layer2 = tf.nn.sigmoid(tf.matmul(out_layer1, weights) + bias, name="lay2_output")
    return out_layer2
def splitDataToTrainAndTest(data_df, train_frac):
    '''
    @param data_df      Pandas dataframe object of all data, each row is a data point
    @param train_frac   Float determining the fraction reserved for training
    @return train, test Two pandas dataframes
    '''
    debugInfo(__name__, "Splitting data to train and test fraction %.2f" % train_frac)
    train = data_df.sample(frac=train_frac, random_state=1)
    test = data_df.loc[~data_df.index.isin(train.index)]
    return train, test
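# Usage sketch (toy data): sample() with a fixed random_state makes the split
# reproducible, and the test set is exactly the rows not drawn for training.
import pandas as pd

toy = pd.DataFrame({'v': range(10)})
train = toy.sample(frac=0.8, random_state=1)
test = toy.loc[~toy.index.isin(train.index)]
print(len(train), len(test))   # 8 2, with disjoint indices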
def read_csv_and_pivot(inputFile, specifiedSensors=None, sql_headers=['S_IDX', 'ZEIT', 'wert']):
    '''
    First step of all further analyses : the long/narrow dataset from SQL is made
    wide (one column per sensor, time stamps as rows)
    Format of input file must be [S_IDX, ZEIT, wert]

    Args :
        inputFile        : Path to csv file generated by sql
        specifiedSensors : numpy array of sensor ids that should be used. If present,
                           inefficient sensors are not removed
    Return :
        data_wide_all    : Pandas dataframe containing the desired data
    '''
    debugInfo(__name__, "Beginning to read file")
    all_data = pd.read_csv(inputFile, sep=",")
    debugInfo(__name__, "Read input SQL file with shape : (%d, %d)"
              % (all_data.shape[0], all_data.shape[1]))
    if specifiedSensors is not None:
        debugInfo(__name__, "%d Sensors specified, getting indices from"
                  % specifiedSensors.shape[0])
        sensorIndices = np.where(all_data.iloc[:, 0].values == specifiedSensors.values)[1]
        all_data = all_data.iloc[sensorIndices, :]
    # make into a wide table
    data_wide_all = all_data.pivot(index=sql_headers[1], columns=sql_headers[0],
                                   values=sql_headers[2])
    debugInfo(__name__, "Pivoted input shape : (%d, %d)"
              % (data_wide_all.shape[0], data_wide_all.shape[1]))
    return data_wide_all
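# Pivot sketch (toy long-format rows shaped like the SQL export): each sensor
# id becomes a column and each time stamp a row, as done above.
import pandas as pd

long_df = pd.DataFrame({'S_IDX': [1, 2, 1, 2],
                        'ZEIT': ['00:00', '00:00', '00:01', '00:01'],
                        'wert': [5.0, 7.0, 6.0, 8.0]})
wide = long_df.pivot(index='ZEIT', columns='S_IDX', values='wert')
print(wide)
# S_IDX    1    2
# ZEIT
# 00:00  5.0  7.0
# 00:01  6.0  8.0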
def formatFromSQL(path_sqlFile=None, path_preparedData=None, specifiedSensorsArray=None):
    # remake data from SQL output and min/max normalize it
    if path_sqlFile is not None:
        debugInfo(__name__, "Processing data from an SQL file %s" % path_sqlFile)
        data_df, _, specifiedSensors = stn.pivotAndSmooth(path_sqlFile, specifiedSensorsArray)
        data_df, max_value = dsh.normalizeData(data_df)
    # if no SQL data then open the prepared file and min/max normalize it
    else:
        debugInfo(__name__, "Opening preprocessed data file %s" % path_preparedData)
        data_df, max_value = dsh.normalizeData(pd.read_csv(path_preparedData))
        # no pivoting happened, so pass the caller's sensor list straight through
        # (previously specifiedSensors was unbound on this branch)
        specifiedSensors = specifiedSensorsArray
    return data_df, max_value, specifiedSensors
def calculate_average_week(self):
    df = self.df.values
    length_week = 7 * 1440      # one row per minute for seven days
    num_sensors = df.shape[1]
    num_weeks = df.shape[0] / length_week
    df_avg = np.zeros([length_week, num_sensors])
    debugInfo(__name__, "Data successfully prepared, finding average of %d weeks" % num_weeks)
    for time_in_week in range(0, length_week):
        # get indices of all rows corresponding to a certain time of the day/week
        idxs_for_time_n = [(length_week * week_idx) + time_in_week
                           for week_idx in range(0, int(num_weeks))]
        # (eg monday 00:02) is equal to the average of every monday at 00:02
        df_avg[time_in_week] = np.nanmean(df[idxs_for_time_n], 0)
    self.df = pd.DataFrame(df_avg)
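# Averaging sketch (toy data, 2 "weeks" of length 3): rows at the same weekly
# offset are averaged, and nanmean ignores missing values, as in the loop above.
import numpy as np

length_week = 3
data = np.array([[1.0], [2.0], [np.nan],
                 [3.0], [4.0], [6.0]])
avg = np.array([np.nanmean(data[[w * length_week + t for w in range(2)]], 0)
                for t in range(length_week)])
print(avg.ravel())  # [2. 3. 6.]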
def prediction(self):
    debugInfo(__name__, "Adding LSTM Prediction nodes to the graph")
    cell = tf.nn.rnn_cell.LSTMCell(num_units=self.n_hidden, state_is_tuple=True)
    outputs, last_states = tf.nn.dynamic_rnn(cell=cell, inputs=self.data, dtype=tf.float32)
    # outputs is a tensor with shape (batch_size, rnn_sequence_length, n_hidden);
    # keep only the hidden state of the last time step
    last_output = outputs[:, self.rnn_number_steps - 1, :]
    # only the rnn layers are connected; to create an output layer of the proper
    # size, a separate activation/output layer is needed
    with tf.name_scope('outputLayer'):
        weights = tf.Variable(tf.truncated_normal((self.n_hidden, self.n_output), stddev=0.1),
                              name="lay2_weights")
        bias = tf.Variable(tf.constant(0.1, shape=[self.n_output]), name="lay2_bias")
        out_layer2 = tf.nn.sigmoid(tf.matmul(last_output, weights) + bias, name="lay2_output")
    return out_layer2
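# Shape sketch (plain numpy stand-in, not TF): dynamic_rnn's `outputs` has
# shape (batch, rnn_number_steps, n_hidden); the prediction above keeps only
# the hidden state of the final time step before the dense output layer.
import numpy as np

outputs = np.zeros((32, 10, 64))       # (batch, steps, n_hidden)
last_output = outputs[:, 10 - 1, :]    # same slice as outputs[:, rnn_number_steps-1, :]
print(last_output.shape)               # (32, 64)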
def setupNet(config):
    graph = tf.Graph()
    with graph.as_default(), tf.device('/cpu:0'):
        pl_input = tf.placeholder(tf.float32,
                                  shape=[None, config.data.getNumberInputs()],
                                  name="input_placeholder")
        pl_output = tf.placeholder(tf.float32,
                                   shape=[None, config.data.getNumberOutputs()],
                                   name="target_placeholder")
        # create neural network and define in graph
        debugInfo(__name__, "Creating neural network")
        nn = model.SimpleNeuralNetwork(pl_input, pl_output, config.n_hidden,
                                       config.learningRate)
        saver = tf.train.Saver()
        summary_op = tf.merge_all_summaries()
    return pl_input, pl_output, nn, saver, graph, summary_op
def __init__(self, data, target, number_hidden_nodes, learning_rate, rnn_number_steps=None):
    '''
    Args :
        data                : tensorflow placeholder holding the input data
        target              : tensorflow placeholder holding the (true) output data (target value)
        number_hidden_nodes : number of nodes in the hidden layer
        learning_rate       : step size used by the optimizer
        rnn_number_steps    : optional : input sequence length for recurrent models
    '''
    # data and target are placeholders
    self.data = data
    self.target = target
    # define hyperparameters of the network
    self.n_input = int(self.data.get_shape()[1])
    self.n_hidden = number_hidden_nodes
    self.n_output = int(self.target.get_shape()[1])
    self.learningRate = learning_rate
    self.rnn_number_steps = rnn_number_steps
    debugInfo(__name__, "#input : %d #hidden : %d #output : %d learningRate : %.2f"
              % (self.n_input, self.n_hidden, self.n_output, self.learningRate))
    # reference operation attributes of the model
    self.addAttributes()
def fill_time_gaps(self, orig_df, time_format='%Y-%m-%d %H:%M:%S.%f'):
    '''
    Public method for filling time gaps

    Params :
        orig_df     : pandas data frame containing wide data (one row per time
                      stamp, one column per sensor)
        time_format : format that the time stamp is parsed with; some differences
                      exist (no millisecond etc)
    Returns :
        [self.new_df, self.new_time_stamps]
    '''
    self.time_format = time_format
    self.orig_df = orig_df
    self.convert_orig_time_stamps_as_datetime_objects()
    if self.count_gaps(self.orig_time_stamps) == 0:
        debugInfo(__name__, "No time gaps found")
        self.convert_datetime_objects_to_orig_time()
        return self.orig_df
    debugInfo(__name__, "Time gaps found, beginning to fill")
    return self.fill_gaps()
def setupNet(config):
    '''
    Creates the operation graph in tensorflow and returns all components
    necessary for training/testing
    '''
    graph = tf.Graph()
    with graph.as_default(), tf.device('/cpu:0'):
        # shape : batch size, number of input time steps, size of each input vector
        pl_input = tf.placeholder(tf.float32,
                                  shape=[None,
                                         len(config.rnn_input_time_sequence),
                                         config.number_input_neurons],
                                  name="input_placeholder")
        # shape : batch size, number of target time steps (equal to the number of
        # input sequences), size of each output vector
        pl_output = tf.placeholder(tf.float32,
                                   shape=[None,
                                          len(config.rnn_target_time_sequence),
                                          config.number_target_neurons],
                                   name="target_placeholder")
        # create neural network and define in graph
        debugInfo(__name__, "Creating neural network")
        nn = lstm(data=pl_input,
                  target=pl_output,
                  number_hidden_nodes=config.n_hidden,
                  learning_rate=config.learningRate,
                  rnn_number_steps=len(config.rnn_input_time_sequence))
        saver = tf.train.Saver()
        summary_op = tf.merge_all_summaries()
    return pl_input, pl_output, nn, saver, graph, summary_op
def prepareData(data_wide, indexOutputSensor, timeOffsets, inputFunction,
                adjacency=None, sequential=[0]):
    '''
    Creates a dataframe containing desired input/output within the same table

    Args:
        data_wide         : numpy array of all data (eg pivoted and smoothed sqlToNumpy output)
        indexOutputSensor : the sensor to be predicted
        timeOffsets       : python list of desired output times
        inputFunction     : pd_ function (1 of 8) that formats input data in the desired manner
        adjacency         : optional : a single numpy vector
        sequential        : optional : python list (like timeOffsets) specifying which
                            time points to use as input
    '''
    # input data is moved vertically down by the max of timeOffsets
    max_output = max(timeOffsets)
    max_sequential = max(sequential)
    i = inputFunction(data_wide, indexOutputSensor, s=sequential, a=adjacency,
                      max_output=max_output, max_sequential=max_sequential)
    debugInfo(__name__, "Preparing data : %d inputs %d" % (i.shape[1], i.shape[0]))

    # create 'output' data, one column per time offset
    o = timeOffsetData(data_wide[:, indexOutputSensor], timeOffsets, b=max(sequential))
    debugInfo(__name__, "Preparing data : %d outputs %d" % (o.shape[1], o.shape[0]))

    # combine input/output in one dataframe
    df = pd.DataFrame(np.hstack((i, o)))
    return df, i.shape[1]
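# Offset sketch (toy series, not the repo's timeOffsetData helper): an output
# column shifted by offset t pairs each input row with the value t steps in
# the future; rows that run past the end of the series become NaN and are
# dropped later by makeDataSetObject's dropna().
import numpy as np

series = np.arange(6, dtype=float)
offset = 2
shifted = np.full_like(series, np.nan)
shifted[:-offset] = series[offset:]
print(np.column_stack((series, shifted)))
# row 0 -> input 0.0, target 2.0 ; the last two rows have NaN targets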
def make_average_week(self, filename, sql_headers=['S_IDX', 'ZEIT', 'wert'],
                      time_format='%Y-%m-%d %H:%M:%S'):
    '''
    filename : File path to csv file (sql output with 3 columns)
    '''
    self.time_format = time_format
    self.sql_headers = sql_headers
    debugInfo(__name__, "making average week : preparing data")
    # NOTE: the original line here was truncated ("df_pd = df = df_pd.values");
    # presumably the file is first pivoted to wide format, eg via read_csv_and_pivot
    df_pd = read_csv_and_pivot(filename, sql_headers=sql_headers)
    df = df_pd.values
    length_week = 7 * 1440
    num_sensors = df.shape[1]
    num_weeks = df.shape[0] / length_week
    df_avg = np.zeros([length_week, num_sensors])
    debugInfo(__name__, "Data successfully prepared, finding average of %d weeks" % num_weeks)
    for time_in_week in range(0, length_week):
        # get indices of all rows corresponding to a certain time of the day/week
        idxs = [(length_week * week_idx) + time_in_week
                for week_idx in range(0, int(num_weeks))]
        # (eg monday 00:02) is equal to the average of every monday at 00:02
        df_avg[time_in_week] = np.nanmean(df[idxs], 0)
    df_avg_pd = pd.DataFrame(df_avg)
    # '%w' is the day of week as an integer with sunday being 0;
    # new_row_names presumably comes from the time-gap-filling step
    avg_row_names = [datetime.datetime.strftime(i, '%w_%H:%M:%S')
                     for i in new_row_names[0:df_avg.shape[0]]]
    df_avg_pd.index = avg_row_names
    df_avg_pd.columns = df_pd.columns.values
    return df_avg_pd
def check_if_has_gaps(self):
    count = self.count_gaps(self.new_time_stamps)
    if count == 0:
        debugInfo(__name__, "Time gaps successfully filled")
    else:
        raise Exception("%d Gaps found!!!" % count)
def normalizeData(data_df):
    max_value = np.nanmax(data_df.values)
    debugInfo(__name__, "Max value in maxMinNormalization is %.2f" % max_value)
    # scale into (0, 1] : the small offsets keep values strictly away from
    # exact 0 and 1 (the sigmoid output layer never reaches either)
    return ((data_df / max_value) * .99999999) + 0.00000001, max_value
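# Roundtrip sketch : dsh.denormalizeData is used throughout but not shown in
# this section; a minimal inverse consistent with the scaling above might look
# like this (hypothetical helper, the repo's own implementation may differ).
import numpy as np
import pandas as pd

def denormalizeData_sketch(data, max_value):
    # invert the (0, 1] scaling applied by normalizeData
    return ((data - 0.00000001) / .99999999) * max_value

toy = pd.DataFrame({'v': [0.0, 50.0, 100.0]})
max_value = np.nanmax(toy.values)
normalized = ((toy / max_value) * .99999999) + 0.00000001
print(denormalizeData_sketch(normalized, max_value))  # recovers 0, 50, 100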
def main(args):
    config = Configuration(args)
    methods = [
        {'func': pd_1s_singleInput, 'name': 'ffnn_simple', 'adj': False},
    ]
    #{'func':pd_2s_allInput,'name':'ffnn_all','adj':False},
    #{'func':pd_3s_adjacency_withSelf,'name':'ffnn_nn+','adj':True},
    #{'func':pd_4s_adj_noSelf,'name':'ffnn_nn','adj':True}

    # create training data (all of july)
    data_df, max_value, specifiedSensors = formatFromSQL(
        path_sqlFile=config.path_sqlFile, sql_headers=config.sql_headers)

    # 182-185,281 are missing from adjacency matrix!! remove them! tell max, this needs to be changed!
    remove = [182, 183, 184, 185, 281]
    removeInx = []
    for i in remove:
        index = np.where(data_df.columns.values == i)[0]
        if len(index) > 0:
            removeInx.append(index[0])
    data_df.drop(data_df.columns[removeInx], axis=1, inplace=True)
    specifiedSensors = pd.DataFrame(data_df.columns.values)

    # create test data (one or more)
    config.test_dicts = []
    for path_test in config.path_sqlTestFile:
        test_df, test_max_value, _ = formatFromSQL(
            path_sqlFile=path_test,
            specifiedSensorsArray=specifiedSensors,
            sql_headers=config.sql_headers)
        config.test_dicts.append({
            'df': test_df,
            'max': test_max_value,
            'name': os.path.basename(os.path.normpath(path_test)).replace('.csv', '')
        })

    # create a list that contains the (function) index of the minimum MAE (averaged)
    path_idxsMinMae = os.path.join(config.path_outputDir, "indicesMinMaes.csv")
    idxMinMae_list = []

    ## FOR EACH SENSOR ##
    #for indexOutputSensor in range(0, data_df.shape[1]):
    for indexOutputSensor in range(0, 1):
        # create folder for current sensor
        debugInfo(__name__, "SENSOR %d" % data_df.columns.values[indexOutputSensor])
        dir_sensor = os.path.join(config.path_outputDir,
                                  "s_%d" % data_df.columns.values[indexOutputSensor])
        dir_sensor_tf = os.path.join(dir_sensor, 'tf')
        if not os.path.exists(dir_sensor):
            os.makedirs(dir_sensor)
        if not os.path.exists(dir_sensor_tf):
            os.makedirs(dir_sensor_tf)

        # set up an empty data frame and array for the average MAE of all testing
        # data; the first column is the names of all the functions
        testSetInfo = []
        # create necessary paths and empty arrays for each training set
        for i in range(0, len(config.test_dicts)):
            avgMaes_df = pd.DataFrame(
                np.zeros((len(methods), len(config.rnn_target_time_sequence))))
            avgMaes_array = np.zeros((len(methods), len(config.rnn_target_time_sequence)))
            path_avgMaes_df = os.path.join(
                dir_sensor,
                "avgMaesForSensor_%d.csv" % data_df.columns.values[indexOutputSensor])

            # create folders for the current (test) data
            dir_test = os.path.join(dir_sensor, config.test_dicts[i]['name'])
            dir_test_tf = os.path.join(dir_test, 'tf')
            dir_test_results = os.path.join(dir_test, 'output')
            if not os.path.exists(dir_test):
                os.makedirs(dir_test)
            if not os.path.exists(dir_test_tf):
                os.makedirs(dir_test_tf)
            if not os.path.exists(dir_test_results):
                os.makedirs(dir_test_results)
            testSetInfo.append({
                'avgMaes_df': avgMaes_df,
                'avgMaes_array': avgMaes_array,
                'path_avgMaes_df': path_avgMaes_df,
                'dir_test': dir_test,
                'test_dr_tf': dir_test_tf,
                'dir_test_results': dir_test_results
            })

        ## FOR EACH DATA PREPARATION METHOD ##
        # this affects how the network is formed! changes every time
        for j in range(0, len(methods)):
            debugInfo(__name__, "Using %s to prepare data" % methods[j]['name'])
            config.path_savedSession = os.path.join(
                dir_sensor_tf, "tfsession_%s.ckpt" % methods[j]['name'])

            debugInfo(__name__, "Creating Data for Training")
            config.data = makeDataSetObject(
                data_df=data_df,
                max_value=max_value,
                outputSensorIndex=indexOutputSensor,
                prepareData_function=methods[j]['func'],
                path_adjacencyMatrix=None if methods[j]['adj'] == False
                                     else config.path_adjacencyMatrix,
                config=config)
            # the number of data points used as input (per time point),
            # eg all sensors, 1 sensor, only nearest neighbors etc
            config.number_input_neurons = config.data.train.inputData.shape[2]
            config.number_target_neurons = 1

            # train using the training set
            trainNetwork(config)

            ## FOR EACH TEST DATA SET ##
            for i in range(0, len(config.test_dicts)):
                testData = config.test_dicts[i]
                debugInfo(__name__, "Creating Data for Testing")
                config.path_outputFile = os.path.join(
                    testSetInfo[i]['dir_test_results'], "%s.csv" % methods[j]['name'])
                config.data = makeDataSetObject(
                    data_df=testData['df'],
                    max_value=testData['max'],
                    outputSensorIndex=indexOutputSensor,
                    prepareData_function=methods[j]['func'],
                    path_adjacencyMatrix=None if methods[j]['adj'] == False
                                         else config.path_adjacencyMatrix,
                    config=config)
                maes = testNetwork(config)
                testSetInfo[i]['avgMaes_df'].iloc[j, :] = maes

        # contains average maes for all test data sets (average of averages)
        avgMaeOverTests_df = pd.DataFrame(
            np.zeros((len(methods), len(config.rnn_target_time_sequence))))
        avgMaeOverTests_array = np.zeros(
            (len(methods), len(config.rnn_target_time_sequence)))
        path_avgMaeOverTests = os.path.join(
            dir_sensor,
            "avgMaesForSensor_%d.csv" % data_df.columns.values[indexOutputSensor])

        # iterate over all 'avgMae' tables (one per test data set)
        for i in range(0, len(config.test_dicts)):
            testSetInfo[i]['avgMaes_df'].index = np.array(
                [funcDic['name'] for funcDic in methods])
            testSetInfo[i]['avgMaes_df'].to_csv(
                testSetInfo[i]['path_avgMaes_df'],
                header=["t_%d" % to for to in config.rnn_target_time_sequence])
            avgMaeOverTests_array = testSetInfo[i]['avgMaes_df'].values + avgMaeOverTests_array

        # get the average of the maes over all test data
        avgMaeOverTests_array = avgMaeOverTests_array / len(config.test_dicts)
        avgMaeOverTests_df.iloc[:, :] = avgMaeOverTests_array
        avgMaeOverTests_df.index = [funcDic['name'] for funcDic in methods]
        avgMaeOverTests_df.to_csv(
            path_avgMaeOverTests,
            header=["t_%d" % to for to in config.rnn_target_time_sequence])

        # get the index of the function with the lowest MAE and save
        idxMinMae_list.append(avgMaeOverTests_array.argmin(axis=0))

    idxMinMae_df = pd.DataFrame(
        np.hstack((specifiedSensors.values[0:2], np.array(idxMinMae_list))))
    idxMinMae_df.to_csv(
        path_idxsMinMae,
        header=['sensor'] + ["t_%d" % to for to in config.rnn_target_time_sequence])
def makeDataSetObject(data_df, max_value, prepareData_function,
                      path_adjacencyMatrix, outputSensorIndex, config=None):
    '''
    Args :
        data_df              : pandas dataframe of normalized wide data
        max_value            : value used for min/max normalization
        prepareData_function : pd_ function that formats the input data
        path_adjacencyMatrix : path to adjacency matrix csv, or None
        outputSensorIndex    : index of the single output sensor
    Return :
        theData : FullDataSet object from dataset_helpers containing two DataSet
                  objects (input/target numpy arrays), contains next_batch() function!
    '''
    # define index of single output sensor (the output is at some time in the future)
    adjacencyForOutputSensor = None
    # add in the adjacency matrix
    if path_adjacencyMatrix is not None:
        debugInfo(__name__, "Found an adjacency matrix : multiplying it in!")
        # 182-185,281 are missing from adjacency matrix!! remove them! tell max, this needs to be changed!
        # list of sensor columns that we are using
        desired = data_df.columns.values
        # read adjacency matrix
        adjMatrix_orig = pd.read_csv(path_adjacencyMatrix)
        # adjacency matrix csv has headers as type string, with columns 0,1 actual
        # strings : rename all columns as ints!
        sensorsList = list(
            adjMatrix_orig.columns.values[2:adjMatrix_orig.shape[1]].astype(np.int64))
        columns = [0, 1] + sensorsList
        adjMatrix_orig.columns = columns
        # remove all columns (sensors) that we don't want, leaving only the desired
        # sensors; this uses header names to reference the columns to be removed
        removed = adjMatrix_orig[desired]
        # get the row index of the single sensor being used for output : this row is
        # the adjacency!
        indexForSensorInMatrix = np.where(
            adjMatrix_orig.iloc[:, 1] == data_df.columns.values[outputSensorIndex])[0]
        adjacencyForOutputSensor = removed.iloc[indexForSensorInMatrix, :].values
        print(data_df.columns.values[np.where(adjacencyForOutputSensor[0] == 1)[0]])

    # create input and output vectors
    input_, output_, indexOutputBegin = prepareData(
        data_df.values,
        indexOutputSensor=outputSensorIndex,
        inputFunction=prepareData_function,
        config=config,
        adjacency=adjacencyForOutputSensor)

    debugInfo(__name__, "Making FullDataSet object containing train/test data")
    # create FullDataSet object with appropriate data
    theData = dsh.FullDataSet(trainInput=input_, trainOutput=output_)
    theData.max_value = max_value
    theData.train.rowNames = data_df.index[:-(max(config.rnn_target_time_sequence) - 1)]
    return theData
def main(args):
    config = Configuration(args)
    methods = [
        {'func': pd_1_singleInput, 'name': 'f1_singleInput', 'adj': False},
    ]
    #{'func':pd_2_allInput,'name':'f2_allInput','adj':False},
    #{'func':pd_3_adjacency_withSelf,'name':'pd_3_adjacency_withSelf','adj':True},
    #{'func':pd_4_adj_noSelf,'name':'pd_4_adj_noSelf','adj':True},
    #{'func':pd_1s_singleInput,'name':'f1s_singleInput','adj':False},
    #{'func':pd_2s_allInput,'name':'f2s_allInput','adj':False},
    #{'func':pd_3s_adjacency_withSelf,'name':'pd_3s_adjacency_withSelf','adj':True},
    #{'func':pd_4s_adj_noSelf,'name':'pd_4s_adj_noSelf','adj':True}

    # create training data (all of july)
    data_df, max_value, specifiedSensors = formatFromSQL(path_sqlFile=config.path_sqlFile)

    # 182-185,281 are missing from adjacency matrix!! remove them! tell max, this needs to be changed!
    remove = [182, 183, 184, 185, 281]
    removeInx = []
    for i in remove:
        index = np.where(data_df.columns.values == i)[0]
        if len(index) > 0:
            removeInx.append(index[0])
    data_df.drop(data_df.columns[removeInx], axis=1, inplace=True)
    specifiedSensors = pd.DataFrame(data_df.columns.values)

    # create test data (one or more)
    config.test_dicts = []
    for path_test in config.path_sqlTestFile:
        test_df, test_max_value, _ = formatFromSQL(
            path_sqlFile=path_test, specifiedSensorsArray=specifiedSensors)
        config.test_dicts.append({
            'df': test_df,
            'max': test_max_value,
            'name': os.path.basename(os.path.normpath(path_test)).replace('.csv', '')
        })

    # create a list that contains the (function) index of the minimum MAE (averaged)
    path_idxsMinMae = os.path.join(config.path_outputDir, "indicesMinMaes.csv")
    idxMinMae_list = []

    #for indexOutputSensor in range(0, 1):
    for indexOutputSensor in range(0, data_df.shape[1]):
        # create folder for current sensor
        debugInfo(__name__, "SENSOR %d" % data_df.columns.values[indexOutputSensor])
        currentDir = os.path.join(config.path_outputDir,
                                  "s_%d" % data_df.columns.values[indexOutputSensor])
        if not os.path.exists(currentDir):
            os.makedirs(currentDir)

        # set up an empty data frame and array for the average MAE of all testing
        # data; the first column is the names of all the functions
        avgMaes_df = pd.DataFrame(np.zeros((len(methods), len(config.timeOffsets))))
        avgMaes_array = np.zeros((len(methods), len(config.timeOffsets)))
        path_avgMaes_df = os.path.join(
            currentDir,
            "avgMaesForSensor_%d.csv" % data_df.columns.values[indexOutputSensor])

        for i in range(0, len(config.test_dicts)):
            testData = config.test_dicts[i]
            # create folders for the current data frame
            current_df_dir = os.path.join(currentDir, testData['name'])
            currentDir_tf = os.path.join(current_df_dir, 'tf')
            currentDir_rslts = os.path.join(current_df_dir, 'output')
            if not os.path.exists(current_df_dir):
                os.makedirs(current_df_dir)
            if not os.path.exists(currentDir_tf):
                os.makedirs(currentDir_tf)
            if not os.path.exists(currentDir_rslts):
                os.makedirs(currentDir_rslts)

            path_allMaesForSensor = os.path.join(
                current_df_dir,
                "allMaesForSensor_%d.csv" % data_df.columns.values[indexOutputSensor])
            all_maesForSensor = pd.DataFrame(np.zeros(avgMaes_df.shape))

            # iterate over each method and train a new network
            for j in range(0, len(methods)):
                debugInfo(__name__, "Using %s to prepare data" % methods[j]['name'])
                config.path_savedSession = os.path.join(
                    currentDir_tf, "tfsession_%s" % methods[j]['name'])
                config.path_outputFile = os.path.join(
                    currentDir_rslts, "predictions_%s.csv" % methods[j]['name'])

                # first train the network using all data points for july;
                # a non-sequential input uses sequential=[0], the sequential
                # methods use the last five time points
                # NOTE: 'i' indexes the test-set loop; given the method list
                # above, this condition presumably should test 'j' instead
                config.sequential = [0] if (i < 4) else list(range(0, 5))

                debugInfo(__name__, "Creating Data for Training")
                config.data = makeDataSetObject(
                    data_df=data_df,
                    max_value=max_value,
                    timeOffsets=config.timeOffsets,
                    outputSensorIndex=indexOutputSensor,
                    sequential=config.sequential,
                    splitTrain=False,
                    path_adjacencyMatrix=None if methods[j]['adj'] == False
                                         else config.path_adjacencyMatrix,
                    prepareData_function=methods[j]['func'])
                # train on all data
                trainNetwork(config)

                # then test the network (after all training) using all data points from august
                debugInfo(__name__, "Creating Data for Testing")
                config.data = makeDataSetObject(
                    data_df=testData['df'],
                    max_value=testData['max'],
                    timeOffsets=config.timeOffsets,
                    outputSensorIndex=indexOutputSensor,
                    sequential=config.sequential,
                    splitTrain=False,
                    path_adjacencyMatrix=None if methods[j]['adj'] == False
                                         else config.path_adjacencyMatrix,
                    prepareData_function=methods[j]['func'])
                maes = testNetwork(config)
                all_maesForSensor.iloc[j, :] = maes

            all_maesForSensor.index = np.array([funcDic['name'] for funcDic in methods])
            all_maesForSensor.to_csv(path_allMaesForSensor,
                                     header=["t_%d" % to for to in config.timeOffsets])
            avgMaes_array = all_maesForSensor.values + avgMaes_array

        # get the average of the maes over all test data
        avgMaes_array = avgMaes_array / len(config.test_dicts)
        avgMaes_df.iloc[:, :] = avgMaes_array
        avgMaes_df.index = [funcDic['name'] for funcDic in methods]
        avgMaes_df.to_csv(path_avgMaes_df,
                          header=["t_%d" % to for to in config.timeOffsets])

        # get the index of the function with the lowest MAE and save
        idxMinMae_list.append(avgMaes_array.argmin(axis=0))

    idxMinMae_df = pd.DataFrame(
        np.hstack((specifiedSensors.values[0:2], np.array(idxMinMae_list))))
    idxMinMae_df.to_csv(path_idxsMinMae,
                        header=['sensor'] + ["t_%d" % to for to in config.timeOffsets])
def makeDataSetObject(data_df, max_value, prepareData_function, outputSensorIndex,
                      sequential=None, timeOffsets=None, splitTrain=True,
                      trainTestFraction=.8, path_adjacencyMatrix=None,
                      path_preparedData=None):
    '''
    Args :
        data_df              : pandas dataframe of normalized wide data
        max_value            : value used for min/max normalization
        prepareData_function : pd_ function that formats the input data
        outputSensorIndex    : index of the single output sensor
        sequential           : python list of time points used as input
        timeOffsets          : python list of output time offsets (in minutes)
        splitTrain           : if True, split into train/test by trainTestFraction
        trainTestFraction    : fraction of the data reserved for training
        path_adjacencyMatrix : path to adjacency matrix csv, or None
        path_preparedData    : if given, the processed table is saved to this path
    Return :
        theData : FullDataSet object from dataset_helpers containing two DataSet
                  objects (input/target numpy arrays), contains next_batch() function!
    '''
    # define index of single output sensor (the output is at some time in the future)
    adjacencyForOutputSensor = None
    # add in the adjacency matrix
    if path_adjacencyMatrix is not None:
        debugInfo(__name__, "Found an adjacency matrix : multiplying it in!")
        # 182-185,281 are missing from adjacency matrix!! remove them! tell max, this needs to be changed!
        # list of sensor columns that we are using
        desired = data_df.columns.values
        # read adjacency matrix
        adjMatrix_orig = pd.read_csv(path_adjacencyMatrix)
        # adjacency matrix csv has headers as type string, with columns 0,1 actual
        # strings : rename all columns as ints!
        sensorsList = list(
            adjMatrix_orig.columns.values[2:adjMatrix_orig.shape[1]].astype(np.int64))
        columns = [0, 1] + sensorsList
        adjMatrix_orig.columns = columns
        # remove all columns (sensors) that we don't want, leaving only the desired
        # sensors; this uses header names to reference the columns to be removed
        removed = adjMatrix_orig[desired]
        # get the row index of the single sensor being used for output : this row is
        # the adjacency!
        indexForSensorInMatrix = np.where(
            adjMatrix_orig.iloc[:, 1] == data_df.columns.values[outputSensorIndex])[0]
        adjacencyForOutputSensor = removed.iloc[indexForSensorInMatrix, :].values
        print(data_df.columns.values[np.where(adjacencyForOutputSensor[0] == 1)[0]])

    data_prepared, indexOutputBegin = prepareData(
        data_df.values, outputSensorIndex, timeOffsets, prepareData_function,
        adjacency=adjacencyForOutputSensor, sequential=sequential)

    # rows whose shifted outputs ran past the end of the series contain NaN : drop them
    data_final_naDropped = data_prepared.dropna()
    debugInfo(__name__, "From %d total timepoints, %d are being used (%.2f)"
              % (data_prepared.shape[0], data_final_naDropped.shape[0],
                 data_final_naDropped.shape[0] / data_prepared.shape[0]))

    if path_preparedData is not None:
        debugInfo(__name__, "Saving processed file to %s" % path_preparedData)
        data_final_naDropped.to_csv(path_preparedData, index=False)

    if splitTrain == True:
        train_df, test_df = dsh.splitDataToTrainAndTest(data_final_naDropped,
                                                        trainTestFraction)
        debugInfo(__name__, "train_df (%d,%d)\ttest_df (%d,%d)"
                  % (train_df.shape[0], train_df.shape[1],
                     test_df.shape[0], test_df.shape[1]))
        debugInfo(__name__, "Single output sensor at index %d, sensor name : %s"
                  % (outputSensorIndex, data_df.columns.values[outputSensorIndex]))
        train_input = train_df.iloc[:, 0:indexOutputBegin]
        train_output = train_df.iloc[:, indexOutputBegin:data_final_naDropped.shape[1]]
        test_input = test_df.iloc[:, 0:indexOutputBegin]
        test_output = test_df.iloc[:, indexOutputBegin:data_final_naDropped.shape[1]]

        debugInfo(__name__, "Making FullDataSet object containing train/test data")
        # create FullDataSet object with appropriate data
        theData = dsh.FullDataSet(trainInput=train_input.values,
                                  trainOutput=train_output.values,
                                  testInput=test_input.values,
                                  testOutput=test_output.values)
    # don't split the data into train/test (used when everything is for testing)
    else:
        test_input = data_final_naDropped.iloc[:, 0:indexOutputBegin]
        test_output = data_final_naDropped.iloc[:, indexOutputBegin:
                                                data_final_naDropped.shape[1]]
        debugInfo(__name__, "Making FullDataSet object with only test data")
        # create FullDataSet object with appropriate data
        theData = dsh.FullDataSet(trainInput=np.empty(test_input.shape),
                                  trainOutput=np.empty(test_output.shape),
                                  testInput=test_input.values,
                                  testOutput=test_output.values)
    theData.test.rowNames = data_final_naDropped.index
    theData.max_value = max_value
    theData.toString()
    return theData