def findAutoCorrelations(time_series_csv, max_lag, fraction_training):
    """Write the auto-correlations of a 2 column time series to a CSV file.

    The input CSV is read with csv.readCsvFloat2 and the leading
    fraction_training of its rows is used as the training sample. The
    results of getAutoCorrelation for column 0 and column 1 of that sample
    are joined with NP.hstack and written, under the input header, to
    '<base name of time_series_csv>.autocorrelation.csv'.

    time_series_csv: path of the input CSV file
    max_lag: lag argument passed to getAutoCorrelation; the training sample
        must contain more rows than this
    fraction_training: fraction of the sample used for training
    """
    def show(*fields):
        # Writes the fields separated by single spaces, which is what the
        # Python 2 statement `print a, b, c` produces.
        print(' '.join(str(field) for field in fields))

    base_name = os.path.splitext(time_series_csv)[0]
    auto_correlation_matrix_csv = base_name + '.autocorrelation.csv'

    time_series_data, header = csv.readCsvFloat2(time_series_csv, True)
    number_training = int(float(len(time_series_data)) * fraction_training)
    show('number_training', number_training,
         'fraction_training', fraction_training,
         'len(time_series_data)', len(time_series_data))
    assert number_training > max_lag

    # One row per series: row 0 is handled as downloads, row 1 as purchases
    time_series = NP.transpose(NP.array(time_series_data))

    # Diagnostic output only; the kept days are not used further down
    days_downloads = getDaysOfWeekToKeep(time_series[0, :number_training])
    days_purchases = getDaysOfWeekToKeep(time_series[1, :number_training])
    print(days_downloads)
    print(days_purchases)

    # A debugging exit() used to sit here. It terminated the interpreter, so
    # nothing below ever ran and no auto-correlation file was written.

    # The return values are discarded, so these calls presumably clean the
    # purchases slice in place -- TODO confirm against their definitions.
    removeOutlierDaysOfWeek(time_series[1, :number_training])
    # NOTE(review): 'removeOutiers' is spelled as in the original call;
    # confirm it matches the name of the function defined elsewhere.
    removeOutiers(time_series[1, :number_training], 0.8)

    downloads = time_series[0, :number_training]
    purchases = time_series[1, :number_training]

    auto_correlation_data = NP.hstack([getAutoCorrelation(downloads, max_lag),
                                       getAutoCorrelation(purchases, max_lag)])
    csv.writeCsv(auto_correlation_matrix_csv, list(auto_correlation_data), header)
def analyzeTimeSeries(filename, max_lag, fraction_training):
    """Main function: train a model on a time series and predict with it.

    Reads the time series in 'filename' (assumed to be a CSV with a header
    row for now), detrends the training part, converts it to a regression
    matrix with max_lag passed as the lag limit, trains a model through
    run_weka and writes t, both series and the re-trended predictions for
    series 1 to '<base name of filename>.prediction.csv'.

    The first fraction_training of the data, rounded down to a whole number
    of weeks, is used for training and the remainder for testing.
    """
    def show(*fields):
        # Writes the fields separated by single spaces, which is what the
        # Python 2 statement `print a, b, c` produces.
        print(' '.join(str(field) for field in fields))

    separator = '--------------------------------------------'

    stem = os.path.splitext(filename)[0]
    regression_matrix_csv = stem + '.regression.csv'
    results_filename = stem + '.results'
    model_filename = stem + '.model'
    prediction_matrix_csv = stem + '.prediction.csv'

    # Assume input file is a CSV with a header row
    rows, _ = csv.readCsvFloat2(filename, True)

    # Assume a weekly pattern: train on a whole number of weeks
    whole_weeks = int(float(len(rows)) * fraction_training) // 7
    number_training = whole_weeks * 7
    show('number_training', number_training,
         'fraction_training', fraction_training,
         'len(time_series_data)', len(rows))
    assert number_training > max_lag

    # One row per series after the transpose
    time_series = NP.transpose(NP.array(rows))
    describeNPArray('time_series', time_series)
    training_time_series = NP.transpose(NP.array(rows[:number_training]))
    show('training_time_series.shape', training_time_series.shape)

    t = NP.arange(time_series.shape[1])
    training_t = NP.arange(training_time_series.shape[1])

    num_series = training_time_series.shape[0]
    num_rows = training_time_series.shape[1]

    # Days of week to keep are chosen from the training data only; the masks
    # are then built for both the full and the training length.
    days_to_keep = [getDaysOfWeekToKeep(series) for series in training_time_series]
    masks = [getDaysOfWeekMask(days, time_series.shape[1]) for days in days_to_keep]
    training_masks = [getDaysOfWeekMask(days, num_rows) for days in days_to_keep]

    trends = [getTrend(training_t, series, mask)
              for series, mask in zip(training_time_series, training_masks)]
    x = [removeTrend1D(trend, training_t, series, mask)
         for trend, series, mask in zip(trends, training_time_series, training_masks)]
    for index, vector in enumerate(x):
        describeNPVector('x[%0d]' % index, vector)

    # Pack the detrended training vectors into one 2D array
    detrended_training_time_series = NP.zeros([num_series, x[0].shape[0]])
    show('detrended_training_time_series.shape', detrended_training_time_series.shape)
    for index, vector in enumerate(x):
        show('x[%0d].shape' % index, vector.shape)
        detrended_training_time_series[index, :] = vector
    show('detrended_training_time_series.shape', detrended_training_time_series.shape)

    for index, vector in enumerate(detrended_training_time_series):
        describeNPVector('detrended_training_time_series[%0d]' % index, vector)

    means, stddevs = timeSeriesToMatrixCsv(regression_matrix_csv,
                                           detrended_training_time_series,
                                           training_masks, max_lag)
    show('means', means)
    show('stddevs', stddevs)

    run_weka.runMLPTrain(regression_matrix_csv, results_filename, model_filename, True, '-H 4')
    coefficients = run_weka.getCoefficients(results_filename)

    print(separator)
    show('coefficients', len(coefficients))
    print(coefficients)
    print(separator)
    show('means', len(means))
    print(means)
    print(separator)
    show('stddevs', len(stddevs))
    print(stddevs)
    print(separator)

    # Detrend the complete series with the trends fitted on the training part
    detrended_full_x = [removeTrend1D(trend, t, series, mask)
                        for trend, series, mask in zip(trends, time_series, masks)]
    detrended_time_series = NP.zeros([num_series, detrended_full_x[0].shape[0]])
    show('detrended_time_series.shape', detrended_time_series.shape)
    for index, vector in enumerate(detrended_full_x):
        show('full_x[%0d].shape' % index, vector.shape)

    detrended_predictions = predictTimeSeries(coefficients, means, stddevs, t,
                                              detrended_full_x[0], detrended_full_x[1],
                                              number_training, max_lag, masks)
    # Put the trend of series 1 back onto the predictions
    predictions = addTrend1D(trends[1], t, detrended_predictions, masks[1])

    print(separator)
    show('predictions =', predictions.shape)

    full_x = [NP.array(time_series[index]) for index in range(num_series)]

    show('t.shape', t.shape)
    show('full_x[0].shape', full_x[0].shape)
    show('full_x[1].shape', full_x[1].shape)
    show('predictions.shape', predictions.shape)
    predicted_time_series = NP.vstack([t, full_x[0], full_x[1], predictions])
    show('predicted_time_series.shape', predicted_time_series.shape)

    # One output row per time step, every value written as a string
    prediction_header = ['t', 'x', 'y', 'y_pred']
    predicted_time_series_data = [[str(value) for value in column]
                                  for column in NP.transpose(predicted_time_series)]
    csv.writeCsv(prediction_matrix_csv, predicted_time_series_data, prediction_header)
def runWekaOnTimeSeries(time_series_csv, max_lag, fraction_training):
    """Train on a 2 column time series through run_weka and evaluate predictions.

    The series is converted into a regression matrix by timeSeriesToMatrixCsv
    (max_lag is passed as its lag argument) and a model is trained on it with
    run_weka.runMLPTrain. Column 1 is then predicted row by row for the rows
    after the training part, and actual versus predicted values are written
    to '<base name of time_series_csv>.evaluation.csv'.

    time_series_csv: path of the input CSV file, read with csv.readCsvFloat2
    max_lag: lag argument; the training part must contain more rows than this
    fraction_training: fraction of the sample used for training, rounded
        down to a whole number of weeks
    """
    def show(*fields):
        # Writes the fields separated by single spaces, which is what the
        # Python 2 statement `print a, b, c` produces.
        print(' '.join(str(field) for field in fields))

    base_name = os.path.splitext(time_series_csv)[0]
    regression_matrix_csv = base_name + '.regression.csv'
    results_filename = base_name + '.results'
    model_filename = base_name + '.model'
    predictions_filename = base_name + '.predict'
    test_filename = base_name + '.test.csv'
    evaluation_filename = base_name + '.evaluation.csv'

    time_series_data, _ = csv.readCsvFloat2(time_series_csv, True)

    # Train on a whole number of weeks
    number_training = (int(float(len(time_series_data)) * fraction_training) // 7) * 7
    show('number_training', number_training,
         'fraction_training', fraction_training,
         'len(time_series_data)', len(time_series_data))
    assert number_training > max_lag

    training_time_series = NP.transpose(NP.array(time_series_data[:number_training]))
    show('1: training_time_series.shape', training_time_series.shape)

    # Row 0 is handled as downloads, row 1 as purchases
    days_downloads = getDaysOfWeekToKeep(training_time_series[0, :])
    days_purchases = getDaysOfWeekToKeep(training_time_series[1, :])
    training_time_series = NP.vstack([
        filterDaysOfWeek(training_time_series[0, :], days_downloads),
        filterDaysOfWeek(training_time_series[1, :], days_purchases)])
    show('2: training_time_series.shape', training_time_series.shape)

    # NOTE(review): timeSeriesToMatrixCsv is called with three arguments here
    # and in the loop below, while analyzeTimeSeries passes four (including
    # masks); confirm which form matches the current definition.
    timeSeriesToMatrixCsv(regression_matrix_csv, training_time_series, max_lag)
    run_weka.runMLPTrain(regression_matrix_csv, results_filename, model_filename, True)
    show('number_training, training_time_series.shape[1]',
         number_training, training_time_series.shape[1])

    number_training_x = number_training

    # Work on a copy: column 1 of the rows after the training part is
    # overwritten below, and time_series_data must keep the actual values.
    prediction_data = CP.deepcopy(time_series_data)

    for i in range(number_training_x, len(prediction_data)):
        if i % 7 in days_purchases:
            # The window runs from row i-max_lag up to and including row i;
            # earlier rows in it already carry predictions (or -1) in column 1.
            prediction_array = NP.transpose(NP.array(prediction_data[i - max_lag:i + 1]))
            timeSeriesToMatrixCsv(test_filename, prediction_array, max_lag)
            run_weka.runMLPPredict(test_filename, model_filename, predictions_filename)
            prediction_list = run_weka.getPredictionsRegression(predictions_filename)
            show('predictions', prediction_list)
            prediction = prediction_list[0]['predicted']
        else:
            # Days of week that were not kept get a placeholder
            prediction = -1
        prediction_data[i][1] = prediction

    # Evaluate only the rows that were actually predicted. The day of week is
    # taken from the absolute row index, exactly as in the prediction loop, so
    # both loops stay in step even if number_training_x stops being a
    # multiple of 7.
    evaluation_data = []
    for i in range(number_training_x, len(prediction_data)):
        if i % 7 not in days_purchases:
            continue
        x = time_series_data[i][0]
        y_actual = time_series_data[i][1]
        y_predicted = prediction_data[i][1]
        abs_error = abs(y_predicted - y_actual)
        scale = abs(y_predicted + y_actual)
        # Fall back to the absolute error when the scale is zero
        normalized_error = abs_error / scale if scale else abs_error
        evaluation_data.append([i, x, y_actual, y_predicted, abs_error, normalized_error])

    evaluation_header = ['i', 'x', 'y_actual', 'y_predicted', 'abs_error', 'normalized_error']
    csv.writeCsv(evaluation_filename, evaluation_data, evaluation_header)