Ejemplo n.º 1
0
def findAutoCorrelations(time_series_csv, max_lag, fraction_training):
    """ Run auto-correlations independently 2 column time series 
        by converting into a regression with max_lag x and y lags
        per instance 
        fraction_training is the fraction of sample used for training
    """  
    base_name = os.path.splitext(time_series_csv)[0]
    auto_correlation_matrix_csv = base_name + '.autocorrelation.csv'
    time_series_data,header = csv.readCsvFloat2(time_series_csv, True)
    number_training = int(float(len(time_series_data))*fraction_training)
    print 'number_training', number_training, 'fraction_training', fraction_training,'len(time_series_data)',len(time_series_data)
    assert(number_training > max_lag)
    time_series = NP.transpose(NP.array(time_series_data))
    days_downloads = getDaysOfWeekToKeep(time_series[0,:number_training])
    days_purchases = getDaysOfWeekToKeep(time_series[1,:number_training])
    print days_downloads
    print days_purchases
    exit()
    removeOutlierDaysOfWeek(time_series[1,:number_training])
    removeOutiers(time_series[1,:number_training], 0.8)
    downloads = time_series[0,:number_training]
    purchases = time_series[1,:number_training]
    #auto_correlations = [getAutoCorrelation(time_series[i,:number_training], max_lag) for i in range(time_series.shape[2])]
    #return (getAutoCorrelation(downloads, max_lag),getAutoCorrelation(purchases, max_lag))
    auto_correlation_data = NP.hstack([getAutoCorrelation(downloads, max_lag),getAutoCorrelation(purchases, max_lag)])
    csv.writeCsv(auto_correlation_matrix_csv, list(auto_correlation_data), header)
Ejemplo n.º 2
0
def analyzeTimeSeries(filename, max_lag, fraction_training):
    """ Main function. 
        Analyze the 2 column time series in 'filename' (assumed to be a CSV
        with a header row): detrend each series, train a Weka MLP regression
        on up to max_lag lags, predict the 2nd series over the full range,
        and write 't,x,y,y_pred' rows to '<base_name>.prediction.csv'.
        Use the first fraction_training of data for training and the 
        remainder for testing.
    """
    
    # All output files share the input file's base name.
    base_name = os.path.splitext(filename)[0]
    regression_matrix_csv = base_name + '.regression.csv'
    results_filename = base_name + '.results' 
    model_filename = base_name + '.model' 
    prediction_matrix_csv = base_name + '.prediction.csv'
    
    """ Assume input file is a CSV with a header row """
    time_series_data, header = csv.readCsvFloat2(filename, True)
    
    """ Assume a weekly pattern """
    # Round the training count down to a whole number of weeks so that
    # day-of-week indices (i % 7) line up between training and test data.
    number_training = (int(float(len(time_series_data))*fraction_training)//7)*7
    
    print 'number_training', number_training, 'fraction_training', fraction_training,'len(time_series_data)',len(time_series_data)
    # Need more training rows than lags or the regression matrix is empty.
    assert(number_training > max_lag)
    
    # Rows -> columns: time_series[i,:] is the i-th series over time.
    time_series = NP.transpose(NP.array(time_series_data))
    describeNPArray('time_series', time_series)
        
    training_time_series = NP.transpose(NP.array(time_series_data[:number_training]))
    print 'training_time_series.shape', training_time_series.shape
    
    # Integer time axes for the full range and the training range.
    t = NP.arange(time_series.shape[1])
    training_t = NP.arange(training_time_series.shape[1])
    
    num_series = training_time_series.shape[0]
    num_rows = training_time_series.shape[1]
    
    # Per-series weekdays considered usable — presumably indices 0-6;
    # TODO confirm semantics of getDaysOfWeekToKeep.
    days_to_keep = [getDaysOfWeekToKeep(training_time_series[i,:]) for i in range(num_series)]
    
    # Boolean(?) masks selecting kept weekdays over the full and training ranges.
    masks = [getDaysOfWeekMask(days_to_keep[i], time_series.shape[1]) for i in range(num_series)]
    training_masks = [getDaysOfWeekMask(days_to_keep[i], num_rows) for i in range(num_series)]
    
    # Fit a trend to each training series on its kept days only.
    trends = [getTrend(training_t, training_time_series[i,:], training_masks[i]) for i in range(num_series)]
    
    # Detrend the training data; the residuals are what the MLP learns.
    x = [removeTrend1D(trends[i], training_t, training_time_series[i], training_masks[i]) for i in range(num_series)]
    for i in range(num_series):
        describeNPVector('x[%0d]'%i, x[i])
    detrended_training_time_series = NP.zeros([num_series, x[0].shape[0]])
    print 'detrended_training_time_series.shape', detrended_training_time_series.shape
    for i in range(num_series):
        print 'x[%0d].shape'%i, x[i].shape
        detrended_training_time_series[i,:] = x[i]
    print 'detrended_training_time_series.shape', detrended_training_time_series.shape
    # filtered_time_series = NP.vstack([filterDaysOfWeek(training_time_series[i,:], days_to_keep[i]) for i in range(num_series)])
    # print 'filtered_time_series.shape', filtered_time_series.shape
   
    for i in range(num_series):
        describeNPVector('detrended_training_time_series[%0d]'%i, detrended_training_time_series[i])
        
    # Build the lagged regression matrix CSV; also returns the per-column
    # normalization statistics needed later to apply the model.
    means, stddevs = timeSeriesToMatrixCsv(regression_matrix_csv, detrended_training_time_series, training_masks, max_lag)
    print 'means', means
    print 'stddevs', stddevs
    # Train a Weka multilayer perceptron ('-H 4': 4 hidden nodes) and
    # recover its coefficients from the results file.
    run_weka.runMLPTrain(regression_matrix_csv, results_filename, model_filename, True, '-H 4')
    coefficients = run_weka.getCoefficients(results_filename)
   
    print '--------------------------------------------'
    print 'coefficients', len(coefficients)
    print coefficients
    print '--------------------------------------------'
    print 'means', len(means)
    print means
    print '--------------------------------------------'
    print 'stddevs', len(stddevs)
    print stddevs
    print '--------------------------------------------'
    #exit()
    # Detrend the FULL series (training + test) using the trends fitted
    # on training data only, then predict over the whole range.
    detrended_full_x = [removeTrend1D(trends[i], t, time_series[i], masks[i]) for i in range(num_series)]
    detrended_time_series = NP.zeros([num_series, detrended_full_x[0].shape[0]])
    print 'detrended_time_series.shape', detrended_time_series.shape
    for i in range(num_series):
        print 'full_x[%0d].shape'%i, detrended_full_x[i].shape
    detrended_predictions = predictTimeSeries(coefficients, means, stddevs, t, detrended_full_x[0], detrended_full_x[1], number_training, max_lag, masks)
    # Re-add the trend of series 1 (the predicted series) to get
    # predictions in the original units.
    predictions = addTrend1D(trends[1], t, detrended_predictions, masks[1]) 
    print '--------------------------------------------'
    print 'predictions =', predictions.shape
    # print predictions
    full_x = [NP.array(time_series[i]) for i in range(num_series)]
    
    print 't.shape', t.shape
    print 'full_x[0].shape', full_x[0].shape
    print 'full_x[1].shape', full_x[1].shape
    print 'predictions.shape', predictions.shape
    
    # One row per output column: time, actual x, actual y, predicted y.
    predicted_time_series = NP.vstack([t, full_x[0], full_x[1], predictions])
    
    print 'predicted_time_series.shape', predicted_time_series.shape
    # retrend !@#$\\
    prediction_header = ['t', 'x', 'y', 'y_pred']
    # Transpose back to one list-of-strings row per time step for the CSV.
    predicted_time_series_data = [[str(predicted_time_series[i,j]) 
                                    for i in range(predicted_time_series.shape[0])]
                                        for j in range(predicted_time_series.shape[1])]
                            
    csv.writeCsv(prediction_matrix_csv, predicted_time_series_data, prediction_header)
Ejemplo n.º 3
0
def runWekaOnTimeSeries(time_series_csv, max_lag, fraction_training):
    """ Run Weka training on a 2 column time series 
        by converting into a regression with max_lag x and y lags
        per instance.
        fraction_training is the fraction of sample used for training.
        Trains an MLP on the first part of the data, predicts column 1
        one day at a time over the remainder, and writes per-day actual
        vs predicted values and errors to '<base_name>.evaluation.csv'.
    """  
    # All intermediate and output files share the input's base name.
    base_name = os.path.splitext(time_series_csv)[0]
    regression_matrix_csv = base_name + '.regression.csv'
    results_filename = base_name + '.results' 
    model_filename = base_name + '.model' 
    predictions_filename =  base_name + '.predict'
    test_filename = base_name + '.test.csv'
    evaluation_filename = base_name + '.evaluation.csv'
    
    time_series_data,_ = csv.readCsvFloat2(time_series_csv, True)
    # Round the training count down to a whole number of weeks so that
    # i % 7 gives a consistent day-of-week index below.
    number_training = (int(float(len(time_series_data))*fraction_training)//7)*7

    print 'number_training', number_training, 'fraction_training', fraction_training,'len(time_series_data)',len(time_series_data)
    # Need more training rows than lags or the regression matrix is empty.
    assert(number_training > max_lag)
    
    training_time_series = NP.transpose(NP.array(time_series_data[:number_training]))
    print '1: training_time_series.shape', training_time_series.shape
    
    # NOTE(review): days_downloads/days_purchases are bound only inside this
    # 'if True:' block but are used unconditionally below — if the guard is
    # ever turned off, the prediction loop raises NameError.
    if True:
        days_downloads = getDaysOfWeekToKeep(training_time_series[0,:])
        days_purchases = getDaysOfWeekToKeep(training_time_series[1,:])
        # Drop rows for weekdays not in the keep list before training.
        training_time_series = NP.vstack([filterDaysOfWeek(training_time_series[0,:], days_downloads),
         filterDaysOfWeek(training_time_series[1,:], days_purchases)])
        print '2: training_time_series.shape', training_time_series.shape
    
    if True:
    
        # Build the lagged regression matrix and train the Weka MLP.
        timeSeriesToMatrixCsv(regression_matrix_csv, training_time_series, max_lag)
        run_weka.runMLPTrain(regression_matrix_csv, results_filename, model_filename, True)
    
    print 'number_training, training_time_series.shape[1]', number_training, training_time_series.shape[1]
    number_training_x = number_training #- 5
    
    # Work on a deep copy so time_series_data keeps the actual values
    # for the evaluation step below.
    prediction_data = CP.deepcopy(time_series_data)
    # Copy with column 1 zeroed — only used by the disabled branch below.
    prediction_data_downloads = [[row[0],0] for row in prediction_data]

    # Walk the test range one day at a time, predicting column 1 from the
    # preceding max_lag rows (which include earlier predictions).
    # NOTE(review): 'i % 7 in days_purchases' assumes the series starts on
    # weekday index 0 — confirm against how getDaysOfWeekToKeep numbers days.
    for i in range(number_training_x, len(prediction_data)):
        if i%7 in days_purchases:
            prediction_array = NP.transpose(NP.array(prediction_data[i-max_lag:i+1]))
            timeSeriesToMatrixCsv(test_filename, prediction_array, max_lag)
            run_weka.runMLPPredict(test_filename, model_filename, predictions_filename)
            prediction_list = run_weka.getPredictionsRegression(predictions_filename)
            print 'predictions', prediction_list
            prediction = prediction_list[0]['predicted']
            if False:
                prediction_array_downloads = NP.transpose(NP.array(prediction_data_downloads[i-max_lag:i+1]))
                timeSeriesToMatrixCsv(test_filename, prediction_array_downloads, max_lag)
                run_weka.runMLPPredict(test_filename, model_filename, predictions_filename)
                prediction_list_downloads = run_weka.getPredictionsRegression(predictions_filename)
                print 'predictions_downloads', prediction_list_downloads
                # NOTE(review): likely copy-paste bug — should presumably read
                # prediction_list_downloads[0]['predicted']. Dead code while
                # the 'if False:' guard stands.
                prediction_downloads = prediction_list[0]['predicted']
        else:
            # Sentinel for skipped weekdays.
            prediction = -1
            prediction_downloads = -1
        prediction_data[i][1] = prediction
        #prediction_data[i] = [prediction_data[i][0], prediction, prediction_downloads]
       
          
    # Compare predictions against actuals over the test range.
    # Since number_training_x is a multiple of 7, i%7 here matches the
    # absolute day-of-week index used in the loop above.
    evaluation_data = []
    for i in range(len(prediction_data)-number_training_x):
        if i%7 in days_purchases:
            row = [0]*5
            for j in [0,1]:
                row[j] = time_series_data[number_training_x+i][j]
            row[2] = prediction_data[number_training_x+i][1] 
            row[3] = abs(row[2]-row[1])
            # Normalized error, guarding against division by zero.
            row[4] = row[3]/abs(row[2]+row[1]) if abs(row[2]+row[1]) else row[3]
            evaluation_data.append([number_training_x+i]+row)
     
    evaluation_header = ['i', 'x', 'y_actual', 'y_predicted', 'abs_error', 'normalized_error']
    
    csv.writeCsv(evaluation_filename, evaluation_data, evaluation_header)