Example #1
def generateTimeSeriesOutlierScores(inDir,
                                    use_link_db=False,
                                    robust=False,
                                    num_pcs=10,
                                    gamma=.5,
                                    tol_perc=1e-06,
                                    perc_missing_allowed=.05,
                                    make_zscore_vid=False,
                                    pool=DefaultPool()):
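    """
    Compute time-series outlier scores (Mahalanobis distances) for grouped pace data.

    Parameter notes (inferred from the function body below, not original documentation):
      inDir                - directory containing the pace data and global pace files
      use_link_db          - if truthy, load link-level data via load_from_file(use_link_db)
                             instead of readPaceData(inDir)
      robust               - use robust PCA (RPCA) instead of standard PCA
      num_pcs              - number of principal components (k)
      gamma                - RPCA regularization weight, or "tune" to tune it automatically
      tol_perc             - tolerance forwarded to computeMahalanobisDistances()
      perc_missing_allowed - maximum fraction of missing values allowed per dimension
      make_zscore_vid      - if True, also render a video of the z-scores
      pool                 - worker pool used to parallelize the per-group computation

    Returns the list of c-values, one per (date, hour, weekday) entry.
    """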

    numpy.set_printoptions(linewidth=1000, precision=4)

    #Read the time-series data from the file
    logMsg("Reading files...")
    stdout.flush()
    if (use_link_db):
        file_prefix = "link_"

        #pace_timeseries, pace_grouped, weights_grouped, dates_grouped, trip_names, consistent_link_set = load_pace_data(
        #    num_trips_threshold=consistent_threshold, pool=pool)

        pace_timeseries, pace_grouped, weights_grouped, dates_grouped, trip_names, consistent_link_set = load_from_file(
            use_link_db)

    else:
        file_prefix = "coarse_"
        (pace_timeseries, pace_grouped, dates_grouped,
         trip_names) = readPaceData(inDir)

    if (robust):
        if (gamma == "tune"):
            robustStr = "RPCAtune"
        else:
            robustStr = "RPCA%d" % int(gamma * 100)
    else:
        robustStr = "PCA"

    file_prefix += "%s_%s_%dpcs_%dpercmiss" % (inDir, robustStr, num_pcs,
                                               perc_missing_allowed * 100)

    #pace_grouped = preprocess_data(pace_grouped, num_pcs,
    #                               perc_missing_allowed=perc_missing_allowed)
    pace_grouped, trip_names = remove_bad_dimensions_grouped(
        pace_grouped, trip_names, perc_missing_allowed)
    # logMsg(trip_names)

    #Also get global pace information
    if inDir != '':
        global_pace_timeseries = readGlobalPace(inDir)
        (expected_pace_timeseries,
         sd_pace_timeseries) = getExpectedPace(global_pace_timeseries)

    logMsg("Starting processes")
    if (gamma == "tune"):
        logMsg("Doing RPCA and tuning gamma")
    else:
        logMsg("Doing RPCA with gamma=%f, k=%d" % (gamma, num_pcs))
    stdout.flush()

    # Freeze the parameters of the computeMahalanobisDistances() function
    mahalFunc = partial(computeMahalanobisDistances,
                        robust=robust,
                        k=num_pcs,
                        gamma=gamma,
                        tol_perc=tol_perc)

    # Compute all mahalanobis distances
    sorted_keys = sorted(pace_grouped)
    groups = [(key, pace_grouped[key]) for key in sorted_keys]
    outlier_scores = pool.map(
        mahalFunc, groups
    )  #Run all of the groups, using as much parallel computing as possible

    logMsg("Merging output")
    #Merge outputs from all of the threads
    entries = reduceOutlierScores(outlier_scores, sorted_keys, dates_grouped)

    logMsg("Writing file")
    #Output outlier scores to file
    scoreWriter = csv.writer(
        open("results/%s_robust_outlier_scores.csv" % file_prefix, "w"))
    scoreWriter.writerow([
        'date', 'hour', 'weekday', 'mahal5', 'mahal10', 'mahal20', 'mahal50',
        'c_val', 'gamma', 'tol', 'pca_dim', 'num_guess', 'hi_pcs',
        'global_pace', 'expected_pace', 'sd_pace'
    ])

    for (date, hour, weekday, mahal5, mahal10, mahal20, mahal50, c_val,
         z_scores, gamma, tol, n_pca_dim, n_guess, hi_pcs) in sorted(entries):
        try:
            gl_pace = global_pace_timeseries[(date, hour, weekday)]
            exp_pace = expected_pace_timeseries[(date, hour, weekday)]
            sd_pace = sd_pace_timeseries[(date, hour, weekday)]
        except (KeyError, NameError):  # missing timeslice, or global pace never loaded (inDir == '')
            gl_pace = 0
            exp_pace = 0
            sd_pace = 0

        scoreWriter.writerow([
            date, hour, weekday, mahal5, mahal10, mahal20, mahal50, c_val,
            gamma, tol, n_pca_dim, n_guess, hi_pcs, gl_pace, exp_pace, sd_pace
        ])

    all_cvals = [
        c_val for (date, hour, weekday, mahal5, mahal10, mahal20, mahal50,
                   c_val, z_scores, gamma, tol, n_pca_dim, n_guess,
                   hi_pcs) in sorted(entries)
    ]

    zscoreWriter = csv.writer(open("results/%s_zscore.csv" % file_prefix, "w"))
    zscoreWriter.writerow(['Date', 'Hour', 'Weekday'] + trip_names)
    #Output zscores to file
    for (date, hour, weekday, mahal5, mahal10, mahal20, mahal50, c_val,
         z_scores, gamma, tol, n_pca_dim, n_guess, hi_pcs) in sorted(entries):
        std_vect = z_scores
        zscoreWriter.writerow([date, hour, weekday] + ravel(std_vect).tolist())

    #def make_video(tmp_folder, filename_base, pool=DefaultPool(), dates=None, speed_dicts=None)
    if (make_zscore_vid):
        logMsg("Making speed dicts")
        #zscore_list = [zscores[key] for key in sorted(zscores)]
        date_list = []
        zscore_list = []

        for (date, hour, weekday, mahal5, mahal10, mahal20, mahal50, c_val,
             z_scores, gamma, tol, n_pca_dim, n_guess,
             hi_pcs) in sorted(entries):
            if (date >= '2014-06-01' and date < '2014-07-01'):
                dt = datetime.strptime(date,
                                       '%Y-%m-%d') + timedelta(hours=int(hour))
                date_list.append(dt)
                zscore_list.append(z_scores)

        speed_dicts = build_speed_dicts(consistent_link_set, zscore_list)
        logMsg("Making video with %d frames" % len(zscore_list))

        with open('tmp_zscores.pickle', 'wb') as f:
            pickle.dump((date_list, speed_dicts), f)
        make_video("tmp_vid",
                   "zscore_vid",
                   pool=pool,
                   dates=date_list,
                   speed_dicts=speed_dicts)

    logMsg("Done.")
    return all_cvals
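
A minimal invocation sketch for the function above (the directory name and argument values are illustrative assumptions, not taken from the repository):

# Hypothetical call: robust outlier scores from coarse-grained pace data in "pace_data".
# With these arguments the scores would be written to
# results/coarse_pace_data_RPCA50_10pcs_5percmiss_robust_outlier_scores.csv
all_cvals = generateTimeSeriesOutlierScores("pace_data", robust=True, num_pcs=10, gamma=.5)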
Example #2
def generateTimeSeriesOutlierScores(inDir, use_link_db=False, robust=False, num_pcs=10,
                                    gamma=.5, tol_perc=1e-06, perc_missing_allowed=.05,
                                    make_zscore_vid=False, pool=DefaultPool()):

    numpy.set_printoptions(linewidth=1000, precision=4)
    
    #Read the time-series data from the file
    logMsg("Reading files...")
    stdout.flush()
    if(use_link_db):
        file_prefix = "link_"
        
        #pace_timeseries, pace_grouped, weights_grouped, dates_grouped, trip_names, consistent_link_set = load_pace_data(
        #    num_trips_threshold=consistent_threshold, pool=pool)
        
        pace_timeseries, pace_grouped, weights_grouped, dates_grouped, trip_names, consistent_link_set = load_from_file(use_link_db)


    else:
        file_prefix = "coarse_"
        (pace_timeseries, pace_grouped, dates_grouped, trip_names) = readPaceData(inDir)
        


    if(robust):
        if(gamma=="tune"):
            robustStr = "RPCAtune"
        else:
            robustStr = "RPCA%d" % int(gamma*100)
    else:
        robustStr = "PCA"

    file_prefix += "%s_%s_%dpcs_%dpercmiss" % (inDir, robustStr, num_pcs, perc_missing_allowed*100)

    #pace_grouped = preprocess_data(pace_grouped, num_pcs,
    #                               perc_missing_allowed=perc_missing_allowed)
    pace_grouped, trip_names = remove_bad_dimensions_grouped(pace_grouped, trip_names, perc_missing_allowed)
    logMsg(trip_names)


    #Also get global pace information
    global_pace_timeseries = readGlobalPace(inDir)
    (expected_pace_timeseries, sd_pace_timeseries) = getExpectedPace(global_pace_timeseries)

    logMsg("Starting processes")
    if(gamma=="tune"):
        logMsg("Doing RPCA and tuning gamma")
    else:
        logMsg("Doing RPCA with gamma=%f, k=%d" % (gamma, num_pcs))
    stdout.flush()

    # Freeze the parameters of the computeMahalanobisDistances() function
    mahalFunc = partial(computeMahalanobisDistances, robust=robust, k=num_pcs,
                        gamma=gamma, tol_perc=tol_perc)
    
    # Compute all mahalanobis distances
    sorted_keys = sorted(pace_grouped)    
    groups = [(key,pace_grouped[key]) for key in sorted_keys]    
    outlier_scores = pool.map(mahalFunc, groups) #Run all of the groups, using as much parallel computing as possible

    logMsg("Merging output")
    #Merge outputs from all of the threads
    entries = reduceOutlierScores(outlier_scores, sorted_keys, dates_grouped)

    
    logMsg("Writing file")
    #Output outlier scores to file
    scoreWriter = csv.writer(open("results/%s_robust_outlier_scores.csv"%file_prefix, "w"))
    scoreWriter.writerow(['date','hour','weekday', 'mahal5', 'mahal10', 'mahal20',
                          'mahal50' ,'c_val', 'gamma', 'tol', 'pca_dim', 'num_guess',
                          'hi_pcs', 'global_pace', 'expected_pace', 'sd_pace'])
    
    for (date, hour, weekday, mahal5, mahal10, mahal20, mahal50, c_val, z_scores, gamma, tol,
         n_pca_dim, n_guess, hi_pcs) in sorted(entries):
        try:
            gl_pace = global_pace_timeseries[(date, hour, weekday)]
            exp_pace = expected_pace_timeseries[(date, hour, weekday)]
            sd_pace = sd_pace_timeseries[(date, hour, weekday)]
        except KeyError:  # no global pace entry for this timeslice
            gl_pace = 0
            exp_pace = 0
            sd_pace = 0
        
        scoreWriter.writerow([date, hour, weekday,  mahal5, mahal10, mahal20, mahal50,
                              c_val, gamma, tol, n_pca_dim, n_guess, hi_pcs, 
                              gl_pace, exp_pace, sd_pace])


    all_cvals = [c_val for (date, hour, weekday, mahal5, mahal10, mahal20, mahal50,
                            c_val, z_scores, gamma, tol, n_pca_dim, n_guess, hi_pcs) in sorted(entries)]

    
    zscoreWriter= csv.writer(open("results/%s_zscore.csv"%file_prefix, "w"))
    zscoreWriter.writerow(['Date','Hour','Weekday'] + trip_names)
    #Output zscores to file
    for (date, hour, weekday, mahal5, mahal10, mahal20, mahal50, c_val, z_scores, gamma, tol,
         n_pca_dim, n_guess, hi_pcs) in sorted(entries):
        std_vect = z_scores
        zscoreWriter.writerow([date, hour, weekday] + ravel(std_vect).tolist())
    
    

    #def make_video(tmp_folder, filename_base, pool=DefaultPool(), dates=None, speed_dicts=None)
    if(make_zscore_vid):
        logMsg("Making speed dicts")
        #zscore_list = [zscores[key] for key in sorted(zscores)]
        date_list = []
        zscore_list = []
        
        for (date, hour, weekday, mahal5, mahal10, mahal20, mahal50, c_val, z_scores, gamma, tol,
             n_pca_dim, n_guess, hi_pcs) in sorted(entries):
            if(date >= '2012-10-21' and date < '2012-11-11'):
                dt = datetime.strptime(date, '%Y-%m-%d') + timedelta(hours=int(hour))
                date_list.append(dt)
                zscore_list.append(z_scores)
                
        speed_dicts = build_speed_dicts(consistent_link_set, zscore_list)
        logMsg("Making video with %d frames" % len(zscore_list))
        
        with open('tmp_zscores.pickle', 'wb') as f:
            pickle.dump((date_list, speed_dicts), f)
        make_video("tmp_vid", "zscore_vid", pool=pool, dates=date_list, speed_dicts=speed_dicts)
        
        
        

    logMsg("Done.")
    return all_cvals
Example #3
def generateTimeSeriesOutlierScores(inDir, use_link_db=False, robust=False, num_pcs=10,
                                    gamma=.5, perc_missing_allowed=.05, make_zscore_vid=False,
                                    pool=DefaultPool()):

    numpy.set_printoptions(linewidth=1000, precision=4)
    
    #Read the time-series data from the file
    logMsg("Reading files...")
    stdout.flush()
    if(use_link_db):
        file_prefix = "link_"
        
        #pace_timeseries, pace_grouped, weights_grouped, dates_grouped, trip_names, consistent_link_set = load_pace_data(
        #    num_trips_threshold=consistent_threshold, pool=pool)
        
        pace_timeseries, pace_grouped, weights_grouped, dates_grouped, trip_names, consistent_link_set = load_from_file('tmp_vectors.pickle')


    else:
        file_prefix = "coarse_"
        (pace_timeseries, pace_grouped, dates_grouped, trip_names) = readPaceData(inDir)


    if(robust):
        robustStr = "RPCA%d" % int(gamma*100)
    else:
        robustStr = "PCA"

    file_prefix += "%s_%s_%dpcs_%dpercmiss" % (inDir, robustStr, num_pcs, perc_missing_allowed*100)

    #pace_grouped = preprocess_data(pace_grouped, num_pcs,
    #                               perc_missing_allowed=perc_missing_allowed)
    pace_grouped = remove_bad_dimensions_grouped(pace_grouped, perc_missing_allowed)



    #Also get global pace information
    global_pace_timeseries = readGlobalPace(inDir)
    (expected_pace_timeseries, sd_pace_timeseries) = getExpectedPace(global_pace_timeseries)

    logMsg("Starting processes")
    logMsg("Doing RPCA with gamma=%f, k=%d" % (gamma, num_pcs))
    stdout.flush()

    # Freeze the parameters of the computeMahalanobisDistances() function
    mahalFunc = partial(computeMahalanobisDistances, robust=robust, k=num_pcs,
                        gamma=gamma)
    
    # Compute all mahalanobis distances
    sorted_keys = sorted(pace_grouped)    
    groups = [pace_grouped[key] for key in sorted_keys]    
    outlier_scores = pool.map(mahalFunc, groups) #Run all of the groups, using as much parallel computing as possible

    logMsg("Merging output")
    #Merge outputs from all of the threads
    entries = reduceOutlierScores(outlier_scores, sorted_keys, dates_grouped)

    
    logMsg("Writing file")
    #Output outlier scores to file
    scoreWriter = csv.writer(open("results/%s_robust_outlier_scores.csv"%file_prefix, "w"))
    scoreWriter.writerow(['date', 'hour', 'weekday', 'mahal', 'c_val', 'global_pace', 'expected_pace', 'sd_pace'])
    

    for (date, hour, weekday, mahal, c_val) in sorted(entries):
        try:
            gl_pace = global_pace_timeseries[(date, hour, weekday)]
            exp_pace = expected_pace_timeseries[(date, hour, weekday)]
            sd_pace = sd_pace_timeseries[(date, hour, weekday)]
        except KeyError:  # no global pace entry for this timeslice
            gl_pace = 0
            exp_pace = 0
            sd_pace = 0
        
        scoreWriter.writerow([date, hour, weekday, mahal, c_val, gl_pace, exp_pace, sd_pace])


    """
    zscoreWriter= csv.writer(open("results/%szscore.csv"%file_prefix, "w"))
    zscoreWriter.writerow(['Date','Hour','Weekday'] + trip_names)
    #Output zscores to file
    for (date, hour, weekday) in sorted(zscores):
        std_vect = zscores[date, hour, weekday]
        zscoreWriter.writerow([date, hour, weekday] + ravel(std_vect).tolist())
    """
    

    #def make_video(tmp_folder, filename_base, pool=DefaultPool(), dates=None, speed_dicts=None)
    if(make_zscore_vid):
        logMsg("Making speed dicts")
        #zscore_list = [zscores[key] for key in sorted(zscores)]
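        # NOTE: `zscores` is not defined anywhere in this version of the function; it presumably
        # comes from an earlier revision (see the commented-out line above). Unless it exists at
        # module scope, the lookup below will raise a NameError as written.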
        date_list = [datetime(2012, 10, 21) + timedelta(hours=1) * x for x in range(168 * 3)]
        weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        zscore_list = [zscores[str(d.date()), d.hour, weekday_names[d.weekday()]] for d in date_list]

        speed_dicts = build_speed_dicts(consistent_link_set, zscore_list)
        logMsg("Making video")
        make_video("tmp_vid", "zscore_vid", pool=pool, dates=date_list, speed_dicts=speed_dicts)
        
        
        

    logMsg("Done.")