def generateTimeSeriesOutlierScores(inDir, use_link_db=False, robust=False, num_pcs=10,
                                    gamma=.5, tol_perc=1e-06, perc_missing_allowed=.05,
                                    make_zscore_vid=False, pool=DefaultPool()):
    """Compute (robust) PCA-based Mahalanobis outlier scores for the pace time series.

    Reads the grouped pace data, scores each group via computeMahalanobisDistances(),
    writes the outlier scores and z-scores to CSV files under results/, optionally
    renders a z-score video, and returns the list of c-values.
    """
    numpy.set_printoptions(linewidth=1000, precision=4)

    # Read the time-series data from the file
    logMsg("Reading files...")
    stdout.flush()
    if use_link_db:
        file_prefix = "link_"
        # pace_timeseries, pace_grouped, weights_grouped, dates_grouped, trip_names, consistent_link_set = load_pace_data(
        #     num_trips_threshold=consistent_threshold, pool=pool)
        (pace_timeseries, pace_grouped, weights_grouped, dates_grouped,
         trip_names, consistent_link_set) = load_from_file(use_link_db)
    else:
        file_prefix = "coarse_"
        (pace_timeseries, pace_grouped, dates_grouped, trip_names) = readPaceData(inDir)

    if robust:
        if gamma == "tune":
            robustStr = "RPCAtune"
        else:
            robustStr = "RPCA%d" % int(gamma * 100)
    else:
        robustStr = "PCA"
    file_prefix += "%s_%s_%dpcs_%dpercmiss" % (inDir, robustStr, num_pcs,
                                               perc_missing_allowed * 100)

    # pace_grouped = preprocess_data(pace_grouped, num_pcs,
    #                                perc_missing_allowed=perc_missing_allowed)
    pace_grouped, trip_names = remove_bad_dimensions_grouped(
        pace_grouped, trip_names, perc_missing_allowed)
    # logMsg(trip_names)

    # Also get global pace information
    if inDir != '':
        global_pace_timeseries = readGlobalPace(inDir)
        (expected_pace_timeseries, sd_pace_timeseries) = getExpectedPace(global_pace_timeseries)

    logMsg("Starting processes")
    if gamma == "tune":
        logMsg("Doing RPCA and tuning gamma")
    else:
        logMsg("Doing RPCA with gamma=%f, k=%d" % (gamma, num_pcs))
    stdout.flush()

    # Freeze the parameters of the computeMahalanobisDistances() function
    mahalFunc = partial(computeMahalanobisDistances, robust=robust, k=num_pcs,
                        gamma=gamma, tol_perc=tol_perc)

    # Compute all Mahalanobis distances
    sorted_keys = sorted(pace_grouped)
    groups = [(key, pace_grouped[key]) for key in sorted_keys]
    # Run all of the groups, using as much parallel computing as possible
    outlier_scores = pool.map(mahalFunc, groups)

    logMsg("Merging output")
    # Merge outputs from all of the threads
    entries = reduceOutlierScores(outlier_scores, sorted_keys, dates_grouped)

    logMsg("Writing file")
    # Output outlier scores to file
    scoreWriter = csv.writer(open("results/%s_robust_outlier_scores.csv" % file_prefix, "w"))
    scoreWriter.writerow(['date', 'hour', 'weekday', 'mahal5', 'mahal10', 'mahal20',
                          'mahal50', 'c_val', 'gamma', 'tol', 'pca_dim', 'num_guess',
                          'hi_pcs', 'global_pace', 'expected_pace', 'sd_pace'])

    for (date, hour, weekday, mahal5, mahal10, mahal20, mahal50, c_val, z_scores,
         gamma, tol, n_pca_dim, n_guess, hi_pcs) in sorted(entries):
        try:
            gl_pace = global_pace_timeseries[(date, hour, weekday)]
            exp_pace = expected_pace_timeseries[(date, hour, weekday)]
            sd_pace = sd_pace_timeseries[(date, hour, weekday)]
        except:
            # Also covers the case where inDir == '' and the global pace series was never loaded
            gl_pace = 0
            exp_pace = 0
            sd_pace = 0

        scoreWriter.writerow([date, hour, weekday, mahal5, mahal10, mahal20, mahal50,
                              c_val, gamma, tol, n_pca_dim, n_guess, hi_pcs,
                              gl_pace, exp_pace, sd_pace])

    all_cvals = [c_val for (date, hour, weekday, mahal5, mahal10, mahal20, mahal50,
                            c_val, z_scores, gamma, tol, n_pca_dim, n_guess, hi_pcs)
                 in sorted(entries)]

    zscoreWriter = csv.writer(open("results/%s_zscore.csv" % file_prefix, "w"))
    zscoreWriter.writerow(['Date', 'Hour', 'Weekday'] + trip_names)
    # Output z-scores to file
    for (date, hour, weekday, mahal5, mahal10, mahal20, mahal50, c_val, z_scores,
         gamma, tol, n_pca_dim, n_guess, hi_pcs) in sorted(entries):
        std_vect = z_scores
        zscoreWriter.writerow([date, hour, weekday] + ravel(std_vect).tolist())

    # def make_video(tmp_folder, filename_base, pool=DefaultPool(), dates=None, speed_dicts=None)
    if make_zscore_vid:
        logMsg("Making speed dicts")
        # zscore_list = [zscores[key] for key in sorted(zscores)]
        date_list = []
        zscore_list = []
        for (date, hour, weekday, mahal5, mahal10, mahal20, mahal50, c_val, z_scores,
             gamma, tol, n_pca_dim, n_guess, hi_pcs) in sorted(entries):
            if date >= '2014-06-01' and date < '2014-07-01':
                dt = datetime.strptime(date, '%Y-%m-%d') + timedelta(hours=int(hour))
                date_list.append(dt)
                zscore_list.append(z_scores)
        speed_dicts = build_speed_dicts(consistent_link_set, zscore_list)

        logMsg("Making video with %d frames" % len(zscore_list))
        with open('tmp_zscores.pickle', 'w') as f:
            pickle.dump((date_list, speed_dicts), f)
        make_video("tmp_vid", "zscore_vid", pool=pool, dates=date_list,
                   speed_dicts=speed_dicts)

    logMsg("Done.")
    return all_cvals
def generateTimeSeriesOutlierScores(inDir, use_link_db=False, robust=False, num_pcs=10,
                                    gamma=.5, tol_perc=1e-06, perc_missing_allowed=.05,
                                    make_zscore_vid=False, pool=DefaultPool()):
    numpy.set_printoptions(linewidth=1000, precision=4)

    # Read the time-series data from the file
    logMsg("Reading files...")
    stdout.flush()
    if use_link_db:
        file_prefix = "link_"
        # pace_timeseries, pace_grouped, weights_grouped, dates_grouped, trip_names, consistent_link_set = load_pace_data(
        #     num_trips_threshold=consistent_threshold, pool=pool)
        (pace_timeseries, pace_grouped, weights_grouped, dates_grouped,
         trip_names, consistent_link_set) = load_from_file(use_link_db)
    else:
        file_prefix = "coarse_"
        (pace_timeseries, pace_grouped, dates_grouped, trip_names) = readPaceData(inDir)

    if robust:
        if gamma == "tune":
            robustStr = "RPCAtune"
        else:
            robustStr = "RPCA%d" % int(gamma * 100)
    else:
        robustStr = "PCA"
    file_prefix += "%s_%s_%dpcs_%dpercmiss" % (inDir, robustStr, num_pcs,
                                               perc_missing_allowed * 100)

    # pace_grouped = preprocess_data(pace_grouped, num_pcs,
    #                                perc_missing_allowed=perc_missing_allowed)
    pace_grouped, trip_names = remove_bad_dimensions_grouped(
        pace_grouped, trip_names, perc_missing_allowed)
    logMsg(trip_names)

    # Also get global pace information
    global_pace_timeseries = readGlobalPace(inDir)
    (expected_pace_timeseries, sd_pace_timeseries) = getExpectedPace(global_pace_timeseries)

    logMsg("Starting processes")
    if gamma == "tune":
        logMsg("Doing RPCA and tuning gamma")
    else:
        logMsg("Doing RPCA with gamma=%f, k=%d" % (gamma, num_pcs))
    stdout.flush()

    # Freeze the parameters of the computeMahalanobisDistances() function
    mahalFunc = partial(computeMahalanobisDistances, robust=robust, k=num_pcs,
                        gamma=gamma, tol_perc=tol_perc)

    # Compute all Mahalanobis distances
    sorted_keys = sorted(pace_grouped)
    groups = [(key, pace_grouped[key]) for key in sorted_keys]
    # Run all of the groups, using as much parallel computing as possible
    outlier_scores = pool.map(mahalFunc, groups)

    logMsg("Merging output")
    # Merge outputs from all of the threads
    entries = reduceOutlierScores(outlier_scores, sorted_keys, dates_grouped)

    logMsg("Writing file")
    # Output outlier scores to file
    scoreWriter = csv.writer(open("results/%s_robust_outlier_scores.csv" % file_prefix, "w"))
    scoreWriter.writerow(['date', 'hour', 'weekday', 'mahal5', 'mahal10', 'mahal20',
                          'mahal50', 'c_val', 'gamma', 'tol', 'pca_dim', 'num_guess',
                          'hi_pcs', 'global_pace', 'expected_pace', 'sd_pace'])

    for (date, hour, weekday, mahal5, mahal10, mahal20, mahal50, c_val, z_scores,
         gamma, tol, n_pca_dim, n_guess, hi_pcs) in sorted(entries):
        try:
            gl_pace = global_pace_timeseries[(date, hour, weekday)]
            exp_pace = expected_pace_timeseries[(date, hour, weekday)]
            sd_pace = sd_pace_timeseries[(date, hour, weekday)]
        except:
            gl_pace = 0
            exp_pace = 0
            sd_pace = 0

        scoreWriter.writerow([date, hour, weekday, mahal5, mahal10, mahal20, mahal50,
                              c_val, gamma, tol, n_pca_dim, n_guess, hi_pcs,
                              gl_pace, exp_pace, sd_pace])

    all_cvals = [c_val for (date, hour, weekday, mahal5, mahal10, mahal20, mahal50,
                            c_val, z_scores, gamma, tol, n_pca_dim, n_guess, hi_pcs)
                 in sorted(entries)]

    zscoreWriter = csv.writer(open("results/%s_zscore.csv" % file_prefix, "w"))
    zscoreWriter.writerow(['Date', 'Hour', 'Weekday'] + trip_names)
    # Output z-scores to file
    for (date, hour, weekday, mahal5, mahal10, mahal20, mahal50, c_val, z_scores,
         gamma, tol, n_pca_dim, n_guess, hi_pcs) in sorted(entries):
        std_vect = z_scores
        zscoreWriter.writerow([date, hour, weekday] + ravel(std_vect).tolist())

    # def make_video(tmp_folder, filename_base, pool=DefaultPool(), dates=None, speed_dicts=None)
    if make_zscore_vid:
        logMsg("Making speed dicts")
        # zscore_list = [zscores[key] for key in sorted(zscores)]
        date_list = []
        zscore_list = []
        for (date, hour, weekday, mahal5, mahal10, mahal20, mahal50, c_val, z_scores,
             gamma, tol, n_pca_dim, n_guess, hi_pcs) in sorted(entries):
            if date >= '2012-10-21' and date < '2012-11-11':
                dt = datetime.strptime(date, '%Y-%m-%d') + timedelta(hours=int(hour))
                date_list.append(dt)
                zscore_list.append(z_scores)
        speed_dicts = build_speed_dicts(consistent_link_set, zscore_list)

        logMsg("Making video with %d frames" % len(zscore_list))
        with open('tmp_zscores.pickle', 'w') as f:
            pickle.dump((date_list, speed_dicts), f)
        make_video("tmp_vid", "zscore_vid", pool=pool, dates=date_list,
                   speed_dicts=speed_dicts)

    logMsg("Done.")
    return all_cvals
def generateTimeSeriesOutlierScores(inDir, use_link_db=False, robust=False, num_pcs=10,
                                    gamma=.5, perc_missing_allowed=.05,
                                    make_zscore_vid=False, pool=DefaultPool()):
    numpy.set_printoptions(linewidth=1000, precision=4)

    # Read the time-series data from the file
    logMsg("Reading files...")
    stdout.flush()
    if use_link_db:
        file_prefix = "link_"
        # pace_timeseries, pace_grouped, weights_grouped, dates_grouped, trip_names, consistent_link_set = load_pace_data(
        #     num_trips_threshold=consistent_threshold, pool=pool)
        (pace_timeseries, pace_grouped, weights_grouped, dates_grouped,
         trip_names, consistent_link_set) = load_from_file('tmp_vectors.pickle')
    else:
        file_prefix = "coarse_"
        (pace_timeseries, pace_grouped, dates_grouped, trip_names) = readPaceData(inDir)

    if robust:
        robustStr = "RPCA%d" % int(gamma * 100)
    else:
        robustStr = "PCA"
    file_prefix += "%s_%s_%dpcs_%dpercmiss" % (inDir, robustStr, num_pcs,
                                               perc_missing_allowed * 100)

    # pace_grouped = preprocess_data(pace_grouped, num_pcs,
    #                                perc_missing_allowed=perc_missing_allowed)
    pace_grouped = remove_bad_dimensions_grouped(pace_grouped, perc_missing_allowed)

    # Also get global pace information
    global_pace_timeseries = readGlobalPace(inDir)
    (expected_pace_timeseries, sd_pace_timeseries) = getExpectedPace(global_pace_timeseries)

    logMsg("Starting processes")
    logMsg("Doing RPCA with gamma=%f, k=%d" % (gamma, num_pcs))
    stdout.flush()

    # Freeze the parameters of the computeMahalanobisDistances() function
    mahalFunc = partial(computeMahalanobisDistances, robust=robust, k=num_pcs, gamma=gamma)

    # Compute all Mahalanobis distances
    sorted_keys = sorted(pace_grouped)
    groups = [pace_grouped[key] for key in sorted_keys]
    # Run all of the groups, using as much parallel computing as possible
    outlier_scores = pool.map(mahalFunc, groups)

    logMsg("Merging output")
    # Merge outputs from all of the threads
    entries = reduceOutlierScores(outlier_scores, sorted_keys, dates_grouped)

    logMsg("Writing file")
    # Output outlier scores to file
    scoreWriter = csv.writer(open("results/%s_robust_outlier_scores.csv" % file_prefix, "w"))
    scoreWriter.writerow(['date', 'hour', 'weekday', 'mahal', 'c_val',
                          'global_pace', 'expected_pace', 'sd_pace'])

    for (date, hour, weekday, mahal, c_val) in sorted(entries):
        try:
            gl_pace = global_pace_timeseries[(date, hour, weekday)]
            exp_pace = expected_pace_timeseries[(date, hour, weekday)]
            sd_pace = sd_pace_timeseries[(date, hour, weekday)]
        except:
            gl_pace = 0
            exp_pace = 0
            sd_pace = 0

        scoreWriter.writerow([date, hour, weekday, mahal, c_val, gl_pace, exp_pace, sd_pace])

    """
    zscoreWriter = csv.writer(open("results/%szscore.csv" % file_prefix, "w"))
    zscoreWriter.writerow(['Date', 'Hour', 'Weekday'] + trip_names)
    # Output z-scores to file
    for (date, hour, weekday) in sorted(zscores):
        std_vect = zscores[date, hour, weekday]
        zscoreWriter.writerow([date, hour, weekday] + ravel(std_vect).tolist())
    """

    # def make_video(tmp_folder, filename_base, pool=DefaultPool(), dates=None, speed_dicts=None)
    if make_zscore_vid:
        logMsg("Making speed dicts")
        # zscore_list = [zscores[key] for key in sorted(zscores)]
        date_list = dates = [datetime(2012, 10, 21) + timedelta(hours=1) * x
                             for x in range(168 * 3)]
        weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                         'Friday', 'Saturday', 'Sunday']
        zscore_list = [zscores[str(d.date()), d.hour, weekday_names[d.weekday()]]
                       for d in date_list]
        speed_dicts = build_speed_dicts(consistent_link_set, zscore_list)
        logMsg("Making video")
        make_video("tmp_vid", "zscore_vid", pool=pool, dates=date_list,
                   speed_dicts=speed_dicts)

    logMsg("Done.")
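
# ---------------------------------------------------------------------------
# A minimal, hypothetical driver sketching one way generateTimeSeriesOutlierScores()
# might be invoked. The input directory name '4year_features' and the parameter
# values below are illustrative assumptions, not values taken from the rest of
# the project. The project's own DefaultPool() is used; any pool object exposing
# the same map() interface could be passed via the pool argument instead.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    # Robust PCA with gamma=0.5 and 10 principal components; outlier scores are
    # written under results/ with a prefix derived from these arguments.
    generateTimeSeriesOutlierScores('4year_features',
                                    use_link_db=False,
                                    robust=True,
                                    num_pcs=10,
                                    gamma=.5,
                                    make_zscore_vid=False,
                                    pool=DefaultPool())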