def run(args, features, rank_matrix, plot_dict, outlier_ids):
    """Select plots for the outliers, save them as PNGs and log each run.

    The outlier/plot graph is generated once. Every algorithm then runs on
    its own deep copy of that graph, its selected plots are rendered with
    ``scatter_outliers`` and saved, and one summary line per algorithm is
    written to the log file.

    Args:
        args: Options object. Reads ``num_outliers``, ``budget``, ``p_val``,
            ``baseline``, ``logfolder``, ``logfile`` and ``plotfolder``.
            Folder values are concatenated directly with the file names, so
            any path separator must already be part of the folder value.
        features: Indexed with the two entries of each ``plot_dict`` value to
            obtain the two arguments plotted against each other.
        rank_matrix: Passed through to ``generate_graph``.
        plot_dict: Maps a selected plot to a pair of keys into ``features``;
            its length is reported as the total number of plots generated.
        outlier_ids: Passed through to ``generate_graph``.

    Returns:
        None. Results are written to the log file and to PNG files.
    """
    # Quick access variables
    N_val = args.num_outliers
    B_val = args.budget
    P_val = float(args.p_val)

    # Create graph between outliers and plots
    cprint("Generating Bipartite Graph")
    scaled_matrix, normal_matrix = generate_graph(P_val, rank_matrix, outlier_ids)
    saved_graph = Graph(scaled_matrix)
    print_ok("Graph Generated Successfully")

    scatter_plots = len(plot_dict)

    # The baselines are only run on request; "LookOut" is always run.
    algos = ["LookOut", "TopK", "Random"] if args.baseline else ["LookOut"]

    # The context manager closes (and flushes) the log even when selection,
    # plotting or saving raises part-way through the loop.
    with open(args.logfolder + args.logfile, 'w') as log_file:
        for algo in algos:
            cprint("\nIteration " + algo, RED)
            # Each algorithm gets its own copy so the saved graph stays intact.
            graph = copy.deepcopy(saved_graph)
            print("N_val = ", N_val, " Budget = ", B_val)

            # Timed section: plot selection plus frequency computation.
            start_time = time.time()
            cprint("Running " + algo + " Algorithm")
            plots = LookOut(graph, B_val, algo)
            frequencies = generate_frequency_list(plots, scaled_matrix)
            print_ok(algo + " Complete")
            elapsed_time = time.time() - start_time

            cprint("Saving Plots")
            coverage, max_coverage = get_coverage(plots, N_val, normal_matrix)
            print("\t-> Total Plots Generated = ", end='')
            cprint(scatter_plots, OKBLUE)
            print("\t-> Total Plots Chosen = ", end='')
            cprint(len(plots), OKBLUE)
            print("\t-> Coverage = ", end='')
            cprint("{0:.3f} / {1:.3f}".format(coverage, max_coverage), OKBLUE)

            # Save selected plots as png images
            for i, plot in enumerate(plots):
                pair = plot_dict[plot]
                fig = scatter_outliers(features[pair[0]], features[pair[1]],
                                       frequencies, plot)
                fname = args.plotfolder + '{0}-{1}-{2}-{3}.png'.format(
                    algo, N_val, B_val, i)
                fig.savefig(fname)
                # Release the figure once it is on disk.
                plt.close(fig)
            print_ok("Plots Saved")

            log_file.write(
                "N_val " + str(N_val)
                + "\tBudget " + str(B_val)
                + "\tAlgo " + algo
                + "\tTime Taken = " + str(elapsed_time)
                + "\tCoverage = " + str(coverage) + "%"
                + "\n"
            )

    cprint("Finished")
# Get Outliers Scores if using iForests if generate_iForest: cprint("Generating Graph File") features = combine_features([ eval(F) for F in identity_features + continuous_features + discrete_features ]) iForest(features) print_ok("iForest Generation Complete") file = open(filefolder + "Log.txt", 'w') N_list = [10, 20, 50, 75, 100] for N_val in N_list: # Create graph between outliers and plots cprint("Generating Graph File") ranklist.generate_graph(P_val, N_val) print_ok("Graph File Generated") # Run plotSpot to get selected graphs Budget = [1, 2, 3, 4, 5, 6] for B in Budget: for algo in ["SpellOut", "Greedy", "G_Norm"]: print "N_val = ", N_val, " Budget = ", B, " ALGO = ", algo start_time = time.time() cprint("Running PlotSpot Algorithm") plots = plotSpot(B, algo) print_ok("PlotSpot Complete") elapsed_time = time.time() - start_time coverage = get_coverage(plots) # Save selected plots in pdf cprint("Saving Plots") total_plots = scatter_plots
# Script fragment (Python 2: uses the `print` statement).
# For every feature pair: fits an IsolationForest on the two columns, scores
# the rows selected by `outlier_ids`, and appends a ranked list to
# `rank_matrix`. Afterwards builds the graph, selects plots with plotSpot and
# saves one PNG per selected plot.
# NOTE(review): nesting below was reconstructed from a whitespace-collapsed
# source — confirm against the original file.
pp = PdfPages(plotfolder + 'scatterplots--full.pdf')
for j, features in enumerate(feature_pairs):
    X, Y = features[0], features[1]
    # Progress indicator.
    print j, 'of', len(feature_pairs)
    # Stack the two feature columns and transpose, giving one row per entry
    # of the INFO columns and one column per feature.
    pair_features = np.array([INFO[features[0]], INFO[features[1]]]).T
    forest = IsolationForest(
        n_estimators=500,
        max_samples=1000,
        random_state=0,
        contamination=num_outlier / 343546.0  # number of nodes
    )
    # NOTE(review): `fig` is not written to `pp` anywhere in this fragment;
    # check whether scatter_plot saves it itself or a pp.savefig is missing.
    fig = scatter_plot(INFO[X], INFO[Y], INFO['IDs'], discription[Y], discription[X], discription[Y] + ' vs ' + discription[X], compare_value[X])
    forest.fit(pair_features)
    # Score only the rows selected by `outlier_ids`.
    scores = forest.decision_function(pair_features[outlier_ids, :])
    # Pair each outlier with its negated score and sort by that value,
    # largest first.
    rank_list = sorted([(outliers[i], -s) for (i, s) in enumerate(scores)], key=lambda x: x[1], reverse=True)
    rank_matrix.append(rank_list)
pp.close()

# TODO: runs properly up to this point; why is generate_graph returning nothing?
scaled_matrix, normal_matrix = ranklist.generate_graph(P_val, num_outlier, rank_matrix)
plots = plotSpot(budget, scaled_matrix, "SpellOut")
frequencies = generate_frequency_list(plots, scaled_matrix)
# Save each selected plot as a PNG named after num_outlier, budget and index.
for i, plot in enumerate(plots):
    fig = scatter_outliers(plot, INFO['IDs'], frequencies)
    fname = 'discoveries/DBLP--full-{0}-{1}-{2}.png'.format(num_outlier, budget, i)
    fig.savefig(fname)
# Script fragment (Python 2: uses the `print` statement).
# Runs iForest on the combined features, opens the log file, then for every
# (N_val, B, algo) combination generates the graph and runs plotSpot.
# NOTE(review): nesting below was reconstructed from a whitespace-collapsed
# source. The first four statements may belong under a guard whose header
# lies above this fragment, and the fragment ends inside the innermost loop
# body — confirm against the original file.
cprint("Generating Graph File")
# NOTE(review): eval() is applied to every name in the three feature lists;
# this is only safe if those lists are fixed by the project — confirm.
features = combine_features([eval(F) for F in identity_features + continuous_features + discrete_features])
iForest(features)
print_ok("iForest Generation Complete")

# Log file handle; it is neither written to nor closed within this fragment.
file = open(filefolder + logfile, 'w')

# Use outlier list if provided
if not generate_iForest and not merge_ranklists:
    # A single run sized to the provided outlier list.
    N_list = [len(global_outlier_list)]

# Running iteration number, used only in the progress message.
count = 0
for N_val in N_list:
    # Create graph between outliers and plots
    cprint("Generating Graph File")
    scaled_matrix, normal_matrix = ranklist.generate_graph(P_val, N_val, rank_matrix)
    print_ok("Graph File Generated")

    # Run plotSpot to get selected graphs
    for B in Budget:
        for algo in ["SpellOut", "TopK"]:
            # Every algorithm other than "SpellOut" is skipped unless
            # `baseline` is set.
            if algo != "SpellOut" and not baseline:
                continue
            count += 1
            cprint("\nIteration " + str(count), RED)
            print "N_val = ", N_val, " Budget = ", B, " ALGO = ", algo
            start_time = time.time()
            cprint ("Running PlotSpot Algorithm")
            plots = plotSpot(B, scaled_matrix, algo)
            frequencies = generate_frequency_list(plots, scaled_matrix)
# start clock data = copy.deepcopy(raw_data) start_time = time.time() # feature extraction data = data_transform.read_data(data) users = data.groupby('SOURCE') IDs = map(int, users.groups.keys()) destinations = data.groupby('DESTINATION') AMOUNT = fix_zero_error(users['WEIGHT'].sum().values.tolist()) DEST = fix_zero_error(users['DESTINATION'].nunique().values.tolist()) LIFE = fix_zero_error(users['LIFETIME'].first().values.tolist()) IN_EDGE = fix_zero_error(users['WEIGHT'].count().values.tolist()) IAT_VAR = fix_zero_error(users['IAT_VAR'].first().values.tolist()) # select plots features = combine_features([ eval(F) for F in identity_features + continuous_features + discrete_features ]) iForest(features) ranklist.generate_graph(p_val, num_outlier) plots = plotSpot(budget, 'SpellOut') # end clock time_elapsed = time.time() - start_time running_times[num_edge].append(time_elapsed) print num_edge, t, time_elapsed pickle.dump(running_times, open('results/scalability_edges.pkl', 'wb'))
time_plots = 0 # Count of the number of time plots generated """ Generate Band Plots """ band_plots = 0 # Count of the number of band plots generated """ PlotSPOT Algorithm """ # Get Outliers Scores if using iForests if generate_iForest: cprint("Generating Graph File") features = combine_features([ eval(F) for F in identity_features + continuous_features + discrete_features ]) iForest(features) print_ok("iForest Generation Complete") # Create graph between outliers and plots cprint("Generating Graph File") ranklist.generate_graph() print_ok("Graph File Generated") # Run plotSpot to get selected graphs cprint("Running PlotSpot Algorithm") plots = plotSpot() print_ok("PlotSpot Complete") # Save selected plots in pdf cprint("Saving Plots") total_plots = get_total_plots(scatter_plots, ccdf_plots, histograms, time_plots, band_plots) print "\t-> Total Plots Generated = ", cprint(total_plots, OKBLUE) print "\t-> Total Plots Chosen = ", cprint(len(plots), OKBLUE) print "\t-> Compression = ", cprint("{0:.2f} %".format((1 - float(len(plots)) / total_plots) * 100), OKBLUE)