def plot_histograms(args):
    """Draw one grouped histogram with a series per input file.

    Every entry of ``args.infiles`` is parsed by ``_format_data``, which
    yields a label and a sequence of rows; only the first element of each
    row is plotted.  Axis limits are applied through ``set_plot_limits``.

    If ``args.savefig`` is not None the figure is written to that path with
    the active ``args.x_limits`` / ``args.y_limits`` inserted before the
    extension, and the figure is then cleared.  Otherwise the plot is shown.

    Raises:
        ValueError: if ``args.savefig`` contains no ``.`` to split an
            extension from.
    """
    data, labels = [], []
    for _file in args.infiles:
        label, content = _format_data(_file, args)
        labels.append(label)
        # Indexing (instead of unpacking ``zip(*content)``) keeps an input
        # with no rows from raising; it simply contributes an empty series.
        data.append([row[0] for row in content])

    plt.hist(data, args.histogram_bins, histtype='bar', stacked=False,
             label=labels)
    plt.legend(loc=args.legend_loc)
    set_plot_limits(plt, args)

    if args.savefig is None:
        plt.show()
        return

    modifiers = []
    if args.x_limits:
        modifiers.append('X=%d,%d' % tuple(args.x_limits))
    if args.y_limits:
        modifiers.append('Y=%d,%d' % tuple(args.y_limits))

    # Split on the LAST dot only, so paths such as "./out/run.v2.png" keep
    # everything before the extension instead of failing to unpack.
    name, ext = args.savefig.rsplit('.', 1)
    new_filename = '{old_name}-{modifiers}.{ext}'.format(
        old_name=name, modifiers='-'.join(modifiers), ext=ext)
    print('saving figure to -  %s' % (new_filename,))
    plt.savefig(new_filename, dpi=320)
    plt.clf()
def plot_histograms(args):
    """Draw one grouped histogram with a series per input file.

    Every entry of ``args.infiles`` is parsed by ``_format_data``, which
    yields a label and a sequence of rows; only the first element of each
    row is plotted.  Axis limits are applied through ``set_plot_limits``.

    If ``args.savefig`` is not None the figure is written to that path with
    the active ``args.x_limits`` / ``args.y_limits`` inserted before the
    extension, and the figure is then cleared.  Otherwise the plot is shown.

    Raises:
        ValueError: if ``args.savefig`` contains no ``.`` to split an
            extension from.
    """
    data, labels = [], []
    for _file in args.infiles:
        label, content = _format_data(_file, args)
        labels.append(label)
        # Indexing (instead of unpacking ``zip(*content)``) keeps an input
        # with no rows from raising; it simply contributes an empty series.
        data.append([row[0] for row in content])

    plt.hist(data, args.histogram_bins, histtype='bar', stacked=False,
             label=labels)
    plt.legend(loc=args.legend_loc)
    set_plot_limits(plt, args)

    if args.savefig is None:
        plt.show()
        return

    modifiers = []
    if args.x_limits:
        modifiers.append('X=%d,%d' % tuple(args.x_limits))
    if args.y_limits:
        modifiers.append('Y=%d,%d' % tuple(args.y_limits))

    # Split on the LAST dot only, so paths such as "./out/run.v2.png" keep
    # everything before the extension instead of failing to unpack.
    name, ext = args.savefig.rsplit('.', 1)
    new_filename = '{old_name}-{modifiers}.{ext}'.format(
        old_name=name, modifiers='-'.join(modifiers), ext=ext)
    print('saving figure to -  %s' % (new_filename,))
    plt.savefig(new_filename, dpi=320)
    plt.clf()
def _plot_hist2d(data, args):
    """Draw a log-normalised 2-D histogram of the two ``args.hist2d`` columns.

    Rows in which either column is null are discarded first.  If fewer than
    1000 rows remain the process exits with status 1 and a message on
    stderr.  The title shows the number of rows plotted.

    Args:
        data: pandas-style frame holding both columns.
        args: namespace providing ``hist2d`` (two column names) and
            ``histogram_bins``; it is also forwarded to ``set_plot_limits``.
    """
    x_col, y_col = args.hist2d[0], args.hist2d[1]

    # One combined mask built against the unfiltered frame.  Chaining two
    # boolean selections would apply a mask indexed like the original frame
    # to an already-filtered one.
    keep = data[x_col].notnull() & data[y_col].notnull()
    data = data[keep]

    if data.shape[0] < 1000:
        # A string argument makes sys.exit print it to stderr and still
        # terminate with exit status 1.
        sys.exit('hist2d: only %d rows have both %s and %s set; '
                 'need at least 1000' % (data.shape[0], x_col, y_col))

    plt.hist2d(data[x_col], data[y_col],
               bins=args.histogram_bins, norm=LogNorm())
    plt.colorbar()
    set_plot_limits(plt, args)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    # NOTE(review): limits are applied a second time after labelling; this
    # looks redundant with the call above -- confirm set_plot_limits is
    # idempotent before dropping one of them.
    set_plot_limits(plt, args)
    plt.title("N = {}".format(data.shape[0]))
def _plot_hist2d(data, args):
    """Draw a log-normalised 2-D histogram of the two ``args.hist2d`` columns.

    Rows in which either column is null are discarded first.  If fewer than
    1000 such rows remain the process exits with status 1 and a message on
    stderr.  Infinite values are then removed as well, the remaining values
    are cast to float and plotted, and the title shows how many rows were
    actually plotted.

    Args:
        data: pandas-style frame holding both columns.
        args: namespace providing ``hist2d`` (two column names) and
            ``histogram_bins``; it is also forwarded to ``set_plot_limits``.
    """
    x_col, y_col = args.hist2d[0], args.hist2d[1]

    # One combined mask built against the unfiltered frame.  Chaining two
    # boolean selections would apply a mask indexed like the original frame
    # to an already-filtered one.
    keep = data[x_col].notnull() & data[y_col].notnull()
    data = data[keep]

    if data.shape[0] < 1000:
        # A string argument makes sys.exit print it to stderr and still
        # terminate with exit status 1.
        sys.exit('hist2d: only %d rows have both %s and %s set; '
                 'need at least 1000' % (data.shape[0], x_col, y_col))

    # Turn +/-inf into NaN so that dropna removes those rows too.
    finite = data.replace([np.inf, -np.inf], np.nan).dropna(
        subset=[x_col, y_col])

    plt.hist2d(finite[x_col].astype(float), finite[y_col].astype(float),
               bins=args.histogram_bins, norm=LogNorm())
    plt.colorbar()
    set_plot_limits(plt, args)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    # NOTE(review): limits are applied a second time after labelling; this
    # looks redundant with the call above -- confirm set_plot_limits is
    # idempotent before dropping one of them.
    set_plot_limits(plt, args)
    # Report the rows that were drawn, i.e. after infinite values were
    # removed, rather than the count before that filtering.
    plt.title("N = {}".format(finite.shape[0]))
def plot_distribution(args):
    """Load a table from CSV or the database and plot one view of it.

    Data source:
        * ``args.csv`` set: the file is read with pandas, its column names
          are printed, and the optional ``args.filter_num_rtus`` /
          ``args.filter_controller`` equality filters are applied.
        * otherwise: SQL is run on ``args.db_connection``; the statement is
          read from the file ``args.query`` if given, else it is
          ``SELECT * FROM args.table``.

    The loaded frame is stored on ``args.data``.  Exactly one plot is then
    chosen, in this order: ``args.scatter``, ``args.histogram``,
    ``args.hist2d``, ``args.scatter3d``.  ``args.labels`` optionally names a
    column used to split scatter / histogram data into coloured groups;
    histogram groups smaller than ``args.miscellaneous_cutoff`` times the
    row count are merged into one "miscellaneous" series.

    The figure is saved to ``args.savefig`` (and cleared) when that is not
    None, otherwise it is shown.
    """
    if args.csv is not None:
        data = pd.read_csv(args.csv)
        print(' '.join(list(data.columns.values)))
        if args.filter_num_rtus:
            print('before filtering size = %s' % (data.shape[0],))
            data = data[data['num_rtus'] == args.filter_num_rtus]
            print('after filtering size = %s' % (data.shape[0],))
        if args.filter_controller:
            print('before filtering size = %s' % (data.shape[0],))
            data = data[data['controller_id'] == args.filter_controller]
            print('after filtering size = %s' % (data.shape[0],))
        if 'controller_id' in data:
            print('total controller_ids included = %s'
                  % (len(set(data['controller_id'])),))
        if 'num_rtus' in data:
            distinct_rtus = set(data['num_rtus'])
            print('distinct num_rtus = %s %s'
                  % (len(distinct_rtus), distinct_rtus))
    else:
        cursor = args.db_connection.cursor()
        # The result of this catalogue query is not read here.
        cursor.execute("select relname from pg_class where relkind='r' and relname !~ '^(pg_|sql_)';")  # noqa
        if args.query:
            with open(args.query, 'r') as infile:
                sql = infile.read()
        else:
            sql = """ SELECT {select} FROM {table}; """.format(
                select='*', table=args.table)
        print(sql)
        cursor.execute(sql)
        colnames = [desc[0] for desc in cursor.description]
        data = pd.DataFrame(cursor.fetchall(), columns=colnames)

    # Set args.data, so we can pass only args to functions
    args.data = data
    data_size = data.shape[0]

    if args.scatter is not None:
        x_col, y_col = args.scatter[0], args.scatter[1]
        if args.labels:
            subset = data[[x_col, y_col, args.labels]]
            # zip stops at the shorter input, so labels beyond the number
            # of named colours are not drawn.
            for label, color in zip(set(data[args.labels]),
                                    matplotlib.colors.cnames.keys()):
                # Boolean indexing (rather than a string-built query) also
                # matches non-string labels and labels containing quotes.
                group = subset[subset[args.labels] == label]
                plt.scatter(group[x_col], group[y_col], c=color, label=label)
        else:
            # No per-label colour exists on this path; use the default.
            plt.scatter(data[x_col], data[y_col])
        plt.xlabel(x_col)
        plt.ylabel(y_col)
    elif args.histogram is not None:
        if args.labels:
            subset = data[[args.histogram, args.labels]]
            data_to_plot, colors_to_use, labels_to_show = [], [], []
            # Series of every label too small to get its own bar group.
            misc_parts = []
            cutoff = args.miscellaneous_cutoff * data_size
            for label, color in zip(set(data[args.labels]),
                                    matplotlib.colors.cnames.keys()):
                values = subset.loc[subset[args.labels] == label,
                                    args.histogram]
                if values.shape[0] < cutoff:
                    misc_parts.append(values)
                    continue
                labels_to_show.append('{label} ({count})'.format(
                    label=label, count=values.shape[0]))
                data_to_plot.append(values)
                colors_to_use.append(color)
            if misc_parts:
                # Concatenate once, producing a 1-D series like the other
                # entries of data_to_plot.
                misc_values = pd.concat(misc_parts)
                labels_to_show.append('miscellaneous ({count})'.format(
                    count=misc_values.shape[0]))
                data_to_plot.append(misc_values)
                colors_to_use.append('cyan')
            plt.hist(data_to_plot, args.histogram_bins, histtype='bar',
                     color=colors_to_use, label=labels_to_show)
        else:
            # Drop rows whose value is missing or infinite before binning.
            finite = data.replace([np.inf, -np.inf], np.nan).dropna(
                subset=[args.histogram])
            plt.hist(finite[args.histogram].astype(float),
                     bins=args.histogram_bins, label=args.histogram)
            # NOTE(review): log scale is applied to the unlabelled
            # histogram only -- confirm this matches the intended layout.
            plt.yscale('log')
        plt.xlabel(args.histogram)
        if args.scale_down:
            plt.ylim(ymax=int(data_size * args.miscellaneous_cutoff))
    elif args.hist2d is not None:
        _plot_hist2d(data, args)
    elif args.scatter3d is not None:
        plot_scatter3d(data, args)

    plt.legend()
    if not args.scatter3d and not args.histogram:
        set_plot_limits(plt, args)

    if args.savefig is not None:
        plt.savefig(args.savefig, dpi=320)
        plt.clf()
    else:
        plt.show()
def plot_estimator(args):
    """Plot the inlier / outlier samples stored in an estimates JSON file.

    ``args.estimates`` is an open file that is parsed as JSON and must hold
    ``'inliers'`` and ``'outliers'`` entries.  The dimensionality is the
    length of the first formatted inlier minus one (the trailing element of
    each row is treated as its score below).

    * 1 dimension: a histogram of ``args.restrict_to`` if set, otherwise a
      two-series inlier/outlier histogram, optionally overlaid with the
      scores scaled by bin width and sample count (``args.plot_scores``).
    * 2 dimensions: a log-normalised 2-D histogram of either the
      ``args.restrict_to`` rows or inliers followed by outliers.

    When ``args.savefig`` is not None the figure is written with a suffix
    describing the active options and then cleared; ``SAVEFIG_INFER_VALUE``
    derives the path from the estimates file name.  Otherwise the plot is
    shown.

    Raises:
        ValueError: if the target file name contains no ``.``.
    """
    estimates = json.load(args.estimates)
    dimensions = len(_format_datum(estimates['inliers'][0], args.columns)) - 1
    # y limits only filter the data when there is a second plotted axis.
    y_limits = args.y_limits if dimensions == 2 else None
    inliers = _extract_data(estimates, 'inliers', args.columns,
                            args.x_limits, y_limits)
    outliers = _extract_data(estimates, 'outliers', args.columns,
                             args.x_limits, y_limits)
    print('plotting %d dimensional plot' % dimensions)

    if dimensions == 1:
        if args.restrict_to:
            data = _extract_data(estimates, args.restrict_to, args.columns,
                                 args.x_limits, None)
            # Indexing instead of unpacking zip(*data) tolerates no rows.
            plt.hist([row[0] for row in data], args.histogram_bins,
                     label=args.restrict_to)
        else:
            inliers = list(inliers)
            outliers = list(outliers)
            # An empty inlier or outlier set must not abort the plot.
            inlier_x = [row[0] for row in inliers]
            outlier_x = [row[0] for row in outliers]
            _, bins, _ = plt.hist([inlier_x, outlier_x],
                                  args.histogram_bins,
                                  histtype='bar',
                                  stacked=args.stacked_hist,
                                  label=['inliers', 'outliers'],
                                  color=['blue', 'red'])
            bin_width = bins[1] - bins[0]
            print('plotted a curve with bin width = %s' % (bin_width,))
            ordered = sorted(itertools.chain(inliers, outliers))
            if args.plot_scores and ordered:
                # Plot the estimate
                combined_X, scores = zip(*ordered)
                scaling_factor = bin_width * (len(estimates['inliers']) +
                                              len(estimates['outliers']))
                # Flip the curve when the first score is not positive so it
                # is drawn on the same side as the histogram bars.
                sign = 1. if scores[0] > 0 else -1
                scaling_factor *= sign
                scaled_scores = [scaling_factor * y for y in scores]
                plt.plot(combined_X, scaled_scores, color='magenta',
                         label='est distribution', lw=1.1)
    elif dimensions == 2:
        if args.restrict_to:
            data = _extract_data(estimates, args.restrict_to, args.columns,
                                 args.x_limits, args.y_limits)
        else:
            data = itertools.chain(inliers, outliers)
        X, Y, _ = zip(*data)
        plt.hist2d(X, Y, bins=args.histogram_bins, norm=LogNorm())
        plt.colorbar()

    plt.legend(loc=args.legend_loc)
    set_plot_limits(plt, args)

    if args.savefig is None:
        plt.show()
        return

    if args.savefig == SAVEFIG_INFER_VALUE:
        name = os.path.basename(args.estimates.name).rsplit('.')[0]
        filename = 'target/plots/{}.png'.format(name)
    else:
        filename = args.savefig

    modifiers = []
    if args.restrict_to:
        modifiers.append(args.restrict_to)
    if args.estimates:
        modifiers.append('estimates')
    if args.x_limits:
        modifiers.append('X=%d,%d' % tuple(args.x_limits))
    if args.y_limits:
        modifiers.append('Y=%d,%d' % tuple(args.y_limits))

    # Split on the LAST dot only, so a path with several dots keeps
    # everything before the extension instead of failing to unpack.
    name, ext = filename.rsplit('.', 1)
    new_filename = '{old_name}-{modifiers}.{ext}'.format(
        old_name=name, modifiers='-'.join(modifiers), ext=ext)
    print('saving figure to -  %s' % (new_filename,))
    plt.savefig(new_filename, dpi=320)
    plt.clf()
def plot_distribution(args):
    """Load a table from CSV or the database and plot one view of it.

    Data source:
        * ``args.csv`` set: the file is read with pandas, its column names
          are printed, and the optional ``args.filter_num_rtus`` /
          ``args.filter_controller`` equality filters are applied.
        * otherwise: SQL is run on ``args.db_connection``; the statement is
          read from the file ``args.query`` if given, else it is
          ``SELECT * FROM args.table``.

    The loaded frame is stored on ``args.data``.  Exactly one plot is then
    chosen, in this order: ``args.scatter``, ``args.histogram``,
    ``args.hist2d``, ``args.scatter3d``.  ``args.labels`` optionally names a
    column used to split scatter / histogram data into coloured groups;
    histogram groups smaller than ``args.miscellaneous_cutoff`` times the
    row count are merged into one "miscellaneous" series.

    The figure is saved to ``args.savefig`` (and cleared) when that is not
    None, otherwise it is shown.
    """
    if args.csv is not None:
        data = pd.read_csv(args.csv)
        print(' '.join(list(data.columns.values)))
        if args.filter_num_rtus:
            print('before filtering size = %s' % (data.shape[0],))
            data = data[data['num_rtus'] == args.filter_num_rtus]
            print('after filtering size = %s' % (data.shape[0],))
        if args.filter_controller:
            print('before filtering size = %s' % (data.shape[0],))
            data = data[data['controller_id'] == args.filter_controller]
            print('after filtering size = %s' % (data.shape[0],))
        if 'controller_id' in data:
            print('total controller_ids included = %s'
                  % (len(set(data['controller_id'])),))
        if 'num_rtus' in data:
            distinct_rtus = set(data['num_rtus'])
            print('distinct num_rtus = %s %s'
                  % (len(distinct_rtus), distinct_rtus))
    else:
        cursor = args.db_connection.cursor()
        # The result of this catalogue query is not read here.
        cursor.execute("select relname from pg_class where relkind='r' and relname !~ '^(pg_|sql_)';")  # noqa
        if args.query:
            with open(args.query, 'r') as infile:
                sql = infile.read()
        else:
            sql = """ SELECT {select} FROM {table}; """.format(
                select='*', table=args.table)
        print(sql)
        cursor.execute(sql)
        colnames = [desc[0] for desc in cursor.description]
        data = pd.DataFrame(cursor.fetchall(), columns=colnames)

    # Set args.data, so we can pass only args to functions
    args.data = data
    data_size = data.shape[0]

    if args.scatter is not None:
        x_col, y_col = args.scatter[0], args.scatter[1]
        if args.labels:
            subset = data[[x_col, y_col, args.labels]]
            # zip stops at the shorter input, so labels beyond the number
            # of named colours are not drawn.
            for label, color in zip(set(data[args.labels]),
                                    matplotlib.colors.cnames.keys()):
                # Boolean indexing (rather than a string-built query) also
                # matches non-string labels and labels containing quotes.
                group = subset[subset[args.labels] == label]
                plt.scatter(group[x_col], group[y_col], c=color, label=label)
        else:
            # No per-label colour exists on this path; use the default.
            plt.scatter(data[x_col], data[y_col])
        plt.xlabel(x_col)
        plt.ylabel(y_col)
    elif args.histogram is not None:
        if args.labels:
            subset = data[[args.histogram, args.labels]]
            data_to_plot, colors_to_use, labels_to_show = [], [], []
            # Series of every label too small to get its own bar group.
            misc_parts = []
            cutoff = args.miscellaneous_cutoff * data_size
            for label, color in zip(set(data[args.labels]),
                                    matplotlib.colors.cnames.keys()):
                values = subset.loc[subset[args.labels] == label,
                                    args.histogram]
                if values.shape[0] < cutoff:
                    misc_parts.append(values)
                    continue
                labels_to_show.append('{label} ({count})'.format(
                    label=label, count=values.shape[0]))
                data_to_plot.append(values)
                colors_to_use.append(color)
            if misc_parts:
                # Concatenate once, producing a 1-D series like the other
                # entries of data_to_plot.
                misc_values = pd.concat(misc_parts)
                labels_to_show.append('miscellaneous ({count})'.format(
                    count=misc_values.shape[0]))
                data_to_plot.append(misc_values)
                colors_to_use.append('cyan')
            plt.hist(data_to_plot, args.histogram_bins, histtype='bar',
                     color=colors_to_use, label=labels_to_show)
        else:
            # Drop rows whose value is missing or infinite before binning.
            finite = data.replace([np.inf, -np.inf], np.nan).dropna(
                subset=[args.histogram])
            plt.hist(finite[args.histogram].astype(float),
                     bins=args.histogram_bins, label=args.histogram)
            # NOTE(review): log scale is applied to the unlabelled
            # histogram only -- confirm this matches the intended layout.
            plt.yscale('log')
        plt.xlabel(args.histogram)
        if args.scale_down:
            plt.ylim(ymax=int(data_size * args.miscellaneous_cutoff))
    elif args.hist2d is not None:
        _plot_hist2d(data, args)
    elif args.scatter3d is not None:
        plot_scatter3d(data, args)

    plt.legend()
    if not args.scatter3d and not args.histogram:
        set_plot_limits(plt, args)

    if args.savefig is not None:
        plt.savefig(args.savefig, dpi=320)
        plt.clf()
    else:
        plt.show()
def plot_estimator(args):
    """Plot the inlier / outlier samples stored in an estimates JSON file.

    ``args.estimates`` is an open file that is parsed as JSON and must hold
    ``'inliers'`` and ``'outliers'`` entries.  The dimensionality is the
    length of the first formatted inlier minus one (the trailing element of
    each row is treated as its score below).

    * 1 dimension: a histogram of ``args.restrict_to`` if set, otherwise a
      two-series inlier/outlier histogram, optionally overlaid with the
      scores scaled by bin width and sample count (``args.plot_scores``).
    * 2 dimensions: a log-normalised 2-D histogram of either the
      ``args.restrict_to`` rows or inliers followed by outliers.

    When ``args.savefig`` is not None the figure is written with a suffix
    describing the active options and then cleared; ``SAVEFIG_INFER_VALUE``
    derives the path from the estimates file name.  Otherwise the plot is
    shown.

    Raises:
        ValueError: if the target file name contains no ``.``.
    """
    estimates = json.load(args.estimates)
    dimensions = len(_format_datum(estimates['inliers'][0], args.columns)) - 1
    # y limits only filter the data when there is a second plotted axis.
    y_limits = args.y_limits if dimensions == 2 else None
    inliers = _extract_data(estimates, 'inliers', args.columns,
                            args.x_limits, y_limits)
    outliers = _extract_data(estimates, 'outliers', args.columns,
                             args.x_limits, y_limits)
    print('plotting %d dimensional plot' % dimensions)

    if dimensions == 1:
        if args.restrict_to:
            data = _extract_data(estimates, args.restrict_to, args.columns,
                                 args.x_limits, None)
            # Indexing instead of unpacking zip(*data) tolerates no rows.
            plt.hist([row[0] for row in data], args.histogram_bins,
                     label=args.restrict_to)
        else:
            inliers = list(inliers)
            outliers = list(outliers)
            # An empty inlier or outlier set must not abort the plot.
            inlier_x = [row[0] for row in inliers]
            outlier_x = [row[0] for row in outliers]
            _, bins, _ = plt.hist([inlier_x, outlier_x],
                                  args.histogram_bins,
                                  histtype='bar',
                                  stacked=args.stacked_hist,
                                  label=['inliers', 'outliers'],
                                  color=['blue', 'red'])
            bin_width = bins[1] - bins[0]
            print('plotted a curve with bin width = %s' % (bin_width,))
            ordered = sorted(itertools.chain(inliers, outliers))
            if args.plot_scores and ordered:
                # Plot the estimate
                combined_X, scores = zip(*ordered)
                scaling_factor = bin_width * (len(estimates['inliers']) +
                                              len(estimates['outliers']))
                # Flip the curve when the first score is not positive so it
                # is drawn on the same side as the histogram bars.
                sign = 1. if scores[0] > 0 else -1
                scaling_factor *= sign
                scaled_scores = [scaling_factor * y for y in scores]
                plt.plot(combined_X, scaled_scores, color='magenta',
                         label='est distribution', lw=1.1)
    elif dimensions == 2:
        if args.restrict_to:
            data = _extract_data(estimates, args.restrict_to, args.columns,
                                 args.x_limits, args.y_limits)
        else:
            data = itertools.chain(inliers, outliers)
        X, Y, _ = zip(*data)
        plt.hist2d(X, Y, bins=args.histogram_bins, norm=LogNorm())
        plt.colorbar()

    plt.legend(loc=args.legend_loc)
    set_plot_limits(plt, args)

    if args.savefig is None:
        plt.show()
        return

    if args.savefig == SAVEFIG_INFER_VALUE:
        name = os.path.basename(args.estimates.name).rsplit('.')[0]
        filename = 'target/plots/{}.png'.format(name)
    else:
        filename = args.savefig

    modifiers = []
    if args.restrict_to:
        modifiers.append(args.restrict_to)
    if args.estimates:
        modifiers.append('estimates')
    if args.x_limits:
        modifiers.append('X=%d,%d' % tuple(args.x_limits))
    if args.y_limits:
        modifiers.append('Y=%d,%d' % tuple(args.y_limits))

    # Split on the LAST dot only, so a path with several dots keeps
    # everything before the extension instead of failing to unpack.
    name, ext = filename.rsplit('.', 1)
    new_filename = '{old_name}-{modifiers}.{ext}'.format(
        old_name=name, modifiers='-'.join(modifiers), ext=ext)
    print('saving figure to -  %s' % (new_filename,))
    plt.savefig(new_filename, dpi=320)
    plt.clf()
def plot_score_contours(args):
    """Draw contours of a scored grid, optionally with mixture components.

    When ``args.centers``, ``args.covariances`` and ``args.weights`` are all
    given, the weights are read as JSON and rescaled so the largest equals
    0.4; one ellipse per component is drawn with that value as its alpha,
    and the centres are scattered on top.

    The grid from ``load_json_dump(args.scored_grid)`` is clamped with
    ``args.score_cap`` / ``args.score_lower_limit`` when those are truthy
    and reshaped to a square.  ``args.plot`` then selects what is contoured:
    ``'components'`` (weighted sum of the component densities),
    ``'density'`` (the grid scores), ``'difference'`` (scores minus
    components) or ``'noop'`` (nothing).

    If both ``args.csv`` and ``args.hist2d`` are set, a 2-D histogram of the
    CSV is added.  The figure is saved when ``args.savefig`` is truthy
    (``SAVEFIG_INFER_VALUE`` derives the path from ``args.scored_grid``),
    otherwise it is shown.

    Raises:
        ValueError: if ``args.plot`` needs the mixture components but the
            centres / covariances / weights were not all supplied.
    """
    weights, centers, sigmas = [], [], []
    if args.centers and args.covariances and args.weights:
        raw_weights = json.load(args.weights)
        # Force float division: under Python 2, integer weights would
        # floor to zero and make the rescaling below divide by zero.
        total = float(sum(raw_weights))
        weights = [w / total for w in raw_weights]
        # Rescale so the heaviest component gets alpha 0.4.
        heaviest = max(weights)
        weights = [0.4 * w / heaviest for w in weights]
        centers = load_cluster_parameters(args.centers)
        sigmas = load_cluster_parameters(args.covariances)

        fig = plt.figure(0)
        ax = fig.add_subplot(111)
        for i, center in enumerate(centers):
            w, h, angle = get_ellipse_from_covariance(sigmas[i])
            e = patches.Ellipse(center, w, h, angle=angle)
            e.set_alpha(weights[i])
            ax.add_artist(e)
            print('%s %s %s %s' % (i, weights[i], center, sigmas[i]))
        set_ax_limits(ax, args)
        x, y = zip(*centers)
        plt.scatter(x, y, s=weights)

    X, Y, Z = load_json_dump(args.scored_grid)
    if args.score_cap:
        Z = [min(z, args.score_cap) for z in Z]
    if args.score_lower_limit:
        Z = [max(z, args.score_lower_limit) for z in Z]

    # The flat grid is reshaped to size x size, i.e. it is treated as square.
    size = int(math.sqrt(len(Z)))
    X = np.reshape(X, (size, size))
    Y = np.reshape(Y, (size, size))
    Z = np.reshape(Z, (size, size))

    def format_args(i):
        # Keyword arguments describing component i for bivariate_normal.
        return {
            'mux': centers[i][0],
            'muy': centers[i][1],
            'sigmax': math.sqrt(sigmas[i][0][0]),
            'sigmay': math.sqrt(sigmas[i][1][1]),
            'sigmaxy': sigmas[i][0][1],
        }

    # NOTE(review): mlab.bivariate_normal appears to be absent from recent
    # Matplotlib releases -- confirm against the pinned version.
    Zgaussians = None
    if len(weights):
        Zgaussians = weights[0] * mlab.bivariate_normal(X, Y,
                                                        **format_args(0))
        for i in range(1, len(centers)):
            Zgaussians += weights[i] * mlab.bivariate_normal(
                X, Y, **format_args(i))

    if args.plot in ('components', 'difference') and Zgaussians is None:
        # Fail with an explanation instead of an undefined-name error.
        raise ValueError(
            "plot=%r requires centers, covariances and weights"
            % (args.plot,))

    # 'linewidth' and 'inline' are not contour() keywords (the label option
    # belongs to clabel below), so they are not passed here.
    CS = None
    if args.plot == 'components':
        CS = plt.contour(X, Y, Zgaussians)
    elif args.plot == 'density':
        CS = plt.contour(X, Y, Z)
    elif args.plot == 'difference':
        CS = plt.contour(X, Y, Z - Zgaussians)
    if CS is not None:
        plt.clabel(CS, inline=1)

    set_plot_limits(plt, args)

    if args.csv and args.hist2d:
        args.data = pd.read_csv(args.csv)
        _plot_hist2d(args.data, args)

    if args.savefig:
        if args.savefig == SAVEFIG_INFER_VALUE:
            name = os.path.basename(args.scored_grid).rsplit('.')[0]
            filename = 'target/plots/{}.png'.format(name)
        else:
            filename = args.savefig
        print('saving figure to -  %s' % (filename,))
        plt.savefig(filename, dpi=320)
    else:
        plt.show()
def plot_score_contours(args):
    """Draw contours of a scored grid, optionally with mixture components.

    When ``args.centers``, ``args.covariances`` and ``args.weights`` are all
    given, the weights are read as JSON and rescaled so the largest equals
    0.4; one ellipse per component is drawn with that value as its alpha,
    and the centres are scattered on top.

    The grid from ``load_json_dump(args.scored_grid)`` is clamped with
    ``args.score_cap`` / ``args.score_lower_limit`` when those are truthy
    and reshaped to a square.  ``args.plot`` then selects what is contoured:
    ``'components'`` (weighted sum of the component densities),
    ``'density'`` (the grid scores), ``'difference'`` (scores minus
    components) or ``'noop'`` (nothing).

    If both ``args.csv`` and ``args.hist2d`` are set, a 2-D histogram of the
    CSV is added.  The figure is saved when ``args.savefig`` is truthy
    (``SAVEFIG_INFER_VALUE`` derives the path from ``args.scored_grid``),
    otherwise it is shown.

    Raises:
        ValueError: if ``args.plot`` needs the mixture components but the
            centres / covariances / weights were not all supplied.
    """
    weights, centers, sigmas = [], [], []
    if args.centers and args.covariances and args.weights:
        raw_weights = json.load(args.weights)
        # Force float division: under Python 2, integer weights would
        # floor to zero and make the rescaling below divide by zero.
        total = float(sum(raw_weights))
        weights = [w / total for w in raw_weights]
        # Rescale so the heaviest component gets alpha 0.4.
        heaviest = max(weights)
        weights = [0.4 * w / heaviest for w in weights]
        centers = load_cluster_parameters(args.centers)
        sigmas = load_cluster_parameters(args.covariances)

        fig = plt.figure(0)
        ax = fig.add_subplot(111)
        for i, center in enumerate(centers):
            w, h, angle = get_ellipse_from_covariance(sigmas[i])
            e = patches.Ellipse(center, w, h, angle=angle)
            e.set_alpha(weights[i])
            ax.add_artist(e)
            print('%s %s %s %s' % (i, weights[i], center, sigmas[i]))
        set_ax_limits(ax, args)
        x, y = zip(*centers)
        plt.scatter(x, y, s=weights)

    X, Y, Z = load_json_dump(args.scored_grid)
    if args.score_cap:
        Z = [min(z, args.score_cap) for z in Z]
    if args.score_lower_limit:
        Z = [max(z, args.score_lower_limit) for z in Z]

    # The flat grid is reshaped to size x size, i.e. it is treated as square.
    size = int(math.sqrt(len(Z)))
    X = np.reshape(X, (size, size))
    Y = np.reshape(Y, (size, size))
    Z = np.reshape(Z, (size, size))

    def format_args(i):
        # Keyword arguments describing component i for bivariate_normal.
        return {
            'mux': centers[i][0],
            'muy': centers[i][1],
            'sigmax': math.sqrt(sigmas[i][0][0]),
            'sigmay': math.sqrt(sigmas[i][1][1]),
            'sigmaxy': sigmas[i][0][1],
        }

    # NOTE(review): mlab.bivariate_normal appears to be absent from recent
    # Matplotlib releases -- confirm against the pinned version.
    Zgaussians = None
    if len(weights):
        Zgaussians = weights[0] * mlab.bivariate_normal(X, Y,
                                                        **format_args(0))
        for i in range(1, len(centers)):
            Zgaussians += weights[i] * mlab.bivariate_normal(
                X, Y, **format_args(i))

    if args.plot in ('components', 'difference') and Zgaussians is None:
        # Fail with an explanation instead of an undefined-name error.
        raise ValueError(
            "plot=%r requires centers, covariances and weights"
            % (args.plot,))

    # 'linewidth' and 'inline' are not contour() keywords (the label option
    # belongs to clabel below), so they are not passed here.
    CS = None
    if args.plot == 'components':
        CS = plt.contour(X, Y, Zgaussians)
    elif args.plot == 'density':
        CS = plt.contour(X, Y, Z)
    elif args.plot == 'difference':
        CS = plt.contour(X, Y, Z - Zgaussians)
    if CS is not None:
        plt.clabel(CS, inline=1)

    set_plot_limits(plt, args)

    if args.csv and args.hist2d:
        args.data = pd.read_csv(args.csv)
        _plot_hist2d(args.data, args)

    if args.savefig:
        if args.savefig == SAVEFIG_INFER_VALUE:
            name = os.path.basename(args.scored_grid).rsplit('.')[0]
            filename = 'target/plots/{}.png'.format(name)
        else:
            filename = args.savefig
        print('saving figure to -  %s' % (filename,))
        plt.savefig(filename, dpi=320)
    else:
        plt.show()
def plot_distribution(args):
    """Load a table from the database or a CSV file and plot one view of it.

    Data source:
        * ``args.csv`` is None: the user tables known to ``pg_class`` are
          listed and printed, then ``SELECT * FROM args.table`` is run on
          ``args.db_connection`` and its rows become the frame.
        * otherwise: the CSV file is read with pandas.

    The loaded frame is stored on ``args.data``.  One plot is then chosen,
    in this order: ``args.scatter``, ``args.histogram``, ``args.hist2d``.
    ``args.labels`` optionally names a column used to split scatter /
    histogram data into coloured groups; histogram groups smaller than
    ``args.miscellaneous_cutoff`` times the row count are merged into one
    "miscellaneous" series.

    The figure is saved to ``args.savefig`` (and cleared) when that is not
    None, otherwise it is shown.
    """
    if args.csv is None:
        cursor = args.db_connection.cursor()
        cursor.execute("select relname from pg_class where relkind='r' and relname !~ '^(pg_|sql_)';")  # noqa
        print(cursor.fetchall())
        sql = """ SELECT {select} FROM {table}; """.format(
            select='*', table=args.table)
        print(sql)
        # Run the SELECT itself.  Without this, the column names and rows
        # below would come from the already-consumed catalogue query above.
        cursor.execute(sql)
        colnames = [desc[0] for desc in cursor.description]
        data = pd.DataFrame(cursor.fetchall(), columns=colnames)
    else:
        data = pd.read_csv(args.csv)

    # Set args.data, so we can pass only args to functions
    args.data = data
    data_size = data.shape[0]

    if args.scatter is not None:
        x_col, y_col = args.scatter[0], args.scatter[1]
        if args.labels:
            subset = data[[x_col, y_col, args.labels]]
            # zip stops at the shorter input, so labels beyond the number
            # of named colours are not drawn.
            for label, color in zip(set(data[args.labels]),
                                    matplotlib.colors.cnames.keys()):
                # Boolean indexing (rather than a string-built query) also
                # matches non-string labels and labels containing quotes.
                group = subset[subset[args.labels] == label]
                plt.scatter(group[x_col], group[y_col], c=color, label=label)
        else:
            # No per-label colour exists on this path; use the default.
            plt.scatter(data[x_col], data[y_col])
        plt.xlabel(x_col)
        plt.ylabel(y_col)
    elif args.histogram is not None:
        if args.labels:
            subset = data[[args.histogram, args.labels]]
            data_to_plot, colors_to_use, labels_to_show = [], [], []
            # Series of every label too small to get its own bar group.
            misc_parts = []
            cutoff = args.miscellaneous_cutoff * data_size
            for label, color in zip(set(data[args.labels]),
                                    matplotlib.colors.cnames.keys()):
                values = subset.loc[subset[args.labels] == label,
                                    args.histogram]
                if values.shape[0] < cutoff:
                    misc_parts.append(values)
                    continue
                labels_to_show.append('{label} ({count})'.format(
                    label=label, count=values.shape[0]))
                data_to_plot.append(values)
                colors_to_use.append(color)
            if misc_parts:
                # Concatenate once, producing a 1-D series like the other
                # entries of data_to_plot.
                misc_values = pd.concat(misc_parts)
                labels_to_show.append('miscellaneous ({count})'.format(
                    count=misc_values.shape[0]))
                data_to_plot.append(misc_values)
                colors_to_use.append('cyan')
            plt.hist(data_to_plot, args.histogram_bins, histtype='bar',
                     color=colors_to_use, label=labels_to_show)
        else:
            plt.hist(data[args.histogram], args.histogram_bins,
                     label=args.histogram)
        plt.xlabel(args.histogram)
        if args.scale_down:
            plt.ylim(ymax=int(data_size * args.miscellaneous_cutoff))
    elif args.hist2d is not None:
        _plot_hist2d(data, args)

    plt.legend()
    set_plot_limits(plt, args)

    if args.savefig is not None:
        plt.savefig(args.savefig, dpi=320)
        plt.clf()
    else:
        plt.show()