    bureau2_path = args[1]
    print "make_crowd_plots: illustrating behaviour captured in bureau files: "
    print "make_crowd_plots: ", bureau1_path
    print "make_crowd_plots: ", bureau2_path
else:
    print make_crowd_plots.__doc__
    return

cornerplotter_path = cornerplotter_path + 'CornerPlotter.py'
output_directory = './'

# ------------------------------------------------------------------
# Read in bureau objects:

bureau1 = swap.read_pickle(bureau1_path, 'bureau')
bureau2 = swap.read_pickle(bureau2_path, 'bureau')
print "make_crowd_plots: stage 1, 2 agent numbers: ", len(bureau1.list()), len(bureau2.list())

# make lists by going through agents
N_early = 10

stage2_veteran_members = []
list1 = bureau1.list()
for ID in bureau2.list():
    if ID in list1:
        stage2_veteran_members.append(ID)
print "make_crowd_plots: ", len(stage2_veteran_members), " volunteers stayed on for Stage 2 from Stage 1"
# How will we decide if a sim has been seen?
try:
    use_marker_positions = tonights.parameters['use_marker_positions']
except:
    use_marker_positions = False
print "SWAP: should we use the marker positions on sims? ", use_marker_positions

# How will we make decisions based on probability?
thresholds = {}
thresholds['detection'] = tonights.parameters['detection_threshold']
thresholds['rejection'] = tonights.parameters['rejection_threshold']

# ------------------------------------------------------------------
# Read in, or create, a bureau of agents who will represent the
# volunteers:

bureau = swap.read_pickle(tonights.parameters['bureaufile'], 'bureau')

# ------------------------------------------------------------------
# Read in, or create, an object representing the candidate list:

sample = swap.read_pickle(tonights.parameters['samplefile'], 'collection')

# ------------------------------------------------------------------
# Open up database:

if practise:
    db = swap.read_pickle(tonights.parameters['dbfile'], 'database')
    if db is None:
        print "SWAP: making a new Toy database..."
from skimage import io
from subprocess import call
#from colors import blues_r
import numpy as np
from matplotlib import pyplot as plt

import swap

#cornerplotter_path = '/Users/cpd/SWAP/pappy/CornerPlotter.py'
#bureau_path = '/Users/cpd/SWAP/SpaceWarps/analysis/CFHTLS_bureau.pickle'
#output_directory = '/Users/cpd/SWAP/swap_analysis/'

cornerplotter_path = '/Users/mbaumer/SWAP/pappy/CornerPlotter.py'
bureau_path = '/Users/mbaumer/SpaceWarps/analysis/CFHTLS_bureau.pickle'
output_directory = '/Users/mbaumer/SWAP/swap_analysis'

bureau = swap.read_pickle(bureau_path, 'bureau')
len(bureau.list())

# make lists by going through agents
N_early = 10

skill = []
contribution = []
experience = []
education = []
early_contribution = []
early_skill = []

skill_all = []
contribution_all = []
experience_all = []
education_all = []
def make_roc_curves(args):
    """
    NAME
        make_roc_curves

    PURPOSE
        Given some collection pickles, this script produces the one roc
        plot that will be put someplace in the SW system paper.

    COMMENTS

    FLAGS
        -h              Print this message

    INPUTS
        collection pickles
        colors for the lines
        line styles
        labels

    OUTPUTS
        roc png plot

    EXAMPLE

    BUGS
        - Code is not tested yet...

    AUTHORS
        This file is part of the Space Warps project, and is distributed
        under the MIT license by the Space Warps Science Team.
        http://spacewarps.org/

    HISTORY
        2013-07-01 started Davis (KIPAC)
        2014-09-06 updated to only use collections.
    """

    # ------------------------------------------------------------------

    flags = {'output_directory': './',
             'collections': [],
             'labels': [],
             'line_styles': [],
             'colors': []}

    for arg in args:
        if arg in flags:
            flags[arg] = args[arg]
        else:
            print "make_roc_curves: unrecognized flag ", arg

    print flags

    # check that collections etc are equal length
    if len(flags['collections']) != len(flags['labels']):
        raise Exception('Collections and labels must be same length!')
    if len(flags['collections']) != len(flags['line_styles']):
        raise Exception('Collections and line_styles must be same length!')
    if len(flags['collections']) != len(flags['colors']):
        raise Exception('Collections and colors must be same length!')

    n_min = 0
    output_directory = flags['output_directory']
    collections = flags['collections']

    fig, ax = plt.subplots(figsize=(10, 8))

    for i, collection_path in enumerate(collections):

        # ------------------------------------------------------------------
        # Read in collection object:

        collection = swap.read_pickle(collection_path, 'collection')
        print "make_roc_curves: collection {0} subject numbers: {1}".format(
            i, len(collection.list()))

        # ------------------------------------------------------------------
        # set up data for roc plots

        y_true = np.array([])
        y_score = np.array([])
        for ID in collection.list():
            subject = collection.member[ID]
            if (subject.category == 'training'):
                n_assessment = len(subject.annotationhistory['ItWas'])
                if (n_assessment > n_min):
                    truth = {'LENS': 1, 'NOT': 0}[subject.truth]
                    y_true = np.append(y_true, truth)
                    y_score = np.append(y_score, subject.mean_probability)

        fpr, tpr, threshold = roc_curve(y_true, y_score)

        color = flags['colors'][i]
        label = flags['labels'][i]
        line_style = flags['line_styles'][i]
        ax.plot(fpr, tpr, color, label=label,
                linestyle=line_style, linewidth=3)

    ax.set_xlim(0, 0.1)
    ax.set_ylim(0.8, 1)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    plt.legend(loc='lower right')

    pngfile = output_directory + 'roc_curve.png'
    plt.savefig(pngfile)
    print "make_roc_curves: roc curve saved to " + pngfile

    # ------------------------------------------------------------------

    print "make_roc_curves: all done!"

    return
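# A minimal usage sketch (not part of the original script): make_roc_curves()
# expects a dict of flags, and it assumes numpy, matplotlib, swap, and
# sklearn.metrics.roc_curve are importable in the calling module. The pickle
# path below is a placeholder, not a file shipped with the project.
if __name__ == '__main__':
    make_roc_curves({'collections': ['CFHTLS_collection.pickle'],
                     'labels': ['CFHTLS Stage 1'],
                     'line_styles': ['-'],
                     'colors': ['b'],
                     'output_directory': './'})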
def make_lens_atlas(args):
    """
    NAME
        make_lens_atlas

    PURPOSE
        Given location of bureau and collection pickles as well as a list of
        subjects, this script produces a set of annotated images of lenses
        (heatmaps for lens locations, markers for where clicks were, etc).

    COMMENTS
        The script downloads each subject image, so whatever you choose as
        your output directory is also where the raw images are saved. This
        should be pretty customizable.

    FLAGS
        -h              Print this message

        --heatmap       Do heatmaps
        --contour       Do contours
        --field         Do full image
        --stamp         Do cutouts
        --alpha         Do alpha

        --points N      Take N agents and plot them. Any number < 0 = do all
        --skill         Weight agent markers by skill

    INPUTS
        collection      collection.pickle
        catalog         catalog.dat
                        Assumed format:
                        ID   kind   x   y   Prob   N0   Skill   Dist

                        Here:
                        ID    = Space Warps subject ID
                        kind  = Space Warps subject type (sim, dud, test)
                        x,y   = object (cluster) centroid, in pixels
                        P     = Space Warps subject probability
                        N0    = number of markers in the cluster
                        S     = total skill per cluster, summed over markers
                        D     = biggest distance within cluster

    OUTPUTS

    EXAMPLE

    BUGS
        TODO: incorporate some of these defaults into the flags dictionary

    AUTHORS
        This file is part of the Space Warps project, and is distributed
        under the GPL v2 by the Space Warps Science Team.
        http://spacewarps.org/

    HISTORY
        2013-07-16 started Davis (KIPAC)
    """

    # ------------------------------------------------------------------
    # Some defaults:

    flags = {'points': 30,
             'heatmap': False,
             'contour': False,
             'field': False,
             'stamp': False,
             'alpha': False,
             'skill': False,
             'output_directory': './',
             'output_format': 'png',
             'stamp_size': 50,
             'dist_max': 30,
             'stamp_min': 1,
             'smooth_click': 3,
             'figsize_stamp': 5,
             'figsize_field': 10,
             'image_y_size': 440,
             'diagnostics': False,
             }

    # ------------------------------------------------------------------
    # Read in options:

    # this has to be easier to do...
    for arg in args:
        if arg in flags:
            flags[arg] = args[arg]
        elif arg == 'collection':
            collection_path = args[arg]
        elif arg == 'catalog':
            catalog_path = args[arg]
        else:
            print "make_lens_atlas: unrecognized flag ", arg

    xbins = np.arange(flags['stamp_size'] * 2)
    ybins = np.arange(flags['stamp_size'] * 2)
    figsize_stamp = (flags['figsize_stamp'], flags['figsize_stamp'])
    figsize_field = (flags['figsize_field'], flags['figsize_field'])
    image_y_size = flags['image_y_size']

    print "make_lens_atlas: illustrating behaviour captured in collection, and lens files: "
    print "make_lens_atlas: ", collection_path
    print "make_lens_atlas: ", catalog_path

    # ------------------------------------------------------------------
    # Read in files:

    #bureau = swap.read_pickle(bureau_path, 'bureau')  # TODO: needed?
    collection = swap.read_pickle(collection_path, 'collection')
    catalog = csv2rec(catalog_path)

    #print "make_lens_atlas: bureau numbers ", len(bureau.list())
    print "make_lens_atlas: collection numbers ", len(collection.list())
    print "make_lens_atlas: catalog numbers ", len(catalog)

    # ------------------------------------------------------------------
    # Run through data:

    # ------------------------------------------------------------------
    # Stamps:
    if flags['stamp']:
        print "make_lens_atlas: running stamps"
        for lens_i in range(len(catalog)):
            ID = catalog[lens_i]['id']
            kind = catalog[lens_i]['kind']
            x = catalog[lens_i]['x']
            # flip y axis
            y = image_y_size - catalog[lens_i]['y']
            N0 = catalog[lens_i]['n0']

            if 'dist' in catalog.dtype.names:
                if catalog[lens_i]['dist'] == 0:
                    continue
            if ((x < 0)):
                # this is one of the 'non points'; skip
                continue
            if (N0 < flags['stamp_min']):
                # not enough points!
                continue

            subject = collection.member[ID]
            annotationhistory = subject.annotationhistory

            # ------------------------------------------------------------------
            # download png
            url = subject.location
            outname = flags['output_directory'] + '{0}_field.png'.format(ID)
            im = get_online_png(url, outname)

            min_x = np.int(np.max((x - flags['stamp_size'], 0)))
            max_x = np.int(np.min((x + flags['stamp_size'], im.shape[0])))
            min_y = np.int(np.max((y - flags['stamp_size'], 0)))
            max_y = np.int(np.min((y + flags['stamp_size'], im.shape[1])))

            min_member_x = np.int(np.max((x - flags['dist_max'], 0)))
            max_member_x = np.int(np.min((x + flags['dist_max'], im.shape[0])))
            min_member_y = np.int(np.max((y - flags['dist_max'], 0)))
            max_member_y = np.int(np.min((y + flags['dist_max'], im.shape[1])))

            if (min_x >= max_x) + (min_y >= max_y):
                print "make_lens_atlas: misshapen lens for ID ", ID
                continue

            # if it is a training image, claim the alpha parameter
            if im.shape[2] == 4:
                alpha = im[:, :, 3][min_y: max_y, min_x: max_x]
                im = im[:, :, :3][min_y: max_y, min_x: max_x]
            else:
                alpha = None
                im = im[min_y: max_y, min_x: max_x]

            fig = plt.figure(figsize=figsize_stamp)
            ax = fig.add_subplot(111)
            ax.imshow(im, origin=origin)
            ax.scatter(x - min_x, y - min_y,
                       marker='d', c=(0, 1.0, 0), s=100, alpha=0.75)

            if ((flags['contour']) + (flags['heatmap']) +
                    (flags['points'] != 0)):
                itwas = annotationhistory['ItWas']
                x_all = annotationhistory['At_X']
                y_all = annotationhistory['At_Y']

                x_markers_all = np.array([xi for xj in x_all for xi in xj])
                y_markers_all = np.array([yi for yj in y_all for yi in yj])

                agents_numbers = np.arange(x_markers_all.size)

                if 'labels' in annotationhistory:
                    # find which label is closest to your folks
                    labels_all = annotationhistory['labels']
                    labels = np.array([xi for xj in labels_all for xi in xj])
                    cluster_labels = list(set(labels))
                    data = np.vstack((x_markers_all, y_markers_all)).T
                    cluster_centers = np.array([np.mean(data[labels == i], axis=0)
                                                for i in cluster_labels])
                    # find which label is closest to the (x,y)
                    label_center = cluster_labels[np.argmin(
                        np.sum(np.square(cluster_centers - np.vstack((x, y)).T),
                               axis=1))]
                    conds = (labels == label_center)
                else:
                    # now filter markers by those that are within
                    # dist_max of the center (since I don't record cluster
                    # members...)
                    conds = ((x_markers_all >= min_member_x) *
                             (x_markers_all <= max_member_x) *
                             (y_markers_all >= min_member_y) *
                             (y_markers_all <= max_member_y))

                agents = agents_numbers[conds]
                x_markers = x_markers_all[agents]
                y_markers = y_markers_all[agents]

                # filter markers
                n_catalog = len(agents)
                if (flags['points'] > 0) * \
                        (flags['points'] < n_catalog):
                    agents_points = np.random.choice(
                        agents, size=flags['points'], replace=False)
                else:
                    agents_points = agents
                x_markers_filtered = x_markers_all[agents_points]
                y_markers_filtered = y_markers_all[agents_points]

                if (flags['skill']) * (len(agents) > 0):
                    PL_all = annotationhistory['PL']
                    PD_all = annotationhistory['PD']

                    # filter out the empty clicks
                    PL_list = []
                    for i, xj in enumerate(x_all):
                        PL_list.append([PL_all[i]] * len(xj))
                    PL = np.array([PLi for PLj in PL_list for PLi in PLj])

                    # filter out the empty clicks
                    PD_list = []
                    for i, xj in enumerate(x_all):
                        PD_list.append([PD_all[i]] * len(xj))
                    PD = np.array([PDi for PDj in PD_list for PDi in PDj])

                    skill_all = swap.expectedInformationGain(0.5, PL, PD)
                    skill = skill_all[agents]

                    smax = 100
                    smin = 5
                    if np.max(skill) != np.min(skill):
                        sizes_all = (skill_all - np.min(skill)) * (smax - smin) / \
                            (np.max(skill) - np.min(skill))
                        sizes_filtered = sizes_all[agents_points]
                    else:
                        sizes_filtered = 50
                else:
                    skill = None
                    sizes_filtered = 50
                colors = (0, 1.0, 0)

                # ----------------------------------------------------------
                # heatmaps
                if (flags['heatmap']) * (len(agents) > 0):
                    fig_heatmap = plt.figure(figsize=figsize_stamp)
                    ax_heatmap = fig_heatmap.add_subplot(111)

                    # now do the lens locations
                    # don't need to filter the x's since that is filtered by
                    # xbins and ybins anyways
                    pdf2d(x_markers - min_x, y_markers - min_y,
                          xbins=xbins, ybins=ybins, weights=skill,
                          smooth=flags['smooth_click'], color=(0, 1.0, 0),
                          style='hist', axis=ax_heatmap)

                    if flags['alpha'] * (alpha != None):
                        contour_hist(alpha.T,
                                     extent=(xbins[0], xbins[-1],
                                             ybins[0], ybins[-1]),
                                     color='w', style='contour',
                                     axis=ax_heatmap)

                    ax_heatmap.tick_params(\
                        axis='both',        # changes apply to the x-axis
                        which='both',       # both major and minor ticks are affected
                        bottom='off',       # ticks along the bottom edge are off
                        top='off',          # ticks along the top edge are off
                        left='off',
                        right='off',
                        labelleft='off',
                        labelbottom='off')  # labels along the bottom edge are off

                    # CPD 04.08.14: Flip axis to old conventions
                    ax_heatmap.invert_yaxis()

                    try:
                        outfile = flags['output_directory'] + \
                            '{0}_cluster_{1}_heatmap.{2}'.format(
                                ID, lens_i, flags['output_format'])
                        # fig_heatmap.savefig(outfile)
                        #fig_heatmap.canvas.print_png(outfile)
                        fig_heatmap.savefig(outfile,
                                            bbox_inches='tight',
                                            pad_inches=0)
                    except:
                        print 'make_lens_catalog: heatmap problem with ', ID, lens_i
                        # import ipdb; ipdb.set_trace()

                # ---------------------------------------------------------
                # back to our other plots

                # contours
                if (flags['contour']) * (len(agents) > 0):
                    # now do the lens locations
                    # don't need to filter the x's since that is filtered by
                    # xbins and ybins anyways
                    pdf2d(x_markers - min_x, y_markers - min_y,
                          xbins=xbins, ybins=ybins, weights=skill,
                          smooth=flags['smooth_click'], color=(0, 1.0, 0),
                          style='contour', axis=ax)

                # plot points
                if (flags['points'] != 0) * (len(agents) > 0):
                    ax.scatter(x_markers_filtered - min_x,
                               y_markers_filtered - min_y,
                               c=colors, s=sizes_filtered, alpha=0.25)

                # plot alpha
                if flags['alpha'] * (alpha != None):
                    contour_hist(alpha.T,
                                 extent=(xbins[0], xbins[-1],
                                         ybins[0], ybins[-1]),
                                 color='w', style='contour', axis=ax)

            # ----------------------------------------------------------
            ax.tick_params(\
                axis='both',        # changes apply to the x-axis
                which='both',       # both major and minor ticks are affected
                bottom='off',       # ticks along the bottom edge are off
                top='off',          # ticks along the top edge are off
                left='off',
                right='off',
                labelleft='off',
                labelbottom='off')  # labels along the bottom edge are off

            ax.invert_yaxis()

            try:
                outfile = flags['output_directory'] + \
                    '{0}_cluster_{1}_contour.{2}'.format(
                        ID, lens_i, flags['output_format'])
                # fig.savefig(outfile)
                fig.savefig(outfile, bbox_inches='tight', pad_inches=0)
                # fig.canvas.print_png(outfile)
            except:
                print 'make_lens_catalog: contour problem with ', ID, lens_i
                # import ipdb; ipdb.set_trace()

            plt.close('all')

    # ------------------------------------------------------------------
    # Fields
    if flags['field']:
        print "make_lens_atlas: running fields"
        # find the unique IDs. mark centers and also centrals if clustering is
        # done
        #import ipdb; ipdb.set_trace()
        unique_IDs = np.unique(catalog['id'])
        for ID in unique_IDs:
            mini_catalog = catalog[catalog['id'] == ID]
            subject = collection.member[ID]
            annotationhistory = subject.annotationhistory

            # plot cluster centers
            kind = mini_catalog['kind']
            x_centers = mini_catalog['x']
            # flip y from catalog
            y_centers = image_y_size - mini_catalog['y']
            skill_centers = mini_catalog['skill']

            # filter out the -1 entry
            center_cond = (x_centers > 0) * (y_centers > 0)
            # filter outliers if possible
            if 'dist' in mini_catalog.dtype.names:
                center_cond *= mini_catalog['dist'] > 0
            skill_centers = skill_centers[center_cond]
            x_centers = x_centers[center_cond]
            y_centers = y_centers[center_cond]

            colors_centers = [(0, 1.0, 0) for i in x_centers]
            if len(colors_centers) == 0:
                #welp, nothing here
                continue

            # ------------------------------------------------------------------
            # download png
            url = subject.location
            outname = flags['output_directory'] + '{0}_field.png'.format(ID)
            im = get_online_png(url, outname)

            # if it is a training image, claim the alpha parameter
            if im.shape[2] == 4:
                alpha = im[:, :, 3]
                im = im[:, :, :3]
            else:
                alpha = None

            fig = plt.figure(figsize=figsize_field)
            ax = fig.add_subplot(111)
            ax.imshow(im, origin=origin)

            xbins = np.arange(im.shape[0])
            ybins = np.arange(im.shape[1])
            min_x = 0
            min_y = 0
            max_x = im.shape[0]
            max_y = im.shape[1]

            if (flags['skill']) * (np.max(skill_centers) !=
                                   np.min(skill_centers)):
                sizes_centers = (
                    (skill_centers - np.min(skill_centers)) * (200 - 10) /
                    (np.max(skill_centers) - np.min(skill_centers)))
            else:
                sizes_centers = [100 for i in x_centers]
            sizes_centers = [100 for i in x_centers]
            ax.scatter(x_centers, y_centers,
                       marker='d', c=colors_centers,
                       s=sizes_centers, alpha=0.75)

            if flags['diagnostics']:
                r = flags['dist_max']
                b = flags['stamp_size']
                b_ones = np.ones(100) * b
                b_arr = np.linspace(-b, b, 100)

                def xy(x0, y0, r, phi):
                    return x0 + r * np.cos(phi), y0 + r * np.sin(phi)
                phis = np.arange(0, 6.28, 0.01)

                for i in xrange(len(x_centers)):
                    x_center = x_centers[i]
                    y_center = y_centers[i]
                    ax.plot(*xy(x_center, y_center, r, phis),
                            c='w', ls='-', linewidth=4)
                    # plot box
                    ax.plot(x_center + b_ones, y_center + b_arr,
                            c='r', ls='--', linewidth=4)
                    ax.plot(x_center - b_ones, y_center + b_arr,
                            c='r', ls='--', linewidth=4)
                    ax.plot(x_center + b_arr, y_center + b_ones,
                            c='r', ls='--', linewidth=4)
                    ax.plot(x_center + b_arr, y_center - b_ones,
                            c='r', ls='--', linewidth=4)

            itwas = annotationhistory['ItWas']
            x_all = annotationhistory['At_X']
            y_all = annotationhistory['At_Y']

            x_markers_all = np.array([xi for xj in x_all for xi in xj])
            y_markers_all = np.array([yi for yj in y_all for yi in yj])

            # now filter markers by those that are within
            # stamp_size of the stamp
            # I'm pretty sure this step is redundant when going over the full
            # image?
            agents_numbers = np.arange(x_markers_all.size)

            conds = ((x_markers_all >= min_x) *
                     (x_markers_all <= max_x) *
                     (y_markers_all >= min_y) *
                     (y_markers_all <= max_y))

            agents = agents_numbers[conds]
            x_markers = x_markers_all[agents]
            y_markers = y_markers_all[agents]

            # filter markers
            n_catalog = len(agents)
            if (flags['points'] > 0) * \
                    (flags['points'] < n_catalog):
                agents_points = np.random.choice(
                    agents, size=flags['points'], replace=False)
            else:
                agents_points = agents
            x_markers_filtered = x_markers_all[agents_points]
            y_markers_filtered = y_markers_all[agents_points]

            if flags['skill']:
                PL_all = annotationhistory['PL']
                PD_all = annotationhistory['PD']

                # filter out the empty clicks
                PL_list = []
                for i, xj in enumerate(x_all):
                    PL_list.append([PL_all[i]] * len(xj))
                PL = np.array([PLi for PLj in PL_list for PLi in PLj])

                # filter out the empty clicks
                PD_list = []
                for i, xj in enumerate(x_all):
                    PD_list.append([PD_all[i]] * len(xj))
                PD = np.array([PDi for PDj in PD_list for PDi in PDj])

                skill_all = swap.expectedInformationGain(0.5, PL, PD)
                skill = skill_all[agents]

                smax = 100
                smin = 5
                if np.max(skill) != np.min(skill):
                    sizes_all = (skill_all - np.min(skill)) * (smax - smin) / \
                        (np.max(skill) - np.min(skill))
                    sizes_filtered = sizes_all[agents_points]
                else:
                    sizes_filtered = 50
            else:
                skill = None
                sizes_filtered = 50

            if 'labels' in annotationhistory:
                # find which label is closest to your folks
                labels_all = annotationhistory['labels']
                labels = np.array([xi for xj in labels_all for xi in xj])
                labels_filtered = labels[agents_points]
                colors = []
                alpha = 0.75
                for label in labels_filtered:
                    if label == -1:
                        colors.append((1.0, 0.0, 0))
                    else:
                        colors.append((0, 1.0, 0))
            else:
                colors = (0, 1.0, 0)
                alpha = 0.25

            # ----------------------------------------------------------
            # contours
            if flags['contour'] * (len(x_markers) >= flags['stamp_min']):
                # now do the lens locations
                # don't need to filter the x's since that is filtered by
                # xbins and ybins anyways
                pdf2d(x_markers - min_x, y_markers - min_y,
                      xbins=xbins, ybins=ybins, weights=skill,
                      smooth=flags['smooth_click'], color=(0, 1.0, 0),
                      style='contour', axis=ax)

            # ----------------------------------------------------------
            # plot points
            if flags['points'] != 0:
                ax.scatter(x_markers_filtered - min_x,
                           y_markers_filtered - min_y,
                           c=colors, s=sizes_filtered, alpha=alpha)

            # ----------------------------------------------------------
            # do alpha
            if flags['alpha'] * (alpha != None):
                contour_hist(alpha.T,
                             extent=(xbins[0], xbins[-1],
                                     ybins[0], ybins[-1]),
                             color='w', style='contour', axis=ax)

            ax.tick_params(\
                axis='both',        # changes apply to the x-axis
                which='both',       # both major and minor ticks are affected
                bottom='off',       # ticks along the bottom edge are off
                top='off',          # ticks along the top edge are off
                left='off',
                right='off',
                labelleft='off',
                labelbottom='off')  # labels along the bottom edge are off

            ax.invert_yaxis()

            try:
                outfile = flags['output_directory'] + \
                    '{0}_field_output.{1}'.format(ID, flags['output_format'])
                # fig.savefig(outfile)
                fig.savefig(outfile, bbox_inches='tight', pad_inches=0)
                #fig.canvas.print_png(outfile)
            except:
                print 'make_lens_catalog: field problem with field ', ID

            plt.close('all')

    print 'make_lens_catalog: All done!'
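# A minimal usage sketch (not part of the original module): make_lens_atlas()
# takes a dict of flags plus 'collection' and 'catalog' entries, where
# catalog.dat is the file written by make_lens_catalog. Both paths below are
# placeholders, and the call assumes the plotting helpers used above (pdf2d,
# contour_hist, get_online_png) are importable alongside swap and numpy.
if __name__ == '__main__':
    make_lens_atlas({'collection': 'CFHTLS_collection.pickle',
                     'catalog': 'catalog.dat',
                     'stamp': True,
                     'field': True,
                     'contour': True,
                     'points': 30,
                     'output_directory': './'})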
tonights = swap.Configuration(configfile)

thresholds = {}
thresholds['detection'] = tonights.parameters['detection_threshold']
thresholds['rejection'] = tonights.parameters['rejection_threshold']

# Use the following directory for output lists and plots:
tonights.parameters['trunk'] = tonights.parameters['survey']
tonights.parameters['dir'] = '.'

# ------------------------------------------------------------------
# Read in, or create, an object representing the candidate list:

print "SWAG: reading in subjects..."
sample = swap.read_pickle(tonights.parameters['samplefile'], 'collection')

# ------------------------------------------------------------------
# Write out a catalog of subjects, including the ZooID, subject ID,
# how many classifications, and probability:

catalog = swap.get_new_filename(tonights.parameters, 'candidate_catalog')
print "SWAG: saving catalog of high probability subjects..."
Nlenses, Nsubjects = swap.write_catalog(sample, catalog, thresholds, kind='test')
print "SWAG: " + str(Nsubjects) + " subjects classified,"
print "SWAG: " + str(Nlenses) + " candidates (with P > rejection) written to " + catalog

# Also write out the sims, and the duds:

catalog = swap.get_new_filename(tonights.parameters, 'sim_catalog')
def MachineShop(args):
    # Buh. I never built in the ability to change directories on the fly
    #machine_sim_directory = 'sims_Machine/redo_with_circular_morphs'
    """
    Sometimes you just need to run the Machine on a bunch of already made
    SWAP-runs / simulations. If so, this script is for you!
    """
    # Get parameters from the SWAP run of interest
    the = swap.Configuration(args.config)
    params = the.parameters

    # This pulls up the FIDUCIAL SWAP simulation
    sim = Simulation(config=args.config,
                     directory='sims_SWAP/S_PLPD5_p5_ff_norand',
                     variety='feat_or_not')

    # this was originally set to 2/17/09 which is WRONG
    # 11/2/17: WHY?? F**k you, Past Melanie. What am I supposed to do here??
    first_day = dt.datetime(2009, 2, 12)
    today = dt.datetime.strptime(params['start'], '%Y-%m-%d_%H:%M:%S')
    start_day = dt.datetime(2009, 2, 17)
    last_day = dt.datetime.strptime(params['end'], '%Y-%m-%d_%H:%M:%S')

    yesterday = None
    run_machine = False
    SWAP_retired = 0
    notfound = 0
    last_night = None

    for idx, filename in enumerate(sim.retiredFileList[(today - first_day).days:]):

        print ""
        print "----------------------- The Machine Shop ----------------------------"
        print "Today is {}".format(today)

        if today >= last_day:
            print "Get outta the machine shop!"
            exit()

        # ---------------------------------------------------------------------
        # OPEN METADATA PICKLE (updated each time MachineClassifier is run)
        # ---------------------------------------------------------------------
        backup_meta_file = params['metadatafile'].replace(
            '.pickle', '_orig.pickle')

        if today == first_day:
            try:
                storage = swap.read_pickle(backup_meta_file, 'metadata')
            except:
                print "MachineShop: Backup metadata pickle not yet created."
                print "MachineShop: Opening original metadata pickle file instead"
                storage = swap.read_pickle(params['metadatafile'], 'metadata')

                if 'retired_date' not in storage.subjects.colnames:
                    storage.subjects['retired_date'] = '2016-09-10'

                if 'valid' not in np.unique(storage.subjects['MLsample']):
                    expert = (storage.subjects['Expert_label'] != -1)
                    storage.subjects['MLsample'][expert] = 'valid'

                # save an untouched copy for reference later
                print "MachineShop: Creating a backup metadata pickle"
                swap.write_pickle(storage, backup_meta_file)
        else:
            storage = swap.read_pickle(params['metadatafile'], 'metadata')

        # Regardless of which metadata you open, make sure it has these columns
        # (old metadata files WON'T have them!)
        if 'retired_date' not in storage.subjects.colnames:
            storage.subjects['retired_date'] = '2016-09-10'

        if 'valid' not in np.unique(storage.subjects['MLsample']):
            expert = (storage.subjects['Expert_label'] != -1)
            storage.subjects['MLsample'][expert] = 'valid'

        subjects = storage.subjects

        # I just need to know what was retired TONIGHT --
        # compare what's retired UP TILL tonight with what was
        # retired up till LAST NIGHT
        SWAP_retired_by_tonight = sim.fetchCatalog(filename)

        # If we're picking up where we left off, grab previous training sample
        #if today>start_day and last_night is None:
        #    print 'MachineShop: getting previous training sample'
        #    last_night = subjects[subjects['MLsample']=='train']
        #    last_night['zooid'] = last_night['SDSS_id']

        try:
            ids_retired_tonight = set(SWAP_retired_by_tonight['zooid']) - \
                set(last_night['zooid'])
        except:
            ids_retired_tonight = set(SWAP_retired_by_tonight['zooid'])

        print "Newly retired subjects: {}".format(len(ids_retired_tonight))

        # Now that I have the ids from the previous night, adjust the
        # metadata file to reflect what was retired / add SWAP info
        for ID in list(ids_retired_tonight):

            # Locate this subject in the metadata file
            mask = subjects['SDSS_id'] == int(ID)

            # Update them in metadata file as training sample for MC
            # DOUBLE CHECK THAT IT HAS NOT BEEN RETIRED BY MACHINE!!!
            #if subjects['MLsample'][mask] == 'test ':
            if subjects['MLsample'][mask] == 'test':
                SWAP_retired += 1
                subjects['MLsample'][mask] = 'train'
                subjects['retired_date'][mask] = dt.datetime.strftime(
                    today, '%Y-%m-%d')
                subjects['SWAP_prob'][mask] = SWAP_retired_by_tonight['P'][
                    SWAP_retired_by_tonight['zooid'] == ID]
                run_machine = True
            else:
                notfound += 1

        if len(subjects[subjects['MLsample'] == 'train']) >= 10000:
            run_machine = True

        last_night = SWAP_retired_by_tonight
        print "Retired by this day:", len(last_night)

        print ""
        print "MachineShop: Found {0} subjects retired by SWAP on {1}"\
            .format(SWAP_retired, today)
        print "MachineShop: {0} total subjects retired so far"\
            .format(np.sum(subjects['MLsample'] == 'train'))
        print "MachineShop: Found {0} subjects retired by Machine."\
            .format(np.sum(subjects['MLsample'] == 'mclas'))
        print "MachineShop: Saving updated StorageLocker."

        params['dir'] = os.getcwd()

        # Save our new metadata file -- MC needs this -- save to NOT the original
        params['metadatafile'] = params['dir'] + '/' + params[
            'survey'] + '_metadata.pickle'
        swap.write_pickle(storage, params['metadatafile'])

        if run_machine:
            # Need to doctor the config to reflect the "correct date"
            params['start'] = today.strftime('%Y-%m-%d_%H:%M:%S')
            swap.write_config(args.config, params)

            # Run MachineClassifier.py using this subject file
            os.system("python MachineClassifier.py -c %s" % args.config)
            """os.system("python test_Machine.py -c {0}".format(args.config))"""

            # MachineClassifier updates the configfile so now we need to open the NEW one
            the = swap.Configuration(args.config)
            params = the.parameters

        # Update date (since we're not running SWAP)
        today += dt.timedelta(days=1)
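# A minimal sketch of how MachineShop() might be driven from the command line,
# assuming argparse and a -c/--config option pointing at an existing SWAP
# .config file; 'args.config' is the only attribute the function actually reads,
# and the option names here are illustrative, not part of the original script.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        description="Re-run the Machine on existing SWAP simulations")
    parser.add_argument('-c', '--config', dest='config', required=True,
                        help="SWAP configuration file for the run of interest")
    MachineShop(parser.parse_args())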
def main():
    """
    This script makes ALL THE MUTHAFUCKIN FIGURES FOR MAH PAYPAH.

    1. VOLUNTEER PROBABILITIES
        NAME        plot_user_probabilities()
        REQUIRES    swap bureau file and # of users to plot

    2. VOTE DISTRIBUTIONS COMPARED TO GZ2
        NAME        plot_vote_distributions()
        REQUIRES    gz2_metadata and simulation to compare to

    3. BASELINE SWAP SIMULATION COMPARED TO GZ2
        NAME        plot_GZX_baseline()
        REQUIRES    baseline simulation, evaluation ascii file for baseline
                    run, gz2_retired (cumulative retired subjects in GZ2)
        NOTES       this plots the retired subject rate AND the corresponding
                    quality metrics ON THE SAME AXES
                    The eval file and the GZ2 retired subjects file must be
                    created in separate script: (generate_SWAP_eval_report)

    4. VARIATIONS IN SWAP
        NAME        plot_GZX_evaluation_spread()
                    plot_GZX_cumulative_retirement_spread()
        REQUIRES    three simulations to compare (for spread in retirement)
                    three evaluation files to compare (for spread in eval)
        NOTES       the eval files have to be created with
                    generate_SWAP_eval_report

    5. SWAP AND GZ2 DISAGREE
        swap_gets_it_wrong()

    6. MONEYPLOT
        MONEYPLOT()

    7. 1D MORPHOLOGY DISTRIBUTIONS
        NAME        plot_morph_params_1D()
        REQUIRES
    """

    make_volunteer_probabilties_plot = False
    make_subject_trajectory_plot = False
    make_vote_distributions_plot = False
    make_baseline_simulation_plot = False
    make_swap_variations_plot = False
    make_swap_gets_it_wrong_plot = False
    make_moneyplot = True
    make_morph_distributions_plot = False
    make_roc_curves = False
    calculate_GX_human_effort = False

    survey = 'GZ2_sup_PLPD5_p5_flipfeature2b'
    dir_tertiary = 'tertiary_simulation_output'
    dir_sim_machine = 'sims_Machine/redo_with_correct_ell_morphs'
    dir_sim_swap = 'sims_SWAP/S_PLPD5_p5_ff_norand/'

    # Load up some GZ2 data
    # -----------------------------------------------
    gz2_metadata = Table.read('metadata_ground_truth_labels.fits')
    if 'GZ2_deb_combo' not in gz2_metadata.colnames:
        gz2_metadata['GZ2_raw_combo'] = GZ2_label_SMOOTH_NOT(bigfuckingtable,
                                                             type='raw')
        gz2_metadata.write('metadata_ground_truth_labels.fits', overwrite=True)

    gz2_metadata['zooid'] = gz2_metadata['SDSS_id']
    gz2_metadata['id'] = gz2_metadata['asset_id']

    F = open('GZ2_cumulative_retired_subjects_expert.pickle', 'r')
    gz2_cum_sub_retired = cPickle.load(F)

    morph = Table.read("metadata_ground_truth_labels.fits")
    pdb.set_trace()

    # Load up BASELINE simulation
    # ------------------------------------------------------
    mid_name = 'sup_PLPD5_p5_flipfeature2b_norandom2'
    #stuff = generate_SWAP_eval_report(mid_sim, gz2_metadata, outname=mid_name+'_raw_combo',
    #                                  write_file=True, gz_kind='raw_combo')
    mid_eval2 = Table.read('{0}/GZX_evaluation_{1}.txt'.format(
        dir_tertiary, mid_name + '_raw_combo'), format='ascii')
    mid_sim = Simulation(
        config='configfiles/update_sup_PLPD5_p5_flipfeature2b_norandom2.config',
        directory=dir_sim_swap,
        variety='feat_or_not')

    """ MAKE VOLUNTEER PROBABILITIES PLOT """
    if make_volunteer_probabilties_plot:
        # Load up the SWAP Simulation AGENT BUREAU
        picklename = '{0}/{1}_bureau.pickle'.format(dir_sim_swap, survey)
        bureau = swap.read_pickle(picklename, 'bureau')
        plot_user_probabilities(bureau, 200)

    if make_subject_trajectory_plot:
        # Load up the SWAP Simulation AGENT BUREAU
        picklename = '{0}/{1}_collection.pickle'.format(dir_sim_swap, survey)
        collection = swap.read_pickle(picklename, 'collection')
        plot_subject_trajectories(collection, 200)

    """ MAKE BASELINE SIMULATION PLOT """
    if make_baseline_simulation_plot:
        # BASELINE fig requires BASELINE Simulation,
        # evaluation output for that sim,
        # cumulative retirement for GZ2
        plot_GZX_baseline(mid_sim, mid_eval2, gz2_cum_sub_retired)

    """ MAKE MONEY PLOT """
    if make_moneyplot:
        outfile = '{}/{}'.format(dir_sim_machine, survey)

        # this file made by analaze_GZX_simulation.py
        filename = glob.glob('{}*_combo_analysis*.pickle'.format(outfile))
        F = open(filename[0], 'rb')
        combo_run = cPickle.load(F)
        F.close()

        # Load up the Machine bureau
        F = open('{0}/{1}_MLbureau.pickle'.format(dir_sim_machine, survey), 'rb')
        MLbureau = cPickle.load(F)
        F.close()

        MONEYPLOT(92, mid_sim, mid_eval2, gz2_cum_sub_retired, combo_run,
                  MLbureau, outfile=outfile)

    """ MORPH DISTRIBUTIONS """
    if make_morph_distributions_plot:
        # Plotting FEAT vs NOT, FALSE POS & FALSE NEGs, RETIRED vs NOT RETIRED
        # to do all that, need files that were created.... GZX_SWAP_eval?
        filename = glob.glob(
            '{}/*_machine_retired_subjects.fits'.format(dir_sim_machine))
        machine_retired = Table.read(filename[0])
        #machine_not_retired = Table.read('tertiary_simulation_output/{}_machine_not_retired_subjects.fits'.format(outfile))

        plot_morph_params_1D(machine_retired, gz2_metadata,
                             outfile=dir_sim_machine)

    """ MAKE SWAP GETS IT WRONG PLOT """
    if make_swap_gets_it_wrong_plot:
        # Compare SWAP-retired subjects to various parameters in the GZ2 Main Catalog
        bigfuckingtable = Table.read(
            '../SpaceWarps/analysis/GZ2ASSETS_NAIR_MORPH_MAIN.fits')
        gz2_bigfuckingtable = join(gz2_metadata, bigfuckingtable, keys='id')

        all_retired = mid_sim.fetchCatalog(mid_sim.retiredFileList[-1])
        gz2_baseline = join(gz2_bigfuckingtable, all_retired, keys='zooid')

        tps2, fps2, tns2, fns2 = calculate_confusion_matrix(
            gz2_baseline[gz2_baseline['P'] > 0.3],
            gz2_baseline[gz2_baseline['P'] < 0.3],
            smooth_or_not=False, gz_kind='raw_combo')

        correct = vstack([tps2, tns2])
        #print len(correct)
        swap_gets_it_wrong(fps2, fns2, correct)

    """ MAKE VOTE DISTRIBUTION PLOT """
    if make_vote_distributions_plot:
        # Requires the Vote Distributions for GZ2 and those from the Simulation
        plot_vote_distributions(gz2_metadata, mid_sim)

    if calculate_GX_human_effort:
        mlbureaufile = 'sims_Machine/redo_with_correct_ell_morphs/GZ2_sup_PLPD5_p5_flipfeature2b_MLbureau.pickle'
        MLbureau = swap.read_pickle(mlbureaufile, 'bureau')

        machine_meta = 'sims_Machine/redo_with_correct_ell_morphs/GZ2_sup_PLPD5_p5_flipfeature2b_metadata.pickle'
        all_subjects = swap.read_pickle(machine_meta, 'metadata').subjects

        #subjects = all_subjects[all_subjects['retired_date']!='2016-09-10']
        mclass = all_subjects[all_subjects['MLsample'] == 'mclas']
        swaps = all_subjects[(all_subjects['MLsample'] == 'train') |
                             (all_subjects['MLsample'] == 'valid')]

        catalog = mid_sim.fetchCatalog(mid_sim.retiredFileList[-1])
        catalog['SDSS_id'] = catalog['zooid']

        # How many machine-retired subjects would have been retired by SWAP anyway?
        #swap_mach_retired = join(catalog, mclass, keys='SDSS_id')
        swap_retired = join(catalog, swaps, keys='SDSS_id')

        # Assume that only Human Effort came from training sample
        effort = np.sum(swap_retired['Nclass'])
        print "Human effort for GZX:", effort

        # LOOK AT MOST IMPORTANT FEATURES FOR MACHINE
        machine = MLbureau.member['RF_accuracy']
        trainhist = machine.traininghistory
        models = trainhist['Model']

        for i, model in enumerate(models):
            if i == 0:
                feature_importances = model.feature_importances_
            else:
                feature_importances = np.vstack(
                    [feature_importances, model.feature_importances_])

        labels = ['M$_{20}$', '$C$', '$1-b/a$', '$A$', '$G$']
        fi = feature_importances

        avg, std = [], []
        for i in range(5):
            avg.append(np.mean(fi[:, i]))
            std.append(np.std(fi[:, i]))
        avg = np.array(avg)
        std = np.array(std)
        labels = np.array(labels)

        sort_indices = np.argsort(avg)
        ind = np.arange(len(labels))
        #pdb.set_trace()

        fig = plt.figure(figsize=(11, 8))
        ax = fig.add_subplot(111)
        rects1 = ax.bar(ind, avg[sort_indices], yerr=std[sort_indices],
                        color='red', edgecolor='black', capsize=5,
                        align='center')
        ax.set_ylabel('Feature Importance')
        ax.set_xticks(ind)
        ax.set_xticklabels(labels[sort_indices])
        ax.set_ylim(0, 0.45)
        ax.set_yticks([0., .1, .2, .3, .4])
        plt.savefig('RF_feature_importance_4paper.pdf', bbox_inches='tight')
        plt.show()

        #pdb.set_trace()

    if make_roc_curves:
        candidateFileList = mid_sim.fetchFileList(kind='candidate')

        """
        # SWAP situation at ~30 days into simulation
        candidates1 = mid_sim.fetchCatalog(candidateFileList[30])
        rejected1 = mid_sim.fetchCatalog(mid_sim.rejectedFileList[30])
        swap_subjects1 = np.concatenate([candidates1, rejected1])
        subjects1 = join(gz2_metadata, swap_subjects1, keys='zooid')

        # SWAP situation at ~60 days into simulation
        candidates2 = mid_sim.fetchCatalog(candidateFileList[60])
        rejected2 = mid_sim.fetchCatalog(mid_sim.rejectedFileList[60])
        swap_subjects2 = np.concatenate([candidates2, rejected2])
        subjects2 = join(gz2_metadata, swap_subjects2, keys='zooid')
        """

        # SWAP situation at the end of the simulation
        candidates3 = mid_sim.fetchCatalog(candidateFileList[-1])
        rejected3 = mid_sim.fetchCatalog(mid_sim.rejectedFileList[-1])
        swap_subjects3 = np.concatenate([candidates3, rejected3])
        subjects3 = join(gz2_metadata, swap_subjects3, keys='zooid')

        subject_sets = [subjects3]  #subjects1, subjects2,
        plot_roc_curve(subject_sets, smooth_or_not=False,
                       gz_kind='raw_combo', swap=True, outname=None)

    """ MAKE SWAP VARIATIONS PLOT(S) """
    if make_swap_variations_plot:
        #"""
        # Load up simulations varying subject PRIOR
        # -------------------------------------------------------
        low_p = 'sup_PLPD5_p2_flipfeature2_norand'
        high_p = 'sup_PLPD5_p8_flipfeature2_norand'
        p35 = 'sup_PLPD5_p35_flipfeature2_norand'

        low_p_eval2 = Table.read(
            'tertiary_simulation_output/GZX_evaluation_{0}.txt'.format(
                low_p + '_raw_combo'), format='ascii')
        high_p_eval2 = Table.read(
            'tertiary_simulation_output/GZX_evaluation_{0}.txt'.format(
                high_p + '_raw_combo'), format='ascii')
        #p35_eval2 = Table.read('tertiary_simulation_output/GZX_evaluation_{0}.txt'.format(p35+'_raw_combo'), format='ascii')

        low_p_sim = Simulation(
            config='configfiles/update_sup_PLPD5_p2_flipfeature2_norand.config',
            directory='sims_SWAP/S_PLPD5_p2_ff_norand/',
            variety='feat_or_not')
        high_p_sim = Simulation(
            config='configfiles/update_sup_PLPD5_p8_flipfeature2_norand.config',
            directory='sims_SWAP/S_PLPD5_p8_ff_norand/',
            variety='feat_or_not')
        #p35_sim = Simulation(config='configfiles/update_sup_PLPD5_p35_flipfeature2_norand.config',
        #                     directory='sims_SWAP/S_PLPD5_p35_ff_norand/',
        #                     variety='feat_or_not')
        #"""

        # Load up simulations for varying user PL/PD
        # -------------------------------------------------------
        low_plpd = 'sup_PLPD4_p5_flipfeature2_norand'
        high_plpd = 'sup_PLPD6_p5_flipfeature2_norand'

        low_plpd_eval2 = Table.read(
            'tertiary_simulation_output/GZX_evaluation_{0}.txt'.format(
                low_plpd + '_raw_combo'), format='ascii')
        high_plpd_eval2 = Table.read(
            'tertiary_simulation_output/GZX_evaluation_{0}.txt'.format(
                high_plpd + '_raw_combo'), format='ascii')

        low_plpd_sim = Simulation(
            config='configfiles/update_sup_PLPD4_p5_flipfeature2_norand.config',
            directory='sims_SWAP/S_PLPD4_p5_ff_norand/',
            variety='feat_or_not')
        high_plpd_sim = Simulation(
            config='configfiles/update_sup_PLPD6_p5_flipfeature2_norand.config',
            directory='sims_SWAP/S_PLPD6_p5_ff_norand/',
            variety='feat_or_not')
        #"""

        # VARY PRIOR
        fig = plt.figure(figsize=(11, 16))
        plt.rc('text', usetex=True)

        gs = gridspec.GridSpec(2, 1)
        gs.update(wspace=0.05, hspace=0.01)

        ax = fig.add_subplot(gs[0])
        plot_GZX_evaluation_spread(92, low_p_eval2, mid_eval2, high_p_eval2,
                                   outfile='compare_PLPD_4paper', ax=ax)

        ax2 = fig.add_subplot(gs[1])
        plot_GZX_cumulative_retirement_spread(92, low_p_sim, mid_sim,
                                              high_p_sim, gz2_cum_sub_retired,
                                              outfile='compare_prior_4paper',
                                              ax=ax2)

        fig.suptitle(r'$0.1 \le \mathrm{Subject~Prior} \le 0.8$', fontsize=30)
        gs.tight_layout(fig, rect=[0, 0, 1, 0.97])
        plt.savefig('GZX_eval_and_retirement_prior_spread_4paper_v2.pdf')
        plt.show()
        plt.close()
        #"""

        # -----------------------------------------------------------
        # VARY PLPD
        fig = plt.figure(figsize=(11, 16))
        plt.rc('text', usetex=True)

        gs = gridspec.GridSpec(2, 1)
        gs.update(wspace=0.01, hspace=0.01)

        ax = fig.add_subplot(gs[0])
        plot_GZX_evaluation_spread(92, low_plpd_eval2, mid_eval2,
                                   high_plpd_eval2,
                                   outfile='compare_PLPD_4paper', ax=ax)

        ax2 = fig.add_subplot(gs[1])
        plot_GZX_cumulative_retirement_spread(92, low_plpd_sim, mid_sim,
                                              high_plpd_sim,
                                              gz2_cum_sub_retired,
                                              outfile='compare_prior_4paper',
                                              ax=ax2)

        fig.suptitle(
            r'$(0.4, 0.4) \le \mathrm{Confusion~Matrix} \le (0.6, 0.6)$',
            fontsize=30)
        gs.tight_layout(fig, rect=[0, 0, 1, 0.97])
        plt.savefig('GZX_eval_and_retirement_PLPD_spread_4paper_v2.pdf')
        plt.show()
        plt.close()
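# Several Table.read calls above repeat one pattern: read an ascii evaluation
# table named GZX_evaluation_<run>_<gz_kind>.txt out of tertiary_simulation_output.
# A small helper like this sketch (not in the original script; the argument
# names are illustrative) could replace those repeated calls.
def read_eval_table(run_name, gz_kind='raw_combo',
                    directory='tertiary_simulation_output'):
    """Read a GZX evaluation ascii table for the named simulation run."""
    from astropy.table import Table
    return Table.read('{0}/GZX_evaluation_{1}_{2}.txt'.format(
        directory, run_name, gz_kind), format='ascii')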
# Check for pickles in array args:
if len(args) == 1:
    collectionfile = args[0]
    print "make_trajectory_plots: illustrating subject trajectories in: "
    print "make_trajectory_plots: ", collectionfile
else:
    print make_trajectory_plots.__doc__
    return

output_directory = './'

# ------------------------------------------------------------------
# Read in collection:

sample = swap.read_pickle(collectionfile, 'collection')
print "make_trajectory_plots: total no. of available subjects: ", len(sample.list())

if highlights:
    # Read in subjects to be highlighted:
    highlightIDs = swap.read_list(listfile)
    print highlightIDs
    print "make_trajectory_plots: total no. of special subjects: ", len(highlightIDs)
    print "make_trajectory_plots: special subjects: ", highlightIDs

# ------------------------------------------------------------------
# Start plot:

figure = sample.start_trajectory_plot(title=title, histogram=histogram, logscale=False)
pngfile = 'trajectories.png'
else:
    t1 = datetime.datetime.strptime(tonights.parameters['start'], '%Y-%m-%d')
    print "SWAP: updating all subjects with classifications made since " + tonights.parameters['start']

# How will we make decisions based on probability?
thresholds = {}
thresholds['detection'] = tonights.parameters['detection_threshold']
thresholds['rejection'] = tonights.parameters['rejection_threshold']

# ------------------------------------------------------------------
# Read in, or create, a bureau of agents who will represent the
# volunteers:

bureau = swap.read_pickle(tonights.parameters['crowdfile'], 'crowd')

# ------------------------------------------------------------------
# Read in, or create, an object representing the candidate list:

sample = swap.read_pickle(tonights.parameters['samplefile'], 'collection')

# ------------------------------------------------------------------
# Open up database:

if practise:
    db = swap.read_pickle(tonights.parameters['dbfile'], 'database')
    if db is None:
        print "SWAP: making a new Toy database..."
def make_lens_catalog(args):
    """
    NAME
        make_lens_catalog

    PURPOSE
        Given location of collection pickle, this script produces a catalog
        of marker clusters for each subject (cluster centroids, number of
        markers, summed skill, etc).

    COMMENTS
        Output goes to whatever you choose as your output directory. This
        should be pretty customizable.

    FLAGS
        -h              Print this message
        --skill         Weight by skill

    INPUTS
        collection.pickle

    OUTPUTS
        lens.dat
            Assumed format:
            ID   kind   x   y   Prob   N0   Skill   Dist

            Here:
            ID    = Space Warps subject ID
            kind  = Space Warps subject type (sim, dud, test)
            x,y   = object (cluster) centroid, in pixels
            P     = Space Warps subject probability
            N0    = number of markers in the cluster
            S     = total skill per cluster, summed over markers
            D     = biggest distance within cluster

    EXAMPLE

    BUGS

    AUTHORS
        This file is part of the Space Warps project, and is distributed
        under the GPL v2 by the Space Warps Science Team.
        http://spacewarps.org/

    HISTORY
        2013-07-16 started Davis (KIPAC)
    """

    # ------------------------------------------------------------------
    # Some defaults:

    flags = {'skill': False,
             'output_directory': './',
             'output_name': 'catalog.dat',
             'image_y_size': 440,
             'catalog_path': '',
             'update_collection': '',
             }

    # ------------------------------------------------------------------
    # Read in options:

    # this has to be easier to do...
    for arg in args:
        if arg in flags:
            flags[arg] = args[arg]
        elif arg == 'collection_path':
            collection_path = args[arg]
        else:
            print "make_lens_atlas: unrecognized flag ", arg

    print "make_lens_catalog: illustrating behaviour captured in collection file: "
    print "make_lens_catalog: ", collection_path

    memory = joblib.Memory(cachedir=flags['output_directory'])
    memory.clear()

    catalog_path = flags['output_directory'] + flags['output_name']
    if len(flags['output_name']) > 0:
        F = open(catalog_path, 'w')
        F.write('id,kind,x,y,prob,n0,skill,dist\n')

    # ------------------------------------------------------------------
    # Read in files:

    collection = swap.read_pickle(collection_path, 'collection')
    ID_list = collection.list()

    print "make_lens_catalog: collection numbers ", len(ID_list)

    if flags['catalog_path'] != '':
        print "make_lens_catalog: filtering from catalog ", flags['catalog_path']
        catalog_in = csv2rec(flags['catalog_path'])
        ID_list = np.unique(catalog_in['id'])

    # ------------------------------------------------------------------
    # Run through data:

    catalog = {}
    for ID in ID_list:
        subject = collection.member[ID]
        kind = subject.kind
        P = subject.mean_probability

        itwas = subject.annotationhistory['ItWas']
        x_all = subject.annotationhistory['At_X']
        y_all = subject.annotationhistory['At_Y']

        x_markers = np.array([xi for xj in x_all for xi in xj])
        y_markers = np.array([yi for yj in y_all for yi in yj])

        catalog.update({ID: {'agents_reject': [],
                             'x': x_markers,
                             'y': y_markers,
                             }})

        PL_all = subject.annotationhistory['PL']
        PD_all = subject.annotationhistory['PD']

        # filter out the empty clicks
        PL_list = []
        PL_nots = []
        for i, xj in enumerate(x_all):
            # len(xj) of empty = 0
            PL_list.append([PL_all[i]] * len(xj))
            if len(xj) == 0:
                PL_nots.append(PL_all[i])
        PL = np.array([PLi for PLj in PL_list for PLi in PLj])
        PL_nots = np.array(PL_nots)

        # filter out the empty clicks
        PD_list = []
        PD_nots = []
        for i, xj in enumerate(x_all):
            PD_list.append([PD_all[i]] * len(xj))
            if len(xj) == 0:
                PD_nots.append(PD_all[i])
                catalog[ID]['agents_reject'].append(i)
        PD = np.array([PDi for PDj in PD_list for PDi in PDj])
        PD_nots = np.array(PD_nots)

        skill = swap.expectedInformationGain(0.5, PL, PD)  # skill

        # it is only fair to write out the NOTs, too
        # do the empty guys
        skill_nots = swap.expectedInformationGain(0.5, PL_nots, PD_nots)  # skill
        x, y = -1, -1
        N0 = len(skill_nots)
        S = np.sum(skill_nots)
        D = 0

        ## catalog.append((ID, kind, x, y, P, N0, S, D))
        if len(catalog) % 500 == 0:
            print len(catalog)
        if len(flags['output_name']) > 0:
            F.write('{0},{1},{2},{3},{4},{5},{6},{7}\n'.format(
                ID, kind, x, y, P, N0, S, D))

        if len(x_markers) == 0:
            # apparently everyone was a not...
            catalog[ID]['agents_labels'] = np.array([])
            continue

        # ------------------------------------------------------------------
        # cluster
        print 'make_lens_catalog: subject ID = ', ID
        if flags['skill']:
            cluster_centers, cluster_center_labels, cluster_labels, \
                n_clusters, dist_within = outlier_clusters(x_markers, y_markers,
                                                           skill, memory=memory)
        else:
            cluster_centers, cluster_center_labels, cluster_labels, \
                n_clusters, dist_within = outlier_clusters(x_markers, y_markers,
                                                           None, memory=memory)
        # need to get: x, y, N0, S
        catalog[ID]['agents_labels'] = cluster_labels

        for cluster_center_label in cluster_center_labels:
            cluster_center = cluster_centers[cluster_center_label]
            members = (cluster_labels == cluster_center_label)

            x, y = cluster_center
            # convert y to catalog convention
            y = flags['image_y_size'] - y
            N0 = np.sum(members)
            S = np.sum(skill[members])
            D = dist_within[cluster_center_label]
            if cluster_center_label == -1:
                # outlier cluster
                # so really every point is its own cluster...
                D = 0

            ## catalog.append((ID, kind, x, y, P, N0, S, D))
            ## if len(catalog)%500 == 0:
            ##     print len(catalog)
            # TODO: make some requirement to be included (exclude outliers)
            if len(flags['output_name']) > 0:
                F.write('{0},{1},{2},{3},{4},{5},{6},{7}\n'.format(
                    ID, kind, x, y, P, N0, S, D))

    print 'make_lens_catalog: Clearing memory'
    # clear memory
    memory.clear()

    if len(flags['output_name']) > 0:
        print 'make_lens_catalog: closing file!'
        F.close()

    if len(flags['update_collection']) > 0:
        print 'make_lens_catalog: writing updated collection to', flags[
            'update_collection']

        # TODO: get the other params correct!!!!
        collection_fat = swap.collection.Collection()
        for ID in catalog:
            subject = collection.member[ID]
            atx = subject.annotationhistory['At_X']
            labels_in = list(catalog[ID]['agents_labels'])
            labels_fat = []
            for atx_i in atx:
                labels_fat.append([])
                for atx_ij in atx_i:
                    labels_fat[-1].append(labels_in.pop(0))
            subject.annotationhistory.update({'labels': labels_fat})
            collection_fat.member.update({ID: subject})

        swap.write_pickle(collection_fat, flags['update_collection'])

    print 'make_lens_catalog: All done!'

    return catalog
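# A minimal usage sketch (not part of the original module): make_lens_catalog()
# takes a dict of flags plus a 'collection_path' entry, and depends on swap,
# numpy, joblib, csv2rec and outlier_clusters() being importable alongside it.
# The pickle path below is a placeholder.
if __name__ == '__main__':
    catalog = make_lens_catalog({'collection_path': 'CFHTLS_collection.pickle',
                                 'skill': True,
                                 'output_directory': './',
                                 'output_name': 'catalog.dat'})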
y_markers_all = np.array([yi for yj in y_all for yi in yj]) # now filter markers by those that are within # stamp_size of the stamp # I'm pretty sure this step is redundant when going over the full # image? agents_numbers = np.arange(x_markers_all.size) conds = ((x_markers_all >= min_x) * (x_markers_all <= max_x) * (y_markers_all >= min_y) * (y_markers_all <= max_y)) agents = agents_numbers[conds] x_markers = x_markers_all[agents] y_markers = y_markers_all[agents] # filter markers n_catalog = len(agents) if (flags['points'] > 0) * \ (flags['points'] < n_catalog): agents_points = np.random.choice(agents, size=flags['points'], replace=False) else: agents_points = agents x_markers_filtered = x_markers_all[agents_points] y_markers_filtered = y_markers_all[agents_points] if flags['skill']: PL_all = annotationhistory['PL'] PD_all = annotationhistory['PD'] # filter out the empty clicks PL_list = [] for i, xj in enumerate(x_all): PL_list.append([PL_all[i]] * len(xj)) PL = np.array([PLi for PLj in PL_list for PLi in PLj]) # filter out the empty clicks PD_list = [] for i, xj in enumerate(x_all): PD_list.append([PD_all[i]] * len(xj)) PD = np.array([PDi for PDj in PD_list for PDi in PDj]) skill_all = swap.expectedInformationGain(0.5, PL, PD) skill = skill_all[agents] smax = 100 smin = 5 if np.max(skill) != np.min(skill): sizes_all = (skill_all - np.min(skill)) * (smax - smin) / \ (np.max(skill) - np.min(skill)) sizes_filtered = sizes_all[agents_points] else: sizes_filtered = 50 else: skill = None sizes_filtered = 50 if 'labels' in annotationhistory: # find which label is closest to your folks labels_all = annotationhistory['labels'] labels = np.array([xi for xj in labels_all for xi in xj]) labels_filtered = labels[agents_points] colors = [] alpha = 0.75 for label in labels_filtered: if label == -1: colors.append((1.0, 0.0, 0)) else: colors.append((0, 1.0, 0)) else: colors = (0, 1.0, 0) alpha = 0.25 # ---------------------------------------------------------- # contours if flags['contour'] * (len(x_markers) >= flags['stamp_min']): # now do the lens locations # don't need to filter the x's since that is filtered by # xbins and ybins anyways pdf2d(x_markers - min_x, y_markers - min_y, xbins=xbins, ybins=ybins, weights=skill, smooth=flags['smooth_click'], color=(0, 1.0, 0), style='contour', axis=ax) # ---------------------------------------------------------- # plot points if flags['points'] != 0: ax.scatter(x_markers_filtered - min_x, y_markers_filtered - min_y, c=colors, s=sizes_filtered, alpha=alpha) # ---------------------------------------------------------- # do alpha if flags['alpha'] * (alpha != None): contour_hist(alpha.T, extent=(xbins[0], xbins[-1], ybins[0], ybins[-1]), color='w', style='contour', axis=ax) ax.tick_params(\ axis='both', # changes apply to the x-axis which='both', # both major and minor ticks are affected bottom='off', # ticks along the bottom edge are off top='off', # ticks along the top edge are off left='off', right='off', labelleft='off', labelbottom='off') # labels along the bottom edge are off ax.invert_yaxis() try: outfile = flags[ 'output_directory'] + '{0}_field_output.{1}'.format( ID, flags['output_format']) # fig.savefig(outfile) fig.savefig(outfile, bbox_inches='tight', pad_inches=0) #fig.canvas.print_png(outfile) except: print 'make_lens_catalog: field problem with field ', ID plt.close('all') print 'make_lens_catalog: All done!'
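# ----------------------------------------------------------------------
# Illustrative usage sketch (not part of the original pipeline): the
# option-parsing loop at the top of make_lens_atlas() treats `args` as a
# dict keyed by flag name, so one plausible way to drive it from another
# script is the helper below. The pickle and catalog paths are
# placeholders, not real files.

def _example_make_lens_atlas_call():
    example_args = {
        'collection': 'CFHTLS_collection.pickle',  # hypothetical path
        'catalog': 'catalog.dat',                  # hypothetical path
        'stamp': True,       # draw cutouts around each clustered candidate
        'contour': True,     # overlay click-density contours on the stamps
        'points': 30,        # plot at most 30 volunteer markers per stamp
        'skill': True,       # scale marker sizes by volunteer skill
        'output_directory': './',
    }
    return make_lens_atlas(example_args)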
def MachineClassifier(options, args): """ NAME MachineClassifier.py PURPOSE Machine learning component of Galaxy Zoo Express Read in a training sample generated by human users (which have previously been analyzed by SWAP). Learn on the training sample and moniter progress. Once "fully trained", apply learned model to test sample. COMMENTS Lots I'm sure. FLAGS -h Print this message -c config file name """ #----------------------------------------------------------------------- # LOAD CONFIG FILE PARAMETERS #----------------------------------------------------------------------- # Check for config file in array args: if (len(args) >= 1) or (options.configfile): if args: config = args[0] elif options.configfile: config = options.configfile print swap.doubledashedline print swap.ML_hello print swap.doubledashedline print "ML: taking instructions from",config else: print MachineClassifier.__doc__ return machine_sim_directory = 'sims_Machine/redo_with_circular_morphs/' tonights = swap.Configuration(config) # Read the pickled random state file random_file = open(tonights.parameters['random_file'],"r"); random_state = cPickle.load(random_file); random_file.close(); np.random.set_state(random_state) time = tonights.parameters['start'] # Get the machine threshold (to make retirement decisions) swap_thresholds = {} swap_thresholds['detection'] = tonights.parameters['detection_threshold'] swap_thresholds['rejection'] = tonights.parameters['rejection_threshold'] threshold = tonights.parameters['machine_threshold'] prior = tonights.parameters['prior'] # Get list of evaluation metrics and criteria eval_metrics = tonights.parameters['evaluation_metrics'] # How much cross-validation should we do? cv = tonights.parameters['cross_validation'] survey = tonights.parameters['survey'] # To generate training labels based on the subject probability, # we need to know what should be considered the positive label: # i.e., GZ2 has labels (in metadatafile) Smooth = 1, Feat = 0 # Doing a Smooth or Not run, the positive label is 1 # Doing a Featured or Not run, the positive label is 0 pos_label = tonights.parameters['positive_label'] #---------------------------------------------------------------------- # read in the metadata for all subjects storage = swap.read_pickle(tonights.parameters['metadatafile'], 'metadata') # 11TH HOUR QUICK FIX CUZ I F****D UP. 
MB 10/27/16 if 'GZ2_raw_combo' not in storage.subjects.colnames: gz2_metadata = Table.read('metadata_ground_truth_labels.fits') storage.subjects['GZ2_raw_combo'] = gz2_metadata['GZ2_raw_combo'] swap.write_pickle(storage, tonights.parameters['metadatafile']) subjects = storage.subjects #---------------------------------------------------------------------- # read in the PROJECT COLLECTION -- (shared between SWAP/Machine) #sample = swap.read_pickle(tonights.parameters['samplefile'],'collection') # read in or create the ML bureau for machine agents (history for Machines) MLbureau = swap.read_pickle(tonights.parameters['MLbureaufile'],'bureau') #----------------------------------------------------------------------- # FETCH TRAINING & VALIDATION SAMPLES #----------------------------------------------------------------------- train_sample = storage.fetch_subsample(sample_type='train', class_label='GZ2_raw_combo') """ Notes about the training sample: # this will select only those which have my morphology measured for them # AND which have "ground truth" according to GZ2 # Eventually we could open this up to include the ~10k that aren't in the # GZ Main Sample but I think, for now, we should reduce ourselves to this # stricter sample so that we always have back-up "truth" for each galaxy. """ try: train_meta, train_features = ml.extract_features(train_sample, keys=['M20_corr', 'C_corr', 'E', 'A_corr', 'G_corr']) original_length = len(train_meta) except TypeError: print "ML: can't extract features from subsample." print "ML: Exiting MachineClassifier.py" sys.exit() else: # TODO: consider making this part of SWAP's duties? # 5/18/16: Only use those subjects which are no longer on the prior off_the_fence = np.where(train_meta['SWAP_prob']!=prior) train_meta = train_meta[off_the_fence] train_features = train_features[off_the_fence] train_labels = np.array([pos_label if p > prior else 1-pos_label for p in train_meta['SWAP_prob']]) shortened_length = len(train_meta) print "ML: found a training sample of %i subjects"%shortened_length removed = original_length - shortened_length print "ML: %i subjects removed to create balanced training sample"%removed valid_sample = storage.fetch_subsample(sample_type='valid', class_label='Expert_label') try: valid_meta, valid_features = ml.extract_features(valid_sample, keys=['M20_corr', 'C_corr', 'E', 'A_corr', 'G_corr']) except: print "ML: there are no subjects with the label 'valid'!" else: valid_labels = valid_meta['Expert_label'].filled() print "ML: found a validation sample of %i subjects"%len(valid_meta) # --------------------------------------------------------------------- # Require a minimum size training sample [Be reasonable, my good man!] # --------------------------------------------------------------------- if len(train_sample) < 10000: print "ML: training sample is too small to be worth anything." print "ML: Exiting MachineClassifier.py" sys.exit() else: print "ML: training sample is large enough to give it a shot." # TODO: LOOP THROUGH DIFFERENT MACHINES? # 5/12/16 -- no... need to make THIS a class and create multiple # instances? Each one can be passed an instance of a machine? # Machine can be trained to optimize different metrics # (ACC, completeness, purity, etc. Have a list of acceptable ones.) # Minimize a Loss function. 
for metric in eval_metrics: # REGISTER Machine Classifier # Construct machine name --> Machine+Metric machine = 'RF' Name = machine+'_'+metric # register an Agent for this Machine try: test = MLbureau.member[Name] except: MLbureau.member[Name] = swap.Agent_ML(Name, metric) MLagent = MLbureau.member[Name] #--------------------------------------------------------------- # TRAIN THE MACHINE; EVALUATE ON VALIDATION SAMPLE #--------------------------------------------------------------- # Now we run the machine -- need cross validation on whatever size # training sample we have .. # Fixed until we build in other machine options # Need to dynamically determine appropriate parameters... #max_neighbors = get_max_neighbors(train_features, cv) #n_neighbors = np.arange(1, (cv-1)*max_neighbors/cv, 5, dtype=int) #params = {'n_neighbors':n_neighbors, # 'weights':('uniform','distance')} num_features = train_features.shape[1] min_features = int(round(np.sqrt(num_features))) params = {'max_features':np.arange(min_features, num_features+1), 'max_depth':np.arange(2,16)} # Create the model # for "estimator=XXX" all you need is an instance of a machine -- # any scikit-learn machine will do. However, non-sklearn machines.. # That will be a bit trickier! (i.e. Phil's conv-nets) general_model = GridSearchCV(estimator=RF(n_estimators=30), param_grid=params, n_jobs=31, error_score=0, scoring=metric, cv=cv) # Train the model -- k-fold cross validation is embedded print "ML: Searching the hyperparameter space for values that "\ "optimize the %s."%metric trained_model = general_model.fit(train_features, train_labels) MLagent.model = trained_model # Test accuracy (metric of choice) on validation sample score = trained_model.score(valid_features, valid_labels) ratio = np.sum(train_labels==pos_label) / len(train_labels) MLagent.record_training(model_described_by= trained_model.best_estimator_, with_params=trained_model.best_params_, trained_on=len(train_features), with_ratio=ratio, at_time=time, with_train_score=trained_model.best_score_, and_valid_score=trained_model.score( valid_features, valid_labels)) valid_prob_thresh = trained_model.predict_proba(valid_features)[:,pos_label] fps, tps, thresh = mtrx.roc_curve(valid_labels,valid_prob_thresh, pos_label=pos_label) metric_list = compute_binary_metrics(fps, tps) ACC, TPR, FPR, FNR, TNR, PPV, FDR, FOR, NPV = metric_list MLagent.record_validation(accuracy=ACC, recall=TPR, precision=PPV, false_pos=FPR, completeness_f=TNR, contamination_f=NPV) #MLagent.plot_ROC() # --------------------------------------------------------------- # IF TRAINED MACHINE PREDICTS WELL ON VALIDATION .... # --------------------------------------------------------------- if MLagent.is_trained(metric) or MLagent.trained: print "ML: %s has successfully trained and will be applied "\ "to the test sample."%Name # Retrieve the test sample test_sample = storage.fetch_subsample(sample_type='test', class_label='GZ2_raw_combo') """ Notes on test sample: The test sample will, in real life, be those subjects for which we don't have an answer a priori. However, for now, this sample is how we will judge, in part, the performance of the overall method. As such, we only include those subjects which have GZ2 labels in the Main Sample. """ try: test_meta, test_features = ml.extract_features(test_sample, keys=['M20_corr', 'C_corr', 'E', 'A_corr', 'G_corr']) except: print "ML: there are no subjects with the label 'test'!" print "ML: Either there is nothing more to do or there is a BIG mistake..." 
else: print "ML: found test sample of %i subjects"%len(test_meta) #----------------------------------------------------------- # APPLY MACHINE TO TEST SAMPLE #----------------------------------------------------------- predictions = MLagent.model.predict(test_features) probabilities = MLagent.model.predict_proba(test_features)[:,pos_label] print "ML: %s has finished predicting labels for the test "\ "sample."%Name print "ML: Generating performance report on the test sample:" test_labels = test_meta['GZ2_raw_combo'].filled() print mtrx.classification_report(test_labels, predictions) test_accuracy = mtrx.accuracy_score(test_labels,predictions) test_precision = mtrx.precision_score(test_labels,predictions,pos_label=pos_label) test_recall = mtrx.recall_score(test_labels,predictions,pos_label=pos_label) MLagent.record_evaluation(accuracy_score=test_accuracy, precision_score=test_precision, recall_score=test_recall, at_time=time) # ---------------------------------------------------------- # Save the predictions and probabilities to a new pickle test_meta['predictions'] = predictions test_meta['machine_probability'] = probabilities # If is hasn't been done already, save the current directory # --------------------------------------------------------------------- tonights.parameters['trunk'] = survey+'_'+tonights.parameters['start'] # This is the standard directory... #tonights.parameters['dir'] = os.getcwd()+'/'+tonights.parameters['trunk'] # This is to put files into the sims_Machine/... directory. tonights.parameters['dir'] = os.getcwd() filename=tonights.parameters['dir']+'/'+tonights.parameters['trunk']+'_'+Name+'.fits' test_meta.write(filename) count=0 noSWAP=0 for sub, pred, prob in zip(test_meta, predictions, probabilities): # IF MACHINE P >= THRESHOLD, INSERT INTO SWAP COLLECTION # -------------------------------------------------------- if (prob >= threshold) or (1-prob >= threshold): # Flip the set label in the metadata file -- # don't want to use this as a training sample! 
idx = np.where(subjects['asset_id'] == sub['asset_id']) storage.subjects['MLsample'][idx] = 'mclass' storage.subjects['retired_date'][idx] = time count+=1 print "MC: Machine classifed {0} subjects with >= 90% confidence".format(count) print "ML: Of those, {0} had never been seen by SWAP".format(noSWAP) tonights.parameters['trunk'] = survey+'_'+tonights.parameters['start'] tonights.parameters['dir'] = os.getcwd() if not os.path.exists(tonights.parameters['dir']): os.makedirs(tonights.parameters['dir']) # Repickle all the shits # ----------------------------------------------------------------------- if tonights.parameters['repickle']: #new_samplefile = swap.get_new_filename(tonights.parameters,'collection') #print "ML: saving SWAP subjects to "+new_samplefile #swap.write_pickle(sample, new_samplefile) #tonights.parameters['samplefile'] = new_samplefile new_bureaufile=swap.get_new_filename(tonights.parameters,'bureau','ML') print "ML: saving MLbureau to "+new_bureaufile swap.write_pickle(MLbureau, new_bureaufile) tonights.parameters['MLbureaufile'] = new_bureaufile metadatafile = swap.get_new_filename(tonights.parameters,'metadata') print "ML: saving metadata to "+metadatafile swap.write_pickle(storage, metadatafile) tonights.parameters['metadatafile'] = metadatafile # UPDATE CONFIG FILE with pickle filenames, dir/trunk, and (maybe) new day # ---------------------------------------------------------------------- configfile = config.replace('startup','update') # Random_file needs updating, else we always start from the same random # state when update.config is reread! random_file = open(tonights.parameters['random_file'],"w"); random_state = np.random.get_state(); cPickle.dump(random_state,random_file); random_file.close(); swap.write_config(configfile, tonights.parameters) return
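# ----------------------------------------------------------------------
# Hedged sketch (an assumption, not the project's actual helper):
# compute_binary_metrics() is defined elsewhere in the codebase. One
# plausible implementation, assuming it receives cumulative false-positive
# and true-positive counts (as returned by sklearn's private
# _binary_clf_curve, which a later variant of this function calls
# directly, and assuming both classes appear in the validation labels):

import numpy as np

def _example_compute_binary_metrics(fps, tps):
    # Confusion-matrix rates at every threshold along the ROC curve.
    fps = np.asarray(fps, dtype=float)
    tps = np.asarray(tps, dtype=float)
    P = tps[-1]                 # total positives in the validation sample
    N = fps[-1]                 # total negatives
    fns = P - tps               # positives not yet recovered at this threshold
    tns = N - fps               # negatives correctly left alone
    ACC = (tps + tns) / (P + N)
    TPR = tps / P               # recall / completeness
    FPR = fps / N
    FNR = fns / P
    TNR = tns / N
    PPV = tps / np.maximum(tps + fps, 1)   # precision, guarding against 0/0
    FDR = 1.0 - PPV
    NPV = tns / np.maximum(tns + fns, 1)
    FOR = 1.0 - NPV
    return ACC, TPR, FPR, FNR, TNR, PPV, FDR, FOR, NPV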
collection2_path = args[3] print "make_roc_curves: illustrating behaviour captured in bureau and collection files: " print "make_roc_curves: ",bureau1_path print "make_roc_curves: ",bureau2_path print "make_roc_curves: ",collection1_path print "make_roc_curves: ",collection2_path else: print make_roc_curves.__doc__ return output_directory = './' # ------------------------------------------------------------------ # Read in bureau and collection objects: bureau1 = swap.read_pickle(bureau1_path, 'bureau') collection1 = swap.read_pickle(collection1_path, 'collection') bureau2 = swap.read_pickle(bureau2_path, 'bureau') collection2 = swap.read_pickle(collection2_path, 'collection') print "make_roc_curves: stage 1, 2 agent numbers: ",len(bureau1.list()), len(bureau2.list()) print "make_roc_curves: stage 1, 2 subject numbers: ",len(collection1.list()), len(collection2.list()) # ------------------------------------------------------------------ # set up data for roc plots n_min = 1 fprs = []
def MachineClassifier(options, args): """ NAME MachineClassifier.py PURPOSE Machine learning component of Galaxy Zoo Express Read in a training sample generated by human users (which have preferentially been analyzed by SWAP). Learn on the training sample and moniter progress. Once "fully trained", apply learned model to test sample. COMMENTS Lots I'm sure. FLAGS -h Print this message -c config file name """ # Check for setup file in array args: if (len(args) >= 1) or (options.configfile): if args: config = args[0] elif options.configfile: config = options.configfile print swap.doubledashedline print swap.ML_hello print swap.doubledashedline print "ML: taking instructions from",config else: print MachineClassifier.__doc__ return tonights = swap.Configuration(config) # Read the pickled random state file random_file = open(tonights.parameters['random_file'],"r"); random_state = cPickle.load(random_file); random_file.close(); np.random.set_state(random_state); time = tonights.parameters['start'] print time # Get the machine threshold (make retirement decisions) threshold = tonights.parameters['machine_threshold'] prior = tonights.parameters['prior'] # Get list of evaluation metrics and criteria eval_metrics = tonights.parameters['evaluation_metrics'] # How much cross-validation should we do? cv = tonights.parameters['cross_validation'] survey = tonights.parameters['survey'] #---------------------------------------------------------------------- # read in the metadata for all subjects (Test or Training sample?) storage = swap.read_pickle(tonights.parameters['metadatafile'], 'metadata') subjects = storage.subjects #---------------------------------------------------------------------- # read in the SWAP collection sample = swap.read_pickle(tonights.parameters['samplefile'],'collection') #---------------------------------------------------------------------- # read in or create the ML collection MLsample = swap.read_pickle(tonights.parameters['MLsamplefile'], 'MLcollection') # read in or create the ML bureau for machine agents (history) MLbureau = swap.read_pickle(tonights.parameters['MLbureaufile'],'bureau') #if not tonights.parameters['MLbureaufile']: # MLbureaufile = swap.get_new_filename(tonights.parameters,'bureau','ML') # tonights.parameters['MLbureaufile'] = MLbureaufile #MLbureau = swap.read_pickle(tonights.parameters['MLbureaufile'],'bureau') #----------------------------------------------------------------------- # SELECT TRAINING & VALIDATION SAMPLES #----------------------------------------------------------------------- # TO DO: training sample should only select those which are NOT part of # validation sample (Nair catalog objects) 2/22/16 train_sample = storage.fetch_subsample(sample_type='train', class_label='GZ2_label') """ Notes about the training sample: # this will select only those which have my morphology measured for them # AND which have a true "answer" according to GZ2 # Eventually we could open this up to include the ~10k that aren't in the # GZ Main Sample but I think, for now, we should reduce ourselves to this # stricter sample so that we always have back-up "truth" for each galaxy. """ try: train_meta, train_features = ml.extract_features(train_sample) original_length = len(train_meta) except TypeError: print "ML: can't extract features from subsample." print "ML: Exiting MachineClassifier.py" sys.exit() else: # TODO: consider making this part of SWAP's duties? 
# 5/18/16: Only use those subjects which are no longer on the prior off_the_fence = np.where(train_meta['SWAP_prob']!=prior) train_meta = train_meta[off_the_fence] train_features = train_features[off_the_fence] train_labels = np.array([1 if p > prior else 0 for p in train_meta['SWAP_prob']]) #train_labels = train_meta['Nair_label'].filled() shortened_length = len(train_meta) print "ML: found a training sample of %i subjects"%shortened_length removed = original_length - shortened_length print "ML: %i subjects had prior probability and were removed"%removed valid_sample = storage.fetch_subsample(sample_type='valid', class_label='Expert_label') try: valid_meta, valid_features = ml.extract_features(valid_sample) except: print "ML: there are no subjects with the label 'valid'!" else: valid_labels = valid_meta['Expert_label'].filled() print "ML: found a validation sample of %i subjects"%len(valid_meta) # --------------------------------------------------------------------- # Require a minimum size training sample [Be reasonable, my good man!] # --------------------------------------------------------------------- if len(train_sample) < 10000: print "ML: training sample is too small to be worth anything." print "ML: Exiting MachineClassifier.py" sys.exit() else: print "ML: training sample is large enough to give it a shot." # TODO: LOOP THROUGH DIFFERENT MACHINES? # 5/12/16 -- no... need to make THIS a class and create multiple # instances? Each one can be passed an instance of a machine? # Machine can be trained to maximize/minimize different metrics # (ACC, completeness, purity, etc. Have a list of acceptable ones.) # Minimize a Loss function (KNC doesn't have a loss fcn). for metric in eval_metrics: # REGISTER Machine Classifier # Construct machine name --> Machine+Metric? For now: KNC machine = 'KNC' machine = 'RF' Name = machine+'_'+metric # register an Agent for this Machine # This "Agent" doesn't behave like a SW agent... at least not yet try: test = MLbureau.member[Name] except: MLbureau.member[Name] = swap.Agent_ML(Name, metric) MLagent = MLbureau.member[Name] #--------------------------------------------------------------- # TRAIN THE MACHINE; EVALUATE ON VALIDATION SAMPLE #--------------------------------------------------------------- # Now we run the machine -- need cross validation on whatever size # training sample we have .. # Fixed until we build in other machine options # Need to dynamically determine appropriate parameters... #max_neighbors = get_max_neighbors(train_features, cv) #n_neighbors = np.arange(1, (cv-1)*max_neighbors/cv, 5, dtype=int) #params = {'n_neighbors':n_neighbors, # 'weights':('uniform','distance')} num_features = train_features.shape[1] min_features = int(round(np.sqrt(num_features))) params = {'max_features':np.arange(min_features, num_features+1), 'max_depth':np.arange(2,16)} # Create the model # for "estimator=XXX" all you need is an instance of a machine -- # any scikit-learn machine will do. However, non-sklearn machines.. # That will be a bit trickier! (i.e. 
Phil's conv-nets) general_model = GridSearchCV(estimator=RF(n_estimators=30), param_grid=params, n_jobs=-1, error_score=0, scoring=metric, cv=cv) # Train the model -- k-fold cross validation is embedded print "ML: Searching the hyperparameter space for values that "\ "optimize the %s."%metric trained_model = general_model.fit(train_features, train_labels) MLagent.model = trained_model # Test "accuracy" (metric of choice) on validation sample score = trained_model.score(valid_features, valid_labels) ratio = np.sum(train_labels==1) / len(train_labels) MLagent.record_training(model_described_by= trained_model.best_estimator_, with_params=trained_model.best_params_, trained_on=len(train_features), with_ratio=ratio, at_time=time, with_train_score=trained_model.best_score_, and_valid_score=trained_model.score( valid_features, valid_labels)) fps, tps, thresh = mtrx.roc_curve(valid_labels, trained_model.predict_proba(valid_features)[:,1]) metric_list = compute_binary_metrics(fps, tps) ACC, TPR, FPR, FNR, TNR, PPV, FDR, FOR, NPV = metric_list MLagent.record_validation(accuracy=ACC, recall=TPR, precision=PPV, false_pos=FPR, completeness_f=TNR, contamination_f=NPV) #MLagent.plot_ROC() # --------------------------------------------------------------- # IF TRAINED MACHINE PREDICTS WELL ON VALIDATION .... # --------------------------------------------------------------- if MLagent.is_trained(metric): print "ML: %s has successfully trained and will be applied "\ "to the test sample." # Retrieve the test sample test_sample = storage.fetch_subsample(sample_type='test', class_label='GZ2_label') """ Notes on test sample: The test sample will, in real life, be those subjects for which we don't have an answer a priori. However, for now, this sample is how we will judge, in part, the performance of the overall method. As such, we only include those subjects which have GZ2 labels in the Main Sample. """ try: test_meta, test_features = ml.extract_features(test_sample) except: print "ML: there are no subjects with the label 'test'!" print "ML: which means there's nothing more to do!" else: print "ML: found test sample of %i subjects"%len(test_meta) #----------------------------------------------------------- # APPLY MACHINE TO TEST SAMPLE #----------------------------------------------------------- predictions = MLagent.model.predict(test_features) probabilities = MLagent.model.predict_proba(test_features) print "ML: %s has finished predicting labels for the test "\ "sample."%Name print "ML: Generating performance report on the test sample:" test_labels = test_meta['GZ2_label'].filled() print mtrx.classification_report(test_labels, predictions) test_accuracy=mtrx.accuracy_score(test_labels,predictions) test_precision=mtrx.precision_score(test_labels,predictions) test_recall=mtrx.recall_score(test_labels,predictions) MLagent.record_evaluation(accuracy_score=test_accuracy, precision_score=test_precision, recall_score=test_recall, at_time=time) #pdb.set_trace() # ---------------------------------------------------------- # Save the predictions and probabilities to a new pickle test_meta['predictions'] = predictions test_meta['probability_of_smooth'] = probabilities[:,1] filename=tonights.parameters['trunk']+'_'+Name+'.pickle' swap.write_pickle(test_meta, filename) """ for thing, pred, p in zip(test_meta, predictions, probabitilies): # IF MACHINE P >= THRESHOLD, INSERT INTO SWAP COLLECTION # -------------------------------------------------------- if (p >= threshold) or (1-p >= threshold): print "BOOM! 
WE'VE GOT A MACHINE-CLASSIFIED SUBJECT:" print "Probability:", p # Initialize the subject in SWAP Collection ID = thing['asset_id'] sample.member[ID] = swap.Subject(ID, str(s['SDSS_id']), location=s['external_ref']) sample.member[ID].retiredby = 'machine' # Flag subject as 'INACTIVE' / 'DETECTED' / 'REJECTED' # ---------------------------------------------------------- if p >= threshold: sample.member[str(s['id'])].state = 'inactive' elif 1-p >= threshold: sample.member[str(s['id'])].status = 'rejected' #""" # If is hasn't been done already, save the current directory # --------------------------------------------------------------------- tonights.parameters['dir'] = os.getcwd()+'/'+tonights.parameters['trunk'] if not os.path.exists(tonights.parameters['dir']): os.makedirs(tonights.parameters['dir']) # Repickle all the shits # ----------------------------------------------------------------------- if tonights.parameters['repickle']: new_samplefile = swap.get_new_filename(tonights.parameters,'collection') print "ML: saving SWAP subjects to "+new_samplefile swap.write_pickle(sample, new_samplefile) tonights.parameters['samplefile'] = new_samplefile new_samplefile=swap.get_new_filename(tonights.parameters,'MLcollection') print "ML: saving test sample subjects to "+new_samplefile swap.write_pickle(MLsample,new_samplefile) tonights.parameters['MLsamplefile'] = new_samplefile new_bureaufile=swap.get_new_filename(tonights.parameters,'bureau','ML') print "ML: saving MLbureau to "+new_bureaufile swap.write_pickle(MLbureau, new_bureaufile) tonights.parameters['MLbureaufile'] = new_bureaufile metadatafile = swap.get_new_filename(tonights.parameters,'metadata') print "ML: saving metadata to "+metadatafile swap.write_pickle(storage, metadatafile) tonights.parameters['metadatafile'] = metadatafile # UPDATE CONFIG FILE with pickle filenames, dir/trunk, and (maybe) new day # ---------------------------------------------------------------------- configfile = config.replace('startup','update') # Random_file needs updating, else we always start from the same random # state when update.config is reread! random_file = open(tonights.parameters['random_file'],"w"); random_state = np.random.get_state(); cPickle.dump(random_state,random_file); random_file.close(); swap.write_config(configfile, tonights.parameters) return
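# ----------------------------------------------------------------------
# Illustrative sketch (not original code): the training-label logic used
# above, pulled out into a standalone helper. Subjects whose SWAP posterior
# still sits exactly at the prior carry no information and are dropped;
# the rest are labelled by which side of the prior they fall on. The
# default prior value here is only a placeholder.

import numpy as np

def _example_labels_from_swap(swap_prob, prior=0.5, pos_label=1):
    swap_prob = np.asarray(swap_prob, dtype=float)
    off_the_fence = swap_prob != prior     # drop subjects stuck at the prior
    labels = np.where(swap_prob[off_the_fence] > prior,
                      pos_label, 1 - pos_label)
    return off_the_fence, labels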
def MachineClassifier(options, args): try: config = options.configfile except: pdb.set_trace() tonights = swap.Configuration(config) #""" # Read the pickled random state file random_file = open(tonights.parameters['random_file'],"r"); random_state = cPickle.load(random_file); random_file.close(); np.random.set_state(random_state); #""" # Get the machine threshold (make retirement decisions) threshold = tonights.parameters['machine_threshold'] # Get list of evaluation metrics and criteria eval_metrics = tonights.parameters['evaluation_metrics'] survey = tonights.parameters['survey'] subdir = 'sup_run4' #---------------------------------------------------------------------- # read in the metadata for all subjects (Test or Training sample?) subjects = swap.read_pickle(tonights.parameters['metadatafile'], 'metadata') #---------------------------------------------------------------------- # read in the SWAP collection sample = swap.read_pickle(tonights.parameters['samplefile'],'collection') #---------------------------------------------------------------------- # read in or create the ML collection MLsample = swap.read_pickle(tonights.parameters['MLsamplefile'], 'MLcollection') # read in or create the ML bureau for machine agents (history) MLbureau = swap.read_pickle(tonights.parameters['MLbureaufile'], 'MLbureau') #----------------------------------------------------------------------- # DETERMINE IF THERE IS A TRAINING SAMPLE TO WORK WITH #----------------------------------------------------------------------- # TO DO: training sample should only select those which are NOT part of # validation sample (Nair catalog objects) 2/22/16 # IDENTIFY TRAINING SAMPLE train_sample = subjects[subjects['MLsample']=='train'] train_meta, train_features = ml.extract_training(train_sample) train_labels = np.array([1 if p > 0.3 else 0 \ for p in train_meta['SWAP_prob']]) # IDENTIFY VALIDATION SAMPLE (FINAL) valid_sample = subjects[subjects['MLsample']=='valid'] valid_meta, valid_features = ml.extract_training(valid_sample) valid_labels = valid_meta['Expert_label'].filled() #if len(train_sample) >= 100: # TO DO: LOOP THROUGH DIFFERENT MACHINES? HOW MANY MACHINES? for metric in eval_metrics: # REGISTER Machine Classifier # Construct machine name --> Machine+Metric? For now: KNC machine = 'KNC' Name = machine+'_'+metric # register an Agent for this Machine try: test = MLbureau.member[Name] except: MLbureau.member[Name] = swap.Agent_ML(Name, metric) #--------------------------------------------------------------- # TRAIN THE MACHINE; EVALUATE ON VALIDATION SAMPLE #--------------------------------------------------------------- # Now we run the machine -- need cross validation on whatever size # training sample we have .. 
# For now this will be fixed until we build in other machine options params = {'n_neighbors':np.arange(1, 2*(len(train_sample)-1) / 3, 2), 'weights':('uniform','distance')} # Create the model general_model = GridSearchCV(estimator=KNC(), param_grid=params, error_score=0, scoring=metric) # Train the model -- k-fold cross validation is embedded trained_model = general_model.fit(train_features, train_labels) # Test "accuracy" (metric of choice) on validation sample score = trained_model.score(valid_features, valid_labels) MLbureau.member[Name].record_training(\ model_described_by=trained_model.best_estimator_, with_params=trained_model.best_params_, trained_on=len(train_features), at_time=TIME, with_train_acc=traineed_model.best_score_, and_valid_acc=trained_model.score(valid_features, valid_labels)) # Store the trained machine MLbureau.member[Name].model = trained_model # Compute / store confusion matrix as a function of threshold # produced by this machine on the Expert Validation sample fps, tps, thresh = mtrx._binary_clf_curve(valid_labels, trained_model.predict_proba(valid_features)[:,1]) metric_list = mtrx.compute_binary_metrics(fps, tps) ACC, TPR, FPR, FNR, TNR, PPV, FDR, FOR, NPV = metric_list MLbureau.member[Name].record_evaluation(accuracy=ACC, completeness_s=TPR, contamination_s=FDR, completeness_f=TNR, contamination_f=NPV) pdb.set_trace() # 3. compare the metric of choice with the evaluation criterion to # see if this machine has sufficiently learned? # ... what if my criterion is simply "Maximize Accuracy"? # ... or minimize feature contamination? these require that we # compare tonight's machine with the previous night's machine # But if my criterion is simply "have feature contam less than 20%" # then it's easy.... # IF TRAINED MACHINE PREDICTS WELL ON VALIDATION .... if MLbureau.member[Name].evaluate(): #--------------------------------------------------------------- # APPLY MACHINE TO TEST SAMPLE #--------------------------------------------------------------- # This requires that my runKNC function returns the Machine Object shitski=5 #--------------------------------------------------------------- # PROCESS PREDICTIONS/PROBS #--------------------------------------------------------------- for s,p,l in zip(test_meta, probas, predictions): ID = str(s['id']) descriptions = Nair_or_Not(s) category, kind, flavor, truth = descriptions # LOAD EACH TEST SUBJECT INTO MACHINE COLLECTION # ------------------------------------------------------------- try: test = MLsample.member[ID] except: MLsample.member[ID] = swap.Subject_ML(ID, str(s['name']), category, kind, truth,threshold,s['external_ref']) tstring = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') MLsample.member[ID].was_described(by='knn', as_being=1, withp=p, at_time=tstring) # NOTE: if subject is Nair (training) it doesn't get flagged as # inactive but it can be flagged as detected/rejected # IF MACHINE P >= THRESHOLD, INSERT INTO SWAP COLLECTION # ------------------------------------------------------------- thresholds = {'detection':0.,'rejection':0.} if (p >= threshold) or (1-p >= threshold): print "BOOM! WE'VE GOT A MACHINE-CLASSIFIED SUBJECT:" print "Probability:",p # Initialize the subject in SWAP Collection sample.member[ID] = swap.Subject(ID, str(s['name']), category, kind,flavor,truth, thresholds, s['external_ref'],0.) 
sample.member[ID].retiredby = 'machine' # Flag subject as 'INACTIVE' / 'DETECTED' / 'REJECTED' # ---------------------------------------------------------- if p >= threshold: sample.member[str(s['id'])].state = 'inactive' elif 1-p >= threshold: sample.member[str(s['id'])].status = 'rejected' #--------------------------------------------------------------- # SAVE MACHINE METADATA? #--------------------------------------------------------------- print "Size of SWAP sample:", sample.size() print "Size of ML sample:", MLsample.size() if tonights.parameters['report']: # Output list of subjects to retire, based on this batch of # classifications. Note that what is needed here is the ZooID, # not the subject ID: new_retirementfile = swap.get_new_filename(tonights.parameters,\ 'retire_these', source='ML') print "SWAP: saving Machine-retired subject Zooniverse IDs..." N = swap.write_list(MLsample,new_retirementfile, item='retired_subject', source='ML') print "SWAP: "+str(N)+" lines written to "+new_retirementfile # write catalogs of smooth/not over MLthreshold # ------------------------------------------------------------- catalog = swap.get_new_filename(tonights.parameters, 'retired_catalog', source='ML') print "SWAP: saving catalog of Machine-retired subjects..." Nretired, Nsubjects = swap.write_catalog(MLsample,bureau, catalog, threshold, kind='rejected', source='ML') print "SWAP: From "+str(Nsubjects)+" subjects classified," print "SWAP: "+str(Nretired)+" retired (with P < rejection) "\ "written to "+catalog catalog = swap.get_new_filename(tonights.parameters, 'detected_catalog', source='ML') print "SWAP: saving catalog of Machine detected subjects..." Ndetected, Nsubjects = swap.write_catalog(MLsample, bureau, catalog, threshold, kind='detected', source='ML') print "SWAP: From "+str(Nsubjects)+" subjects classified," print "SWAP: %i detected (with P > MLthreshold) "\ "written to %s"%(Ndetected, catalog) # If is hasn't been done already, save the current directory # --------------------------------------------------------------------- tonights.parameters['dir'] = os.getcwd()+'/'+tonights.parameters['trunk'] if not os.path.exists(tonights.parameters['dir']): os.makedirs(tonights.parameters['dir']) # Repickle all the shits # ----------------------------------------------------------------------- if tonights.parameters['repickle']: new_samplefile = swap.get_new_filename(tonights.parameters,'collection') print "SWAP: saving SWAP subjects to "+new_samplefile swap.write_pickle(sample,new_samplefile) tonights.parameters['samplefile'] = new_samplefile new_samplefile=swap.get_new_filename(tonights.parameters,'MLcollection') print "SWAP: saving test sample subjects to "+new_samplefile swap.write_pickle(MLsample,new_samplefile) tonights.parameters['MLsamplefile'] = new_samplefile metadatafile = swap.get_new_filename(tonights.parameters,'metadata') print "SWAP: saving metadata to "+metadatafile swap.write_pickle(subjects,metadatafile) tonights.parameters['metadatafile'] = metadatafile # Update the time increment for SWAP's next run # ----------------------------------------------------------------------- t2 = datetime.datetime.strptime(tonights.parameters['start'], '%Y-%m-%d_%H:%M:%S') + \ datetime.timedelta(days=tonights.parameters['increment']) tstop = datetime.datetime.strptime(tonights.parameters['end'], '%Y-%m-%d_%H:%M:%S') if t2 == tstop: plots = True else: tonights.parameters['start'] = t2.strftime('%Y-%m-%d_%H:%M:%S') # Update configfile to reflect Machine additions # 
----------------------------------------------------------------------- configfile = 'update.config' random_file = open(tonights.parameters['random_file'],"w"); random_state = np.random.get_state(); cPickle.dump(random_state,random_file); random_file.close(); swap.write_config(configfile, tonights.parameters) pdb.set_trace()
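# ----------------------------------------------------------------------
# Illustrative sketch (not original code) of the random-state round trip
# performed at the top and bottom of each run above: the numpy RNG state
# is pickled to disk so the next invocation resumes the same random
# sequence instead of restarting it. The filename is a placeholder.

import cPickle
import numpy as np

def _example_save_random_state(path='random_state.pickle'):
    state_file = open(path, 'w')
    cPickle.dump(np.random.get_state(), state_file)
    state_file.close()

def _example_restore_random_state(path='random_state.pickle'):
    state_file = open(path, 'r')
    np.random.set_state(cPickle.load(state_file))
    state_file.close()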
def bureau(self):
    # Lazily read the bureau pickle on first access and cache it.
    if self._bureau is None:
        self._bureau = swap.read_pickle(self.parameters['bureaufile'],
                                        'bureau')
    return self._bureau
# Check for pickles in array args: if len(args) == 1: collectionfile = args[0] print "make_trajectory_plots: illustrating subject trajectories in: " print "make_trajectory_plots: ", collectionfile else: print make_trajectory_plots.__doc__ return output_directory = './' # ------------------------------------------------------------------ # Read in collection: sample = swap.read_pickle(collectionfile, 'collection') print "make_trajectory_plots: total no. of available subjects: ", len( sample.list()) if highlights: # Read in subjects to be highlighted: highlightIDs = swap.read_list(listfile) print highlightIDs print "make_trajectory_plots: total no. of special subjects: ", len( highlightIDs) print "make_trajectory_plots: special subjects: ", highlightIDs # ------------------------------------------------------------------ # Start plot: figure = sample.start_trajectory_plot(title=title,
def collection(self):
    if self._collection is None:
        self._collection = swap.read_pickle(self.parameters['samplefile'],
                                            'collection')
    return self._collection
def main(args): params = fetch_parameters(args.config) num_days = fetch_num_days(params) # --------------------------------------------------------------------- # Fetch lists of relevant filenames over the course of the run detectedfilelist = fetch_filelist(params, kind='detected') if args.old_run: rejectedfilelist = fetch_filelist(params, kind='retired') else: rejectedfilelist = fetch_filelist(params, kind='rejected') # --------------------------------------------------------------------- # Fetch the cumulative number of classified subjects from SWAP detected = fetch_number_of_subjects(detectedfilelist, kind='detected') rejected = fetch_number_of_subjects(rejectedfilelist, kind='rejected') GZX_retired_subjects = np.vstack([detected, rejected]) # --------------------------------------------------------------------- # Fetch the cumulative number of classified subjects from GZ2 GZ2_retired_subjects = fetch_num_retired_GZ2(num_days,expert=args.expert) # Generate appropriate output filename if args.combined_subjects: outname = args.config[len('update_'):-len('.config')]+'_combo' GZX_retired_subjects = np.sum(GZX_retired_subjects,axis=0) else: outname = args.config[len('update_'):-len('.config')] # --------------------------------------------------------------------- # Plot that shit plot_retired_GZ_vs_SWAP(GZX_retired_subjects, GZ2_retired_subjects, num_days, outfilename=outname) # --------------------------------------------------------------------- ### Generate evaluation report as a function of time if args.eval_report: try: eval_report = Table.read('GZXevaluation_%s.txt'%outname, format='ascii') recall = eval_report['recall'] accuracy = eval_report['accuracy'] precision = eval_report['precision'] except: meta = swap.read_pickle(params['metadatafile'], 'storage') subjects = meta.subjects accuracy, recall, precision = generate_SWAP_eval_report( detectedfilelist, rejectedfilelist, subjects) plot_GZX_evaluation(num_days, accuracy, precision, recall, outname)
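# ----------------------------------------------------------------------
# Hedged sketch (assumption): main() expects an object carrying .config,
# .old_run, .expert, .combined_subjects and .eval_report attributes. The
# real command-line wiring lives elsewhere in the repository; one minimal
# argparse setup that would satisfy main() is:

import argparse

def _example_parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('config',
                        help='update_*.config file describing the run')
    parser.add_argument('--old_run', action='store_true',
                        help="older runs label rejected files as 'retired'")
    parser.add_argument('--expert', action='store_true',
                        help='compare against expert-classified GZ2 subjects')
    parser.add_argument('--combined_subjects', action='store_true',
                        help='sum detected and rejected counts before plotting')
    parser.add_argument('--eval_report', action='store_true',
                        help='also plot accuracy/precision/recall vs time')
    return parser.parse_args()

# if __name__ == '__main__':
#     main(_example_parse_args())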
def subjectMetadata(self):
    if self._subjectMetadata is None:
        self._subjectMetadata = swap.read_pickle(
            self.parameters['metadatafile'], 'metadata').subjects
    return self._subjectMetadata
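# ----------------------------------------------------------------------
# Illustrative sketch (not original code): the three accessors above all
# follow the same lazy-loading pattern -- read a pickle the first time it
# is requested, cache it on the instance, and return the cache afterwards.
# A minimal standalone version of that pattern, with a hypothetical class
# name and exposure via @property, looks like this:

import swap

class _ExampleLazyPickles(object):
    def __init__(self, parameters):
        self.parameters = parameters
        self._bureau = None

    @property
    def bureau(self):
        if self._bureau is None:
            self._bureau = swap.read_pickle(self.parameters['bureaufile'],
                                            'bureau')
        return self._bureau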
def make_lens_catalog(args): """ NAME make_lens_catalog PURPOSE Given location of collection pickle, this script produces a set of annotated images of lenses (heatmaps for lens locations, markers for where clicks were, etc). COMMENTS You have to download the file so it chooses whever your output directory is to also download the raw images. This should be pretty customizable. FLAGS -h Print this message --skill Weight by skill INPUTS collection.pickle OUTPUTS lens.dat Assumed format: ID kind x y Prob N0 Skill Dist Here: ID = Space Warps subject ID kind = Space Warps subject type (sim, dud, test) x,y = object (cluster) centroid, in pixels P = Space Warps subject probability N0 = number of markers in the cluster S = total skill per cluster, summed over markers D = biggest distance within cluster EXAMPLE BUGS AUTHORS This file is part of the Space Warps project, and is distributed under the GPL v2 by the Space Warps Science Team. http://spacewarps.org/ HISTORY 2013-07-16 started Davis (KIPAC) """ # ------------------------------------------------------------------ # Some defaults: flags = {'skill': False, 'output_directory': './', 'output_name': 'catalog.dat', 'image_y_size': 440, 'catalog_path': '', 'update_collection': '',} # ------------------------------------------------------------------ # Read in options: # this has to be easier to do... for arg in args: if arg in flags: flags[arg] = args[arg] elif arg == 'collection_path': collection_path = args[arg] else: print "make_lens_atlas: unrecognized flag ",arg print "make_lens_catalog: illustrating behaviour captured in collection file: " print "make_lens_catalog: ",collection_path memory = joblib.Memory(cachedir=flags['output_directory']) memory.clear() catalog_path = flags['output_directory'] + flags['output_name'] if len(flags['output_name']) > 0: F = open(catalog_path, 'w') F.write('id,kind,x,y,prob,n0,skill,dist\n') # ------------------------------------------------------------------ # Read in files: collection = swap.read_pickle(collection_path, 'collection') ID_list = collection.list() print "make_lens_catalog: collection numbers ", len(ID_list) if flags['catalog_path'] != '': print "make_lens_catalog: filtering from catalog ",flags['catalog_path'] catalog_in = csv2rec(flags['catalog_path']) ID_list = np.unique(catalog_in['id']) # ------------------------------------------------------------------ # Run through data: catalog = {} for ID in ID_list: subject = collection.member[ID] kind = subject.kind P = subject.mean_probability itwas = subject.annotationhistory['ItWas'] x_all = subject.annotationhistory['At_X'] y_all = subject.annotationhistory['At_Y'] x_markers = np.array([xi for xj in x_all for xi in xj]) y_markers = np.array([yi for yj in y_all for yi in yj]) catalog.update({ID: {'agents_reject': [], 'x': x_markers, 'y': y_markers,}}) PL_all = subject.annotationhistory['PL'] PD_all = subject.annotationhistory['PD'] # filter out the empty clicks PL_list = [] PL_nots = [] for i, xj in enumerate(x_all): # len(xj) of empty = 0 PL_list.append([PL_all[i]] * len(xj)) if len(xj) == 0: PL_nots.append(PL_all[i]) PL = np.array([PLi for PLj in PL_list for PLi in PLj]) PL_nots = np.array(PL_nots) # filter out the empty clicks PD_list = [] PD_nots = [] for i, xj in enumerate(x_all): PD_list.append([PD_all[i]] * len(xj)) if len(xj) == 0: PD_nots.append(PD_all[i]) catalog[ID]['agents_reject'].append(i) PD = np.array([PDi for PDj in PD_list for PDi in PDj]) PD_nots = np.array(PD_nots) skill = swap.expectedInformationGain(0.5, PL, PD) # skill # it is 
only fair to write out the NOTs, too # do the empty guys skill_nots = swap.expectedInformationGain(0.5, PL_nots, PD_nots) # skill x, y = -1, -1 N0 = len(skill_nots) S = np.sum(skill_nots) D = 0 ## catalog.append((ID, kind, x, y, P, N0, S, D)) if len(catalog)%500 == 0: print len(catalog) if len(flags['output_name']) > 0: F.write('{0},{1},{2},{3},{4},{5},{6},{7}\n'.format( ID, kind, x, y, P, N0, S, D)) if len(x_markers) == 0: # apparently everyone was a not... catalog[ID]['agents_labels'] = np.array([]) continue # ------------------------------------------------------------------ # cluster print 'make_lens_catalog: subject ID = ', ID if flags['skill']: cluster_centers, cluster_center_labels, cluster_labels, \ n_clusters, dist_within = outlier_clusters(x_markers, y_markers, skill, memory=memory) else: cluster_centers, cluster_center_labels, cluster_labels, \ n_clusters, dist_within = outlier_clusters(x_markers, y_markers, None, memory=memory) # need to get: x, y, N0, S catalog[ID]['agents_labels'] = cluster_labels for cluster_center_label in cluster_center_labels: cluster_center = cluster_centers[cluster_center_label] members = (cluster_labels == cluster_center_label) x, y = cluster_center # convert y to catalog convention y = flags['image_y_size'] - y N0 = np.sum(members) S = np.sum(skill[members]) D = dist_within[cluster_center_label] if cluster_center_label == -1: # outlier cluster # so really every point is its own cluster... D = 0 ## catalog.append((ID, kind, x, y, P, N0, S, D)) ## if len(catalog)%500 == 0: ## print len(catalog) # TODO: make some requirement to be included (exclude outliers) if len(flags['output_name']) > 0: F.write('{0},{1},{2},{3},{4},{5},{6},{7}\n'.format( ID, kind, x, y, P, N0, S, D)) print 'make_lens_catalog: Clearing memory' # clear memory memory.clear() if len(flags['output_name']) > 0: print 'make_lens_catalog: closing file!' F.close() if len(flags['update_collection']) > 0: print 'make_lens_catalog: writing updated collection to', flags['update_collection'] # TODO: get the other params correct!!!! collection_fat = swap.collection.Collection() for ID in catalog: subject = collection.member[ID] atx = subject.annotationhistory['At_X'] labels_in = list(catalog[ID]['agents_labels']) labels_fat = [] for atx_i in atx: labels_fat.append([]) for atx_ij in atx_i: labels_fat[-1].append(labels_in.pop(0)) subject.annotationhistory.update({'labels': labels_fat}) collection_fat.member.update({ID: subject}) swap.write_pickle(collection_fat, flags['update_collection']) print 'make_lens_catalog: All done!' return catalog
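# ----------------------------------------------------------------------
# Illustrative sketch (not original code): the catalog.dat written above is
# plain CSV with the header 'id,kind,x,y,prob,n0,skill,dist', so it can be
# read back without the csv2rec helper used elsewhere, e.g. with numpy:

import numpy as np

def _example_read_lens_catalog(path='catalog.dat'):
    # names=True picks up the id,kind,x,y,prob,n0,skill,dist header row
    catalog = np.genfromtxt(path, delimiter=',', names=True, dtype=None)
    # e.g. keep only clusters with at least one marker and a real centroid
    # (x = -1 rows record the empty "NOT" clicks)
    return catalog[(catalog['n0'] > 0) & (catalog['x'] >= 0)]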
# How will we make decisions based on probability? thresholds = {} thresholds['detection'] = tonights.parameters['detection_threshold'] thresholds['rejection'] = tonights.parameters['rejection_threshold'] # will we perform machine learning after SWAP? try: machine = tonights.parameters['machine'] except: machine = False print "SWAP: running MachineClassifier.py after this run?",machine #pdb.set_trace() # ------------------------------------------------------------------ # Read in, or create, a bureau of agents who will represent the # volunteers: bureau = swap.read_pickle(tonights.parameters['bureaufile'],'bureau') # ------------------------------------------------------------------ # Read in, or create, an object representing the candidate list: sample = swap.read_pickle(tonights.parameters['samplefile'],'collection') # ------------------------------------------------------------------ # Read in metadata (all subjects for which we have morph params) subjects = swap.read_pickle(tonights.parameters['metadatafile'], 'metadata') # ------------------------------------------------------------------ # Open up database: db = swap.MySQLdb() # Read in a batch of classifications, made since the aforementioned # start time:
    # Check for pickles in array args:
    if len(args) == 2:
        bureau1_path = args[0]
        bureau2_path = args[1]
        print "make_info_plots: illustrating behaviour captured in bureau files: "
        print "make_info_plots: ", bureau1_path
        print "make_info_plots: ", bureau2_path
    else:
        print make_info_plots.__doc__
        return

    # Read in bureau objects:
    bureau1 = swap.read_pickle(bureau1_path, 'bureau')
    bureau2 = swap.read_pickle(bureau2_path, 'bureau')
    print "make_info_plots: stage 1, 2 agent numbers: ", \
        len(bureau1.list()), len(bureau2.list())

    experience1 = []
    effort1 = []
    final_skill1 = []
    final_PL1 = []
    final_PD1 = []
    information1 = []
    contribution1 = []

    experience2 = []
    effort2 = []
    final_skill2 = []
    final_PL2 = []
    if not path.exists(cluster_directory):
        makedirs(cluster_directory)
    if not path.exists(field_directory):
        makedirs(field_directory)

    # load up requisite files
    collection_path = args['collection']
    if '.pickle' in collection_path:
        # will need PJM's swap to interpret the spacewarps database
        try:
            import swap
        except:
            raise Exception('create_catalogs: Unable to import SpaceWarps analysis code!')
        #collection_path = base_collection_path + 'stage{0}'.format(stage) + '/CFHTLS_collection.pickle'
        collection_pickle = swap.read_pickle(collection_path, 'collection')
        stage = -1
        collection = convert_swap_collection_to_dataframe(collection_pickle, stage)
    elif '.csv' in collection_path:
        collection = pd.read_csv(collection_path)
    else:
        raise Exception('create_catalogs: No idea what kind of collection you are giving me here for {0}'.format(collection_path))

    if args['knownlens']:
        knownlens = pd.read_csv(args['knownlens'])
    else:
        knownlens = None

    # create the cluster args
    cluster_args = {'collection': collection,
                    'knownlens': knownlens,
def make_offline_reports(args):
    """
    NAME
        make_offline_reports

    PURPOSE
        Given an offline tuple as well as other bureau tuples etc, this
        script produces the reports made at the end of SWAP.

    COMMENTS

    FLAGS
        -h              Print this message
        --out           Output directory, otherwise is '.'
        --do_offline    Do offline analysis?

    INPUTS
        configfile      Plain text file containing SW experiment configuration
        bureaufile
        collectionfile

    OUTPUTS

    EXAMPLE

    BUGS

    AUTHORS
        This file is part of the Space Warps project, and is distributed
        under the MIT license by the Space Warps Science Team.
        http://spacewarps.org/

    HISTORY
        2014-09-16  started Davis (KIPAC)
    """
    # ------------------------------------------------------------------
    # Some defaults:

    # default settings are for offline using only exact training info
    flags = {'do_offline': False,
             'output_directory': '.',
             'PL0': 0.5,             # initial PL guess
             'PD0': 0.5,             # initial PD guess
             'pi': 4e-2,             # initial lens probability
             'n_min_assessment': 0,  # minimum number of assessments before
                                     # included in analysis
             'use_training_info': True,
             'exclude_test_info': True,
             'exclude_training_info': False,
             'N_min': 10,            # min number of EM steps required
             'N_max': 100,           # max number of EM steps
             'epsilon_min': 1e-6,    # escape condition
             }

    # this has to be easier to do...
    for arg in args:
        if arg in flags:
            flags[arg] = args[arg]
        elif arg == 'config':
            configfile = args[arg]
        elif arg == 'collection':
            collectionfile = args[arg]
        elif arg == 'bureau':
            bureaufile = args[arg]
        else:
            print "make_offline_reports: unrecognized flag ", arg

    out_dir = flags['output_directory']

    # ------------------------------------------------------------------
    # Read in run configuration:
    tonights = swap.Configuration(configfile)
    # TODO: do this correctly
    tonights.parameters['finish'] = 'now'
    tonights.parameters['start'] = 'now'
    tonights.parameters['trunk'] = \
        tonights.parameters['survey'] + '_' + tonights.parameters['finish']
    tonights.parameters['dir'] = out_dir

    # How will we make decisions based on probability?
    thresholds = {}
    thresholds['detection'] = tonights.parameters['detection_threshold']
    thresholds['rejection'] = tonights.parameters['rejection_threshold']

    t = -1  # for now?!

    # ------------------------------------------------------------------
    # Read in, or create, a bureau of agents who will represent the
    # volunteers:

    bureau = swap.read_pickle(bureaufile, 'bureau')

    # ------------------------------------------------------------------
    # Read in, or create, an object representing the candidate list:

    sample = swap.read_pickle(collectionfile, 'collection')

    # ------------------------------------------------------------------
    # if do_offline, run offline analysis here:

    if flags['do_offline']:
        PL0 = flags['PL0']
        PD0 = flags['PD0']
        pi = flags['pi']
        n_min_assessment = flags['n_min_assessment']
        use_training_info = flags['use_training_info']
        exclude_test_info = flags['exclude_test_info']
        exclude_training_info = flags['exclude_training_info']
        N_min = flags['N_min']
        N_max = flags['N_max']
        epsilon_min = flags['epsilon_min']

        # initialize offline params
        bureau_offline = {}
        probabilities = {}
        online_probabilities = {}
        training_IDs = {}       # which entries in collection are training
        set_aside_subject = {}  # which subjects do we set aside? Here we set aside none
        set_aside_agent = {}    # which agents do we set aside? Here we set aside none

        collection = {}
        for ID in sample.list():
            if ID in set_aside_subject:
                continue
            else:
                collection.update({ID: sample.member[ID]})

        for ID in collection.keys():
            subject = collection[ID]
            n_assessment = len(subject.annotationhistory['ItWas'])
            if (n_assessment > n_min_assessment):
                if (subject.category == 'training'):
                    if use_training_info:
                        truth = {'LENS': 1, 'NOT': 0}[subject.truth]
                        training_IDs.update({ID: truth})
                    if exclude_training_info:
                        # when doing M step, don't use these to update parameters
                        training_IDs.update({ID: -1})
                elif (subject.category == 'test'):
                    if exclude_test_info:
                        # when doing M step, don't use these to update parameters
                        training_IDs.update({ID: -1})
                probabilities.update({ID: pi})
                online_probabilities.update({ID: subject.mean_probability})

                for agent_i in xrange(len(subject.annotationhistory['Name'])):
                    name = subject.annotationhistory['Name'][agent_i]
                    if name in set_aside_agent:
                        continue
                    xij = subject.annotationhistory['ItWas'][agent_i]
                    if name not in bureau_offline:
                        bureau_offline.update({name: {
                            'PD': PD0,
                            'PL': PL0,
                            'PL_in': bureau.member[name].PL,
                            'PD_in': bureau.member[name].PD,
                            'Pi': pi,
                            'Subjects': {ID: xij}}})
                    else:
                        bureau_offline[name]['Subjects'].update({ID: xij})

        # Run EM Algorithm
        bureau_offline, pi, probabilities, information_dict = EM_algorithm(
            bureau_offline, pi, probabilities, training_IDs,
            N_min=N_min, N_max=N_max, epsilon_min=epsilon_min,
            return_information=True)

        tup = (bureau_offline, pi, probabilities, information_dict)
        offlinefile = out_dir + '/offline.pickle'
        swap.write_pickle(tup, offlinefile)

        # ------------------------------------------------------------------
        # Now replace sample member probabilities with offline probabilities.
        # Also update the bureau with offline results.
        for ID in sample.list():
            # just in case any IDs didn't get into offline somehow?!
            if ID not in probabilities.keys():
                sample.member.pop(ID)
                continue

            # This is a bit hackish: update mean_probability,
            # median_probability, and do the rejection threshold stuff
            subject = sample.member[ID]
            subject.mean_probability = probabilities[ID]
            subject.median_probability = probabilities[ID]

            # ripped from subject.py
            if subject.mean_probability < subject.rejection_threshold:
                subject.status = 'rejected'
                if subject.kind == 'test':
                    subject.state = 'inactive'
                    subject.retirement_time = -1  # at_time
                    subject.retirement_age = subject.exposure
            elif subject.mean_probability > subject.detection_threshold:
                subject.status = 'detected'
                if subject.kind == 'test':
                    # Let's keep the detections live!
                    # subject.state = 'inactive'
                    # subject.retirement_time = at_time
                    # subject.retirement_age = subject.exposure
                    pass
            else:
                # Keep the subject alive! This code is only reached if
                # we are not being hasty.
                subject.status = 'undecided'
                if subject.kind == 'test':
                    subject.state = 'active'
                    subject.retirement_time = 'not yet'
                    subject.retirement_age = 0.0

            # I don't think this is necessary, but just in case
            sample.member[ID] = subject

        for kind in ['sim', 'dud', 'test']:
            sample.collect_probabilities(kind)

        # now save
        collectionfile = out_dir + '/collection_offline.pickle'
        swap.write_pickle(collection, collectionfile)

        # now update bureau
        for ID in bureau.list():
            # just in case any IDs didn't make it to offline?
            if ID not in bureau_offline.keys():
                bureau.member.pop(ID)
                continue

            # update PL, PD, then update_skill
            agent = bureau.member[ID]
            agent.PL = bureau_offline[ID]['PL']
            agent.PD = bureau_offline[ID]['PD']
            agent.update_skill()

            # I don't think this is necessary, but just in case
            bureau.member[ID] = agent

        bureau.collect_probabilities()

        # now save
        bureaufile = out_dir + '/bureau_offline.pickle'
        swap.write_pickle(bureau, bureaufile)

    # ------------------------------------------------------------------
    # now we can pretend we're in SWAP.py

    new_retirementfile = swap.get_new_filename(tonights.parameters, 'retire_these')
    print "make_offline_reports: saving retiree subject Zooniverse IDs..."
    N = swap.write_list(sample, new_retirementfile, item='retired_subject')
    print "make_offline_reports: " + str(N) + " lines written to " + new_retirementfile

    # Also print out lists of detections etc! These are urls of images.
    new_samplefile = swap.get_new_filename(tonights.parameters, 'candidates')
    print "make_offline_reports: saving lens candidates..."
    N = swap.write_list(sample, new_samplefile, item='candidate')
    print "make_offline_reports: " + str(N) + " lines written to " + new_samplefile

    # Now save the training images, for inspection:
    new_samplefile = swap.get_new_filename(tonights.parameters, 'training_true_positives')
    print "make_offline_reports: saving true positives..."
    N = swap.write_list(sample, new_samplefile, item='true_positive')
    print "make_offline_reports: " + str(N) + " lines written to " + new_samplefile

    new_samplefile = swap.get_new_filename(tonights.parameters, 'training_false_positives')
    print "make_offline_reports: saving false positives..."
    N = swap.write_list(sample, new_samplefile, item='false_positive')
    print "make_offline_reports: " + str(N) + " lines written to " + new_samplefile

    new_samplefile = swap.get_new_filename(tonights.parameters, 'training_false_negatives')
    print "make_offline_reports: saving false negatives..."
    N = swap.write_list(sample, new_samplefile, item='false_negative')
    print "make_offline_reports: " + str(N) + " lines written to " + new_samplefile

    # Also write out catalogs of subjects, including the ZooID, subject ID,
    # how many classifications, and probability:

    catalog = swap.get_new_filename(tonights.parameters, 'candidate_catalog')
    print "make_offline_reports: saving catalog of high probability subjects..."
    Nlenses, Nsubjects = swap.write_catalog(sample, catalog, thresholds, kind='test')
    print "make_offline_reports: From " + str(Nsubjects) + " subjects classified,"
    print "make_offline_reports: " + str(Nlenses) + " candidates (with P > rejection) written to " + catalog

    catalog = swap.get_new_filename(tonights.parameters, 'sim_catalog')
    print "make_offline_reports: saving catalog of high probability subjects..."
    Nsims, Nsubjects = swap.write_catalog(sample, catalog, thresholds, kind='sim')
    print "make_offline_reports: From " + str(Nsubjects) + " subjects classified,"
    print "make_offline_reports: " + str(Nsims) + " sim 'candidates' (with P > rejection) written to " + catalog

    catalog = swap.get_new_filename(tonights.parameters, 'dud_catalog')
    print "make_offline_reports: saving catalog of high probability subjects..."
    Nduds, Nsubjects = swap.write_catalog(sample, catalog, thresholds, kind='dud')
    print "make_offline_reports: From " + str(Nsubjects) + " subjects classified,"
    print "make_offline_reports: " + str(Nduds) + " dud 'candidates' (with P > rejection) written to " + catalog

    # ------------------------------------------------------------------
    # Make plots! Can't plot everything - uniformly sample 200 of each
    # thing (agent or subject).

    # Agent histories:

    fig1 = bureau.start_history_plot()
    pngfile = swap.get_new_filename(tonights.parameters, 'histories')
    Nc = np.min([200, bureau.size()])
    print "make_offline_reports: plotting " + str(Nc) + " agent histories in " + pngfile

    for Name in bureau.shortlist(Nc):
        bureau.member[Name].plot_history(fig1)

    bureau.finish_history_plot(fig1, t, pngfile)
    tonights.parameters['historiesplot'] = pngfile

    # Agent probabilities:

    pngfile = swap.get_new_filename(tonights.parameters, 'probabilities')
    print "make_offline_reports: plotting " + str(Nc) + " agent probabilities in " + pngfile
    bureau.plot_probabilities(Nc, t, pngfile)
    tonights.parameters['probabilitiesplot'] = pngfile

    # Subject trajectories:

    fig3 = sample.start_trajectory_plot()
    pngfile = swap.get_new_filename(tonights.parameters, 'trajectories')

    # Random 500 for display purposes:
    Ns = np.min([500, sample.size()])
    print "make_offline_reports: plotting " + str(Ns) + " subject trajectories in " + pngfile

    for ID in sample.shortlist(Ns):
        sample.member[ID].plot_trajectory(fig3)

    # To plot only false negatives, or only true positives:
    # for ID in sample.shortlist(Ns, kind='sim', status='rejected'):
    #     sample.member[ID].plot_trajectory(fig3)
    # for ID in sample.shortlist(Ns, kind='sim', status='detected'):
    #     sample.member[ID].plot_trajectory(fig3)

    sample.finish_trajectory_plot(fig3, pngfile, t=t)
    tonights.parameters['trajectoriesplot'] = pngfile

    # Candidates! Plot all undecideds or detections:

    fig4 = sample.start_trajectory_plot(final=True)
    pngfile = swap.get_new_filename(tonights.parameters, 'sample')

    # BigN = 100000  # Would get them all...
    BigN = 500       # Can't see them all!
    candidates = []
    candidates += sample.shortlist(BigN, kind='test', status='detected')
    candidates += sample.shortlist(BigN, kind='test', status='undecided')
    sims = []
    sims += sample.shortlist(BigN, kind='sim', status='detected')
    sims += sample.shortlist(BigN, kind='sim', status='undecided')
    duds = []
    duds += sample.shortlist(BigN, kind='dud', status='detected')
    duds += sample.shortlist(BigN, kind='dud', status='undecided')

    print "make_offline_reports: plotting " + str(len(sims)) + " sims in " + pngfile
    for ID in sims:
        sample.member[ID].plot_trajectory(fig4)
    print "make_offline_reports: plotting " + str(len(duds)) + " duds in " + pngfile
    for ID in duds:
        sample.member[ID].plot_trajectory(fig4)
    print "make_offline_reports: plotting " + str(len(candidates)) + " candidates in " + pngfile
    for ID in candidates:
        sample.member[ID].plot_trajectory(fig4)

    # They will all show up in the histogram though:
    sample.finish_trajectory_plot(fig4, pngfile, final=True)
    tonights.parameters['candidatesplot'] = pngfile

    # ------------------------------------------------------------------
    # Finally, write a PDF report:

    swap.write_report(tonights.parameters, bureau, sample)
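# ----------------------------------------------------------------------
# A minimal usage sketch for make_offline_reports, not part of the
# original script.  The dictionary keys follow the flag parsing at the
# top of the function ('config', 'bureau', 'collection', plus entries in
# its flags dict); the file paths and output directory are hypothetical
# placeholders.

def run_example_offline_reports():
    example_args = {'config': 'CFHTLS.config',                 # SW experiment configuration
                    'bureau': 'CFHTLS_bureau.pickle',          # agents
                    'collection': 'CFHTLS_collection.pickle',  # subjects
                    'do_offline': True,                        # re-fit PL/PD offline with the EM step
                    'output_directory': './offline_reports'}
    make_offline_reports(example_args)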
def main():

    parser = OptionParser()
    parser.add_option("-w", dest="weight", default='uniform',
                      help="Run KNC with preferred weighting.")
    parser.add_option("-t", dest="thresh", default=None,
                      help="Set threshold")
    parser.add_option("-p", dest="plotonly", action='store_true', default=False,
                      help="Skip machine learning and go straight to plotting")
    parser.add_option("-n", dest="name_modifier", default=None,
                      help="Additional naming identification for output files")
    (options, args) = parser.parse_args()

    if options.thresh:
        thresh = float(options.thresh)
    else:
        thresh = None

    if options.plotonly:
        #metrics_uni = read_pickle('KNC_uniform_eval.pickle')
        #metrics_dist = read_pickle('KNC_distance_eval.pickle')

        ###################### JUST PLOT THE SHITS ###########################
        kwargs = {'thresh': thresh,
                  'keys': ['accuracy', 'contamination', 'completeness',
                           'falseomis', 'trueneg'],
                  'labels': ['Accuracy', 'Contamination (S)', 'Completeness (S)',
                             'Contamination (F)', 'Completeness (F)'],
                  'name_modifier': options.name_modifier}
        print options.weight
        plot_the_shits(metric='all', method='KNC_%s' % options.weight, **kwargs)
        #explore_accuracy(method='KNC_uniform')
        exit()

    ################### READ IN TRAINING / VALIDATION DATA #############
    filename = 'GZ2_testML2_metadata.pickle'
    data = swap.read_pickle(filename, 'metadata')

    # This is the "New" validation sample -- Expertly classified
    valid_idx = np.where((data['MLsample'] == 'valid') &
                         (data['GZ2_label'] != -1) &
                         (data['Nair_label'] != -1) &
                         (data['Expert_label'] != -1))
    valid = data[valid_idx]
    valid_meta, valid_features = ml.extract_training(valid)
    valid_labels_ex = valid_meta['Expert_label'].filled()
    valid_labels_gz = valid_meta['GZ2_label'].filled()
    valid_labels_nr = valid_meta['Nair_label'].filled()

    # Let's try to recreate what I had before.
    valid2_idx = np.where((data['Nair_label'] != -1))
    valid2 = data[valid2_idx]
    valid2_meta, valid2_features = ml.extract_training(valid2)
    valid2_labels = valid2_meta['Nair_label'].filled()

    # Which validation sample do I want to use? BLAH.
    # --> Used this to try to replicate what I had a month ago (Nair "truth")
    #valid_features = valid2_features
    #valid_labels = valid2_labels

    # Now test on the new, smaller validation sample
    # --> first, still with Nair "truth"
    valid_labels = valid_labels_nr

    # --> second, using GZ2 user "truth"
    #valid_labels = valid_labels_gz

    # --> finally, using Expert "truth"
    #valid_labels = valid_labels_ex

    # Load up the training set (ALL GZ labels)
    train_idx = np.where((data['MLsample'] != 'valid') & (data['GZ2_label'] != -1))
    train = data[train_idx]
    train_meta, train_features = ml.extract_training(train)
    train_labels = train_meta['GZ2_label'].filled()

    # select various and increasing size training samples
    # -------------------------------------------------------------------
    N = [100, 500, 1000, 5000, 10000, 50000]
    K = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

    evaluation_metrics = {'precision': [], 'recall': [], 'pr_thresh': [],
                          'falsepos': [], 'truepos': [], 'roc_thresh': [],
                          'accuracy': [], 'thresh': [], 'falseomis': [],
                          'falseneg': [], 'trueneg': [], 'contamination': [],
                          'completeness': [], 'precision_score': [],
                          'recall_score': [], 'accuracy_score': [],
                          'roc_auc_score1': [], 'roc_auc_score2': [],
                          'f1_score': [], 'k': [], 'n': []}

    ################### RUN CLASSIFIERS WITH VARIOUS PARAMS ############
    ############## this is running through various K manually ##########
    for j, n in enumerate(N):
        train_features_sub = train_features[:n]
        train_labels_sub = train_labels[:n]

        #ratio = float(np.sum(train_labels_sub == 1)) / len(train_labels_sub)
        #print "Ratio of Smooth / Total for training sample (%i): %f" \
        #    % (n, ratio)

        for i, k in enumerate(K):
            # Adjust k because it can't be >= the training sample size
            if n <= k:
                k = n - 1

            preds, probs, machine = ml.runKNC(train_features_sub,
                                              train_labels_sub,
                                              valid_features, N=k,
                                              weights=options.weight)
            #preds = ml.runRNC(train_sample, labels, valid_sample, R=k,
            #                  weights='distance', outlier=0)

            fps, tps, thresh = mtrx._binary_clf_curve(valid_labels, probs[:, 1])
            metrics_list = mtrx.compute_binary_metrics(fps, tps)
            [acc, tpr, fpr, fnr, tnr, prec, fdr, fomis, npv] = metrics_list

            evaluation_metrics['completeness'].append(tpr)
            evaluation_metrics['contamination'].append(fdr)
            evaluation_metrics['falseneg'].append(fnr)
            evaluation_metrics['trueneg'].append(tnr)
            evaluation_metrics['falseomis'].append(fomis)
            evaluation_metrics['accuracy'].append(acc)
            evaluation_metrics['thresh'].append(thresh)

            # Curves -- for plotting ROC and PR
            pp, rr, thresh2 = mx.precision_recall_curve(valid_labels, probs[:, 1])
            evaluation_metrics['precision'].append(pp)
            evaluation_metrics['recall'].append(rr)
            evaluation_metrics['pr_thresh'].append(thresh2)

            fpr, tpr, thresh3 = mx.roc_curve(valid_labels, probs[:, 1], pos_label=1)
            evaluation_metrics['falsepos'].append(fpr)
            evaluation_metrics['truepos'].append(tpr)
            evaluation_metrics['roc_thresh'].append(thresh3)

            # Single value metrics -- for plotting against N? K? whatever...
            evaluation_metrics['roc_auc_score1'].append(mx.auc(fpr, tpr))
            evaluation_metrics['roc_auc_score2'].append(mx.roc_auc_score(
                valid_labels, preds))
            evaluation_metrics['precision_score'].append(mx.precision_score(
                valid_labels, preds))
            evaluation_metrics['recall_score'].append(mx.recall_score(
                valid_labels, preds))
            evaluation_metrics['accuracy_score'].append(mx.accuracy_score(
                valid_labels, preds))
            evaluation_metrics['f1_score'].append(mx.f1_score(
                valid_labels, preds))

            # current k and n so I don't have to backstrapolate
            evaluation_metrics['k'].append(k)
            evaluation_metrics['n'].append(n)

    for key, val in evaluation_metrics.iteritems():
        evaluation_metrics[key] = np.array(evaluation_metrics[key])

    # If everything works... Let's save this huge structure as a pickle
    filename = 'KNC_%s_eval_%s.pickle' % (options.weight, options.name_modifier)
    F = open(filename, 'wb')
    cPickle.dump(evaluation_metrics, F, protocol=2)
    F.close()  # close the file so the pickle is flushed to disk
    print "Saved evaluation metrics %s" % filename

    ######################### PLOT THE SHITS #############################
    #kwargs = {'thresh': .5, 'keys': ['accuracy', 'precision', 'recall']}
    #plot_the_shits(method='KNC_uniform', metric='all', **kwargs)
    #explore_accuracy(method='KNC_uniform')
    exit()
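# ----------------------------------------------------------------------
# A minimal sketch, not part of the original script, showing how the
# evaluation pickle saved above could be reloaded and one summary metric
# plotted against training-set size.  The file name mirrors the
# 'KNC_%s_eval_%s.pickle' pattern used above; the particular weight,
# name_modifier, and K value are hypothetical.

import cPickle
from matplotlib import pyplot as plt

def plot_accuracy_vs_n(pickle_name='KNC_uniform_eval_test.pickle', k_value=10):
    with open(pickle_name, 'rb') as F:
        metrics = cPickle.load(F)
    # Select the runs done with a single K so the curve is over N only:
    sel = metrics['k'] == k_value
    plt.plot(metrics['n'][sel], metrics['accuracy_score'][sel], 'o-')
    plt.xscale('log')
    plt.xlabel('Training sample size N')
    plt.ylabel('Accuracy score')
    plt.savefig('KNC_accuracy_vs_N.png')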
    # Simple naming scheme for output lists and plots:
    tonights.parameters['trunk'] = \
        tonights.parameters['survey']
    tonights.parameters['dir'] = os.getcwd()

    # What was the last time a subject was touched?
    t = datetime.datetime.strptime(tonights.parameters['start'],
                                   '%Y-%m-%d_%H:%M:%S')

    # ------------------------------------------------------------------
    # Read in, or create, a bureau of agents who will represent the
    # volunteers:

    bureau = swap.read_pickle(tonights.parameters['bureaufile'], 'bureau')

    # ------------------------------------------------------------------
    # Read in, or create, an object representing the candidate list:

    sample = swap.read_pickle(tonights.parameters['samplefile'], 'collection')

    # ------------------------------------------------------------------
    # Output list of subjects to retire, based on this batch of
    # classifications. Note that what is needed here is the ZooID,
    # not the subject ID. Also print out lists of detections etc!
    # These are urls of images.

    new_samplefile = swap.get_new_filename(tonights.parameters, 'candidates')
    print "SWEAR: saving lens candidates..."
        makedirs(cluster_directory)
    if not path.exists(field_directory):
        makedirs(field_directory)

    # load up requisite files
    collection_path = args['collection']
    if '.pickle' in collection_path:
        # will need PJM's swap to interpret the spacewarps database
        try:
            import swap
        except:
            raise Exception(
                'create_catalogs: Unable to import SpaceWarps analysis code!')
        #collection_path = base_collection_path + 'stage{0}'.format(stage) + '/CFHTLS_collection.pickle'
        collection_pickle = swap.read_pickle(collection_path, 'collection')
        stage = -1
        collection = convert_swap_collection_to_dataframe(
            collection_pickle, stage)
    elif '.csv' in collection_path:
        collection = pd.read_csv(collection_path)
    else:
        raise Exception(
            'create_catalogs: No idea what kind of collection you are giving me here for {0}'
            .format(collection_path))

    if args['knownlens']:
        knownlens = pd.read_csv(args['knownlens'])
    else:
        knownlens = None