if not more_to_do: tonights.parameters['start'] = tstring swap.set_cookie(False) # SWAPSHOP will read this cookie and act accordingly. configfile = 'update.config' # Random_file needs updating, else we always start from the same random # state when update.config is reread! random_file = open(tonights.parameters['random_file'],"w"); random_state = np.random.get_state(); cPickle.dump(random_state,random_file); random_file.close(); swap.write_config(configfile, tonights.parameters) # ------------------------------------------------------------------ if report: # Make plots! Can't plot everything - uniformly sample 200 of each # thing (agent or subject). # Agent histories: fig1 = bureau.start_history_plot() pngfile = swap.get_new_filename(tonights.parameters,'histories') Nc = np.min([200,bureau.size()]) print "SWAP: plotting "+str(Nc)+" agent histories in "+pngfile
if not more_to_do: tonights.parameters['start'] = tstring swap.set_cookie(False) # SWAPSHOP will read this cookie and act accordingly. configfile = 'update.config' # Random_file needs updating, else we always start from the same random # state when update.config is reread! random_file = open(tonights.parameters['random_file'], "w") random_state = np.random.get_state() cPickle.dump(random_state, random_file) random_file.close() swap.write_config(configfile, tonights.parameters) # ------------------------------------------------------------------ if report: # Make plots! Can't plot everything - uniformly sample 200 of each # thing (agent or subject). # Agent histories: fig1 = bureau.start_history_plot() pngfile = swap.get_new_filename(tonights.parameters, 'histories') Nc = np.min([200, bureau.size()]) print "SWAP: plotting " + str(Nc) + " agent histories in " + pngfile
def MachineClassifier(options, args):
    """Train one KNC machine per evaluation metric and retire confident subjects.

    Workflow, in order:

    1. Read the configuration named by ``options.configfile`` and restore
       the pickled numpy random state.
    2. Load the subject metadata, the SWAP collection, the ML collection
       and the ML bureau from their pickles.
    3. For every metric in ``evaluation_metrics``: grid-search a
       K-nearest-neighbours classifier on the 'train' subjects, score it on
       the 'valid' subjects and record both on the machine's agent.
    4. If the agent's ``evaluate()`` is truthy, push subjects whose machine
       probability passes ``machine_threshold`` into the SWAP collection.
    5. Optionally write reports and pickles, advance the start date, save
       the random state and write ``update.config``.

    Parameters
    ----------
    options : object
        Must expose a ``configfile`` attribute (path of the config file).
    args : sequence
        Not used by this function.

    Notes
    -----
    NOTE(review): this draft references names that are not defined inside
    the function -- ``TIME``, ``test_meta``, ``probas``, ``predictions``,
    ``bureau`` and ``Nair_or_Not``. Unless they exist at module level the
    corresponding code paths raise NameError; confirm before relying on it.
    NOTE(review): three ``pdb.set_trace()`` breakpoints are still present
    and will halt an unattended run.
    """
    try:
        config = options.configfile
    except AttributeError:
        # NOTE(review): debugging breakpoint; `config` stays undefined here.
        pdb.set_trace()

    tonights = swap.Configuration(config)

    # Read the pickled random state file and hand it to numpy.
    with open(tonights.parameters['random_file'], "r") as random_file:
        random_state = cPickle.load(random_file)
    np.random.set_state(random_state)

    # Get the machine threshold (make retirement decisions)
    threshold = tonights.parameters['machine_threshold']

    # Get list of evaluation metrics and criteria
    eval_metrics = tonights.parameters['evaluation_metrics']

    survey = tonights.parameters['survey']

    # ------------------------------------------------------------------
    # Read in the metadata for all subjects, the SWAP collection, the ML
    # collection and the ML bureau (history of the machine agents).
    # ------------------------------------------------------------------
    subjects = swap.read_pickle(tonights.parameters['metadatafile'],
                                'metadata')
    sample = swap.read_pickle(tonights.parameters['samplefile'],
                              'collection')
    MLsample = swap.read_pickle(tonights.parameters['MLsamplefile'],
                                'MLcollection')
    MLbureau = swap.read_pickle(tonights.parameters['MLbureaufile'],
                                'MLbureau')

    # ------------------------------------------------------------------
    # DETERMINE IF THERE IS A TRAINING SAMPLE TO WORK WITH
    # ------------------------------------------------------------------
    # TODO: training sample should only select those which are NOT part of
    # the validation sample (Nair catalog objects) 2/22/16

    # IDENTIFY TRAINING SAMPLE; labels are 1 where SWAP_prob exceeds 0.3.
    train_sample = subjects[subjects['MLsample'] == 'train']
    train_meta, train_features = ml.extract_training(train_sample)
    train_labels = np.array([1 if p > 0.3 else 0
                             for p in train_meta['SWAP_prob']])

    # IDENTIFY VALIDATION SAMPLE (FINAL)
    valid_sample = subjects[subjects['MLsample'] == 'valid']
    valid_meta, valid_features = ml.extract_training(valid_sample)
    valid_labels = valid_meta['Expert_label'].filled()

    # TODO: LOOP THROUGH DIFFERENT MACHINES? HOW MANY MACHINES?
    for metric in eval_metrics:

        # REGISTER Machine Classifier
        # Construct machine name --> Machine+Metric? For now: KNC
        machine = 'KNC'
        Name = machine + '_' + metric

        # Register an Agent for this Machine if the bureau has none yet.
        try:
            MLbureau.member[Name]
        except KeyError:
            MLbureau.member[Name] = swap.Agent_ML(Name, metric)

        # --------------------------------------------------------------
        # TRAIN THE MACHINE; EVALUATE ON VALIDATION SAMPLE
        # --------------------------------------------------------------
        # Odd neighbour counts up to about 2/3 of the training sample.
        # Floor division keeps the bound an integer regardless of the
        # interpreter's division rules.
        params = {'n_neighbors': np.arange(
                      1, 2 * (len(train_sample) - 1) // 3, 2),
                  'weights': ('uniform', 'distance')}

        # Create the model
        general_model = GridSearchCV(estimator=KNC(), param_grid=params,
                                     error_score=0, scoring=metric)

        # Train the model -- k-fold cross validation is embedded
        trained_model = general_model.fit(train_features, train_labels)

        # Test "accuracy" (metric of choice) on validation sample
        score = trained_model.score(valid_features, valid_labels)

        # NOTE(review): TIME is not defined in this function.
        MLbureau.member[Name].record_training(
            model_described_by=trained_model.best_estimator_,
            with_params=trained_model.best_params_,
            trained_on=len(train_features),
            at_time=TIME,
            with_train_acc=trained_model.best_score_,
            and_valid_acc=score)

        # Store the trained machine
        MLbureau.member[Name].model = trained_model

        # Compute / store confusion matrix as a function of threshold
        # produced by this machine on the Expert Validation sample
        fps, tps, thresh = mtrx._binary_clf_curve(
            valid_labels, trained_model.predict_proba(valid_features)[:, 1])
        metric_list = mtrx.compute_binary_metrics(fps, tps)
        ACC, TPR, FPR, FNR, TNR, PPV, FDR, FOR, NPV = metric_list

        MLbureau.member[Name].record_evaluation(accuracy=ACC,
                                                completeness_s=TPR,
                                                contamination_s=FDR,
                                                completeness_f=TNR,
                                                contamination_f=NPV)

        # NOTE(review): debugging breakpoint left in the training loop.
        pdb.set_trace()

        # 3. compare the metric of choice with the evaluation criterion to
        #    see if this machine has sufficiently learned?
        #    ... what if my criterion is simply "Maximize Accuracy"?
        #    ... or minimize feature contamination? these require that we
        #    compare tonight's machine with the previous night's machine
        #    But if my criterion is simply "have feature contam less than
        #    20%" then it's easy....

        # IF TRAINED MACHINE PREDICTS WELL ON VALIDATION ....
        if MLbureau.member[Name].evaluate():

            # ----------------------------------------------------------
            # APPLY MACHINE TO TEST SAMPLE
            # ----------------------------------------------------------
            # TODO: not implemented yet -- requires that the runKNC
            # function returns the Machine Object.
            # NOTE(review): test_meta, probas and predictions below are
            # never assigned in this function.

            # ----------------------------------------------------------
            # PROCESS PREDICTIONS/PROBS
            # ----------------------------------------------------------
            for s, p, label in zip(test_meta, probas, predictions):
                ID = str(s['id'])

                descriptions = Nair_or_Not(s)
                category, kind, flavor, truth = descriptions

                # LOAD EACH TEST SUBJECT INTO MACHINE COLLECTION
                try:
                    MLsample.member[ID]
                except KeyError:
                    MLsample.member[ID] = swap.Subject_ML(
                        ID, str(s['name']), category, kind, truth,
                        threshold, s['external_ref'])

                # The rest of this function uses `datetime.datetime...`,
                # i.e. the module is imported as `datetime`.
                tstring = datetime.datetime.now().strftime(
                    '%Y-%m-%d_%H:%M:%S')
                MLsample.member[ID].was_described(by='knn', as_being=1,
                                                  withp=p, at_time=tstring)

                # NOTE: if subject is Nair (training) it doesn't get
                # flagged as inactive but it can be flagged as
                # detected/rejected

                # IF MACHINE P >= THRESHOLD, INSERT INTO SWAP COLLECTION
                thresholds = {'detection': 0., 'rejection': 0.}
                if (p >= threshold) or (1 - p >= threshold):
                    print("BOOM! WE'VE GOT A MACHINE-CLASSIFIED SUBJECT:")
                    print("Probability: %s" % (p,))

                    # Initialize the subject in SWAP Collection
                    sample.member[ID] = swap.Subject(
                        ID, str(s['name']), category, kind, flavor, truth,
                        thresholds, s['external_ref'], 0.)
                    sample.member[ID].retiredby = 'machine'

                    # Flag subject as 'INACTIVE' / 'DETECTED' / 'REJECTED'
                    if p >= threshold:
                        sample.member[str(s['id'])].state = 'inactive'
                    elif 1 - p >= threshold:
                        sample.member[str(s['id'])].status = 'rejected'

    # ------------------------------------------------------------------
    # SAVE MACHINE METADATA?
    # ------------------------------------------------------------------
    print("Size of SWAP sample: %s" % (sample.size(),))
    print("Size of ML sample: %s" % (MLsample.size(),))

    if tonights.parameters['report']:

        # Output list of subjects to retire, based on this batch of
        # classifications. Note that what is needed here is the ZooID,
        # not the subject ID:
        new_retirementfile = swap.get_new_filename(tonights.parameters,
                                                   'retire_these',
                                                   source='ML')
        print("SWAP: saving Machine-retired subject Zooniverse IDs...")
        N = swap.write_list(MLsample, new_retirementfile,
                            item='retired_subject', source='ML')
        print("SWAP: " + str(N) + " lines written to " + new_retirementfile)

        # Write catalogs of smooth/not over MLthreshold.
        # NOTE(review): `bureau` is not defined in this function
        # (possibly MLbureau was intended) -- confirm.
        catalog = swap.get_new_filename(tonights.parameters,
                                        'retired_catalog', source='ML')
        print("SWAP: saving catalog of Machine-retired subjects...")
        Nretired, Nsubjects = swap.write_catalog(MLsample, bureau, catalog,
                                                 threshold, kind='rejected',
                                                 source='ML')
        print("SWAP: From " + str(Nsubjects) + " subjects classified,")
        print("SWAP: " + str(Nretired) + " retired (with P < rejection) "
              "written to " + catalog)

        catalog = swap.get_new_filename(tonights.parameters,
                                        'detected_catalog', source='ML')
        print("SWAP: saving catalog of Machine detected subjects...")
        Ndetected, Nsubjects = swap.write_catalog(MLsample, bureau, catalog,
                                                  threshold, kind='detected',
                                                  source='ML')
        print("SWAP: From " + str(Nsubjects) + " subjects classified,")
        print("SWAP: %i detected (with P > MLthreshold) "
              "written to %s" % (Ndetected, catalog))

    # If it hasn't been done already, save the current directory
    # ------------------------------------------------------------------
    tonights.parameters['dir'] = (os.getcwd() + '/'
                                  + tonights.parameters['trunk'])
    if not os.path.exists(tonights.parameters['dir']):
        os.makedirs(tonights.parameters['dir'])

    # Repickle everything and point the configuration at the new files
    # ------------------------------------------------------------------
    if tonights.parameters['repickle']:

        new_samplefile = swap.get_new_filename(tonights.parameters,
                                               'collection')
        print("SWAP: saving SWAP subjects to " + new_samplefile)
        swap.write_pickle(sample, new_samplefile)
        tonights.parameters['samplefile'] = new_samplefile

        new_samplefile = swap.get_new_filename(tonights.parameters,
                                               'MLcollection')
        print("SWAP: saving test sample subjects to " + new_samplefile)
        swap.write_pickle(MLsample, new_samplefile)
        tonights.parameters['MLsamplefile'] = new_samplefile

        metadatafile = swap.get_new_filename(tonights.parameters,
                                             'metadata')
        print("SWAP: saving metadata to " + metadatafile)
        swap.write_pickle(subjects, metadatafile)
        tonights.parameters['metadatafile'] = metadatafile

    # Update the time increment for SWAP's next run: advance 'start' by
    # 'increment' days unless that lands exactly on the 'end' date.
    # ------------------------------------------------------------------
    t2 = (datetime.datetime.strptime(tonights.parameters['start'],
                                     '%Y-%m-%d_%H:%M:%S')
          + datetime.timedelta(days=tonights.parameters['increment']))
    tstop = datetime.datetime.strptime(tonights.parameters['end'],
                                       '%Y-%m-%d_%H:%M:%S')
    if t2 != tstop:
        tonights.parameters['start'] = t2.strftime('%Y-%m-%d_%H:%M:%S')

    # Update configfile to reflect Machine additions
    # ------------------------------------------------------------------
    configfile = 'update.config'

    # Save the current random state back to the random_file.
    random_state = np.random.get_state()
    with open(tonights.parameters['random_file'], "w") as random_file:
        cPickle.dump(random_state, random_file)

    swap.write_config(configfile, tonights.parameters)

    # NOTE(review): debugging breakpoint left at the end of the function.
    pdb.set_trace()
def MachineShop(args):
    """Replay a finished SWAP simulation day by day and run the Machine on it.

    Sometimes you just need to run the Machine on a bunch of already made
    SWAP runs / simulations. For each day's retirement catalog of the
    fiducial simulation this function

    * opens the metadata pickle (creating a ``*_orig.pickle`` backup on the
      first day if none can be read),
    * marks subjects newly retired by SWAP as the Machine's 'train' sample,
    * saves the updated metadata to ``<cwd>/<survey>_metadata.pickle``,
    * and, once ``run_machine`` is set, rewrites the config with today's
      date and launches ``MachineClassifier.py`` through ``os.system``.

    Parameters
    ----------
    args : object
        Must expose ``config``, the path of the SWAP configuration file.

    Notes
    -----
    The process is terminated with ``sys.exit()`` as soon as the simulated
    date reaches the configured 'end' date.

    There is no way to change directories on the fly; the simulation
    directory is hard-coded below.
    """
    import sys

    # Get parameters from the SWAP run of interest
    the = swap.Configuration(args.config)
    params = the.parameters

    # This pulls up the FIDUCIAL SWAP simulation
    sim = Simulation(config=args.config,
                     directory='sims_SWAP/S_PLPD5_p5_ff_norand',
                     variety='feat_or_not')

    # NOTE(review): first_day was originally 2009-02-17, which an earlier
    # comment called WRONG without recording why -- confirm which date is
    # correct.
    first_day = dt.datetime(2009, 2, 12)
    today = dt.datetime.strptime(params['start'], '%Y-%m-%d_%H:%M:%S')
    last_day = dt.datetime.strptime(params['end'], '%Y-%m-%d_%H:%M:%S')

    run_machine = False
    SWAP_retired = 0
    notfound = 0      # retired subjects not labelled 'test' in the metadata
    last_night = None

    for filename in sim.retiredFileList[(today - first_day).days:]:
        print("")
        print("----------------------- The Machine Shop "
              "----------------------------")
        print("Today is {}".format(today))

        if today >= last_day:
            print("Get outta the machine shop!")
            sys.exit()

        # --------------------------------------------------------------
        # OPEN METADATA PICKLE (updated each time MachineClassifier is run)
        # --------------------------------------------------------------
        backup_meta_file = params['metadatafile'].replace(
            '.pickle', '_orig.pickle')

        if today == first_day:
            try:
                storage = swap.read_pickle(backup_meta_file, 'metadata')
            except Exception:
                print("MachineShop: Backup metadata pickle not yet created.")
                print("MachineShop: Opening original metadata pickle file "
                      "instead")
                storage = swap.read_pickle(params['metadatafile'],
                                           'metadata')
                _ensure_ml_columns(storage.subjects)

                # save a copy for reference later
                print("MachineShop: Creating a backup metadata pickle")
                swap.write_pickle(storage, backup_meta_file)
        else:
            storage = swap.read_pickle(params['metadatafile'], 'metadata')

        # Regardless of which metadata you open, make sure it has these
        # columns (old metadata files WON'T have them!)
        _ensure_ml_columns(storage.subjects)

        subjects = storage.subjects

        # I just need to know what was retired TONIGHT --
        # compare what's retired UP TILL tonight with what was
        # retired up till LAST NIGHT
        SWAP_retired_by_tonight = sim.fetchCatalog(filename)

        # TODO: when picking up where we left off (today after 2009-02-17
        # and no catalog from last night), seed last_night from the
        # existing 'train' subjects, using SDSS_id as zooid.

        ids_retired_tonight = set(SWAP_retired_by_tonight['zooid'])
        if last_night is not None:
            ids_retired_tonight -= set(last_night['zooid'])

        print("Newly retired subjects: {}".format(len(ids_retired_tonight)))

        # Now that I have the ids from the previous night, adjust the
        # metadata file to reflect what was retired / add SWAP info
        for ID in ids_retired_tonight:

            # Locate this subject in the metadata file
            mask = subjects['SDSS_id'] == int(ID)

            # Update them in metadata file as training sample for MC
            # DOUBLE CHECK THAT IT HAS NOT BEEN RETIRED BY MACHINE!!!
            # NOTE(review): truth-testing this comparison presumes the
            # mask selects exactly one row.
            if subjects['MLsample'][mask] == 'test':
                SWAP_retired += 1

                subjects['MLsample'][mask] = 'train'
                subjects['retired_date'][mask] = today.strftime('%Y-%m-%d')
                subjects['SWAP_prob'][mask] = SWAP_retired_by_tonight['P'][
                    SWAP_retired_by_tonight['zooid'] == ID]

                run_machine = True
            else:
                notfound += 1

        if len(subjects[subjects['MLsample'] == 'train']) >= 10000:
            run_machine = True

        last_night = SWAP_retired_by_tonight

        print("Retired by this day: %s" % (len(last_night),))
        print("")
        print("MachineShop: Found {0} subjects retired by SWAP on {1}"
              .format(SWAP_retired, today))
        print("MachineShop: {0} total subjects retired so far"
              .format(np.sum(subjects['MLsample'] == 'train')))
        print("MachineShop: Found {0} subjects retired by Machine."
              .format(np.sum(subjects['MLsample'] == 'mclas')))
        print("MachineShop: Saving updated StorageLocker.")

        params['dir'] = os.getcwd()

        # Save our new metadata file -- MC needs this -- save to NOT the
        # original
        params['metadatafile'] = (params['dir'] + '/' + params['survey']
                                  + '_metadata.pickle')
        swap.write_pickle(storage, params['metadatafile'])

        if run_machine:
            # Need to doctor the config to reflect the "correct date"
            params['start'] = today.strftime('%Y-%m-%d_%H:%M:%S')
            swap.write_config(args.config, params)

            # Run MachineClassifier.py using this subject file
            os.system("python MachineClassifier.py -c %s" % args.config)

            # MachineClassifier updates the configfile so now we need to
            # open the NEW one
            the = swap.Configuration(args.config)
            params = the.parameters

        # Update date (since we're not running SWAP)
        today += dt.timedelta(days=1)


def _ensure_ml_columns(subjects):
    """Add the columns MachineShop needs to an older metadata table.

    Creates 'retired_date' (filled with '2016-09-10') when the column is
    missing, and labels every subject whose 'Expert_label' is not -1 as
    'valid' when no subject carries that 'MLsample' label yet. The table is
    modified in place.
    """
    if 'retired_date' not in subjects.colnames:
        subjects['retired_date'] = '2016-09-10'

    if 'valid' not in np.unique(subjects['MLsample']):
        expert = (subjects['Expert_label'] != -1)
        subjects['MLsample'][expert] = 'valid'
def MachineClassifier(options, args):
    """
    NAME
        MachineClassifier.py

    PURPOSE
        Machine learning component of Galaxy Zoo Express

        Read in a training sample generated by human users (which have
        preferentially been analyzed by SWAP).
        Learn on the training sample and moniter progress.
        Once "fully trained", apply learned model to test sample.

    COMMENTS
        Lots I'm sure.

    FLAGS
        -h            Print this message
        -c            config file name
    """
    # The docstring above doubles as the usage message printed when no
    # config file is given, so its wording is left untouched.
    #
    # Parameters:
    #   options -- object with a `configfile` attribute (may be empty)
    #   args    -- positional arguments; args[0], when present, is taken
    #              as the config file name and wins over options.configfile
    #
    # Outline: load the configuration and the random state, read the
    # metadata / SWAP / ML pickles, build training and validation samples,
    # then for every evaluation metric grid-search a random forest, record
    # its scores on the machine's agent and, if the agent reports it is
    # trained, predict the test sample and pickle the predictions.
    # Finally repickle (if configured), save the random state and write
    # the updated config. Exits through sys.exit() when features cannot be
    # extracted from the training sample or the sample is too small.

    # Check for setup file in array args:
    if (len(args) >= 1) or (options.configfile):
        config = args[0] if args else options.configfile
        print(swap.doubledashedline)
        print(swap.ML_hello)
        print(swap.doubledashedline)
        print("ML: taking instructions from %s" % (config,))
    else:
        print(MachineClassifier.__doc__)
        return

    tonights = swap.Configuration(config)

    # Read the pickled random state file
    with open(tonights.parameters['random_file'], "r") as random_file:
        random_state = cPickle.load(random_file)
    np.random.set_state(random_state)

    start_time = tonights.parameters['start']
    print(start_time)

    # Get the machine threshold (make retirement decisions)
    threshold = tonights.parameters['machine_threshold']
    prior = tonights.parameters['prior']

    # Get list of evaluation metrics and criteria
    eval_metrics = tonights.parameters['evaluation_metrics']

    # How much cross-validation should we do?
    cv = tonights.parameters['cross_validation']

    survey = tonights.parameters['survey']

    # ------------------------------------------------------------------
    # read in the metadata for all subjects (Test or Training sample?)
    storage = swap.read_pickle(tonights.parameters['metadatafile'],
                               'metadata')
    subjects = storage.subjects

    # read in the SWAP collection
    sample = swap.read_pickle(tonights.parameters['samplefile'],
                              'collection')

    # read in or create the ML collection
    MLsample = swap.read_pickle(tonights.parameters['MLsamplefile'],
                                'MLcollection')

    # read in or create the ML bureau for machine agents (history)
    MLbureau = swap.read_pickle(tonights.parameters['MLbureaufile'],
                                'bureau')

    # ------------------------------------------------------------------
    # SELECT TRAINING & VALIDATION SAMPLES
    # ------------------------------------------------------------------
    # TODO: training sample should only select those which are NOT part of
    # the validation sample (Nair catalog objects) 2/22/16
    #
    # Notes about the training sample:
    #   this will select only those which have my morphology measured for
    #   them AND which have a true "answer" according to GZ2.
    #   Eventually we could open this up to include the ~10k that aren't
    #   in the GZ Main Sample but, for now, we restrict ourselves to this
    #   stricter sample so that we always have back-up "truth" for each
    #   galaxy.
    train_sample = storage.fetch_subsample(sample_type='train',
                                           class_label='GZ2_label')

    try:
        train_meta, train_features = ml.extract_features(train_sample)
        original_length = len(train_meta)
    except TypeError:
        print("ML: can't extract features from subsample.")
        print("ML: Exiting MachineClassifier.py")
        sys.exit()
    else:
        # TODO: consider making this part of SWAP's duties?
        # 5/18/16: Only use those subjects which are no longer on the prior
        off_the_fence = np.where(train_meta['SWAP_prob'] != prior)
        train_meta = train_meta[off_the_fence]
        train_features = train_features[off_the_fence]
        train_labels = np.array([1 if p > prior else 0
                                 for p in train_meta['SWAP_prob']])

        shortened_length = len(train_meta)
        print("ML: found a training sample of %i subjects"
              % shortened_length)
        removed = original_length - shortened_length
        print("ML: %i subjects had prior probability and were removed"
              % removed)

    valid_sample = storage.fetch_subsample(sample_type='valid',
                                           class_label='Expert_label')
    try:
        valid_meta, valid_features = ml.extract_features(valid_sample)
    except Exception:
        # NOTE(review): execution continues, so the training loop below
        # will fail on the undefined validation arrays.
        print("ML: there are no subjects with the label 'valid'!")
    else:
        valid_labels = valid_meta['Expert_label'].filled()
        print("ML: found a validation sample of %i subjects"
              % len(valid_meta))

    # ------------------------------------------------------------------
    # Require a minimum size training sample [Be reasonable, my good man!]
    # NOTE(review): the check uses the sample size *before* the on-prior
    # subjects were removed.
    # ------------------------------------------------------------------
    if len(train_sample) < 10000:
        print("ML: training sample is too small to be worth anything.")
        print("ML: Exiting MachineClassifier.py")
        sys.exit()
    else:
        print("ML: training sample is large enough to give it a shot.")

    # TODO: LOOP THROUGH DIFFERENT MACHINES?
    # 5/12/16 -- no... need to make THIS a class and create multiple
    # instances? Each one can be passed an instance of a machine?

    # Machine can be trained to maximize/minimize different metrics
    # (ACC, completeness, purity, etc. Have a list of acceptable ones.)
    # Minimize a Loss function (KNC doesn't have a loss fcn).
    for metric in eval_metrics:

        # REGISTER Machine Classifier
        # Construct machine name --> Machine+Metric
        machine = 'RF'
        Name = machine + '_' + metric

        # Register an Agent for this Machine if the bureau has none yet.
        # This "Agent" doesn't behave like a SW agent... at least not yet
        try:
            MLbureau.member[Name]
        except KeyError:
            MLbureau.member[Name] = swap.Agent_ML(Name, metric)

        MLagent = MLbureau.member[Name]

        # --------------------------------------------------------------
        # TRAIN THE MACHINE; EVALUATE ON VALIDATION SAMPLE
        # --------------------------------------------------------------
        # Now we run the machine -- need cross validation on whatever size
        # training sample we have ..
        # Fixed until we build in other machine options.
        # Search max_features from sqrt(n_features) up to n_features and
        # max_depth from 2 to 15.
        num_features = train_features.shape[1]
        min_features = int(round(np.sqrt(num_features)))
        params = {'max_features': np.arange(min_features, num_features + 1),
                  'max_depth': np.arange(2, 16)}

        # Create the model
        # for "estimator=XXX" all you need is an instance of a machine --
        # any scikit-learn machine will do. However, non-sklearn machines..
        # That will be a bit trickier! (i.e. Phil's conv-nets)
        general_model = GridSearchCV(estimator=RF(n_estimators=30),
                                     param_grid=params, n_jobs=-1,
                                     error_score=0, scoring=metric, cv=cv)

        # Train the model -- k-fold cross validation is embedded
        print("ML: Searching the hyperparameter space for values that "
              "optimize the %s." % metric)
        trained_model = general_model.fit(train_features, train_labels)

        MLagent.model = trained_model

        # Test "accuracy" (metric of choice) on validation sample
        score = trained_model.score(valid_features, valid_labels)

        # Fraction of positive training labels; float() avoids the integer
        # (floor) division the classic "/" performs on two integers.
        ratio = float(np.sum(train_labels == 1)) / len(train_labels)

        MLagent.record_training(
            model_described_by=trained_model.best_estimator_,
            with_params=trained_model.best_params_,
            trained_on=len(train_features),
            with_ratio=ratio,
            at_time=start_time,
            with_train_score=trained_model.best_score_,
            and_valid_score=score)

        fps, tps, thresh = mtrx.roc_curve(
            valid_labels, trained_model.predict_proba(valid_features)[:, 1])

        metric_list = compute_binary_metrics(fps, tps)
        ACC, TPR, FPR, FNR, TNR, PPV, FDR, FOR, NPV = metric_list

        MLagent.record_validation(accuracy=ACC, recall=TPR, precision=PPV,
                                  false_pos=FPR, completeness_f=TNR,
                                  contamination_f=NPV)

        # --------------------------------------------------------------
        # IF TRAINED MACHINE PREDICTS WELL ON VALIDATION ....
        # --------------------------------------------------------------
        if MLagent.is_trained(metric):
            print("ML: %s has successfully trained and will be applied "
                  "to the test sample." % Name)

            # Retrieve the test sample.
            # Notes on test sample:
            #   The test sample will, in real life, be those subjects for
            #   which we don't have an answer a priori. However, for now,
            #   this sample is how we will judge, in part, the performance
            #   of the overall method. As such, we only include those
            #   subjects which have GZ2 labels in the Main Sample.
            test_sample = storage.fetch_subsample(sample_type='test',
                                                  class_label='GZ2_label')

            try:
                test_meta, test_features = ml.extract_features(test_sample)
            except Exception:
                print("ML: there are no subjects with the label 'test'!")
                print("ML: which means there's nothing more to do!")
            else:
                print("ML: found test sample of %i subjects"
                      % len(test_meta))

                # ------------------------------------------------------
                # APPLY MACHINE TO TEST SAMPLE
                # ------------------------------------------------------
                predictions = MLagent.model.predict(test_features)
                probabilities = MLagent.model.predict_proba(test_features)

                print("ML: %s has finished predicting labels for the test "
                      "sample." % Name)
                print("ML: Generating performance report on the test "
                      "sample:")

                test_labels = test_meta['GZ2_label'].filled()
                print(mtrx.classification_report(test_labels, predictions))

                test_accuracy = mtrx.accuracy_score(test_labels,
                                                    predictions)
                test_precision = mtrx.precision_score(test_labels,
                                                      predictions)
                test_recall = mtrx.recall_score(test_labels, predictions)

                MLagent.record_evaluation(accuracy_score=test_accuracy,
                                          precision_score=test_precision,
                                          recall_score=test_recall,
                                          at_time=start_time)

                # ------------------------------------------------------
                # Save the predictions and probabilities to a new pickle
                test_meta['predictions'] = predictions
                test_meta['probability_of_smooth'] = probabilities[:, 1]

                filename = (tonights.parameters['trunk'] + '_' + Name
                            + '.pickle')
                swap.write_pickle(test_meta, filename)

                # Disabled draft (never executed): insert subjects the
                # machine is confident about into the SWAP collection.
                # It refers to `s` and `probabitilies`, which do not
                # exist, so it needs rework before being enabled.
                #
                # for thing, pred, p in zip(test_meta, predictions,
                #                           probabitilies):
                #     # IF MACHINE P >= THRESHOLD, INSERT INTO SWAP
                #     # COLLECTION
                #     if (p >= threshold) or (1-p >= threshold):
                #         print "BOOM! WE'VE GOT A MACHINE-CLASSIFIED SUBJECT:"
                #         print "Probability:", p
                #         # Initialize the subject in SWAP Collection
                #         ID = thing['asset_id']
                #         sample.member[ID] = swap.Subject(
                #             ID, str(s['SDSS_id']),
                #             location=s['external_ref'])
                #         sample.member[ID].retiredby = 'machine'
                #         # Flag subject as 'INACTIVE' / 'DETECTED' /
                #         # 'REJECTED'
                #         if p >= threshold:
                #             sample.member[str(s['id'])].state = 'inactive'
                #         elif 1-p >= threshold:
                #             sample.member[str(s['id'])].status = 'rejected'

    # If it hasn't been done already, save the current directory
    # ------------------------------------------------------------------
    tonights.parameters['dir'] = (os.getcwd() + '/'
                                  + tonights.parameters['trunk'])
    if not os.path.exists(tonights.parameters['dir']):
        os.makedirs(tonights.parameters['dir'])

    # Repickle everything and point the configuration at the new files
    # ------------------------------------------------------------------
    if tonights.parameters['repickle']:

        new_samplefile = swap.get_new_filename(tonights.parameters,
                                               'collection')
        print("ML: saving SWAP subjects to " + new_samplefile)
        swap.write_pickle(sample, new_samplefile)
        tonights.parameters['samplefile'] = new_samplefile

        new_samplefile = swap.get_new_filename(tonights.parameters,
                                               'MLcollection')
        print("ML: saving test sample subjects to " + new_samplefile)
        swap.write_pickle(MLsample, new_samplefile)
        tonights.parameters['MLsamplefile'] = new_samplefile

        new_bureaufile = swap.get_new_filename(tonights.parameters,
                                               'bureau', 'ML')
        print("ML: saving MLbureau to " + new_bureaufile)
        swap.write_pickle(MLbureau, new_bureaufile)
        tonights.parameters['MLbureaufile'] = new_bureaufile

        metadatafile = swap.get_new_filename(tonights.parameters,
                                             'metadata')
        print("ML: saving metadata to " + metadatafile)
        swap.write_pickle(storage, metadatafile)
        tonights.parameters['metadatafile'] = metadatafile

    # UPDATE CONFIG FILE with pickle filenames, dir/trunk, and (maybe) new
    # day
    # ------------------------------------------------------------------
    configfile = config.replace('startup', 'update')

    # Random_file needs updating, else we always start from the same random
    # state when update.config is reread!
    random_state = np.random.get_state()
    with open(tonights.parameters['random_file'], "w") as random_file:
        cPickle.dump(random_state, random_file)

    swap.write_config(configfile, tonights.parameters)

    return
def MachineClassifier(options, args):
    """
    NAME
        MachineClassifier.py

    PURPOSE
        Machine learning component of Galaxy Zoo Express

        Read in a training sample generated by human users (which have
        previously been analyzed by SWAP).
        Learn on the training sample and monitor progress.
        Once "fully trained", apply learned model to test sample.

    COMMENTS
        Lots I'm sure.

    FLAGS
        -h            Print this message
        -c            config file name

    INPUTS
        options       parsed command-line options; only the `configfile`
                      attribute is read
        args          positional arguments; when non-empty, args[0] is the
                      config file name and wins over options.configfile

    OUTPUTS
        Returns None.  When no config file is given, this message is
        printed and nothing else happens.  Otherwise, for every metric in
        the config's 'evaluation_metrics' a random forest is grid-searched,
        recorded on its machine agent and, if the agent reports itself
        trained, applied to the test sample (predictions are written to
        <cwd>/<survey>_<start>_<agent name>.fits).  Finally the bureau and
        metadata are repickled (if 'repickle' is set), the numpy random
        state is saved and the config is rewritten under the name obtained
        by replacing 'startup' with 'update' in the config file name.

        Calls sys.exit() when training features cannot be extracted, when
        the training sample has fewer than 10000 subjects, or when a
        machine is about to be trained without a validation sample.
    """

    # ------------------------------------------------------------------
    #    LOAD CONFIG FILE PARAMETERS
    # ------------------------------------------------------------------
    # A config file given as a positional argument wins over the -c flag.
    if (len(args) >= 1) or (options.configfile):
        config = args[0] if args else options.configfile
        print(swap.doubledashedline)
        print(swap.ML_hello)
        print(swap.doubledashedline)
        print("ML: taking instructions from %s" % config)
    else:
        print(MachineClassifier.__doc__)
        return

    tonights = swap.Configuration(config)

    # Restore the pickled random state so consecutive runs continue one
    # random sequence instead of restarting it.
    # NOTE(review): unpickling executes arbitrary code; 'random_file' must
    # only ever point at a file written by this pipeline.
    with open(tonights.parameters['random_file'], "r") as random_file:
        random_state = cPickle.load(random_file)
    np.random.set_state(random_state)

    # Stamped on training/evaluation records and on retired subjects.
    timestamp = tonights.parameters['start']

    # Machine threshold (used to make retirement decisions)
    threshold = tonights.parameters['machine_threshold']
    prior = tonights.parameters['prior']

    # List of evaluation metrics; one machine is trained per metric
    eval_metrics = tonights.parameters['evaluation_metrics']

    # How much cross-validation should we do?
    cv = tonights.parameters['cross_validation']

    survey = tonights.parameters['survey']

    # To generate training labels based on the subject probability,
    # we need to know what should be considered the positive label:
    # i.e., GZ2 has labels (in metadatafile) Smooth = 1, Feat = 0
    # Doing a Smooth or Not run, the positive label is 1
    # Doing a Featured or Not run, the positive label is 0
    pos_label = tonights.parameters['positive_label']

    # Morphology columns handed to ml.extract_features for every sample.
    feature_keys = ('M20_corr', 'C_corr', 'E', 'A_corr', 'G_corr')

    # ------------------------------------------------------------------
    # Read in the metadata for all subjects
    storage = swap.read_pickle(tonights.parameters['metadatafile'],
                               'metadata')

    # Quick fix (MB 10/27/16): older metadata pickles lack the
    # 'GZ2_raw_combo' column, so graft it on and save the pickle back.
    if 'GZ2_raw_combo' not in storage.subjects.colnames:
        gz2_metadata = Table.read('metadata_ground_truth_labels.fits')
        storage.subjects['GZ2_raw_combo'] = gz2_metadata['GZ2_raw_combo']
        swap.write_pickle(storage, tonights.parameters['metadatafile'])

    subjects = storage.subjects

    # ------------------------------------------------------------------
    # The project collection (shared between SWAP/Machine) is not loaded
    # here; only the ML bureau (history for Machines) is read.
    MLbureau = swap.read_pickle(tonights.parameters['MLbureaufile'],
                                'bureau')

    # ------------------------------------------------------------------
    #                 FETCH TRAINING & VALIDATION SAMPLES
    # ------------------------------------------------------------------
    # Notes about the training sample:
    # this will select only those which have my morphology measured for
    # them AND which have "ground truth" according to GZ2.
    # Eventually we could open this up to include the ~10k that aren't in
    # the GZ Main Sample but I think, for now, we should reduce ourselves
    # to this stricter sample so that we always have back-up "truth" for
    # each galaxy.
    train_sample = storage.fetch_subsample(sample_type='train',
                                           class_label='GZ2_raw_combo')

    try:
        train_meta, train_features = ml.extract_features(
            train_sample, keys=list(feature_keys))
        original_length = len(train_meta)
    except TypeError:
        print("ML: can't extract features from subsample.")
        print("ML: Exiting MachineClassifier.py")
        sys.exit()

    # TODO: consider making this part of SWAP's duties?
    # 5/18/16: Only use those subjects which are no longer on the prior
    off_the_fence = np.where(train_meta['SWAP_prob'] != prior)
    train_meta = train_meta[off_the_fence]
    train_features = train_features[off_the_fence]
    train_labels = np.array([pos_label if p > prior else 1 - pos_label
                             for p in train_meta['SWAP_prob']])

    shortened_length = len(train_meta)
    print("ML: found a training sample of %i subjects" % shortened_length)
    removed = original_length - shortened_length
    print("ML: %i subjects removed to create balanced training sample"
          % removed)

    valid_sample = storage.fetch_subsample(sample_type='valid',
                                           class_label='Expert_label')
    try:
        valid_meta, valid_features = ml.extract_features(
            valid_sample, keys=list(feature_keys))
    except Exception:
        print("ML: there are no subjects with the label 'valid'!")
        # Remember the failure; it is acted upon just before training.
        valid_features = valid_labels = None
    else:
        valid_labels = valid_meta['Expert_label'].filled()
        print("ML: found a validation sample of %i subjects"
              % len(valid_meta))

    # ------------------------------------------------------------------
    # Require a minimum size training sample [Be reasonable, my good man!]
    # ------------------------------------------------------------------
    # NOTE(review): this tests the unfiltered sample, not the subjects
    # left after dropping those still on the prior -- confirm intended.
    if len(train_sample) < 10000:
        print("ML: training sample is too small to be worth anything.")
        print("ML: Exiting MachineClassifier.py")
        sys.exit()

    print("ML: training sample is large enough to give it a shot.")

    # TODO: LOOP THROUGH DIFFERENT MACHINES?
    # 5/12/16 -- no... need to make THIS a class and create multiple
    #            instances? Each one can be passed an instance of a machine?

    # Machine can be trained to optimize different metrics
    # (ACC, completeness, purity, etc. Have a list of acceptable ones.)
    # Minimize a Loss function.
    for metric in eval_metrics:

        if valid_features is None:
            # Every machine is scored on the validation sample.  Stop
            # before the expensive grid search instead of failing with a
            # NameError after it; keep a failing exit status.
            print("ML: can't evaluate a machine without a validation "
                  "sample.")
            print("ML: Exiting MachineClassifier.py")
            sys.exit(1)

        # REGISTER Machine Classifier
        # Construct machine name --> Machine+Metric
        machine = 'RF'
        Name = machine + '_' + metric

        # Register an Agent for this Machine unless the bureau has one.
        try:
            MLagent = MLbureau.member[Name]
        except KeyError:
            MLbureau.member[Name] = swap.Agent_ML(Name, metric)
            MLagent = MLbureau.member[Name]

        # --------------------------------------------------------------
        #     TRAIN THE MACHINE; EVALUATE ON VALIDATION SAMPLE
        # --------------------------------------------------------------
        # Now we run the machine -- need cross validation on whatever
        # size training sample we have ..

        # Fixed until we build in other machine options.
        # Need to dynamically determine appropriate parameters...
        num_features = train_features.shape[1]
        min_features = int(round(np.sqrt(num_features)))
        params = {'max_features': np.arange(min_features, num_features + 1),
                  'max_depth': np.arange(2, 16)}

        # Create the model
        # for "estimator=XXX" all you need is an instance of a machine --
        # any scikit-learn machine will do. However, non-sklearn machines..
        # That will be a bit trickier! (i.e. Phil's conv-nets)
        general_model = GridSearchCV(estimator=RF(n_estimators=30),
                                     param_grid=params, n_jobs=31,
                                     error_score=0, scoring=metric, cv=cv)

        # Train the model -- k-fold cross validation is embedded
        print("ML: Searching the hyperparameter space for values that "
              "optimize the %s." % metric)
        trained_model = general_model.fit(train_features, train_labels)

        MLagent.model = trained_model

        # Test accuracy (metric of choice) on validation sample
        valid_score = trained_model.score(valid_features, valid_labels)

        # Fraction of positive training labels.  float() matters: under
        # Python 2 int/int floors, which made this 0 nearly every time.
        ratio = float(np.sum(train_labels == pos_label)) / len(train_labels)

        MLagent.record_training(
            model_described_by=trained_model.best_estimator_,
            with_params=trained_model.best_params_,
            trained_on=len(train_features),
            with_ratio=ratio,
            at_time=timestamp,
            with_train_score=trained_model.best_score_,
            and_valid_score=valid_score)

        valid_prob_thresh = \
            trained_model.predict_proba(valid_features)[:, pos_label]
        fps, tps, thresh = mtrx.roc_curve(valid_labels, valid_prob_thresh,
                                          pos_label=pos_label)

        metric_list = compute_binary_metrics(fps, tps)
        ACC, TPR, FPR, FNR, TNR, PPV, FDR, FOR, NPV = metric_list

        MLagent.record_validation(accuracy=ACC, recall=TPR, precision=PPV,
                                  false_pos=FPR, completeness_f=TNR,
                                  contamination_f=NPV)

        # --------------------------------------------------------------
        # IF TRAINED MACHINE PREDICTS WELL ON VALIDATION ....
        # --------------------------------------------------------------
        if MLagent.is_trained(metric) or MLagent.trained:
            print("ML: %s has successfully trained and will be applied "
                  "to the test sample." % Name)

            # Notes on test sample:
            # The test sample will, in real life, be those subjects for
            # which we don't have an answer a priori. However, for now,
            # this sample is how we will judge, in part, the performance
            # of the overall method. As such, we only include those
            # subjects which have GZ2 labels in the Main Sample.
            test_sample = storage.fetch_subsample(
                sample_type='test', class_label='GZ2_raw_combo')

            try:
                test_meta, test_features = ml.extract_features(
                    test_sample, keys=list(feature_keys))
            except Exception:
                print("ML: there are no subjects with the label 'test'!")
                print("ML: Either there is nothing more to do or there is "
                      "a BIG mistake...")
            else:
                print("ML: found test sample of %i subjects"
                      % len(test_meta))

                # ------------------------------------------------------
                #                 APPLY MACHINE TO TEST SAMPLE
                # ------------------------------------------------------
                predictions = MLagent.model.predict(test_features)
                probabilities = \
                    MLagent.model.predict_proba(test_features)[:, pos_label]

                print("ML: %s has finished predicting labels for the test "
                      "sample." % Name)
                print("ML: Generating performance report on the test "
                      "sample:")

                test_labels = test_meta['GZ2_raw_combo'].filled()
                print(mtrx.classification_report(test_labels, predictions))

                test_accuracy = mtrx.accuracy_score(test_labels,
                                                    predictions)
                test_precision = mtrx.precision_score(
                    test_labels, predictions, pos_label=pos_label)
                test_recall = mtrx.recall_score(
                    test_labels, predictions, pos_label=pos_label)

                MLagent.record_evaluation(accuracy_score=test_accuracy,
                                          precision_score=test_precision,
                                          recall_score=test_recall,
                                          at_time=timestamp)

                # ------------------------------------------------------
                # Save the predictions and probabilities to a FITS table
                test_meta['predictions'] = predictions
                test_meta['machine_probability'] = probabilities

                # Output goes to the current working directory rather
                # than the standard <cwd>/<trunk> directory.
                tonights.parameters['trunk'] = \
                    survey + '_' + tonights.parameters['start']
                tonights.parameters['dir'] = os.getcwd()
                filename = (tonights.parameters['dir'] + '/' +
                            tonights.parameters['trunk'] + '_' + Name +
                            '.fits')
                test_meta.write(filename)

                count = 0
                # NOTE: nothing increments this any more (the SWAP
                # collection is not loaded), so 0 is always reported.
                noSWAP = 0
                for sub, prob in zip(test_meta, probabilities):

                    # IF MACHINE P >= THRESHOLD (for either class),
                    # RETIRE THE SUBJECT
                    # ----------------------------------------------
                    if (prob >= threshold) or (1 - prob >= threshold):
                        # Flip the set label in the metadata file --
                        # don't want to use this as a training sample!
                        idx = np.where(
                            subjects['asset_id'] == sub['asset_id'])
                        storage.subjects['MLsample'][idx] = 'mclass'
                        storage.subjects['retired_date'][idx] = timestamp
                        count += 1

                # NOTE(review): "90%" is hard-coded in this message; the
                # cut actually applied is the configured machine_threshold.
                print("MC: Machine classifed {0} subjects with >= 90% confidence".format(count))
                print("ML: Of those, {0} had never been seen by SWAP".format(noSWAP))

    # Set trunk/dir here as well: the branch above is skipped whenever no
    # machine was applied to the test sample.
    tonights.parameters['trunk'] = survey + '_' + tonights.parameters['start']
    tonights.parameters['dir'] = os.getcwd()
    if not os.path.exists(tonights.parameters['dir']):
        os.makedirs(tonights.parameters['dir'])

    # Repickle the ML bureau and the metadata (the SWAP collection is not
    # touched by this script).
    # ------------------------------------------------------------------
    if tonights.parameters['repickle']:

        new_bureaufile = swap.get_new_filename(tonights.parameters,
                                               'bureau', 'ML')
        print("ML: saving MLbureau to " + new_bureaufile)
        swap.write_pickle(MLbureau, new_bureaufile)
        tonights.parameters['MLbureaufile'] = new_bureaufile

        metadatafile = swap.get_new_filename(tonights.parameters,
                                             'metadata')
        print("ML: saving metadata to " + metadatafile)
        swap.write_pickle(storage, metadatafile)
        tonights.parameters['metadatafile'] = metadatafile

    # UPDATE CONFIG FILE with pickle filenames, dir/trunk, and (maybe)
    # new day
    # ------------------------------------------------------------------
    configfile = config.replace('startup', 'update')

    # Random_file needs updating, else we always start from the same
    # random state when update.config is reread!
    random_state = np.random.get_state()
    with open(tonights.parameters['random_file'], "w") as random_file:
        cPickle.dump(random_state, random_file)

    swap.write_config(configfile, tonights.parameters)

    return