def compute_proba(dataFrame):
    """Compute random-forest probabilities for the objects in ``dataFrame``.

    Magnitudes are obtained from ``magsExtFromFlux``, converted to the
    feature array with ``colors`` and classified with the forest stored in
    ``rf_model_dr9_final.npz`` under the module-level ``pathToRF``.

    Relies on the module-level names ``nfeatures``, ``pathToRF`` and
    ``logger``.

    Parameters
    ----------
    dataFrame
        Input table handed unchanged to ``magsExtFromFlux``; its expected
        columns are defined by that helper (not visible here).

    Returns
    -------
    The value returned by ``myRF.predict_proba()`` for the computed
    features.
    """
    object_g, object_r, object_z, object_W1, object_W2 = magsExtFromFlux(
        dataFrame)
    attributes = colors(object_g.size, nfeatures, object_g, object_r,
                        object_z, object_W1, object_W2)

    rf_fileName = pathToRF + '/rf_model_dr9_final.npz'
    logger.info('Load Random Forest: ')
    logger.info(' * ' + rf_fileName)
    # Logging calls take a %-format string plus arguments (not print-style
    # positional pieces); extra arguments without matching placeholders make
    # the record fail to format.
    logger.info('Random Forest over: %d objects\n', len(attributes))
    logger.info(' * start RF calculation...')

    myrf = myRF(attributes, pathToRF, numberOfTrees=500, version=2)
    myrf.loadForest(rf_fileName)
    return myrf.predict_proba()
def isSV0_QSO(gflux=None, rflux=None, zflux=None, w1flux=None, w2flux=None,
              objtype=None, release=None, dchisq=None, maskbits=None,
              primary=None):
    """Select early-SV (SV0) quasar targets with two random forests.

    Parameters
    ----------
    - See :func:`~desitarget.cuts.set_target_bits` for other parameters.

    Returns
    -------
    :class:`array_like`
        Boolean mask that is ``True`` for objects passing the quasar
        color/morphology/logic cuts. When ``rflux.size`` is 1 the single
        element is returned instead of an array.

    Notes
    -----
    - This version (06/05/19) is version 68 on `the SV wiki`_.
    - ``release`` is accepted but not used by this function.
    """
    # BRICK_PRIMARY: without an explicit mask every object counts as primary.
    if primary is None:
        primary = np.ones_like(gflux, dtype=bool)

    # Inputs of the random forest: 11 attributes describe each object.
    n_features = 11
    n_objects = rflux.size

    # ADM the northern photometry is intentionally NOT shifted to the
    # ADM southern system: SV0 need not correspond exactly to SV.
    # ADM phot_ok should ensure g, r, z, W1 and W2 are all > 0.
    features, rmag, phot_ok = _getColors(n_objects, n_features, gflux, rflux,
                                         zflux, w1flux, w2flux)
    rmag = np.atleast_1d(rmag)

    # ADM cheap preselection (17.5 < r < 23.0, different for SV) so the
    # ADM forests only run on plausible candidates.
    r_faint = 23.0
    r_bright = 17.5
    candidate = (rmag < r_faint) & (rmag > r_bright) & phot_ok & primary

    # ADM relaxed morphology cut for SV. Sources with dchisq[..., 0] == 0 are
    # ADM never targeted, so they keep the huge filler value (1e9) instead of
    # ADM triggering a divide-by-zero.
    dchi1 = dchisq[..., 1]
    dchi0 = dchisq[..., 0]
    filler = np.array(np.zeros_like(dchi0) + 1e9)
    rel_dchi = np.divide(dchi1 - dchi0, dchi0, out=filler, where=dchi0 != 0)
    candidate &= _psflike(objtype) | (rel_dchi < 0.02)

    # ADM reject objects in masks: the BRIGHT, BAILOUT, GALAXY and CLUSTER
    # ADM bits (1, 10, 12, 13) must all be unset.
    if maskbits is not None:
        for bit in (1, 10, 12, 13):
            candidate &= ((maskbits & 2**bit) == 0)

    # The result starts out as the preselection and is then overwritten, for
    # the preselected objects only, with the random-forest decision.
    qso = np.copy(candidate)

    if np.any(candidate):
        from desitarget.myRF import myRF

        # Restrict the data to the preselected objects and remember where
        # they sit in the full arrays.
        sel_features = features[candidate]
        sel_rmag = rmag[candidate]
        sel_index = np.arange(n_objects, dtype=np.int64)[candidate]

        # ADM forests trained over DR7, shipped with the package data.
        rf_dir = resource_filename('desitarget', 'data')

        # The feature array is duplicated inside each "myRF" instance.
        forest = myRF(sel_features, rf_dir, numberOfTrees=500, version=2)
        forest_highz = myRF(sel_features, rf_dir, numberOfTrees=500,
                            version=2)
        forest.loadForest(rf_dir + '/rf_model_dr7.npz')
        forest_highz.loadForest(rf_dir + '/rf_model_dr7_HighZ.npz')

        proba = forest.predict_proba()
        proba_highz = forest_highz.predict_proba()

        # Magnitude-dependent probability thresholds (all different for
        # SV/main).
        very_faint = sel_rmag > 22.0
        threshold = np.where(sel_rmag > 20.0,
                             0.65 - (sel_rmag - 20.0) * 0.075, 0.65)
        threshold[very_faint] = 0.50 - 0.25 * (sel_rmag[very_faint] - 22.0)
        threshold_highz = np.where(sel_rmag > 20.5,
                                   0.5 - (sel_rmag - 20.5) * 0.025, 0.5)

        # An object is kept when either forest accepts it.
        qso[sel_index] = \
            (proba >= threshold) | (proba_highz >= threshold_highz)

    # A call for a single object yields "numpy.bool_" rather than
    # "numpy.ndarray".
    return qso[0] if n_objects == 1 else qso
def isELG_randomforest(pcut=None, gflux=None, rflux=None, zflux=None,
                       w1flux=None, w2flux=None, primary=None,
                       training='spectro'):
    """Target Definition of ELG using a random forest returning a boolean array.

    Args:
        pcut: float or array_like
            Probability threshold; an object is selected only if its random
            forest probability is strictly greater than ``pcut``. It is
            compared directly against the probabilities, so a value must be
            supplied by the caller.
        gflux, rflux, zflux, w1flux, w2flux: array_like
            The flux in nano-maggies of g, r, z, W1, and W2 bands.
        primary: array_like or None
            If given, the BRICK_PRIMARY column of the catalogue.
        training: str
            ``'spectro'`` or ``'photo'``; selects which forest is loaded
            (see below).

    Returns:
        mask : array_like. True if and only if the object is an ELG target.
        prob : array of length ``rflux.size`` holding the random forest
            probability of each preselected object and 0 elsewhere.

    Raises:
        ValueError: if objects were preselected and ``training`` is neither
            ``'spectro'`` nor ``'photo'``.

    Three RF
    - Training with spectro redshift (VIPERS and DEEP2) : rf_model_dr3_elg.npz
    - Training with photo z HSC : rf_model_dr3_elg_HSC.npz
    - Training with photo z HSC and depth=15 and max leaves = 2000 :
      rf_model_dr3_elg_HSC_V2.npz (the one used for ``training='photo'``)
    """
    # ----- ELG
    if primary is None:
        primary = np.ones_like(gflux, dtype='?')

    # build variables for random forest
    nfeatures = 11  # number of variables in random forest
    nbEntries = rflux.size
    colors, g, r, DECaLSOK = _getColors(nbEntries, nfeatures, gflux, rflux,
                                        zflux, w1flux, w2flux)

    # Preselection to speed up the process, store the indexes
    rMax = 23.5  # r<23.5
    gMax = 23.8  # g<23.8 proxy of OII flux
    preSelection = np.where((r < rMax) & (g < gMax) & DECaLSOK)
    # Indexing with integer positions already yields a fresh array, so no
    # explicit copy of ``colors`` is needed first.
    colorsReduced = colors[preSelection]
    colorsReducedIndex = np.arange(nbEntries, dtype=np.int64)[preSelection]

    # Path to random forest files
    pathToRF = resource_filename('desitarget', "sandbox/data")

    # Compute random forest probability
    from desitarget.myRF import myRF
    prob = np.zeros(nbEntries)

    # Test the NUMBER of preselected objects. ``.any()`` on the index array
    # is False when the only preselected index is 0, which silently skipped
    # the forest (always, for a single-object call).
    if colorsReducedIndex.size > 0:
        if training == 'spectro':
            # Training with VIPERS and DEEP2 Fields 2,3,4
            print(' === Trained with DEEP2 and VIPERS with spectro z == ')
            fileName = pathToRF + '/rf_model_dr3_elg.npz'
            rf = myRF(colorsReduced, pathToRF, numberOfTrees=200, version=1)
        elif training == 'photo':
            # Training with HSC with photometric redshifts; the model file is
            # not shipped with the package and is read from the current
            # directory.
            pathToRF = '.'
            print(
                ' === Trained with HSC with photo z, you need locally /global/project/projectdirs/desi/target/RF_files/rf_model_dr3_elg_HSC_V2.npz nersc file '
            )
            fileName = pathToRF + '/rf_model_dr3_elg_HSC_V2.npz'
            rf = myRF(colorsReduced, pathToRF, numberOfTrees=500, version=2)
        else:
            # Previously this fell through to an UnboundLocalError on ``rf``.
            raise ValueError(
                "training must be 'spectro' or 'photo', got {!r}".format(
                    training))

        rf.loadForest(fileName)
        objects_rf = rf.predict_proba()
        # add random forest probability to preselected objects
        prob[colorsReducedIndex] = objects_rf

    elg = primary.copy()
    elg &= r < rMax
    elg &= g < gMax
    elg &= DECaLSOK

    if nbEntries == 1:  # for call of a single object
        elg &= prob[0] > pcut
    else:
        elg &= prob > pcut

    return elg, prob
def isQSO_randomforest(gflux=None, rflux=None, zflux=None, w1flux=None,
                       w2flux=None, objtype=None, deltaChi2=None,
                       primary=None):
    """Target Definition of QSO using a random forest returning a boolean array.

    Args:
        gflux, rflux, zflux, w1flux, w2flux: array_like
            The flux in nano-maggies of g, r, z, W1, and W2 bands.
        objtype: array_like or None
            If given, the TYPE column of the Tractor catalogue.
        deltaChi2: array_like or None
            If given, difference of chi2 between PSF and SIMP morphology;
            only objects with ``deltaChi2 > 30`` are kept.
        primary: array_like or None
            If given, the BRICK_PRIMARY column of the catalogue.

    Returns:
        mask : array_like. True if and only if the object is a QSO target.
    """
    # ----- Quasars
    if primary is None:
        primary = np.ones_like(gflux, dtype='?')

    # build variables for random forest
    nfeatures = 11  # number of variables in random forest
    nbEntries = rflux.size
    colors, r, DECaLSOK = _getColors(nbEntries, nfeatures, gflux, rflux,
                                     zflux, w1flux, w2flux)

    # Preselection to speed up the process, store the indexes
    rMax = 22.7  # r<22.7
    # ADM this previously had no np.where but was flagging
    # ADM DeprecationWarnings on indexing a Boolean, so the Boolean is
    # ADM converted to integer positions via np.where.
    preSelection = np.where((r < rMax) & _psflike(objtype) & DECaLSOK)
    # Indexing with integer positions already yields a fresh array, so no
    # explicit copy of ``colors`` is needed first.
    colorsReduced = colors[preSelection]
    colorsReducedIndex = np.arange(nbEntries, dtype=np.int64)[preSelection]

    # Path to random forest files
    pathToRF = resource_filename('desitarget', "data")

    # Compute random forest probability
    from desitarget.myRF import myRF
    prob = np.zeros(nbEntries)

    # Test the NUMBER of preselected objects. ``.any()`` on the index array
    # is False when the only preselected index is 0, which silently skipped
    # the forest (always, for a single-object call).
    if colorsReducedIndex.size > 0:
        rf = myRF(colorsReduced, pathToRF, numberOfTrees=200, version=1)
        fileName = pathToRF + '/rf_model_dr3.npz'
        rf.loadForest(fileName)
        objects_rf = rf.predict_proba()
        # add random forest probability to preselected objects
        prob[colorsReducedIndex] = objects_rf

    # define pcut, relaxed cut for faint objects
    pcut = np.where(r > 20.0, 0.95 - (r - 20.0) * 0.08, 0.95)

    qso = primary.copy()
    qso &= r < rMax
    qso &= DECaLSOK

    if objtype is not None:
        qso &= _psflike(objtype)

    if deltaChi2 is not None:
        qso &= deltaChi2 > 30.

    if nbEntries == 1:  # for call of a single object
        qso &= prob[0] > pcut
    else:
        qso &= prob > pcut

    return qso
def train_mva_decals(Step, debug=False):
    """Train, test or export the DR3 QSO/star classifiers.

    Args:
        Step: str
            ``'train'``: fit a random forest and an AdaBoost classifier on
            the QSO and star training files (once on all 11 features, once
            on the first 9 only) and dump the four models with joblib.
            ``'test'``: apply the four dumped models to the Stripe 82 test
            file and write a copy of it with the QSO probabilities appended.
            ``'extract_myRF'``: reload the dumped random forest and save it
            through ``myRF.saveForest``.
            Any other value prints ``Unknown option`` and exits.
        debug: bool
            During ``'train'``, additionally write ``debug_qso.fits`` and
            ``debug_star.fits`` and print the computed color arrays.

    All input and output files are looked up in / written to the current
    directory. Every step finishes with ``sys.exit()``, so this function
    does not return normally.
    """
    # number of variables
    nfeatures = 11

    # -----------------------------------------------
    #    files to be used for training and for tests
    # -----------------------------------------------

    # files available on nersc under
    # /global/project/projectdirs/desi/target/qso_training/
    modelDir = './'
    dataDir = './'

    # region of control   36<ra<42 is removed
    starTraining = dataDir + 'star_dr3_nora36-42_normalized.fits'  # dr3
    qsoTraining = dataDir + 'qso_dr3_nora36-42.fits'  # dr3

    # Test over stripe 82
    fileName = 'Stripe82_dr3_decals'  # dr3
    objectTesting = dataDir + fileName + '.fits'
    outputFile = './' + fileName + '_newTraining.fits'

    if Step == 'train':
        star0 = pyfits.open(starTraining, memmap=True)[1].data
        star0_g, star0_r, star0_z, star0_W1, star0_W2 = magsExtFromFlux(star0)
        # Same r-band window as for the QSOs below; the lower bound was
        # previously applied to the g band, inconsistently with the QSO cut.
        star = star0[(star0_r > 0) & (star0_r < 22.7)]

        qso0 = pyfits.open(qsoTraining, memmap=True)[1].data
        qso0_g, qso0_r, qso0_z, qso0_W1, qso0_W2 = magsExtFromFlux(qso0)
        qso = qso0[(qso0_r > 0) & (qso0_r < 22.7)]

    elif Step in ('test', 'extract_myRF'):
        # Named ``test_objects`` rather than ``object`` to avoid shadowing
        # the builtin.
        test_objects = pyfits.open(objectTesting, memmap=True)[1].data
        object_g, object_r, object_z, object_W1, object_W2 = magsExtFromFlux(
            test_objects)
        nobjecttot = len(test_objects)
        object_colors = colors(nobjecttot, nfeatures, object_g, object_r,
                               object_z, object_W1, object_W2)

    else:
        print('Unknown option')
        sys.exit()

    # ------------------------
    if Step == 'train':

        # ----------------------------------------
        #   prepare arrays for Machine Learning
        # ----------------------------------------
        print('qsos in file:', len(qso))
        print('star in file:', len(star))

        nqso = len(qso)
        nstar = len(star)

        # Nothing to train on if either sample is empty.
        if nqso * nstar == 0:
            sys.exit()

        data = np.zeros((nqso + nstar, nfeatures))
        target = np.zeros(nqso + nstar)

        qso_g, qso_r, qso_z, qso_W1, qso_W2 = magsExtFromFlux(qso)
        qso_colors = colors(nqso, nfeatures, qso_g, qso_r, qso_z, qso_W1,
                            qso_W2)

        # NOTE(review): ``clobber`` is the legacy pyfits keyword; recent
        # astropy.io.fits releases expect ``overwrite`` - confirm against
        # the library imported as ``pyfits``.
        if debug:
            debug_qso_cols = pyfits.ColDefs([
                pyfits.Column(name='r', format='E', array=qso_r[:]),
                pyfits.Column(name='g', format='E', array=qso_g[:]),
                pyfits.Column(name='z', format='E', array=qso_z[:]),
                pyfits.Column(name='W1', format='E', array=qso_W1[:]),
                pyfits.Column(name='W2', format='E', array=qso_W2[:]),
                pyfits.Column(name='colors', format='11E',
                              array=qso_colors[:, :]),
            ])
            hduQso = pyfits.BinTableHDU.from_columns(debug_qso_cols)
            hduQso.writeto('debug_qso.fits', clobber=True)
            print(' Debug qsos')
            print(qso_colors)

        star_g, star_r, star_z, star_W1, star_W2 = magsExtFromFlux(star)
        star_colors = colors(nstar, nfeatures, star_g, star_r, star_z,
                             star_W1, star_W2)

        if debug:
            debug_star_cols = pyfits.ColDefs([
                pyfits.Column(name='r', format='E', array=star_r[:]),
                pyfits.Column(name='g', format='E', array=star_g[:]),
                pyfits.Column(name='z', format='E', array=star_z[:]),
                pyfits.Column(name='W1', format='E', array=star_W1[:]),
                pyfits.Column(name='W2', format='E', array=star_W2[:]),
                pyfits.Column(name='colors', format='11E',
                              array=star_colors[:, :]),
            ])
            hduStar = pyfits.BinTableHDU.from_columns(debug_star_cols)
            hduStar.writeto('debug_star.fits', clobber=True)
            print(' Debug stars')
            print(star_colors)

        # final arrays: QSOs first (target 1), then stars (target 0)
        data[0:nqso, :] = qso_colors[0:nqso, :]
        data[nqso:nqso + nstar, :] = star_colors[0:nstar, :]
        target[0:nqso] = 1
        target[nqso:nqso + nstar] = 0

        # -----------------------
        #  Start the training
        # -----------------------
        print('training over ', nqso, ' qsos and ', nstar, ' stars')

        # The seed is reset before every fit; each classifier is fitted
        # twice, on all features and on the first 9 columns only (the
        # "normag" models).
        print('with random Forest')
        np.random.seed(0)
        rf = RandomForestClassifier(200)
        rf.fit(data, target)
        joblib.dump(rf, modelDir + 'rf_model_dr3.pkl.gz', compress=9)
        np.random.seed(0)
        rf.fit(data[:, 0:9], target)
        joblib.dump(rf, modelDir + 'rf_model_normag_dr3.pkl.gz', compress=9)

        print('with adaBoost')
        ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=8),
                                 algorithm="SAMME.R", n_estimators=200)
        np.random.seed(0)
        ada.fit(data, target)
        joblib.dump(ada, modelDir + 'adaboost_model_dr3.pkl.gz', compress=9)
        np.random.seed(0)
        ada.fit(data[:, 0:9], target)
        joblib.dump(ada, modelDir + 'adaboost_model_normag_dr3.pkl.gz',
                    compress=9)

        sys.exit()

    # -----------------------
    if Step == 'test':
        print('Check over a test sample')

        # -----------------------
        print('random Forest over ', len(object_colors), ' objects ')

        rf = joblib.load(modelDir + 'rf_model_dr3.pkl.gz')
        pobject_rf = rf.predict_proba(object_colors)

        rf = joblib.load(modelDir + 'rf_model_normag_dr3.pkl.gz')
        pobject_rf_ns = rf.predict_proba(object_colors[:, 0:9])

        # -----------------------
        print('adaBoost over ', len(object_colors), ' objects ')

        ada = joblib.load(modelDir + 'adaboost_model_dr3.pkl.gz')
        pobject_ada = ada.predict_proba(object_colors)

        ada = joblib.load(modelDir + 'adaboost_model_normag_dr3.pkl.gz')
        pobject_ada_ns = ada.predict_proba(object_colors[:, 0:9])

        # -----------------------
        print('updating fits file')

        # NOTE(review): the HDU built here is not used afterwards; the call
        # is kept in case its construction is relied upon - confirm.
        pyfits.BinTableHDU(data=test_objects)
        print('create fit file with', len(test_objects), ' objects')

        # Column 1 of predict_proba is the probability of target == 1 (QSO).
        orig_cols = test_objects.columns
        new_cols = pyfits.ColDefs([
            pyfits.Column(name='PADA_new', format='E',
                          array=pobject_ada[:, 1]),
            pyfits.Column(name='PADAnomagr_new', format='E',
                          array=pobject_ada_ns[:, 1]),
            pyfits.Column(name='PRANDF_new', format='E',
                          array=pobject_rf[:, 1]),
            pyfits.Column(name='PRANDFnomagr_new', format='E',
                          array=pobject_rf_ns[:, 1]),
        ])
        hduNew = pyfits.BinTableHDU.from_columns(orig_cols + new_cols)
        hduNew.writeto(outputFile, clobber=True)

        sys.exit()

    # ----------------------------
    if Step == 'extract_myRF':
        print('Produce the random forest with our own persistency')

        rf = joblib.load(modelDir + 'rf_model_dr3.pkl.gz')

        newDir = modelDir + 'RF/'
        print('dump all files in ', newDir)
        os.makedirs(newDir, exist_ok=True)

        # The uncompressed dump is written into ``newDir``, which is also
        # the directory handed to myRF below.
        joblib.dump(rf, newDir + 'bdt.pkl')

        # 200 trees, matching RandomForestClassifier(200) in the train step.
        myrf = myRF(object_colors, newDir, numberOfTrees=200, version=2)
        myrf.saveForest(modelDir + 'rf_model_dr3.npz')

        sys.exit()