Example #1
0
def compute_proba(dataFrame):
    """Return the random-forest probabilities for the objects in ``dataFrame``.

    The five magnitudes returned by ``magsExtFromFlux(dataFrame)`` are turned
    into the feature table expected by the forest with ``colors(...)``; the
    forest stored in ``rf_model_dr9_final.npz`` (under the module-level
    ``pathToRF`` directory) is then loaded and evaluated.

    Parameters
    ----------
    dataFrame :
        Catalogue handed unchanged to ``magsExtFromFlux``.
        NOTE(review): the expected columns are defined by that helper, which
        is not visible from this function.

    Returns
    -------
    The value returned by ``myRF.predict_proba()`` for the feature table.
    """
    object_g, object_r, object_z, object_W1, object_W2 = magsExtFromFlux(
        dataFrame)
    attributes = colors(object_g.size, nfeatures, object_g, object_r, object_z,
                        object_W1, object_W2)

    rf_fileName = pathToRF + '/rf_model_dr9_final.npz'

    logger.info('Load Random Forest: ')
    logger.info('    * ' + rf_fileName)
    # Logging uses %-style lazy formatting: extra positional arguments need
    # matching placeholders, otherwise the record fails to format.
    logger.info('Random Forest over: %d objects\n', len(attributes))
    logger.info('    * start RF calculation...')
    myrf = myRF(attributes, pathToRF, numberOfTrees=500, version=2)
    myrf.loadForest(rf_fileName)

    return myrf.predict_proba()
Example #2
0
def isSV0_QSO(gflux=None,
              rflux=None,
              zflux=None,
              w1flux=None,
              w2flux=None,
              objtype=None,
              release=None,
              dchisq=None,
              maskbits=None,
              primary=None):
    """Select early-SV quasar targets with two random forests.

    Parameters
    ----------
    gflux, rflux, zflux, w1flux, w2flux :
        Fluxes forwarded to ``_getColors`` to build the forest features.
    objtype :
        Morphological type, forwarded to ``_psflike``.
    release :
        Accepted for signature compatibility; not used in this function.
    dchisq :
        Array whose last axis is indexed at 0 and 1 to build the relaxed
        morphology cut.
    maskbits :
        Optional bit mask; objects with any of bits 1, 10, 12 or 13 set are
        rejected.
    primary :
        Optional boolean array; defaults to all ``True``.

    - See :func:`~desitarget.cuts.set_target_bits` for other parameters.

    Returns
    -------
    :class:`array_like`
        ``True`` for objects that pass the quasar color/morphology/logic cuts.
        A single ``numpy.bool_`` is returned when ``rflux`` holds one entry.

    Notes
    -----
    - This version (06/05/19) is version 68 on `the SV wiki`_.
    """
    # Default BRICK_PRIMARY: every object is primary.
    if primary is None:
        primary = np.ones_like(gflux, dtype=bool)

    # Feature table for the forests: 11 attributes per object.
    n_features = 11
    n_objects = rflux.size

    # phot_ok should guarantee positive g, r, z, W1 and W2 (see _getColors).
    # The north->south photometric shift is deliberately not applied for SV0.
    features, rmag, phot_ok = _getColors(n_objects, n_features, gflux, rflux,
                                         zflux, w1flux, w2flux)
    rmag = np.atleast_1d(rmag)

    # Cheap magnitude preselection (17.5 < r < 23.0) before running forests.
    r_faint = 23.0
    r_bright = 17.5
    selected = (rmag < r_faint) & (rmag > r_bright) & phot_ok & primary

    # Relaxed morphology cut. Sources with dchisq[..., 0] == 0 are never
    # targeted, so they get a huge ratio instead of a divide-by-zero.
    d1, d0 = dchisq[..., 1], dchisq[..., 0]
    huge_ratio = np.array(1e9 + np.zeros_like(d0))
    nonzero_d0 = d0 != 0
    chi2_ratio = np.divide(d1 - d0, d0, out=huge_ratio, where=nonzero_d0)
    nearly_psf = chi2_ratio < 0.02
    selected &= _psflike(objtype) | nearly_psf

    # Drop objects flagged BRIGHT, BAILOUT, GALAXY or CLUSTER (bits 1, 10,
    # 12, 13).
    if maskbits is not None:
        for bit in (1, 10, 12, 13):
            selected &= (maskbits & (1 << bit)) == 0

    # The result starts as the preselection and is refined by the forests.
    qso = selected.copy()

    if np.any(selected):

        from desitarget.myRF import myRF

        # Restrict every per-object quantity to the preselected objects.
        sel_features = features[selected]
        sel_rmag = rmag[selected]
        sel_index = np.arange(0, n_objects, dtype=np.int64)[selected]

        # Forests trained on DR7, shipped with the package data.
        rf_dir = resource_filename('desitarget', 'data')
        main_forest_file = rf_dir + '/rf_model_dr7.npz'
        highz_forest_file = rf_dir + '/rf_model_dr7_HighZ.npz'

        # Each myRF instance keeps its own copy of the feature table.
        main_forest = myRF(sel_features, rf_dir, numberOfTrees=500, version=2)
        highz_forest = myRF(sel_features, rf_dir, numberOfTrees=500, version=2)
        main_forest.loadForest(main_forest_file)
        highz_forest.loadForest(highz_forest_file)
        main_proba = main_forest.predict_proba()
        highz_proba = highz_forest.predict_proba()

        # Magnitude-dependent probability thresholds (SV-specific values).
        main_cut = np.where(sel_rmag > 20.0,
                            0.65 - (sel_rmag - 20.0) * 0.075, 0.65)
        very_faint = sel_rmag > 22.0
        main_cut[very_faint] = 0.50 - 0.25 * (sel_rmag[very_faint] - 22.0)
        highz_cut = np.where(sel_rmag > 20.5,
                             0.5 - (sel_rmag - 20.5) * 0.025, 0.5)

        # An object is kept when either forest accepts it.
        qso[sel_index] = (main_proba >= main_cut) | (highz_proba >= highz_cut)

    # Scalar call: hand back a numpy.bool_ rather than a one-element array.
    return qso[0] if n_objects == 1 else qso
Example #3
0
def isELG_randomforest(pcut=None,
                       gflux=None,
                       rflux=None,
                       zflux=None,
                       w1flux=None,
                       w2flux=None,
                       primary=None,
                       training='spectro'):
    """Target Definition of ELG using a random forest returning a boolean array.

    Args:
        pcut: float or array_like
            Probability threshold; an object is selected when its forest
            probability is strictly greater than ``pcut``. Must be supplied:
            the default ``None`` cannot be compared with the probabilities.
        gflux, rflux, zflux, w1flux, w2flux: array_like
            The flux in nano-maggies of g, r, z, W1, and W2 bands.
        primary: array_like or None
            If given, the BRICK_PRIMARY column of the catalogue.
        training: str
            ``'spectro'`` uses ``rf_model_dr3_elg.npz`` (spectroscopic
            redshifts, VIPERS and DEEP2) from the package ``sandbox/data``
            directory; ``'photo'`` uses ``rf_model_dr3_elg_HSC_V2.npz`` (HSC
            photometric redshifts), which must be in the current directory.

    Returns:
        (mask, prob): ``mask`` is array_like, True if and only if the object
            is an ELG target; ``prob`` is the forest probability per object
            (0 for objects that fail the preselection).

    Raises:
        ValueError: if ``training`` is neither ``'spectro'`` nor ``'photo'``
            and at least one object passes the preselection.
    """
    # ----- ELG
    if primary is None:
        primary = np.ones_like(gflux, dtype='?')

    # build variables for random forest
    nfeatures = 11  # number of variables in random forest
    nbEntries = rflux.size
    colors, g, r, DECaLSOK = _getColors(nbEntries, nfeatures, gflux, rflux,
                                        zflux, w1flux, w2flux)

    # Preselection to speed up the process, store the indexes
    rMax = 23.5  # r<23.5
    gMax = 23.8  # g<23.8 proxy of OII flux

    preSelection = np.where((r < rMax) & (g < gMax) & DECaLSOK)
    # Fancy indexing already returns a copy, no explicit copy() is needed.
    colorsReduced = colors[preSelection]
    colorsReducedIndex = np.arange(0, nbEntries, dtype=np.int64)[preSelection]

    # Path to random forest files
    pathToRF = resource_filename('desitarget', "sandbox/data")

    # Compute random forest probability
    from desitarget.myRF import myRF
    prob = np.zeros(nbEntries)

    # Test the NUMBER of preselected objects: `.any()` on the index array is
    # False when the only preselected object has index 0.
    if colorsReducedIndex.size > 0:
        if training == 'spectro':
            # Training with VIPERS and DEEP2 Fields 2,3,4
            print(' === Trained with DEEP2 and VIPERS with spectro z == ')
            fileName = pathToRF + '/rf_model_dr3_elg.npz'
            rf = myRF(colorsReduced, pathToRF, numberOfTrees=200, version=1)
        elif training == 'photo':
            # Training with HSC with photometric redshifts; the model file is
            # not shipped with the package and is read from the current
            # directory.
            pathToRF = '.'
            print(
                ' === Trained with HSC with photo z, you need locally /global/project/projectdirs/desi/target/RF_files/rf_model_dr3_elg_HSC_V2.npz nersc file '
            )
            fileName = pathToRF + '/rf_model_dr3_elg_HSC_V2.npz'
            rf = myRF(colorsReduced, pathToRF, numberOfTrees=500, version=2)
        else:
            raise ValueError(
                "training must be 'spectro' or 'photo', got {!r}".format(
                    training))

        rf.loadForest(fileName)
        # add random forest probability to preselected objects
        prob[colorsReducedIndex] = rf.predict_proba()

    elg = primary.copy()
    elg &= r < rMax
    elg &= g < gMax
    elg &= DECaLSOK

    if nbEntries == 1:  # for call of a single object
        elg &= prob[0] > pcut
    else:
        elg &= prob > pcut

    return elg, prob
Example #4
0
def isQSO_randomforest(gflux=None,
                       rflux=None,
                       zflux=None,
                       w1flux=None,
                       w2flux=None,
                       objtype=None,
                       deltaChi2=None,
                       primary=None):
    """Target Definition of QSO using a random forest returning a boolean array.

    Args:
        gflux, rflux, zflux, w1flux, w2flux: array_like
            The flux in nano-maggies of g, r, z, W1, and W2 bands.
        objtype: array_like or None
            If given, the TYPE column of the Tractor catalogue.
        deltaChi2: array_like or None
            If given, difference of chi2 between PSF and SIMP morphology;
            only objects with ``deltaChi2 > 30`` are kept.
        primary: array_like or None
            If given, the BRICK_PRIMARY column of the catalogue.

    Returns:
        mask : array_like. True if and only if the object is a QSO
            target.

    """
    # ----- Quasars
    if primary is None:
        primary = np.ones_like(gflux, dtype='?')

    # build variables for random forest
    nfeatures = 11  # number of variables in random forest
    nbEntries = rflux.size
    colors, r, DECaLSOK = _getColors(nbEntries, nfeatures, gflux, rflux, zflux,
                                     w1flux, w2flux)

    # Preselection to speed up the process, store the indexes.
    # np.where turns the Boolean mask into integer indexes (indexing with the
    # raw Boolean used to raise DeprecationWarnings).
    rMax = 22.7  # r<22.7
    preSelection = np.where((r < rMax) & _psflike(objtype) & DECaLSOK)
    # Fancy indexing already returns a copy, no explicit copy() is needed.
    colorsReduced = colors[preSelection]
    colorsReducedIndex = np.arange(0, nbEntries, dtype=np.int64)[preSelection]

    # Path to random forest files
    pathToRF = resource_filename('desitarget', "data")

    # Compute random forest probability
    from desitarget.myRF import myRF
    prob = np.zeros(nbEntries)

    # Test the NUMBER of preselected objects: `.any()` on the index array is
    # False when the only preselected object has index 0.
    if colorsReducedIndex.size > 0:
        rf = myRF(colorsReduced, pathToRF, numberOfTrees=200, version=1)
        fileName = pathToRF + '/rf_model_dr3.npz'
        rf.loadForest(fileName)
        # add random forest probability to preselected objects
        prob[colorsReducedIndex] = rf.predict_proba()

    # define pcut, relaxed cut for faint objects
    pcut = np.where(r > 20.0, 0.95 - (r - 20.0) * 0.08, 0.95)

    qso = primary.copy()
    qso &= r < rMax
    qso &= DECaLSOK

    if objtype is not None:
        qso &= _psflike(objtype)

    if deltaChi2 is not None:
        qso &= deltaChi2 > 30.

    if nbEntries == 1:  # for call of a single object
        qso &= prob[0] > pcut
    else:
        qso &= prob > pcut

    return qso
Example #5
0
def _write_debug_table(path, g, r, z, W1, W2, color_table):
    """Write the magnitudes and the feature table to ``path`` as a FITS table."""
    columns = pyfits.ColDefs([
        pyfits.Column(name='r', format='E', array=r),
        pyfits.Column(name='g', format='E', array=g),
        pyfits.Column(name='z', format='E', array=z),
        pyfits.Column(name='W1', format='E', array=W1),
        pyfits.Column(name='W2', format='E', array=W2),
        pyfits.Column(name='colors', format='11E', array=color_table),
    ])
    # NOTE(review): `clobber` is the keyword this file already uses; recent
    # astropy.io.fits releases expect `overwrite` instead -- confirm which
    # library `pyfits` is bound to before changing it.
    pyfits.BinTableHDU.from_columns(columns).writeto(path, clobber=True)


def train_mva_decals(Step, debug=False):
    """Train, test or export the DR3 star/QSO classifiers.

    All input and output files live in the current directory.

    Parameters
    ----------
    Step : str
        ``'train'``: read ``star_dr3_nora36-42_normalized.fits`` and
        ``qso_dr3_nora36-42.fits``, fit a random forest and an AdaBoost
        classifier (each once on all 11 features and once on the first 9)
        and dump the four models as ``*.pkl.gz`` files.
        ``'test'``: apply the four dumped models to
        ``Stripe82_dr3_decals.fits`` and write the catalogue plus four
        probability columns to ``Stripe82_dr3_decals_newTraining.fits``.
        ``'extract_myRF'``: reload ``rf_model_dr3.pkl.gz``, dump it as
        ``RF/bdt.pkl`` and save it through ``myRF`` as ``rf_model_dr3.npz``.
    debug : bool
        With ``Step='train'``, also write ``debug_qso.fits`` and
        ``debug_star.fits`` and print the feature tables.

    Notes
    -----
    The function never returns: every path ends in ``sys.exit``. The exit
    status is 0 after a completed step and non-zero for an unknown ``Step``
    or an empty training sample.
    """
    # number of variables
    nfeatures = 11

    # -----------------------------------------------
    #   files to be used for training and for tests
    # -----------------------------------------------

    # On NERSC the inputs are kept under
    # /global/project/projectdirs/desi/target/qso_training/
    modelDir = './'
    dataDir = './'

    # region of control   36<ra<42 is removed
    starTraining = dataDir + 'star_dr3_nora36-42_normalized.fits'  # dr3
    qsoTraining = dataDir + 'qso_dr3_nora36-42.fits'  # dr3

    # Test over stripe 82
    fileName = 'Stripe82_dr3_decals'  # dr3
    objectTesting = dataDir + fileName + '.fits'
    outputFile = './' + fileName + '_newTraining.fits'

    if Step not in ('train', 'test', 'extract_myRF'):
        print('Unknown option')
        # Non-zero status so that calling scripts can detect the failure.
        sys.exit(1)

    # ------------------------
    if Step == 'train':
        star0 = pyfits.open(starTraining, memmap=True)[1].data
        star0_g, star0_r, _, _, _ = magsExtFromFlux(star0)
        # NOTE(review): stars are filtered on g > 0 while quasars below are
        # filtered on r > 0 -- confirm the asymmetry is intended.
        star = star0[(star0_g > 0) & (star0_r < 22.7)]

        qso0 = pyfits.open(qsoTraining, memmap=True)[1].data
        _, qso0_r, _, _, _ = magsExtFromFlux(qso0)
        qso = qso0[(qso0_r > 0) & (qso0_r < 22.7)]

        # ----------------------------------------
        #   prepare arrays for Machine Learning
        # ----------------------------------------
        nqso = len(qso)
        nstar = len(star)
        print('qsos in file:', nqso)
        print('star in file:', nstar)

        if nqso == 0 or nstar == 0:
            sys.exit('Empty training sample: both quasars and stars are '
                     'required')

        qso_g, qso_r, qso_z, qso_W1, qso_W2 = magsExtFromFlux(qso)
        qso_colors = colors(nqso, nfeatures, qso_g, qso_r, qso_z, qso_W1,
                            qso_W2)
        if debug:
            _write_debug_table('debug_qso.fits', qso_g, qso_r, qso_z, qso_W1,
                               qso_W2, qso_colors)
            print(' Debug qsos')
            print(qso_colors)

        star_g, star_r, star_z, star_W1, star_W2 = magsExtFromFlux(star)
        star_colors = colors(nstar, nfeatures, star_g, star_r, star_z,
                             star_W1, star_W2)
        if debug:
            _write_debug_table('debug_star.fits', star_g, star_r, star_z,
                               star_W1, star_W2, star_colors)
            print(' Debug stars')
            print(star_colors)

        # final arrays: quasars first (target 1), then stars (target 0)
        data = np.zeros((nqso + nstar, nfeatures))
        target = np.zeros(nqso + nstar)
        data[0:nqso, :] = qso_colors[0:nqso, :]
        data[nqso:nqso + nstar, :] = star_colors[0:nstar, :]
        target[0:nqso] = 1

        # -----------------------
        #   Start the training
        # -----------------------

        print('training over ', nqso, ' qsos and ', nstar, ' stars')

        # The seed is reset before every fit so each model is reproducible
        # independently of the others.
        print('with random Forest')
        np.random.seed(0)
        rf = RandomForestClassifier(200)
        rf.fit(data, target)
        joblib.dump(rf, modelDir + 'rf_model_dr3.pkl.gz', compress=9)
        # Second model uses only the first 9 features ("normag" files).
        np.random.seed(0)
        rf.fit(data[:, 0:9], target)
        joblib.dump(rf, modelDir + 'rf_model_normag_dr3.pkl.gz', compress=9)

        print('with adaBoost')
        ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=8),
                                 algorithm="SAMME.R",
                                 n_estimators=200)
        np.random.seed(0)
        ada.fit(data, target)
        joblib.dump(ada, modelDir + 'adaboost_model_dr3.pkl.gz', compress=9)
        np.random.seed(0)
        ada.fit(data[:, 0:9], target)
        joblib.dump(ada,
                    modelDir + 'adaboost_model_normag_dr3.pkl.gz',
                    compress=9)

        sys.exit()

    # Both remaining steps work on the test catalogue.
    catalog = pyfits.open(objectTesting, memmap=True)[1].data
    object_g, object_r, object_z, object_W1, object_W2 = magsExtFromFlux(
        catalog)
    object_colors = colors(len(catalog), nfeatures, object_g, object_r,
                           object_z, object_W1, object_W2)

    # -----------------------
    if Step == 'test':
        print('Check over a test sample')

        # -----------------------
        print('random Forest over ', len(object_colors), ' objects ')

        rf = joblib.load(modelDir + 'rf_model_dr3.pkl.gz')
        pobject_rf = rf.predict_proba(object_colors)

        rf = joblib.load(modelDir + 'rf_model_normag_dr3.pkl.gz')
        pobject_rf_ns = rf.predict_proba(object_colors[:, 0:9])

        # -----------------------
        print('adaBoost over ', len(object_colors), ' objects ')

        ada = joblib.load(modelDir + 'adaboost_model_dr3.pkl.gz')
        pobject_ada = ada.predict_proba(object_colors)

        ada = joblib.load(modelDir + 'adaboost_model_normag_dr3.pkl.gz')
        pobject_ada_ns = ada.predict_proba(object_colors[:, 0:9])

        # -----------------------
        print('updating fits file')
        print('create fit file with', len(catalog), ' objects')

        # Column 1 of predict_proba is the probability of target == 1 (QSO).
        new_cols = pyfits.ColDefs([
            pyfits.Column(name='PADA_new', format='E',
                          array=pobject_ada[:, 1]),
            pyfits.Column(name='PADAnomagr_new',
                          format='E',
                          array=pobject_ada_ns[:, 1]),
            pyfits.Column(name='PRANDF_new',
                          format='E',
                          array=pobject_rf[:, 1]),
            pyfits.Column(name='PRANDFnomagr_new',
                          format='E',
                          array=pobject_rf_ns[:, 1]),
        ])
        hduNew = pyfits.BinTableHDU.from_columns(catalog.columns + new_cols)
        hduNew.writeto(outputFile, clobber=True)

        sys.exit()

    # ----------------------------
    # Step == 'extract_myRF'
    print('Produce the random forest with our own persistency')

    rf = joblib.load(modelDir + 'rf_model_dr3.pkl.gz')

    newDir = modelDir + 'RF/'
    print('dump all files in ', newDir)
    if not os.path.isdir(newDir):
        os.makedirs(newDir)
    joblib.dump(rf, newDir + 'bdt.pkl')

    nTrees = 200
    myrf = myRF(object_colors, newDir, numberOfTrees=nTrees, version=2)
    myrf.saveForest(modelDir + 'rf_model_dr3.npz')

    sys.exit()