Esempio n. 1
0
def model(foldername):
    """
    Train one LSTM classifier per (noise level, planet fraction) pair and save it.

    foldername: name of the folder under "models/" to save the models in

    For every combination, mock data are drawn with retr_datamock, each
    training sequence is cut into sliding windows of 2 * numbneig samples,
    every window is labelled 1 if a 1 occurs in the matching slice of the
    output sequence (0 otherwise), and a freshly built network is fitted and
    written to "models/<foldername>/nois_<nois>_perct_<perct>".

    Returns None.
    """
    # number of training epochs (previously defined but bypassed by a literal)
    numbepocs = 20
    # full grid: [1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 5e-1]
    noises = [1e-6, 1e-3, 1e-1]
    perctplan = [.5]
    # half-width of the sliding input window
    numbneig = 4

    for nois in noises:
        for perct in perctplan:
            inpttran, outptran, _ = retr_datamock(numbplan=int(perct * 100),
                                                  numbnois=int(
                                                      (1 - perct) * 100),
                                                  nois=nois,
                                                  lstm=True)
            # NOTE(review): the result of this second draw was never used by
            # the original code either; the call is kept so that the sequence
            # of calls into retr_datamock is unchanged -- confirm whether an
            # evaluation step was intended here.
            retr_datamock(numbplan=5, numbnois=0, nois=nois, lstm=True)

            modl = Sequential()
            modl.add(LSTM(256))
            modl.add(Dense(1, activation='sigmoid'))
            modl.compile(loss='binary_crossentropy',
                         optimizer="adam",
                         metrics=['accuracy'])

            # windowed inputs and labels, one entry per training sequence
            updtinpt = []
            updtoutp = []
            for i, seqn in enumerate(inpttran):
                currinpt = []
                curroutp = []
                for a in range(numbneig, len(seqn) - numbneig + 1):
                    currinpt.append(seqn[a - numbneig:a + numbneig])
                    # label slice is centred on a and spans numbneig / 2 on
                    # each side (expression kept exactly as in the original)
                    labls = outptran[i][int(a - numbneig / 2):int(a +
                                                                  numbneig / 2 +
                                                                  1)]
                    curroutp.append([1] if 1 in labls else [0])
                updtinpt.append(currinpt)
                updtoutp.append(curroutp)

            # NOTE(review): only the windows of the first sequence are used
            # for fitting, as in the original -- confirm this is intended.
            modl.fit(updtinpt[0],
                     updtoutp[0],
                     epochs=numbepocs,
                     batch_size=10)

            modelname = "models/" + foldername + "/nois_" + str(
                nois) + "_perct_" + str(perct)
            modl.save(modelname)
Esempio n. 2
0
def gen_mockdata(datatype):
    """
    Retrieve a data set of the requested kind and save it as a .npz file.

    datatype is a string:
    'here' : mock data returned by exopmain.retr_datamock;
    'ete6' : data returned by exopmain.retr_dataete6.

    The generation settings (numbplan, numbnois, numbtime, dept, nois,
    numbdata) and the path prefix path_namer_str are read from names defined
    outside this function.

    The arrays are passed positionally to np.savez, so they are stored under
    the default keys arr_0, arr_1, ... in the order
    (inptraww, outp, peri) for 'here' and
    (time, inptraww, outp, tici, peri) for 'ete6'.

    Returns the final pathname (so if needed you can print, or assign to variable)

    Raises ValueError for any other datatype. 'tess' has no implementation
    here; it used to fall through silently, saving nothing and returning a
    path to a file that was never written.
    """
    # fail early, before any data are generated, on an unsupported kind
    if datatype not in ('here', 'ete6'):
        raise ValueError(
            "unsupported datatype %r; expected 'here' or 'ete6'" %
            (datatype, ))

    if datatype == 'here':
        inptraww, outp, peri = exopmain.retr_datamock(numbplan=numbplan,
                                                      numbnois=numbnois,
                                                      numbtime=numbtime,
                                                      dept=dept,
                                                      nois=nois)

        pathname = path_namer_str + '_here.npz'
        np.savez(pathname, inptraww, outp, peri)

    else:
        time, inptraww, outp, tici, peri = exopmain.retr_dataete6(
            nois=nois, numbdata=numbdata)

        pathname = path_namer_str + '_ete6.npz'
        np.savez(pathname, time, inptraww, outp, tici, peri)

    return pathname
Esempio n. 3
0
def mock_data_compute_cfms(encoding_dim,
                           no_filters,
                           kernel_size,
                           pool_size,
                           dept,
                           nois,
                           numbtime,
                           no_iterations=5):
    """
    Repeat no_iterations times:
        get mock data from exopmain.retr_datamock (100 + 100 samples,
        time series of length numbtime)
        build and train a CNN autoencoder on it
        encode the light curves and cluster the encodings with
        find_km_clusters
        compute the confusion matrix of labels versus clusters

    Returns a pair: the element-wise mean of the collected confusion
    matrices, and their element-wise standard deviation divided by
    sqrt(no_iterations).
    """
    collected = []
    for _iteration in range(no_iterations):
        mock = exopmain.retr_datamock(numbplan=100,
                                      numbnois=100,
                                      numbtime=numbtime,
                                      dept=dept,
                                      nois=nois)
        curves, labels, _ = mock

        # add a trailing axis of length one for the convolutional model
        nrow, ncol = curves.shape
        curves = np.reshape(curves, (nrow, ncol, 1))

        encoder, autoencoder = model_cnn_autoencoder(ncol, no_filters,
                                                     kernel_size, pool_size,
                                                     encoding_dim, 'relu')
        train_cnn_autoencoder(curves, autoencoder)

        clusters = find_km_clusters(encoder.predict(curves))
        collected.append(confusion_matrix(labels, clusters))

    mean_cfm = np.mean(collected, axis=0)
    spread = np.std(collected, axis=0)
    scaled_spread = spread / np.sqrt(no_iterations)
    return mean_cfm, scaled_spread
Esempio n. 4
0
File: main.py Progetto: tdaylan/cthc
def expl(
        strguser='******',
        strgtopo='fcon',
        zoomtype='locl',
        phastype='flbn',
        datatype='simpmock',
):
    '''
    Function to explore the effect of hyper-parameters (and data properties for mock data) on binary classification metrics

    For every run and for every variable in gdat.listvalu that has more than
    one candidate value, all variables are first set to the middle entry of
    their lists; the variable of interest is then set to each of its values
    in turn, the data are retrieved (and folded and binned when phastype is
    'flbn'), the inputs are plotted, a network is assembled with appdcon1 and
    appdfcon and handed to retr_metr, and the last-epoch metrics are written
    to a FITS file. After all runs the metrics are plotted against each
    explored variable.

    strguser, strgtopo, zoomtype : accepted for interface compatibility; not
        read inside this function (the zoom type that is used comes from
        gdat.listvalu['zoomtype']).
    phastype : 'raww' to train on the raw light curves, 'flbn' to train on
        folded and binned ones.
    datatype : 'simpmock', 'ete6' or 'tess'; selects the data retrieval call.

    Returns None. Plots and FITS files are written under
    $CTHC_DATA_PATH/inpt/.
    '''

    # global object that will hold global variables
    gdat = gdatstrt()

    gdat.datatype = datatype

    # Boolean flag to use light curves folded and binned by SPOC
    gdat.boolspocflbn = datatype == 'tess'

    # fraction of data samples that will be used to test the model
    gdat.fractest = 0.1

    # number of epochs
    gdat.numbepoc = 20

    # number of runs for each configuration in order to determine the statistical uncertainty
    gdat.numbruns = 1

    gdat.indxepoc = np.arange(gdat.numbepoc)
    gdat.indxruns = np.arange(gdat.numbruns)

    # a dictionary to hold the variable values for which the training will be repeated
    # NOTE: the order in which keys are first inserted fixes the variable
    # index o used in the output file names, so the early placeholder
    # assignments below are kept even though they are overwritten later.
    gdat.listvalu = {}
    # temp
    gdat.listvalu['dept'] = 1 - np.array([1e-3, 3e-3, 1e-2, 3e-2, 1e-1])

    gdat.listvalu['zoomtype'] = ['locl', 'glob']

    gdat.numbtime = 10000

    if gdat.datatype == 'simpmock':

        ## generative parameters of mock data
        # full grid: np.array([1e1, 3e1, 1000, 3e2, 1e3]).astype(int)
        gdat.listvalu['numbphas'] = np.array([2000]).astype(int)
        # temp
        # full grid: np.array([1e-3, 3e-3, 3e-1, 3e-2, 1e-1])
        gdat.listvalu['dept'] = np.array([3e-1])
        # full grid: np.array([1e-3, 3e-3, 1e-2, 3e-2, 1e-1])
        gdat.listvalu['nois'] = np.array([1e-3, 1e-1, 1e1])  # SNR
        # full grid: np.array([3e3, 1e4 , 10, 1e5, 3e5]).astype(int)
        gdat.listvalu['numbrele'] = np.array([300]).astype(int)
        # full grid: np.array([3e3, 1e4 , 100, 1e5, 3e5]).astype(int)
        gdat.listvalu['numbirre'] = np.array([300]).astype(int)

    else:
        gdat.listvalu['numbphas'] = np.array([1e1, 3e1, 20076, 3e2,
                                              1e3]).astype(int)
        gdat.listvalu['numbrele'] = np.array([100]).astype(int)
        gdat.listvalu['numbirre'] = np.array([100]).astype(int)

    ## hyperparameters
    ### data augmentation
    # full grid: ['locl', 'glob']
    gdat.listvalu['zoomtype'] = ['glob']
    ### neural network
    #### batch size
    # full grid: [16, 32, 64, 128, 256]
    gdat.listvalu['numbdatabtch'] = [64]
    #### number of FC layers
    # full grid: [1, 2, 3, 4, 5]
    gdat.listvalu['numblayr'] = [1]
    #### number of dimensions in each layer
    # full grid: [32, 64, 128, 256, 512]
    gdat.listvalu['numbdimslayr'] = [128]
    #### fraction of dropout in in each layer
    # full grid: [0., 0.15, 0.3, 0.45, 0.6]
    gdat.listvalu['fracdrop'] = [0.3]

    # list of strings holding the names of the variables
    # (materialised so that it is a real list on Python 3 as well)
    gdat.liststrgvarb = list(gdat.listvalu.keys())

    gdat.numbvarb = len(gdat.liststrgvarb)  # number of variables
    gdat.indxvarb = np.arange(
        gdat.numbvarb)  # array of all indexes to get any variable

    gdat.numbvalu = np.empty(gdat.numbvarb, dtype=int)
    gdat.indxvalu = [[] for o in gdat.indxvarb]
    for o, strgvarb in enumerate(gdat.liststrgvarb):
        gdat.numbvalu[o] = len(gdat.listvalu[strgvarb])
        gdat.indxvalu[o] = np.arange(gdat.numbvalu[o])

    # dictionary to hold the metrics resulting from the runs
    gdat.dictmetr = {}
    gdat.liststrgmetr = ['prec', 'accu', 'reca']
    gdat.listlablmetr = ['Precision', 'Accuracy', 'Recall']
    # NOTE(review): the two lists below are in opposite order ('vali' first
    # versus 'Training' first). listlablrtyp[r] is what labels the curves in
    # the metric plots; which of the two matches the layout returned by
    # retr_metr cannot be told from here -- confirm and align.
    gdat.liststrgrtyp = ['vali', 'tran']
    gdat.listlablrtyp = ['Training', 'Validation']
    gdat.numbrtyp = len(gdat.liststrgrtyp)
    gdat.indxrtyp = np.arange(gdat.numbrtyp)

    # entries start at -1, the sentinel that the plotting code below filters
    # out (the table used to be left uninitialised)
    for o, strgvarb in enumerate(gdat.liststrgvarb):
        gdat.dictmetr[strgvarb] = np.full(
            (2, 3, gdat.numbruns, gdat.numbvalu[o]), -1.)

    gdat.phastype = phastype

    ## time stamp string
    strgtimestmp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

    print('CtC explorer initialized at %s.' % strgtimestmp)

    ## path where plots will be generated
    pathplot = os.environ['CTHC_DATA_PATH'] + '/inpt/'
    # create the folder without going through a shell, so that paths with
    # spaces or shell metacharacters are handled correctly
    if not os.path.isdir(pathplot):
        os.makedirs(pathplot)
    print('Will generate plots in %s' % pathplot)

    # detect names of devices
    from tensorflow.python.client import device_lib
    listdictdevi = device_lib.list_local_devices()
    print('Names of the devices detected: ')
    for dictdevi in listdictdevi:
        print(dictdevi.name)

    # temp
    gdat.maxmindxvarb = 10

    # for each run
    for t in gdat.indxruns:

        print('Run index %d...' % t)
        # temp -- current implementation repeats running of the central point

        # for each variable
        for o, strgvarb in enumerate(gdat.liststrgvarb):

            if o == gdat.maxmindxvarb:
                break

            # nothing to explore for a variable with a single value
            if len(gdat.indxvalu[o]) == 1:
                continue

            print('Processing variable %s...' % strgvarb)

            # for each value
            for i in gdat.indxvalu[o]:

                strgconf = '%04d_%04d_%04d' % (t, o, i)
                pathsave = pathplot + 'save_metr_%s.fits' % strgconf
                # temp -- reading previously saved metrics is disabled
                if False and os.path.exists(pathsave):
                    print('Reading from %s...' % pathsave)
                    listhdun = ap.io.fits.open(pathsave)
                    metr = listhdun[0].data
                else:
                    # set every variable to the middle entry of its list ...
                    for strgvarbtemp in gdat.liststrgvarb:
                        indx = int(len(gdat.listvalu[strgvarbtemp]) / 2)
                        setattr(gdat, strgvarbtemp,
                                gdat.listvalu[strgvarbtemp][indx])
                    # ... then override the variable of interest
                    valu = gdat.listvalu[strgvarb][i]
                    setattr(gdat, strgvarb, valu)

                    if isinstance(valu, str):
                        print('Value: ' + valu)
                    else:
                        print('Value: %g' % valu)

                    for strgvarbtemp in gdat.liststrgvarb:
                        print(strgvarbtemp)
                        print(getattr(gdat, strgvarbtemp))

                    gdat.numbdata = gdat.numbrele + gdat.numbirre
                    gdat.fracrele = gdat.numbrele / float(gdat.numbdata)

                    gdat.indxphas = np.arange(gdat.numbphas)
                    gdat.indxdata = np.arange(gdat.numbdata)
                    gdat.indxlayr = np.arange(gdat.numblayr)

                    # number of test data samples
                    gdat.numbdatatest = int(gdat.numbdata * gdat.fractest)
                    # number of training data samples
                    gdat.numbdatatran = gdat.numbdata - gdat.numbdatatest

                    if datatype == 'simpmock':
                        gdat.inptraww, gdat.outp, gdat.peri = exopmain.retr_datamock(
                            numbplan=gdat.numbrele,
                            numbnois=gdat.numbirre,
                            numbtime=gdat.numbtime,
                            dept=gdat.dept,
                            nois=gdat.nois)
                        # the same evenly spaced time axis for every sample
                        gdat.time = np.tile(
                            np.linspace(0., (gdat.numbtime - 1) / 30. / 24.,
                                        gdat.numbtime), (gdat.numbdata, 1))
                        # legend strings: sample index followed by R where the
                        # output is 1 and I otherwise
                        gdat.legdoutp = []
                        for k in gdat.indxdata:
                            legd = '%d, ' % k
                            if gdat.outp[k] == 1:
                                legd += 'R'
                            else:
                                legd += 'I'
                            gdat.legdoutp.append(legd)

                    if datatype == 'ete6':
                        gdat.time, gdat.inptraww, gdat.outp, gdat.tici, gdat.peri = exopmain.retr_dataete6(
                            numbdata=gdat.numbdata, nois=gdat.nois)

                    if datatype == 'tess':
                        if gdat.boolspocflbn:
                            gdat.phas, gdat.inptflbn, gdat.outp, gdat.legdoutp, gdat.tici, gdat.itoi = exopmain.retr_datatess(
                                gdat.boolspocflbn)
                        else:
                            gdat.time, gdat.inptraww, gdat.outp, gdat.legdoutp, gdat.tici, gdat.itoi = exopmain.retr_datatess(
                                gdat.boolspocflbn)

                    if gdat.phastype == 'raww':
                        gdat.inpt = gdat.inptraww

                    if gdat.phastype == 'flbn':
                        if not gdat.boolspocflbn:
                            # fold and bin here, caching the result on disk
                            strgsave = '%s_%d_%s_%04d_%04d_%04d' % \
                                (datatype, np.log10(gdat.nois) + 5., gdat.zoomtype, gdat.numbphas, gdat.numbrele, gdat.numbirre)
                            pathsaveflbn = pathplot + 'save_flbn_%s' % strgsave + '.dat'
                            pathsavephas = pathplot + 'save_phas_%s' % strgsave + '.dat'
                            if not os.path.exists(pathsaveflbn):
                                gdat.inptflbn = np.empty(
                                    (gdat.numbdata, gdat.numbphas))
                                gdat.phas = np.empty(
                                    (gdat.numbdata, gdat.numbphas))
                                # temp
                                flux_err = np.zeros(gdat.numbtime) + 1e-2
                                # int() keeps the bin size an integer under
                                # true division as well
                                binsize = int(gdat.numbtime / gdat.numbphas)
                                for k in gdat.indxdata:
                                    lcurobjt = lightkurve.lightcurve.LightCurve(
                                        flux=gdat.inptraww[k, :],
                                        time=gdat.time[k, :],
                                        flux_err=flux_err,
                                        time_format='jd',
                                        time_scale='utc')

                                    lcurobjtfold = lcurobjt.fold(gdat.peri[k])
                                    lcurobjtflbn = lcurobjtfold.bin(
                                        binsize=binsize, method='mean')
                                    gdat.inptflbn[k, :] = lcurobjtflbn.flux
                                    gdat.phas[k, :] = lcurobjtflbn.time
                                    assert np.isfinite(
                                        gdat.inptflbn[k, :]).all()

                                print('Writing to %s...' % pathsaveflbn)
                                np.savetxt(pathsaveflbn, gdat.inptflbn)
                                np.savetxt(pathsavephas, gdat.phas)
                            else:
                                print('Reading from %s...' % pathsaveflbn)
                                gdat.inptflbn = np.loadtxt(pathsaveflbn)
                                gdat.phas = np.loadtxt(pathsavephas)
                        gdat.inpt = gdat.inptflbn

                    # plot
                    numbplotfram = 1
                    print('Making plots of the input...')
                    listphastype = ['flbn']
                    if not gdat.boolspocflbn:
                        listphastype += ['raww']
                    # strgphas is deliberately not called phastype so that the
                    # function argument is not shadowed
                    for strgphas in listphastype:
                        cntrplot = 0
                        for k in gdat.indxdata:
                            # only the first few samples are plotted
                            if k > 10:
                                break
                            if k % numbplotfram == 0:
                                _, axis = plt.subplots(figsize=(12, 6))
                            if gdat.outp[k] == 1:
                                colr = 'b'
                            else:
                                colr = 'r'
                            if strgphas == 'raww':
                                xdat = gdat.time[k, :]
                                ydat = gdat.inptraww[k, :]
                            if strgphas == 'flbn':
                                xdat = gdat.phas[k, :]
                                ydat = gdat.inptflbn[k, :]
                            axis.plot(xdat,
                                      ydat,
                                      marker='o',
                                      markersize=5,
                                      alpha=0.6,
                                      color=colr,
                                      ls='')
                            if k % numbplotfram == 0 or k == gdat.numbdata - 1:
                                plt.tight_layout()
                                if strgphas == 'raww':
                                    plt.xlabel('Time')
                                if strgphas == 'flbn':
                                    plt.xlabel('Phase')
                                plt.ylabel('Flux')
                                plt.legend()
                                path = pathplot + 'inpt%s_%04d_%s_%04d_%04d' % (
                                    strgphas, t, strgvarb, i,
                                    cntrplot) + '.png'
                                print('Writing to %s...' % path)
                                plt.savefig(path)
                                plt.close()
                                cntrplot += 1

                    # divide the data set into training and test data sets
                    gdat.inpttest = gdat.inpt[:gdat.numbdatatest]
                    gdat.outptest = gdat.outp[:gdat.numbdatatest]
                    gdat.inpttran = gdat.inpt[gdat.numbdatatest:]
                    gdat.outptran = gdat.outp[gdat.numbdatatest:]

                    gdat.modl = Sequential()

                    # construct the neural net
                    # add a CNN
                    appdcon1(gdat)

                    ## add the last output layer
                    appdfcon(gdat)

                    gdat.modl.compile(loss='binary_crossentropy',
                                      optimizer='sgd',
                                      metrics=['accuracy'])

                    # the graph gets its own path variable; reusing pathsave
                    # here made the FITS write below overwrite this PNG
                    pathmodlgrap = pathplot + 'modlgrap_%s.png' % strgconf
                    keras.utils.plot_model(gdat.modl, to_file=pathmodlgrap)

                    # temp -- this runs the central value redundantly and can be sped up by only running the central value once for all variables
                    # do the training for the specific value of the variable of interest
                    metr = retr_metr(gdat, i, strgvarb)

                    # save to the disk
                    hdun = ap.io.fits.PrimaryHDU(metr)
                    listhdun = ap.io.fits.HDUList([hdun])
                    listhdun.writeto(pathsave, overwrite=True)

                # keep the metrics of the last entry along the first axis of metr
                for r in range(2):
                    for l in range(3):
                        gdat.dictmetr[strgvarb][r, l, t, i] = metr[-1, r, l]

    alph = 0.5

    # axis labels of the explored variables; raw strings keep the LaTeX
    # backslashes without relying on invalid escape sequences
    dictlablvarb = {
        'numbphas': '$N_{time}$',
        'dept': r'$\delta$',
        'nois': r'$\sigma$',
        'numbdata': '$N_{data}$',
        'fracplan': '$f_{p}$',
        'numbdatabtch': '$N_{db}$',
        'numbdimslayr': '$N_{dens}$',
        'fracdrop': '$f_D$',
    }
    liststrgvarblogt = [
        'numbdata', 'numbphas', 'dept', 'nois', 'numbdimslayr', 'numbdatabtch'
    ]

    # plot the resulting metrics
    for o, strgvarb in enumerate(gdat.liststrgvarb):

        if o == gdat.maxmindxvarb:
            break

        if len(gdat.indxvalu[o]) == 1:
            continue

        # fall back to the variable name when no label is defined, instead of
        # failing on (or silently reusing) an unset local
        labl = dictlablvarb.get(strgvarb, strgvarb)

        for l, strgmetr in enumerate(gdat.liststrgmetr):
            _, axis = plt.subplots()

            for r in gdat.indxrtyp:
                yerr = np.zeros((2, gdat.numbvalu[o]))
                if r == 0:
                    colr = 'b'
                else:
                    colr = 'g'

                # mean over the runs with 5th to 95th percentile error bars;
                # -1 marks values for which no run is available
                ydat = np.zeros(gdat.numbvalu[o]) - 1.
                for i in gdat.indxvalu[o]:
                    indxgood = np.where(
                        gdat.dictmetr[strgvarb][r, l, :, i] != -1)[0]
                    if indxgood.size > 0:
                        metrgood = gdat.dictmetr[strgvarb][r, l, indxgood, i]
                        ydat[i] = np.mean(metrgood, axis=0)
                        yerr[0, i] = ydat[i] - np.percentile(metrgood, 5.)
                        yerr[1, i] = np.percentile(metrgood, 95.) - ydat[i]

                _, listcaps, _ = axis.errorbar(gdat.listvalu[strgvarb],
                                               ydat,
                                               yerr=yerr,
                                               label=gdat.listlablrtyp[r],
                                               capsize=10,
                                               marker='o',
                                               ls='',
                                               markersize=10,
                                               lw=3,
                                               alpha=alph,
                                               color=colr)

                for caps in listcaps:
                    caps.set_markeredgewidth(3)

                # individual runs
                for t in gdat.indxruns:
                    axis.plot(gdat.listvalu[strgvarb],
                              gdat.dictmetr[strgvarb][r, l, t, :],
                              marker='D',
                              ls='',
                              markersize=5,
                              alpha=alph,
                              color=colr)

            axis.set_ylabel(gdat.listlablmetr[l])
            axis.set_xlabel(labl)

            if strgvarb in liststrgvarblogt:
                axis.set_xscale('log')

            plt.legend()
            plt.tight_layout()

            path = pathplot + strgvarb + strgmetr + '.pdf'
            plt.savefig(path)
            plt.close()
Esempio n. 5
0
    pool_size = 4
    encoding_dim = 2
    l1_param = 0.1
    l2_param = 0.1
    usetess = True

    save_path = 'ileana_output_files/tess_data/'

    if usetess:
        _, light_curves, labels, _, _, _ = exopmain.retr_datatess(
            True, boolplot=False)
    else:
        dept = 1e-2
        nois = 1e-4
        light_curves, labels, _ = exopmain.retr_datamock(numbplan=100,
                                                         numbnois=100,
                                                         dept=dept,
                                                         nois=nois)
    #plot_input_ts(light_curves, save_path)
    nrow, ncol = light_curves.shape
    light_curves = np.reshape(light_curves, (nrow, ncol, 1))

    encoder, autoencoder, filename = model_cnn_autoencoder(
        ncol=ncol,
        no_filters=no_filters,
        kernel_size=kernel_size,
        pool_size=pool_size,
        encoding_dim=encoding_dim,
        activation_function='relu',
        verbose=True,
        l1_param=l1_param,
        l2_param=l2_param,
Esempio n. 6
0
def explore(dataclass, modelfunc, datatype='here'):
    '''
    Function to explore the effect of hyper-parameters (and data properties for mock data) on binary classification metrics

    dataclass : object holding the configuration; it is used (and modified)
        in place as the global-variable container gdat. It must provide at
        least indxruns, liststrgvarb, listvalu, indxvalu, dictmetr, numbdata,
        fractest and, once the variables have been assigned, fracplan,
        numbtime and numblayr.
    modelfunc : callable that takes gdat and returns the model, which is
        stored in gdat.modl.
    datatype : 'here' to draw mock data with exopmain.retr_datamock, 'ete6'
        to call exopmain.retr_ete6.

    For every run, variable and value, all variables are set to the middle
    entry of their own lists, the variable of interest is set to the current
    value, data are retrieved and split, the model is built and passed to
    retrmetr, and the metrics of the last entry along the first axis of the
    result are stored in gdat.dictmetr.

    Returns the time stamp string taken when the function was entered.
    '''

    # global object that will hold global variables
    gdat = dataclass

    ## time stamp string
    strgtimestmp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

    ## path where outputs would be generated
    pathplot = os.environ['TDGU_DATA_PATH'] + '/'

    # temp
    gdat.maxmindxvarb = 10

    # for each run
    for t in gdat.indxruns:

        # temp -- current implementation repeats running of the central point

        # for each variable
        for o, strgvarb in enumerate(gdat.liststrgvarb):

            if o == gdat.maxmindxvarb:
                break

            # for each value
            for i in gdat.indxvalu[o]:

                pathsave = pathplot + '%04d%04d%04d.fits' % (t, o, i)
                # temp -- reading previously saved metrics is disabled
                if False and os.path.exists(pathsave):
                    listhdun = ap.io.fits.open(pathsave)
                    metr = listhdun[0].data
                else:
                    # set every variable to the middle entry of ITS OWN list;
                    # the index used to be derived from the length of the
                    # list of the variable being explored, which picks the
                    # wrong entry (or raises IndexError) when the lists have
                    # different lengths
                    for strgvarbtemp in gdat.liststrgvarb:
                        listvalutemp = gdat.listvalu[strgvarbtemp]
                        setattr(gdat, strgvarbtemp,
                                listvalutemp[int(len(listvalutemp) / 2)])
                    # then override the variable of interest
                    setattr(gdat, strgvarb, gdat.listvalu[strgvarb][i])

                    for strgvarbtemp in gdat.liststrgvarb:
                        print('strgvarbtemp, ', strgvarbtemp,
                              ' gdat.strgvarbtemp, ',
                              getattr(gdat, strgvarbtemp))

                    gdat.numbplan = int(gdat.numbdata * gdat.fracplan)
                    gdat.numbnois = gdat.numbdata - gdat.numbplan

                    gdat.indxtime = np.arange(gdat.numbtime)
                    gdat.indxdata = np.arange(gdat.numbdata)
                    gdat.indxlayr = np.arange(gdat.numblayr)

                    # number of test data samples
                    gdat.numbdatatest = int(gdat.numbdata * gdat.fractest)
                    # number of training data samples
                    gdat.numbdatatran = gdat.numbdata - gdat.numbdatatest

                    # NOTE(review): both calls below are unpacked into exactly
                    # two values -- confirm that this matches the return
                    # signature of the exopmain version in use.
                    if datatype == 'here':
                        gdat.inpt, gdat.outp = exopmain.retr_datamock(
                            numbplan=gdat.numbplan,
                            numbnois=gdat.numbnois,
                            numbtime=gdat.numbtime,
                            dept=gdat.dept,
                            nois=gdat.nois)

                    if datatype == 'ete6':
                        gdat.inpt, gdat.outp = exopmain.retr_ete6()

                    # divide the data set into training and test data sets
                    gdat.inpttest = gdat.inpt[:gdat.numbdatatest, :]
                    gdat.outptest = gdat.outp[:gdat.numbdatatest]
                    gdat.inpttran = gdat.inpt[gdat.numbdatatest:, :]
                    gdat.outptran = gdat.outp[gdat.numbdatatest:]

                    gdat.modl = modelfunc(gdat)

                    # temp -- this runs the central value redundantly and can be sped up by only running the central value once for all variables
                    # do the training for the specific value of the variable of interest
                    metr = retrmetr(gdat, i, strgvarb)

                # keep the metrics of the last entry along the first axis of metr
                for r in range(2):
                    for l in range(3):
                        gdat.dictmetr[strgvarb][r, l, t, i] = metr[-1, r, l]

    return strgtimestmp
Esempio n. 7
0
def run_through_puts(dataclass, modelfunc, datatype='here'):
    '''
    Function to explore the effect of hyper-parameters (and data properties for mock data) on binary classification metrics

    For every run in gdat.indxruns and every variable named in gdat.liststrgvarb,
    each candidate value of that variable is applied in turn while all the other
    variables are held at the central entry of their own value lists. For each
    setting the data set is (re)loaded, split into test and training parts,
    a model is built with modelfunc and metrics_vary_thresh(gdat) is evaluated.
    One precision-recall scatter plot per (run, variable) is written as a PDF
    under the directory given by the TDGU_DATA_PATH environment variable.

    dataclass: object holding the configuration (indxruns, liststrgvarb, listvalu,
               indxvalu, numbdata, fracplan, fractest, numbtime, numblayr, ...);
               it is mutated in place and used as the global state object
    modelfunc: callable invoked as modelfunc(gdat); its result is stored in gdat.modl
    datatype: 'here' loads mock data through exopmain.retr_datamock,
              'ete6' loads data through exopmain.retr_ete6;
              any other value leaves gdat.inpt and gdat.outp untouched

    Returns the time stamp string that is embedded in the plot file names.
    Raises KeyError if TDGU_DATA_PATH is not set.
    '''

    # global object that will hold global variables
    # this can be wrapped in a function to allow for customization
    # initialize the data here
    gdat = dataclass

    ## time stamp string
    strgtimestmp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

    ## path where plots will be generated
    pathplot = os.environ['TDGU_DATA_PATH'] + '/'

    # temp -- hard cap on the number of variables that are explored
    gdat.maxmindxvarb = 10

    # for each run
    for t in gdat.indxruns:

        # for each variable
        for o, strgvarb in enumerate(gdat.liststrgvarb):

            if o == gdat.maxmindxvarb:
                break

            # stays empty only when the variable has no values to iterate over
            pr_points = []

            # for each value
            for i in gdat.indxvalu[o]:

                # reset every variable to the central entry of its OWN value list;
                # using the length of the currently varied variable's list here
                # would pick the wrong entry (or raise IndexError) whenever the
                # lists have different lengths
                for strgvarbtemp in gdat.liststrgvarb:
                    listvalutemp = gdat.listvalu[strgvarbtemp]
                    setattr(gdat, strgvarbtemp,
                            listvalutemp[len(listvalutemp) // 2])
                # then override the variable of interest with the current value
                setattr(gdat, strgvarb, gdat.listvalu[strgvarb][i])

                # number of signal and noise data samples
                gdat.numbplan = int(gdat.numbdata * gdat.fracplan)
                gdat.numbnois = gdat.numbdata - gdat.numbplan

                gdat.indxtime = np.arange(gdat.numbtime)
                gdat.indxdata = np.arange(gdat.numbdata)
                gdat.indxlayr = np.arange(gdat.numblayr)

                # number of test data samples
                gdat.numbdatatest = int(gdat.numbdata * gdat.fractest)
                # number of training data samples
                gdat.numbdatatran = gdat.numbdata - gdat.numbdatatest

                if datatype == 'here':
                    gdat.inpt, gdat.outp = exopmain.retr_datamock(
                        numbplan=gdat.numbplan,
                        numbnois=gdat.numbnois,
                        numbtime=gdat.numbtime,
                        dept=gdat.dept,
                        nois=gdat.nois)
                elif datatype == 'ete6':
                    gdat.inpt, gdat.outp = exopmain.retr_ete6()

                # divide the data set into training and test data sets:
                # the first numbdatatest samples are held out for testing
                gdat.inpttest = gdat.inpt[:gdat.numbdatatest, :]
                gdat.outptest = gdat.outp[:gdat.numbdatatest]
                gdat.inpttran = gdat.inpt[gdat.numbdatatest:, :]
                gdat.outptran = gdat.outp[gdat.numbdatatest:]

                gdat.modl = modelfunc(gdat)

                # NOTE(review): pr_points is replaced (not extended) on every value,
                # so the plot below only shows the points of the last value --
                # confirm whether one plot per value was intended
                pr_points = metrics_vary_thresh(gdat)

            # the first element of each point goes on the horizontal axis and
            # the second on the vertical axis
            figr, axis = plt.subplots()
            axis.plot([point[0] for point in pr_points],
                      [point[1] for point in pr_points],
                      marker='o',
                      ls='',
                      markersize=5,
                      alpha=0.6)
            plt.tight_layout()
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.title('Precision v Recall, {0}{1}'.format(
                str(strgvarb), str(getattr(gdat, strgvarb))))
            # plt.legend()
            path = pathplot + 'PvR_{0}_{1}{2}_'.format(
                t, strgvarb, getattr(gdat, strgvarb)) + strgtimestmp + '.pdf'
            plt.savefig(path)
            plt.close()

    return strgtimestmp
Esempio n. 8
0
File: unsp.py Progetto: tdaylan/cthc
    labltrue = np.zeros(numbdata)
    fact = np.zeros(numbdata)
    s2nr = np.zeros(numbrele)
    deptthis = np.zeros(numbrele)
    for k in indxdata:
        fact[k] = (1. + 2. * np.random.random()) * stdvflux
        flux[k, :] = 1. + fact[k] * np.random.randn(numbbins)
        if k < numbrele:
            numbtran = np.random.random_integers(15)
            indxtran = np.arange(numbbins / 2 - numbtran / 2,
                                 numbbins / 2 + numbtran / 2 + 1)
            deptthis[k] = dept * (1. + np.random.rand())
            flux[k, indxtran] -= deptthis[k]
            s2nr[k] = deptthis[k] / fact[k] * np.sqrt(indxtran.size)
    labltrue[indxrele] = 1.
    gdat.inptraww, gdat.outp, gdat.peri = exopmain.retr_datamock(numbplan=gdat.numbrele, \
                                                numbnois=gdat.numbirre, numbtime=gdat.numbtime, dept=gdat.dept, nois=gdat.nois, boolflbn=True)
else:
    meanphas, flux, labltrue, legdoutp, tici, itoi = exopmain.retr_datatess(
        False)
    indxbadd = np.where(~np.isfinite(flux))[0]
    print 'indxbadd'
    summgene(indxbadd)
    print 'flux'
    summgene(flux)
    flux[indxbadd] = np.random.randn(indxbadd.size)

    print 'meanphas'
    summgene(meanphas)
    print 'flux'
    summgene(flux)
    #imp = Imputer(strategy="mean", axis=0)
Esempio n. 9
0
from sklearn.manifold import MDS
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans

from autoencoder import get_latent_vars
sys.path.append('/Users/ruginaileana/src')
from exop import main as exopmain
from binary_classification_helper import find_km_clusters, plot_confusion_matrix

# Script-level switches. How `visualize` and `run_all` are consumed is not
# visible in this part of the file -- TODO confirm against the code further down.
visualize = True
run_all = True
# Name of the dimensionality-reduction method to use.
# NOTE(review): the previous comment listed "PCA" twice and had an unbalanced
# quote; MDS is also imported above -- confirm the full set of accepted values.
dimensionality_reduction = "PCA"  # can be "PCA" or "autoencoder"
lower_dimensionality = 1  # only applies if PCA; for autoencoder always do 2 latent variables

# get data: retr_datamock() is called with its default arguments and its result
# is unpacked into a (light curves, labels) pair
light_curves, labels = exopmain.retr_datamock()
# make sure the light curves are held in a NumPy array
light_curves = np.array(light_curves)

########################################################################
########################################################################
################PLOTS TO SEE WHAT PCA AND AUTOENCODER DO################
########################################################################
########################################################################

# if run_all:
# 	dimensionality_reduction = "PCA"
# 	lower_dimensionality = 1
# if dimensionality_reduction == "PCA" and lower_dimensionality == 1:
# 	pca = PCA(n_components=lower_dimensionality)
# 	proj = pca.fit_transform(light_curves)
# 	clusters = find_km_clusters(proj)