Ejemplo n.º 1
0
def UndersampleData(data, max_sample):
    """Cap every 'num_rays' class at max_sample rows.

    Classes with more than max_sample rows are randomly downsampled with a
    fixed seed (reproducible); smaller classes are kept in full.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a 'num_rays' column.
    max_sample : int
        Maximum number of rows kept per class.

    Returns
    -------
    pd.DataFrame
        Undersampled rows; original indices are preserved
        (concatenation uses ignore_index=False, as before).
    """
    target = np.unique(data['num_rays'])
    random_state = 27
    # ClassImbalance is a project helper; element [0] is presumably the
    # per-class row count -- inferred only from its comparison against
    # max_sample below (TODO confirm against its definition).
    y_population = ClassImbalance(data, plot=False)

    pieces = []
    for raynr in target:
        data_slice = data.loc[data['num_rays'] == raynr]
        if y_population[raynr][0] > max_sample:
            data_slice = data_slice.sample(n=max_sample,
                                           random_state=random_state)
        pieces.append(data_slice)

    # BUGFIX: DataFrame.append was deprecated in pandas 1.4 and removed in
    # 2.0, and appending inside a loop is quadratic; collect the slices and
    # concatenate once instead.
    if not pieces:
        return pd.DataFrame(columns=data.columns)
    return pd.concat(pieces, ignore_index=False)
Ejemplo n.º 2
0
            graql_insert_query = f'match $sspval isa SSP_value;'
            graql_insert_query += f' $sspval == {sspval:.{precision}};'
            graql_insert_query += f' insert $sspval has depth {depth};'
            graql_queries.append(graql_insert_query)

    return graql_queries
            

#### DATA SELECTION FOR GRAKN TESTING
#data = pd.concat([ALLDATA.iloc[0:10,:],ALLDATA.iloc[440:446,:],ALLDATA.iloc[9020:9026,:]])
#ssp_select = ["Mediterranean Sea Winter","Mediterranean Sea Spring","South Pacific Ocean Spring"]
#SSP_Stat[ssp_select][:]
#SSP_Prop = SSP_Prop[(SSP_Prop['SSP'] == "Mediterranean Sea Winter") | (SSP_Prop['SSP'] == "Mediterranean Sea Spring") | (SSP_Prop['SSP'] == "South Pacific Ocean Spring")]
#SSP_Input = SSP_Input.loc[:,["DEPTH"]+ssp_select]

# Per-class population statistics for the full dataset (ClassImbalance is a
# project helper; its exact return shape is not visible in this chunk).
data_pop = ClassImbalance(ALLDATA)
data_sparse2 = ALLDATA[(ALLDATA.loc[:,'num_rays'] == 500) | (ALLDATA.loc[:, 'num_rays'] == 1000)] #2classes
#data_sparse3 = ALLDATA[(ALLDATA.loc[:,'num_rays'] == 500) | (ALLDATA.loc[:, 'num_rays'] == 6000) | (ALLDATA.loc[:, 'num_rays'] == 15000)] #3classes
#data = UndersampleData(data_sparse2, max_sample = 80)
data = data_sparse2
# Flag sound ducts per selected row: ducts[i] = [row index, has-SLD, has-DC].
# NOTE(review): the original comment claimed ducts[:,0]='SLD', ducts[:,1]='DC',
# but the code below stores the DataFrame index in column 0 and the SLD/DC
# presence flags in columns 1 and 2.
ducts = np.zeros([np.size(data,0),3],int)
i = 0
# NOTE(review): 'i' is never incremented in the lines visible here -- the loop
# presumably continues beyond this chunk; verify in the full source.
for ssp,dmax,idx in zip(data['profile'],data['water_depth_max'], data.index):
    # Match each data row to its SSP_Prop entry by profile name and max depth.
    for SSP,DMAX,SLD,DC in zip(SSP_Prop['SSP'], SSP_Prop['dmax'], SSP_Prop['SLD_depth'], SSP_Prop['DC_axis']):
        ducts[i,0] = idx
        if str(ssp) == str(SSP) and int(dmax) == int(DMAX):
            if np.isnan(SLD)==False:
                ducts[i,1] = 1  # sonic layer duct present (SLD_depth not NaN)
            if np.isnan(DC)==False:
                ducts[i,2] = 1  # deep channel axis present (DC_axis not NaN)
Ejemplo n.º 3
0

"""
# Upsampling with SMOT-ENC technique that can handle both cont. and categorical variables
#categorical_var = np.hstack([2, np.arange(5,33)])
categorical_var = np.hstack([2,np.arange(5,33)])
minority = np.arange(4,17)
samplenr = 250
population_target = dict(zip(minority, (np.ones(len(minority))*samplenr).astype(int)))
smote_nc = SMOTENC(categorical_features=categorical_var, sampling_strategy=population_target, random_state=42)
#smote_nc_max = SMOTENC(categorical_features=categorical_var, sampling_strategy='auto', random_state=42)
X_smot, y_smot = smote_nc.fit_resample(X_train, y_train)
dtrain_smot = pd.concat((X_smot, y_smot), axis =1)
dtrain_smot = dtrain_smot.sample(frac = 1) #shuffle the upsampled dataset
"""
"""
# TESTING NEW FEATURES AND THEIR CORRELATIONS

import os
path = os.getcwd()+'\data\\'

#ssp = pd.read_excel(path+"env.xlsx", sheet_name = "SSP")
#ssp_grad = pd.read_excel(path+"env.xlsx", sheet_name = "SSP_GRAD")

rawdata = LoadData(path)
data1 = FeatDuct(rawdata, Input_Only = True)
#data2 = FeatBathy(data1, path)
#data3 = FeatSSPId(data2, path, src_cond = True)
#data4 = FeatSSPStat(data3,path)
#data5 = FeatSSPOnDepth(data4, path, save = True)
data = UndersampleData(data1, 100)
Ejemplo n.º 4
0
def CreateSplits(data,
                 level_out=1,
                 remove_outliers=True,
                 replace_outliers=True,
                 feature_dropout=False,
                 plot_distributions=False,
                 plot_correlations=False):
    """
    1. Create 3 separate data splits based on the 'wedge_slope' value.
    --- OUTLIERS ---
    2. Investigate the distribution of each set.
    3. Fix outliers based on the 'level_out'% threshold, i.e. classes < 1% of
       the subset. If replace_outliers=True, outliers are propagated to the
       closest higher class, at most 2 classes up; a class still < level_out%
       after propagation is discarded as an outlier.
    --- FEATURES ---
    4. If feature_dropout=True, remove features that become redundant
       (near-constant) within a subset, e.g. water depth max or slope value.

    Returns
    -------
    (SplitSets, distributions) : list of DataFrames and the matching list of
    ClassImbalance() statistics gathered before/after correction.
    """

    data_00 = data.loc[data['wedge_slope'] == 0]
    data_2U = data.loc[data['wedge_slope'] == 2]  # 2 deg up
    data_2D = data.loc[data['wedge_slope'] == -2]  # 2 deg down

    ### Outliers Correction
    distributions = []
    SplitSets = []

    # BUGFIX: this helper was previously named 'remove_outliers', shadowing
    # the boolean parameter of the same name. 'remove = remove_outliers'
    # below therefore bound the (always truthy) function object, so passing
    # remove_outliers=False was silently ignored. Renamed to restore the
    # intended switch.
    def _drop_ray_class(dat, rayclass):
        # Drop every row belonging to the given 'num_rays' class.
        outliers = dat.index[dat['num_rays'] == rayclass]
        return dat.drop(index=outliers)

    def _constant_features(X, frac_constant_values=0.95):
        # Column labels whose most frequent value covers more than
        # frac_constant_values of the rows (near-constant columns).
        num_rows = X.shape[0]
        allLabels = X.columns.tolist()
        constant_per_feature = {
            label: X[label].value_counts().iloc[0] / num_rows
            for label in allLabels
        }
        return [
            label for label in allLabels
            if constant_per_feature[label] > frac_constant_values
        ]

    # shorter aliases; the articulate names are kept in the signature only
    replace = replace_outliers
    remove = remove_outliers
    if not remove and not replace:
        replace = remove  # can't replace without starting the removal procedure
        # remove = False means no interruption into the original distribution
    if not remove and replace:
        print(
            'Set remove to True, to allow replacement. This will allow to remove empty classes.'
        )
        print('No outliers have been corrected')
    if remove:
        # check each split's statistics: class population, etc.
        for t, dat in enumerate([data_00, data_2U, data_2D]):
            ystat = ClassImbalance(dat, plot=False)
            distributions.append(ystat)
            classlist = list(ystat.keys())
            for r, rayclass in enumerate(ystat):
                # re-evaluate after each in-loop modification of dat
                ystatnew = ClassImbalance(dat, plot=False)
                # remove outliers when sample size < level_out% of the subset
                if ystatnew[rayclass][1] < level_out:
                    if replace and r <= len(ystat) - 3:
                        # try merging this class into the next one up
                        propagated_class = ystat[classlist[r]][1] + ystat[
                            classlist[r + 1]][1]
                        if propagated_class >= level_out:
                            dat.loc[dat['num_rays'] == rayclass,
                                    ('num_rays')] = classlist[r + 1]
                        else:
                            # still too small: push one more class up, else drop
                            dat.loc[dat['num_rays'] == rayclass,
                                    ('num_rays')] = classlist[r + 1]
                            propagated_class = ystat[classlist[
                                r + 1]][1] + ystat[classlist[r + 2]][1]
                            if propagated_class >= level_out:
                                # NOTE(review): chained indexing with [1:]
                                # triggers SettingWithCopy and may not write
                                # through -- see TODO below; behavior kept
                                # as in the original.
                                dat.loc[dat['num_rays'] == classlist[r + 1],
                                        ('num_rays')][1:] = classlist[r + 2]

                            else:
                                dat = _drop_ray_class(dat, rayclass)

                    if replace and r == len(
                            ystat
                    ) - 2:  # second last class can be propagated only once
                        propagated_class = ystat[classlist[r]][1] + ystat[
                            classlist[r + 1]][1]
                        if propagated_class >= level_out:
                            dat.loc[dat['num_rays'] == rayclass,
                                    ('num_rays')] = classlist[r + 1]
                        else:
                            dat = _drop_ray_class(dat, rayclass)

                    if replace and r == len(
                            ystat
                    ) - 1:  # last class can only be removed if still < level_out%
                        dat = _drop_ray_class(dat, rayclass)

                    if not replace:  # replace = False: always remove outliers
                        dat = _drop_ray_class(dat, rayclass)

            ystat = ClassImbalance(dat, plot=plot_distributions)
            distributions.append(ystat)
            SplitSets.append(dat)

        # flat-bottom split: class 6000 is dropped outright and its stats refreshed
        SplitSets[0] = _drop_ray_class(SplitSets[0], 6000)
        distributions[1] = ClassImbalance(SplitSets[0],
                                          plot=plot_distributions)

    #2. TODO : A value is trying to be set on a copy of a slice from a DataFrame.
    #          Try using .loc[row_indexer,col_indexer] = value instead

    ### End of Outlier Correction

    ### Feature dropout
    if feature_dropout:
        # Remove redundant features with (near-)constant values in each set
        for i in range(len(SplitSets)):
            features = data.columns.tolist()
            redF = _constant_features(SplitSets[i][features],
                                      frac_constant_values=0.99)
            print('Removed constant features ' + f'{redF} '
                  'for SplitSets '
                  f'{i}')
            SplitSets[i] = SplitSets[i].drop(columns=redF)
            features.remove('num_rays')
            features = [f for f in features if f not in redF]

            if plot_correlations:
                PlotCorrelation(SplitSets[i], features, annotate=True)

    return SplitSets, distributions
Ejemplo n.º 5
0
from data_prep import CreateSplits

# NOTE(review): CreateSplits is imported but not used in the lines visible
# here -- presumably used further down; verify before removing.
#data = UndersampleData(ALLDATA, max_sample = 100)
#data = UndersampleData(data, max_sample = 30) #at 30 you got 507 nx graphs created, however with NotDuct at this point

# === Flat bottom data only ==== #
#keyspace = "ssp_2class_full"
# Two-class subset (500 vs 1000 rays), capped at 794 rows per class.
data_sparse2 = ALLDATA[(ALLDATA.loc[:,'num_rays'] == 500) | (ALLDATA.loc[:,'num_rays'] == 1000)]
data = UndersampleData(data_sparse2, max_sample = 794)

# === 3 classes of 80 samples: 500/6000/15000 =====
#keyspace = "ssp_2class"
#data_sparse3 = ALLDATA[(ALLDATA.loc[:,'num_rays'] == 500) | (ALLDATA.loc[:, 'num_rays'] == 6000) | (ALLDATA.loc[:, 'num_rays'] == 15000)] #3classes
#data = UndersampleData(data_sparse3, max_sample = 80)

# Sanity-check the resulting class balance.
class_population = ClassImbalance(data, plot = False)
print(class_population)


# Open a Grakn session against the configured keyspace (URI/KEYSPACE are
# defined elsewhere in this file or its imports).
client = GraknClient(uri=URI)
session = client.session(keyspace=KEYSPACE)

with session.transaction().read() as tx:
        # Change the terminology here onwards from thing -> node and role -> edge
        node_types = get_thing_types(tx)
        # NOTE(review): list comprehensions used purely for their side effect
        # (in-place removal); a plain for-loop would be clearer.
        [node_types.remove(el) for el in TYPES_TO_IGNORE]
        edge_types = get_role_types(tx)
        [edge_types.remove(el) for el in ROLES_TO_IGNORE]
        print(f'Found node types: {node_types}')
        print(f'Found edge types: {edge_types}')