def imputation_transform(model, missing_data_train, missing_test, flag,
                         missingness):
    if flag == 4:
        # the last column is the label: impute the features only, then re-attach it
        x_train = missing_data_train[:, :(missing_data_train.shape[1] - 1)]
        y_train = missing_data_train[:, -1]
        x_test = missing_test[:, :(missing_test.shape[1] - 1)]
        y_test = missing_test[:, -1]
        X_train_predicted = model.transform(x_train)
        X_test_predicted = model.transform(x_test)
        y_tr_reshape = y_train.reshape(y_train.shape[0], 1)
        y_te_reshape = y_test.reshape(y_test.shape[0], 1)
        train_predict = np.concatenate((X_train_predicted, y_tr_reshape),
                                       axis=1)
        test_predict = np.concatenate((X_test_predicted, y_te_reshape), axis=1)
    elif flag == 5:
        # MICE needs no fitted model, so skip it entirely when nothing is missing
        if missingness != 0:
            train_predict = mice(missing_data_train)
            test_predict = mice(missing_test)
        else:
            train_predict = missing_data_train
            test_predict = missing_test
    else:
        train_predict = model.transform(missing_data_train)
        test_predict = model.transform(missing_test)
    return train_predict, test_predict
def impute(xt, xv, strategy):
    if strategy == 'mice':
        xt_imputed = mice(xt)
        xv_imputed = mice(xv)
    elif strategy == 'knn':
        # K is assumed to be defined globally: the neighbour count for fancyimpute's KNN
        xt_imputed = KNN(k=K, verbose=False).fit_transform(xt)
        xv_imputed = KNN(k=K, verbose=False).fit_transform(xv)
    else:
        imp = SimpleImputer(strategy=strategy)
        xt_imputed = pd.DataFrame(imp.fit_transform(xt))
        xv_imputed = pd.DataFrame(imp.transform(xv))
        # put column names back after imputation
        xt_imputed.columns = xt.columns
        xv_imputed.columns = xv.columns
    return xt_imputed, xv_imputed
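
# A minimal usage sketch of impute above on hypothetical toy frames, assuming the imports the
# function relies on (SimpleImputer, KNN, mice) are in scope; 'mean' exercises the SimpleImputer branch:
import numpy as np
import pandas as pd

xt = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [4.0, 5.0, np.nan]})
xv = pd.DataFrame({'a': [np.nan, 2.0], 'b': [6.0, 7.0]})
xt_imp, xv_imp = impute(xt, xv, strategy='mean')
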
def processing(dataInt):
    ## missing value
    df = dataInt.copy()
    df_num = df.drop(['timestamp','loc_1', 'loc_2', 'loc_secondary_1', 'loc_secondary_2', 'loc_secondary_3'], axis=1)
    df_NonNum = df.select_dtypes(include=[object])
    imputed_training_mice = mice(df_num.values)
    data_mice = pd.DataFrame(imputed_training_mice, columns = df_num.columns, index = list(df.index.values))
    dClean = data_mice.join(df_NonNum)
    ## drop unused columns
    d_tr = dClean.drop(['loc_1', 'loc_2', 'loc_secondary_1', 'loc_secondary_2', 'loc_secondary_3'], axis=1)
    ## create extra attribute
    conv(d_tr)  # conv (defined elsewhere) is expected to add the 'datetime_perso' and 'date' columns used below
    d_tr['timestamp'] = pd.to_datetime(d_tr.timestamp, format = '%Y-%m-%dT%H:%M:%S.%f')
    ## create season and rangeInYear
    s = pd.to_datetime(pd.Series(d_tr['timestamp']))
    d_tr['rangeInYear'] = s.dt.strftime('%j').astype(int)
    d_tr['season'] = d_tr['rangeInYear'].apply(lambda d : get_season(d))
    # create working-day indicator
    d_tr['is_business_day'] = d_tr['datetime_perso'].apply(lambda e : int(business_day(e)))
    # Is it a holiday for zone A, B or C?
    d = SchoolHolidayDates()
    d_tr['is_holiday'] = d_tr['datetime_perso'].apply(lambda f : int(d.is_holiday(datetime.date(f))))

    dataInt1 = d_tr.drop(['rangeInYear', 'datetime_perso', 'date', 'timestamp'], axis=1)
    return dataInt1
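
# get_season and business_day are used above but not defined in this snippet; a minimal sketch of
# plausible implementations (assumptions, not the originals). SchoolHolidayDates appears to come
# from the vacances_scolaires_france package.
def get_season(day_of_year):
    # map day-of-year (1-366) onto four seasons split near the equinoxes/solstices (an assumption)
    if day_of_year < 80 or day_of_year >= 355:
        return 'winter'
    elif day_of_year < 172:
        return 'spring'
    elif day_of_year < 266:
        return 'summer'
    return 'autumn'

def business_day(ts):
    # Monday-Friday counts as a business day; public holidays are ignored in this sketch
    return ts.weekday() < 5
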
def parse_and_interpolate(filename_list):

    all_dfs = []
    len_list = [0] * len(filename_list)

    for idx, i in enumerate(filename_list):
        df = pd.read_csv(i)
        df_lists = df.values.tolist()

        valid_df_list = []
        for list_idx, each_list in enumerate(df_lists):
            nanPercent = sum(
                [1 if math.isnan(val) else 0
                 for val in each_list]) / len(each_list)
            #nan_check = np.isnan(np.array(each_list)).all()
            if nanPercent < 0.3:
                valid_df_list.append(each_list)
        all_dfs = all_dfs + valid_df_list
        len_list[idx] = len(valid_df_list)
    if np.isnan(np.array(all_dfs)).any():
        filled_df = mice(np.array(all_dfs))
    else:
        filled_df = np.array(all_dfs)

    result = []
    start = 0
    for i in len_list:
        result.append(filled_df[start:start + i])
        start += i  # advance past the rows consumed by this file

    return result
def generate_training_data():
    print("Retrieving dataset...")
    dataset = prepare_data.get_sleep_model_training_data()

    print("Cleaning and preparing dataframe")

    dataset['sleep_or_wake'] = (dataset['sleep_or_wake'] == 'S').astype(int)

    # handle missing values
    # from sklearn.impute import SimpleImputer
    # imp_mean = SimpleImputer( strategy='mean') #for median imputation replace 'mean' with 'median'
    # imp_mean.fit(dataset)
    # dataset[dataset.columns] = imp_mean.transform(dataset)

    dataset[dataset.columns] = mice(dataset.values)
    print("impute finished")

    # dataset = dataset.fillna(0)

    # dataset.to_csv(training_dataset_output_path + 'sleep_model_training_data.csv', index=False)
    dataset = dataset.drop(['timestamp'], axis=1)

    X = dataset.drop(['sleep_or_wake'], axis=1)
    y = dataset['sleep_or_wake']

    # normalizing
    scaler = MinMaxScaler(feature_range=(0, 1))
    X[X.columns] = scaler.fit_transform(X[X.columns])

    return X, y
def parse_and_interpolate_test(filename_list):

    all_dfs = []
    len_list = [0] * len(filename_list)

    for idx, i in enumerate(filename_list):
        df = pd.read_csv(i)
        df_lists = df.values.tolist()

        all_dfs = all_dfs + df_lists
        len_list[idx] = len(df_lists)
    if np.isnan(np.array(all_dfs)).any():
        filled_df = mice(np.array(all_dfs))
    else:
        filled_df = np.array(all_dfs)

    result = []
    start = 0
    for i in len_list:
        result.append(filled_df[start:start + i])
        start += i  # advance past the rows consumed by this file

    return result
Example #7
#combine
data_raw = pd.concat([dataInt, dataTest])  # DataFrame.append is removed in pandas 2.x
data_raw.reset_index(inplace=True)
data_raw.drop('index', inplace=True, axis=1)
data_blink = pd.concat([dataInt, dataOut[['consumption_1', 'consumption_2']]],
                       axis=1)
#data engineering

dI = data_raw.copy()
#dI_labels = dI.drop(['ID', 'timestamp', 'temp_1', 'temp_2', 'mean_national_temp', 'humidity_1', 'humidity_2', 'consumption_secondary_1', 'consumption_secondary_2', 'consumption_secondary_3', 'loc_1', 'loc_2', 'loc_secondary_1', 'loc_secondary_2', 'loc_secondary_3'], axis=1)
dI_num = dI.drop([
    'timestamp', 'loc_1', 'loc_2', 'loc_secondary_1', 'loc_secondary_2',
    'loc_secondary_3'
],
                 axis=1)
imputed_training_mice = mice(dI_num.values)
data_mice = pd.DataFrame(imputed_training_mice,
                         columns=dI_num.columns,
                         index=list(dI.index.values))
dI_NonNum = dI.select_dtypes(include=[object])

# something goes wrong here
dClean = data_mice.join(dI_NonNum)

d_tr = dClean.drop([
    'loc_1', 'loc_2', 'loc_secondary_1', 'loc_secondary_2', 'loc_secondary_3'
],
                   axis=1)

#create extra attribute
Example #8
#let's name the categorical and numerical attributes
categorical_attributes = list(dataInt_Xy.select_dtypes(include=['category']).columns)
numerical_attributes = list(dataInt_Xy.select_dtypes(include=['float64', 'int64']).columns)
print('categorical_attributes:', categorical_attributes)
print('numerical_attributes:', numerical_attributes)

dataInt_Xy = dataInt_Xy[['temp_1', 'temp_2', 'mean_national_temp', 'humidity_1', 'humidity_2','consumption_secondary_1', 'consumption_secondary_2','consumption_secondary_3', 'date', 'hour', 'consumption_1', 'consumption_2']]


#missing value
msno.matrix(dataInt_Xy, figsize=(12,5)) # visualize the missing-data pattern
null_values_apptr = dataInt_Xy.isnull().sum() #count missing values per column
null_values_apptr = null_values_apptr[null_values_apptr != 0].sort_values(ascending = False).reset_index() #only show rows with null values
null_values_apptr.columns = ["variable", "n_missing"]
null_values_apptr.head()

# missing-data matrix
dataInt_Xy.isnull().sum()


data_missing = dataInt_Xy[['temp_1', 'temp_2', 'mean_national_temp', 'humidity_1', 'humidity_2','consumption_secondary_1', 'consumption_secondary_2','consumption_secondary_3']].copy()

# imputation par MICE
imputed_training_mice=mice(data_missing.values)


# Imputation par KNN 
sys.setrecursionlimit(100000) #Increase the recursion limit of the OS
# start the KNN training
imputed_training_KNN=fast_knn(data_missing.values, k=30)
Example #9
##Advantages: it is more accurate than mean, median, or mode imputation (depending on the dataset)
##Disadvantages: it is computationally expensive and it is sensitive to outliers (unlike SVM)

###Example Code for implementing k-nn imputation
#sys.setrecursionlimit(100000)  # Increase the recursion limit of the OS
# start the KNN training
#imputed_training = fast_knn(train.values, k=30)
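
##A runnable sketch of the commented k-nn example above, on a tiny hypothetical matrix:
import numpy as np
from impyute.imputation.cs import fast_knn

train_values = np.array([[1.0, 2.0], [np.nan, 4.0], [5.0, 6.0], [7.0, 8.0]])
imputed_knn = fast_knn(train_values, k=2)  # k must be smaller than the number of rows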

##5.) Imputation using multivariate imputation by chained equations (MICE)
##This type of imputation works by filling the missing values multiple times; doing so measures the
##uncertainty of the missing values better

##Example Code for MICE
from impyute.imputation.cs import mice
# start the MICE training
imputed_training = mice(train.values)

##6.) Imputing using deep neural networks (Datawig)
##This method works really well with numeric and categorical variables. It is a library that learns ML models using DNNs to impute
##missing values, with support for both CPU and GPU training
##Advantages: it is quite accurate compared to other imputation techniques, and it can handle categorical data with its 'Feature Encoder'
##Disadvantages: it is slow with large datasets, and you need to specify the columns that contain information about the column
##that will be imputed

##Example Code for imputation using neural networks

import datawig
df_train, df_test = datawig.utils.random_split(train)

#Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=['feature_1', 'feature_2'],  # hypothetical names: columns that carry signal about the target
    output_column='target_column',             # hypothetical name of the column to impute
    output_path='imputer_model')               # stores model data and metrics
Example #10
    def fit(self, dataframe):
        """
        Method to be fitted, default is the most frequent value for str and int columns and median for float columns
        
        Custom: use a dict to set columns and imputation method like:
        {'mean':[columnname1,columnname2],'knn':[columname3,columnname4],'most_frequent':[columname5,columname6]}
        
        All unrelated columns will be imputed using default method
        :param dataframe: The input dataframe
        """
        self.fill = pd.Series([dataframe[c].value_counts().index[0]\
        if dataframe[c].dtype in [np.dtype('O'),np.dtype('int8'),np.dtype('int32'),np.dtype('int64')]\
        else dataframe[c].median() for c in dataframe],index=dataframe.columns)

        if self.strategy is not None:

            if type(self.strategy) is not dict:
                raise ValueError(
                    "Dict is required. Try {'method':[columname,...,],'method':['columname'...]} instead"
                )

            else:
                self.strategy_single_imp = {
                    method: column
                    for method, aux in self.strategy.items() for column in aux
                }

                for method, column in self.strategy_single_imp.items():

                    if column not in dataframe.columns:
                        raise ValueError(
                            "Column {} is not in dataset".format(column))
                    if method not in [
                            'most_frequent', 'mean', 'median', 'mice', 'knn'
                    ]:
                        raise ValueError("Unavailable method")

                    for c in dataframe:

                        if (column == c and method == 'most_frequent'):
                            self.fill[c] = dataframe[c].value_counts().index[0]
                        elif (column == c and method == 'mean'):
                            self.fill[c] = dataframe[c].mean()
                        elif (column == c and method == 'median'):
                            self.fill[c] = dataframe[c].median()

                for method, columns in self.strategy.items():

                    for column in columns:
                        if column not in dataframe.columns:
                            raise ValueError(
                                "Column {} is not in dataset".format(column))
                    if method not in [
                            'most_frequent', 'mean', 'median', 'mice', 'knn'
                    ]:
                        raise ValueError("Unavailable method")

                    if method == 'knn':

                        train_cols = list(
                            dataframe.select_dtypes(include=['floating']))
                        train = pd.DataFrame(
                            cs.fast_knn(dataframe[train_cols].values, k=5))
                        train.columns = train_cols
                        dataframe[train_cols] = train.values

                    if method == 'mice':

                        train_cols = list(
                            dataframe.select_dtypes(include=['floating']))
                        train = pd.DataFrame(
                            cs.mice(dataframe[train_cols].values))
                        train.columns = train_cols
                        dataframe[train_cols] = train.values

        return self
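
# A hypothetical usage sketch of the fit method above; the class name CustomImputer and the direct
# attribute assignment are assumptions, only the strategy-dict format comes from the docstring:
import numpy as np
import pandas as pd

df = pd.DataFrame({'age': [25.0, np.nan, 40.0], 'city': ['NY', 'NY', None]})
imputer = CustomImputer()
imputer.strategy = {'mean': ['age'], 'most_frequent': ['city']}
imputer.fit(df)
print(imputer.fill)  # per-column fill values chosen during fit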
Example #11
X = df[['Mean1', 'Std1', 'EK1', 'Skew1', 'Mean2', 'Std2', 'EK2', 'Skew2']]
y = df['targetclass']  # a 1-D target avoids a shape warning in fit

#Randomly replace 40% of the 'Skew2' column with NaN values
column = X['Skew2']
print(column.size)
missing_pct = int(column.size * 0.4)
i = [random.choice(range(column.shape[0])) for _ in range(missing_pct)]
X.loc[i, 'Skew2'] = np.nan  # assign via .loc to avoid a chained-assignment warning
print(X['Skew2'].shape[0])
print(X['Skew2'])

from impyute.imputation.cs import mice

# start the MICE training
X = mice(X.values)

print(X)

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=500)  # 70% training and 30% test

#Create a Gaussian Classifier
model = GaussianNB()

# Train the model using the training sets6
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
Example #12
    def MICE(self):
        # start the MICE training
        imputed_training = mice(self.data)
        return imputed_training
Example #13
X_val_cat = imputer_cat.transform(X_val[cat_cols])
X_test_cat = imputer_cat.transform(X_test[cat_cols])

# Forward or Backward fill
#for backward fill (note: fillna returns a copy, so assign the result)
train = train.fillna(method='bfill')
#for forward fill
train = train.fillna(method='ffill')
#one can also specify an axis (axis=0 propagates down columns, axis=1 propagates across rows)
train = train.fillna(method='bfill', axis=1)

# MICE - Works & takes only Numerical Vars 
from impyute.imputation.cs import mice
# start the MICE training (Can be applied to all numerical Vars that have missing info in datasets)
Df_NumericalVars = Df.select_dtypes(include = np.number)
Df_Imputed_MICE = pd.DataFrame(data=mice(Df_NumericalVars.values), columns=Df_NumericalVars.columns, index=Df_NumericalVars.index)

# DataWig Imputation - https://github.com/awslabs/datawig - takes a lot of time
import datawig
# Var1 needs to be imputed
# Split data into obs with Var1 not missing and Var1 missing
X_train = X[pd.notnull(X.Var1)] #rows where Var1 is not missing are used for training
X_test = X[pd.isnull(X.Var1)] #rows where Var1 is missing
# Parameters
imputer = datawig.SimpleImputer(
    input_columns=['Var2','Var3','Var4','Var5','Var6', 'Var7'], # column(s), categorical & numerical; these vars themselves can have missing data
    output_column='Var1', # the column we'd like to impute values for; takes only 1 column at a time
    output_path = 'imputer_model') # stores model data and metrics
#Fit an imputer model on the train data
imputer.fit(train_df=X_train, num_epochs=50)    #num_epochs is not needed when imputing a categorical var (i.e. missing in a cat var)
#Impute missing values and return the original dataframe with predictions
predictions = imputer.predict(X_test)
Example #14

# Since the feature 'expnum' (the number of calls the person expects) has too many missing values, I dropped this column.

# In[14]:

trainTest = trainTest.drop(columns=['expnum'])

# Missing values in tuition_target, career_target, income_target, from_target, field_target, mn_sat_target and zipcode_target all need to be imputed. Here I used mice to fill each missing value from the other features. Based on the correlation between the target 'them_cal' and the other features, tuition_target and from_target seemed rather important, so I used a random forest model to predict those two features instead (a sketch follows the setup below).

# In[16]:

from impyute.imputation.cs import mice
trainTest_without = trainTest.drop(columns=['tuition_target', 'from_target'])
# start the MICE training
trainTest_be = mice(trainTest_without.values)
trainTest1 = pd.DataFrame(trainTest_be, columns=trainTest_without.columns)
trainTestf = trainTest1.join(trainTest[['tuition_target', 'from_target']])

# In[18]:

trainTest2 = trainTestf[[
    'tuition_target', 'career_target', 'income_target', 'field_target'
]]
tuition_notMissing_col = trainTest2[
    trainTest2['tuition_target'].notnull()].drop(columns=['tuition_target'])
tuition_notMissing_target = trainTest2[
    trainTest2['tuition_target'].notnull()]['tuition_target']
tuition_Missing_col = trainTest2[trainTest2['tuition_target'].isnull()].drop(
    columns=['tuition_target'])
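
# A minimal sketch of the random-forest step described above; RandomForestRegressor and its
# hyperparameters are my assumptions, the frames come from the setup just above:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(tuition_notMissing_col, tuition_notMissing_target)
trainTestf.loc[trainTestf['tuition_target'].isnull(),
               'tuition_target'] = rf.predict(tuition_Missing_col)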
Example #15
'''
GOAL: in the abbreviation MICE, "multivariate" means that until now we were filling NaN values from a single variable (median, mean, mode, etc.), whereas MICE uses more than one variable. It proceeds in the following steps:
        1. First take the dataset, plot only a selected subset of values on the coordinate axes, and fit a regression line.
        2. Read the missing value off the regression line at the corresponding axis. Then take another random subset of values and fit another regression line, giving another candidate for the missing value.
        3. Repeat this; in the end we have a set of multivariate candidate values for the missing entry.
        4. Compute the column mean with the missing value replaced by each candidate in turn, forming a set of means.
        5. Finally, take the mean of all the means formed above; this gives the most unbiased value for the missing NaN value.

'''
from impyute.imputation.cs import mice
import pandas as pd
import numpy as np
import random
import sys

diab = np.genfromtxt(
    'NaN_k-nn_filling.csv', delimiter=","
)  #taking the CSV dataset containing NaN values  and converting it into numpy.ndarray
# start the MICE training
imputed_training = mice(
    diab
)  #mice performs the multivariate imputation described above: a mean of means taken from repeated regression fits
pd.DataFrame(imputed_training).to_csv(
    "file_MICE.csv")  # writing the replaced NAN values into a new csv file
Example #16
import pandas as pd
from impyute.imputation.cs import mice
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from commonPart import fittingModel, predictValues, validatingResults, displayResults, splitDataRandomly

# %%
# getting the dataset
train_data = pd.read_csv('train.csv')
features_data = pd.read_csv('features.csv')
stores_data = pd.read_csv('stores.csv')

# %%
# predicting the missing values
imputed_training = mice(features_data.iloc[:, 2:11].values)
# %%
imputed_training[imputed_training < 0] = 0

for i in range(0, 7):
    features_data.iloc[:, 4 + i] = imputed_training[:, 2 + i]

# %%
# merging data
result = pd.merge(train_data,
                  features_data,
                  how='inner',
                  left_on=['Store', 'Date', 'IsHoliday'],
                  right_on=['Store', 'Date', 'IsHoliday'])

dataset = pd.merge(result, stores_data, on='Store')
Example #17
# read data
train_data = pd.read_csv("stock_XY_train.csv")
test_data = pd.read_csv("stock_X_test.csv")

train_data = train_data.drop(['id', 'Ticker', 'Sector'], axis=1).values
test_data = test_data.drop(['id', 'Unnamed: 0', 'Ticker', 'Sector'],
                           axis=1).values

# train_data imputation
# df_train_tr, df_val_tr = datawig.utils.random_split(train_data)
# df_train_te, df_val_te = datawig.utils.random_split(test_data)

from impyute.imputation.cs import mice

# start the MICE training
imputed_training = mice(train_data)
print(train_data.shape)
print(imputed_training.shape)

imputed_testing = mice(test_data)
print(test_data.shape)
print(imputed_testing.shape)

# Output the results to files (mice returns a NumPy array, so wrap it in a DataFrame first)
pd.DataFrame(imputed_training).to_csv("imputed_train.csv", index=False, sep=',')
pd.DataFrame(imputed_testing).to_csv("imputed_test.csv", index=False, sep=',')
Example #18
def show():
    data_path = "../data/DlRsrpSinrStats.txt"
    grid_width = 40
    time_slots = 100
    ue_num = grid_width * grid_width
    uav_num = grid_width * grid_width
    total_num = ue_num + uav_num

    #data_set = np.loadtxt(data_path,delimiter='	',skiprows=1);
    data_set = pd.read_table(data_path, delimiter='\t')
    print(data_set.shape)

    df = pd.DataFrame(data_set)

    #    out = open('/home/tao/dataset/out2.txt','w')
    #    csv_write = csv.writer(out)

    rsrp = []
    #construct matrix with location and timeslot
    bias = 3200 * 149
    for i in range(time_slots):
        temp = np.array(
            (df.loc[bias + i * total_num:bias + (i + 1) * total_num -
                    1].sort_values('IMSI'))['rsrp'])
        temp = temp[0:1600]

        row = np.array(temp)
        #        csv_write.writerow(temp)
        rsrp.append(row)
    #rsrp = np.array((df.loc[0:time_slots*total_num-1].sort_values('IMSI'))['rsrp'])
    #print('rsrp shape:',rsrp.shape)
    #print(rsrp.shape)
    mrsrp = []
    for power in rsrp:
        temp_power = 10 * np.log10(power) + 30
        mrsrp.append(temp_power)

    mrsrp = np.array(mrsrp)
    mrsrp = scale(mrsrp)

    print('mrsrp', mrsrp.shape)
    mat_rsrp = mrsrp.T

    m = mat_rsrp.shape[0]
    n = mat_rsrp.shape[1]

    missing_rates = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]

    #    [0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95]

    # add some random noise to the data
    def add_noise(origin_data, noise_level):
        np.random.seed(42)
        mu = 0
        sigma = noise_level
        noise = np.random.normal(mu, sigma, origin_data.shape)
        # convert the noise to dBm
        print(noise)
        return origin_data + noise

    #random_mat = np.random.uniform(size=[])
    #mat_rsrp = add_noise(mat_rsrp,3)
    xaxis = []
    yaxis = []
    knny = []

    nnmy = []
    micey = []

    knntime = []
    nnmtime = []
    micetime = []

    for missing_rate in missing_rates:
        mask = gen_mask(m, n, missing_rate)
        sample_data = mat_rsrp * mask
        print(missing_rate)
        print('origin_data')
        plot_image(mat_rsrp)
        print(mat_rsrp)

        print('the sample_data')
        plot_image(sample_data)
        print(sample_data)

        try:
            t1 = time.time()

            sample_data[sample_data == 0] = np.nan
            knn_recover = fast_knn(sample_data, k=3)
            print('knn')
            plot_image(knn_recover)
            error_knn = mean_absolute_error(mat_rsrp, knn_recover)
            knny.append(error_knn)

            t2 = time.time()
            ktime = (t2 - t1)

            knntime.append(ktime)
        except ValueError:
            knny.append(2)

            t2 = time.time()
            knntime.append(1200 * (1 + missing_rate))

        try:
            t1 = time.time()

            mice_data = mice(sample_data)
            print('mice')
            plot_image(mice_data)
            error_mice = mean_absolute_error(mat_rsrp, mice_data)
            micey.append(error_mice)

            t2 = time.time()
            micetime.append(t2 - t1)
        except ValueError:
            micey.append(2)

            t2 = time.time()
            micetime.append(1200 * (1 + missing_rate))

        try:
            t1 = time.time()

            X_filled_nnm = SoftImpute().fit_transform(sample_data)
            print('SoftImpute')
            plot_image(X_filled_nnm)
            error_nuclear = mean_absolute_error(mat_rsrp, X_filled_nnm)
            nnmy.append(error_nuclear)

            t2 = time.time()
            nnmtime.append(t2 - t1)

        except Exception:
            nnmy.append(2)

            t2 = time.time()
            nnmtime.append(1200 * (1 + missing_rate))

        xaxis.append(missing_rate)


#    plt.plot(xaxis,yaxis)
#    plt.xlabel("missing_rate")
#    plt.ylabel("mae")
#    plt.show()
    return np.array([xaxis, knny, micey, nnmy, knntime, micetime, nnmtime])
Example #19
def fill_empty_values(data: pd.DataFrame):
    imputed_training = mice(data.values)
    empty_mask = data.isna()
    data_array = data.values
    data_array[empty_mask] = imputed_training[empty_mask]
    return pd.DataFrame(data_array, columns=data.columns, index=data.index)
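
# A minimal usage sketch of fill_empty_values on hypothetical data. Note the design choice: only the
# originally-missing cells are overwritten, so observed values pass through unchanged.
import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [4.0, 5.0, np.nan]})
print(fill_empty_values(df))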
Example #20
# import libs
import pandas as pd
from impyute.imputation.cs import mice


# training data
train = pd.read_csv('train.csv')

# test data
test = pd.read_csv('test.csv')

# print(train.describe())
# mice returns a NumPy array (no to_csv), so wrap the result in a DataFrame
pd.DataFrame(mice(train.values), columns=train.columns).to_csv('train11.csv')
Example #21
def show():
    time_slots = 100

    data_path = "../data/DlRsrpSinrStats.txt"

    #data_set = np.loadtxt(data_path,delimiter='	',skiprows=1);
    data_set = pd.read_table(data_path, delimiter='\t')
    print(data_set.shape)

    df = pd.DataFrame(data_set)

    rsrp = []
    #construct matrix with location and timeslot
    bias = 3200 * 149
    for i in range(time_slots):
        temp = np.array((df.loc[bias + i * 3200:bias + (i + 1) * 3200 -
                                1].sort_values('IMSI'))['rsrp'])
        temp = temp[0:1600]

        row = np.array(temp)

        rsrp.append(row)
    #rsrp = np.array((df.loc[0:time_slots*total_num-1].sort_values('IMSI'))['rsrp'])
    #print('rsrp shape:',rsrp.shape)
    #print(rsrp.shape)

    mrsrp = []
    for power in rsrp:
        temp_power = 10 * np.log10(power) + 30
        mrsrp.append(temp_power)

    mrsrp = np.array(mrsrp)

    #    mrsrp = scale(mrsrp)

    print('mrsrp', mrsrp.shape)

    # generate a mask matrix
    m = 40
    n = 40

    missing_rates = [0.75]
    #[0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95]
    #random_mat = np.random.uniform(size=[])
    xaxis = []

    knnxaxis = []
    knnyaxis = []

    #    iiyaxis=[]
    nnmxaxis = []
    nnmyaxis = []
    micexaxis = []
    miceyaxis = []

    knntime = []
    nnmtime = []
    micetime = []

    for missing_rate in missing_rates:
        knny = []
        #        iiy=[]
        nny = []
        micey = []
        nnmy = []

        ktime = 0
        ntime = 0
        mtime = 0

        for mat_rsrp in mrsrp:
            mat_rsrp = np.array(mat_rsrp).reshape(40, 40)
            mask = gen_mask(m, n, missing_rate)
            #            mask = gen_mask2().reshape(40,40)
            print(mask)
            sample_data = mat_rsrp * mask

            print('origin_data')
            plot_image(mat_rsrp)

            print('the sample_data')
            plot_image(sample_data)

            try:
                t1 = time.time()
                sample_data[sample_data == 0] = np.nan
                knn_recover = fast_knn(sample_data, k=3)
                print('knn')
                plot_image(knn_recover)
                error_knn = mean_absolute_error(mat_rsrp, knn_recover)
                print(error_knn)
                knny.append(error_knn)
                t2 = time.time()
                ktime = ktime + (t2 - t1)
            except ValueError:
                knny.append(2)
                t2 = time.time()
                ktime = ktime + 600 * (1 + missing_rate)

            try:
                t1 = time.time()
                mice_data = mice(sample_data)
                print('mice')
                plot_image(mice_data)
                error_mice = mean_absolute_error(mat_rsrp, mice_data)

                micey.append(error_mice)

                t2 = time.time()
                mtime = mtime + (t2 - t1)
            except ValueError:
                micey.append(2)

                t2 = time.time()
                mtime = mtime + 600 * (1 + missing_rate)

            try:
                t1 = time.time()

                X_filled_nnm = SoftImpute().fit_transform(sample_data)
                print('NuclearNormMinimization')
                plot_image(X_filled_nnm)
                error_nuclear = mean_absolute_error(mat_rsrp, X_filled_nnm)
                nnmy.append(error_nuclear)

                t2 = time.time()
                ntime = ntime + (t2 - t1)
            except Exception:
                nnmy.append(2)

                t2 = time.time()
                ntime = ntime + 600 * (1 + missing_rate)

            break


#            print("\tknn:",error_knn,"\tmice:",error_mice,"\titer:",error_iter,"\tnuclear",error_nuclear)
        knntime.append(ktime)
        nnmtime.append(ntime)
        micetime.append(mtime)

        knnyaxis.append(np.mean(np.array(knny)))
        miceyaxis.append(np.mean(np.array(micey)))
        #        iiyaxis.append(np.mean(np.array(iiy)))
        nnmyaxis.append(np.mean(np.array(nnmy)))
        xaxis.append(missing_rate)

        #    plt.plot(xaxis,iiy,c='red',label='iter')
        #    plt.plot(xaxis,knny,c='blue',label='knn')
        #    plt.plot(xaxis,nnmy,c='orange',label='nnm')
        #    plt.plot(xaxis,micey,c='black',label='mice')
        #
        #    plt.xlabel("missing rate")
        #    plt.ylabel("mae")
        #    plt.legend()
        #    plt.show()
    res = np.array(
        [xaxis, knnyaxis, nnmyaxis, miceyaxis, knntime, nnmtime, micetime])
    return res
Example #22
# Crude way of removing a column, although it isn't really removed
X = X[:,1:]

# Encode embarkation data
labelencoder_X = LabelEncoder()
X[:, 8] = labelencoder_X.fit_transform(X[:, 8])

ct_1 = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [8])],   
    remainder='passthrough')
X = np.array(ct_1.fit_transform(X))
X = X[:,1:]
X = X.astype(float)

from impyute.imputation.cs import mice
imputed = mice(X)
mice_ages = imputed[:, 5]
X[:,5 ] = mice_ages
# Feature scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X = sc_X.fit_transform(X)

# Split the dataset into a training set and a test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Fit the classifier on the training set
from sklearn.svm import SVC
classifier = SVC(kernel = "rbf", random_state = 0)
classifier.fit(X_train, y_train)
Example #23
    # Y_predictions = [0 if values[0] > values[1] else 1 for values in results]
    # sns.heatmap(confusion_matrix(Y[3600:4000], Y_predictions), annot=True)
    # plt.show()


sys.setrecursionlimit(100000)
X_train = pd.read_csv("Case_Assignment/train.csv")
X_train = X_train.sort_values(['user_id'])
data_preprocessing(X_train)
data_normalization(X_train)
X_train = pd.concat([X_train, get_groups_from_json('train')], axis=1)
X = X_train[COLUMNS].to_numpy()
np.random.shuffle(X)
X, Y = X[:, 1:], X[:, 0]

X = mice(X)

X_test = pd.read_csv("Case_Assignment/test.csv")
X_test = X_test.sort_values(['user_id'])
data_preprocessing(X_test)
data_normalization(X_test)
X_test = pd.concat([X_test, get_groups_from_json('test')], axis=1)
X_test = X_test[COLUMNS].to_numpy()
X_test = X_test[:, 1:]
X_test = mice(X_test)

results = train_neural_network(X, Y, 25, 9, 2, X_test).to_numpy()
with open('Case_Assignment/scored_test.csv', mode='w') as file:
    writer = csv.writer(file,
                        delimiter=',',
                        quotechar='"',