Example no. 1
    def impute(self, df):
        # Impute missing values with either KNN or MICE (IterativeImputer),
        # wrapping the returned ndarray back into a DataFrame so the
        # column names are preserved.
        if self.knn:
            knn = KNN()
            return pd.DataFrame(knn.fit_transform(df), columns=df.columns)
        else:
            mice = IterativeImputer()
            return pd.DataFrame(mice.fit_transform(df), columns=df.columns)
    def complete(self, data: pd.DataFrame):
        # KNN-impute a copy of the data; fancyimpute drops column names,
        # so they are saved and restored around the call.
        df = data.copy()
        cols = list(df)
        df = pd.DataFrame(KNN(k=self.k, verbose=False).fit_transform(df))
        df.columns = cols
        return df
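
A minimal, self-contained sketch of what the two methods above do (the DataFrame here is illustrative, not from the original):

import numpy as np
import pandas as pd
from fancyimpute import KNN

df = pd.DataFrame({"a": [1.0, np.nan, 3.0, 4.0],
                   "b": [2.0, 2.5, np.nan, 4.5]})
# same pattern as impute()/complete(): fancyimpute returns an ndarray,
# so wrap it back into a DataFrame to keep the column names
filled = pd.DataFrame(KNN(k=2, verbose=False).fit_transform(df),
                      columns=df.columns)
print(filled)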
Example no. 3
import pandas as pd
import numpy as np
data = pd.read_excel(
    "C:/Users/mudmoham/Documents/pr/case studies/Employee Absenteeism/Absenteeism_at_work_Project.xls",
    sheet_name="Year_Sheet")  # `sheetname=` was renamed `sheet_name=` in pandas 0.21

pd.isnull(data).sum()
data["Reason for absence"] = data["Reason for absence"].fillna(20)
data.shape
data = data[data["Reason for absence"] != 0]

from fancyimpute import KNN
# complete() was renamed to the sklearn-style fit_transform() in fancyimpute >= 0.4
data = pd.DataFrame(KNN(k=3).fit_transform(data), columns=data.columns)
data = data.round()

pd.isnull(data).sum()
# num_columns is assumed to hold the numeric column names (defined earlier
# in the original script, outside this excerpt)
for col in num_columns:
    q75, q25 = np.percentile(data[col], [75, 25])
    iqr = q75 - q25
    maximum = q75 + iqr * 1.5
    minimum = q25 - iqr * 1.5
    data.loc[data[col] < minimum, col] = np.nan
    data.loc[data[col] > maximum, col] = np.nan

data = pd.DataFrame(KNN(k=3).fit_transform(data), columns=data.columns)
data = data.round()

data = data.drop(["Weight", "Height", "Disciplinary failure"], axis=1)

data["BMI"] = pd.cut(
    data["Body mass index"], [0, 18.5, 24.9, 29.9, 40],
Example no. 4
# (Excerpt from a larger script: json, numpy, pandas, fancyimpute.KNN,
# and the parsed `args` are set up outside this excerpt.)
with open(args.config) as f:
    config = json.load(f)

data_path = config["data_path"]                  # ground-truth data
corrupt_data_path = config["corrupt_data_path"]  # data containing missing values
n_neighbor = config["n_neighbor"]
trial_ind = config["trial_ind"]

# LOAD DATA
data = pd.read_csv(data_path).values
data_missing = pd.read_csv(corrupt_data_path).values

n_col = data_missing.shape[1]  # dimensionality of the data space
non_missing_row_ind = np.where(np.isfinite(np.sum(data_missing, axis=1)))
na_ind = np.where(np.isnan(data_missing))
na_count = len(na_ind[0])

knnImpute = KNN(k=n_neighbor)
print("Start Knn")
# X_impute_KNN = knnImpute.complete(Xdata_Missing)  # old fancyimpute API
data_impute_KNN = knnImpute.fit_transform(data_missing)
print("Knn finished")
# mean absolute error over the imputed entries
ReconstructionErrorKNN = sum(((data_impute_KNN[na_ind] - data[na_ind])**2)**0.5) / na_count
print('Reconstruction error (KNN):')
print(ReconstructionErrorKNN)

np.savetxt("./imputed_data_trial_" + str(trial_ind) + "_KNN.csv", data_impute_KNN, delimiter=",")
Example no. 5
def impute_knn(X):
    # Use the 3 nearest rows which have a feature to fill in each row's missing features
    return KNN(k=3).fit_transform(X)  # complete() is fit_transform() in fancyimpute >= 0.4
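
fancyimpute returns a plain NumPy array, so a small wrapper like the following (illustrative, not part of the original) preserves the DataFrame structure:

import pandas as pd
from fancyimpute import KNN

def impute_knn_df(df: pd.DataFrame) -> pd.DataFrame:
    # fit_transform() returns an ndarray; restore the index and column names
    return pd.DataFrame(KNN(k=3).fit_transform(df),
                        columns=df.columns, index=df.index)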
Example no. 6
dataset['Mode_transport'] = labelencoder_X.fit_transform(
    dataset['Mode_transport'])
dataset['comorbidity'] = labelencoder_X.fit_transform(dataset['comorbidity'])
dataset['Pulmonary score'] = labelencoder_X.fit_transform(
    dataset['Pulmonary score'])
dataset['cardiological pressure'] = labelencoder_X.fit_transform(
    dataset['cardiological pressure'])

dataset = dataset.drop(['Name'], axis=1)

scaler = StandardScaler()
standardized_features = scaler.fit_transform(dataset.iloc[:, 1:26])
#use the above to standardize all columns
#standardized_features = scaler.fit_transform(dataset[['Age', 'Coma score','Diuresis', 'Platelets','HBB','d-dimer','Heart rate','HDL cholesterol', 'Charlson Index','Insurance','salary']])
#dataset[['Age', 'Coma score','Diuresis', 'Platelets','HBB','d-dimer','Heart rate','HDL cholesterol', 'Charlson Index','Insurance','salary']]=standardized_features

features_knn_imputed = KNN(k=100,
                           verbose=0).fit_transform(standardized_features)

dataset.iloc[:, 1:26] = features_knn_imputed

correlation = dataset.iloc[:, 1:].corr(method='pearson')
columns = correlation.nlargest(25, 'Infect_Prob').index

correlation_map = np.corrcoef(dataset[columns].values.T)
sns.set(font_scale=1.0)
heatmap = sns.heatmap(correlation_map,
                      cbar=True,
                      annot=True,
                      square=True,
                      fmt='.2f',
                      yticklabels=columns.values,
                      xticklabels=columns.values)
import os
os.chdir('....\\data')  # os.chdir() takes a directory, not a file; the '....' path is elided in the original

#You DO NOT talk about Fight Club
import pandas as pd
from fancyimpute import KNN
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

#Only two guys to a fight
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')

#Someone yells stop, goes limp, taps out, the fight is over
train.isnull().sum()
# fancyimpute's KNN works on numeric data and returns an ndarray, so in
# practice the label encoding below would have to happen first; wrap the
# results back into DataFrames to keep the column names
train = pd.DataFrame(KNN(k=3).fit_transform(train), columns=train.columns)
test = pd.DataFrame(KNN(k=3).fit_transform(test), columns=test.columns)

#One fight at a time
le = LabelEncoder()
cat = ['genre','certificate', 'distributor']
for col in cat:
    train[col] = le.fit_transform(train[col])
    test[col] = le.fit_transform(test[col])

#no shirts, no shoes
train_X = train.drop(['year','oscar', 'movie_name', 'actor_name', 'href'], axis=1)  
test_X = test.drop(['year','oscar', 'movie_name', 'actor_name', 'href'], axis = 1)
 
train_Y = train['oscar']
Example no. 8
# 4.11 Filling in missing values

import numpy as np
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

features, _ = make_blobs(n_samples=1000, n_features=2, random_state=1)

scaler = StandardScaler()  # standardize the features
standardized_features = scaler.fit_transform(features)

# replace the first value of the first feature vector with a missing value
true_value = standardized_features[0, 0]
standardized_features[0, 0] = np.nan

# [1] Use the KNN algorithm to predict the missing values
features_knn_imputed = KNN(k=5, verbose=0).fit_transform(
    standardized_features)  # predict the missing values in the feature matrix

# Compare the true value and the imputed value
print('True Value:', true_value)
print('Imputed Value:', features_knn_imputed[0, 0])

# [2] Fill with the feature's mean, median, or mode
from sklearn.impute import SimpleImputer  # sklearn's old Imputer was replaced by SimpleImputer
mean_imputer = SimpleImputer(strategy="mean")
# impute standardized_features (the original passed `features`, which contains no NaN)
features_mean_imputed = mean_imputer.fit_transform(standardized_features)

print('True Value:', true_value)
print('Imputed Value:', features_mean_imputed[0, 0])
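
The comment above mentions mean, median, or mode; the other two strategies are a one-line change. A small sketch using sklearn's SimpleImputer:

median_imputer = SimpleImputer(strategy="median")
features_median_imputed = median_imputer.fit_transform(standardized_features)
print('Imputed Value (median):', features_median_imputed[0, 0])

mode_imputer = SimpleImputer(strategy="most_frequent")
features_mode_imputed = mode_imputer.fit_transform(standardized_features)
print('Imputed Value (mode):', features_mode_imputed[0, 0])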
import numpy as np
from fancyimpute import SimpleFill, KNN, NuclearNormMinimization, SoftImpute, BiScaler

m = 20
n = 200  # number of rows; the line defining n is missing from the excerpt, value assumed
inner_rank = 4
X = np.dot(np.random.randn(n, inner_rank), np.random.randn(inner_rank, m))
print("Mean squared element: %0.4f" % (X**2).mean())

# X is a data matrix which we're going to randomly drop entries from
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = X.copy()
# missing entries indicated with NaN
X_incomplete[missing_mask] = np.nan

meanFill = SimpleFill("mean")
X_filled_mean = meanFill.fit_transform(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.fit_transform(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly, instead
# induce sparsity using singular value thresholding
softImpute = SoftImpute()

# simultaneously normalizes the rows and columns of your observed data,
# sometimes useful for low-rank imputation methods
biscaler = BiScaler()

# rescale both rows and columns to have zero mean and unit variance
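# (The excerpt stops before using softImpute and biscaler; the canonical
# fancyimpute example continues roughly as follows.)
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)
X_filled_softimpute_normalized = softImpute.fit_transform(X_incomplete_normalized)
# undo the normalization to return to the original scale
X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized)
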
# strategy: "mean", "median", or "most_frequent"
# (imp is assumed to be a SimpleImputer built with one of these strategies;
# its definition falls outside this excerpt)
train['N30_missing_imputed'] = imp.fit_transform(train['N30'].values.reshape(
    -1, 1))
imp.fit_transform(
    train.iloc[:, 1:])  # removing the first column as it is a text variable

#Reference: https://pypi.python.org/pypi/fancyimpute/0.0.4
#pip3 install fancyimpute
#ONLY NUMERIC VALUES
from fancyimpute import NuclearNormMinimization, KNN
from fancyimpute import IterativeImputer as MICE  # MICE was renamed IterativeImputer in fancyimpute >= 0.4
solver = NuclearNormMinimization(min_value=0.0,
                                 max_value=1.0,
                                 error_tolerance=0.0005)
X_filled = solver.fit_transform(train['N30'].values.reshape(-1, 1))
X_filled = solver.fit_transform(train)
X_filled_knn = KNN(k=3).fit_transform(train)
#https://github.com/hammerlab/fancyimpute/blob/master/fancyimpute/mice.py
X_filled_mice = MICE().fit_transform(train.to_numpy())  # as_matrix() was removed from pandas
X_filled_mice_df = pd.DataFrame(X_filled_mice)
X_filled_mice_df.columns = train.columns
X_filled_mice_df.index = train.index
#Other methods: SimpleFill, SoftImpute, IterativeSVD, MICE, MatrixFactorization, NuclearNormMinimization, KNN, BiScaler
#SimpleFill: uses mean or median; SoftImpute: Matrix completion;

###Smote
# Only numeric/boolean and non-null values as input to the TSNE model;
# better to try this after missing-value imputation and encoding
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
# fit_sample() was renamed fit_resample() in imbalanced-learn 0.4+
X_train_new, y_train_new = sm.fit_resample(train.dropna().iloc[:, 1:44],
                                           train.dropna()['Dependent_Variable'])
from fancyimpute import KNN
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
# to display all the columns of the dataframe in the notebook
pd.pandas.set_option('display.max_columns', None)

# load dataset
data = pd.read_csv('credit-card-data.csv')
data.drop('CUST_ID', axis=1, inplace=True)

#Apply KNN imputation algorithm
data = pd.DataFrame(KNN(k=3).fit_transform(data), columns=data.columns)

#Variables with Missing Value percentage
data.apply(lambda x: x.isnull().sum() / len(data) * 100)

data.to_csv('credit_card_knn_imputed.csv', index=False)

data['MA_PURCHASE'] = data['PURCHASES'] / data['TENURE']
data['MA_CASH_ADVANCE'] = data['CASH_ADVANCE'] / data['TENURE']
data['LIMIT_USAGE'] = data['BALANCE'] / data['CREDIT_LIMIT']
data['PAY_MINPAY_RATIO'] = data['PAYMENTS'] / data['MINIMUM_PAYMENTS']

#drop purchases,cash_advance,tenure(less variability),Balance,CreditLimit


def purchase_type(data):
    if (data['ONEOFF_PURCHASES'] == 0) & (data['INSTALLMENTS_PURCHASES'] == 0):
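        # (The original snippet is truncated here; a plausible completion,
        # assuming the usual none/one-off/installment/both segmentation.)
        return 'none'
    elif (data['ONEOFF_PURCHASES'] > 0) & (data['INSTALLMENTS_PURCHASES'] == 0):
        return 'one_off'
    elif (data['ONEOFF_PURCHASES'] == 0) & (data['INSTALLMENTS_PURCHASES'] > 0):
        return 'installment'
    else:
        return 'both'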
stock = stock.join(upperband)
stock = stock.join(middleband)
stock = stock.join(lowerband)
stock = stock.join(Roc)
stock = stock.join(Atr)
stock = stock.join(rollingrank)
stock = stock.join(rollingrank1)
stock = stock.join(div)
stock = stock.join(voldiff)
stock = stock.join(VolROC)
stock = stock.join(opendiff)

# Impute missing values using KNN
stock = stock.to_numpy()  # as_matrix() was removed from pandas
stock = np.append(stock, arima, 1)  # arima comes from earlier in the original script
stock = KNN(k=15).fit_transform(stock)

stock = pd.DataFrame(stock)

stock_train = stock.iloc[0:round(len(stock) * 0.8), :]
stock_test = stock.iloc[round(len(stock) * 0.8):, :]

# Feature Scaling
from sklearn.preprocessing import MinMaxScaler

sc = MinMaxScaler()
training_set_scaled = sc.fit_transform(stock_train)
sc_predict = MinMaxScaler()
test_set_scaled = sc_predict.fit_transform(stock_test)

X_train = []
round(df.loc[df.Cabin.isnull(), :].groupby(
    ["Pclass"]).size() / df.groupby(["Pclass"]).size() * 100)
# Percentage of NAs by Parch:
round(df.loc[df.Cabin.isnull(), :].groupby(
    ["Parch"]).size() / df.groupby(["Parch"]).size() * 100)


# How could we impute the missing data?

# Instead of using the mean, median or mode to impute the NAs, we could
# use KNN imputation or regression. Here I have chosen to use KNN.

# Create new columns for KNN
num_vars = [
    'Age',
    'Survived',
    'Pclass',
    'SibSp',
    'Parch',
    'Fare']
df_impute = pd.DataFrame(KNN(k=3).fit_transform(df.loc[:, num_vars]))
df_impute.columns = num_vars

# Round the predictions
df_impute.Age = df_impute.Age.round()

# Update df
df_impute.isnull().sum(axis=0)
df = pd.concat([df.drop(num_vars, axis=1), df_impute], axis=1)

Example no. 14
def knn_imputation(k):
    # note: each split is imputed independently; the val/test imputations
    # do not reuse information from the training split
    X_train_new = KNN(k=k).fit_transform(X_train)
    X_val_new = KNN(k=k).fit_transform(X_val)
    X_test_new = KNN(k=k).fit_transform(X_test)
    return X_train_new, X_val_new, X_test_new
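
A hypothetical call, assuming X_train, X_val, and X_test are numeric arrays containing NaNs (the toy data below is illustrative only):

import numpy as np

X_train = np.array([[1.0, 2.0], [np.nan, 3.0], [4.0, np.nan], [5.0, 6.0]])
X_val = np.array([[2.0, np.nan], [3.0, 4.0]])
X_test = np.array([[np.nan, 5.0], [6.0, 7.0]])

X_train_new, X_val_new, X_test_new = knn_imputation(k=2)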
absentData['Pet']               = absentData['Pet'].astype('category')

absentData.dtypes


#--------------------------------- Missing Value Analysis ---------------------------------#
missingVal = absentData.isnull().sum().sum()
missingValPercent = missingVal / len(absentData.index) * 100
round(missingValPercent)

#Approx 24% of the values in the dataset are null, so we need to impute them with a suitable method.

#Missing Value Imputation

absentData.isnull().sum()
absentData = pd.DataFrame(KNN(k = 3).fit_transform(absentData), columns = absentData.columns)
absentData = absentData.round()

#------------------------------- Outlier Analysis -----------------------------------------#
sns.boxplot(data=absentData[['Absenteeism time in hours','Service time','Height','Weight','Transportation expense','Age']])
fig=plt.gcf()
fig.set_size_inches(8,12)
sns.boxplot(data=absentData[['Work load Average/day']])


#Computing the benchmark for the numeric values
numericValues = ['Work load Average/day','Distance from Residence to Work', 'Service time', 'Age','Transportation expense','Hit target', 'Weight', 'Height', 'Body mass index', 'Absenteeism time in hours']

for i in numericValues:
    q75, q25 = np.percentile(absentData[i], [75,25])
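    # (The excerpt is truncated here; the loop presumably continues with the
    # same IQR fencing used in the other examples, e.g.:)
    iqr = q75 - q25
    minimum = q25 - (iqr * 1.5)
    maximum = q75 + (iqr * 1.5)
    absentData.loc[absentData[i] < minimum, i] = np.nan
    absentData.loc[absentData[i] > maximum, i] = np.nan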
    
Example no. 16
# Wrangle all data into one dataframe
allstations = pd.concat([
    station1df["Value"], station2df["Value"], station3df["Value"],
    station4df["Value"]
],
                        axis=1)
allstations.columns = ["station1", "station2", "station3", "station4"]

# Run the selected imputation routine to fill all missing cases
if method == "SoftImpute":
    allstations_complete = pd.DataFrame(
        data=SoftImpute().complete(allstations),
        columns=allstations.columns,
        index=allstations.index)
elif method == "KNN":
    allstations_complete = pd.DataFrame(data=KNN().complete(allstations),
                                        columns=allstations.columns,
                                        index=allstations.index)
elif method == "MICE":
    allstations_complete = pd.DataFrame(data=MICE().complete(allstations),
                                        columns=allstations.columns,
                                        index=allstations.index)
else:
    print "Sorry, the imputation method %s is not available, try MICE, KNN, or SoftImpute" % method

# Unstack the data to get values back in monthly columns, then output the
# filled datasets with the method prefixed to the input filenames.
# Round data values to the nearest integer if asked.
if Round is True:
    pd.DataFrame(allstations_complete["station1"].values.reshape(-1, n),
                 columns=names).round(0).astype("int32").to_csv(
                     "%sfilled_%s" % (method, station1))
    pd.DataFrame(allstations_complete["station2"].values.reshape(-1, n),
Example no. 17
            count += 1
    pre = count * 1.0 / sum(predict)   # precision
    recall = count * 1.0 / sum(train)  # recall
    return 2 * pre * recall / (pre + recall)
    
train_data = pd.read_csv('C:\\Users\\JingYi\\Desktop\\diabetes_prediction\\train_data.csv', encoding='gbk')
# 1000,85

filter_feature = ['id', 'label']  # id and target columns
features = []
for x in train_data.columns:  # collect the feature columns
    if x not in filter_feature:
        features.append(x)

train_data_x = train_data[features]
train_data_x = pd.DataFrame(KNN(k=6).fit_transform(train_data_x), columns=features)
train_data_y = train_data['label']

X_train, X_test, y_train, y_test = train_test_split(train_data_x, train_data_y, random_state=1)  # split into training and test sets

linreg = LogisticRegression()
linreg.fit(X_train, y_train)  # train the model


y_pred = linreg.predict(X_train)  # predict
print("training set", countF1(y_train.values, y_pred))

y_pred = linreg.predict(X_test)  # predict
print("test set", countF1(y_test.values, y_pred))

# Step 2: Let us impute the missing values.

# MisVal = ImputationMissingValues()
# imputed_mean_dataset = MisVal.impute_mean(copy.deepcopy(dataset), 'hr_watch_rate')
# imputed_median_dataset = MisVal.impute_median(copy.deepcopy(dataset), 'hr_watch_rate')
# imputed_interpolation_dataset = MisVal.impute_interpolate(copy.deepcopy(dataset), 'hr_watch_rate')
# DataViz.plot_imputed_values(dataset, ['original', 'mean', 'interpolation'], 'hr_watch_rate', imputed_mean_dataset['hr_watch_rate'], imputed_interpolation_dataset['hr_watch_rate'])

X_incomplete = dataset
# print(list(X_incomplete))
# # X is the complete data matrix
# # X_incomplete has the same values as X except a subset have been replace with NaN
#
# # Use 3 nearest rows which have a feature to fill in each row's missing features
X_filled_knn = KNN(k=6).fit_transform(X_incomplete)  # complete() is fit_transform() in fancyimpute >= 0.4

DataViz.plot_imputed_values(dataset, ['original', 'imputed'], 'hr_watch_rate',
                            X_filled_knn[:, 0])

# # matrix completion using convex optimization to find low-rank solution
# # that still matches observed values. Slow!
# X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)
#
# # Instead of solving the nuclear norm objective directly, instead
# # induce sparsity using singular value thresholding
# X_filled_softimpute = SoftImpute().complete(X_incomplete_normalized)
#
# # print mean squared error for the three imputation methods above
# nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean()
Example no. 19
# #Calculate IQR
    iqr = q75 - q25

# #Calculate inner and outer fence
    minimum = q25 - (iqr*1.5)
    maximum = q75 + (iqr*1.5)

# #Replace outliers with NA
    train.loc[train.loc[:, i] < minimum, i] = np.nan
    train.loc[train.loc[:, i] > maximum, i] = np.nan

# #Calculate missing values
    missing_val = pd.DataFrame(train.isnull().sum())  # the original counted marketing_train here, apparently a slip

# #Impute with KNN (complete() is fit_transform() in fancyimpute >= 0.4)
    train = pd.DataFrame(KNN(k=3).fit_transform(train), columns=train.columns)


# In[13]:


##Correlation analysis
#Correlation plot
df_corr = train.loc[:,cnames]


# In[14]:


df_corr
    if input_file_name in saved_list:
        print("Already done: %s" % input_file_name)
        continue
    print("======== Processing %s ========" % input_file_name)
    # read the data
    data_Aqua = pd.read_excel(input_file_path_Aqua + input_file_name)
    data_Terra = pd.read_excel(input_file_path_Terra + input_file_name)
    # drop the string column so the computation is purely numeric
    del data_Terra["监测站"]
    del data_Aqua["监测站"]
    data_Aqua = data_Aqua.set_index('日期')
    data_Terra = data_Terra.set_index('日期')
    # temporally local: KNN
    # nearest-neighbor imputation weights samples by the mean squared difference
    # over the features both rows have observed, then fills the missing values
    # with the weighted result. Here the 17 AOD points act as features, so k
    # runs over time: the temporally nearest rows are weighted by the
    # per-feature mean squared difference
    data_Aqua_KNN = KNN(k=7).fit_transform(data_Aqua)
    data_Aqua_KNN = pd.DataFrame(data_Aqua_KNN)  # many zeros in the result should be missing values
    data_Terra_KNN = KNN(k=7).fit_transform(data_Terra)
    data_Terra_KNN = pd.DataFrame(data_Terra_KNN)  # many zeros in the result should be missing values

    # temporally global: exponential smoothing, commonly used for stock-market data
    data_Aqua_ewm = pd.DataFrame.ewm(self=data_Aqua,
                                     com=0.5,
                                     ignore_na=True,
                                     adjust=True).mean()
    data_Terra_ewm = pd.DataFrame.ewm(self=data_Terra,
                                      com=0.5,
                                      ignore_na=True,
                                      adjust=True).mean()

    # spatially local: IDW
import numpy as np
import pandas as pd
from fancyimpute import KNN

dataset = pd.read_csv('MissingData2.csv', sep=",", header=None)
dataset = dataset.replace(1e99, np.NaN)  # 1e99 is the missing-value sentinel in this file

dataset = dataset.values
df_filled = pd.DataFrame(KNN(k=3).fit_transform(dataset))

np.savetxt('induriMissingResult2.txt', df_filled, delimiter=',', newline='\n')
# imp_columns: the numeric columns to impute (the opening lines of this list
# are truncated in the original; it matches the selection used below)
imp_columns = ['QB', 'RB', 'TE', 'WR', 'height', 'weight', 'bmi', 'arm_length',
'hand_size', 'front_shoulder', 'back_shoulder', 'wonderlic', 'pass_velocity', 'ten_yard',
'twenty_yard', 'forty_yard', 'bench_press', 'vertical_leap', 'broad_jump', 'shuttle', 'sixty_shuttle',
'three_cone', 'four_square', 'games_mean', 'games_max', 'cmp_mean', 'cmp_max', 'pass_att_mean', 'pass_att_max',
'pass_yards_mean', 'pass_yards_max', 'pass_td_mean', 'pass_td_max', 'intcp_mean', 'intcp_max',
'rating_mean', 'rating_max', 'rush_att_mean', 'rush_att_max',
'rush_yards_mean', 'rush_yards_max', 'rush_td_mean', 'rush_td_max', 'rec_mean', 'rec_max', 'rec_yards_mean',
'rec_yards_max', 'rec_td_mean', 'rec_td_max', 'college_points_mean', 'college_points_max', 'avg_diff', 'age']

imp_numeric = all_data[imp_columns].values

# KNN imputing
imp = pd.DataFrame(KNN(k=5).fit_transform(imp_numeric), columns=imp_columns)

# add imputed values rest of dataset
all_data_imp = all_data.drop(imp_columns, axis=1)

master_data = all_data_imp.merge(imp, left_index=True, right_index=True)

# new combine variables
master_data['speed_score'] = (master_data.weight * 200)/(master_data.forty_yard**4)
master_data['agility_score'] = master_data.three_cone + master_data.shuttle
master_data['height_adj_ss'] = master_data.speed_score * (master_data.height / 73.5) ** 1.5
master_data['burst_score'] = master_data.vertical_leap + master_data.broad_jump
# catch radius and weight adjusted bench ?

# merge and drop players without combine data or without any stats
stats = stats.merge(df[['player_id', 'draft_year']], on='player_id', how='outer')
import sklearn.datasets as SKD

data = pd.read_csv('ai_mavan_adhd7.csv', sep=',', index_col=None)
# (`adhd`, imputed below, is derived from `data` in cells omitted from this excerpt)

# In[ ]:

# MICE IMPUTATION (IterativeImputer comes from fancyimpute or sklearn.impute)
mice_impute = IterativeImputer()
traindatafill = mice_impute.fit_transform(adhd)

# In[ ]:

# KNN way to impute

adhd_filled_knn = KNN(k=3).fit_transform(
    adhd
)  # use 3 nearest rows which have a feature to fill in each row's missing features

# In[ ]:

# NUCLEAR NORM MINIMIZATION
adhd_filled_nnm = NuclearNormMinimization().fit_transform(adhd)

# In[69]:

#ENTER COLUMNS LABELS THAT HAVE DISCRETE VARIABLES

discrete_columns = [
    'Hamilton', 'gender_male', 'Dob_MONTH_DIGIT', 'Hamilton', 'above_college',
    'QuintMat_w', 'QuintSoc_w', 'Mood_drug', 'Pren_income4', 'No_depression',
    'Postpartum_depression', 'Materl_anxiety', 'B_HTTLPR_2', 'B_DRD1_hap',
Example no. 24
features = data.iloc[:, :-1]
labels = data.iloc[:, -1]

#%%
features.count()
(features == 0).astype(int).sum(axis=0)

features[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
          'BMI']] = features[[
              'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'
          ]].replace(0, np.nan)
(features == 0).astype(int).sum(axis=0)

#%%

features = pd.DataFrame(data=KNN(k=23).fit_transform(features),
                        index=list(range(len(features))),
                        columns=list(data.columns[:-1]))

#%%
plt.bar(x=0,
        height=(labels == 0).sum(),
        width=0.5,
        color='salmon',
        label='Outcome 0')
plt.bar(x=1,
        height=(labels == 1).sum(),
        width=0.5,
        color='cyan',
        label='Outcome 1')
plt.xticks([0, 1])
Example no. 25
import numpy as np
from fancyimpute import SimpleFill, KNN, NuclearNormMinimization, SoftImpute, BiScaler

m = 20
n = 200  # number of rows; the line defining n is missing from the excerpt, value assumed
inner_rank = 4
X = np.dot(np.random.randn(n, inner_rank), np.random.randn(inner_rank, m))
print("Mean squared element: %0.4f" % (X**2).mean())

# X is a data matrix which we're going to randomly drop entries from
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = X.copy()
# missing entries indicated with NaN
X_incomplete[missing_mask] = np.nan

meanFill = SimpleFill("mean")
X_filled_mean = meanFill.fit_transform(X_incomplete)  # complete() is fit_transform() in fancyimpute >= 0.4

# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.fit_transform(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly, instead
# induce sparsity using singular value thresholding
softImpute = SoftImpute()

# simultaneously normalizes the rows and columns of your observed data,
# sometimes useful for low-rank imputation methods
biscaler = BiScaler()

# rescale both rows and columns to have zero mean and unit variance
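# (The excerpt stops before using softImpute and biscaler; the canonical
# fancyimpute example continues roughly as follows.)
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)
X_filled_softimpute_normalized = softImpute.fit_transform(X_incomplete_normalized)
# undo the normalization to return to the original scale
X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized)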
train['Age'].fillna(train.groupby('Sex')['Age'].transform("median"), inplace=True)

#Pearson correlation of features
colormap = plt.cm.RdBu
plt.figure(figsize=(32,10))
plt.title('Pearson correlation of features',y=1.05, size=15)
sns.heatmap(train.corr(), linewidths=0.1, vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)


#KNN method: replace missing values of the emp_length column using the nearest rows
from fancyimpute import KNN
# fancyimpute strips column names, so save them from the DataFrame first
train_cols = list(train)

# use the 5 nearest rows to fill in each row's missing features
train = pd.DataFrame(KNN(k=5).fit_transform(train))
train.columns = train_cols

#MICE method uses Bayesian ridge regression, which helps avoid bias
from fancyimpute import IterativeImputer as MICE
#use MICE to fill missing rows
train_cols = list(train)  # the original read list(loans), apparently a slip
train = pd.DataFrame(MICE(verbose=False).fit_transform(train))
train.columns = train_cols




#Linear regression method
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
Example no. 27
                        axis=1)

numeric_feats = caddn + caddn0 + caddabs + defn + defn0 + ['TrimerAvg']
cat_feats = caddc + defc + ['Trimer', 'TrimerMut']

# convert_objects() was removed from pandas; pd.to_numeric is the replacement
df[numeric_feats] = df[numeric_feats].apply(pd.to_numeric, errors='coerce')

from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute
from fancyimpute import IterativeImputer as MICE  # MICE was renamed in fancyimpute >= 0.4
import random

d = df[numeric_feats + ['gene']]
newd = pd.DataFrame()
for i in df['gene'].unique().tolist():
    print "Doing gene ", i
    dtemp = d[d['gene'] == i][numeric_feats]
    tempx = pd.DataFrame(data=KNN(k=3).fit_transform(dtemp),
                         columns=dtemp.columns,
                         index=dtemp.index)
    newd = pd.concat([newd, tempx])

X = newd
#X=pd.DataFrame(data=Ximp, columns=df[numeric_feats].columns, index=df[numeric_feats].index)
X[cat_feats + ['gene']] = df[cat_feats + ['gene']]

#X.to_csv('ImputedImcomplete.tsv',sep="\t",index=False)

#X=df[numeric_feats+cat_feats]
import category_encoders as ce
encoder = ce.OneHotEncoder(cols=cat_feats + ['gene'])
X = encoder.fit_transform(X)
previsores[:, :] = imputer.fit_transform(previsores[:, :])

# Convert categorical/nominal data to numeric
from sklearn.preprocessing import LabelEncoder
previsores[:, 0] = LabelEncoder().fit_transform(previsores[:, 0].astype('str'))
previsores[:, 1] = LabelEncoder().fit_transform(previsores[:, 1].astype('str'))
previsores[:, 2] = LabelEncoder().fit_transform(previsores[:, 2].astype('str'))
previsores[:, 3] = LabelEncoder().fit_transform(previsores[:, 3].astype('str'))
previsores[:, 5] = LabelEncoder().fit_transform(previsores[:, 5].astype('str'))
previsores[:, 6] = LabelEncoder().fit_transform(previsores[:, 6].astype('str'))
previsores[:, 7] = LabelEncoder().fit_transform(previsores[:, 7].astype('str'))

# Package providing algorithms for handling missing values in a dataset
from fancyimpute import KNN
# Use the 5 nearest rows which have a feature to fill in each row's missing values
previsores = KNN(k=5).fit_transform(previsores)

# Convert the object into a DataFrame to inspect the preprocessing
result = pd.DataFrame(previsores)

# Create the attribute to be predicted
classe = result.iloc[:, 10].values
# Drop that same target attribute from the predictor dataset
result = result.drop(columns=10)

# Drop the Skin Color attribute since it contains many missing values
result = result.drop(columns=6)

# Write the modification back
previsores = result.iloc[:, :].values
# Cast all the datasets used to int
Example no. 29
test_data_statiton = pd.read_csv('data/test/test_normal.csv')
test_data_null = pd.read_csv('data/test/test_null.csv')
test_mask = pd.read_csv('data/test/test_mask.csv')

trend_matrix = pd.read_csv('data/test/mstlplus_trend.csv', index_col=0)
seasonal12_matrix = pd.read_csv('data/test/mstlplus_seasonal12.csv',
                                index_col=0)
seasonal84_matrix = pd.read_csv('data/test/mstlplus_seasonal84.csv',
                                index_col=0)
remainder_matrix = pd.read_csv('data/test/mstlplus_remainder.csv', index_col=0)

validate_null_number = (test_data_null.isna().sum().sum()
                        - test_data_statiton.isna().sum().sum())

k_number = 10
remainder_knn = pd.DataFrame(KNN(k=k_number).fit_transform(remainder_matrix))

data_mstlplus_knn = (remainder_knn.to_numpy()
                     + trend_matrix + seasonal12_matrix + seasonal84_matrix)

error_mask = (test_data_statiton.fillna(0).to_numpy() -
              data_mstlplus_knn) * (1 - test_mask).to_numpy()

mse_error = error_mask**2
mae_error = mre_error = abs(error_mask)

total_error_MSE = mse_error.sum().sum()
total_error_MAE = mae_error.sum().sum()
total_error_MRE = mre_error.sum().sum()

total_label_MRE = abs(
Example no. 30
# use RandomForestRegression to train data                 
#RFR = RandomForestRegressor(n_estimators=80, n_jobs=-1)                   
#RFR1 = RandomForestRegressor(n_estimators=80, n_jobs=-1) 
#RFR.fit(X,Y)                                               
#RFR1.fit(x,y)
#predictAges = RFR.predict(age_df_isnull.values[:,1:])      
#predictAges1 = RFR1.predict(age_df1_isnull.values[:,1:])
#train.loc[train['Age'].isnull(), ['Age']]= predictAges
#test.loc[test['Age'].isnull(), ['Age']]= predictAges1
#print(test.info())
#age_bins = [0,1,4,13,18,35,45,55,65,180]
#train['age_group'] = pd.cut(train['Age'],age_bins)
#train['age_group']
#train['Age']
from fancyimpute import KNN
# complete() is fit_transform() in fancyimpute >= 0.4; wrap results to keep the columns
age_train = KNN(k=10).fit_transform(train)
train = pd.DataFrame(age_train, columns=train.columns)
# print(train['Age'])
age_test = KNN(k=10).fit_transform(test)

test = pd.DataFrame(age_test, columns = test.columns)

age_bins = [0,1,4,13,18,35,45,55,65,180]
train['age_group'] = pd.cut(train['Age'],age_bins)
label = LabelEncoder()
train['age_group'] = label.fit_transform(train['age_group'])

test['age_group'] = pd.cut(test['Age'],age_bins)
test['age_group'] = label.fit_transform(test['age_group'])
train.head()
Example no. 31
import numpy as np
from fancyimpute import SimpleFill, KNN, NuclearNormMinimization, SoftImpute, BiScaler

m = 20
n = 200  # number of rows; the line defining n is missing from the excerpt, value assumed
inner_rank = 4
X = np.dot(np.random.randn(n, inner_rank), np.random.randn(inner_rank, m))
print("Mean squared element: %0.4f" % (X ** 2).mean())

# X is a data matrix which we're going to randomly drop entries from
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = X.copy()
# missing entries indicated with NaN
X_incomplete[missing_mask] = np.nan

meanFill = SimpleFill("mean")
X_filled_mean = meanFill.fit_transform(X_incomplete)  # complete() is fit_transform() in fancyimpute >= 0.4

# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.fit_transform(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly, instead
# induce sparsity using singular value thresholding
softImpute = SoftImpute()

# simultaneously normalizes the rows and columns of your observed data,
# sometimes useful for low-rank imputation methods
biscaler = BiScaler()

# rescale both rows and columns to have zero mean and unit variance
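# (The excerpt stops before using softImpute and biscaler; the canonical
# fancyimpute example continues roughly as follows.)
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)
X_filled_softimpute_normalized = softImpute.fit_transform(X_incomplete_normalized)
# undo the normalization to return to the original scale
X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized)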
Example no. 32
def KNNtrans(d):
    # KNN-impute d and restore the column names that fancyimpute drops
    m = KNN(k=20).fit_transform(d)
    m = pd.DataFrame(m, columns=d.columns)
    return m
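
A hypothetical usage of KNNtrans (the imports and toy data are illustrative, not from the original):

import numpy as np
import pandas as pd
from fancyimpute import KNN

rng = np.random.default_rng(0)
d = pd.DataFrame(rng.normal(size=(100, 4)), columns=list("abcd"))
d.iloc[::7, 0] = np.nan  # knock out some entries
filled = KNNtrans(d)     # same columns back, NaNs filled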