def impute(self, df):
    """Impute missing values with KNN if self.knn is set, otherwise with MICE."""
    if self.knn:
        knn = KNN()
        return pd.DataFrame(knn.fit_transform(df), columns=df.columns)
    else:
        mice = IterativeImputer()
        return pd.DataFrame(mice.fit_transform(df), columns=df.columns)
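# A minimal, self-contained sketch of how the method above might be wired up
# and called. The Imputer wrapper, its knn flag, and the toy data are
# illustrative assumptions, not part of the original code.
import numpy as np
import pandas as pd
from fancyimpute import KNN, IterativeImputer

class Imputer:
    def __init__(self, knn=True):
        self.knn = knn  # True -> KNN imputation, False -> MICE (IterativeImputer)

    def impute(self, df):
        if self.knn:
            knn = KNN()
            return pd.DataFrame(knn.fit_transform(df), columns=df.columns)
        else:
            mice = IterativeImputer()
            return pd.DataFrame(mice.fit_transform(df), columns=df.columns)

toy = pd.DataFrame({'a': [1.0, np.nan, 3.0, 4.0, 5.0, 6.0],
                    'b': [4.0, 5.0, np.nan, 7.0, 8.0, 9.0]})
filled = Imputer(knn=True).impute(toy)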
def impute_features(scaled_features):
    # Impute missing values with fancyimpute's KNN (5 nearest neighbours)
    from fancyimpute import KNN
    knn = KNN(k=5)
    imputed_values = knn.fit_transform(scaled_features.values)
    imputed_features = pd.DataFrame(imputed_values,
                                    index=scaled_features.index,
                                    columns=scaled_features.columns)
    return imputed_features
def impute(self, trained_model, input):
    """
    Loads the input table and gives the imputed table

    :param trained_model: trained model returned by the train function - not used in our case
    :param input: input table which needs to be imputed
    :return: X_filled_knn: imputed table as a numpy array
    """
    # Use the 3 nearest rows which have a feature to fill in each row's
    # missing features; trained_model is unused because KNN "training"
    # happens during imputation itself.
    X_incomplete = input
    knnImpute = KNN(k=3)
    X_filled_knn = knnImpute.fit_transform(X_incomplete)
    return X_filled_knn
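# A self-contained sketch of the same fancyimpute call, on made-up data:
import numpy as np
from fancyimpute import KNN

X_incomplete = np.array([[1.0, 2.0, np.nan],
                         [3.0, np.nan, 6.0],
                         [7.0, 8.0, 9.0],
                         [2.0, 3.0, 4.0],
                         [6.0, 7.0, 8.0]])
X_filled = KNN(k=3).fit_transform(X_incomplete)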
def KNNfill(df, usecols, predcol, knn_k=5):
    dfcpy = df.copy().fillna(value=float('NaN')).loc[:, usecols]
    minval = dfcpy.loc[dfcpy[predcol].notnull(), predcol].min()
    meanval = dfcpy.loc[dfcpy[predcol].notnull(), predcol].mean()  # for reference only
    maxval = dfcpy.loc[dfcpy[predcol].notnull(), predcol].max()
    # Clamp imputed values to the observed range of the target column
    predictor = KNN(k=knn_k, min_value=minval, max_value=maxval)
    print("Starting Imputation, printing NaNs for the passed DataFrame:\n{}\n"
          .format(dfcpy.isnull().sum()))
    print("{} values missing for {}".format(dfcpy[predcol].isnull().sum(), predcol))
    imputed_df = pd.DataFrame(data=predictor.fit_transform(dfcpy), columns=usecols)
    imputed_df['orig_' + predcol] = dfcpy.loc[:, predcol]
    return imputed_df
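# A hedged usage sketch for KNNfill; the column names and values are invented
# for illustration, and every column passed in usecols must already be numeric.
import pandas as pd

toy = pd.DataFrame({'age': [22.0, None, 35.0, 41.0, None, 29.0],
                    'fare': [7.3, 8.1, 53.0, 12.0, 9.5, 10.1],
                    'pclass': [3, 3, 1, 2, 3, 2]})
filled = KNNfill(toy, usecols=['age', 'fare', 'pclass'], predcol='age', knn_k=2)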
def impute_df(df):
    imputer = KNN(k=2)
    object_types = list(df.select_dtypes(include=['object']).columns)
    num_types = list(set(df.columns) - set(object_types))
    encoders_store = {}
    # Numeric columns: mean-fill when roughly symmetric, median-fill when skewed
    for column in num_types:
        skew = df[column].skew()
        if -1 < skew < 1:
            df[column] = df[column].fillna(df[column].mean())
        else:
            df[column] = df[column].fillna(df[column].median())
    # Ordinally encode each object column so KNN can work on numeric values
    for column in object_types:
        new = encode(df[column])
        encoders_store[column] = new[1]
    imputed_data = pd.DataFrame(np.round(imputer.fit_transform(df)),
                                columns=df.columns)
    # Map the encoded categories back to their original labels
    for column in object_types:
        imputed_data[column] = encoders_store[column].inverse_transform(
            np.array(imputed_data[column]).reshape(-1, 1))
    return imputed_data
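# impute_df relies on an external encode() helper that isn't shown. A plausible
# minimal sketch, assuming it ordinal-encodes the non-null values of a Series
# in place and returns the Series together with the fitted encoder (an
# assumption, not the original implementation):
from sklearn.preprocessing import OrdinalEncoder

def encode(col):
    enc = OrdinalEncoder()
    not_null = col.notnull()
    col[not_null] = enc.fit_transform(
        col[not_null].values.reshape(-1, 1)).ravel()
    return col, enc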
def tune():
    """Search k = 2..10 for the KNN imputer; assumes df (masked data), df2
    (ground truth), feature_engg and nrms are available in the enclosing scope."""
    print("Tuning for k = 2 to 10")
    min_rms = float('inf')
    min_k = -1
    feature_cols = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9',
                    'X10', 'X11', 'X12', 'X13',
                    'timediff_1', 'timediff_5', 'timediff_median']
    for k in range(2, 11):
        model = KNN(k, verbose=False)  # could also try SoftImpute, BiScaler or SimpleFill
        # Read data
        knndf = pd.DataFrame([], columns=['ptnum', 'time'] + feature_cols)
        for file in os.listdir("train_data/train_with_missing"):
            if file.endswith(".csv"):
                filepath = os.path.join("train_data/train_with_missing", file)
                df_pt = pd.read_csv(filepath)
                ptnum = int(file.split(".")[0])
                # Move the time column to the end
                df_oth = df_pt.iloc[:, 1:]
                df_time = df_pt.iloc[:, 0]
                df_pt = pd.concat([df_oth, df_time], axis=1)
                df_pt["ptnum"] = ptnum
                mins, maxs = feature_engg(df_pt, returnmin=True)
                df_numeric = df_pt.select_dtypes(include=['float']).to_numpy()
                df_filled = pd.DataFrame(model.fit_transform(df_numeric),
                                         columns=feature_cols)
                df_filled["ptnum"] = df_pt["ptnum"]
                df_filled["time"] = df_pt["time"]
                # Undo the min-max scaling applied by feature_engg
                for i in range(13):
                    name = df_filled.iloc[:, i].name
                    max_val = maxs[name]
                    min_val = mins[name]
                    df_filled[name] = df_filled[name].map(
                        lambda x: x * (max_val - min_val) + min_val)
                knndf = pd.concat([knndf, df_filled], ignore_index=True)
        knndf = knndf.sort_values(['ptnum', 'time'],
                                  ascending=[True, True]).reset_index(drop=True)
        # Evaluate against the ground truth
        imputed = knndf.copy()
        rmsList = []
        for i in range(1, 14):
            imputed["X" + str(i) + "_groundtruth"] = df2["X" + str(i)]
            imputed["X" + str(i) + "_masked"] = df["masked_X" + str(i)]
            rms = nrms(imputed["X" + str(i) + "_masked"], imputed["X" + str(i)],
                       imputed["X" + str(i) + "_groundtruth"], imputed["ptnum"])
            print(k, i, rms)
            rmsList.append(rms)
        print(k, rmsList)
        avgrms = np.mean(rmsList)
        print(avgrms)
        if avgrms < min_rms:
            min_rms = avgrms
            min_k = k
    return min_k, min_rms
# Import KNN from fancyimpute
from fancyimpute import KNN

# Copy diabetes to diabetes_knn_imputed
diabetes_knn_imputed = diabetes.copy(deep=True)

# Initialize KNN
knn_imputer = KNN()

# Impute using fit_transform on diabetes_knn_imputed; assigning through
# .iloc[:, :] keeps the DataFrame's index and column labels
diabetes_knn_imputed.iloc[:, :] = knn_imputer.fit_transform(diabetes_knn_imputed)
results = calc.pandas(mols.values())
results = results.set_index(pd.Index(mols.keys(), name='CID'))
results.head()
results.shape


# +
def fix(x):
    """Coerce to float; anything non-numeric becomes None (i.e. missing)."""
    try:
        x = float(x)
    except (TypeError, ValueError):
        x = None
    return x

results = results.applymap(fix)
# -

# Drop features with too many missing values, then KNN-impute the rest
frac_bad = results.isnull().mean()
good = frac_bad[frac_bad < 0.3].index
results = results.loc[:, good]

from fancyimpute import KNN
knn = KNN(k=5)
results[:] = knn.fit_transform(results.values)

results.to_csv('data/snitz-mordred.csv')
results.shape
import numpy as np
from fancyimpute import (SimpleFill, KNN, NuclearNormMinimization,
                         SoftImpute, BiScaler)

# n and m (the matrix dimensions) are assumed to be defined earlier
inner_rank = 4
X = np.dot(np.random.randn(n, inner_rank), np.random.randn(inner_rank, m))
print("Mean squared element: %0.4f" % (X ** 2).mean())

# X is a data matrix which we're going to randomly drop entries from
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = X.copy()
# missing entries indicated with NaN
X_incomplete[missing_mask] = np.nan

meanFill = SimpleFill("mean")
X_filled_mean = meanFill.fit_transform(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.fit_transform(X_incomplete)

# Matrix completion using convex optimization to find a low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly, induce sparsity
# using singular value thresholding
softImpute = SoftImpute()

# BiScaler simultaneously normalizes the rows and columns of the observed
# data, which is sometimes useful for low-rank imputation methods
biscaler = BiScaler()

# Rescale both rows and columns to have zero mean and unit variance,
# then run SoftImpute on the normalized matrix
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)
X_filled_softimpute = softImpute.fit_transform(X_incomplete_normalized)
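# One way to compare the fills above: since the ground truth X is known here,
# compute the mean squared error restricted to the dropped entries.
mean_mse = ((X_filled_mean[missing_mask] - X[missing_mask]) ** 2).mean()
knn_mse = ((X_filled_knn[missing_mask] - X[missing_mask]) ** 2).mean()
nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean()
print("SimpleFill(mean) MSE: %f" % mean_mse)
print("KNN MSE: %f" % knn_mse)
print("NuclearNormMinimization MSE: %f" % nnm_mse)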
if n_filled / n_total < 0.5:
    continue
input_array.append(input_array_tmp[k, :])

input_array = np.asarray(input_array)
input_shape = input_array.shape
print('Feature array shape: %s' % str(feature_shape))
print('Input shape for imputer: %s' % str(input_shape))

# Impute with GAIN
imputed_array = gain(input_array.astype(float), gain_parameters)

# Alternative imputer for comparison (result currently unused)
from fancyimpute import KNN
imp_mean = KNN(5)  # or IterativeImputer()
imp_mean.fit_transform(input_array)

# Correlations between laboratory parameters over time
df = pd.DataFrame(imputed_array)
plt.matshow(df.corr())
plt.xticks(range(0, len(feature_rows)), feature_rows,
           rotation='vertical', fontsize='6')
plt.yticks(range(0, len(feature_rows)), feature_rows, fontsize='6')
plt.gca().xaxis.set_ticks_position('top')
plt.colorbar()
plt.tight_layout(pad=2)
plt.show()

imputed_array = imputed_array.transpose()
X_new = X_new.drop(['Purpose_nan', 'Housing_nan'], axis=1)
Y_new = pd.get_dummies(Y, drop_first=True)

# Treat the Job feature separately: it is numeric but needs one-hot encoding
F = df.loc[:, 'Job']
F = pd.get_dummies(F, prefix='Job')
F = F.drop(['Job_0'], axis=1)
X_new = pd.concat([F, X_new], axis=1)  # add the Job feature back to X_new

# Combine X_new, Y_new and the 'Saving accounts' and 'Checking account'
# features so the missing values in those two columns can be imputed;
# `imputer` (e.g. fancyimpute's KNN) is assumed to be defined earlier
df_new = pd.concat([X_new, Y_new, df_1, df_2], axis=1)
encode_data = pd.DataFrame(np.round(imputer.fit_transform(df_new)),
                           columns=df_new.columns)  # impute the missing data

# After imputing, build dummy variables from the imputed features
F1 = encode_data.loc[:, 'Saving accounts']
F1 = pd.get_dummies(F1, prefix='Saving accounts')
F1 = F1.drop(['Saving accounts_3.0'], axis=1)
F2 = encode_data.loc[:, 'Checking account']
F2 = pd.get_dummies(F2, prefix='Checking account')
F2 = F2.drop(['Checking account_2.0'], axis=1)
encode_data = encode_data.drop(['Saving accounts', 'Checking account'], axis=1)
# Reassemble the full dataset by combining all the features
encode_data = pd.concat([encode_data, F1, F2], axis=1)
import glob
import re
import string

import numpy as np
import pandas as pd
import unidecode
from fancyimpute import KNN
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import OrdinalEncoder


class Preprocess():
    def __init__(self, rootDir, word_dict, inv_words):
        self.rootDir_ = rootDir
        self.class_words_dict_ = word_dict
        self.inv_words_dict_ = inv_words
        self.imputer_ = KNN(k=1)
        self.enc_ = OrdinalEncoder()
        self.spanish_stemmer_ = SnowballStemmer('spanish')
        self.special_words_ = ['piez']
        self.stopwords_spanish_ = stopwords.words('spanish')
        self.df_ = pd.DataFrame(columns=[
            'Tipo', 'Tipo_2', 'Tipo_3', 'Tipo_4', 'Marca', 'Submarca',
            'Empaque', 'Contenido', 'UnidadMedida', 'LocalidadGeografica',
            'Fuente', 'precio', 'fecha'
        ])
        # Run the full pipeline on construction
        self.data_ = self.import_data()
        self.add_stop_words()
        self.preprocess('descripcion')
        self.categorize()
        self.append_df()
        self.join_marca_submarca_drop_null()
        self.imputation()
        self.inv_words_funct()
        self.drop_unused_columns()

    def import_data(self):
        '''Import all CSV files in a directory, without descending into subfolders.'''
        data = {}
        path = self.rootDir_ + '*.csv'
        for fname in glob.glob(path):
            key = fname.split('\\')[1].split('.csv')[0]
            data[key] = pd.read_csv(fname, index_col=0)
            try:
                data[key]['fecha'] = pd.to_datetime(data[key]['fecha'],
                                                    format='%d-%m-%Y')
            except KeyError:
                print("Check datetime values, as I didn't find them.")
        return data

    def add_stop_words(self):
        new_stop_words = ['s']
        self.stopwords_spanish_.extend(new_stop_words)
        return self

    def tokenize(self, data):
        '''
        Input: the complete string
        Output: the tokenized string as a list of strings
        '''
        return word_tokenize(data)

    def remove_stopwords_punctuation(self, data):
        clean_description = []
        for word in data:
            if (word not in self.stopwords_spanish_
                    and word not in string.punctuation):
                clean_description.append(word)
        return clean_description

    def remove_accents(self, data):
        return [unidecode.unidecode(word) for word in data]

    def lowercasing(self, data):
        return [word.lower() for word in data]

    def stemming(self, data):
        return [self.spanish_stemmer_.stem(word) for word in data]

    def remove_duplicates(self, data):
        seen = set()
        result = []
        for item in data:
            if item not in seen:
                seen.add(item)
                result.append(item)
        return result

    def split_number_letter(self, data):
        '''Split tokens like "500ml" into ["500", "ml"].'''
        result = []
        for word in data:
            match = re.match(r'([0-9]+)([a-z]+)', word, re.I)
            if match:
                for element in match.groups():
                    result.append(element)
            else:
                result.append(word)
        return result

    def remove_special_char(self, data):
        result = []
        for word in data:
            if word not in self.special_words_:
                result.append(word)
        return result

    def preprocess(self, column_name):
        '''Clean and tokenize the description column of every loaded table.'''
        for values in self.data_.values():
            for step in (self.tokenize, self.remove_accents, self.lowercasing,
                         self.split_number_letter,
                         self.remove_stopwords_punctuation, self.stemming,
                         self.remove_special_char, self.remove_duplicates):
                values[column_name] = values.apply(
                    lambda row: step(row[column_name]), axis=1)
        return self

    def append_df(self):
        for element in self.data_.keys():
            self.df_ = pd.concat([self.df_, self.data_.get(element)],
                                 ignore_index=True)
        return self

    def categorize(self):
        '''Tag each row with the dictionary words found in its description.'''
        columns_to_add = [
            'Tipo', 'Tipo_2', 'Tipo_3', 'Tipo_4', 'Marca', 'Submarca',
            'Empaque', 'Contenido', 'UnidadMedida'
        ]
        for base_key in self.data_.keys():
            self.data_.get(base_key).reset_index(drop=True, inplace=True)
            for i in columns_to_add:
                self.data_.get(base_key)[i] = np.nan
            self.data_.get(base_key)['Fuente'] = base_key
            for row in range(len(self.data_.get(base_key))):
                for element in self.data_.get(base_key)['descripcion'][row]:
                    for col in columns_to_add:
                        if element in self.class_words_dict_.get(col):
                            self.data_.get(base_key)[col].loc[row] = element
        return self

    def join_marca_submarca_drop_null(self):
        self.df_['Submarca'].fillna('', inplace=True)
        self.df_['Marca'] = self.df_['Marca'] + self.df_['Submarca']
        self.df_.drop(['Submarca'], axis=1, inplace=True)
        self.df_.dropna(subset=['Tipo'], inplace=True)
        return self

    def imputation(self):
        '''Rule-based filling of empty fields, then KNN imputation for Empaque.'''
        self.df_.fillna('', inplace=True)
        self.df_.reset_index(drop=True, inplace=True)
        # Default unit of measure for product types that usually omit one
        default_units = {'huev': 'pz', 'tortill': 'pz',
                         'papel': 'roll', 'lech': 'l'}
        for row in range(len(self.df_)):
            tipo = self.df_.Tipo.loc[row]
            if tipo in default_units and self.df_.UnidadMedida.loc[row] == '':
                self.df_['UnidadMedida'].loc[row] = default_units[tipo]
            if self.df_.Contenido.loc[row] == '':
                self.df_['Contenido'].loc[row] = '1'
            if self.df_.Marca.loc[row] == '':
                self.df_['Marca'].loc[row] = 'no_especificado'
            # Collapse Tipo_2/Tipo_3/Tipo_4 into one underscore-joined tag
            parts = [self.df_[col].loc[row]
                     for col in ('Tipo_2', 'Tipo_3', 'Tipo_4')
                     if self.df_[col].loc[row] != '']
            self.df_['Tipo_2'].loc[row] = ('_'.join(parts) if parts
                                           else 'no_especificado')
        self.knn_imputer_for_empaque()
        return self

    def knn_imputer_for_empaque(self):
        data = self.df_.copy(deep=True)
        data.loc[data['Empaque'] == '', 'Empaque'] = np.nan
        ordinal_enc_dict = {}
        columns_to_encode = ['Tipo', 'Tipo_2', 'Empaque']
        for col_name in columns_to_encode:
            # Create an ordinal encoder for the column
            ordinal_enc_dict[col_name] = OrdinalEncoder()
            # Encode only the non-null values of the column
            col = data[col_name]
            col_not_null = col[col.notnull()]
            reshaped_vals = col_not_null.values.reshape(-1, 1)
            encoded_vals = ordinal_enc_dict[col_name].fit_transform(reshaped_vals)
            # Write the encoded values back into the non-null rows
            data.loc[col.notnull(), col_name] = np.squeeze(encoded_vals)
        # Impute with KNN, rounding to the nearest encoded category
        data.iloc[:, [data.columns.get_loc(col_) for col_ in columns_to_encode]] = \
            np.round(self.imputer_.fit_transform(data[columns_to_encode]))
        for col_name in columns_to_encode:
            # Invert the ordinal encoding back to the original labels
            reshaped = data[col_name].values.reshape(-1, 1)
            data[col_name] = ordinal_enc_dict[col_name].inverse_transform(reshaped)
        self.df_ = data.copy(deep=True)
        return self

    def search_in_dict(self, data):
        for key, value in self.inv_words_dict_.items():
            for i in value:
                if i == data:
                    return key
        return data

    def inv_words_funct(self):
        column_name = [
            'Tipo', 'Tipo_2', 'Marca', 'Empaque', 'UnidadMedida', 'Contenido'
        ]
        for element in column_name:
            self.df_[element] = self.df_.apply(
                lambda row: self.search_in_dict(row[element]), axis=1)
        return self

    def drop_unused_columns(self):
        columns_to_drop = [
            'descripcion', 'producto', 'LocalidadGeografica', 'Tipo_3', 'Tipo_4'
        ]
        self.df_.drop(columns_to_drop, axis=1, inplace=True)
        return self
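# A hedged instantiation sketch for Preprocess; the directory layout and the
# two dictionaries are placeholders, since their real contents aren't shown.
# import_data splits paths on '\\', so a Windows-style rootDir is assumed.
word_dict = {'Tipo': ['lech', 'huev'], 'Tipo_2': [], 'Tipo_3': [], 'Tipo_4': [],
             'Marca': [], 'Submarca': [], 'Empaque': ['botell'],
             'Contenido': ['1'], 'UnidadMedida': ['l', 'pz']}
inv_words = {'leche': ['lech'], 'huevo': ['huev']}
prep = Preprocess(rootDir='data\\', word_dict=word_dict, inv_words=inv_words)
clean_df = prep.df_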
with open(args.config) as f:
    config = json.load(f)

data_path = config["data_path"]  # ground-truth data
corrupt_data_path = config["corrupt_data_path"]  # data containing missing values
n_neighbor = config["n_neighbor"]
trial_ind = config["trial_ind"]

# LOAD DATA
data = pd.read_csv(data_path).values
data_missing = pd.read_csv(corrupt_data_path).values
n_dim = data_missing.shape[1]  # dimensionality of the data space
non_missing_row_ind = np.where(np.isfinite(np.sum(data_missing, axis=1)))
na_ind = np.where(np.isnan(data_missing))
na_count = len(na_ind[0])

knnImpute = KNN(k=n_neighbor)
print("Start KNN")
data_impute_KNN = knnImpute.fit_transform(data_missing)
print("KNN finished")

# Mean absolute error over the imputed entries
ReconstructionErrorKNN = sum(((data_impute_KNN[na_ind] - data[na_ind])**2)**0.5) / na_count
print('Reconstruction error (KNN):')
print(ReconstructionErrorKNN)
np.savetxt("./imputed_data_trial_" + str(trial_ind) + "_KNN.csv",
           data_impute_KNN, delimiter=",")
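# The script expects a JSON config file like the following (paths and values
# are illustrative; only these four keys are read above):
#
# {
#     "data_path": "data/ground_truth.csv",
#     "corrupt_data_path": "data/with_missing.csv",
#     "n_neighbor": 5,
#     "trial_ind": 1
# }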
#%%
# dummify
alldata = pd.get_dummies(alldata, columns=['Age'], prefix='In_AgeGRP')

#%%
# Age - use Sex, Pclass, Parch, SibSp and Prefix to fill in missing ages
minage = alldata.loc[alldata['Age'].notnull(), 'Age'].min()
maxage = alldata.loc[alldata['Age'].notnull(), 'Age'].max()
meanage = alldata.loc[alldata['Age'].notnull(), 'Age'].mean()
cols = ['Sex', 'Pclass', 'Parch', 'SibSp', 'Prefix', 'Age']
targetdf = alldata.fillna(value=float('NaN')).copy().loc[:, cols]
predictage = KNN(k=5, min_value=minage, max_value=maxage)
imp_agesdf = pd.DataFrame(data=predictage.fit_transform(targetdf), columns=cols)
imp_agesdf['orig_ages'] = targetdf.loc[:, 'Age']
imp_agesdf.loc[imp_agesdf['orig_ages'].isnull()].head()
imp_minage = imp_agesdf.loc[alldata['Age'].notnull(), 'Age'].min()
imp_maxage = imp_agesdf.loc[alldata['Age'].notnull(), 'Age'].max()
imp_meanage = imp_agesdf.loc[alldata['Age'].notnull(), 'Age'].mean()
print("Min:{}->{}, Mean:{}->{}, Max:{}->{}".format(minage, imp_minage,
                                                   meanage, imp_meanage,
                                                   maxage, imp_maxage))

#%%
# SibSp and Parch -> family size
from fancyimpute import KNN

KNN_imputer = KNN()
num_features = [
    'cod_municipio', 'feature_16', 'feature_17', 'feature_13', 'feature_14',
    'feature_15', 'feature_18', 'feature_04', 'feature_06', 'feature_07',
    'feature_09', 'feature_10'
]
df_knn = cenarios.copy()
df_knn = KNN_imputer.fit_transform(df_knn[num_features])
df_knn
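# Note: fit_transform returns a plain numpy array, so the index and column
# labels are lost. A sketch of one way to keep them (not part of the original):
import pandas as pd

df_knn = pd.DataFrame(KNN_imputer.fit_transform(cenarios[num_features]),
                      index=cenarios.index, columns=num_features)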
# +
# Remove these from the Snitz data
df_snitz_dragon = df_dragon.loc[snitz_cids.difference(no_dragon)]
for nd in no_dragon:
    df_snitz_dragon.loc[nd, :] = 0

# +
# Remove bad features (too many NaNs) and impute the remaining NaNs
frac_bad = df_snitz_dragon.isnull().mean()
good = frac_bad[frac_bad < 0.3].index
df_snitz_dragon = df_snitz_dragon.loc[:, good]
knn = KNN(k=5)
df_snitz_dragon[:] = knn.fit_transform(df_snitz_dragon.values)

# +
#from olfactometer.odorants import from_cids
#pubchem_data = from_cids([int(x) for x in snitz_cids])
#pd.DataFrame.from_dict(pubchem_data).set_index('CID').to_csv('data/snitz-odorant-info.csv')

# +
#df_snitz_mordred = pd.read_csv('data/snitz-mordred.csv').set_index('CID')
#df_snitz_mordred[:] = mms.fit_transform(df_snitz_mordred.values)
#df_snitz_mordred.head()
# -

df_snitz_features = df_snitz_dragon

# Normalize every molecule to have unit norm (to be a unit vector in feature space)