from autoimpute.imputations import MultipleImputer


def pmm_impute(data, n_impute=3):
    signals = data.iloc[:, :-1]
    labels = data.iloc[:, -1]
    # PMM default number of neighbors is 5
    imputer = MultipleImputer(n=n_impute, strategy='pmm', return_list=True,
                              imp_kwgs={'tune': 1000, 'sample': 1000, 'neighbors': 5})
    data_sets = imputer.fit_transform(signals)
    signals = avg_data_sets(data_sets)
    return signals.join(labels)
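# `avg_data_sets` is referenced above but not defined in this snippet. A minimal
# sketch of what such a helper could look like, assuming the imputer was created with
# return_list=True so fit_transform yields (imputation number, DataFrame) pairs;
# this implementation is an assumption, not the original author's helper.
import pandas as pd


def avg_data_sets(data_sets):
    """Cell-wise mean across the multiply imputed DataFrames."""
    frames = [df for _, df in data_sets]
    # stack the imputed frames and average rows that share the same index label
    return pd.concat(frames).groupby(level=0).mean()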
import seaborn as sns

sns.distplot(data['release_year'])
sns.distplot(data['release_month'])

data.status.unique()
data.status.isna().sum()
sum(data['status'] == "In Production")
sum(data['status'] == "Post Production")
sum(data['status'] == "Rumored")

sns.catplot(x='status', kind='count', palette="ch:.25", data=data)
from seaborn import countplot
countplot(data=data, x='status')

# Step 2: multiple imputation with default strategies
from autoimpute.imputations import MultipleImputer
imp = MultipleImputer(n=3)
res = imp.fit_transform(data)
print(res)
res.shape
data.shape

# single imputation with a per-column strategy mapping
from autoimpute.imputations import SingleImputer
single = SingleImputer(strategy={
    'status': 'categorical',
    'release_year': 'median',
    'runtime': 'norm',
    'release_month': 'random',
})
data_imputed_once = single.fit_transform(data)
toy_df = pd.DataFrame({
    'gender': np.random.choice(['Male', 'Female'], n_samples),
    'employment': np.random.choice(['Unemployed', 'Employed', 'Part Time', 'Self-Employed'],
                                   n_samples, p=[0.05, 0.6, 0.15, 0.2]),
})
toy_df['gender'].value_counts()
toy_df['employment'].value_counts()

# knock out 100 random values per column
for c in toy_df.columns:
    toy_df.loc[np.random.choice(range(n_samples), replace=False, size=100), c] = np.nan

sim = SingleImputer(strategy='categorical')
sim.fit(toy_df)
toy_df_fill = sim.transform(toy_df)

# compare imputed values against the observed distribution
toy_df_fill.loc[toy_df['employment'].isnull(), 'employment'].value_counts()
toy_df.loc[toy_df['employment'].isnull() == False, 'employment'].value_counts() / sum(
    toy_df['employment'].isnull() == False)

from autoimpute.imputations import MultipleImputer
mi = MultipleImputer(strategy='stochastic', return_list=True, predictors={'y': ['x']})
mi_data_fill = mi.fit_transform(data_het_miss)
class MiBaseRegressor:
    """Building blocks to create an Autoimpute regressor.

    Every Autoimpute regressor inherits from the MiBaseRegressor. The class
    provides the functionality necessary for Autoimpute regressors to wrap
    sklearn or statsmodels libraries and apply them to multiply imputed
    datasets. It also creates the MultipleImputer used to impute data if the
    user does not specify a custom MultipleImputer during instantiation.

    Attributes:
        model_libs (tuple): libraries supported by Autoimpute regressors.
    """

    model_libs = ("sklearn", "statsmodels")

    def __init__(self, mi, model_lib, mi_kwgs, model_kwgs):
        """Create an instance of the MiBaseRegressor class.

        The MiBaseRegressor class is not a stand-alone class and should not be
        used other than as a parent class to an Autoimpute regressor. An
        Autoimpute regressor wraps either sklearn or statsmodels regressors to
        apply them on multiply imputed datasets. The MiBaseRegressor contains
        the logic Autoimpute regressors share. In addition, it creates an
        instance of the MultipleImputer to impute missing data.

        Args:
            mi (MultipleImputer): An instance of a MultipleImputer. If `mi`
                is passed explicitly, it is used as the MultipleImputer.
                `mi_kwgs` can be used instead, although `mi` is cleaner/preferred.
            model_lib (str): library the regressor will use to implement
                regression. Options are sklearn and statsmodels.
                Default is statsmodels.
            mi_kwgs (dict): keyword args to instantiate MultipleImputer. If a
                valid MultipleImputer is passed to `mi`, `mi_kwgs` is ignored.
                If `mi_kwgs` is None and `mi` is None, MiBaseRegressor creates
                a default instance of MultipleImputer.
            model_kwgs (dict): keyword args to instantiate regressor. Arg is
                passed along to either sklearn or statsmodels regressor. If
                `model_kwgs` is None, default instance of regressor created.

        Returns:
            self. Instance of MiBaseRegressor class.
        """
        # Order Important. `mi_kwgs` validation first b/c it's used in `mi`
        # also note - encoder is not an argument, b/c one-hot only right now
        self.mi_kwgs = mi_kwgs
        self.mi = mi
        self.model_kwgs = model_kwgs
        self.model_lib = model_lib

    @property
    def mi_kwgs(self):
        """Property getter to return the value of mi_kwgs."""
        return self._mi_kwgs

    @mi_kwgs.setter
    def mi_kwgs(self, kwgs):
        """Validate the mi_kwgs and set default properties.

        The MiBaseRegressor validates the mi_kwgs argument. mi_kwgs contain
        optional keyword arguments to create a MultipleImputer. The argument
        is optional, and its default is None.

        Args:
            kwgs (dict, None): None or dictionary of keywords.

        Raises:
            ValueError: mi_kwgs not correctly specified as argument.
        """
        if not isinstance(kwgs, (type(None), dict)):
            err = "mi_kwgs must be None or dict of args for MultipleImputer."
            raise ValueError(err)
        self._mi_kwgs = kwgs

    @property
    def mi(self):
        """Property getter to return the value of mi."""
        return self._mi

    @mi.setter
    def mi(self, m):
        """Validate mi and set default properties.

        The MiBaseRegressor validates the mi argument. mi must be a valid
        instance of a MultipleImputer. It can also be None. If None, the
        MiBaseRegressor will create a MultipleImputer on its own, either by
        default or with any key values passed to the mi_kwgs args dict.

        Args:
            m (MultipleImputer, None): Instance of a MultipleImputer.

        Raises:
            ValueError: mi is not an instance of a MultipleImputer.
        """
        # check if m is None or a MultipleImputer
        if not isinstance(m, (type(None), MultipleImputer)):
            err = f"{m} must be None or a valid instance of MultipleImputer."
            raise ValueError(err)

        # handle each case if None or MultipleImputer
        if m is not None:
            self._mi = m
        else:
            # handle whether or not mi_kwgs should be passed
            if self.mi_kwgs:
                self._mi = MultipleImputer(**self.mi_kwgs)
            else:
                self._mi = MultipleImputer()

    @property
    def model_kwgs(self):
        """Property getter to return the value of model_kwgs."""
        return self._model_kwgs

    @model_kwgs.setter
    def model_kwgs(self, kwgs):
        """Validate the model_kwgs and set default properties.

        The MiBaseRegressor validates the model_kwgs argument. model_kwgs
        contain optional keyword arguments passed to a regression model. The
        argument is optional, and its default is None.

        Args:
            kwgs (dict, None): None or dictionary of keywords.

        Raises:
            ValueError: model_kwgs not correctly specified as argument.
        """
        if not isinstance(kwgs, (type(None), dict)):
            err = "model_kwgs must be dict of args used to instantiate model."
            raise ValueError(err)
        self._model_kwgs = kwgs

    @property
    def model_lib(self):
        """Property getter to return the value of model_lib."""
        return self._model_lib

    @model_lib.setter
    def model_lib(self, lib):
        """Validate model_lib and set default properties.

        The MiBaseRegressor validates the model_lib argument. model_lib should
        be in the MiBaseRegressor.model_libs tuple, which contains the libs to
        use for regression of multiply imputed datasets. The library chosen is
        important. Only statsmodels (the default) provides proper parameter
        pooling using Rubin's rules. sklearn provides mean estimate pooling
        only. sklearn variance parameter pooling and diagnostics in dev, TBD.

        Args:
            lib (str): library to use.

        Raises:
            ValueError: lib not a valid library to use.
        """
        if lib not in self.model_libs:
            err = f"{lib} not valid `model_lib`. Must be {self.model_libs}."
            raise ValueError(err)
        self._model_lib = lib

    def _fit_strategy_validator(self, X, y):
        """Private method to validate data before fitting model."""
        # y must be a series or dataframe
        if not isinstance(y, (pd.Series, pd.DataFrame)):
            err = "y must be a Series or DataFrame"
            raise ValueError(err)

        # y must have a name if series.
        if isinstance(y, pd.Series):
            self._yn = y.name
            if self._yn is None:
                err = "series y must have a name"
                raise ValueError(err)

        # y must have one column if dataframe.
        if isinstance(y, pd.DataFrame):
            yc = y.shape[1]
            if yc != 1:
                err = "y should only have one column"
                raise ValueError(err)
            y = y.iloc[:, 0]
            self._yn = y.name

        # y and X must have the same number of rows
        if X.shape[0] != y.shape[0]:
            err = "y and X must have the same number of records"
            raise ValueError(err)

        # if no errors thus far, add y to X for imputation
        X[self._yn] = y

        # return the multiply imputed datasets
        return self.mi.fit_transform(X)

    def _fit_model(self, model_type, regressor, X, y):
        """Private method to fit a model using sklearn or statsmodels."""
        # encoding for predictor variable
        # we enforce that predictors were imputed in imputation phase.
        X = _one_hot_encode(X)
        self.new_X_columns = X.columns.tolist()

        # encoding for response variable
        if model_type == "logistic":
            ycat = y.astype("category").cat
            y = ycat.codes
            self._response_categories = ycat.categories

        # statsmodels fit case, which requires different logic than sklearn
        if self.model_lib == "statsmodels":
            X = add_constant(X)
            if self.model_kwgs:
                model = regressor(y, X, **self.model_kwgs)
            else:
                model = regressor(y, X)
            model = model.fit()

        # sklearn fit case, which requires different logic than statsmodels
        if self.model_lib == "sklearn":
            if self.model_kwgs:
                model = regressor(**self.model_kwgs)
            else:
                model = regressor()
            # sklearn doesn't need encoding for response
            model.fit(X, y)

        # return the model after fitting it to a given dataset
        return model

    def _apply_models_to_mi_data(self, model_dict, X, y):
        """Private method to apply analysis model to multiply imputed data."""
        # find regressor based on model lib, then get multiply imputed data
        model_type = model_dict["type"]
        regressor = model_dict[self.model_lib]
        mi_data = self._fit_strategy_validator(X, y)
        models = {}

        # then perform analysis models. Sequential only right now.
        for dataset in mi_data:
            ind, X = dataset
            y = X.pop(self._yn)
            model = self._fit_model(model_type, regressor, X, y)
            models[ind] = model

        # returns a dictionary: k=imp #; v=analysis model applied to imp #
        return models

    def _predict_strategy_validator(self, instance, X):
        """Private method to validate before prediction."""
        # first check that model is fitted, then check columns are the same
        check_is_fitted(instance, "statistics_")
        X_cols = X.columns.tolist()
        fit_cols = set(instance.fit_X_columns)
        diff_fit = set(fit_cols).difference(X_cols)
        if diff_fit:
            err = "Same columns that were fit must appear in predict."
            raise ValueError(err)

        # encoding for predictor variable
        # we enforce that predictors were imputed in imputation phase.
        if X.isnull().sum().any():
            me = "Data passed to make predictions can't contain missingness."
            raise ValueError(me)
        X = _one_hot_encode(X)
        return X

    def _var_ratios(self, imps, num, denom):
        """Private method for the variance ratios."""
        return (num + (num / imps)) / denom

    def _degrees_freedom(self, imps, lambda_, v_com):
        """Private method to calculate degrees of freedom for estimates."""
        # note we nudge lambda if zero b/c need lambda for other stats
        # see source code barnard.rubin.R from MICE for more
        lambda_ = np.maximum(1e-04, lambda_)
        v_old = (imps - 1) / lambda_**2
        v_obs = ((v_com + 1) / (v_com + 3)) * v_com * (1 - lambda_)
        v = (v_old * v_obs) / (v_old + v_obs)
        return v

    def _get_stats_from_models(self, models):
        """Private method to generate statistics based on the model lib chosen."""
        # initial setup - get items from models and get number of models
        items = models.items()
        m = self.mi.n

        # pooling phase: sklearn - coefficients only, no variance
        if self.model_lib == "sklearn":
            # find basic parameters, but can't return much more than coeff
            # sklearn does not implement inference out of the box
            # will have to write methods to do so from scratch, so TBD
            self.mi_alphas_ = [j.intercept_ for i, j in items]
            self.mi_params_ = [j.coef_ for i, j in items]
            alpha = sum(self.mi_alphas_) / m
            params = sum(self.mi_params_) / m
            coefs = pd.Series(np.insert(params, 0, alpha))
            coefs.index = ["const"] + self.new_X_columns
            statistics = OrderedDict(coefs=coefs)

        # pooling phase: statsmodels - coefficients and variance possible
        if self.model_lib == "statsmodels":
            # data and model parameters
            self.mi_params_ = [j.params for i, j in items]
            self.mi_std_errors_ = [j.bse for i, j in items]
            coefs = sum(self.mi_params_) / m
            k = coefs.index.size
            n = list(items)[0][1].nobs
            df_com = n - k

            # variance metrics (See VB Ch 2.3)
            vw = sum(map(lambda x: x**2, self.mi_std_errors_)) / m
            vb = sum(map(lambda p: (p - coefs)**2, self.mi_params_)) / max(1, m - 1)
            vt = vw + vb + (vb / m)
            stdt = np.sqrt(vt)

            # variance ratios (See VB Ch 2.3)
            # efficiency as specified in stats manual
            lambda_ = self._var_ratios(m, vb, vt)
            r_ = self._var_ratios(m, vb, vw)
            v_ = self._degrees_freedom(m, lambda_, df_com)
            fmi_ = ((v_ + 1) / (v_ + 3)) * lambda_ + 2 / (v_ + 3)
            eff_ = (1 + (np.maximum(1e-04, fmi_) / m))**-1

            # create statistics with pooled metrics from above
            statistics = OrderedDict(
                coefs=coefs, std=stdt, vw=vw, vb=vb, vt=vt,
                dfcom=df_com, dfadj=v_, lambda_=lambda_,
                riv=r_, fmi=fmi_, eff=eff_,
            )

        # finally, return dictionary with stats from fit used in transform
        return statistics
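# The pooling logic above is easiest to see from the calling side. A minimal usage
# sketch, assuming the MiLinearRegression wrapper exported by autoimpute.analysis
# (which inherits from MiBaseRegressor) and its summary() helper; the toy data and
# the 'stochastic' strategy choice are illustrative only, not part of the source above.
import numpy as np
import pandas as pd
from autoimpute.analysis import MiLinearRegression

# toy data: one predictor with missing values, complete response
rng = np.random.default_rng(0)
X = pd.DataFrame({"x": rng.normal(size=200)})
y = pd.Series(2.0 * X["x"] + rng.normal(size=200), name="y")
X.loc[rng.choice(200, size=40, replace=False), "x"] = np.nan

# statsmodels backend -> coefficients and variance pooled via Rubin's rules
lm = MiLinearRegression(mi_kwgs={"n": 3, "strategy": "stochastic"},
                        model_lib="statsmodels")
lm.fit(X, y)
print(lm.summary())  # pooled coefficients plus the variance diagnostics computed above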
def data_new():
    import pandas as pd
    import glob
    import numpy as np
    from autoimpute.imputations import MultipleImputer

    data_path = r'C:\Users\de_hauk\PowerFolders\apps_tzakiris_rep\data\data_new_12_08_2020\data_raw\csv'
    all_files = glob.glob(data_path + "/*.csv")

    sample_fullinfo = pd.read_csv(r'C:\Users\de_hauk\PowerFolders\apps_tzakiris_rep\data\data_new_12_08_2020\data_list\stimuli_list.csv')
    SAMPLE_fullinfo = sample_fullinfo.drop(columns=['Unnamed: 0']).copy()
    #DATA_raw_DF = pd.DataFrame(SAMPLE_fullinfo).copy()
    DATA_raw_DF = pd.DataFrame()

    for data in all_files:
        unique_ID = data[-12] + '_' + data[-5]
        curr_raw_data = pd.read_csv(data, header=None, sep='\t').drop(axis='index', labels=[0, 1])[2]

        perspective = []
        answer_correct = []  # 1 = yes // 0 = no
        answer_raw = []      # YES -> 1 // NO -> 0 // Do you remember the face?
        for data_point in curr_raw_data:
            if len(data_point) < 5:
                perspective.append(data_point)
            if ('1' in data_point[0]) and (len(data_point) > 5):
                answer_correct.append(1)
            elif ('0' in data_point[0]) and (len(data_point) > 5):
                answer_correct.append(0)
            if '[YES]' in data_point:
                answer_raw.append(1)
            elif '[NO]' in data_point:
                answer_raw.append(0)
            elif 'missed' in data_point:
                answer_raw.append(np.nan)

        DATA_raw_DF[unique_ID + 'perspective'] = perspective
        DATA_raw_DF[unique_ID + 'perf'] = answer_correct
        DATA_raw_DF[unique_ID + 'answer'] = answer_raw

    #raw_dat_unproc = [pd.read_csv(i, header=None, sep='\t').drop(axis='index', labels=[0, 1])[2] for i in all_files]

    ### determine place and amount of missing values
    missing_dat_raw = pd.DataFrame(DATA_raw_DF.isnull().sum(), columns=['dat'])
    ### filter out columns with no missing data
    #missing_dat_overview = missing_dat_raw.loc[missing_dat_raw['dat'] > 0].T

    # drop irrelevant columns containing weird stim codings
    miss_perspect = [i for i in DATA_raw_DF if ('perspective' in i) or ('perf' in i)]
    miss_dat = DATA_raw_DF.drop(labels=miss_perspect, axis=1).copy()

    # Impute missing data with binary logistic regression, returning only one DF with imputed values
    ### Uses https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    imp = MultipleImputer(n=1, strategy='binary logistic', return_list=True,
                          imp_kwgs={"binary logistic": {"max_iter": 10000}})
    impute_res = imp.fit_transform(miss_dat)

    # merge imputed DF with relevant info, i.e. generating list etc.
    DATA_imput = impute_res[0][1]
    for i in SAMPLE_fullinfo:
        DATA_imput[i] = sample_fullinfo[i]
    for i in DATA_raw_DF:
        if i not in [i for i in DATA_imput]:
            DATA_imput[i] = DATA_raw_DF[i]
    return DATA_imput
plt.xlim([-1, X.shape[1]])
plt.show()

import os
for ids in train['id']:
    print(ids)
    for file in os.listdir(text):
        if file.endswith(".gz"):
            print(file)
            data = pd.read_csv(text + file, compression='gzip')
            print(data.index[data['id'] == ids].tolist())
            break

from autoimpute.imputations import SingleImputer, MultipleImputer
si = SingleImputer()    # imputation methods, passing through the data once
mi = MultipleImputer()  # imputation methods, passing through the data multiple times

# train_cols = list(train)
X = train.drop(['id', 'target'], axis=1)
X = MICE().fit_transform(X)
X_test1 = test.drop(['id'], axis=1)
X_test1 = MICE().fit_transform(X_test1)

def scorer(true, pred):
    error = math.sqrt(mean_squared_error(pred, true))
    return math.exp(-1 * error)

score = make_scorer(scorer, greater_is_better=True)

X = train.drop(['id', 'target'], axis=1)
y = train['target']
def __init__(self, T, mask, algo, miss_info, kf, notobj, obj, target):
    try:
        self.miss_info = miss_info
        self.columns = notobj
        self.ord_num_col = self.miss_info["ord_col"] + self.miss_info["num_col"]
        metric = {"rmse": {}, "nrmse": {}}
        self.rawT = T
        self.target = target
        if target is not None:
            self.target_y = T[target]
        else:
            self.target_y = None
        self.cv = {}
        self.cv.update(deepcopy(metric))
        self.kf = kf
        self.MSE = {}
        self.MSE.update(deepcopy(metric))
        self.result = {}
        self.time_ck = {}

        X = deepcopy(T)
        mask = pd.DataFrame(mask, columns=T.columns.tolist())
        self.rawmask = mask
        X[(mask == 1).values] = np.nan
        if obj in [None, []]:
            obj = None
        else:
            pass
        ##########################################
        self.X = X[notobj]
        self.T = T[notobj]
        self.mask = mask[notobj]
        self.notobj = notobj
        ##########################################
        if obj is not None:
            ############ Numeric + Category #################
            cat_impute = SimpleImputer(strategy="most_frequent")
            X[obj] = cat_impute.fit_transform(X[obj])
            self.true_obj = T[obj]
            self.pd_obj = X[obj]
            ###################################################
            TT = deepcopy(T)
            cat_encoder = miss_info["ce_encoder"]
            for k in cat_encoder.category_mapping:
                col, map_ = k["col"], k["mapping"]
                TT[col] = TT[col].replace(
                    dict(zip(k["mapping"].index, k["mapping"].values)))
            self.full_miss_data = TT
            self.full_miss_data[(mask == 1).values] = np.nan

            mice_data = deepcopy(T)
            for obj_col in obj:
                mice_data[obj_col] = "Cols_" + mice_data[obj_col]
            self.full_mice_data = mice_data
            self.full_mice_data[(mask == 1).values] = np.nan
        else:
            ########## Numeric ###############################
            num_data = deepcopy(self.X)
            num_data[(self.mask == 1).values] = np.nan
            self.full_miss_data = deepcopy(num_data)
            self.full_mice_data = deepcopy(num_data)
            ###################################################

        self.algo = algo
        self.method = {
            "MissForest": lambda x: MissForest(verbose=0, n_jobs=-1).fit(x),
            "mean": lambda x: impy.mean(x),
            "median": lambda x: impy.median(x),
            "mode": lambda x: impy.mode(x),
            "knn": lambda x: impy.fast_knn(x),
            "MICE": lambda x: impy.mice(x),
            "EM": lambda x: impy.em(x),
            # return_list=True yields [(imp number, DataFrame)], so take the first
            # imputed dataset before pulling its values
            "MultipleImputer": lambda x: (
                MultipleImputer(n=1, return_list=True)
                .fit_transform(pd.DataFrame(x))[0][1]
                .values
            ),
        }
    except Exception as e:
        print(e)
        pass
# Fill in required inputs
x_train = train_data.iloc[:, list(range(3, 11))]
y_train = train_data.iloc[:, list(range(11, 12))].values
x_train_num = train_data.iloc[:, list(range(3, 9))]
x_train_txt = train_data.iloc[:, list(range(9, 11))]
x_train_txt_encode_split = 2  # Split at column number

x_test = test_data.iloc[:, list(range(3, 11))]
x_test_num = test_data.iloc[:, list(range(3, 9))]
x_test_txt = test_data.iloc[:, list(range(9, 11))]
x_test_txt_encode_split = 2  # Split at column number

# Impute missing values
# Numerical imputer
imputer_num = MultipleImputer(strategy='stochastic', return_list=True, n=5, seed=101)

x_train_num_avg = imputer_num.fit_transform(x_train_num)
x_train_num_concat = x_train_num_avg[0][1]
for i in range(len(x_train_num_avg) - 1):
    x_train_num_concat = pd.concat([x_train_num_concat, x_train_num_avg[i + 1][1]], axis=1)
x_train_num_avg = x_train_num_concat.groupby(by=x_train_num_concat.columns, axis=1).apply(
    lambda g: g.mean(axis=1))

x_test_num_avg = imputer_num.fit_transform(x_test_num)
x_test_num_concat = x_test_num_avg[0][1]
for i in range(len(x_test_num_avg) - 1):
    x_test_num_concat = pd.concat([x_test_num_concat, x_test_num_avg[i + 1][1]], axis=1)
x_test_num_avg = x_test_num_concat.groupby(by=x_test_num_concat.columns, axis=1).apply(
    lambda g: g.mean(axis=1))
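# Note on the design choice above: fit_transform is called a second time on
# x_test_num, which refits the imputation models on the test set. If the train-time
# models should instead be reused on test data, the fit/transform split can be used.
# A minimal sketch under the assumption that the imputer follows the usual sklearn
# fit/transform convention (as autoimpute imputers do):
imputer_num.fit(x_train_num)
x_train_num_sets = imputer_num.transform(x_train_num)  # list of (imp number, DataFrame)
x_test_num_sets = imputer_num.transform(x_test_num)    # same fitted models applied to test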
def data_old_sample():
    import pandas as pd
    import glob
    import numpy as np
    from autoimpute.imputations import MultipleImputer

    data_path = r'C:\Users\de_hauk\PowerFolders\apps_tzakiris_rep\A_T_Implementation\data_lab_jansen'

    ############################################### Data #################################################################################
    '''Data processing happens in two batches. The second batch contains additional
    information in the form of an extra row per trial, which needs to be deleted.'''

    ## First batch
    path_1_batch_raw = data_path + r'\first_batch'
    all_files_1B = glob.glob(path_1_batch_raw + "/*.csv")

    ## Second batch
    path_2_batch_raw = data_path + r'\second_batch'
    all_files_2B = glob.glob(path_2_batch_raw + "/*.csv")

    ## Merge batches so that subjects 1-5 are the first batch and 6-10 the second batch
    all_files = all_files_1B + all_files_2B

    ## Get raw data, i.e. the trial list generated by a previous optimization run
    sample_fullinfo = pd.read_csv(data_path + r'\stimuli_list.csv')
    sample_fullinfo = sample_fullinfo.drop(columns=['Unnamed: 0']).copy()

    ## set up a dataframe not containing raw data
    SAMPLE_onlydat = pd.DataFrame()

    raw_dat_unproc = [
        pd.read_csv(i, sep='\s+', header=None).drop(columns=[0, 1, 2, 5, 6, 7, 8, 9]).drop(
            axis='index', labels=[0]) for i in all_files
    ]

    ## For each subject's data file in the directories
    for ID in all_files:
        curr_ID = ID[-8:-6]  ## get unique ID/subject number from the raw data file name
        curr_file = ID       ## get current file path

        ## get the actual data into a dataframe & drop irrelevant columns
        df_sample_raw = pd.read_csv(curr_file, sep='\s+', header=None).drop(
            columns=[0, 1, 2, 5, 6, 7, 8, 9]).drop(axis='index', labels=[0])

        ## select only relevant rows
        ## ACHTUNG: length of df for batch 1 < batch 2
        ## length is used as a separator to filter irrelevant rows
        if len(df_sample_raw[3]) >= 800:  # i.e. data from batch 2
            df_sample = df_sample_raw.loc[
                (df_sample_raw[4] != '2=switched_1=notswitched') &
                (df_sample_raw[4] != 'actual_key_press_for_RT')]
        else:
            df_sample = df_sample_raw.loc[(df_sample_raw[4] != 'actual_key_press_for_RT')]

        ## Initialize protocol data
        answer_correct = []  # 1 == yes // 0 == no
        faceID_known = []    # 1 == yes // 0 == no
        perspective = []

        for i, j in zip(df_sample[3], df_sample[4]):
            ### Which stimulus was observed?
            if (len(i) == 4) and (i not in ['None', 'right', 'left']):
                perspective.append(i)

            ### What was the answer, regardless of correctness?
            if ('right' in str(i)) and (str(j) == 'left=yes_right=no_None=missed'):
                faceID_known.append(0)  # NOT familiar
            elif ('left' in str(i)) and (str(j) == 'left=yes_right=no_None=missed'):
                faceID_known.append(1)  # familiar
            elif (str(i) == 'None') and (str(j) == 'left=yes_right=no_None=missed'):
                faceID_known.append(np.nan)  # code early and missed responses as np.nan

            ### Was the answer correct w.r.t. the task? Missing an answer is also wrong.
            if (str(i) == '1') and ('right' in str(j)):
                answer_correct.append(int(i))
            elif (str(i) == '0') and ('wrong' in str(j)):
                answer_correct.append(int(i))
            elif (str(i) == '0') and (str(j) == 'missed'):
                answer_correct.append(int(i))

        sample_fullinfo[str(curr_ID) + 'answer'] = pd.Series(faceID_known)
        sample_fullinfo[str(curr_ID) + 'perf'] = pd.Series(answer_correct)
        sample_fullinfo[str(curr_ID) + 'perspective'] = pd.Series(perspective)

        SAMPLE_onlydat[str(curr_ID) + 'answer'] = pd.Series(faceID_known)
        SAMPLE_onlydat[str(curr_ID) + 'perf'] = pd.Series(answer_correct)
        SAMPLE_onlydat[str(curr_ID) + 'perspective'] = pd.Series(perspective)

    #################################### Impute missing data ###############################################################
    ### determine place and amount of missing values
    missing_dat_raw = pd.DataFrame(sample_fullinfo.isnull().sum(), columns=['dat'])
    ### filter out columns with no missing data
    missing_dat_overview = missing_dat_raw.loc[missing_dat_raw['dat'] > 0].T

    # drop irrelevant columns containing weird stim codings
    miss_perspect = [i for i in SAMPLE_onlydat if 'perspective' in i]
    miss_dat = SAMPLE_onlydat.drop(labels=miss_perspect, axis=1).copy()

    # Impute missing data with binary logistic regression, returning only one DF with imputed values
    ### Uses https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    imp = MultipleImputer(n=1, strategy='binary logistic', return_list=True,
                          imp_kwgs={"binary logistic": {"max_iter": 10000}})
    impute_res = imp.fit_transform(miss_dat)

    # merge imputed DF with relevant info, i.e. generating list etc.
    DATA_imput = impute_res[0][1]
    for i in sample_fullinfo:
        if i not in [i for i in sample_fullinfo if ('answer' in i) or ('perf' in i)]:
            DATA_imput[i] = sample_fullinfo[i]

    return {
        'Data_raw_unproc': raw_dat_unproc,
        'Data_raw': SAMPLE_onlydat,
        'DATA_imput': DATA_imput,
    }