Example #1
    def mi(self, m):
        """Validate mi and set default properties.

        The MiBaseRegressor validates the mi argument. mi must be a valid
        instance of a MultipleImputer. It can also be None. If None, the
        MiBaseRegressor will create a MultipleImputer on its own, either by
        default or with any key values passed to the mi_kwgs args dict.

        Args:
            m (MultipleImputer, None): Instance of a MultipleImputer.

        Raises:
            ValueError: mi is not an instance of a MultipleImputer.
        """

        # check if m is None or a MultipleImputer
        if not isinstance(m, (type(None), MultipleImputer)):
            err = f"{m} must be None or a valid instance of MultipleImputer."
            raise ValueError(err)

        # handle each case if None or MultipleImputer
        if m is not None:
            self._mi = m
        else:
            # handle whether or not mi_kwgs should be passed
            if self.mi_kwgs:
                self._mi = MultipleImputer(**self.mi_kwgs)
            else:
                self._mi = MultipleImputer()
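In practice this fallback is what lets the public Autoimpute regressors accept either a ready-made imputer or just its keyword arguments. A minimal sketch, assuming MiLinearRegression (an Autoimpute regressor that inherits from MiBaseRegressor); the keyword values are illustrative only:

from autoimpute.analysis import MiLinearRegression
from autoimpute.imputations import MultipleImputer

lm1 = MiLinearRegression(mi=MultipleImputer(n=5))   # explicit imputer: used as-is
lm2 = MiLinearRegression(mi_kwgs={"n": 5})          # built internally from mi_kwgs
lm3 = MiLinearRegression()                          # default MultipleImputer() created internally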
Example #2
from autoimpute.imputations import MultipleImputer


def pmm_impute(data, n_impute=3):
    # split the frame into feature columns and the label column (last column)
    signals = data.iloc[:, :-1]
    labels = data.iloc[:, -1]

    # PMM default number of neighbors is 5:
    imputer = MultipleImputer(n=n_impute,
                              strategy='pmm',
                              return_list=True,
                              imp_kwgs={
                                  'tune': 1000,
                                  'sample': 1000,
                                  'neighbors': 5
                              })

    data_sets = imputer.fit_transform(signals)

    signals = avg_data_sets(data_sets)

    return (signals.join(labels))
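avg_data_sets is defined elsewhere in the source project and is not shown here. A minimal sketch of what such a helper might look like, assuming the return_list=True output is a list of (imputation number, DataFrame) tuples and that a simple element-wise average across imputations is intended:

def avg_data_sets(data_sets):
    # hypothetical helper: average the imputed DataFrames across imputations
    frames = [df for _, df in data_sets]
    return sum(frames) / len(frames)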
Example #3
import seaborn as sns
from autoimpute.imputations import MultipleImputer

sns.distplot(data['release_year'])
sns.distplot(data['release_month'])

data.status.unique()
data.status.isna().sum()
sum(data['status'] == "In Production")
sum(data['status'] == "Post Production")
sum(data['status'] == "Rumored")
sns.catplot(x='status', kind='count', palette="ch:.25", data=data)

from seaborn import countplot
countplot(data=data, x='status')

#Step 2:

imp = MultipleImputer(n=3)

res = imp.fit_transform(data)
print(res)
res.shape
data.shape

from autoimpute.imputations import SingleImputer
single = SingleImputer(
    strategy={
        'status': "categorical",
        'release_year': "median",
        'runtime': 'norm',
        'release_month': 'random'
    })
data_imputed_once = single.fit_transform(data)
Example #4
import numpy as np
import pandas as pd
from autoimpute.imputations import SingleImputer

# The start of this snippet was cut off; reconstructed here as a toy DataFrame
# with two categorical columns. n_samples is assumed (original value not shown).
n_samples = 1000
toy_df = pd.DataFrame({
    'gender':
    np.random.choice(['Male', 'Female'], n_samples),
    'employment':
    np.random.choice(['Unemployed', 'Employed', 'Part Time', 'Self-Employed'],
                     n_samples,
                     p=[0.05, 0.6, 0.15, 0.2])
})

toy_df['gender'].value_counts()
toy_df['employment'].value_counts()

for c in toy_df.columns:
    toy_df.loc[np.random.choice(range(n_samples), replace=False, size=100),
               c] = np.nan

sim = SingleImputer(strategy='categorical')
sim.fit(toy_df)
toy_df_fill = sim.transform(toy_df)

toy_df_fill.loc[toy_df['employment'].isnull(), 'employment'].value_counts()

toy_df.loc[toy_df['employment'].isnull() == False,
           'employment'].value_counts() / sum(
               toy_df['employment'].isnull() == False)

from autoimpute.imputations import MultipleImputer

mi = MultipleImputer(strategy='stochastic',
                     return_list=True,
                     predictors={'y': ['x']})
mi_data_fill = mi.fit_transform(data_het_miss)
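With return_list=True, fit_transform hands back the imputed copies as a list of (imputation number, DataFrame) tuples rather than a one-shot generator, so the result can be inspected repeatedly. A minimal sketch of unpacking it, reusing the names from the snippet above:

for imp_num, df_imp in mi_data_fill:
    print(imp_num, df_imp.isnull().sum().sum())  # each imputed copy should contain no missing values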
Example #5
class MiBaseRegressor:
    """Building blocks to create an Autoimpute regressor.

    Every Autoimpute regressor inherits from the MiBaseRegressor. The class
    provides the functionality necessary for Autoimpute regressors to wrap
    sklearn or statsmodels libraries and apply them to multiply imputed
    datasets. It also creates the MultipleImputer used to impute data if the
    user does not specify a custom MultipleImputer during instantiation.

    Attributes:
        model_libs (tuple): libraries supported by Autoimpute regressors.
    """

    model_libs = ("sklearn", "statsmodels")

    def __init__(self, mi, model_lib, mi_kwgs, model_kwgs):
        """Create an instance of the MiBaseRegressor class.

        The MiBaseRegressor class is not a stand-alone class and should not be
        used other than as a parent class to an Autoimpute regressor. An
        Autoimpute regressor wraps either sklearn or statsmodels regressors to
        apply them on multiply imputed datasets. The MiBaseRegressor contains
        the logic Autoimpute regressors share. In addition, it creates an
        instance of the MultipleImputer to impute missing data.

        Args:
            mi (MultipleImputer): An instance of a MultipleImputer. If `mi`
                is passed explicitly, it is used as the MultipleImputer.
                Can use `mi_kwgs` instead, although `mi` is cleaner/preferred.
            model_lib (str): library the regressor will use to implement
                regression. Options are sklearn and statsmodels.
                Default is statsmodels.
            mi_kwgs (dict): keyword args to instantiate MultipleImputer.
                If a valid MultipleImputer is passed to `mi`, `mi_kwgs` is ignored.
                If `mi_kwgs` is None and `mi` is None, MiBaseRegressor creates
                default instance of MultipleImputer.
            model_kwgs (dict): keyword args to instantiate regressor. Arg is
                passed along to either sklearn or statsmodels regressor. If
                `model_kwgs` is None, default instance of regressor created.

        Returns:
            self. Instance of MiBaseRegressor class.
        """
        # Order is important: `mi_kwgs` is validated first b/c it's used in `mi`.
        # Also note: encoder is not an argument b/c only one-hot encoding is supported right now.
        self.mi_kwgs = mi_kwgs
        self.mi = mi
        self.model_kwgs = model_kwgs
        self.model_lib = model_lib

    @property
    def mi_kwgs(self):
        """Property getter to return the value of mi_kwgs."""
        return self._mi_kwgs

    @mi_kwgs.setter
    def mi_kwgs(self, kwgs):
        """Validate the mi_kwgs and set default properties.

        The MiBaseRegressor validates the mi_kwgs argument. mi_kwgs contains
        optional keyword arguments used to create a MultipleImputer. The argument
        is optional, and its default is None.

        Args:
            kwgs (dict, None): None or dictionary of keywords.

        Raises:
            ValueError: mi_kwgs not correctly specified as argument.
        """
        if not isinstance(kwgs, (type(None), dict)):
            err = "mi_kwgs must be None or dict of args for MultipleImputer."
            raise ValueError(err)
        self._mi_kwgs = kwgs

    @property
    def mi(self):
        """Property getter to return the value of mi."""
        return self._mi

    @mi.setter
    def mi(self, m):
        """Validate mi and set default properties.

        The MiBaseRegressor validates the mi argument. mi must be a valid
        instance of a MultipleImputer. It can also be None. If None, the
        MiBaseRegressor will create a MultipleImputer on its own, either by
        default or with any key values passed to the mi_kwgs args dict.

        Args:
            m (MultipleImputer, None): Instance of a MultipleImputer.

        Raises:
            ValueError: mi is not an instance of a MultipleImputer.
        """

        # check if m is None or a MultipleImputer
        if not isinstance(m, (type(None), MultipleImputer)):
            err = f"{m} must be None or a valid instance of MultipleImputer."
            raise ValueError(err)

        # handle each case if None or MultipleImputer
        if m is not None:
            self._mi = m
        else:
            # handle whether or not mi_kwgs should be passed
            if self.mi_kwgs:
                self._mi = MultipleImputer(**self.mi_kwgs)
            else:
                self._mi = MultipleImputer()

    @property
    def model_kwgs(self):
        """Property getter to return the value of model_kwargs."""
        return self._model_kwgs

    @model_kwgs.setter
    def model_kwgs(self, kwgs):
        """Validate the model_kwgs and set default properties.

        The MiBaseRegressor validates the model_kwgs argument. model_kwgs
        contains optional keyword arguments passed to a regression model. The
        argument is optional, and its default is None.

        Args:
            kwgs (dict, None): None or dictionary of keywords.

        Raises:
            ValueError: model_kwgs not correctly specified as argument.
        """
        if not isinstance(kwgs, (type(None), dict)):
            err = "model_kwgs must be dict of args used to instantiate model."
            raise ValueError(err)
        self._model_kwgs = kwgs

    @property
    def model_lib(self):
        """Property getter to return the value of model_lib."""
        return self._model_lib

    @model_lib.setter
    def model_lib(self, lib):
        """Validate model_lib and set default properties.

        The MiBaseRegressor validates the model_lib argument. model_lib should
        be in the MiBaseRegressor.model_libs tuple, which contains the libs to
        use for regression of multiply imputed datasets. The library chosen is
        important. Only statsmodels (the default) provides proper parameter
        pooling using Rubin's rules. sklearn provides mean estimate pooling
        only. sklearn variance parameter pooling and diagnostics in dev, TBD.

        Args:
            lib (str): library to use.

        Raises:
            ValueError: lib not a valid library to use.
        """
        if lib not in self.model_libs:
            err = f"{lib} not valid `model_lib`. Must be {self.model_libs}."
            raise ValueError(err)
        self._model_lib = lib

    def _fit_strategy_validator(self, X, y):
        """Private method to validate data before fitting model."""

        # y must be a series or dataframe
        if not isinstance(y, (pd.Series, pd.DataFrame)):
            err = "y must be a Series or DataFrame"
            raise ValueError(err)

        # y must have a name if series.
        if isinstance(y, pd.Series):
            self._yn = y.name
            if self._yn is None:
                err = "series y must have a name"
                raise ValueError(err)

        # y must have one column if dataframe.
        if isinstance(y, pd.DataFrame):
            yc = y.shape[1]
            if yc != 1:
                err = "y should only have one column"
                raise ValueError(err)
            y = y.iloc[:, 0]
            self._yn = y.name

        # y and X must have the same number of rows
        if X.shape[0] != y.shape[0]:
            err = "y and X must have the same number of records"
            raise ValueError(err)

        # if no errors thus far, add y to X for imputation
        X[self._yn] = y

        # return the multiply imputed datasets
        return self.mi.fit_transform(X)

    def _fit_model(self, model_type, regressor, X, y):
        """Private method to fit a model using sklearn or statsmodels."""

        # encoding for predictor variable
        # we enforce that predictors were imputed in imputation phase.
        X = _one_hot_encode(X)
        self.new_X_columns = X.columns.tolist()

        # encoding for response variable
        if model_type == "logistic":
            ycat = y.astype("category").cat
            y = ycat.codes
            self._response_categories = ycat.categories

        # statsmodels fit case, which requires different logic than sklearn
        if self.model_lib == "statsmodels":
            X = add_constant(X)
            if self.model_kwgs:
                model = regressor(y, X, **self.model_kwgs)
            else:
                model = regressor(y, X)
            model = model.fit()

        # sklearn fit case, which requires different logic than statsmodels
        if self.model_lib == "sklearn":
            if self.model_kwgs:
                model = regressor(**self.model_kwgs)
            else:
                model = regressor()
            # sklearn doesn't need encoding for response
            model.fit(X, y)

        # return the model after fitting it to a given dataset
        return model

    def _apply_models_to_mi_data(self, model_dict, X, y):
        """Private method to apply analysis model to multiply imputed data."""

        # find regressor based on model lib, then get multiply imputed data
        model_type = model_dict["type"]
        regressor = model_dict[self.model_lib]
        mi_data = self._fit_strategy_validator(X, y)
        models = {}

        # then perform the analysis models. Sequential only right now.
        for dataset in mi_data:
            ind, X = dataset
            y = X.pop(self._yn)
            model = self._fit_model(model_type, regressor, X, y)
            models[ind] = model

        # returns a dictionary: k=imp #; v=analysis model applied to imp #
        return models

    def _predict_strategy_validator(self, instance, X):
        """Private method to validate before prediction."""

        # first check that model is fitted, then check columns are the same
        check_is_fitted(instance, "statistics_")
        X_cols = X.columns.tolist()
        fit_cols = set(instance.fit_X_columns)
        diff_fit = set(fit_cols).difference(X_cols)
        if diff_fit:
            err = "Same columns that were fit must appear in predict."
            raise ValueError(err)

        # encoding for predictor variable
        # we enforce that predictors were imputed in imputation phase.
        if X.isnull().sum().any():
            me = "Data passed to make predictions can't contain missingness."
            raise ValueError(me)
        X = _one_hot_encode(X)
        return X

    def _var_ratios(self, imps, num, denom):
        """Private method for the variance ratios."""
        return (num + (num / imps)) / denom

    def _degrees_freedom(self, imps, lambda_, v_com):
        """Private method to calculate degrees of freedom for estimates."""

        # note we nudge lambda if zero b/c need lambda for other stats
        # see source code barnard.rubin.R from MICE for more
        lambda_ = np.maximum(1e-04, lambda_)
        v_old = (imps - 1) / lambda_**2
        v_obs = ((v_com + 1) / (v_com + 3)) * v_com * (1 - lambda_)
        v = (v_old * v_obs) / (v_old + v_obs)
        return v

    def _get_stats_from_models(self, models):
        """Private method to generate statistics given on model lib chosen."""

        # initial setup - get items from models and get number of models
        items = models.items()
        m = self.mi.n

        # pooling phase: sklearn - coefficients only, no variance
        if self.model_lib == "sklearn":

            # find basic parameters, but can't return much more than coeff
            # sklearn does not implement inference out of the box
            # will have to write methods to do so from scratch, so TBD
            self.mi_alphas_ = [j.intercept_ for i, j in items]
            self.mi_params_ = [j.coef_ for i, j in items]
            alpha = sum(self.mi_alphas_) / m
            params = sum(self.mi_params_) / m
            coefs = pd.Series(np.insert(params, 0, alpha))
            coefs.index = ["const"] + self.new_X_columns
            statistics = OrderedDict(coefs=coefs)

        # pooling phase: statsmodels - coefficients and variance possible
        if self.model_lib == "statsmodels":

            # data and model parameters
            self.mi_params_ = [j.params for i, j in items]
            self.mi_std_errors_ = [j.bse for i, j in items]
            coefs = sum(self.mi_params_) / m
            k = coefs.index.size
            n = list(items)[0][1].nobs
            df_com = n - k

            # variance metrics (See VB Ch 2.3)
            vw = sum(map(lambda x: x**2, self.mi_std_errors_)) / m
            vb = sum(map(lambda p:
                         (p - coefs)**2, self.mi_params_)) / max(1, m - 1)
            vt = vw + vb + (vb / m)
            stdt = np.sqrt(vt)

            # variance ratios (See VB Ch 2.3)
            # efficiency as specified in stats manual
            lambda_ = self._var_ratios(m, vb, vt)
            r_ = self._var_ratios(m, vb, vw)
            v_ = self._degrees_freedom(m, lambda_, df_com)
            fmi_ = ((v_ + 1) / (v_ + 3)) * lambda_ + 2 / (v_ + 3)
            eff_ = (1 + (np.maximum(1e-04, fmi_) / m))**-1

            # create statistics with pooled metrics from above
            statistics = OrderedDict(coefs=coefs,
                                     std=stdt,
                                     vw=vw,
                                     vb=vb,
                                     vt=vt,
                                     dfcom=df_com,
                                     dfadj=v_,
                                     lambda_=lambda_,
                                     riv=r_,
                                     fmi=fmi_,
                                     eff=eff_)

        # finally, return dictionary with stats from fit used in transform
        return statistics
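The pooling logic above is what the public Autoimpute regressors inherit. A minimal usage sketch, assuming the statsmodels default so that Rubin's-rules pooling applies; the toy data below is illustrative only:

import numpy as np
import pandas as pd
from autoimpute.analysis import MiLinearRegression

rng = np.random.default_rng(0)
x = rng.normal(size=100)
y = 2 * x + rng.normal(scale=0.5, size=100)
X_train = pd.DataFrame({"x": x})
X_train.iloc[::10, 0] = np.nan           # sprinkle missingness into the predictor
y_train = pd.Series(y, name="y")         # y must be a named Series (see _fit_strategy_validator)

lm = MiLinearRegression()                # model_lib="statsmodels" by default
lm.fit(X_train, y_train)
lm.summary()                             # pooled coefs plus vw, vb, vt, lambda_, riv, fmi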
Example #6
def data_new():
    import pandas as pd
    import glob
    import numpy as np
    from autoimpute.imputations import MultipleImputer
    
    data_path = r'C:\Users\de_hauk\PowerFolders\apps_tzakiris_rep\data\data_new_12_08_2020\data_raw\csv'
    all_files = glob.glob(data_path + "/*.csv")
    
    sample_fullinfo = pd.read_csv(r'C:\Users\de_hauk\PowerFolders\apps_tzakiris_rep\data\data_new_12_08_2020\data_list\stimuli_list.csv')
    SAMPLE_fullinfo = sample_fullinfo.drop(columns = ['Unnamed: 0']).copy()  
    
    #DATA_raw_DF = pd.DataFrame(SAMPLE_fullinfo).copy()
    DATA_raw_DF = pd.DataFrame()
    for data in all_files:
        unique_ID = data[-12] + '_' + data[-5] 
        curr_raw_data = pd.read_csv(data,header=None, sep='\t').drop(axis='index', labels = [0,1])[2]
        perspective = []
        answer_correct = [] #1 yes // 0 no
        answer_raw = [] # YES -> 1 // NO -> 0 // Do you remember face?
        for data_point in curr_raw_data:
            if len(data_point) < 5:
                perspective.append(data_point)
            if ('1' in data_point[0]) and (len(data_point) > 5):
                answer_correct.append(1)
            elif ('0' in data_point[0]) and (len(data_point) > 5):
                answer_correct.append(0)
            if '[YES]' in data_point:
                answer_raw.append(1)
            elif '[NO]' in data_point:
                answer_raw.append(0)
            elif 'missed' in data_point:
                answer_raw.append(np.nan)
                
    
        DATA_raw_DF[unique_ID + 'perspective'] = perspective
        DATA_raw_DF[unique_ID + 'perf'] = answer_correct
        DATA_raw_DF[unique_ID + 'answer'] = answer_raw
    #raw_dat_unproc = [pd.read_csv(i,header=None, sep='\t').drop(axis='index', labels = [0,1])[2] for i in all_files]
    
    ### determine place and amount of missing values
    missing_dat_raw = pd.DataFrame(DATA_raw_DF.isnull().sum(), columns = ['dat'])
    
    ### filter out columns with no missing data
    #missing_dat_overview = missing_dat_raw.loc[missing_dat_raw['dat'] > 0].T
    
    # drop irrelevant columns containing weird stimulus codings
    miss_perspect = [i for i in DATA_raw_DF if ('perspective' in i) or ('perf' in i)]
    miss_dat = DATA_raw_DF.drop(labels =miss_perspect, axis = 1 ).copy()
    
    # Impute missing data with binary logistic regression, returning only one DataFrame with imputed values
    ### Uses https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    imp = MultipleImputer(n=1, strategy='binary logistic', return_list=True,
                          imp_kwgs={"binary logistic": {"max_iter": 10000}})
    impute_res = imp.fit_transform(miss_dat)
    
    # merge imputed DataFrame with relevant info, i.e. the generating list etc.
    DATA_imput = impute_res[0][1]
    for i in SAMPLE_fullinfo:
        DATA_imput[i] = sample_fullinfo[i]
    for i in DATA_raw_DF:
        if i not in DATA_imput:
            DATA_imput[i] = DATA_raw_DF[i]
    return DATA_imput
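Because n=1 and return_list=True, impute_res is a one-element list of (imputation number, DataFrame) tuples, which is why the code above indexes impute_res[0][1]. An equivalent, slightly more explicit unpacking using the same names:

(imp_num, DATA_imput), = impute_res   # single tuple: imputation index and the imputed DataFrame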

        
Example #7
    DATA_raw_DF[unique_ID + 'answer'] = answer_raw
raw_dat_unproc = [pd.read_csv(i,header=None, sep='\t').drop(axis='index', labels = [0,1])[2] for i in all_files]

### determine place and amount of missing values
missing_dat_raw = pd.DataFrame(DATA_raw_DF.isnull().sum(), columns = ['dat'])

### filter out columns with no missing data
missing_dat_overview = missing_dat_raw.loc[missing_dat_raw['dat'] > 0].T

# drop irrelevant columns containing weird stimulus codings
miss_perspect = [i for i in DATA_raw_DF if ('perspective' in i) or ('perf' in i)]
miss_dat = DATA_raw_DF.drop(labels =miss_perspect, axis = 1 ).copy()

# Impute missing data with binary logistic regression, returning only one DataFrame with imputed values
### Uses https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
imp = MultipleImputer(n=1, strategy='binary logistic', return_list=True,
                      imp_kwgs={"binary logistic": {"max_iter": 10000}})
impute_res = imp.fit_transform(miss_dat)

# merge imputed DataFrame with relevant info, i.e. the generating list etc.
DATA_imput = impute_res[0][1]
for i in SAMPLE_fullinfo:
    DATA_imput[i] = sample_fullinfo[i]
for i in DATA_raw_DF:
    if i not in DATA_imput:
        DATA_imput[i] = DATA_raw_DF[i]

###############################################################################

def data_new():
    import pandas as pd
    import glob
Example #8
plt.xlim([-1, X.shape[1]])
plt.show()

import os
for ids in train['id']:
    print(ids)
    for file in os.listdir(text):
        if file.endswith(".gz"):
            print(file)
            data = pd.read_csv(text + file, compression='gzip')
            print(data.index[data['id'] == ids].tolist())
        break

from autoimpute.imputations import SingleImputer, MultipleImputer
si = SingleImputer() # imputation methods, passing through the data once
mi = MultipleImputer() # imputation methods, passing through the data multiple times

# train_cols = list(train)
X = train.drop(['id', 'target'], axis=1)
X = MICE().fit_transform(X)  # MICE is not imported in this snippet; presumably from fancyimpute

X_test1 = test.drop(['id'], axis=1)
X_test1 = MICE().fit_transform(X_test1)

import math
from sklearn.metrics import make_scorer, mean_squared_error


def scorer(true, pred):
    # score = exp(-RMSE): bounded in (0, 1], higher is better
    error = math.sqrt(mean_squared_error(pred, true))
    return math.exp(-1 * error)

score = make_scorer(scorer, greater_is_better=True)

X = train.drop(['id', 'target'], axis=1)
y = train['target']
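The custom scorer maps RMSE into exp(-RMSE), so larger is better, which is why greater_is_better=True. A minimal sketch of plugging it into cross-validation; the pipeline and estimator below are placeholders, not part of the original notebook:

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

model = make_pipeline(SimpleImputer(strategy="mean"), LinearRegression())
cv_scores = cross_val_score(model, X, y, scoring=score, cv=5)   # uses X, y defined above
print(cv_scores.mean())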
Example #9
 def __init__(self, T, mask, algo, miss_info, kf, notobj, obj, target):
     try:
         self.miss_info = miss_info
         self.columns = notobj
         self.ord_num_col = self.miss_info["ord_col"] + self.miss_info[
             "num_col"]
         metric = {"rmse": {}, "nrmse": {}}
         self.rawT = T
         self.target = target
         if target is not None: self.target_y = T[target]
         else: self.target_y = None
         self.cv = {}
         self.cv.update(deepcopy(metric))
         self.kf = kf
         self.MSE = {}
         self.MSE.update(deepcopy(metric))
         self.result = {}
         self.time_ck = {}
         X = deepcopy(T)
         mask = pd.DataFrame(mask, columns=T.columns.tolist())
         self.rawmask = mask
         X[(mask == 1).values] = np.nan
         if obj in [None, []]: obj = None
         else: pass
         ##########################################
         self.X = X[notobj]
         self.T = T[notobj]
         self.mask = mask[notobj]
         self.notobj = notobj
         ##########################################
         if obj is not None:
             ############ Numeric + Category  #################
             cat_impute = SimpleImputer(strategy="most_frequent")
             X[obj] = cat_impute.fit_transform(X[obj])
             self.true_obj = T[obj]
             self.pd_obj = X[obj]
             ###################################################
             TT = deepcopy(T)
             cat_encoder = miss_info["ce_encoder"]
             for k in cat_encoder.category_mapping:
                 col, map_ = k["col"], k["mapping"]
                 TT[col] = TT[col].replace(
                     dict(zip(k["mapping"].index, k["mapping"].values)))
             self.full_miss_data = TT
             self.full_miss_data[(mask == 1).values] = np.nan
             mice_data = deepcopy(T)
             for obj_col in obj:
                 mice_data[obj_col] = "Cols_" + mice_data[obj_col]
             self.full_mice_data = mice_data
             self.full_mice_data[(mask == 1).values] = np.nan
         else:
             ########## Numeric  ###############################
             num_data = deepcopy(self.X)
             num_data[(self.mask == 1).values] = np.nan
             self.full_miss_data = deepcopy(num_data)
             self.full_mice_data = deepcopy(num_data)
             ###################################################
         self.algo = algo
         self.method = {
             "MissForest": lambda x: MissForest(verbose=0, n_jobs=-1).fit(x),
             "mean": lambda x: impy.mean(x),
             "median": lambda x: impy.median(x),
             "mode": lambda x: impy.mode(x),
             "knn": lambda x: impy.fast_knn(x),
             "MICE": lambda x: impy.mice(x),
             "EM": lambda x: impy.em(x),
             "MultipleImputer": lambda x: MultipleImputer(
                 n=1, return_list=True).fit_transform(pd.DataFrame(x)).values,
         }
     except Exception as e:
         print(e)
         pass
Example #10
# Fill in required Inputs
x_train = train_data.iloc[:, list(range(3, 11))]
y_train = train_data.iloc[:, list(range(11,12))].values
x_train_num = train_data.iloc[:, list(range(3, 9))]
x_train_txt = train_data.iloc[:, list(range(9, 11))]
x_train_txt_encode_split = 2  # Split at Column Number

x_test = test_data.iloc[:, list(range(3, 11))]
x_test_num = test_data.iloc[:, list(range(3, 9))]
x_test_txt = test_data.iloc[:, list(range(9, 11))]
x_test_txt_encode_split = 2  # Split at Column Number

# Impute Missing values

# Numerical Imputer
imputer_num = MultipleImputer(strategy='stochastic', return_list=True, n=5, seed=101)
x_train_num_avg = imputer_num.fit_transform(x_train_num)

x_train_num_concat = x_train_num_avg[0][1]

for i in range(len(x_train_num_avg)-1):
    x_train_num_concat = pd.concat([x_train_num_concat,x_train_num_avg[i+1][1]], axis=1)
x_train_num_avg = x_train_num_concat.groupby(by=x_train_num_concat.columns, axis=1).apply(lambda g: g.mean(axis=1))

x_test_num_avg = imputer_num.fit_transform(x_test_num)

x_test_num_concat = x_test_num_avg[0][1]

for i in range(len(x_test_num_avg)-1):
    x_test_num_concat = pd.concat([x_test_num_concat,x_test_num_avg[i+1][1]], axis=1)
x_test_num_avg = x_test_num_concat.groupby(by=x_test_num_concat.columns, axis=1).apply(lambda g: g.mean(axis=1))
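The concat-then-average step is repeated verbatim for the train and test sets above. A hedged sketch of a small helper that wraps that exact pattern (the helper name is hypothetical, not part of the original script):

import pandas as pd

def average_imputations(imputed_list):
    # concatenate the imputed copies column-wise, then average columns sharing a name,
    # mirroring the inline loop + groupby used above
    concat = imputed_list[0][1]
    for _, df in imputed_list[1:]:
        concat = pd.concat([concat, df], axis=1)
    return concat.groupby(by=concat.columns, axis=1).apply(lambda g: g.mean(axis=1))

# usage, reusing names from the snippet above:
# x_train_num_avg = average_imputations(imputer_num.fit_transform(x_train_num))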
Example #11
def data_old_sample():
    import pandas as pd
    import glob
    import numpy as np
    from autoimpute.imputations import MultipleImputer

    data_path = r'C:\Users\de_hauk\PowerFolders\apps_tzakiris_rep\A_T_Implementation\data_lab_jansen'

    ############################################### Data #################################################################################
    '''Data processing in two batches. The second batch contains additional information in the form of an extra row per trial, which
    needs to be deleted.'''

    ## First Batch
    path_1_batch_raw = data_path + r'\first_batch'
    all_files_1B = glob.glob(path_1_batch_raw + "/*.csv")

    ## Second Batch
    path_2_batch_raw = data_path + r'\second_batch'
    all_files_2B = glob.glob(path_2_batch_raw + "/*.csv")

    ## Merge Batches so that subj 1-5 are first batch and 6-10 are second batch
    all_files = all_files_1B + all_files_2B

    ## Get raw data i.e. trial list generated by previous optimization run
    sample_fullinfo = pd.read_csv(data_path + r'\stimuli_list.csv')
    sample_fullinfo = sample_fullinfo.drop(columns=['Unnamed: 0']).copy()

    ## setup dataframe not containing raw dat
    SAMPLE_onlydat = pd.DataFrame()

    raw_dat_unproc = [
        pd.read_csv(i, sep='\s+',
                    header=None).drop(columns=[0, 1, 2, 5, 6, 7, 8, 9]).drop(
                        axis='index', labels=[0]) for i in all_files
    ]

    ## For each (subject)-data in directories
    for ID in all_files:

        curr_ID = ID[
            -8:-6]  ## get unique ID Number/Subject Number from raw dat names
        curr_file = ID  ## get current file-path

        ## get actual data into dataframe & drop irrelevant clmns
        df_sample_raw = pd.read_csv(curr_file, sep='\s+', header=None).drop(
            columns=[0, 1, 2, 5, 6, 7, 8, 9]).drop(axis='index', labels=[0])

        ## select only relevant rows
        ## NOTE: length of df differs by batch (batch 1 < batch 2)
        ## the length is used as a separator to filter out irrelevant rows

        if len(df_sample_raw[3]) >= 800:  # aka. if data from batch 2
            df_sample = df_sample_raw.loc[
                (df_sample_raw[4] != '2=switched_1=notswitched')
                & (df_sample_raw[4] != 'actual_key_press_for_RT')]
        else:
            df_sample = df_sample_raw.loc[(df_sample_raw[4] !=
                                           'actual_key_press_for_RT')]

        ## Initialize protocol data
        answer_correct = []  #1==yes//0==no
        faceID_known = []  #1==yes//0==no
        perspective = []

        for i, j in zip(df_sample[3], df_sample[4]):

            ### Which stimulus was observed
            if (len(i) == 4) and (i not in ['None', 'right', 'left']):
                perspective.append(i)

            ### What was the answer, regardless of correctness?
            if ('right' in str(i)) and (str(j)
                                        == 'left=yes_right=no_None=missed'):
                faceID_known.append(0)  # NOT familiar
            elif ('left' in str(i)) and (str(j)
                                         == 'left=yes_right=no_None=missed'):
                faceID_known.append(1)  # familiar
            elif (str(i) == 'None') and (str(j)
                                         == 'left=yes_right=no_None=missed'):
                faceID_known.append(
                    np.nan
                )  # code responding too early and missed responses as np.nan

            ### Was the answer correct w.r.t. the task? Missing an answer also counts as wrong.
            if (str(i) == '1') and ('right' in str(j)):
                answer_correct.append(int(i))
            elif (str(i) == '0') and ('wrong' in str(j)):
                answer_correct.append(int(i))
            elif (str(i) == '0') and (str(j) == 'missed'):
                answer_correct.append(int(i))

        sample_fullinfo[str(curr_ID) + 'answer'] = pd.Series(faceID_known)
        sample_fullinfo[str(curr_ID) + 'perf'] = pd.Series(answer_correct)
        sample_fullinfo[str(curr_ID) + 'perspective'] = pd.Series(perspective)

        SAMPLE_onlydat[str(curr_ID) + 'answer'] = pd.Series(faceID_known)
        SAMPLE_onlydat[str(curr_ID) + 'perf'] = pd.Series(answer_correct)
        SAMPLE_onlydat[str(curr_ID) + 'perspective'] = pd.Series(perspective)

    #################################### Impute missing Data ###############################################################

    ### determine place and amount of missing values
    missing_dat_raw = pd.DataFrame(sample_fullinfo.isnull().sum(),
                                   columns=['dat'])

    ### filter out columns with no missing data
    missing_dat_overview = missing_dat_raw.loc[missing_dat_raw['dat'] > 0].T

    # drop irrelevant columns containing weird stimulus codings
    miss_perspect = [i for i in SAMPLE_onlydat if 'perspective' in i]
    miss_dat = SAMPLE_onlydat.drop(labels=miss_perspect, axis=1).copy()

    # Impute missing data with binary logistic regression, returning only one DataFrame with imputed values
    ### Uses https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
    imp = MultipleImputer(n=1,
                          strategy='binary logistic',
                          return_list=True,
                          imp_kwgs={"binary logistic": {
                              "max_iter": 10000
                          }})
    impute_res = imp.fit_transform(miss_dat)

    # merge imputed DataFrame with relevant info, i.e. the generating list etc.
    DATA_imput = impute_res[0][1]
    for i in sample_fullinfo:
        if i not in [
                i for i in sample_fullinfo if ('answer' in i) or ('perf' in i)
        ]:
            DATA_imput[i] = sample_fullinfo[i]

    return {
        'Data_raw_unproc': raw_dat_unproc,
        'Data_raw': SAMPLE_onlydat,
        'DATA_imput': DATA_imput,
    }