Beispiel #1
0
def _make_scaler(normalise):
    """Return a fresh scaler for ``normalise``, or ``None`` if no scaling applies."""
    if normalise == "standard":
        return StandardScaler()
    if normalise == "minmax":
        return MinMaxScaler()
    # Any other value (including None) means "leave the data unscaled".
    return None


def _flatten_dimension(series, normalise):
    """Return one dimension of a sample as a flat list of gap-filled values.

    The values are additionally scaled when ``normalise`` selects a scaler.
    """
    # reset_index returns a new Series, so the in-place interpolation below
    # works on that new object rather than on the cell stored in the caller's
    # frame.
    series = series.reset_index(drop=True)
    series.interpolate(method='linear',
                       inplace=True,
                       limit_direction='both')

    scaler = _make_scaler(normalise)
    if scaler is None:
        return series.values.tolist()

    # The scalers expect 2-D input: treat the dimension as a single column,
    # then flatten the result back to a plain list of values.
    scaled = scaler.fit_transform(series.values.reshape(-1, 1))
    return scaled.ravel().tolist()


def process_data(regressor_name, X, normalise=None):
    """Convert a frame of nested series into the layout a regressor expects.

    Every cell of ``X`` is accessed as a pandas Series (one dimension of one
    sample). Missing values are filled by linear interpolation in both
    directions, and each dimension of each sample is optionally scaled on its
    own.

    Args:
        regressor_name: Name tested for membership in ``classical_ml`` to
            choose the output layout.
        X: DataFrame with one row per sample and one column per dimension;
            each cell holds a pandas Series.
        normalise: ``"standard"`` to apply ``StandardScaler``, ``"minmax"``
            to apply ``MinMaxScaler``; any other value (the default is
            ``None``) leaves the values unscaled.

    Returns:
        If ``regressor_name`` is in ``classical_ml``: a DataFrame with one row
        per sample, in which the values of all dimensions are concatenated in
        column order.
        Otherwise: ``np.array`` of the per-sample arrays, each built by
        transposing that sample's list of dimensions (presumably giving
        ``(n_samples, length, n_dimensions)`` when all samples end up with the
        same length -- TODO confirm against ``uniform_scaling``).
    """
    if regressor_name in classical_ml:
        rows = []
        for i in tqdm(range(len(X))):
            # Flatten the sample: preprocess every dimension the same way and
            # chain the results into one long list of values.
            flat = []
            for j in range(len(X.columns)):
                flat.extend(_flatten_dimension(X.iloc[i, j], normalise))
            # A flat list becomes a single column; transpose it into one row.
            rows.append(pd.DataFrame(flat).transpose())
        return pd.concat(rows).reset_index(drop=True)

    samples = []
    for i in tqdm(range(len(X))):
        row = X.iloc[i, :]

        # 1. find the maximum length over the dimensions of this sample
        max_len = max(len(dim) for dim in row)

        # 2. adjust the length of each dimension
        dims = []
        for dim in row:
            # 2.1 fill missing values (returns a new Series; the cell in X is
            #     left untouched)
            if dim.isnull().any():
                dim = dim.interpolate(method='linear', limit_direction='both')

            # 2.2 a dimension shorter than the longest one is passed through
            #     uniform_scaling with the target length
            if len(dim) < max_len:
                dim = uniform_scaling(dim, max_len)
            dims.append(dim)

        # Transpose so that each dimension becomes one column of the sample.
        sample = np.array(np.transpose(dims))

        # The scaler is fitted per sample, so each column (dimension) is
        # scaled using only this sample's values.
        scaler = _make_scaler(normalise)
        if scaler is not None:
            sample = scaler.fit_transform(sample)

        samples.append(sample)
    return np.array(samples)