Code Example #1
def data():
    data = pd.read_csv('facies_vectors.csv')
    feature_names = [
        'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS'
    ]
    data = data.fillna(data['PE'].mean())
    train, test = train_test_split(data, test_size=0.3)
    X_train = train[feature_names].values
    y_train = train['Facies'].values
    X_test = test[feature_names].values
    y_test = test['Facies'].values
    well_train = train['Well Name'].values
    well_test = test['Well Name'].values
    depth_train = train['Depth'].values
    depth_test = test['Depth'].values
    robust = preprocessing.RobustScaler(quantile_range=(25.0,
                                                        75.0)).fit(X_train)
    X_train_robust = robust.transform(X_train)
    X_test_robust = robust.transform(X_test)
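    # Note: fitting a second RobustScaler on features that are already
    # robust-scaled (median ~0, IQR ~1 on the training data) is close to an
    # identity transform, so the step below is effectively redundant.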
    scaler = preprocessing.RobustScaler(
        quantile_range=(25.0, 75.0)).fit(X_train_robust)
    X_train = scaler.transform(X_train_robust)
    X_test = scaler.transform(X_test_robust)
    Y_train = to_categorical(y_train, 10)
    Y_test = to_categorical(y_test, 10)
    return X_train, Y_train, X_test, Y_test
Code Example #2
File: nnload.py Project: jgdwyer/ML-convection
def init_pp(ppi, raw_data):
    # Initialize list of scaler objects
    if ppi['name'] == 'MinMax':
        pp = [preprocessing.MinMaxScaler(feature_range=(-1.0, 1.0)),  # temp
              preprocessing.MinMaxScaler(feature_range=(-1.0, 1.0))]  # humid.
    elif ppi['name'] == 'MaxAbs':
        pp = [preprocessing.MaxAbsScaler(),  # for temperature
              preprocessing.MaxAbsScaler()]  # and humidity
    elif ppi['name'] == 'StandardScaler':
        pp = [preprocessing.StandardScaler(),  # for temperature
              preprocessing.StandardScaler()]  # and humidity
    elif ppi['name'] == 'RobustScaler':
        pp = [preprocessing.RobustScaler(),  # for temperature
              preprocessing.RobustScaler()]  # and humidity
    elif ppi['name'] == 'SimpleY':
        pp = [10./1., 10./2.5]  # fixed factors for temperature and humidity
    else:
        raise ValueError('Incorrect scaler name')
    # Initialize scalers with data
    if ppi['method'] == 'individually':
        pp[0].fit(unpack(raw_data, 'T'))
        pp[1].fit(unpack(raw_data, 'q'))
    elif ppi['method'] == 'alltogether':
        pp[0].fit(np.reshape(unpack(raw_data, 'T'), (-1, 1)))
        pp[1].fit(np.reshape(unpack(raw_data, 'q'), (-1, 1)))
    elif ppi['method'] == 'qTindividually':
        if ppi['name'] != 'SimpleY':
            pp = pp[0]
            pp.fit(raw_data)
    else:
        raise ValueError('Incorrect scaler method')
    return pp
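A minimal usage sketch for init_pp (the ppi keys and the unpack helper follow the conventions visible above; raw_data and its layout are assumptions, not taken from the original project):

# Hypothetical call: scale temperature ('T') and humidity ('q') separately.
ppi = {'name': 'RobustScaler', 'method': 'individually'}
pp = init_pp(ppi, raw_data)
T_scaled = pp[0].transform(unpack(raw_data, 'T'))
q_scaled = pp[1].transform(unpack(raw_data, 'q'))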
Code Example #3
File: calibration.py Project: presto8/birdhouse
def compute_offset(data_df, ref_df):

    # d_scaler = preprocessing.StandardScaler(with_std=False)
    d_scaler = preprocessing.RobustScaler()
    r_scaler = preprocessing.RobustScaler()

    d_scaler.fit(data_df)
    r_scaler.fit(ref_df)

    return r_scaler.center_ - d_scaler.center_
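For RobustScaler, the fitted center_ attribute is the per-column median, so compute_offset returns the element-wise difference of medians between ref_df and data_df. A minimal sketch with made-up sensor values:

import pandas as pd
from sklearn import preprocessing

data_df = pd.DataFrame({'pm25': [10.0, 12.0, 11.0, 50.0]})  # one outlier
ref_df = pd.DataFrame({'pm25': [8.0, 9.0, 8.5, 9.5]})
offset = compute_offset(data_df, ref_df)
print(offset)  # array([-2.75]) == ref median (8.75) - data median (11.5)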
Code Example #4
    def load_data_prediction(self):
        self.x_pre = np.loadtxt('x_pre.dat')
        self.x_range = np.loadtxt('x_range.dat')
        for i in range(len(self.x_pre)):
            if self.x_pre[i] < self.x_range[i][0] or self.x_pre[
                    i] > self.x_range[i][1]:
                print('The structure is out of range, go to QM calculation')
                # raise IOError

        print(self.x_pre.ndim)

        if self.x_pre.ndim == 0:
            self.n_pre = 1
        if self.x_pre.ndim == 1:
            if self.n_x_dim == 1:
                self.n_pre = self.x_pre.shape[0]
            else:
                self.n_pre = 1
        if self.x_pre.ndim > 1:
            self.n_pre = self.x_pre.shape[0]

        if self.n_pre == 1 and self.n_x_dim == 1:
            self.x_pre = self.x_pre.reshape(1, 1)
        if self.n_pre == 1 and self.n_x_dim != 1:
            self.x_pre = self.x_pre.reshape(1, -1)
        if self.n_pre != 1 and self.n_x_dim == 1:
            self.x_pre = self.x_pre.reshape(self.n_pre, 1)
        self.x_pre_old = self.x_pre

        if self.rescale != "NO":
            if self.rescale == "Normal":
                scale_xdata = preprocessing.StandardScaler()
                scale_xdata.fit(self.x_train)
                self.scale_x_factor = scale_xdata.scale_
                self.mean_x = scale_xdata.mean_
            elif self.rescale == "Robust":
                scale_xdata = preprocessing.RobustScaler()
                scale_xdata.fit(self.x_train)
                self.scale_x_factor = scale_xdata.scale_
                self.mean_x = scale_xdata.center_

            x_train_scale = scale_xdata.transform(self.x_train)
            self.x_train = x_train_scale
            x_pre_scale = scale_xdata.transform(self.x_pre)
            self.x_pre = x_pre_scale

            #    Scale y
            scale_ydata = preprocessing.RobustScaler()
            scale_ydata.fit(self.y_train)
            self.scale_y_factor = scale_ydata.scale_
            self.mean_y = scale_ydata.center_
Code Example #5
File: data_preprocessing.py Project: Tina-Gu/452-GP
def gendata(doPCA=False):
    data = pd.read_csv('LengthOfStay.csv')
    #Save the length of stay in a different variable
    labels = data['lengthofstay']
    # Drop columns we don't need, such as specific dates or the patient id
    data = data.drop(["eid", "vdate", "discharged", "lengthofstay"], axis=1)
    # Add dummy encoding for the object-dtype (categorical) variables.
    # For example, the gender column becomes 2 columns, where a male is 1 in the
    # first column and 0 in the second, and a female is the inverse
    data = pd.get_dummies(data, columns=['rcount'])
    data = pd.get_dummies(data, columns=['gender'])
    data = pd.get_dummies(data, columns=['facid'])

    if not doPCA:
        hematocrit = data[['hematocrit']].values
        data['hematocrit'] = preprocessing.StandardScaler().fit_transform(
            hematocrit)

        neutrophils = data[['neutrophils']].values
        data['neutrophils'] = preprocessing.RobustScaler().fit_transform(
            neutrophils)

        sodium = data[['sodium']].values
        data['sodium'] = preprocessing.StandardScaler().fit_transform(sodium)

        glucose = data[['glucose']].values
        data['glucose'] = preprocessing.StandardScaler().fit_transform(glucose)

        bloodureanitro = data[['bloodureanitro']].values
        data['bloodureanitro'] = preprocessing.RobustScaler().fit_transform(
            bloodureanitro)

        creatinine = data[['creatinine']].values
        data['creatinine'] = preprocessing.StandardScaler().fit_transform(
            creatinine)

        bmi = data[['bmi']].values
        data['bmi'] = preprocessing.StandardScaler().fit_transform(bmi)

        pulse = data[['pulse']].values
        data['pulse'] = preprocessing.StandardScaler().fit_transform(pulse)

        respiration = data[['respiration']].values
        data['respiration'] = preprocessing.StandardScaler().fit_transform(
            respiration)

    # Separate into train and test sets
    train_X = data.head(n=80000).to_numpy()
    test_X = data.tail(n=20000).to_numpy()
    train_Y = labels.head(n=80000).to_numpy()
    test_Y = labels.tail(n=20000).to_numpy()

    return train_X, test_X, train_Y, test_Y
Code Example #6
File: Together.py Project: Aimee-Fu/CISC-452
def dataProcess():
    data = pd.read_csv("//Users/tsukeka/Downloads/LengthOfStay.csv")
    data.drop(columns=["eid", "vdate", "discharged", "facid"], inplace=True)
    data = data.replace({'gender': {'M': 1, 'F': 0}, 'rcount': {'5+': 5}})
    data = data.astype({'rcount': 'int'})

    hematocrit = data[['hematocrit']].values
    data['hematocrit'] = preprocessing.StandardScaler().fit_transform(
        hematocrit)

    neutrophils = data[['neutrophils']].values
    data['neutrophils'] = preprocessing.RobustScaler().fit_transform(
        neutrophils)

    sodium = data[['sodium']].values
    data['sodium'] = preprocessing.StandardScaler().fit_transform(sodium)

    glucose = data[['glucose']].values
    data['glucose'] = preprocessing.StandardScaler().fit_transform(glucose)

    bloodureanitro = data[['bloodureanitro']].values
    data['bloodureanitro'] = preprocessing.RobustScaler().fit_transform(
        bloodureanitro)

    creatinine = data[['creatinine']].values
    data['creatinine'] = preprocessing.StandardScaler().fit_transform(
        creatinine)

    bmi = data[['bmi']].values
    data['bmi'] = preprocessing.StandardScaler().fit_transform(bmi)

    pulse = data[['pulse']].values
    data['pulse'] = preprocessing.StandardScaler().fit_transform(pulse)

    respiration = data[['respiration']].values
    data['respiration'] = preprocessing.StandardScaler().fit_transform(
        respiration)

    data = pd.concat(
        [data, pd.get_dummies(data['secondarydiagnosisnonicd9'])], axis=1)
    data = data.drop(columns=['secondarydiagnosisnonicd9'])
    labels = data['lengthofstay']
    data = data.drop(columns=['lengthofstay'])
    pca = PCA()
    data = pca.fit_transform(data)

    train_X = np.array(data[:80000])
    train_Y = labels.head(n=80000).to_numpy()
    test_X = np.array(data[80000:])
    test_Y = labels.tail(n=20000).to_numpy()
    return train_X, test_X, train_Y, test_Y
Code Example #7
    def __init__(self,
                 x,
                 nb_epoch=770,
                 batch_size=64,
                 learning_rate=0.001,
                 H1=58,
                 H2=32,
                 H3=19,
                 DRP=0.1):
        # You can add any input parameters you need
        # Remember to set them with a default value for LabTS tests
        """ 
        Initialise the model.
          
        Arguments:
            - x {pd.DataFrame} -- Raw input data of shape 
                (batch_size, input_size), used to compute the size 
                of the network.
            - nb_epoch {int} -- number of epochs to train the network.

        """

        #Attributes to store constants to be applied on test data
        self.yScaler = preprocessing.RobustScaler()
        self.xScaler = preprocessing.RobustScaler()
        self.lb = preprocessing.LabelBinarizer()

        self.x = x
        if x is not None:
            X, _ = self._preprocessor(x, training=True)

        #init parameters
        self.loss_values = []
        self.input_size = X.shape[1]
        self.output_size = 1

        self.nb_epoch = nb_epoch
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.H1 = H1
        self.H2 = H2
        self.H3 = H3
        self.DRP = DRP

        self.net = Net(self.input_size, self.H1, self.H2, self.H3,
                       self.output_size, self.DRP)

        return
Code Example #8
    def _process(self, data):
        #features = preprocessing.PolynomialFeatures().fit_transform(features)
        features = preprocessing.RobustScaler().fit_transform(data)
        #features = decomposition.TruncatedSVD().fit_transform(features)

        #cols = list(['f_' + i for i in range(features.shape[1])])
        return pd.DataFrame(features, columns=data.columns)
Code Example #9
def median_scaling(train_ds, val_ds, test_ds, y_col_idx):
    train_X, train_y = tf.data.experimental.get_single_element(train_ds)
    val_X, val_y = tf.data.experimental.get_single_element(val_ds)
    test_X, test_y = tf.data.experimental.get_single_element(test_ds)
    train_X, train_y = train_X.numpy(), train_y.numpy()
    val_X, val_y = val_X.numpy(), val_y.numpy()
    test_X, test_y = test_X.numpy(), test_y.numpy()

    from sklearn import preprocessing
    scaler_X = preprocessing.RobustScaler(with_centering=False,
                                          quantile_range=(0.02, 0.98)).fit(
                                              train_X.reshape(
                                                  (-1, train_X.shape[-1])))
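    # Note: quantile_range is given in percentiles (0-100), so (0.02, 0.98)
    # spans only the 0.02nd-0.98th percentile of each feature; a typical
    # robust range would be the default (25.0, 75.0) or something like (2.0, 98.0).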
    #scaler_y = preprocessing.RobustScaler(with_centering=False, quantile_range=(0.02, 0.98)).fit(train_y[:, -1, :])
    #train_y[:, 0, :] = scaler_y.transform(train_y[:, 0, :])
    #val_y[:, 0, :] = scaler_y.transform(val_y[:, 0, :])
    #test_y[:, 0, :] = scaler_y.transform(test_y[:, 0, :])
    for i in range(train_X.shape[1]):
        train_X[:, i, :] = scaler_X.transform(train_X[:, i, :])
        val_X[:, i, :] = scaler_X.transform(val_X[:, i, :])
        test_X[:, i, :] = scaler_X.transform(test_X[:, i, :])

    train_ds = tf.data.Dataset.from_tensors((train_X, train_y))
    val_ds = tf.data.Dataset.from_tensors((val_X, val_y))
    test_ds = tf.data.Dataset.from_tensors((test_X, test_y))

    return train_ds, val_ds, test_ds
Code Example #10
def enhancement(template, query, k):

    if k == 1:
        ss = preprocessing.StandardScaler()
        ss.fit(template)
        template = ss.transform(template)
        query = ss.transform(query)

    elif k == 2:
        rs = preprocessing.RobustScaler()
        rs.fit(template)
        template = rs.transform(template)
        query = rs.transform(query)

    elif k == 3:
        mm = preprocessing.MinMaxScaler()
        mm.fit(template)
        template = mm.transform(template)
        query = mm.transform(query)

    elif k == 4:
        pca = PCA(n_components=5)
        pca.fit(template)
        template = pca.transform(template)
        query = pca.transform(query)

    else:
        print("No enhancement applied. Returning original data.")

    return template, query
Code Example #11
    def _preprocess_features(self):
        """ Standardize and normalize features. """

        if self.standardize:

            if self.scaler is None:
                # define a new scaler if none is given
                self.scaler = preprocessing.RobustScaler()

                # store all features into numpy array and fit standardizer
                X = np.vstack([
                    self.features[item].reshape(1, -1) for item in self.items
                ])
                self.scaler.fit(X)

        for item in self.features:
            x = self.features[item].reshape(1, -1)

            if self.standardize:
                x = self.scaler.transform(x)

            if self.normalize is not None:
                x = preprocessing.normalize(x, norm=self.normalize)

            self.features[item] = x.flatten()
Code Example #12
def scale_columns(dataframe, columns, scaler_name="RobustScaler"):
    """
    Apply a data normalization method to the specified columns of a Pandas dataframe.

    :param dataframe: Pandas dataframe.
    :param columns: List of column names to scale.
    :param scaler_name: String containing the name of the scaler method (default="RobustScaler").
    :return: Pandas dataframe and the scaler object.
    """
    import pandas as pd
    from sklearn import preprocessing

    scaler = None
    if scaler_name == "StandardScaler":
        scaler = preprocessing.StandardScaler()
    if scaler_name == "RobustScaler":
        scaler = preprocessing.RobustScaler()
    if scaler_name == "MinMaxScaler":
        scaler = preprocessing.MinMaxScaler()
    if scaler_name == "Normalizer":
        scaler = preprocessing.Normalizer()
    assert scaler is not None

    data = dataframe.filter(columns, axis=1)
    print(scaler.fit(data))

    scaled_data = scaler.transform(data)
    scaled_df = pd.DataFrame(scaled_data, columns=columns)

    dataframe_scaled = dataframe.copy()
    dataframe_scaled = dataframe_scaled.reset_index(drop=True)
    for column in columns:
        dataframe_scaled[column] = scaled_df[column]

    return dataframe_scaled, scaler
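A minimal, hypothetical call (the dataframe and column names are invented for illustration):

import pandas as pd

df = pd.DataFrame({'age': [21, 35, 62], 'income': [30000, 52000, 250000]})
scaled_df, fitted_scaler = scale_columns(df, columns=['age', 'income'],
                                         scaler_name="RobustScaler")
# fitted_scaler can be reused later, e.g. fitted_scaler.transform(new_data)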
Code Example #13
File: ML_Func_New.py Project: kaifan88/ml_kennewick
def preprocess(preprocesstype, var):
    # preprocesstype: selects the preprocessing type for the model: "MMS" for
    # MinMaxScaler, "RS" for RobustScaler, "SS" for StandardScaler, "MAS" for MaxAbsScaler
    # var: the np.array to be scaled

    from sklearn import preprocessing

    if preprocesstype == "MMS":
        print("preprocessing is done with MinMaxScaler")
        X = preprocessing.MinMaxScaler()
        var = X.fit_transform(var)
        return var
    elif preprocesstype == "RS":
        print("preprocessing is done with RobustScaler")
        X = preprocessing.RobustScaler()
        var = X.fit_transform(var)
        return var
    elif preprocesstype == "SS":
        print("preprocessing is done with StandardScaler")
        X = preprocessing.StandardScaler()
        var = X.fit_transform(var)
        return var
    elif preprocesstype == "MAS":
        print("preprocessing is done with MaxAbsScaler")
        X = preprocessing.MaxAbsScaler()
        var = X.fit_transform(var)
        return var
    else:
        print("Preprocessing type not recognized")
Code Example #14
def get_scaler(scale_method='StandardScaler'):
    """
  Get different kinds of scalers from scikit-learn

  :param scale_method: scale method
  :returns: scaler instance
  :raises: none
  """
    scaler = None

    if scale_method == 'StandardScaler':
        scaler = preprocessing.StandardScaler()

    elif scale_method == 'MinMaxScaler':
        scaler = preprocessing.MinMaxScaler()

    elif scale_method == 'MaxAbsScaler':
        scaler = preprocessing.MaxAbsScaler()

    elif scale_method == 'RobustScaler':
        scaler = preprocessing.RobustScaler()

    elif scale_method == 'QuantileTransformer':
        scaler = preprocessing.QuantileTransformer()

    elif scale_method == 'Normalizer':
        scaler = preprocessing.Normalizer()

    elif scale_method == 'PowerTransformer':
        scaler = preprocessing.PowerTransformer()

    else:
        print(scale_method, ' not found')

    return scaler
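A short sketch of how the returned scaler would typically be used (the array is made up):

import numpy as np

X = np.array([[1.0, 200.0], [2.0, 220.0], [3.0, 5000.0]])
scaler = get_scaler('RobustScaler')
X_scaled = scaler.fit_transform(X)        # fit on the training data
# X_new_scaled = scaler.transform(X_new)  # reuse the same statistics later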
Code Example #15
def keras_mlp3(train2, y, test2, v, z):
    from keras import layers
    from keras import models
    from keras import optimizers
    cname = sys._getframe().f_code.co_name
    num_splits = 9
    scaler = preprocessing.RobustScaler()
    train3 = scaler.fit_transform(train2)
    test3 = scaler.transform(test2)
    input_dims = train3.shape[1]
    def build_model():
        input_ = layers.Input(shape=(input_dims,))
        model = layers.Dense(512, kernel_initializer='Orthogonal')(input_)
        model = layers.Activation('selu')(model)

        model = layers.Dense(256, kernel_initializer='Orthogonal')(model)
        model = layers.Activation('selu')(model)

        model = layers.Dense(32, kernel_initializer='Orthogonal')(model)
        model = layers.Activation('selu')(model)

        model = layers.Dense(1, activation='sigmoid')(model)

        model = models.Model(input_, model)
        model.compile(loss = 'binary_crossentropy', optimizer = optimizers.SGD(nesterov=True))
        #print(model.summary(line_length=120))
        return model
    keras_common(train3, y, test3, v, z, num_splits, cname, build_model)
Code Example #16
def data():
    data = pd.read_csv('facies_vectors.csv')
    data['Facies'] -= 1
    feature_names = [
        'GR', 'ILD_log10', 'DeltaPHI', 'PHIND', 'PE', 'NM_M', 'RELPOS',
        'GR_diff_up', 'ILD_log10_diff_up', 'DeltaPHI_diff_up', 'PHIND_diff_up',
        'PE_diff_up', 'NM_M_diff_up', 'RELPOS_diff_up'
    ]
    data = data.fillna(data['PE'].mean())
    new_data = diffind.data_with_diff(data)
    test = new_data[new_data['Well Name'] == 'NEWBY']
    train = new_data[new_data['Well Name'] != 'NEWBY']
    X_train_1 = train[feature_names].values
    y_train = train['Facies'].values
    X_test_1 = test[feature_names].values
    y_test = test['Facies'].values
    well_train = train['Well Name'].values
    well_test = test['Well Name'].values
    depth_train = train['Depth'].values
    depth_test = test['Depth'].values
    X_aug_train = augmentation.augment_features(X_train_1, well_train,
                                                depth_train)
    X_aug_test = augmentation.augment_features(X_test_1, well_test, depth_test)
    robust = preprocessing.RobustScaler(quantile_range=(25.0,
                                                        75.0)).fit(X_aug_train)
    X_train_robust = robust.transform(X_aug_train)
    X_test_robust = robust.transform(X_aug_test)
    scaler = StandardScaler().fit(X_train_robust)
    X_train_robust_norm = scaler.transform(X_train_robust)
    X_test_robust_norm = scaler.transform(X_test_robust)
    X_train = X_train_robust_norm
    X_test = X_test_robust_norm
    Y_train = to_categorical(y_train, 9)
    Y_test = to_categorical(y_test, 9)
    return (X_train, Y_train, X_test, Y_test)
Code Example #17
    def test_robustScaler(self):
        data = np.random.normal(10, 3, size=100)
        data = np.array([data]).T

        rob_scaler = preprocessing.RobustScaler()

        self.scaler2dict2scaler_test(rob_scaler, data)
Code Example #18
File: scalers.py Project: sbalan7/ML-and-Stats
def plot_scalers(X1, X2):

    scalers = [
        pp.StandardScaler(),
        pp.MinMaxScaler(),
        pp.Normalizer(),
        pp.RobustScaler()
    ]
    scaler_names = ['Standard', 'MinMax', 'Normalizer', 'Robust']
    no_of_scalers = len(scalers)

    i = 1
    fig = plt.figure(figsize=(6, 7))

    for scaler, scaler_name in zip(scalers, scaler_names):

        # fit each scaler on the original (unscaled) data, not on the output
        # of the previous scaler
        X1_scaled = scaler.fit_transform(X1)
        X2_scaled = scaler.fit_transform(X2)

        ax = plt.subplot(2, 2, i)
        ax.scatter(X1_scaled[:, 0], X1_scaled[:, 1], c='red', marker='.')
        ax.scatter(X2_scaled[:, 0], X2_scaled[:, 1], c='blue', marker='.')
        ax.set_xlim(-3, 3)
        ax.set_ylim(-3, 3)
        ax.grid(True)
        ax.set_title(scaler_name)
        centeralise_axes(ax)
        i += 1

    plt.show()
Code Example #19
def standardizing(df, methods):
    '''
    This function takes in a dataframe and a method for standardizing, and
    returns the standardized dataframe.

    The methods are:
         - z: for z-scores
         - mm: for min-max
         - robust: for robust
         - gauss: for gaussian
    '''

    if methods == 'z':
        scaler = preprocessing.StandardScaler().fit(df)
        scaled_df = pd.DataFrame(scaler.transform(df))
    elif methods == 'mm':
        scaler = preprocessing.MinMaxScaler().fit(df)
        scaled_df = pd.DataFrame(scaler.transform(df))
    elif methods == 'robust':
        scaler = preprocessing.RobustScaler().fit(df)
        scaled_df = pd.DataFrame(scaler.transform(df))
    else:
        scaler = preprocessing.PowerTransformer(method='yeo-johnson',
                                                standardize=True)
        scaled_df = pd.DataFrame(scaler.fit_transform(df))

    return scaled_df
Code Example #20
def robust_scaler(df):
    '''
    This Scaler removes the median and scales the data according to the quantile range (defaults to IQR: Interquartile Range).
    '''
    scaler = preprocessing.RobustScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    return df_scaled
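As the docstring notes, RobustScaler subtracts the median and divides by the IQR; a quick sketch with toy data showing the equivalence:

import numpy as np
import pandas as pd

df = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0, 100.0]})   # one extreme outlier
scaled = robust_scaler(df)

median = df['x'].median()                                # 3.0
iqr = df['x'].quantile(0.75) - df['x'].quantile(0.25)    # 4.0 - 2.0 = 2.0
manual = (df['x'] - median) / iqr
assert np.allclose(scaled['x'].values, manual.values)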
Code Example #21
def get_scaler(scaler_type=constants.SCALER):
    if scaler_type == constants.SCALER_TYPE_STANDARD:
        return preprocessing.StandardScaler()
    if scaler_type == constants.SCALER_TYPE_MIN_MAX:
        return preprocessing.MinMaxScaler()
    if scaler_type == constants.SCALER_TYPE_ROBUST:
        return preprocessing.RobustScaler()
Code Example #22
File: machine_learning.py Project: soodoku/copro
def define_scaling(config):
    """Defines scaling method based on model configurations.

    Args:
        config (ConfigParser-object): object containing the parsed configuration-settings of the model.

    Raises:
        ValueError: raised if a non-supported scaling method is specified.

    Returns:
        scaler: the specified scaling method instance.
    """

    if config.get('machine_learning', 'scaler') == 'MinMaxScaler':
        scaler = preprocessing.MinMaxScaler()
    elif config.get('machine_learning', 'scaler') == 'StandardScaler':
        scaler = preprocessing.StandardScaler()
    elif config.get('machine_learning', 'scaler') == 'RobustScaler':
        scaler = preprocessing.RobustScaler()
    elif config.get('machine_learning', 'scaler') == 'QuantileTransformer':
        scaler = preprocessing.QuantileTransformer()
    else:
        raise ValueError(
            'no supported scaling-algorithm selected - choose between MinMaxScaler, StandardScaler, RobustScaler or QuantileTransformer'
        )

    if config.getboolean('general', 'verbose'):
        print('chosen scaling method is {}'.format(scaler))

    return scaler
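A hypothetical configuration that would select the RobustScaler branch (the section and option names mirror those read in define_scaling; everything else is made up):

from configparser import ConfigParser

config = ConfigParser()
config.read_dict({
    'machine_learning': {'scaler': 'RobustScaler'},
    'general': {'verbose': 'yes'},
})
scaler = define_scaling(config)  # prints the chosen scaler and returns RobustScaler()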
Code Example #23
def pre_process_data(X_train,
                     X_validation,
                     X_test,
                     scaler_type,
                     feature_extract=True,
                     log_scale=True):

    if log_scale:
        X_train = np.log10(X_train)
        X_validation = np.log10(X_validation)
        X_test = np.log10(X_test)

    #create scaler
    if scaler_type == 'Standard':
        scaler = preprocessing.StandardScaler().fit(X_train)
    elif scaler_type == 'Robust':
        scaler = preprocessing.RobustScaler().fit(X_train)

    # robust scaling
    X_train_scale = scaler.transform(X_train)
    X_validation_scale = scaler.transform(X_validation)
    X_test_scale = scaler.transform(X_test)

    if feature_extract == True:
        X_train_scale, X_validation_scale, X_test_scale = feature_extraction(
            X_train_scale, X_validation_scale, X_test_scale)

    return (X_train_scale[:100000, :], X_validation_scale[:100000, :],
            X_test_scale[:100000, :])
Code Example #24
def robust(x_train_dum, scale_list):
    ''' Robust Scaler '''
    rs = preprocessing.RobustScaler()
    x_train_rs = x_train_dum.copy()
    for i in scale_list:
        # fit_transform expects 2D input, so select the column as a one-column DataFrame
        x_train_rs.iloc[:, i] = rs.fit_transform(x_train_dum.iloc[:, [i]]).ravel()
    return x_train_rs
Code Example #25
def cluster(ano, vals, n):
    if (ano == 2014):
        data = d14
    elif (ano == 2015):
        data = d15
    elif (ano == 2016):
        data = d16
    elif (ano == 2017):
        data = d17
    else:
        return "O ano não possui dados disponíveis"

    feats = vals
    cols = feats + ['municipio', 'ano']

    df_c = pd.DataFrame(data[cols])
    df_c = df_c.dropna()

    df_c = df_c[(np.abs(stats.zscore(df_c[feats])) < 3).all(axis=1)]
    scaler = preprocessing.RobustScaler().fit(df_c[feats])
    train = scaler.transform(df_c[feats])

    kmeans = KMeans(n_clusters=n, random_state=0).fit(train)
    labels = kmeans.labels_
    df_c['labels'] = labels
    return (df_c, silhouette_score(train, labels, metric='euclidean'),
            davies_bouldin_score(train, labels))
Code Example #26
def normalize(trainX, testX, type=None):
    print(trainX.shape)
    print(testX.shape)

    Scalar = None

    if type == 'standard':
        Scalar = preprocessing.StandardScaler()
    elif type == 'min_max':
        Scalar = preprocessing.MinMaxScaler()
    elif type == 'l1' or type == 'l2':
        Scalar = preprocessing.Normalizer(norm=type)
    elif type == 'l2_v2':
        trainX = trainX / np.expand_dims(np.sqrt(np.sum(trainX ** 2, axis=1)), axis=1)
        testX = testX / np.expand_dims(np.sqrt(np.sum(testX ** 2, axis=1)), axis=1)
    elif type == 'robust':
        Scalar = preprocessing.RobustScaler()
    elif type == 'min-max':
        trainX = (trainX - np.min(trainX))/(np.max(trainX) - np.min(trainX))
        testX = (testX - np.min(trainX))/(np.max(trainX) - np.min(trainX))


    if Scalar is not None:
        trainX = Scalar.fit_transform(trainX)
        testX = Scalar.transform(testX)  # reuse the statistics fitted on the training data

    return trainX, testX
Code Example #27
def scale_periods(dict_dataframes):

    ddi_scaled = dict()
    for key, index_name in enumerate(dict_dataframes):
        ddi_scaled[index_name] = copy.deepcopy(dict_dataframes[index_name])
    for key, index_name in enumerate(ddi_scaled):

        scaler = preprocessing.RobustScaler(with_centering=True)

        for index, value in enumerate(ddi_scaled[index_name]):
            X_train = ddi_scaled[index_name][value][1]
            X_train_scaled = scaler.fit_transform(X_train)
            X_train_scaled_df = pd.DataFrame(X_train_scaled,
                                             columns=list(X_train.columns))

            X_val = ddi_scaled[index_name][value][2]
            X_val_scaled = scaler.transform(X_val)
            X_val_scaled_df = pd.DataFrame(X_val_scaled,
                                           columns=list(X_val.columns))

            X_test = ddi_scaled[index_name][value][3]
            X_test_scaled = scaler.transform(X_test)
            X_test_scaled_df = pd.DataFrame(X_test_scaled,
                                            columns=list(X_test.columns))

            ddi_scaled[index_name][value][1] = X_train_scaled_df
            ddi_scaled[index_name][value][2] = X_val_scaled_df
            ddi_scaled[index_name][value][3] = X_test_scaled_df
    return ddi_scaled
Code Example #28
def scale_columns(df, columns, scaler_name="RobustScaler"):
    from sklearn import preprocessing

    scaler = None
    if scaler_name == "StandardScaler":
        scaler = preprocessing.StandardScaler()
    if scaler_name == "RobustScaler":
        scaler = preprocessing.RobustScaler()
    if scaler_name == "MinMaxScaler":
        scaler = preprocessing.MinMaxScaler()
    if scaler_name == "Normalizer":
        scaler = preprocessing.Normalizer()
    assert scaler is not None

    data = df.filter(columns, axis=1)
    print(scaler.fit(data))

    scaled_data = scaler.transform(data)
    scaled_df = pd.DataFrame(scaled_data, columns=columns)

    dataframe_scaled = df.copy()
    dataframe_scaled = dataframe_scaled.reset_index(drop=True)
    for column in columns:
        dataframe_scaled[column] = scaled_df[column]

    return dataframe_scaled, scaler
Code Example #29
def model_train(feature, train, label, flag, labelname):
    # -- normalization --
    N = preprocessing.RobustScaler()
    scale_feature = N.fit_transform(feature)
    train_feature = scale_feature[:train.shape[0]]
    test_feature = scale_feature[train.shape[0]:]
    print(train_feature.shape, test_feature.shape)

    # ---------------- linear model ----------------
    # -- 5-fold cross-validation to pick the best alpha
    alphas = np.logspace(-4, -1, 30)
    cv_lasso = [
        mse_cv(linear_model.Lasso(alpha), train_feature, label).mean()
        for alpha in alphas
    ]

    # print(alphas)
    # print(cv_lasso)
    index = list(cv_lasso).index(min(cv_lasso))
    print("=best_mse      :", min(cv_lasso))
    print("=best_alphas   :", alphas[index])
    clf = linear_model.Lasso(alphas[index])
    model = clf.fit(train_feature, label)
    res = model.predict(test_feature)
    print("==模型系数:", model.coef_)
    test = pd.read_csv("data/test.csv")
    test["pred"] = res
    test[[labelname, "pred"]].to_csv('data/result_{}.csv'.format(flag),
                                     header=None,
                                     index=False)
Code Example #30
def normalize_attr(x, norm='l1'):
    """Normalize attribute matrix with given type.

    Parameters
    ----------
    x: Numpy array-like matrix
    norm: The specified type for the normalization.
        'l1': l1-norm for axis 1, from `sklearn.preprocessing`.
        'l1_0': l1-norm for axis 0, from `sklearn.preprocessing`.
        'scale': standard scale for axis 0, 
            from `sklearn.preprocessing.scale`
        'robust_scale': robust scale for axis 0,
            from `sklearn.preprocessing.robust_scale`
        None: return the copy of `x`

    Returns
    -------
    A normalized attribute matrix in Numpy format.
    """
    if norm not in {'l1', 'l1_0', 'scale', 'robust_scale', None}:
        raise ValueError(f'{norm} is not a supported norm.')

    if norm == 'l1':
        x_norm = preprocessing.normalize(x, norm='l1', axis=1)
    elif norm == 'l1_0':
        x_norm = preprocessing.normalize(x, norm='l1', axis=0)
    elif norm == 'scale':
        # something goes wrong with type float32
        x_norm = preprocessing.StandardScaler().fit(x).transform(x)
    elif norm == 'robust_scale':
        x_norm = preprocessing.RobustScaler().fit(x).transform(x)
    else:
        x_norm = x.copy()
    return x_norm
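A short usage sketch (random matrix for illustration):

import numpy as np

x = np.random.rand(4, 3)
x_l1 = normalize_attr(x, norm='l1')             # each row sums to 1
x_rob = normalize_attr(x, norm='robust_scale')  # per-column median/IQR scaling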