Example #1
def log_transform(X_train, X_valid, X_test, columns):
    t = FunctionTransformer(np.log1p)
    part_X_train = t.transform(X_train[:, columns])
    part_X_valid = t.transform(X_valid[:, columns])
    part_X_test = t.transform(X_test[:, columns])
    X_train[:, columns] = part_X_train
    X_valid[:, columns] = part_X_valid
    X_test[:, columns] = part_X_test
    return X_train, X_valid, X_test
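A minimal usage sketch for the helper above on synthetic NumPy arrays (the data, shapes, and column indices are illustrative; numpy and FunctionTransformer are assumed to be imported as in the original module):

import numpy as np

rng = np.random.default_rng(0)
X_tr, X_va, X_te = rng.random((8, 4)), rng.random((4, 4)), rng.random((4, 4))

# apply log1p only to columns 0 and 2 of every split
X_tr, X_va, X_te = log_transform(X_tr, X_va, X_te, [0, 2])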
Example #2
def preprocess(training):
    num = [
        'variable2', 'variable3', 'variable8', 'variable11', 'variable14',
        'variable15', 'variable17', 'variable19'
    ]
    label = training.classLabel
    train = training.drop('classLabel', axis=1)
    train[num] = preprocessing.scale(train[num])
    transformer = FunctionTransformer(np.log1p, validate=True)
    train[num] = transformer.transform(train[num])
    train[num] = preprocessing.normalize(train[num], norm='l2')
    return train, label
def prepare_data(input_filename, label_column, train_size, test_size, add_log_vars):
  df = pd.read_csv(input_filename, delimiter=',', index_col=False, header=0) 
  data = df.values
  column_names = np.char.array(df.columns.values)
  print('Number of columns in data {}'.format(len(column_names)))

  # Extract features/labels and their names from raw data. Don't include the column next
  # to the label, since it is gender.
  features = data[:, 0:label_column-1]
  labels = data[:, label_column].astype(int)
  
  feature_names = column_names[0:label_column-1]
  label_name = column_names[label_column]

  class_values = list(set(labels))
  class_values.sort()

  train_features, test_features, train_labels, test_labels = (
      model_selection.train_test_split(features, labels, test_size=test_size))
  
  # create requested train size.
  train_features, train_labels = undersample(train_features, train_labels, train_size)

  # Impute the data and replace missing values
  imputer = Imputer(missing_values="NaN", strategy='mean', axis=0, copy=False)
  imputer.fit(train_features)
  train_features = imputer.transform(train_features)
  test_features = imputer.transform(test_features)
 
  # Only after imputing nans, get list of columns with negative values, so we won't apply
  # log-transformation on them
  if add_log_vars:
    column_mins = np.amin(np.concatenate((train_features, test_features), axis=0), axis=0)
    pos_feature_names = feature_names[column_mins>=0]
    neg_feature_names = feature_names[column_mins<0]
    pos_train_features = train_features[:,column_mins>=0]
    pos_test_features = test_features[:,column_mins>=0]
    # make sure negative features are only skewness related
    assert all(['skewness' in feature for feature in neg_feature_names])
    
    # add a log(1 + x) version of the non-negative features
    transformer = FunctionTransformer(np.log1p)
    log_pos_train_features = transformer.transform(pos_train_features)
    log_pos_test_features = transformer.transform(pos_test_features)
    log_pos_feature_names = pos_feature_names + "_log"
    train_features = np.concatenate((train_features, log_pos_train_features), axis=1)
    test_features = np.concatenate((test_features, log_pos_test_features), axis=1)
    feature_names = np.concatenate((feature_names, log_pos_feature_names))
    print('Number of columns in data after adding log vars {}'.format(len(feature_names)))

  return (train_features, train_labels, test_features, test_labels,
          class_values, feature_names, label_name)
Example #4
class FunctionTransformerPreprocessor(object):
    """Preprocessor that drops the first column and returns the log of the features."""

    def _drop_first_feature(self, data):
        return data[:, 1:]

    def __init__(self):
        self._log_transformer = FunctionTransformer(np.log1p)
        self._drop_first_transformer = FunctionTransformer(
            self._drop_first_feature)

    def preprocess(self, instances):
        return self._log_transformer.transform(
            self._drop_first_transformer.transform(instances))
    def process_text(self, text: tf.Tensor) -> tf.Tensor:

        # Convert tensor to a single document
        corpus = ''
        for doc_index in range(self.args.number_of_periods):
            corpus += text[doc_index].numpy().decode('utf-8', 'ignore')

        # Get the word counts
        vectorizer = CountVectorizer()
        word_counts = vectorizer.fit_transform([corpus])

        # Apply the Log1P transformation
        transformer = FunctionTransformer(np.log1p)
        log1p_features = transformer.transform(word_counts.toarray())[0]

        # Get the word names in the right order and get rid of all digit sequences/numbers
        documents_words = [
            feature.lower() for feature in vectorizer.get_feature_names()
            if not feature.isnumeric()
            and not any(char.isdigit() for char in feature)
            and feature in self.dict
        ]

        output = [0.] * len(self.dict)
        for word_index, word in enumerate(documents_words):
            output[self.dict.index(word)] = log1p_features[word_index]

        return output
class LogLGBM(LGBMRegressor):
    def __init__(self, target=None, **kwargs):
        super().__init__(**kwargs)
        if target == "Oil_norm":
            self.target_scaler = PowerTransformer(method='box-cox',
                                                  standardize=False)
        elif target == 'Gas_norm':
            self.target_scaler = FunctionTransformer(func=np.log1p,
                                                     inverse_func=np.expm1)
        elif target == 'Water_norm':
            self.target_scaler = FunctionTransformer(func=np.log1p,
                                                     inverse_func=np.expm1)

    def fit(self, X, Y, **kwargs):
        # y_train = np.log1p(Y)
        self.target_scaler.fit(Y.values.reshape(-1, 1) + 1)
        y_train = pd.Series(
            self.target_scaler.transform(Y.values.reshape(-1, 1) + 1).reshape(
                -1, ))
        super(LogLGBM, self).fit(X, y_train, **kwargs)

        return self

    def predict(self, X):
        preds = super(LogLGBM, self).predict(X).reshape(-1, 1)
        preds = self.target_scaler.inverse_transform(preds) - 1
        return preds[:, 0]
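A minimal sketch of fitting the wrapper above on synthetic data (assumes lightgbm is installed; the frame, target values, and hyperparameters are illustrative):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
X_demo = pd.DataFrame(rng.random((200, 5)), columns=[f"f{i}" for i in range(5)])
y_demo = pd.Series(np.expm1(X_demo.sum(axis=1)))  # strictly positive target

model = LogLGBM(target="Gas_norm", n_estimators=50)
model.fit(X_demo, y_demo)
preds = model.predict(X_demo)  # predictions are mapped back to the original scale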
Example #7
class DataTransfomer:
    """A class to transform data based on user-defined function to get predicted outcomes.
       This class calls FunctionTransformer of scikit-learn internally
       (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html)."""

    def __init__(self, func=None, kw_args=None):
        self.func = func
        self.kw_args = kw_args

    def feed_data_params(self, data_interface):
        if self.kw_args is not None:
            self.kw_args['data_interface'] = data_interface
        else:
            self.kw_args = {'data_interface': data_interface}

    def initialize_transform_func(self):
        if self.func == 'ohe-min-max':
            self.data_transformer = FunctionTransformer(func=ohe_min_max_transformation, kw_args=self.kw_args, validate=False)
        elif self.func is None:
            # identity transformation
            self.data_transformer = FunctionTransformer(func=None, kw_args=None, validate=False)
        else:
            # add more ready-to-use transformers (such as label encoding) in further elif branches
            self.data_transformer = FunctionTransformer(func=self.func, kw_args=self.kw_args, validate=False)

    def transform(self, data):
        return self.data_transformer.transform(data)  # should return a numpy array

    def inverse_transform(self, data):
        return self.data_transformer.inverse_transform(data)  # should return a numpy array
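A minimal sketch of using the wrapper above with a custom function (the function, the data_interface dictionary, and the data are illustrative, not part of the original code):

import numpy as np

# hypothetical custom transformation: rescale columns to [0, 1] using min/max
# values carried in the data_interface object (here just a dict)
def min_max_transformation(data, data_interface):
    return (data - data_interface['min']) / (data_interface['max'] - data_interface['min'])

transformer = DataTransfomer(func=min_max_transformation)
transformer.feed_data_params({'min': np.array([0.0, 0.0]), 'max': np.array([10.0, 5.0])})
transformer.initialize_transform_func()

X = np.array([[2.0, 1.0], [8.0, 4.0]])
print(transformer.transform(X))  # columns rescaled to [0, 1]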
class FunctionTransformerPrim(primitive):
    def __init__(self, random_state=0):
        super(FunctionTransformerPrim,
              self).__init__(name='FunctionTransformer')
        self.id = 11
        self.hyperparams = []
        self.type = 'feature preprocess'
        self.description = "Constructs a transformer from an arbitrary callable. A FunctionTransformer forwards its X (and optionally y) arguments to a user-defined function or function object and returns the result of this function. This is useful for stateless transformations such as taking the log of frequencies, doing custom scaling, etc."
        self.hyperparams_run = {'default': True}
        self.scaler = FunctionTransformer()
        self.accept_type = 'c_t'

    def can_accept(self, data):
        return self.can_accept_c(data)

    def is_needed(self, data):
        # data = handle_data(data)
        # Update
        return True

    def fit(self, data):
        data = handle_data(data)
        self.scaler.fit(data['X'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        cols = ["{}_qntl".format(x) for x in cols]
        output['X'] = pd.DataFrame(self.scaler.transform(output['X']),
                                   columns=cols)
        final_output = {0: output}
        return final_output
Example #9
class TensorScaler(TransformerMixin):
    """Scaling for 3D tensors.

    Assumes the size is (..., length, input_channels), reshapes to (..., input_channels), performs the method
    operation and then reshapes back.

    Arguments:
        method (str): Scaling method, one of ('stdsc', 'ma', 'mms').
        scaling_function (transformer): Specification of an sklearn transformer that performs a scaling operation.
            Only one of this or ``method`` may be specified.
    """
    def __init__(self, method="stdsc", scaling_function=None):
        self.scaling = method

        if all([method is None, scaling_function is None]):
            self.scaler = FunctionTransformer(func=None)
        elif isinstance(method, str):
            self.scaler = SCALERS.get(method)()
            assert (
                self.scaler
                is not None), "Scalings allowed are {}, recieved {}.".format(
                    SCALERS.keys(), method)
        else:
            self.scaler = scaling_function

    @apply_fit_to_channels
    def fit(self, data, labels=None):
        self.scaler.fit(data)
        return self

    @apply_transform_to_channels
    def transform(self, data):
        output_data = torch.Tensor(self.scaler.transform(data))
        return output_data
def logarithmic_regression(input_data, cement, water, coarse_aggr, fine_aggr,
                           days):

    variables = input_data.iloc[:, :-1]
    results = input_data.iloc[:, -1]

    n = results.shape[0]
    results = results.values.reshape(
        n, 1
    )  #reshaping the values so that variables and results have the same shape

    # transform the x data with the logarithmic function
    log_regression = FunctionTransformer(np.log, validate=True)
    log_variables = log_regression.fit_transform(variables)

    #making linear model and fitting the logarithmic data into linear model
    regression = linear_model.LinearRegression()
    model = regression.fit(log_variables, results)

    input_values = [cement, water, coarse_aggr, fine_aggr, days]

    #transforming input data for prediction in logarithmic function
    input_values = log_regression.transform([input_values])

    #predicting the outcome based on the input_values
    predicted_strength = regression.predict(
        input_values)  #adding values for prediction
    predicted_strength = round(predicted_strength[0, 0], 2)

    return "Logarithmic prediction: " + str(predicted_strength)
Example #11
class DistanceTransformer:
    """Transforms the raw distances to the appropriate modeling form
    """

    def __init__(self, pos_features, pipeline_obj_path):
        """
        Args:
          pos_features: list of positional features to use
          pipeline_obj_path: path to the serialized pipeline object
        """
        self.pos_features = pos_features
        self.pipeline_obj_path = pipeline_obj_path

        # deserialize the pickle file
        with open(self.pipeline_obj_path, "rb") as f:
            pipeline_obj = pickle.load(f)
        self.POS_FEATURES = pipeline_obj[0]
        self.minmax_scaler = pipeline_obj[1]
        self.imp = pipeline_obj[2]

        self.funct_transform = FunctionTransformer(func=sign_log_func,
                                                   inverse_func=sign_log_func_inverse)
        # for simplicity, assume all current pos_features are the
        # same as from before
        assert self.POS_FEATURES == self.pos_features

    def transform(self, x):
        # impute missing values and rescale the distances
        xnew = self.minmax_scaler.transform(self.funct_transform.transform(self.imp.transform(x)))

        # convert distances to spline bases
        dist = {"dist_" + k: encodeSplines(xnew[:, i, np.newaxis], start=0, end=1, warn=False)
                for i, k in enumerate(self.POS_FEATURES)}
        return dist
Example #12
    def log_trans(self):
        self._data_init()
        transformer = FunctionTransformer(np.log1p)
        X = self.data.values
        y = self.label.values
        X = self.data_array = transformer.transform(X)
        sio.savemat("clean_data/" + self.dataset, {'X': X, 'y': y})
Example #13
def test_kw_arg():
    X = np.linspace(0, 1, num=10).reshape((5, 2))

    F = FunctionTransformer(np.around, kw_args=dict(decimals=3))

    # Test that rounding is correct
    assert_array_equal(F.transform(X), np.around(X, decimals=3))
Example #14
    def transform_to_depth_pct(self, data):
        """
        transform_to_depth_pct takes in a dataframe like diamonds
        and returns an np.ndarray consisting of the approximate 
        depth percentage of each diamond.

        :Example:
        >>> diamonds = sns.load_dataset('diamonds').drop(columns='depth')
        >>> out = TransformDiamonds(diamonds)
        >>> transformed = out.transform_to_depth_pct(diamonds)
        >>> len(transformed.shape) == 1
        True
        >>> np.isclose(transformed[0], 61.286, atol=0.0001)
        True
        """

        # Custom function to calc depth percentage
        def depth_pct(arrs):
            depth_pct = []
            for arr in arrs:
                x, y, z = arr[0], arr[1], arr[2]
                depth_pct.append(100 * z / ((x + y) / 2))
            return np.array(depth_pct)

        trans = FunctionTransformer(depth_pct, validate=True)
        transformed = trans.transform(data[['x', 'y', 'z']].values)
        return transformed
Example #15
def log_trans(data):
    """
    :param data: array-like of non-negative values to transform
    :return: the log1p-transformed data
    """
    transformer = FunctionTransformer(np.log1p)
    data = transformer.transform(data)
    return data
Example #16
def q4():
    # Return the result of question 4 here.
    transformer = FunctionTransformer(np.log1p)
    df = get_sample(athletes, 'weight', n=3000)
    
    df = transformer.transform(df)
    (k2,pvalue) = sct.normaltest(df)
    return bool(pvalue>=0.05)
def Scaler(X_train):
    transformer = FunctionTransformer(np.log1p, validate=True)
    X_train = transformer.transform(X_train)

    scaler = MinMaxScaler(feature_range=(0, 1))

    scaler.fit(X_train)
    return scaler.transform(X_train)
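A minimal sketch of calling Scaler above on a small positive matrix (the data is illustrative; numpy and the sklearn imports are assumed to be available as in the original module):

import numpy as np

X_demo = np.array([[1.0, 10.0], [3.0, 100.0], [9.0, 1000.0]])
print(Scaler(X_demo))  # log1p followed by column-wise min-max scaling to [0, 1]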
Example #19
def test_inverse_transform():
    X = np.array([1, 4, 9, 16]).reshape((2, 2))

    # Test that inverse_transform works correctly
    F = FunctionTransformer(func=np.sqrt,
                            inverse_func=np.around,
                            inv_kw_args=dict(decimals=3))
    testing.assert_array_equal(F.inverse_transform(F.transform(X)),
                               np.around(np.sqrt(X), decimals=3))
Example #20
def load_tensor_data(fileloc):
    """
    Helper function to load the actors data and filter by the criteria of at
    least 1 million minimum revenue and actors appearing in at least 20 movies.
    Returns the actor matrix and log-transformed revenue as torch tensors.
    """

    data_actors = pd.read_csv(fileloc, index_col=0)
    X = data_actors.iloc[:, 2:]
    X_data = torch.Tensor(X.to_numpy(dtype='float32'))
    transformer = FunctionTransformer(np.log1p, validate=True)
    data_actors["log_revenue"] = transformer.transform(
        data_actors["revenue"].values.reshape(-1, 1))
    Y_data = torch.Tensor(data_actors["log_revenue"].to_numpy().reshape(
        X.shape[0], 1))

    cols_keep = ['Judi Dench', 'Cobie Smulders']
    cols_20 = ['title_x', 'revenue', 'log_revenue']
    for col in data_actors.columns[2:-1]:

        if col in cols_keep:
            continue
        elif np.sum(data_actors[col]) >= 20:
            cols_20.append(col)

    data_million = data_actors[cols_20 + cols_keep]
    must_keep = data_million[(data_million["Judi Dench"] == 1) |
                             (data_million["Cobie Smulders"] == 1)]
    data_million = data_million[data_million["revenue"] > 1000000]

    #     X_all = data_million[
    #         data_million.columns.difference(
    #             ['title_x', 'revenue', 'log_revenue']
    #         )
    #     ].append(must_keep[must_keep.columns.difference(
    #             ['title_x', 'revenue', 'log_revenue'])], ignore_index = True)

    X_all = data_million[data_million.columns.difference(
        ['revenue', 'log_revenue'])].append(
            must_keep[must_keep.columns.difference(['revenue',
                                                    'log_revenue'])],
            ignore_index=True)

    y_all = data_million['revenue'].append(must_keep['revenue'])

    x_train = X_all
    y_train = y_all
    x_train_tensors = torch.tensor(
        x_train.drop("title_x", axis=1).to_numpy(dtype='float32'))
    y_train_tensors = torch.tensor(y_all.to_numpy(dtype='float32'))

    cols = list(x_train.columns)
    cols = [cols[-1]] + cols[:-1]
    x_train = x_train[cols]

    return x_train_tensors, y_train_tensors, x_train.columns, x_train
Example #22
def exponential_transformation(data):

    # np.expm1 is the element-wise inverse of np.log1p; validate=True requires 2-D input
    transformer = FunctionTransformer(np.expm1, validate=True)

    for column in data.columns:

        if column not in config.CATEGORICALS:

            data[column] = transformer.transform(
                data[column].values.reshape(-1, 1)).ravel()

    return data
Example #23
def log_transformation(data):

    transformer = FunctionTransformer(np.log1p, validate=True)

    for column in data.columns:

        temp = data[column].values.reshape(-1, 1)

        data[column] = transformer.transform(temp + 1)

    return data
Example #24
def combine_attr_adder(housing):
    attr_adder = FunctionTransformer(add_extra_features,
                                     validate=False,
                                     kw_args={"add_bedrooms_per_room": False})
    housing_extra_attribs = attr_adder.transform(housing.values)
    housing_extra_attribs = pd.DataFrame(
        housing_extra_attribs,
        columns=list(housing.columns) +
        ["rooms_per_household", "population_per_household"])
    # print(housing_extra_attribs.head())
    return housing_extra_attribs
    def test_function_transformer(self):
        x = numpy.array([[6.1, -5], [3.5, -7.8]], dtype=numpy.float32)
        tr = FunctionTransformer(custom_fct)
        tr.fit(x)
        y_exp = tr.transform(x)
        self.assertEqualArray(
            numpy.array([[6.1, 0.], [3.5, 0.]], dtype=numpy.float32), y_exp)

        onnx_model = to_onnx(tr, x)
        oinf = OnnxInference(onnx_model)
        y_onx = oinf.run({'X': x})
        self.assertEqualArray(y_exp, y_onx['variable'])
    def test_function_transformer_pickle(self):
        x = numpy.array([[6.1, -5], [3.5, -7.8]], dtype=numpy.float32)
        tr = FunctionTransformer(custom_fct)
        tr.fit(x)
        y_exp = tr.transform(x)
        st = BytesIO()
        # import cloudpickle as pkl
        pkl = pickle
        pkl.dump(tr, st)
        cp = BytesIO(st.getvalue())
        tr2 = pkl.load(cp)
        y_exp2 = tr2.transform(x)
        self.assertEqualArray(y_exp, y_exp2)

    def test_function_transformer_fft_abs(self):
        for rt, fct in [('py', custom_fft_abs),
                        ('ort', custom_fft_abs_ort)]:
            with self.subTest(runtime=rt):
                x = numpy.array([[6.1, -5], [3.5, -7.8]],
                                dtype=numpy.float32)
                tr = FunctionTransformer(fct)
                tr.fit(x)
                y_exp = tr.transform(x)
                onnx_model = to_onnx(tr, x)
                oinf = OnnxInference(onnx_model)
                y_onx = oinf.run({'X': x})
                self.assertEqualArray(y_exp, y_onx['variable'], decimal=5)
class FunctionExtractor(TransformerMixin):
    def __init__(self,
                 func,
                 result_column,
                 source_column=None,
                 validate=False):
        self.func = func
        self.source_column = source_column
        self.result_column = result_column
        self.validate = validate
        self.extractor = FunctionTransformer(self.func, validate=self.validate)

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.source_column is not None:
            X[self.result_column] = self.extractor.transform(
                X[self.source_column])
        else:
            X[self.result_column] = self.extractor.transform(X)
        return X
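A minimal sketch of using FunctionExtractor above to derive a new column from an existing one (column names and data are illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({"price": [10.0, 100.0, 1000.0]})

# derive a log-price column from the existing price column
extractor = FunctionExtractor(func=np.log1p,
                              result_column="log_price",
                              source_column="price")
df = extractor.fit(df).transform(df)
print(df[["price", "log_price"]])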
def prepare_data(df):
    import numpy as np
    import pandas as pd
    from sklearn.preprocessing import FunctionTransformer
    sc = FunctionTransformer(np.log1p)
    X = df[['goal']]
    X = sc.transform(X)
    df[['goal']] = X
    df = pd.get_dummies(df, columns=['country'])
    df = pd.get_dummies(df, columns=['category'])
    df = pd.get_dummies(df, columns=['deadline_weekday'])
    df = pd.get_dummies(df, columns=['created_at_weekday'])
    df = pd.get_dummies(df, columns=['launched_at_weekday'])
    return df
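A minimal sketch of calling prepare_data above on a tiny frame with the expected columns (values are made up):

import pandas as pd

raw = pd.DataFrame({
    'goal': [500.0, 25000.0],
    'country': ['US', 'GB'],
    'category': ['Games', 'Music'],
    'deadline_weekday': ['Monday', 'Friday'],
    'created_at_weekday': ['Sunday', 'Tuesday'],
    'launched_at_weekday': ['Wednesday', 'Thursday'],
})
print(prepare_data(raw))  # log-scaled goal plus one-hot columns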
Example #30
def scale_data(df, p, train=True, save=True):
    if p.log_scale:
        df.loc[df["last_pend_time"] == 0, "last_pend_time"] = 1
        if train:
            log_scaler = FunctionTransformer(np.log2)
            df.loc[:, ["last_pend_time"]] = log_scaler.fit_transform(
                df[["last_pend_time"]])
            if save:
                joblib.dump(log_scaler, "log_scaler.save")
        else:
            log_scaler = joblib.load("log_scaler.save")
            df.loc[:, ["last_pend_time"]] = log_scaler.transform(
                df[["last_pend_time"]])

    scale_cols = ["last_pend_time"]
    if p.use_using_cores:
        scale_cols.append("using_cores")
    if p.use_spending_run_time:
        scale_cols.append("spending_run_time")
    if p.use_pending_jobs:
        scale_cols.append("pending_jobs")
    if p.use_last_pend_time_submit:
        scale_cols.append("last_pend_time_submit")
    if p.use_submit_time:
        scale_cols.append("sin_submit_time")
        scale_cols.append("cos_submit_time")
    if p.use_day_of_week:
        scale_cols.append("sin_day_of_week")
        scale_cols.append("cos_day_of_week")

    if train:
        min_max_scaler = MinMaxScaler(feature_range=(0, 1))
        df.loc[:, scale_cols] = min_max_scaler.fit_transform(df[scale_cols])
        if save:
            joblib.dump(min_max_scaler, "min_max_scaler.save")
    else:
        min_max_scaler = joblib.load("min_max_scaler.save")
        df.loc[:, scale_cols] = min_max_scaler.transform(df[scale_cols])

    if p.standard_scale:
        if train:
            standard_scaler = StandardScaler()
            df.loc[:,
                   scale_cols] = standard_scaler.fit_transform(df[scale_cols])
            if save:
                joblib.dump(standard_scaler, "standard_scaler.save")
        else:
            standard_scaler = joblib.load("standard_scaler.save")
            df.loc[:, scale_cols] = standard_scaler.transform(df[scale_cols])

    return df
class DFFunctionTransformer(TransformerMixin):
    # FunctionTransformer but for pandas DataFrames

    def __init__(self, *args, **kwargs):
        self.ft = FunctionTransformer(*args, **kwargs)

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        Xt = self.ft.transform(X)
        Xt = pd.DataFrame(Xt, index=X.index, columns=X.columns)
        return Xt
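A minimal sketch of using DFFunctionTransformer above so the output keeps the DataFrame's index and column names (data is illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]}, index=["r1", "r2"])

log_tf = DFFunctionTransformer(np.log1p)
print(log_tf.fit(df).transform(df))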
Example #33
def test_functiontransformer_vs_sklearn():
    # Compare msmbuilder.preprocessing.FunctionTransformer
    # with sklearn.preprocessing.FunctionTransformer

    functiontransformerr = FunctionTransformerR()
    functiontransformerr.fit(np.concatenate(trajs))

    functiontransformer = FunctionTransformer()
    functiontransformer.fit(trajs)

    y_ref1 = functiontransformerr.transform(trajs[0])
    y1 = functiontransformer.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
Example #34
#
# DataFrame slicing
# selectionlist gets the parameter list passed in from the JSON object
selectionlist = []
selectionlist.extend((args.list))

# read data,
df1=pd.read_table('penalties.csv', sep=';',header=0)

# all headers
colnames = list(df1.columns.values)
# slice data
X = df1.loc[:, selectionlist]
# sqrt transform the heavily skewed data
transformer = FunctionTransformer(np.sqrt)
Xtran = transformer.transform(X)
X = pd.DataFrame(Xtran)
selectionheaders = selectionlist
oldnames = X.columns.values

# rename all columns with original columnheaders
X.rename(columns=dict(zip(oldnames, selectionheaders)), inplace=True)
# remaining columns not in the selection
colnamesrest = [x for x in colnames if x not in selectionlist]
Rest = df1.loc[:, colnamesrest]
# drop the multiplier column
del Rest['multiplier']
# plot a 3x3 scatter plot matrix
from pandas.plotting import scatter_matrix
scatter_matrix(X, alpha=0.2, figsize=(3, 3))
plt.show()
Example #35
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.utils import np_utils, generic_utils

from sklearn.preprocessing import FunctionTransformer

#read data
train_tour1 = pd.read_csv('numerai_training_data.csv')
feature = pd.DataFrame(train_tour1.iloc[:, 0:21])
target = pd.DataFrame(train_tour1.target)

#log feature
transformer = FunctionTransformer(np.log1p)
feature_log = transformer.transform(feature)

# combine the original and log-transformed features
feature_log = pd.DataFrame(feature_log)
feature_all = pd.concat([feature, feature_log], axis=1)

# convert features and target to numpy arrays
feature_all = np.asarray(feature_all)
target = np.asarray(target)

# convert list of labels to binary class matrix
target = np_utils.to_categorical(target) 

# pre-processing: divide by max and subtract mean
scale = np.max(feature_all)
feature_all /= scale