def test_list_transformers_single_arg(simple_dataframe):
    """
    Multiple transformers can be specified in a list even if some of them
    only accept one X argument instead of two (X, y).
    """
    mapper = DataFrameMapper([("a", [MockXTransformer()])])
    # doesn't fail
    mapper.fit_transform(simple_dataframe)
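# MockXTransformer, used throughout these tests, is not defined in this listing.
# A minimal sketch of what such a one-argument transformer could look like
# (an assumption, following the plain fit/transform convention):
class MockXTransformer(object):
    """Mock transformer whose fit accepts only X (no y) and passes data through."""
    def fit(self, X):
        return self

    def transform(self, X):
        return X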
	def test_mapper(self):
		domain = CategoricalDomain()
		df = DataFrame([{"X" : "2", "y" : 2}, {"X" : "1"}, {"X" : "3"}])
		mapper = DataFrameMapper([
			("X", [domain, LabelBinarizer()]),
			("y", None)
		])
		mapper.fit_transform(df)
		self.assertEqual(numpy.array(["1", "2", "3"]).tolist(), domain.data_.tolist())
def test_transformed_names_simple(simple_dataframe):
    """
    Get transformed names of features in the `transformed_names_` attribute
    for simple transformation
    """
    df = simple_dataframe
    mapper = DataFrameMapper([('a', None)])
    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['a']
def test_default_none_names():
    """
    If default=None, column names are returned unmodified.
    """
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]})
    mapper = DataFrameMapper([], default=None)

    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['a', 'b']
def test_transformed_names_simple_alias(simple_dataframe):
    """
    If we specify an alias for a single output column, it is used for the
    output
    """
    df = simple_dataframe
    mapper = DataFrameMapper([('a', None, {'alias': 'new_name'})])
    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['new_name']
def test_transformed_names_complex_alias(complex_dataframe):
    """
    If we specify an alias for a transformation with multiple output columns,
    it is used as the prefix of the output names
    """
    df = complex_dataframe
    mapper = DataFrameMapper([('target', LabelBinarizer(), {'alias': 'new'})])
    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['new_a', 'new_b', 'new_c']
def test_transformed_names_binarizer(complex_dataframe):
    """
    Get transformed names of features in the `transformed_names_` attribute
    for a transformation that multiplies the number of columns
    """
    df = complex_dataframe
    mapper = DataFrameMapper([('target', LabelBinarizer())])
    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['target_a', 'target_b', 'target_c']
	def test_mapper(self):
		domain = ContinuousDomain()
		df = DataFrame([{"X1" : 2.0, "X2" : 2, "y" : 2.0}, {"X1" : 1.0, "X2" : 0.5}, {"X1" : 3.0, "X2" : 3.5}])
		mapper = DataFrameMapper([
			(["X1", "X2"], [domain, StandardScaler()]),
			("y", None)
		])
		mapper.fit_transform(df)
		self.assertEqual(numpy.array([1.0, 0.5]).tolist(), domain.data_min_.tolist())
		self.assertEqual(numpy.array([3.0, 3.5]).tolist(), domain.data_max_.tolist())
def test_fit_transform(simple_dataframe):
    """
    Check that custom fit_transform methods of the transformers are invoked.
    """
    df = simple_dataframe
    mock_transformer = Mock()
    # return something of measurable length, but do nothing
    mock_transformer.fit_transform.return_value = np.array([1, 2, 3])
    mapper = DataFrameMapper([("a", mock_transformer)])
    mapper.fit_transform(df)
    assert mock_transformer.fit_transform.called
def test_transformed_names_transformers_list(complex_dataframe):
    """
    When using a list of transformers, the list is inspected in reverse order
    to derive the transformed names
    """
    df = complex_dataframe
    mapper = DataFrameMapper([
        ('target', [LabelBinarizer(), MockXTransformer()])
    ])
    mapper.fit_transform(df)
    assert mapper.transformed_names_ == ['target_a', 'target_b', 'target_c']
def test_cols_string_array(simple_dataframe):
    """
    If a string is specified as the columns, the transformer
    is called with a 1-d array as input.
    """
    df = simple_dataframe
    mock_transformer = Mock()
    mock_transformer.transform.return_value = np.array([1, 2, 3])  # do nothing
    mapper = DataFrameMapper([("a", mock_transformer)])

    mapper.fit_transform(df)
    args, kwargs = mock_transformer.fit.call_args
    assert args[0].shape == (3,)
def test_cols_list_column_vector(simple_dataframe):
    """
    If a one-element list is specified as the columns, the transformer
    is called with a column vector as input.
    """
    df = simple_dataframe
    mock_transformer = Mock()
    mock_transformer.transform.return_value = np.array([1, 2, 3])  # do nothing
    mapper = DataFrameMapper([(["a"], mock_transformer)])

    mapper.fit_transform(df)
    args, kwargs = mock_transformer.fit.call_args
    assert args[0].shape == (3, 1)
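# The two tests above illustrate the column-selector convention: a plain string
# selects a 1-d array, while a one-element list selects a 2-d column vector.
# A small sketch outside the test harness (scikit-learn scalers need 2-d input,
# hence the list selector):
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper

frame = pd.DataFrame({'a': [1.0, 2.0, 3.0]})
mapper = DataFrameMapper([(['a'], StandardScaler())])
print(mapper.fit_transform(frame).shape)  # (3, 1)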
def compute_cross_correlation_score(df, clfs, preprocess_scaling=True, nFold=10):
    """
    Run stratified k-fold cross-validation for each classifier and collect the results.
    :param df: dataframe with a 'features' dict column and an 'expected_class' column
    :param clfs: iterable of classifiers to evaluate
    :param preprocess_scaling: whether to scale the features during preprocessing
    :param nFold: number of stratified folds
    :return: (scores, classification_results) accumulated over all folds and classifiers
    """

    to_sklearn_features = DataFrameMapper([('features', sklearn.feature_extraction.DictVectorizer())])

    data_X = to_sklearn_features.fit_transform(df)
    data_Y = df.expected_class

    skf = cross_validation.StratifiedKFold(data_Y, n_folds=nFold)
    classification_results = []
    scores = []
    for num, (train_index, test_index) in enumerate(skf):
        X_train, X_test = data_X[train_index], data_X[test_index]
        Y_train, Y_test = data_Y[train_index], data_Y[test_index]
        print("Len train{}, Len test{}".format(Y_train.size, Y_test.size))
        cross_valid_data = Cross_validation_split(X_train, X_test, Y_train, Y_test)
        cross_valid_data = preprocess(cross_valid_data, preprocess_scaling=preprocess_scaling, preprocess_correlation=False)

        for clf in clfs:
            score, classification = generate_score(clf, cross_valid_data, fold=num)
            scores.append(score)
            classification_results.append(classification)
    return scores, classification_results
def preprocess_train(train):
    train_y = train['count']
    train_y1 = train['casual']
    train_y2 = train['registered']

    preprocess_data(train)

    mapper = DataFrameMapper([
        ('hour', None),
        ('season', preprocessing.LabelBinarizer()),
        ('holiday', None),
        ('workingday', None),
        ('weather', preprocessing.LabelBinarizer()),
        ('temp', None),
        ('atemp', None),
        ('humidity', None),
        ('windspeed', None),
        ('weekday', None),
        ('is_sunday', None),
        ('bad_weather', None),
        ('year', None),
    ])

    train_X = mapper.fit_transform(train)
    return train_X, train_y, train_y1, train_y2, mapper
class Transformer(object):
    """
    The purpose of this class is to take a dataframe and transform it into
    a numpy array compatible format.
    """

    def __init__(self, config):
        self.__config = config
        self.__mapper = None
        self.__label_encoder_adapter = TransformerAdapter(LabelEncoderMissingValuesTransformer())

    def prepare(self, dataframe):
        """
        Takes the already cleaned dataframe, splits it into train and test
        and returns the train and test as numpy arrays.
        If the problem is supervised, the target column will be that last one
        of the returned arrays.
        """
        mapping = DataFrameMapCreator().get_mapping_from_config(self.__config)
        self.__mapper = DataFrameMapper(mapping)
        train, test = split_dataframe_train_test(dataframe, self.__config.get_option_parameter("split", "train_percentage"))
        return self.__get_correct_return_parameters(train, test)

    def __get_correct_return_parameters(self, train, test):
        model = self.__config.get_data_model()

        train_transformed = self.__mapper.fit_transform(train)
        test_transformed = self.__mapper.transform(test)

        if model.has_target():
            return self.__add_target_data(train_transformed, train), \
                   self.__add_target_data(test_transformed, test)
        else:
            return train_transformed, test_transformed

    def __add_target_data(self, transformed_data, original_data):
        """
        Picks up the target data from the original_data and appends it as a
        column to the transformed_data.
        Both arguments are expected to be np.array's.
        """
        model = self.__config.get_data_model()
        target_feature = model.find_target_feature()
        name = target_feature.get_name()

        if target_feature.is_categorical():
            target_row = original_data[name]
            target = self.__label_encoder_adapter.transform(target_row)
        else:
            target = original_data[name].values.astype(type_name_to_data_type("float"))

        target = target[..., None]

        return np.hstack((transformed_data, target))

    def apply(self, dataframe):
        return self.__mapper.transform(dataframe)
def test_simple_df(simple_dataframe):
    """
    Get a dataframe from a simple mapped dataframe
    """
    df = simple_dataframe
    mapper = DataFrameMapper([('a', None)], df_out=True)
    transformed = mapper.fit_transform(df)
    assert type(transformed) == pd.DataFrame
    assert len(transformed["a"]) == len(simple_dataframe["a"])
def test_binarizer2_df():
    """
    Check level names from LabelBinarizer with just one output column
    """
    df = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'a']})
    mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True)
    transformed = mapper.fit_transform(df)
    cols = transformed.columns
    assert len(cols) == 1
    assert cols[0] == 'target'
def test_default_transformer():
    """
    If default is set to a transformer, it is applied to the columns that are
    not explicitly selected.
    """
    df = pd.DataFrame({'a': [1, np.nan, 3], })
    mapper = DataFrameMapper([], default=Imputer())

    transformed = mapper.fit_transform(df)
    assert (transformed[:, 0] == np.array([1., 2., 3.])).all()
def test_sparse_off(simple_dataframe):
    """
    If the resulting features are sparse but the "sparse" argument
    of the mapper is False, return a non-sparse matrix.
    """
    df = simple_dataframe
    mapper = DataFrameMapper([("a", ToSparseTransformer())], sparse=False)

    dmatrix = mapper.fit_transform(df)
    assert type(dmatrix) != sparse.csr.csr_matrix
def test_sparse_features(simple_dataframe):
    """
    If any of the extracted features is sparse and "sparse" argument
    is true, the hstacked result is also sparse.
    """
    df = simple_dataframe
    mapper = DataFrameMapper([("a", ToSparseTransformer())], sparse=True)
    dmatrix = mapper.fit_transform(df)

    assert type(dmatrix) == sparse.csr.csr_matrix
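# ToSparseTransformer is another undisplayed test fixture. A plausible sketch of a
# transformer that produces sparse output (an assumption, built on scipy.sparse):
import numpy as np
from scipy import sparse

class ToSparseTransformer(object):
    """Return the input as a sparse CSR column matrix."""
    def fit(self, X):
        return self

    def transform(self, X):
        return sparse.csr_matrix(np.asarray(X).reshape(-1, 1))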
def scale_X(X, dataset):
    if dataset == 'noYelp':
        X_scaled = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)
    else:
        #use sklearn pandas data mapper to scale only non binary columns
        mapper = DataFrameMapper([(['yelp_rating'], StandardScaler()), (['yelp_reviews'], StandardScaler()), (['risk'], StandardScaler()), (['insp_badge'], StandardScaler()), (['crime_count'], StandardScaler()), (['311_count'], StandardScaler()), (['construction_count'], StandardScaler()), (['avg_high_temp'], StandardScaler()), (['time_diff'], StandardScaler()), (['prev_crit_viol'], StandardScaler()), ('Burgers', None), ('Convenience Stores', None), ('Sandwiches', None), ('Wine & Spirits', None), ('adultentertainment', None), ('afghani', None), ('african', None), ('apartments', None), ('asianfusion', None), ('bagels', None), ('bakeries', None), ('bangladeshi', None), ('bars', None), ('bbq', None), ('beerbar', None), ('beergardens', None), ('belgian', None), ('brasseries', None), ('breakfast_brunch', None), ('breweries', None), ('british', None), ('buffets', None), ('burgers', None), ('burmese', None), ('cafes', None), ('cafeteria', None), ('cajun', None), ('catering', None), ('cheesesteaks', None), ('chicken_wings', None), ('chinese', None), ('chocolate', None), ('churches', None),('cocktailbars', None), ('coffee', None), ('coffeeroasteries', None), ('comfortfood', None), ('cookingschools', None), ('creperies', None), ('cuban', None), ('cupcakes', None), ('danceclubs', None), ('delis', None), ('desserts', None), ('diners', None), ('discountstore', None), ('divebars', None), ('donuts', None), ('drugstores', None), ('ethiopian', None), ('ethnicmarkets', None), ('falafel', None), ('foodtrucks', None), ('french', None), ('gastropubs', None), ('gelato', None), ('german', None), ('gluten_free', None), ('golf', None), ('gourmet', None), ('greek', None), ('grocery', None), ('gyms', None), ('halal', None), ('healthtrainers', None), ('hookah_bars', None),  ('hotdog', None), ('hotdogs', None), ('hotels', None), ('icecream', None), ('indpak', None), ('irish', None), ('irish_pubs', None), ('italian', None), ('japanese', None),  ('jazzandblues', None), ('juicebars', None), ('korean', None), ('landmarks', None),  ('latin', None), ('lawyers', None), ('lebanese', None), ('libraries', None), ('lounges', None), ('mediterranean', None), ('mexican', None), ('mideastern', None), ('mini_golf', None), ('modern_european', None), ('musicvenues', None), ('newamerican', None), ('nonprofit', None), ('pakistani', None), ('peruvian', None), ('pianobars', None), ('pizza', None),  ('publicservicesgovt', None), ('pubs', None), ('puertorican', None), ('restaurants', None),  ('salad', None), ('salvadoran', None), ('sandwiches', None), ('seafood', None),  ('social_clubs', None), ('soulfood', None), ('soup', None), ('southern', None),  ('spanish', None), ('sports_clubs', None), ('sportsbars', None), ('steak', None), ('sushi', None), ('tapas', None), ('tapasmallplates', None), ('tea', None),  ('tex-mex', None), ('thai', None), ('tobaccoshops', None), ('tradamerican', None), ('turkish', None), ('vegetarian', None), ('venues', None), ('vietnamese', None), ('wholesale_stores', None), ('wine_bars', None)])

        X_scaled = pd.DataFrame(mapper.fit_transform(X.copy()), columns=X.columns)

    print("\n data scaled\n")
    return X_scaled
def test_multiindex_df(multiindex_dataframe_incomplete):
    """
    Get a dataframe from a multiindex dataframe with missing data
    """
    df = multiindex_dataframe_incomplete
    mapper = DataFrameMapper([([c], Imputer()) for c in df.columns],
                             df_out=True)
    transformed = mapper.fit_transform(df)
    assert len(transformed) == len(multiindex_dataframe_incomplete)
    for c in df.columns:
        assert len(transformed[str(c)]) == len(df[c])
def test_default_false():
    """
    If default=False, columns that are not explicitly selected are discarded.
    """
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]})
    mapper = DataFrameMapper([
        ('b', None)
    ], default=False)

    transformed = mapper.fit_transform(df)
    assert transformed.shape == (3, 1)
def test_fit_transform_equiv_mock(simple_dataframe):
    """
    Check for equivalent results for code paths fit_transform
    versus fit and transform in DataFrameMapper using the mock
    transformer which does not implement a custom fit_transform.
    """
    df = simple_dataframe
    mapper = DataFrameMapper([('a', MockXTransformer())])
    transformed_combined = mapper.fit_transform(df)
    transformed_separate = mapper.fit(df).transform(df)
    assert np.all(transformed_combined == transformed_separate)
def test_onehot_df():
    """
    Check level ids from one-hot
    """
    df = pd.DataFrame({'target': [0, 0, 1, 1, 2, 3, 0]})
    mapper = DataFrameMapper([(['target'], OneHotEncoder())], df_out=True)
    transformed = mapper.fit_transform(df)
    cols = transformed.columns
    assert len(cols) == 4
    assert cols[0] == 'target_0'
    assert cols[3] == 'target_3'
def test_default_none():
    """
    If default=None, columns that are not explicitly selected are passed
    through untransformed.
    """
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]})
    mapper = DataFrameMapper([
        (['a'], OneHotEncoder())
    ], default=None)

    transformed = mapper.fit_transform(df)
    assert (transformed[:, 3] == np.array([3, 5, 7]).T).all()
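# A compact sketch of the `default` behaviours exercised by the tests above,
# using the same toy dataframe:
import pandas as pd
from sklearn_pandas import DataFrameMapper

frame = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]})
kept = DataFrameMapper([('a', None)], default=None).fit_transform(frame)
print(kept.shape)     # (3, 2) -- 'b' passed through untransformed
dropped = DataFrameMapper([('a', None)], default=False).fit_transform(frame)
print(dropped.shape)  # (3, 1) -- 'b' discarded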
def test_preserve_df_index():
    """
    The index is preserved when df_out=True
    """
    df = pd.DataFrame({'target': [1, 2, 3]},
                      index=['a', 'b', 'c'])
    mapper = DataFrameMapper([('target', None)],
                             df_out=True)

    transformed = mapper.fit_transform(df)

    assert_array_equal(transformed.index, df.index)
def test_customtransform_df():
    """
    Check level ids from a transformer in which
    the number of classes is not equal to the number of output columns.
    """
    df = pd.DataFrame({'target': [6, 5, 7, 5, 4, 8, 8]})
    mapper = DataFrameMapper([(['target'], CustomTransformer())], df_out=True)
    transformed = mapper.fit_transform(df)
    cols = transformed.columns
    assert len(mapper.features[0][1].classes_) == 5
    assert len(cols) == 1
    assert cols[0] == 'target'
def test_binarizer_int_df():
    """
    Check level names from LabelBinarizer for a numeric array.
    """
    df = pd.DataFrame({'target': [5, 5, 6, 6, 7, 5]})
    mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True)
    transformed = mapper.fit_transform(df)
    cols = transformed.columns
    assert len(cols) == 3
    assert cols[0] == 'target_5'
    assert cols[1] == 'target_6'
    assert cols[2] == 'target_7'
def test_binarizer_df():
    """
    Check level names from LabelBinarizer
    """
    df = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'c', 'a']})
    mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True)
    transformed = mapper.fit_transform(df)
    cols = transformed.columns
    assert len(cols) == 3
    assert cols[0] == 'target_a'
    assert cols[1] == 'target_b'
    assert cols[2] == 'target_c'
def test_local_input_df_date_encoder():
    """
    When input_df is True we can apply a transformer that only works
    with pandas dataframes like a DateEncoder
    """
    df = pd.DataFrame(
        {'dates': pd.date_range('2015-10-30', '2015-11-02')})
    mapper = DataFrameMapper([
        ('dates', DateEncoder(), {'input_df': True})
    ], input_df=False)
    out = mapper.fit_transform(df)
    expected = np.array([
        [2015, 10, 30],
        [2015, 10, 31],
        [2015, 11, 1],
        [2015, 11, 2]
    ])
    assert_array_equal(out, expected)
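# DateEncoder is not defined in this listing. A minimal sketch consistent with the
# expected output above (year, month, day columns); it assumes a pandas Series as
# input, which is why the feature sets input_df=True:
import pandas as pd

class DateEncoder(object):
    """Expand a datetime Series into year, month and day columns."""
    def fit(self, X):
        return self

    def transform(self, X):
        dt = X.dt
        return pd.concat([dt.year, dt.month, dt.day], axis=1)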
def test_fit_with_required_y_arg(complex_dataframe):
    """
    Transformers with a required y argument in the fit method
    are handled and perform correctly
    """
    df = complex_dataframe
    mapper = DataFrameMapper([(['feat1', 'feat2'], SelectKBest(chi2, k=1))])

    # fit, doesn't fail
    ft_arr = mapper.fit(df[['feat1', 'feat2']], df['target'])

    # fit_transform
    ft_arr = mapper.fit_transform(df[['feat1', 'feat2']], df['target'])
    assert_array_equal(ft_arr, df[['feat1']].values)

    # transform
    t_arr = mapper.transform(df[['feat1', 'feat2']])
    assert_array_equal(t_arr, df[['feat1']].values)
def preprocess_train(train):
    train_y = train.Survived.values

    mapper = DataFrameMapper([
        ('Pclass', preprocessing.LabelBinarizer()),
        ('Sex', preprocessing.LabelBinarizer()),
        ('Age', None),
        ('SibSp', preprocessing.Binarizer()),
        ('Parch', preprocessing.Binarizer()),
        ('Embarked', preprocessing.LabelBinarizer()),
        ('Fare', None),
    ])
    train_X = mapper.fit_transform(train)

    imputer = preprocessing.Imputer(strategy='mean')
    train_X = imputer.fit_transform(train_X)

    return train_X, train_y, mapper, imputer
def test_input_df_true_next_transformers(simple_dataframe, monkeypatch):
    """
    If input_df is True, the subsequent transformers get passed pandas
    objects instead of numpy arrays (given the previous transformers
    output pandas objects as well)
    """
    df = simple_dataframe
    monkeypatch.setattr(MockTClassifier, 'fit', Mock())
    monkeypatch.setattr(MockTClassifier, 'transform',
                        Mock(return_value=pd.Series([1, 2, 3])))
    mapper = DataFrameMapper(
        [('a', [MockXTransformer(), MockTClassifier()])], input_df=True)
    out = mapper.fit_transform(df)

    args, _ = MockTClassifier().fit.call_args
    assert isinstance(args[0], pd.Series)

    assert_array_equal(out, np.array([1, 2, 3]).reshape(-1, 1))
def test_list_transformers():
    """
    Specifying a list of transformers applies them sequentially to the
    selected column.
    """
    dataframe = pd.DataFrame({"a": [1, np.nan, 3], "b": [1, 5, 7]})

    mapper = DataFrameMapper([
        (["a"], [Imputer(), StandardScaler()]),
        (["b"], StandardScaler()),
    ])
    dmatrix = mapper.fit_transform(dataframe)

    assert pd.isnull(dmatrix).sum() == 0  # no null values

    # all features have mean 0 and std deviation 1 (standardized)
    assert (abs(dmatrix.mean(axis=0) - 0) <= 1e-6).all()
    assert (abs(dmatrix.std(axis=0) - 1) <= 1e-6).all()
def testOptimizeBestModel():
    X, y = make_classification(n_samples=100, n_features=20, n_informative=2)
    Xall = pd.concat(
        [pd.DataFrame(X), pd.DataFrame(y, columns=['target'])], axis=1)
    conti_ftr = list(range(20))
    datamapper = DataFrameMapper([(conti_ftr, [
        ContinuousDomain(invalid_value_treatment='as_is',
                         missing_value_treatment='as_mean'),
        Imputer()
    ])],
                                 df_out=True)
    X_ = datamapper.fit_transform(Xall)
    lgb = BinaryClassifier("LightGBM")
    bestskopt, trace = lgb.optimizeBestModel(Xall,
                                             datamapper=datamapper,
                                             target='target',
                                             search_alg="GP",
                                             n_calls=10)
    def readDataset(self):

        train_df = pd.read_csv(self.trainFile)
        test_df = pd.read_csv(self.testFile)

        #print(train_df.columns)
        #print(train_df.head())
        #print(test_df.columns)
        self.test_index = test_df.Id
        train_df = train_df.astype(float)
        test_df = test_df.astype(float)
        #print(train_df.iloc[0].values)
        mapper = DataFrameMapper([
            ([
                'Elevation', 'Aspect', 'Slope',
                'Horizontal_Distance_To_Hydrology',
                'Vertical_Distance_To_Hydrology',
                'Horizontal_Distance_To_Roadways', 'Hillshade_9am',
                'Hillshade_Noon', 'Hillshade_3pm',
                'Horizontal_Distance_To_Fire_Points'
            ], MinMaxScaler()),
            ([
                'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
                'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
                'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7',
                'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11',
                'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15',
                'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19',
                'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23',
                'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27',
                'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31',
                'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35',
                'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39',
                'Soil_Type40'
            ], None)
        ])

        self.X_train = mapper.fit_transform(train_df)
        # print(X_train[0:2,:])

        self.y_train = train_df.Cover_Type.values
        # print(y_train[0:10])

        self.X_test = mapper.transform(test_df)
def data_simple_imputer(data_train,
                        numeric_feature,
                        category_feature,
                        numeric_strategy='mean',
                        category_strategy='most_frequent',
                        data_test=None):
    '''
    Simple missing-value imputation with DataFrameMapper: specify the numerical and
    categorical variables and an imputation strategy for each.
    data_train: training set to transform
    numeric_feature: numerical variables to process
    category_feature: categorical variables to process
    numeric_strategy: imputation strategy for numerical variables, defaults to the mean
    category_strategy: imputation strategy for categorical variables, defaults to the most frequent value
    data_test: test set to transform; optional, and no transformation is done when it is not given

    return:
    X_train_imputed: the imputed training data
    miss_transfer: the fitted DataFrameMapper
    X_test_imputed: the imputed test data, returned only when a test set is given
    '''
    print('Starting missing-value imputation'.center(50, '='))
    ## Take the feature lists out of the dict
    print('Number of categorical features', len(category_feature))
    print('Number of numerical features', len(numeric_feature))
    ## Impute the numerical and categorical columns with the specified strategies
    miss_transfer = DataFrameMapper([
        (numeric_feature, [SimpleImputer(strategy=numeric_strategy)]),
        (category_feature, [SimpleImputer(strategy=category_strategy)])
    ])
    ## Fit and transform
    X_train_imputed = miss_transfer.fit_transform(data_train[numeric_feature +
                                                             category_feature])
    X_train_imputed = pd.DataFrame(X_train_imputed,
                                   columns=numeric_feature + category_feature)
    print('train_mapper done:', X_train_imputed.shape)
    ## If test data is given, transform it as well and return it
    if data_test is not None:
        X_test_imputed = miss_transfer.transform(data_test[numeric_feature +
                                                           category_feature])
        X_test_imputed = pd.DataFrame(X_test_imputed,
                                      columns=numeric_feature +
                                      category_feature)
        return X_train_imputed, miss_transfer, X_test_imputed
    return X_train_imputed, miss_transfer
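# A toy usage sketch for data_simple_imputer (assumes pandas, numpy and the
# SimpleImputer/DataFrameMapper imports used above are available):
import numpy as np
import pandas as pd

toy_train = pd.DataFrame({'age': [20.0, np.nan, 40.0],
                          'city': ['a', np.nan, 'a']})
toy_imputed, toy_mapper = data_simple_imputer(toy_train,
                                              numeric_feature=['age'],
                                              category_feature=['city'])
print(toy_imputed)  # 'age' filled with the mean, 'city' with the most frequent value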
def prepare_pseudobs_simu(df_train, y_train, df_test, name):
    """ Prepare the data for training
    The input data is formatted so that one line corresponds to one subject at a particular time point.
    # Arguments
        df_train: the entire training dataset (input + survival times + event status)
        y_train: the pseudo-values computed according to the method chosen.
        df_test: the entire test dataset (input + survival times + event status)
        name: the pseudo-value method; "pseudo_discrete" is handled separately below.
    # Returns
        x_train_all: input data with all input variables + time variable, one line per subject and time point.
        y_train_all: pseudo-values computed according to the method chosen.
        x_test_all: input data with all input variables + time variable, one line per subject and time point.
        y_test_all: survival time and event status.
        n_picktime: the number of time points at which the pseudo-observations are computed.
    """
    y_test_all = df_test[['yy','status']]
    n_picktime = int(y_train[['s']].apply(pd.Series.nunique))
    x_test = df_test.drop(['yy','status'], axis = 1)
    x_test_all = pd.concat([x_test]*n_picktime)
    time_test = pd.DataFrame(np.repeat(np.unique(y_train[['s']]),len(x_test)))
    x_test_all.reset_index(inplace=True, drop=True)
    x_test_all = pd.concat([x_test_all, time_test], axis = 1)

    if name!= "pseudo_discrete":
        x_train = df_train.drop(['yy','status'], axis = 1)
        x_train_all = pd.concat([x_train]*n_picktime)
        x_train_all.reset_index(inplace=True, drop=True)
        x_train_all = pd.concat([x_train_all, y_train[['s']]], axis = 1)
        y_train_all = y_train[['pseudost']]
    else:
        x_train = df_train.drop(['yy','status'], axis = 1)
        x_train['id'] = np.arange(len(x_train)) + 1
        x_train = x_train.merge(y_train, left_on='id', right_on='id')
        x_train_all = x_train.drop(['id','pseudost'], axis = 1)
        y_train_all = x_train['pseudost']
    # Data normalization
    col_list = list(x_train_all.columns)
    x_test_all.columns = col_list
    cols_standardize = [e for e in col_list]
    standardize = [([col], StandardScaler()) for col in cols_standardize]
    x_mapper = DataFrameMapper(standardize, df_out=True)
    x_train_all = x_mapper.fit_transform(x_train_all).astype('float32')
    x_test_all = x_mapper.transform(x_test_all).astype('float32')
    
    return(x_train_all, y_train_all, x_test_all, y_test_all, n_picktime)
def test_input_df_true_first_transformer(simple_dataframe, monkeypatch):
    """
    If input_df is True, the first transformer is passed
    a pd.Series instead of an np.array
    """
    df = simple_dataframe
    monkeypatch.setattr(MockXTransformer, 'fit', Mock())
    monkeypatch.setattr(MockXTransformer, 'transform',
                        Mock(return_value=np.array([1, 2, 3])))
    mapper = DataFrameMapper([('a', MockXTransformer())], input_df=True)
    out = mapper.fit_transform(df)

    args, _ = MockXTransformer().fit.call_args
    assert isinstance(args[0], pd.Series)

    args, _ = MockXTransformer().transform.call_args
    assert isinstance(args[0], pd.Series)

    assert_array_equal(out, np.array([1, 2, 3]).reshape(-1, 1))
def init_xgb_mapper(df, xgb_pickle=RealMasterServiceConfig.XGB_MAPPER):
    mapper = DataFrameMapper([
        ('A_c', sklearn.preprocessing.LabelBinarizer()),
        ('Bsmt1_out', sklearn.preprocessing.LabelBinarizer()),
        ('Community', sklearn.preprocessing.LabelBinarizer()),
        ('Gar_type', sklearn.preprocessing.LabelBinarizer()),
        ('Heating', sklearn.preprocessing.LabelBinarizer()),
        ('Pool', sklearn.preprocessing.LabelBinarizer()),
        ('Style', sklearn.preprocessing.LabelBinarizer()),
        ('Type_own1_out', sklearn.preprocessing.LabelBinarizer()),
        ('Den_fr', sklearn.preprocessing.LabelBinarizer()),
        (['Dom'], sklearn.preprocessing.StandardScaler()),
        (['Taxes'], sklearn.preprocessing.StandardScaler()),
        (['Area_code'], sklearn.preprocessing.StandardScaler()),
        (['Depth'], sklearn.preprocessing.StandardScaler()),
        (['Front_ft'], sklearn.preprocessing.StandardScaler()),
        (['Bath_tot'], sklearn.preprocessing.StandardScaler()),
        (['Br'], sklearn.preprocessing.StandardScaler()),
        (['Br_plus'], sklearn.preprocessing.StandardScaler()),
        (['Park_spcs'], sklearn.preprocessing.StandardScaler()),
        (['Kit_plus'], sklearn.preprocessing.StandardScaler()),
        (['Rms'], sklearn.preprocessing.StandardScaler()),
        (['Rooms_plus'], sklearn.preprocessing.StandardScaler()),
        (['Garage'], sklearn.preprocessing.StandardScaler()),
        (['lat'], sklearn.preprocessing.StandardScaler()),
        (['lng'], sklearn.preprocessing.StandardScaler()),
        (['Lp_dol'], sklearn.preprocessing.StandardScaler()),
        (constants.DISCRETE_ROOM_AREA, None),
        (constants.MONTH, None),
    ],
                             input_df=True)

    data_temp = np.round(mapper.fit_transform(df.copy()).astype(np.double), 3)

    if check_local_file_exist(xgb_pickle):
        os.remove(xgb_pickle)

    with open(xgb_pickle, "wb") as f:
        pickle.dump(mapper, f)
    print("Fitting: ", type(mapper))
    del data_temp
    gc.collect()
    return
def init_mapper(df, mapper_path):
    mapper = DataFrameMapper([
        (['start_lat'], sklearn.preprocessing.MinMaxScaler()),
        (['start_lng'], sklearn.preprocessing.MinMaxScaler()),
        (['end_lat'], sklearn.preprocessing.MinMaxScaler()),
        (['end_lng'], sklearn.preprocessing.MinMaxScaler()),
        (['month'], sklearn.preprocessing.MinMaxScaler()),
        (['day'], sklearn.preprocessing.MinMaxScaler()),
        (['weekday'], sklearn.preprocessing.MinMaxScaler()),
        (['time'], sklearn.preprocessing.MinMaxScaler()),
    ], df_out=True)

    data_mapper = np.round(mapper.fit_transform(df.copy()).astype(np.double), 3)
    if os.path.isfile(mapper_path):
        os.remove(mapper_path)
    with open(mapper_path, "wb") as f:
        pickle.dump(mapper, f)
    print("Fitting: ", type(mapper))
    return data_mapper
def test_preserve_df_index_rows_dropped():
    """
    If df_out=True but the original df index length doesn't
    match the number of final rows, use a numeric index
    """
    class DropLastRowTransformer(object):
        def fit(self, X):
            return self

        def transform(self, X):
            return X[:-1]

    df = pd.DataFrame({'target': [1, 2, 3]}, index=['a', 'b', 'c'])
    mapper = DataFrameMapper([('target', DropLastRowTransformer())],
                             df_out=True)

    transformed = mapper.fit_transform(df)

    assert_array_equal(transformed.index, np.array([0, 1]))
def preprocess_num(dataset_input, num_cols):

    numbers = num_cols
    categories = [col for col in list(dataset_input.columns) if col not in numbers][1:]
    means_dict = {}
    std_dict = {}
    
    #save means and std of fit
    for i in numbers:
        means_dict[i] = dataset_input.loc[:, i].mean()
        std_dict[i] = dataset_input.loc[:, i].std(ddof=0)
    
    mapper = DataFrameMapper(
            [('Date', None)] +
            [(category, None) for category in categories] +
            [(number, preprocessing.StandardScaler()) for number in numbers], df_out = True)
    
    transformedData = mapper.fit_transform(dataset_input)
    return transformedData, mapper, means_dict, std_dict
    def transform_data(self, df, runtime_label):
        df_features, df_labels = df, df.pop(runtime_label)

        # Define which features are going to be transformed to a range of 0 to 1 (continuous)
        nfeats = gen_features(
            columns=[[i] for i in list(df_features.select_dtypes(include=[float]))],
            classes=[sklearn.preprocessing.MinMaxScaler]  
        )

        # Define which features are going to be binarized (categorical)
        sfeats = gen_features(
            columns=list(df.select_dtypes(include=[object])),
            classes=[sklearn.preprocessing.LabelBinarizer]  
        )

        # Do the transformations defined above
        mapper = DataFrameMapper(nfeats + sfeats, df_out=True)
        df_features = mapper.fit_transform(df_features)

        return df_features, df_labels
 def scale(self, features_of_type='numerical', return_series=False):
     """
     Scales the numerical features in the dataset, unless 'features_of_type'
     selects a different subset.
     :param features_of_type: Subset selection primitive
     :return: the subset scaled.
     """
     assert features_of_type in self.meta_tags
     subset = self.select(features_of_type)
     mapper = DataFrameMapper([(subset.columns, StandardScaler())])
     scaled_features = mapper.fit_transform(subset.copy())
     self.features[self.names(features_of_type)] = pd.DataFrame(
         scaled_features,
         index=subset.index,
         columns=subset.columns)
     self.metainfo()
     if return_series is True:
         return self.features[self.names(features_of_type)]
     else:
         return self
def gen_feature_specific_df(df, pivotfeature, feature, sex=None):

    from sklearn.preprocessing import LabelBinarizer
    from sklearn_pandas import DataFrameMapper

    columns = [(pivotfeature, None), (feature, LabelBinarizer()),
               ('sex', None)]
    mapper = DataFrameMapper(columns, df_out=True)

    df = mapper.fit_transform(df.copy())
    cols = df.columns
    rename_map = {}
    for col in cols:
        rename_map[col] = col.replace(feature + "_", "")

    df = df.rename(columns=rename_map)
    if sex is not None:
        df = df[df['sex'] == sex]

    return df
 def fix_skewness(self, features_of_type='numerical', return_series=False):
     """
     Ensures that the numerical features in the dataset (or whichever subset
     'features_of_type' selects) fit a normal distribution by applying the
     Yeo-Johnson transform.
     :param features_of_type: Subset selection primitive
     :param return_series: Return the normalized series
     :return: the subset fitted to normal distribution.
     """
     assert features_of_type in self.meta_tags
     subset = self.select(features_of_type)
     mapper = DataFrameMapper([(subset.columns,
                                PowerTransformer(method='yeo-johnson',
                                                 standardize=False))])
     normed_features = mapper.fit_transform(subset.copy())
     self.features[self.names(features_of_type)] = pd.DataFrame(
         normed_features, index=subset.index, columns=subset.columns)
     self.update()
     if return_series is True:
         return self.features[self.names(features_of_type)]
def preprocess_train_data(x, tr):
    """
    Impute, binarize, scale an input dataframe. Save the transformation.
    
    :param pandas.DataFrame x: dataframe to preprocess
    :param dict tr: the transformation rules for preprocessing, mapping variable groups ('continuous_vars', 'categorical_vars', 'binary_vars') to column lists
    :return: the preprocessed dataframe xt and transformation details
    """
        
    from sklearn.preprocessing import StandardScaler, MinMaxScaler, Imputer
    from sklearn_pandas import DataFrameMapper
    
    map_instructions = list()
    
    if 'continuous_vars' in tr:
        map_instructions.extend([([v], [CatchAllNAN(), Imputer(strategy='mean'), StandardScaler()]) for v in tr['continuous_vars']])
    
    if 'categorical_vars' in tr:
        # map_instructions.extend([([v], [MapToStr(), LabelBinarizer()]) for v in tr['categorical_vars']])
        map_instructions.extend([([v], ToDummiesWrapper()) for v in tr['categorical_vars']])        
        
    if 'binary_vars' in tr:
        map_instructions.extend([([v], [CatchAllNAN(), Imputer(strategy='most_frequent'), MinMaxScaler()]) for v in tr['binary_vars']])
    
    mapper = DataFrameMapper(map_instructions)

    xt = mapper.fit_transform(x)
        
    # get column names
    column_names = list()
    for feature in mapper.features: 
        has_classes_flag = getattr(feature[1], "classes_", None)
        original_feature_name = feature[0][0]        
        if has_classes_flag is None:
            column_names.append(original_feature_name)
        else:              
            class_names = feature[1].classes_
            column_names.extend([original_feature_name+'_'+str(sub) for sub in class_names])       
        
    xt = pd.DataFrame(xt, columns=column_names)
    return xt, mapper, column_names
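# A hedged side note on the name-building loop above: sklearn-pandas itself exposes
# the expanded output names after fitting, which can often replace the manual
# reconstruction (names for custom transformers without `classes_` may differ,
# e.g. they get numeric suffixes):
#
#     xt = pd.DataFrame(mapper.fit_transform(x), columns=mapper.transformed_names_)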
    def _scale_data(self, data, dummy_cols):
        """
        Scale the dummy columns and add scaled numerical columns to the original dataframe.
        
        The function first loops through each numerical column and creates a copy of it with a '_scaled' suffix in the name.
        All of these created names are collected in an array. Afterwards, the dummy columns and the duplicated numerical
        columns are scaled and replaced in the original dataframe.
        
        Parameters
        -------
        data: DataFrame
            DataFrame with the data.
        dummy_cols: array
            Array of names of dummy columns created from passed categorical columns.
        
        Returns
        -------
        data: DataFrame
            DataFrame with all the loaded columns extended by dummy encoded and scaled categorical columns and scaled numerical columns
        scaled_cols: array
            Array of names of the scaled numerical columns.
        scaler_mapper: obj
            Fitted standard scaler, wrapped in a DataFrameMapper so that its output is a dataframe instead of a numpy array.
            
        """

        scaled_cols = []
        for col in self.num_cols:
            column = str(col + '_scaled')
            scaled_cols.append(column)
            data[column] = data[col]
        df = data[dummy_cols + scaled_cols]
        scaler_mapper = DataFrameMapper([(df.columns, StandardScaler())])
        scaled_features = scaler_mapper.fit_transform(df.copy())
        data[dummy_cols + scaled_cols] = pd.DataFrame(scaled_features,
                                                      index=df.index,
                                                      columns=df.columns)

        return data, scaled_cols, scaler_mapper
def cluster(data_frame, file_wr):
    global num_clusters
    f = file_wr
    file_str = ""
    preprocessors = [
        preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=1)
    ]

    data_columns = list(data_frame.columns.values)
    time_stmp = str(data_frame['timestamp'].iloc[0])

    data_columns.remove("timestamp")
    data_data_mapper = DataFrameMapper([(data_columns, preprocessors)])
    geo_data = data_data_mapper.fit_transform(data_frame)

    pred = KMeansCluster()

    pred.fit(geo_data)

    cluster_array = {
        i: geo_data[np.where(pred.model.labels_ == i)]
        for i in range(num_clusters)
    }
    radious_list = []

    for i in range(num_clusters):
        file_str += "['" + time_stmp + "','"
        cluster_val = cluster_array[i]
        cluster_centre = pred.model.cluster_centers_[i]
        distance_arr = []
        for point in cluster_val:
            distance_arr.append(global_distance(cluster_centre, point))
        radious_list.append(get_percentile_threshold(0.8, distance_arr))
        file_str += str(cluster_centre[0]) + "','" + str(
            cluster_centre[1]) + "',"
        file_str += str(radious_list[i]) + ","
        file_str += str(i) + "],"

    file_wr.write(file_str)
    def test_scikit_pandas(self):
        from sklearn_pandas import DataFrameMapper

        df = pandas.DataFrame({
            'feat1': [1, 2, 3, 4, 5, 6],
            'feat2': [1.0, 2.0, 3.0, 2.0, 3.0, 4.0]
        })

        mapper = DataFrameMapper([(['feat1', 'feat2'], StandardScaler()),
                                  (['feat1', 'feat2'], MinMaxScaler())])
        df2 = mapper.fit_transform(df)

        try:
            model_onnx = convert_sklearn(
                mapper,
                'predictable_tsne',
                [('input', FloatTensorType([1, df.shape[1]]))],
                custom_shape_calculators={
                    DataFrameMapper: dataframe_mapper_shape_calculator
                })
        except RuntimeError as e:
            assert "DataFrameMapper has no associated parser." in str(e)
def DataPreprocessing():
    #-------importing the csv file and storing it as a new df
    A = pd.read_excel('expdata.xlsx', encoding='latin-1')

    #-----------Tokenizing the subjectline and storing in a separate column
    A['Tokens'] = lp.stemmingTokenize(A.EmailSubject)

    #--------------calculating the count of words and storing in a separate column
    A['wordcount'] = list(map(lp.countwords, A.EmailSubject))

    #-----------------calculating open rate and naming it as targetopenrate
    A['target_open_rate'] = A['Email_Opened'] / A['EmailDelivered']

    #--------------------Area to operate :
    list_words = list(itertools.chain.from_iterable(A.loc[:, 'Tokens']))
    set_words = set(list_words)
    len(set_words)
    #f=lp.getWordFrequency(list_words)
    #word_features = list(f.keys())[:200]
    #>>> Work on getting only the top n frequent words

    #-----------------------creating word vectors

    mapper = DataFrameMapper([
        ('wordcount', None),
        ('EmailSubject',
         TfidfVectorizer(sublinear_tf=True,
                         strip_accents='unicode',
                         analyzer='word',
                         token_pattern=r'\w{1,}',
                         stop_words='english',
                         ngram_range=(1, 1),
                         max_features=2000)),
    ])
    X = mapper.fit_transform(A.copy())
    #X,y split
    y = A['target_open_rate']
    return (X, y, A)
    def test_whole(self):
        # Create datatype
        datatype = Categorical()

        # Load observations
        observations = lib.load_mushroom()

        # Transform observations
        mapper = DataFrameMapper(
            [(['cap-shape'], datatype.default_transformation_pipeline)],
            df_out=True)
        transformed_df = mapper.fit_transform(observations)

        # Create network
        input_layer, input_nub = datatype.input_nub_generator(
            'cap-shape', transformed_df)
        output_nub = datatype.output_nub_generator('cap-shape', transformed_df)

        x = input_nub
        x = output_nub(x)

        model = Model(input_layer, x)
        model.compile(optimizer='adam', loss=datatype.output_suggested_loss())
def _create_apply_transformers(df):
    from sklearn_pandas import DataFrameMapper
    import category_encoders as ce

    data_raw = df

    obj_cols = data_raw.select_dtypes("object").columns.to_list()

    from sklearn_pandas import gen_features

    feature_def = gen_features(
        columns=obj_cols,
        classes=[{
            "class": ce.OrdinalEncoder,
            "handle_unknown": "return_nan",
            "handle_missing": "return_nan"
        }],
    )

    mapper = DataFrameMapper(feature_def, default=None, df_out=True)

    data_transformed = mapper.fit_transform(data_raw)
    return data_transformed, mapper
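# A toy usage sketch for _create_apply_transformers (assumes pandas and the
# category_encoders package imported inside the function are installed):
import pandas as pd

raw = pd.DataFrame({'colour': ['red', 'blue', 'red'],
                    'size': [1.0, 2.5, 3.0]})
encoded, fitted_mapper = _create_apply_transformers(raw)
print(encoded.head())  # 'colour' ordinal-encoded, 'size' kept by default=None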
def generate_multi_term(data):
    """
    Input: raw dataset (DataFrame)
    Output: dataset containing the pairwise interaction terms of all variables (DataFrame)
    """
    mapper = DataFrameMapper([(list(data.columns), [
        preprocessing.PolynomialFeatures(degree=2,
                                         interaction_only=True,
                                         include_bias=False)
    ])])
    # Generate the interaction terms
    data_tr = mapper.fit_transform(data.copy())

    # Build the column names of the interaction terms
    names_tr = []

    for name in mapper.transformed_names_:
        names = map(int, re.findall(r'\d+', name))
        name_tr = ''
        for i in names:
            name_tr = name_tr + list(data.columns)[i] + '*'
        names_tr.append(name_tr[:-1])
    return pd.DataFrame(data_tr, columns=names_tr)
def run(country):

    global MODEL
    global CURR_DATE

    # initialize
    init(country)
    cities = get_cities()
    # print(cities)
    news = get_cleaned_news(cities)
    sentiments = get_sentiment(cities)

    # df = pd.DataFrame({'news': news, 'sentiment_score': sentiments})

    # src_last = DATASET['src'][-1*len(cities):]
    # print(src_last)

    # df_last = DATASET[-1*len(cities):]

    mapper = DataFrameMapper([
        ('news', TfidfVectorizer()),
        ('sentiment_score', None),
        # ('src', None)
    ])

    # today_records = DATASET.index[DATASET['src'].startswith('{}'.format(CURR_DATE))].tolist()
    # print('{}'.format(CURR_DATE))
    df = DATASET[DATASET.src.str.startswith('{}'.format(CURR_DATE))]
    print(df)

    # print(DATASET.describe(include='all'))

    X = mapper.fit_transform(DATASET)
    # X_live = X[X.src.str.startswith('{}'.format(CURR_DATE))]
    # X_live.drop(['src'], axis=1, inplace=True)

    print(MODEL.predict(X))
def make_scaling_mapper(dumpf, scaling_mapper):
    df = pd.read_json(dumpf)

    mapper = DataFrameMapper([
        (['goodfaith_scores_mean'], sklearn.preprocessing.StandardScaler()),
        (['goodfaith_scores_var'], sklearn.preprocessing.StandardScaler()),
        (['goodfaith_scores_max'], sklearn.preprocessing.StandardScaler()),
        (['goodfaith_scores_min'], sklearn.preprocessing.StandardScaler()),
        (['goodfaith_scores_reg_slope'],
         sklearn.preprocessing.StandardScaler()),
        (['goodfaith_scores_reg_intercept'],
         sklearn.preprocessing.StandardScaler()),
        (['goodfaith_scores_count'], sklearn.preprocessing.StandardScaler()),
        (['goodfaith_scores_count_log'],
         sklearn.preprocessing.StandardScaler()),
        (['goodfaith_timestamps_total_seconds'],
         sklearn.preprocessing.StandardScaler()),
        (['goodfaith_timestamps_variance'],
         sklearn.preprocessing.StandardScaler()),
        (['goodfaith_timestamps_min'], sklearn.preprocessing.StandardScaler()),
        (['goodfaith_timestamps_max'], sklearn.preprocessing.StandardScaler()),
        (['self_reverts'], sklearn.preprocessing.StandardScaler()),
        (['edit_wars'], sklearn.preprocessing.StandardScaler()),
        (['pages_unique_count'], sklearn.preprocessing.StandardScaler()),
        (['pages_namespace_count'], sklearn.preprocessing.StandardScaler()),
        (['pages_nonmain_count'], sklearn.preprocessing.StandardScaler()),
        (['pages_talk_count'], sklearn.preprocessing.StandardScaler()),
        (['singleton_session'], sklearn.preprocessing.StandardScaler()),
        (['first_edit_ores_goodfaith'],
         sklearn.preprocessing.StandardScaler()),
        (['first_edit_ores_damaging'], sklearn.preprocessing.StandardScaler()),
        (['any_edit_ores_goodfaith'], sklearn.preprocessing.StandardScaler()),
        (['any_edit_ores_damaging'], sklearn.preprocessing.StandardScaler()),
    ])

    data = mapper.fit_transform(df.copy())
    pickle.dump(mapper, open(scaling_mapper, 'wb'))
def full():
    df = pd.read_csv("data/train.csv", dtype={'StateHoliday': str})
    mapper_data = DataFrameMapper([
        # (['Store'], preprocessing.LabelBinarizer()),
        (['Customers'], preprocessing.StandardScaler())
    ])
    np_target = np.asarray(df['Sales'])
    np_data = mapper_data.fit_transform(df.copy())

    # reduce computation time
    np_target = np_target[:10000]
    np_data = np_data[:10000]

    X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(
        np_data, np_target, test_size=0.1, random_state=0)
    print("train size: ", X_train.shape)
    print("test size: ", X_test.shape)

    print("train data: ", X_train[:2])
    print("target data: ", y_train[:2])

    # clf = sklearn.linear_model.SGDClassifier(alpha=0.001, n_iter=20).fit(X_train, y_train)
    clf = sklearn.svm.SVC(C=1).fit(X_train, y_train)
    print(clf.score(X_test, y_test))