Code example #1
File: test_data.py Project: zulily/scikit-learn
def test_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X = np.ones(5)
    assert_array_equal(scale(X, with_mean=False), X)
Code example #2
def boston_DBSCAN(class_num=0):
    '''Return cluster class_num of the Boston housing dataset as standardized
    data columns.
    :parameter
    class_num: cluster label, as assigned by the DBSCAN clustering below
    :returns
    x_boston: independent variables of cluster class_num, standardized, 13 columns
    y_boston: dependent variable of cluster class_num, standardized, 1 column
    '''
    # Load the full dataset (note: load_boston was removed in scikit-learn 1.2)
    bostondata = load_boston()
    boston_X = bostondata.data
    boston_y = bostondata.target
    boston_full = np.c_[boston_X, boston_y]
    # Standardize the full dataset
    scale = StandardScaler()
    boston_full = scale.fit_transform(boston_full)
    # Reduce to 3 dimensions to make visual parameter tuning easier
    pca = PCA(n_components=3)
    boston_full3 = pca.fit_transform(boston_full)
    # Cluster
    clt = DBSCAN(eps=0.8, min_samples=5, n_jobs=4)
    label3 = clt.fit_predict(X=boston_full3)
    # Select the requested cluster and split the 13 features from the target
    group_boston = boston_full[label3 == class_num]
    x_boston = group_boston[:, 0:-1]
    y_boston = group_boston[:, -1]
    return x_boston, y_boston
Code example #3
    def normalize_features(self, scaler: StandardScaler = None) \
            -> StandardScaler:
        '''
        Normalizes the features of the dataset using a StandardScaler
        (subtract mean, divide by standard deviation).

        If a scaler is provided, uses that scaler to perform the normalization.
        Otherwise fits a scaler to the features in the dataset and then
        performs the normalization.

        :param scaler: A fitted StandardScaler. Used if provided.
        Otherwise a StandardScaler is fit on this dataset and is then used.
        :return: A fitted StandardScaler. If a scaler is provided, this is the
        same scaler. Otherwise, this is a scaler fit on this dataset.
        '''
        if not self.data or not self.data[0].features:
            return None

        if scaler is None:
            # Fit a new scaler on this dataset's features; a provided scaler
            # is assumed to be fitted already and is used as-is.
            scaler = StandardScaler()
            features = np.vstack([d.features for d in self.data])
            scaler.fit(features)

        for d in self.data:
            d.set_features(scaler.transform(d.features.reshape(1, -1))[0])

        return scaler
Code example #4
File: train.py Project: scnakandala/hummingbird
    def fit(self, data, args):
        self.model = StandardScaler()

        with Timer() as t:
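            # StandardScaler.fit accepts y only for scikit-learn API
            # compatibility; the targets are ignored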
            self.model.fit(data.X_train, data.y_train)

        return t.interval
Code example #5
File: kaggle_homework.py Project: xuerenlv/PaperWork
def read_file():
    file_content = pd.read_csv('train.csv')
    exc_cols = [u'Id', u'Response']
    cols = [c for c in file_content.columns if c not in exc_cols]
    train_datas = file_content.loc[:, cols]
    train_labels = file_content['Response'].values

    test_file = pd.read_csv('test.csv')
    test_ids = test_file['Id'].values
    test_datas = test_file.loc[:, [c for c in test_file.columns if c not in [u'Id']]]

    # Fill missing values with the sentinel -1
    test_datas = test_datas.fillna(-1)
    train_datas = train_datas.fillna(-1)
    all_datas = pd.concat([train_datas, test_datas], axis=0)

    # Split the columns into categorical and numeric variables
    categoricalVariables = ["Product_Info_1", "Product_Info_2", "Product_Info_3", "Product_Info_5", "Product_Info_6", "Product_Info_7", "Employment_Info_2", "Employment_Info_3", "Employment_Info_5", "InsuredInfo_1", "InsuredInfo_2", "InsuredInfo_3", "InsuredInfo_4", "InsuredInfo_5", "InsuredInfo_6", "InsuredInfo_7", "Insurance_History_1", "Insurance_History_2", "Insurance_History_3", "Insurance_History_4", "Insurance_History_7", "Insurance_History_8", "Insurance_History_9", "Family_Hist_1", "Medical_History_2", "Medical_History_3", "Medical_History_4", "Medical_History_5", "Medical_History_6", "Medical_History_7", "Medical_History_8", "Medical_History_9", "Medical_History_10", "Medical_History_11", "Medical_History_12", "Medical_History_13", "Medical_History_14", "Medical_History_16", "Medical_History_17", "Medical_History_18", "Medical_History_19", "Medical_History_20", "Medical_History_21", "Medical_History_22", "Medical_History_23", "Medical_History_25", "Medical_History_26", "Medical_History_27", "Medical_History_28", "Medical_History_29", "Medical_History_30", "Medical_History_31", "Medical_History_33", "Medical_History_34", "Medical_History_35", "Medical_History_36", "Medical_History_37", "Medical_History_38", "Medical_History_39", "Medical_History_40", "Medical_History_41"]
    all_file_data = all_datas.loc[:, [c for c in all_datas.columns if c not in categoricalVariables]]
    all_file_cate = all_datas.loc[:, [c for c in categoricalVariables]]

    # Standardize the numeric columns; assign the result back, since
    # fit_transform returns a new array rather than modifying the frame
    scaler_this = StandardScaler()
    all_file_data.loc[:, :] = scaler_this.fit_transform(all_file_data)

    # Recombine the numeric and categorical parts
    train_datas = pd.concat([all_file_data[:train_datas.shape[0]], all_file_cate[:train_datas.shape[0]]], axis=1)
    test_datas = pd.concat([all_file_data[file_content.shape[0]:], all_file_cate[file_content.shape[0]:]], axis=1)

    # Vectorize; fit on the training rows and reuse the same vectorizer on the
    # test rows so both share one feature space
    vectorizer = DictVectorizer()
    train_datas = vectorizer.fit_transform(train_datas.to_dict(orient='records')).toarray()
    test_datas = vectorizer.transform(test_datas.to_dict(orient='records')).toarray()

    return (train_datas, train_labels, test_ids, test_datas)
Code example #6
def load_UCI_Credit_Card_data(infile=None, balanced=True, seed=5):

    X = []
    y = []
    sids = []

    with open(infile, "r") as fi:
        fi.readline()
        reader = csv.reader(fi)
        for row in reader:
            sids.append(row[0])
            X.append(row[1:-1])
            y0 = int(row[-1])
            if y0 == 0:
                y0 = -1
            y.append(y0)
    y = np.array(y)

    if balanced:
        X, y = balance_X_y(X, y, seed)

    X = np.array(X, dtype=np.float32)
    y = np.array(y, dtype=np.float32)

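    # categorical_features assumes columns 1-3 are categorical; the argument
    # was deprecated in scikit-learn 0.20 and later removed (newer code would
    # use a ColumnTransformer instead)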
    encoder = OneHotEncoder(categorical_features=[1, 2, 3])
    encoder.fit(X)
    X = encoder.transform(X).toarray()

    X, y = shuffle_X_y(X, y, seed)

    scale_model = StandardScaler()
    X = scale_model.fit_transform(X)

    return X, np.expand_dims(y, axis=1)
Code example #7
    def prepare_time_data(data):
        data_scaler = StandardScaler()
        data_concat = np.concatenate(data, axis=0)
        data_scaler.fit(data_concat)
        new_data = [data_scaler.transform(data_) for data_ in data]

        return data_scaler, new_data
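
A minimal, self-contained sketch of the pattern above, with illustrative names not taken from the original project: fit one scaler on all timesteps pooled across sequences, then transform each sequence with the shared statistics.

import numpy as np
from sklearn.preprocessing import StandardScaler

sequences = [np.random.randn(10, 3), np.random.randn(7, 3)]  # variable-length sequences, same feature count
scaler = StandardScaler().fit(np.concatenate(sequences, axis=0))  # pool all timesteps to fit
scaled = [scaler.transform(seq) for seq in sequences]  # per-sequence transform, shared statistics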
Code example #8
def get_data(args, logger, debug):
    '''Get data.'''
    # Get data:
    train_data, val_data, test_data = _get_data(args, logger)

    debug(f'train size = {len(train_data):,} | val size = {len(val_data):,} |'
          f' test size = {len(test_data):,}')

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(args.data_df)
        debug('Class sizes')
        debug(class_sizes)

    # Scale features:
    if args.features_scaling:
        features_scaler = train_data.normalize_features()
        val_data.normalize_features(features_scaler)
        test_data.normalize_features(features_scaler)
    else:
        features_scaler = None

    # Initialise scaler and scale training targets by subtracting the mean and
    # dividing by the standard deviation (regression only):
    if args.dataset_type == 'regression':
        debug('Fitting scaler')
        scaler = StandardScaler()
        targets = scaler.fit_transform(train_data.targets())
        train_data.set_targets(targets)
    else:
        scaler = None

    return train_data, val_data, test_data, scaler, features_scaler
Code example #9
File: dataset.py Project: dennerepin/StochNetV2
    def _create_scaler(self, positivity):
        self.scaler_positivity = positivity
        if positivity is True:
            eps = 1e-9
            self._scaler = MinMaxScaler(feature_range=(eps, 1))
        else:
            self._scaler = StandardScaler()
        self.scaler_is_fitted = False
Code example #10
    def __stdScaler(self):
        all_cols = list(self.data_df.columns.values)
        for col in all_cols:
            if col not in self.non_numeric_cols and col != 'time_to_failure':
                stdScaler = StandardScaler()
                stdScaler.fit(self.data_df[[col]])
                self.data_df[col] = stdScaler.transform(self.data_df[[col]])
        print('Standard Scaler applied ... ')
Code example #11
def imputeAndScale(X_train, X_test):
    # Imputer was removed in scikit-learn 0.22; sklearn.impute.SimpleImputer
    # is the modern replacement
    imp = Imputer()
    X_train = imp.fit_transform(X_train)
    X_test = imp.transform(X_test)

    scaler = StandardScaler().fit(X_train)
    X_test = scaler.transform(X_test)
    X_train = scaler.transform(X_train)

    return X_train, X_test
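
For reference, a minimal modern equivalent of the function above (a sketch, not from the original project; it assumes scikit-learn >= 0.22, where SimpleImputer replaced Imputer):

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Chain imputation and scaling; fit on the training split only and reuse the
# fitted statistics on the test split.
pipe = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
X_train = pipe.fit_transform(np.array([[1.0, np.nan], [3.0, 4.0]]))
X_test = pipe.transform(np.array([[np.nan, 2.0]]))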
Code example #12
File: analyze_data.py Project: testing32/bimbo
def test_scalar():
    from sklearn.preprocessing import MinMaxScaler, StandardScaler
    scaler = StandardScaler()

    training = pd.read_csv(TRAIN_FEATURES_CSV, nrows=200000)
    test = pd.read_csv(TEST_FEATURES_CSV)

    # normalize the values column by column; scikit-learn expects 2D input,
    # hence the double brackets
    for column in TOTAL_TRAINING_FEATURE_COLUMNS:
        training[column] = scaler.fit_transform(training[[column]]).ravel()
        test[column] = scaler.transform(test[[column]]).ravel()
Code example #13
def make_models(X, y, y_bin):
    return dict(ols=LinearRegression().fit(X, y),
                lr_bin=LogisticRegression().fit(X, y_bin),
                lr_ovr=LogisticRegression(multi_class='ovr').fit(X, y),
                lr_mn=LogisticRegression(solver='lbfgs',
                                         multi_class='multinomial').fit(X, y),
                svc=SVC(kernel='linear').fit(X, y_bin),
                svr=SVR(kernel='linear').fit(X, y),
                dtc=DecisionTreeClassifier(max_depth=4).fit(X, y),
                dtr=DecisionTreeRegressor(max_depth=4).fit(X, y),
                rfc=RandomForestClassifier(n_estimators=3,
                                           max_depth=3,
                                           random_state=1).fit(X, y),
                rfr=RandomForestRegressor(n_estimators=3,
                                          max_depth=3,
                                          random_state=1).fit(X, y),
                gbc=GradientBoostingClassifier(n_estimators=3,
                                               max_depth=3,
                                               random_state=1).fit(X, y),
                gbr=GradientBoostingRegressor(n_estimators=3,
                                              max_depth=3,
                                              random_state=1).fit(X, y),
                abc=AdaBoostClassifier(algorithm='SAMME',
                                       n_estimators=3,
                                       random_state=1).fit(X, y),
                abc2=AdaBoostClassifier(algorithm='SAMME.R',
                                        n_estimators=3,
                                        random_state=1).fit(X, y),
                abc3=AdaBoostClassifier(algorithm='SAMME',
                                        n_estimators=3,
                                        random_state=1).fit(X, y_bin),
                abc4=AdaBoostClassifier(algorithm='SAMME.R',
                                        n_estimators=3,
                                        random_state=1).fit(X, y_bin),
                km=KMeans(1).fit(X),
                km2=KMeans(5).fit(X),
                pc1=PCA(1).fit(X),
                pc2=PCA(2).fit(X),
                pc3=PCA(2, whiten=True).fit(X),
                mlr1=MLPRegressor([2], 'relu').fit(X, y),
                mlr2=MLPRegressor([2, 1], 'tanh').fit(X, y),
                mlr3=MLPRegressor([2, 2, 2], 'identity').fit(X, y),
                mlc=MLPClassifier([2, 2], 'tanh').fit(X, y),
                mlc_bin=MLPClassifier([2, 2], 'identity').fit(X, y_bin),
                bin=Binarizer(0.5),
                mms=MinMaxScaler().fit(X),
                mas=MaxAbsScaler().fit(X),
                ss1=StandardScaler().fit(X),
                ss2=StandardScaler(with_mean=False).fit(X),
                ss3=StandardScaler(with_std=False).fit(X),
                n1=Normalizer('l1'),
                n2=Normalizer('l2'),
                n3=Normalizer('max'))
Code example #14
def retrieve_data(undersampling=False, ratio=1, random_state=None):
    ## Getting and reading csv-data files into a pandas dataframe
    path = os.path.dirname(os.path.realpath(__file__))
    file1 = path + "/../data/creditcard_part1.csv"
    file2 = path + "/../data/creditcard_part2.csv"

    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)
    df = pd.concat((df1, df2), ignore_index=True)

    ## Finding the class balances
    class_counts = df.Class.value_counts()
    num_fraudulent = class_counts[1]
    num_non_fraudulent = class_counts[0]

    ## Splitting the dataset into design matrix X and targets y
    X = df.loc[:, df.columns != 'Class'].values
    y = df.loc[:, df.columns == 'Class'].values.ravel()

    ## Standardize the design matrix to zero mean and unit variance
    standard_scaler = StandardScaler()
    X = standard_scaler.fit_transform(X)

    ### Undersampling to fix imbalanced class
    if undersampling:

        if random_state is not None:
            np.random.seed(random_state)

        if ratio > 1:
            raise ValueError("Undersampling ratio can't be larger than one")

        multiplier = int(1.0 / ratio)

        ## Randomized undersampling method
        indices_nonfraud = np.where(y == 0)[0]
        indices_fraud = np.where(y == 1)[0]
        np.random.shuffle(indices_nonfraud)
        indices_nonfraud_under = indices_nonfraud[:multiplier * num_fraudulent]
        indices_under = np.concatenate((indices_fraud, indices_nonfraud_under))
        np.random.shuffle(indices_under)

        ## Using indices from undersampling method to create new balanced dataset
        X_under = X[indices_under]
        y_under = y[indices_under]
    else:
        ## No undersampling requested; use the full dataset
        X_under, y_under = X, y

    ## Splitting the dataset into test and training sets
    X_train, X_test, y_train, y_test = train_test_split(X_under,
                                                        y_under,
                                                        test_size=0.33,
                                                        random_state=4)
    return X_train, X_test, y_train, y_test
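
Note that the scaler above is fitted on the full dataset before the train/test split, which leaks test statistics into training; a leak-free variant (a sketch reusing X and y from the function above) fits the scaler on the training split only:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=4)
scaler = StandardScaler().fit(X_train)  # statistics from the training split only
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)       # apply, never refit, on the test split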
Code example #15
def NUS_WIDE_load_two_party_data(data_dir,
                                 selected_labels,
                                 neg_label=-1,
                                 n_samples=-1):
    print("# load_two_party_data")

    Xa, Xb, y = get_labeled_data_with_2_party(data_dir=data_dir,
                                              selected_labels=selected_labels,
                                              n_samples=n_samples)

    scale_model = StandardScaler()
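    # fit_transform refits the scaler on each call, so Xa and Xb are each
    # standardized with their own statistics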
    Xa = scale_model.fit_transform(Xa)
    Xb = scale_model.fit_transform(Xb)

    y_ = []
    pos_count = 0
    neg_count = 0
    for i in range(y.shape[0]):
        # treat the first label in y as the positive class and all other
        # labels as the negative class
        if y[i, 0] == 1:
            y_.append(1)
            pos_count += 1
        else:
            y_.append(neg_label)
            neg_count += 1

    print("pos counts:", pos_count)
    print("neg counts:", neg_count)

    y = np.expand_dims(y_, axis=1)

    print("Xa shape:", Xa.shape)
    print("Xb shape:", Xb.shape)
    print("y shape:", y.shape)

    n_train = int(0.8 * Xa.shape[0])
    print("# of train samples:", n_train)
    # print("# of test samples:", n_test)

    Xa_train, Xb_train = Xa[:n_train], Xb[:n_train]
    Xa_test, Xb_test = Xa[n_train:], Xb[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]

    print("Xa_train.shape:", Xa_train.shape)
    print("Xb_train.shape:", Xb_train.shape)
    print("Xa_test.shape:", Xa_test.shape)
    print("Xb_test.shape:", Xb_test.shape)
    print("y_train.shape:", y_train.shape)
    print("y_test.shape:", y_test.shape)
    return [Xa_train, Xb_train, y_train], [Xa_test, Xb_test, y_test]
Code example #16
File: test_data.py Project: zulily/scikit-learn
def test_scaler_without_copy():
    """Check that StandardScaler.fit does not change input"""
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)

    X_copy = X.copy()
    StandardScaler(copy=False).fit(X)
    assert_array_equal(X, X_copy)

    X_csr_copy = X_csr.copy()
    StandardScaler(with_mean=False, copy=False).fit(X_csr)
    assert_array_equal(X_csr.toarray(), X_csr_copy.toarray())
Code example #17
File: test_data.py Project: zulily/scikit-learn
def test_scale_sparse_with_mean_raise_exception():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X_csr = sparse.csr_matrix(X)

    # check scaling and fit with direct calls on sparse data
    assert_raises(ValueError, scale, X_csr, with_mean=True)
    assert_raises(ValueError, StandardScaler(with_mean=True).fit, X_csr)

    # check transform and inverse_transform after a fit on a dense array
    scaler = StandardScaler(with_mean=True).fit(X)
    assert_raises(ValueError, scaler.transform, X_csr)

    X_transformed_csr = sparse.csr_matrix(scaler.transform(X))
    assert_raises(ValueError, scaler.inverse_transform, X_transformed_csr)
Code example #18
    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self
Code example #19
File: model.py Project: guancodes/palmtree
def make_model(classifier, **params):
    pipeline = Pipeline([
        ('feature_extractor', FeatureExtractor()),
        ('scaler', StandardScaler()),
        ('model', classifier(**params)),
    ])
    return pipeline
Code example #20
File: metamodels.py Project: stbalduin/memobuilder
    def fit(self, x_train, y_train):
        self.processing_steps = [StandardScaler()]
        svr = SVR(kernel='rbf', gamma=0.1)

        # http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
        # C = [2**i for i in np.arange(start=-5, stop=16, step=2)]
        # gamma = [2**i for i in np.arange(start=-15, stop=4, step=2)]
        # https://stats.stackexchange.com/questions/43943/
        # which-search-range-for-determining-svm-optimal-c-
        # and-gamma-parameters

        # These grids are unused: the randomized search below samples C and
        # gamma from continuous distributions instead.
        # C = [2**i for i in [-3, -2, -1, 0, 1, 2, 3, 4, 5]]
        # gamma = [2**i for i in [-5, -4, -3, -2, -1, 0, 1, 2, 3]]

        params = {"C": sp_uniform(0.125, 32), "gamma": sp_uniform(0.03125, 8)}
        params.update(self.kwargs)

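        # note: the iid argument was deprecated in scikit-learn 0.22 and
        # removed in 0.24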
        reg = RandomizedSearchCV(estimator=svr,
                                 param_distributions=params,
                                 n_iter=10,
                                 scoring=self.score['function'],
                                 cv=3,
                                 iid=True)

        clf = MultiOutputRegressor(reg)
        self._update_pipeline_and_fit(x_train, y_train, [clf])
Code example #21
File: test_data.py Project: zulily/scikit-learn
def test_fit_transform():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    for obj in (StandardScaler(), Normalizer(), Binarizer()):
        X_transformed = obj.fit(X).transform(X)
        X_transformed2 = obj.fit_transform(X)
        assert_array_equal(X_transformed, X_transformed2)
Code example #22
File: metamodels.py Project: stbalduin/memobuilder
    def fit(self, x_train, y_train):

        self.processing_steps = [StandardScaler()]

        ann = MLPRegressor()
        params = {
            'hidden_layer_sizes': sp_randint(20, 150),
            'alpha': sp_uniform(0, 100),
            'max_iter': sp_randint(100, 2000),
            'solver': ['lbfgs'],
            # 'identity', 'logistic', 'tanh', 'relu'
            'activation': ['relu']
        }

        if 'hidden_layer_sizes' in self.kwargs:
            self.kwargs['hidden_layer_sizes'] = self.parsefunction(
                self.kwargs['hidden_layer_sizes'])

        params.update(self.kwargs)
        clf = RandomizedSearchCV(estimator=ann,
                                 param_distributions=params,
                                 n_iter=10,
                                 scoring=self.score['function'],
                                 cv=3,
                                 iid=True)

        self._update_pipeline_and_fit(x_train, y_train, [clf])
Code example #23
def main():
    args = parse()
    n_rollout = args.nrollout
    n_epoch = args.epoch
    savename = args.savename if args.savename is not None else 'model-' + str(
        n_rollout) + 'unroll'

    np.random.seed(1098)
    path = args.filename
    names = ['target_pos', 'target_speed', 'pos', 'vel', 'effort']
    with h5py.File(path, 'r') as f:
        (target_pos, target_speed, pos, vel,
         effort) = [[np.array(val) for val in f[name].values()]
                    for name in names]

    x_target = np.array(target_pos)
    x_first = np.array([pos_[0] for pos_ in pos])
    x_speed = np.array(target_speed).reshape((-1, 1))
    aux_output = [np.ones(eff.shape[0]).reshape((-1, 1)) for eff in effort]

    x = np.concatenate((x_target, x_first, x_speed), axis=1)

    input_scaler = StandardScaler()
    x = input_scaler.fit_transform(x)
    output_scaler = StandardScaler()
    effort_concat = np.concatenate([a for a in effort], axis=0)
    output_scaler.fit(effort_concat)
    effort = [output_scaler.transform(eff) for eff in effort]

    y = pad_sequences(effort, padding='post', value=0.)
    aux_output = pad_sequences(aux_output, padding='post', value=0.)
    x, x_test, y, y_test, y_aux, y_aux_test = train_test_split(x,
                                                               y,
                                                               aux_output,
                                                               test_size=0.2)

    y_mask, y_test_mask = [this_y[:, :, 0] for this_y in (y_aux, y_aux_test)]
    y_aux_mask, y_aux_test_mask = [
        np.ones(this_y.shape[:2]) for this_y in (y_aux, y_aux_test)
    ]

    model = MyModel(train=[x, [y, y_aux]],
                    val=[x_test, [y_test, y_aux_test]],
                    train_mask=[y_mask, y_aux_mask],
                    val_mask=[y_test_mask, y_aux_test_mask],
                    max_unroll=n_rollout,
                    name=savename)

    if not os.path.exists('save'):
        os.makedirs('save')

    if args.train:
        model.fit(nb_epoch=n_epoch, batch_size=32)
    elif args.resume:
        model.resume(nb_epoch=n_epoch, batch_size=32)
Code example #24
def test_theanets_regression():
    check_regression(
        TheanetsRegressor(layers=[3],
                          trainers=[dict(algo='rmsprop', **impatient)]),
        **regressor_params)
    check_regression(
        TheanetsRegressor(scaler=StandardScaler(),
                          trainers=[dict(algo='rmsprop', **impatient)]),
        **regressor_params)
Code example #25
def load_scalers(path: str) -> Tuple[StandardScaler, StandardScaler]:
    '''
    Loads the scalers a model was trained with.

    :param path: Path where model checkpoint is saved.
    :return: A tuple with the data scaler and the features scaler.
    '''
    state = torch.load(path, map_location=lambda storage, loc: storage)
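    # Note: this StandardScaler is the project's own class, reconstructed from
    # saved means and stds; it is not sklearn's StandardScaler, whose
    # constructor takes no such arguments.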

    scaler = StandardScaler(state['data_scaler']['means'],
                            state['data_scaler']['stds']) \
        if state['data_scaler'] else None

    features_scaler = StandardScaler(state['features_scaler']['means'],
                                     state['features_scaler']['stds']) \
        if state['features_scaler'] else None

    return scaler, features_scaler
Code example #26
def test_theanets_regression():
    check_regression(
        TheanetsRegressor(layers=[20],
                          trainers=[{
                              'optimize': 'rmsprop',
                              'min_improvement': 0.1
                          }]), **regressor_params)
    check_regression(TheanetsRegressor(scaler=StandardScaler()),
                     **regressor_params)
Code example #27
File: standard_scaler.py Project: ozgurgul/lale
class StandardScalerImpl:
    def __init__(self, copy=True, with_mean=True, with_std=True):
        self._hyperparams = {
            'copy': copy,
            'with_mean': with_mean,
            'with_std': with_std
        }
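        # Op is the wrapped estimator; in lale's standard_scaler.py it is
        # presumably an alias for sklearn.preprocessing.StandardScaler.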
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
Code example #28
File: test_theanets.py Project: yhaddad/rep
def test_theanets_regression():
    check_regression(
        TheanetsRegressor(layers=[3],
                          trainers=[{
                              'algo': 'rmsprop',
                              'learning_rate': 0.1
                          }]), **regressor_params)
    check_regression(TheanetsRegressor(scaler=StandardScaler()),
                     **regressor_params)
Code example #29
File: test_data.py Project: zkuncheva/scikit-learn
def test_warning_scaling_integers():
    # Check warning when scaling integer data
    X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)

    w = "Data with input dtype uint8 was converted to float64"

    clean_warning_registry()
    assert_warns_message(DataConversionWarning, w, scale, X)
    assert_warns_message(DataConversionWarning, w, StandardScaler().fit, X)
    assert_warns_message(DataConversionWarning, w, MinMaxScaler().fit, X)
Code example #30
File: test_data.py Project: youngstone/scikit-learn
def test_warning_scaling_integers():
    """Check warning when scaling integer data"""
    X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)

    w = "assumes floating point values as input, got uint8"

    clean_warning_registry()
    assert_warns_message(UserWarning, w, scale, X)
    assert_warns_message(UserWarning, w, StandardScaler().fit, X)
    assert_warns_message(UserWarning, w, MinMaxScaler().fit, X)
Code example #31
File: test_data.py Project: zulily/scikit-learn
def test_warning_scaling_integers():
    """Check warning when scaling integer data"""
    X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)

    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        assert_warns(UserWarning, StandardScaler().fit, X)

    with warnings.catch_warnings(record=True):
        warnings.simplefilter("always")
        assert_warns(UserWarning, MinMaxScaler().fit, X)
Code example #32
def test_center_kernel():
    """Test that KernelCenterer is equivalent to StandardScaler
       in feature space"""
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    scaler = StandardScaler(with_std=False)
    scaler.fit(X_fit)
    X_fit_centered = scaler.transform(X_fit)
    K_fit = np.dot(X_fit, X_fit.T)

    # center fit time matrix
    centerer = KernelCenterer()
    K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T)
    K_fit_centered2 = centerer.fit_transform(K_fit)
    assert_array_almost_equal(K_fit_centered, K_fit_centered2)

    # center predict time matrix
    X_pred = rng.random_sample((2, 4))
    K_pred = np.dot(X_pred, X_fit.T)
    X_pred_centered = scaler.transform(X_pred)
    K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T)
    K_pred_centered2 = centerer.transform(K_pred)
    assert_array_almost_equal(K_pred_centered, K_pred_centered2)
Code example #33
def test_scaler_2d_arrays():
    """Test scaling of 2d array along first axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has been copied
    assert_true(X_scaled is not X)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_scaled = scale(X, axis=1, with_std=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    X_scaled = scale(X, axis=1, with_std=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # Check that the data hasn't been modified
    assert_true(X_scaled is not X)

    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is X)

    X = rng.randn(4, 5)
    X[:, 0] = 1.0  # first feature is a constant, non-zero feature
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has been copied
    assert_true(X_scaled is not X)
Code example #34
def test_scaler_without_centering():
    rng = np.random.RandomState(42)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)
    X_csc = sparse.csc_matrix(X)

    assert_raises(ValueError, StandardScaler().fit, X_csr)

    null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
    X_null = null_transform.fit_transform(X_csr)
    assert_array_equal(X_null.data, X_csr.data)
    X_orig = null_transform.inverse_transform(X_null)
    assert_array_equal(X_orig.data, X_csr.data)

    scaler = StandardScaler(with_mean=False).fit(X)
    X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
    X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    scaler_csc = StandardScaler(with_mean=False).fit(X_csc)
    X_csc_scaled = scaler_csr.transform(X_csc, copy=True)
    assert_false(np.any(np.isnan(X_csc_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_equal(scaler.mean_, scaler_csc.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csc.std_)

    assert_array_almost_equal(
        X_scaled.mean(axis=0), [0., -0.01, 2.24, -0.35, -0.78], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

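    # mean_variance_axis0 returns (mean, variance); comparing against
    # std(axis=0) still holds because each scaled feature has variance 0 or 1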
    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)

    X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc())
    assert_true(X_csc_scaled_back is not X_csc)
    assert_true(X_csc_scaled_back is not X_csc_scaled)
    assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
Code example #35
File: ranking.py Project: lefterav/qualitative
class SkRanker(Ranker, SkLearner):
    '''
    Basic ranker wrapping scikit-learn functions
    '''
    
    def train(self, dataset_filename, 
              scale=True, 
              feature_selector=None, 
              feature_selection_params={},
              feature_selection_threshold=.25, 
              learning_params={}, 
              optimize=True, 
              optimization_params={}, 
              scorers=['f1_score'],
              attribute_set=None,
              class_name=None,
              metaresults_prefix="./0-",
              **kwargs):
        
        plot_filename = "{}{}".format(metaresults_prefix, "featureselection.pdf")
        data, labels = dataset_to_instances(dataset_filename, attribute_set, class_name,  **kwargs)
        learner = self.learner
        
        #the class must remember the attribute_set and the class_name in order to reproduce the vectors
        self.attribute_set = attribute_set
        self.class_name = class_name

 
        #scale data to the mean
        if scale:
            log.info("Scaling datasets...")
            log.debug("Data shape before scaling: {}".format(data.shape))
            self.scaler = StandardScaler()
            data = self.scaler.fit_transform(data)
            log.debug("Data shape after scaling: {}".format(data.shape))
            log.debug("Mean: {} , Std: {}".format(self.scaler.mean_, self.scaler.std_))

        #avoid any NaNs and Infs that may have occurred due to the scaling
        data = np.nan_to_num(data)
        
        #feature selection
        if isinstance(feature_selection_params, basestring):
            feature_selection_params = eval(feature_selection_params)
        self.featureselector, data, metadata = self.run_feature_selection(data, labels, feature_selector, feature_selection_params, feature_selection_threshold, plot_filename) 
        
        #initialize learning method and scoring functions and optimize
        self.learner, self.scorers = self.initialize_learning_method(learner, data, labels, learning_params, optimize, optimization_params, scorers)

        log.info("Data shape before fitting: {}".format(data.shape))

        self.learner.fit(data, labels)
        self.fit = True
        return metadata
    
    def get_model_description(self):
        params = {}
        
        if self.scaler:
            params = self.scaler.get_params(deep=True)
        try: #these are for SVC
            if self.learner.kernel == "rbf":
                params["gamma"] = self.learner.gamma
                params["C"] = self.learner.C
                for i, n_support in enumerate(self.learner.n_support_):
                    params["n_{}".format(i)] = n_support
                log.debug(len(self.learner.dual_coef_))
                return params
            elif self.learner.kernel == "linear":
                coefficients = self.learner.coef_
                att_coefficients = {}
                for attname, coeff in zip(self.attribute_set.get_names_pairwise(), coefficients[0]):
                    att_coefficients[attname] = coeff
                return att_coefficients
        except AttributeError:
            pass
        try: #adaboost etc
            params = self.learner.get_params()
            numeric_params = OrderedDict()
            for key, value in params.iteritems():
                try:
                    value = float(value)
                except ValueError:
                    continue
                numeric_params[key] = value
            return numeric_params
        except:
            pass
        return {}
    
    
    def get_ranked_sentence(self, parallelsentence, critical_attribute="rank_predicted", 
                            new_rank_name="rank_hard", 
                            del_orig_class_att=False, 
                            bidirectional_pairs=False, 
                            ties=True,
                            reconstruct='hard'):
        """
        """
        if type(self.learner) == str:
            if self.classifier:
                self.learner = self.classifier
                # this is to provide backwards compatibility for old models
                # whose classes used different attribute names
                try:
                    self.learner._dual_coef_ = self.learner.dual_coef_
                    self.learner._intercept_ = self.learner.intercept_
                except AttributeError:
                    # it's ok if the model doesn't have these variables
                    pass

                try: # backwards compatibility for old LogisticRegression
                    try_classes = self.learner.classes_
                except AttributeError:
                    self.learner.classes_ = [-1, 1]

        #de-compose multiranked sentence into pairwise comparisons
        pairwise_parallelsentences = parallelsentence.get_pairwise_parallelsentences(bidirectional_pairs=bidirectional_pairs,
                                                                                     class_name=self.class_name,
                                                                                     ties=ties)        
        if len(parallelsentence.get_translations()) == 1:
            log.warning("Parallelsentence has only one target sentence")
            parallelsentence.tgt[0].add_attribute(new_rank_name, 1)
            return parallelsentence, {}
        elif len(parallelsentence.get_translations()) == 0:
            return parallelsentence, {}
        #list that will hold the pairwise parallel sentences including the learner's decision
        classified_pairwise_parallelsentences = []
        resultvector = {}
        
        for pairwise_parallelsentence in pairwise_parallelsentences:
            #convert pairwise parallel sentence into an orange instance
            instance = parallelsentence_to_instance(pairwise_parallelsentence, attribute_set=self.attribute_set)
            #scale data instance to mean, based on trained scaler
            if self.scaler:
                try:
                    instance = np.nan_to_num(instance)
                    instance = self.scaler.transform(instance)
                except ValueError as e:
                    log.error("Could not transform instance: {}, scikit replied: {}".format(instance, e))
                    #raise ValueError(e)
                    pass
            try:
                if self.featureselector:
                    instance = np.nan_to_num(instance)
                    instance = self.featureselector.transform(instance)
            except AttributeError:
                pass
            log.debug('Instance = {}'.format(instance)) 
            #make sure no NaN or inf appears in the instance
            instance = np.nan_to_num(instance)
            #run learner for this instance
            predicted_value = self.learner.predict(instance)
            try:
                distribution = dict(zip(self.learner.classes_, self.learner.predict_proba(instance)[0]))
            except AttributeError: 
                #if learner does not support per-class probability (e.g. LinearSVC) assign 0.5
                distribution = dict([(cl, 0.5) for cl in self.learner.classes_])
            log.debug("Distribution: {}".format(distribution))
            log.debug("Predicted value: {}".format(predicted_value))
            #even if we have a binary learner, it may be that it cannot decide between two classes
            #for us, this means a tie
            if not bidirectional_pairs and distribution and len(distribution)==2 and float(distribution[1])==0.5:
                predicted_value = 0
                distribution[predicted_value] = 0.5
                
            log.debug("{}, {}, {}".format(pairwise_parallelsentence.get_system_names(), predicted_value, distribution))
            
            
            #gather several metadata from the classification, which may be needed 
            resultvector.update({'systems' : pairwise_parallelsentence.get_system_names(),
                                 'value' : predicted_value,
                                 'distribution': distribution,
                                 'confidence': distribution[int(predicted_value)],
#                                 'instance' : instance,
                                 })
            
            #add the new predicted ranks as attributes of the new pairwise sentence
            pairwise_parallelsentence.add_attributes({"rank_predicted":predicted_value,
                                                       "prob_-1":distribution[-1],
                                                       "prob_1":distribution[1]
                                                       })
            
            classified_pairwise_parallelsentences.append(pairwise_parallelsentence)

        
        #gather all classified pairwise comparisons of into one parallel sentence again
        sentenceset = CompactPairwiseParallelSentenceSet(classified_pairwise_parallelsentences)
        if reconstruct == 'hard':
            log.debug("Applying hard reconstruction to produce rank {}".format(new_rank_name))
            ranked_sentence = sentenceset.get_multiranked_sentence(critical_attribute=critical_attribute, 
                                                               new_rank_name=new_rank_name, 
                                                               del_orig_class_att=del_orig_class_att)
        else:
            attribute1 = "prob_-1"
            attribute2 = "prob_1"
            log.debug("Applying soft reconstruction to produce rank {}".format(new_rank_name))
            try:
                ranked_sentence = sentenceset.get_multiranked_sentence_with_soft_ranks(attribute1, attribute2, 
                        critical_attribute, new_rank_name, normalize_ranking=False)
            except:
                raise ValueError("Sentenceset {} from {} caused exception".format(classified_pairwise_parallelsentences, parallelsentence))
        return ranked_sentence, resultvector
Code example #36
def test_scaler_int():
    # test that scaler converts integer input to floating
    # for both sparse and dense matrices
    rng = np.random.RandomState(42)
    X = rng.randint(20, size=(4, 5))
    X[:, 0] = 0  # first feature is always zero
    X_csr = sparse.csr_matrix(X)
    X_csc = sparse.csc_matrix(X)

    null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
    with warnings.catch_warnings(record=True):
        X_null = null_transform.fit_transform(X_csr)
    assert_array_equal(X_null.data, X_csr.data)
    X_orig = null_transform.inverse_transform(X_null)
    assert_array_equal(X_orig.data, X_csr.data)

    with warnings.catch_warnings(record=True):
        scaler = StandardScaler(with_mean=False).fit(X)
        X_scaled = scaler.transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    with warnings.catch_warnings(record=True):
        scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
        X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
    assert_false(np.any(np.isnan(X_csr_scaled.data)))

    with warnings.catch_warnings(record=True):
        scaler_csc = StandardScaler(with_mean=False).fit(X_csc)
        X_csc_scaled = scaler_csr.transform(X_csc, copy=True)
    assert_false(np.any(np.isnan(X_csc_scaled.data)))

    assert_equal(scaler.mean_, scaler_csr.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csr.std_)

    assert_equal(scaler.mean_, scaler_csc.mean_)
    assert_array_almost_equal(scaler.std_, scaler_csc.std_)

    assert_array_almost_equal(
        X_scaled.mean(axis=0),
        [0., 1.109, 1.856, 21., 1.559], 2)
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])

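    # np.float was deprecated in NumPy 1.20; plain float or np.float64 is the
    # modern spelling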
    X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis0(
        X_csr_scaled.astype(np.float))
    assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
    assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))

    # Check that X has not been modified (copy)
    assert_true(X_scaled is not X)
    assert_true(X_csr_scaled is not X_csr)

    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
    assert_true(X_csr_scaled_back is not X_csr)
    assert_true(X_csr_scaled_back is not X_csr_scaled)
    assert_array_almost_equal(X_csr_scaled_back.toarray(), X)

    X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc())
    assert_true(X_csc_scaled_back is not X_csc)
    assert_true(X_csc_scaled_back is not X_csc_scaled)
    assert_array_almost_equal(X_csc_scaled_back.toarray(), X)