Example #1
def get_data():

	df_train = pd.read_csv('data_original/aps_failure_training_set.csv')
	df_test = pd.read_csv('data_original/aps_failure_test_set.csv')
	# replace the 'na' missing-value markers with -1
	df_train.replace('na','-1', inplace=True)
	df_test.replace('na','-1', inplace=True)
	# categorical for label: 0: neg, 1: pos
	df_train['class'] = pd.Categorical(df_train['class']).codes
	df_test['class']  = pd.Categorical(df_test['class']).codes

	# split data into x and y
	Y_train = df_train['class'].copy(deep=True)
	X_train = df_train.copy(deep=True)
	X_train.drop(['class'], inplace=True, axis=1)

	Y_test = df_test['class'].copy(deep=True)
	X_test = df_test.copy(deep=True)
	X_test.drop(['class'], inplace=True, axis=1)

	# strings to float
	X_train = X_train.astype('float64')
	X_test  = X_test.astype('float64')

	# scale the dataset
	scaler = MaxAbsScaler()
	scaler.fit(X_train)
	X_train = scaler.transform(X_train)
	X_test  = scaler.transform(X_test)

	return X_train, Y_train, X_test, Y_test
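A minimal usage sketch (assuming the two CSV files referenced above exist and the snippet's implied imports, pandas and MaxAbsScaler, are in scope; the classifier is my choice, not part of the original example):

from sklearn.linear_model import LogisticRegression

X_train, Y_train, X_test, Y_test = get_data()
clf = LogisticRegression(max_iter=1000)      # any classifier works on the scaled features
clf.fit(X_train, Y_train)
print('test accuracy:', clf.score(X_test, Y_test))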
Example #2
def cv(model, x, y):
    errors = []
    kf = KFold(n_splits=10, shuffle=True)
    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        x_scaler = MaxAbsScaler()
        y_scaler = MaxAbsScaler()

        x_scaler.fit(x_train)
        y_scaler.fit(y_train)

        xx_train = x_scaler.transform(x_train)
        xx_test = x_scaler.transform(x_test)
        yy_train = y_scaler.transform(y_train)
        yy_test = y_scaler.transform(y_test)

        cv_model = sklearn.base.clone(model)
        cv_model.fit(xx_train, yy_train)

        yy_predicted = cv_model.predict(xx_test)

        error = math.sqrt(mean_squared_error(yy_test, yy_predicted))
        errors.append(error)
    return errors
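Because both x and y are indexed positionally and fed straight into MaxAbsScaler, they are expected to be 2-D NumPy arrays, with y shaped as a column vector. A hedged sketch with synthetic data (the Ridge model and the numbers are illustrative; the snippet's own imports, KFold, MaxAbsScaler, math, mean_squared_error and sklearn.base, are assumed to be in scope):

import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.default_rng(0)
x = rng.normal(size=(200, 5))
y = x @ rng.normal(size=(5, 1))              # shape (200, 1): a column vector
rmse_per_fold = cv(Ridge(), x, y)
print('mean RMSE over 10 folds:', np.mean(rmse_per_fold))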
Example #3
    def scale(self, X_train, X_test, type):
        if type == "StandardScaler":
            standardScaler = StandardScaler()
            standardScaler.fit(X_train)
            X_train = standardScaler.transform(X_train)
            X_test = standardScaler.transform(X_test)
            return X_train, X_test

        elif type == "MinMaxScaler":
            minMaxScaler = MinMaxScaler()
            minMaxScaler.fit(X_train)
            X_train = minMaxScaler.transform(X_train)
            X_test = minMaxScaler.transform(X_test)
            return X_train, X_test
        elif type == "MaxScaler":
            maxScaler = MaxAbsScaler()
            maxScaler.fit(X_train)
            X_train = maxScaler.transform(X_train)
            X_test = maxScaler.transform(X_test)
            return X_train, X_test

        elif type == "RobustScaler":
            robustScaler = RobustScaler()
            robustScaler.fit(X_train)
            X_train = robustScaler.transform(X_train)
            X_test = robustScaler.transform(X_test)
            return X_train, X_test
Example #4
def lasso(input_df, output, plot=False, normalize=False):
    print('LASSO #################')
    if normalize:
        transformer = MaxAbsScaler().fit(input_df.values)
        X = transformer.transform(input_df.values)
        transformer = MaxAbsScaler().fit(output.values.reshape(-1, 1))
        y = transformer.transform(output.values.reshape(-1, 1))
    else:
        X = input_df.values
        y = output.values.reshape(-1, 1)
    # defaults : tol=1e-4, max_iter=1000, 
#   reg = LassoCV(max_iter=8000, tol=1e-1)
    reg = LassoCV(max_iter=8000, tol=0.1)
    reg.fit(X, y.ravel())  # LassoCV expects a 1-D target
#   print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
#   print("Best score using built-in LassoCV: %f" %reg.score(X,y))
    coef = pd.Series(reg.coef_, index = input_df.columns)
    imp_coef = coef.sort_values()
#   print(imp_coef)
    if plot:
        matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
        imp_coef.plot(kind = "barh")
        plt.title("Feature importance using Lasso Model")
        plt.show()
    return coef
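A brief usage sketch with synthetic data (column names, coefficients and noise level are made up; only the call pattern matters, and the snippet's imports of pandas, LassoCV, MaxAbsScaler and matplotlib are assumed to be in scope):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.normal(size=(100, 4)), columns=list('abcd'))
target = pd.Series(2 * df['a'] - df['c'] + rng.normal(scale=0.1, size=100))
coefs = lasso(df, target, plot=False, normalize=True)
print(coefs[coefs != 0])                     # features LassoCV kept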
Example #5
def normalization_a():
    from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
    # interval scaling: rescales the data into the [0, 1] range
    minMaxScaler = MinMaxScaler().fit(data)
    minMaxScaler.transform(data)

    # max-abs scaling: rescales each feature into [-1, 1] by its maximum absolute value
    maxAbsScaler = MaxAbsScaler().fit(data)  # a distinct name avoids shadowing the MaxAbsScaler class
    maxAbsScaler.transform(data)
Example #6
def load_dataset(dataset):
    ret =  LOADERS[dataset]()
    X_tr, y_tr, X_te, y_te = ret
    transformer = MaxAbsScaler().fit(X_tr)
    X_tr = transformer.transform(X_tr)
    if X_te is not None:
        # note: a new scaler is fitted on the test split here, so train and test
        # are not scaled with the same per-feature maxima
        transformer = MaxAbsScaler().fit(X_te)
        X_te = transformer.transform(X_te)
    return X_tr, y_tr, X_te, y_te
def maxAbsScaler(train_x, test_x):
    scaler = MaxAbsScaler()
    scaler.fit(train_x.data)
    train_x.data = scaler.transform(train_x.data)

    if test_x is not None:
        test_x.data = scaler.transform(test_x.data)

    print(pd.DataFrame(train_x.data).describe())
    return train_x.data
Example #8
def run_classification(data, labels, test_idx, trees, c):
    All_scores = []
    length = len(data[0])
    #print len(data)
    total_AUPR_training = 0
    total_AUPR_testing = 0
    folds_AUPR = []
    folds_AUC = []
    folds_precision = []
    folds_recall = []
    folds_f1 = []
    for fold_data, test_idx_fold in zip(data, test_idx):
        train_idx_fold = []
        for idx in range(length):
            if idx not in test_idx_fold:
                train_idx_fold.append(idx)

        fold_data = np.array(fold_data)
        test_idx_fold = np.array(test_idx_fold)
        train_idx_fold = np.array(train_idx_fold)

        X_train, X_test = fold_data[train_idx_fold, ], fold_data[
            test_idx_fold, ]
        y_train, y_test = np.array(labels)[train_idx_fold], np.array(labels)[test_idx_fold]  # targets come from the labels argument

        max_abs_scaler = MaxAbsScaler()
        max_abs_scaler.fit(X_train)

        X_train_maxabs_transform = max_abs_scaler.transform(X_train)
        X_test_maxabs_transform = max_abs_scaler.transform(X_test)
        rf = RandomForestClassifier(n_estimators=trees,
                                    n_jobs=6,
                                    criterion=c,
                                    class_weight="balanced",
                                    random_state=1357)

        rf.fit(X_train_maxabs_transform, y_train)
        try:
            scores_training = rf.decision_function(X_train_maxabs_transform)
            scores_testing = rf.decision_function(X_test_maxabs_transform)
        except AttributeError:  # RandomForestClassifier has no decision_function
            scores_training = rf.predict_proba(X_train_maxabs_transform)[:, 1]
            scores_testing = rf.predict_proba(X_test_maxabs_transform)[:, 1]

        y_pred = rf.predict_proba(X_test_maxabs_transform)

        All_scores.append(scores_testing)

        rf_fpr, rf_tpr, rf_thr = roc_curve(y_test, scores_testing)

        auc_val = auc(rf_fpr, rf_tpr)
        print(y_test)

    return All_scores
Example #9
def normalize(train, test, val):
    start_time = datetime.datetime.now()

    scaler = MaxAbsScaler()
    #scaler = MinMaxScaler()
    #scaler = StandardScaler()
    X_train = scaler.fit_transform(train)
    X_val = scaler.transform(val)
    X_test = scaler.transform(test)

    end_time = datetime.datetime.now()
    print("normalization time taken - {}".format(end_time - start_time))
    return X_train, X_test, X_val
Example #10
def MaxAbs_Scaler(X_train, X_test):
    """
    Max-abs scaling: scale each feature by its maximum absolute value.
    :param X_train: array-like training data;
    :param X_test: array-like test data;
    :return: scaled training data and test data, and the fitted scaler
    """
    scaler = MaxAbsScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, scaler
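The fitted scaler is returned so that data arriving later can be put on the same scale as the training set; a short sketch with made-up numbers:

import numpy as np

X_train = np.array([[1.0, -2.0], [3.0, 4.0]])
X_test = np.array([[0.5, 8.0]])
X_train_s, X_test_s, scaler = MaxAbs_Scaler(X_train, X_test)
X_new_s = scaler.transform(np.array([[2.0, -1.0]]))   # reuse the training-set scale later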
def convert_to_timeseries(X_train, X_test, time_step, feature_num, scale=True):

    if scale:
        transformer = MaxAbsScaler().fit(X_train)
        X_train = transformer.transform(X_train)
        X_test = transformer.transform(X_test)

    X_train = X_train.reshape(X_train.shape[0], feature_num, time_step)
    X_train = np.array([X_train[x].T for x in range(X_train.shape[0])])
    X_test = X_test.reshape(X_test.shape[0], feature_num, time_step)
    X_test = np.array([X_test[x].T for x in range(X_test.shape[0])])

    return X_train, X_test
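The reshape above assumes every row stores feature_num blocks of time_step consecutive values, and the per-sample transpose turns each sample into a (time_step, feature_num) matrix. A small check with synthetic numbers (scale=False, so no scaler is involved):

import numpy as np

X_train = np.arange(12, dtype=float).reshape(2, 6)    # 2 samples, 2 features x 3 time steps
X_test = X_train.copy()
tr, te = convert_to_timeseries(X_train, X_test, time_step=3, feature_num=2, scale=False)
print(tr.shape)                                       # (2, 3, 2)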
def multiple_model_runner(df):

    X_train, X_test, y_train, y_test = preprocess(
        df, min_df=100, ngram_range=(1, 1),
        penalty="l2")  # **config_build_models

    scaling = MaxAbsScaler().fit(X_train)
    X_train_scaled = scaling.transform(X_train)
    X_test_scaled = scaling.transform(X_test)

    # Append classifier to preprocessing pipeline for three kinds of models
    # Leave all with default parameters for now - will fit hyperparameters for the best model after
    log_reg = Pipeline(steps=[('classifier',
                               LogisticRegression(class_weight='balanced'))])
    clf = Pipeline(steps=[('classifier',
                           DecisionTreeClassifier(class_weight='balanced'))])
    rd = Pipeline(steps=[('classifier',
                          RandomForestClassifier(class_weight='balanced'))])
    svm_class = Pipeline(steps=[('classifier',
                                 svm.LinearSVC(class_weight='balanced'))])
    knn = Pipeline(steps=[('classifier', KNeighborsClassifier(n_neighbors=4))])
    NN = Pipeline(steps=[('classifier',
                          MLPClassifier(solver='lbfgs',
                                        alpha=1e-5,
                                        hidden_layer_sizes=(5, 2),
                                        random_state=1))])

    # Test which model generally performs the best with default parameters
    model_list = [log_reg, clf, rd, svm_class, knn, NN]
    model_names = [
        'Logistic Regression', 'Decision Tree Classifier',
        'Random Forest Classifier', 'SVM', 'KNN', 'MLPC'
    ]
    accuracies = []
    f1s = []

    for i in range(0, len(model_list)):
        model = model_list[i]

        model.fit(X_train_scaled, y_train)
        preds = model.predict(X_test_scaled)

        accuracies.append(model.score(X_test_scaled, y_test))
        f1s.append(f1_score(y_test, preds, average='macro'))
        print(model_names[i])
        print(confusion_matrix(y_test, preds))

    comparison_table(model_names, accuracies, f1s)

    return
Example #13
def scaler_dummy(dataset,dataset_test):

    scaler_mm = MinMaxScaler() 
    scaler_ma = MaxAbsScaler()
    scaler_sd = StandardScaler()
    scaler_rb = RobustScaler()

    numerical = list(dataset.columns)
    data_transform_mm = pd.DataFrame(data = dataset)
    data_transform_ma = pd.DataFrame(data = dataset)
    data_transform_sd = pd.DataFrame(data = dataset)
    data_transform_rb = pd.DataFrame(data = dataset)

    data_transform_mm[numerical] = scaler_mm.fit_transform(dataset[numerical])
    data_transform_ma[numerical] = scaler_ma.fit_transform(dataset[numerical])
    data_transform_sd[numerical] = scaler_sd.fit_transform(dataset[numerical])
    data_transform_rb[numerical] = scaler_rb.fit_transform(dataset[numerical])
  #     scaler_mm.fit(dataset[numerical])
  #     scaler_ma.fit(dataset[numerical])
  #     scaler_sd.fit(dataset[numerical])
  #     scaler_rb.fit(dataset[numerical])

    data_transform_mm[numerical] = scaler_mm.transform(dataset[numerical])
    data_transform_ma[numerical] = scaler_ma.transform(dataset[numerical])
    data_transform_sd[numerical] = scaler_sd.transform(dataset[numerical])
    data_transform_rb[numerical] = scaler_rb.transform(dataset[numerical])

    ## get dummies
    features_final_mm = pd.get_dummies(data_transform_mm)
    features_final_ma = pd.get_dummies(data_transform_ma)
    features_final_sd = pd.get_dummies(data_transform_sd)
    features_final_rb = pd.get_dummies(data_transform_rb)

    numerical = list(dataset_test.columns)
    scaler_mm_fitted_test = scaler_mm.transform(dataset_test[numerical])
    scaler_ma_fitted_test = scaler_ma.transform(dataset_test[numerical])
    scaler_sd_fitted_test = scaler_sd.transform(dataset_test[numerical])
    scaler_rb_fitted_test = scaler_rb.transform(dataset_test[numerical])

    scaler_mm_fitted_test = pd.DataFrame(data = scaler_mm_fitted_test,columns=numerical)
    scaler_ma_fitted_test = pd.DataFrame(data = scaler_ma_fitted_test,columns=numerical)
    scaler_sd_fitted_test = pd.DataFrame(data = scaler_sd_fitted_test,columns=numerical)
    scaler_rb_fitted_test = pd.DataFrame(data = scaler_rb_fitted_test,columns=numerical)
    
    features_final_mmt = pd.get_dummies(scaler_mm_fitted_test)
    features_final_mat = pd.get_dummies(scaler_ma_fitted_test)
    features_final_sdt = pd.get_dummies(scaler_sd_fitted_test)
    features_final_rbt = pd.get_dummies(scaler_rb_fitted_test)        
    return features_final_mm, features_final_ma, features_final_sd, features_final_rb, features_final_mmt, features_final_mat, features_final_sdt, features_final_rbt
Example #14
def run_train_test():
    print('reading data...')
    with np.load('data/data.npz') as data:
        x_all, y_all = data['x'], data['y']

    print('transforming data...')
    y_all[y_all < 0] = 0

    # split train and test
    num_train = int(len(y_all) * 0.8)
    x_train, y_train = x_all[:num_train], y_all[:num_train]
    x_test, y_test = x_all[num_train:], y_all[num_train:]
    perm = np.random.permutation(num_train)
    x_train, y_train = x_train[perm], y_train[perm]

    # feature scaling
    scaler = MaxAbsScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    # to DMatrix
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dtest = xgb.DMatrix(x_test, label=y_test)

    print('fitting xgb...')
    param = dict(
        objective='binary:logistic',
        eval_metric=['error', 'auc'],
        max_depth=7,
        eta=0.3,
        # scale_pos_weight=np.sum(y_train == 0) / np.sum(y_train == 1),
    )
    print(param)
    bst = xgb.train(param,
                    dtrain,
                    num_boost_round=10,
                    evals=[(dtrain, 'train'), (dtest, 'test')])
    bst.save_model('data/xgb.model')

    print('validating...')
    bst = xgb.Booster()
    bst.load_model('data/xgb.model')
    y_pred_proba = bst.predict(dtest)
    y_pred = y_pred_proba.copy()
    y_pred[y_pred > 0.5] = 1
    y_pred[y_pred <= 0.5] = 0
    print('acc:', accuracy_score(y_test, y_pred))
    print('auc:', roc_auc_score(y_test, y_pred_proba))
def df_maxabsscale(df):
    from sklearn.preprocessing import MaxAbsScaler
    maxabs_scaler = MaxAbsScaler().fit(df)
    df = pd.DataFrame(maxabs_scaler.transform(df), columns=df.columns)
    print("DataSet MaxAbsScaled...")
    print(df.head())
    return df
Example #16
def normalize_data(dataframe, mode):
    if mode == 'abs':
        from sklearn.preprocessing import MaxAbsScaler
        max_abs = MaxAbsScaler(copy=True)  #save for retransform later
        max_abs.fit(dataframe)
        data_norm = max_abs.transform(dataframe)

        return data_norm, max_abs

    if mode == 'robust':
        from sklearn.preprocessing import RobustScaler
        robust = RobustScaler(copy=True)  #save for retransform later
        robust.fit(dataframe)
        data_norm = robust.transform(dataframe)

        return data_norm, robust

    if mode == 'min_max':
        from sklearn.preprocessing import MinMaxScaler
        minmax = MinMaxScaler(feature_range=(0, 1),
                              copy=True)  #save for retransform later
        minmax.fit(dataframe)
        data_norm = minmax.transform(dataframe)

        return data_norm, minmax
    if mode == 'std':
        from sklearn.preprocessing import StandardScaler
        stdscaler = StandardScaler(copy=True, with_mean=True, with_std=True)
        stdscaler.fit(dataframe)
        data_norm = stdscaler.transform(dataframe)

        return data_norm, stdscaler
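Each branch returns the fitted scaler alongside the transformed data so the scaling can be undone later; a short sketch with a synthetic DataFrame:

import pandas as pd

df = pd.DataFrame({'a': [1.0, -4.0, 2.0], 'b': [10.0, 0.0, 5.0]})
data_norm, scaler = normalize_data(df, mode='abs')
restored = scaler.inverse_transform(data_norm)        # back on the original scale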
Example #17
def scikit_clustering(number_of_clusters=3600):
    user_features = pickle.load(
        open(os.path.join(ROOT_DIR, 'users_feature.p'), 'rb'))
    users_features_vectors = list(user_features.values())
    users_dataset = np.array(users_features_vectors)
    df = pd.DataFrame(users_dataset)
    df[0] = df[0].astype('category')
    df[1] = df[1].astype('category')
    df[3] = df[3].astype('category')
    df[6] = df[6].astype('category')

    abs_scaler = MaxAbsScaler()
    abs_scaler.fit(df[[2, 4, 5, 7, 8, 9]])
    df[[2, 4, 5, 7, 8, 9]] = abs_scaler.transform(df[[2, 4, 5, 7, 8, 9]])
    print(df.iloc[:, [0]].dtypes[0])

    clustering = AgglomerativeClustering(n_clusters=number_of_clusters,
                                         affinity=gower.gower_matrix,
                                         linkage='complete').fit(df)

    result = clustering.labels_
    clustering_result = {}
    for i in range(len(result)):
        if result[i] in clustering_result:
            clustering_result[result[i]] += [users_features_vectors[i]]
        else:
            clustering_result[result[i]] = [users_features_vectors[i]]
    with open('users_vectors_clustering.p', 'wb') as file_to_write:
        pickle.dump(clustering_result, file_to_write)
Example #18
def scikit_clustering_ver2(number_of_clusters=3600):
    user_features = pickle.load(
        open(os.path.join(ROOT_DIR, 'users_feature.p'), 'rb'))
    users_features_vectors = list(user_features.values())
    users_dataset = np.array(users_features_vectors)
    df = pd.DataFrame(users_dataset)
    df[0] = df[0].astype('category')
    df[1] = df[1].astype('category')
    df[3] = df[3].astype('category')
    df[6] = df[6].astype('category')

    abs_scaler = MaxAbsScaler()
    abs_scaler.fit(df[[2, 4, 5, 7, 8, 9]])
    df[[2, 4, 5, 7, 8, 9]] = abs_scaler.transform(df[[2, 4, 5, 7, 8, 9]])

    clustering = KMeans(n_clusters=number_of_clusters, verbose=1).fit(df)

    result = clustering.labels_
    logging.info("result: {0}".format(result))
    clustering_result = {}
    for i in range(len(result)):
        if result[i] in clustering_result:
            clustering_result[result[i]] += [users_features_vectors[i]]
        else:
            clustering_result[result[i]] = [users_features_vectors[i]]
    with open('users_vectors_clustering.p', 'wb') as file_to_write:
        pickle.dump(clustering_result, file_to_write)
Example #19
def run_isolation_forest(file_path):
    """
    Anomaly-detection test on the flight data set. Note: the IsolationForest
    variant is commented out below; MultiOutputRegressor(RandomForestRegressor)
    is trained to reconstruct the scaled features instead.
    :param file_path: the path of the data set
    """

    features_list = ['Direction', 'Speed']
    df_train = pd.read_csv(f'{file_path}/without_anom.csv')

    df_train = df_train[features_list]

    scaler = MaxAbsScaler()

    X_train = scaler.fit_transform(df_train)

    random_model = MultiOutputRegressor(
        RandomForestRegressor(max_depth=2, max_features="sqrt"))

    # lab_enc = preprocessing.LabelEncoder()
    # training_scores_encoded = lab_enc.fit_transform(X_train)
    random_model.fit(X_train, X_train)
    pred = random_model.predict(X_train)
    # isolation_model = MultiOutputRegressor(IsolationForest()).fit(X_train)
    # pred = isolation_model.predict(X_train)
    test_path = "C:\\Users\\Yehuda Pashay\\Desktop\\fligth_data\\data_set\\test\\chicago_to_guadalajara\\down_attack"
    df_test = pd.read_csv(f'{test_path}/sensors_8.csv')
    df_test = df_test[features_list]

    X_test_scaled = scaler.transform(df_test)
    test_pred = random_model.predict(X_test_scaled)
    a = 4
def use_MaxAbsScaler():
    # use the largest absolute value per feature: rescales to the range -1 to 1
    x = [[1., -1., 5.], [2., 0., -5.], [0., 1., -10]]

    scaler = MaxAbsScaler()
    scaler.fit(x)
    print(scaler.transform(x))
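For the input above the per-column maximum absolute values are 2, 1 and 10, so the printed matrix should be (up to NumPy's formatting):

# [[ 0.5 -1.   0.5]
#  [ 1.   0.  -0.5]
#  [ 0.   1.  -1. ]]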
Example #21
    def _scaled(self):
        '''
        Feature normalization, using MaxAbsScaler.
        :return:
        '''
        print("----- Begin run scaled at %s -------" % current_time())
        train_scales = {}
        test_scales = {}
        self.scalers = {}
        for _type in self.types:
            if _type == 'type 1':
                train_last_index = 5  # last 5 columns: group_1/date_act/date_people/char_38/outcome
                test_last_index = 4  # last 4 columns: group_1/date_act/date_people/char_38
            else:
                train_last_index = 6  # last 6 columns: group_1/char_10_act/date_act/date_people/char_38/outcome
                test_last_index = 5  # last 5 columns: group_1/char_10_act/date_act/date_people/char_38

            scaler = MaxAbsScaler()
            train_array = self.train_datas[_type].toarray()
            train_front = train_array[:, :-train_last_index]
            train_mid = scaler.fit_transform(
                train_array[:, -train_last_index:-1])  # outcome does not need scaling
            train_end = train_array[:, -1].reshape((-1, 1))  # outcome
            train_scales[_type] = np.hstack(
                (train_front, train_mid, train_end))

            test_array = self.test_datas[_type].toarray()
            test_front = test_array[:, :-test_last_index]
            test_end = scaler.transform(test_array[:, -test_last_index:])
            test_scales[_type] = np.hstack((test_front, test_end))
            self.scalers[_type] = scaler
        self.train_datas = train_scales
        self.test_datas = test_scales
        print("----- End run scaled at %s -------" % current_time())
Example #22
class InputSequenceSharedData:
    def __init__(self, *, frac_train):
        print('InputSequenceSharedData: reading data..')
        with np.load('data/features.npz') as data:
            self.data_user_ids = data['user_id']
            self.num_rows = int(data['num_rows'])
            self.num_users = int(data['num_users'])
        with np.load('data/data.npz') as data:
            self.data_x, self.data_y = data['x'], data['y']

        print('InputSequenceSharedData: scaling features...')
        self.scalar = MaxAbsScaler().fit(self.data_x)
        self.data_x = self.scalar.transform(self.data_x)

        print('InputSequenceSharedData: building data structures...')
        self.num_train = int(self.num_rows * frac_train)
        self.train_user_rows = [[] for _ in range(self.num_users)]
        self.test_user_rows = [[] for _ in range(self.num_users)]
        for row in range(self.num_train):
            user_id = self.data_user_ids[row]
            self.train_user_rows[user_id].append(row)
        for row in range(self.num_train, self.num_rows):
            user_id = self.data_user_ids[row]
            self.test_user_rows[user_id].append(row)

        self.train_user_offsets, self.test_user_offsets = [0], [0]
        for user_id in range(self.num_users):
            self.train_user_offsets.append(self.train_user_offsets[-1] +
                                           len(self.train_user_rows[user_id]))
            self.test_user_offsets.append(self.test_user_offsets[-1] +
                                          len(self.test_user_rows[user_id]))
        self.train_user_offsets = self.train_user_offsets[1:]
        self.test_user_offsets = self.test_user_offsets[1:]

        print('InputSequenceSharedData: init done')
Example #23
class ScalingAdder(BaseEstimator, TransformerMixin):
    def _create_scaler(self, scaler):
        if scaler == 'std':
            self._sc = StandardScaler()
        if scaler == 'minmax':
            self._sc = MinMaxScaler()
        if scaler == 'maxabs':
            self._sc = MaxAbsScaler()

    def __init__(self, scaler=None):
        self.scaler = scaler
        self._create_scaler(scaler)

    def set_params(self, scaler=None, **parameters):
        self.scaler = scaler
        self._create_scaler(scaler)
        return self

    def get_params(self, **kwargs):
        return {"scaler": self.scaler}

    def transform(self, X, **transform_params):
        if self.scaler is None:
            return X
        if (X.shape[1] > 1):
            return np.hstack((X[:, :1], self._sc.transform(X[:, 1:])))
        return np.hstack((X[:, :1], np.zeros(shape=(X.shape[0], 1))))

    def fit(self, X, y=None, **fit_params):
        if self.scaler is not None:
            if X.shape[1] > 1:
                self._sc.fit(X[:, 1:], y)
        return self
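A hedged usage sketch (the data, pipeline, and classifier below are illustrative, not from the original code): the transformer passes the first column through unchanged and max-abs scales the remaining columns.

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

X = np.array([[1.0, 10.0, -5.0],
              [2.0, 20.0,  2.5],
              [3.0, 40.0,  5.0]])
y = np.array([0, 1, 1])

pipe = Pipeline(steps=[('scale', ScalingAdder(scaler='maxabs')),
                       ('clf', LogisticRegression())])
pipe.fit(X, y)              # column 0 untouched, columns 1-2 scaled by their max absolute value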
Example #24
def test_max_abs_scaler():
    tform = MaxAbsScaler()
    tform.fit(X)
    tform_ = convert_estimator(tform)
    X_t = tform.transform(X)
    X_t_ = tform_.transform(X)
    assert np.allclose(X_t, X_t_)
def scale(train_datas, test_datas):
    train_results = {}
    test_results = {}
    types = ['type %d' % i for i in range(1, 8)]

    for _type in types:
        if _type == 'type 1':
            train_last_index = 5  # last 5 columns: group_1/date_act/date_people/char_38/outcome
            test_last_index = 4  # last 4 columns: group_1/date_act/date_people/char_38
        else:
            train_last_index = 6  # last 6 columns: group_1/char_10_act/date_act/date_people/char_38/outcome
            test_last_index = 5  # last 5 columns: group_1/char_10_act/date_act/date_people/char_38

        scaler = MaxAbsScaler()
        train_array = train_datas[_type].toarray()
        train_front = train_array[:, :-train_last_index]
        train_mid = scaler.fit_transform(train_array[:, -train_last_index:-1])  # outcome does not need scaling
        train_end = train_array[:, -1].reshape((-1, 1))  # outcome
        train_results[_type] = np.hstack((train_front, train_mid, train_end))

        test_array = test_datas[_type].toarray()
        test_front = test_array[:, :-test_last_index]
        test_end = scaler.transform(test_array[:, -test_last_index:])
        test_results[_type] = np.hstack((test_front, test_end))

    return train_results, test_results
Example #26
def ml_stratified_cv():
    #from sklearn.utils import check_random_state
    #rng = check_random_state(0)
    from sklearn.model_selection import StratifiedKFold  # sklearn.cross_validation was removed in scikit-learn 0.20
    from sklearn.preprocessing import MaxAbsScaler
    scaler = MaxAbsScaler()

    flag_scale = True

    cv = StratifiedKFold(n_splits=10, shuffle=True)
    ytrue, ypred, score = [], [], []
    for itr, its in cv.split(X, y):
        Xtr, ytr = X[itr], y[itr]
        Xts, yts = X[its], y[its]

        if flag_scale:
            scaler.fit(Xtr)
            Xtr = scaler.transform(Xtr)
            Xts = scaler.transform(Xts)

        clf.fit(Xtr, ytr)
        ypr = clf.predict(Xts)
        sco = clf.decision_function(Xts)

        ytrue.append(yts)
        ypred.append(ypr)
        score.append(sco)

    ytrue = np.concatenate(ytrue)
    ypred = np.concatenate(ypred)
    score = np.concatenate(score)

    print(tw.clf_results_extended(ytrue, score))
Example #27
def main(arguments):
    # param = parse_parameters() # get parameters from command
    display_params(arguments)

    datasets = [read_datsets(x, arguments['multi']) for x in arguments['input']] # loading datasets as lists of document objects
    features_list = [x for x in ['tfidf', 'char_grams', 'lexical', 'style', 'readability', 'nela'] if arguments[x]]

    maxabs_scaler = MaxAbsScaler()

    features_instance = features(datasets[0])

    for i in range(len(datasets)):
        X = compute_features(datasets[i], features_instance,
                             tfidf=arguments['tfidf'],
                             char_grams=arguments['char_grams'],
                             lexical=arguments['lexical'],
                             style=arguments['style'],
                             readability=arguments['readability'],
                             nela=arguments['nela']
                             )
        if i == 0:  # It is the first iteration and we assume this is training
            X = maxabs_scaler.fit_transform(X)
        else:
            X = maxabs_scaler.transform(X)

        dump_feature_file(X, get_output_file_name(arguments['input'][i], features_list) )
Example #28
class MaxAbsScalerPrim(primitive):
    def __init__(self, random_state=0):
        super(MaxAbsScalerPrim, self).__init__(name='MaxAbsScaler')
        self.id = 8
        self.hyperparams = []
        self.type = 'feature preprocess'
        self.description = "Scale each feature by its maximum absolute value. This estimator scales and translates each feature individually such that the maximal absolute value of each feature in the training set will be 1.0. It does not shift/center the data, and thus does not destroy any sparsity. This scaler can also be applied to sparse CSR or CSC matrices."
        self.hyperparams_run = {'default': True}
        self.scaler = MaxAbsScaler()
        self.accept_type = 'c_t'

    def can_accept(self, data):
        return self.can_accept_c(data)

    def is_needed(self, data):
        # data = handle_data(data)
        # Update
        return True

    def fit(self, data):
        data = handle_data(data)
        self.scaler.fit(data['X'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        for i in range(len(cols)):
            if 'one_hot' not in cols[i]:
                cols[i] = "{}_mxabsscale".format(cols[i])
        output['X'] = pd.DataFrame(self.scaler.transform(output['X']),
                                   columns=cols)
        final_output = {0: output}
        return final_output
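The description above notes that max-abs scaling preserves sparsity; a small stand-alone check (independent of the primitive wrapper, which needs the surrounding primitive/handle_data framework):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MaxAbsScaler

X_sparse = csr_matrix(np.array([[0.0, -2.0, 0.0],
                                [4.0,  0.0, 1.0]]))
scaled = MaxAbsScaler().fit_transform(X_sparse)
print(type(scaled))         # still a sparse matrix
print(scaled.toarray())     # zeros stay zero; each column is divided by its max absolute value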
Example #29
def normalize_data(X_train, y_train, X_test, name, method='diff'):
    """Normalize dataset. Please note that it doesn't modify the original
    dataset, it just returns a new dataset that you can use to modify
    the original dataset or create a new one.
    """
    if CONFIG.NORMALIZATION['method'] == 'max':
        scaler = MaxAbsScaler()
        scaler_y = MaxAbsScaler()
    elif CONFIG.NORMALIZATION['method'] == 'diff':
        scaler = MinMaxScaler()
        scaler_y = MinMaxScaler()
    elif CONFIG.NORMALIZATION['method'] == 'std':
        scaler = StandardScaler()
        scaler_y = StandardScaler()
    else:
        raise ValueError(
            'Internal Error: Value of CONFIG.NORMALIZATION["method"] should be "max", "diff", "std".'
        )

    aux = X_train
    X_train = scaler.fit_transform(X_train)
    X_train = pd.DataFrame(data=X_train, index=aux.index, columns=aux.columns)

    aux = X_test
    X_test = scaler.transform(X_test)
    X_test = pd.DataFrame(data=X_test, index=aux.index, columns=aux.columns)
    del aux

    y_train = scaler_y.fit_transform(y_train.values.reshape(-1, 1))

    if name == 'LIGHTGBM':
        y_train = [i[0]
                   for i in y_train]  # TODO: to do in a more efficient way

    return X_train, y_train, X_test, scaler_y
Example #30
def scaler_dummy(dataset):

  scaler_mm = MinMaxScaler() 
  scaler_ma = MaxAbsScaler()
  scaler_sd = StandardScaler()
  scaler_rb = RobustScaler()

  numerical = list(dataset.columns)
  data_transform_mm = pd.DataFrame(data = dataset)
  data_transform_ma = pd.DataFrame(data = dataset)
  data_transform_sd = pd.DataFrame(data = dataset)
  data_transform_rb = pd.DataFrame(data = dataset)


  scaler_mm.fit(dataset[numerical])
  scaler_ma.fit(dataset[numerical])
  scaler_sd.fit(dataset[numerical])
  scaler_rb.fit(dataset[numerical])


  data_transform_mm[numerical] = scaler_mm.transform(dataset[numerical])
  data_transform_ma[numerical] = scaler_ma.transform(dataset[numerical])
  data_transform_sd[numerical] = scaler_sd.transform(dataset[numerical])
  data_transform_rb[numerical] = scaler_rb.transform(dataset[numerical])


  ## get dummies

  features_final_mm = pd.get_dummies(data_transform_mm)
  features_final_ma = pd.get_dummies(data_transform_ma)
  features_final_sd = pd.get_dummies(data_transform_sd)
  features_final_rb = pd.get_dummies(data_transform_rb)

  return features_final_mm, features_final_ma, features_final_sd, features_final_rb
Example #31
def normalize_raw_features(X: np.array) -> np.array:
    """Normalize features if column was not OneHot encoded"""
    for col in range(X.shape[1]):
        dense_col = X[:, col].toarray()  # ndarray, not np.matrix (np.matrix input is rejected by recent scikit-learn)
        if (dense_col > 1.).any() or (dense_col < 0.).any():
            scaler = MaxAbsScaler().fit(dense_col)
            X[:, col] = csr_matrix(scaler.transform(dense_col))
    return X
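Despite the np.array type hints, the function above works on a scipy sparse matrix: it slices columns, densifies them, and writes csr_matrix slices back (which may trigger a SparseEfficiencyWarning). A minimal sketch, assuming scipy is available:

import numpy as np
from scipy.sparse import csr_matrix

X = csr_matrix(np.array([[0.0, 3.0],
                         [0.5, 9.0]]))
X_norm = normalize_raw_features(X)   # only the second column has values outside [0, 1], so only it is scaled
print(X_norm.toarray())              # [[0. 0.333...], [0.5 1.]]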
def scale_data(x_train, x_test):

    """
    We only scale the continuous features; binary features are left as-is.
    The arrays are modified in place, so nothing is returned.
    """

    idx_binary = []  # columns with boolean values
    for k in range(x_train.shape[1]):
        idx_binary.append( np.array_equal(x_train[:,k], x_train[:,k].astype(bool)) ) # checking if a column is binary
    idx_cont = np.logical_not(idx_binary)


    sc = MaxAbsScaler()
    sc.fit(x_train[:, idx_cont])
    
    x_train[:, idx_cont] = sc.transform(x_train[:, idx_cont])
    x_test[:, idx_cont] = sc.transform(x_test[:, idx_cont])

    return
def test_maxabsscaler_vs_sklearn():
    # Compare msmbuilder.preprocessing.MaxAbsScaler
    # with sklearn.preprocessing.MaxAbsScaler

    maxabsscalerr = MaxAbsScalerR()
    maxabsscalerr.fit(np.concatenate(trajs))

    maxabsscaler = MaxAbsScaler()
    maxabsscaler.fit(trajs)

    y_ref1 = maxabsscalerr.transform(trajs[0])
    y1 = maxabsscaler.transform(trajs)[0]

    np.testing.assert_array_almost_equal(y_ref1, y1)
Example #34
def plotPCA(X_train, y_train, X_test, y_test, outdir):
    #clf = loadClf(term, fold, clfName)
    #try:
    #    decision = clf.decision_function
    #    Vf = numpy.arange(-1.,1.1,0.1)
    #    V = (0.,)
    #except AttributeError:
    #    decision =  lambda x:clf.predict_proba(x)[:,0]
    #    Vf = numpy.arange(0.,1.05,0.05)
    #    V = (0.5,)
    scaler = MaxAbsScaler(copy=False)
    target_names = ("Positive","Negative")
    term = outdir.parent.name.replace("_", " ")
    pca = PCA(n_components=2)
    pca.fit(X_train)
    scaler.fit(pca.transform(X_train))
    #delta = 0.025
    #a=numpy.arange(-1., 1., delta)
    #b=numpy.arange(-1., 1., delta)
    #A,B = numpy.meshgrid(a,b)
    #C=numpy.empty(A.shape)
    for X, y, n in ((X_train, y_train, 'training'), (X_test, y_test, 'testing')):
        X_r = scaler.transform(pca.transform(X))
        inlier = (numpy.abs(X_r[:,0]) <= 1) & (numpy.abs(X_r[:,1]) <= 1)
        #print(X_r)
        plt.clf()

        #for k,l in product(range(len(a)),range(len(b))):
        #    C[k][l] = decision(pca.inverse_transform(scaler.inverse_transform(((A[k][l],B[k][l]),))))
        #print(C)
        #cfp = plt.contourf(A,B,C,Vf,cmap=plt.cm.bone)
        #cfp.cmap.set_under('black')
        #cfp.cmap.set_over('white')
        #plt.contour(A,B,C,V,colors=("b",))
        #y=clf.predict(X)
        for c, i, target_name in zip("rg", (0, 1), target_names):
            plt.scatter(X_r[(y == i) & inlier, 0], X_r[(y == i) & inlier, 1],
                    c = c,
                    label = target_name,
                    marker = ",",
                    s = 1,#0.8,#1/numpy.sqrt(2),
                    #edgecolors='none',
                    linewidth = 0,
                    alpha = 0.7)
        plt.legend()
        plt.title('PCA for %s on %s data' % (term, n))
        plt.savefig(str(outdir/('pca-%s.png' % (n,))))
        plt.savefig(str(outdir/('pca-%s.ps' % (n,))))
def _train_test_split():
    # Build the store_weather dataframe
    store_weather_filename = Config.save_dir + "store_weather.pkl"
    if os.path.exists(store_weather_filename):
        store_weather = utils.from_pickle(store_weather_filename)
    else:
        store_weather = _preprocess_data()

    # Split train test for each store
    train = pd.DataFrame({})
    test = pd.DataFrame({})
    store_ids = store_weather.store_id_bk.unique()
    for sid in store_ids:
        c_store = store_weather[store_weather.store_id_bk == sid]
        s_train = c_store[:-Config.test_size]
        s_test = c_store[-Config.test_size:]
        # DataFrame.append was removed in pandas 2.0; concat with ignore_index is equivalent here
        train = pd.concat([train, s_train], ignore_index=True)
        test = pd.concat([test, s_test], ignore_index=True)

    # Scale numeric columns
    num_cols = ["p_total_revenue", "p_total_volume", "mean_temp",
                "total_precipitation", "total_snow"]
    scaler = MaxAbsScaler().fit(train.loc[:, num_cols])
    train.loc[:, num_cols] = scaler.transform(train.loc[:, num_cols])
    test.loc[:, num_cols] = scaler.transform(test.loc[:, num_cols])

    # Scale 2 output columns
    revenue_scale = MaxAbsScaler().fit(train.loc[:, ["total_revenue"]])
    volume_scale = MaxAbsScaler().fit(train.loc[:, ["total_volume"]])
    train.loc[:, ["total_revenue"]] = revenue_scale.transform(
        train.loc[:, ["total_revenue"]])
    test.loc[:, ["total_revenue"]] = revenue_scale.transform(
        test.loc[:, ["total_revenue"]])
    train.loc[:, ["total_volume"]] = volume_scale.transform(
        train.loc[:, ["total_volume"]])
    test.loc[:, ["total_volume"]] = volume_scale.transform(
        test.loc[:, ["total_volume"]])

    # Save the train/test dataframes to pickle objects
    utils.to_pickle(Config.save_dir + "train_set.pkl", train)
    utils.to_pickle(Config.save_dir + "test_set.pkl", test)

    # Save the 2 scaler for later use
    utils.to_pickle(Config.save_dir + "revenue_scale", revenue_scale)
    utils.to_pickle(Config.save_dir + "volume_scale", volume_scale)

    # Save store_ids
    utils.to_pickle(Config.save_dir + "store_id.pkl", store_ids)

    return train, test
	1066, 1053, 1339, 1040, 497, 253, 1485, 337, 1347, 1343, 122, 980, 87, 126, 528,
	694, 1444, 655, 161, 626, 545, 906, 1235, 684, 263, 69, 882, 1209, 180, 1386,
	1074, 631, 908, 1176, 947, 401, 1085, 1029, 797, 1107, 386, 559, 588, 522, 644,
	614, 1440, 1140, 1267, 1475, 217, 1201, 456, 231, 1079, 1224, 1036, 156, 852, 1384,
	1288, 243, 760, 1071]

# 6. Convert to numpy arrays
train_index = np.asarray(A)
test_index = np.asarray(B)

# 7. Split the data
X_train, X_test = raw_X[train_index], raw_X[test_index]
y_train, y_test = raw_y[train_index], raw_y[test_index]

# 8. Normalization
X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.transform(X_test)

# 9. Train the classifier
clf = SVC(kernel='rbf', C=1, gamma=0.5, coef0=0.0)
clf.fit(X_train_norm, y_train)
pred = clf.predict(X_test_norm)
acc = accuracy_score(pred, y_test)

# 10. Result
print("Accuracy:", acc)

# 11. Save the model
with open("model.pickle", "wb") as f:
    pickle.dump((clf, normalizer), f, 2)
Example #37
def normalize_features(X: np.array) -> np.array:
    """Normalize features by scaling each one by its maximum absolute value (into the [-1, 1] range)"""
    scaler = MaxAbsScaler().fit(X)
    return scaler.transform(X)
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.preprocessing import MaxAbsScaler
from sklearn.neighbors import KNeighborsClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
training_indices, testing_indices = train_test_split(tpot_data.index, stratify = tpot_data['class'].values, train_size=0.75, test_size=0.25)


result1 = tpot_data.copy()

# Use Scikit-learn's MaxAbsScaler to scale the features
training_features = result1.loc[training_indices].drop('class', axis=1)

if len(training_features.columns.values) > 0:
    scaler = MaxAbsScaler()
    scaler.fit(training_features.values.astype(np.float64))
    scaled_features = scaler.transform(result1.drop('class', axis=1).values.astype(np.float64))
    result1 = pd.DataFrame(data=scaled_features)
    result1['class'] = tpot_data['class'].values  # re-attach the label column (it was dropped before scaling)
else:
    result1 = result1.copy()

# Perform classification with a k-nearest neighbor classifier
knnc2 = KNeighborsClassifier(n_neighbors=min(10, len(training_indices)))
knnc2.fit(result1.loc[training_indices].drop('class', axis=1).values, result1.loc[training_indices, 'class'].values)
result2 = result1.copy()
result2['knnc2-classification'] = knnc2.predict(result2.drop('class', axis=1).values)
def _max_abs_scaler(column):
    sc = MaxAbsScaler()
    sc.fit(column.reshape(-1,1))
    new_col = sc.transform(column.reshape(-1,1))
    return new_col
Example #40
    model.add(Activation('relu'))
    model.add(GaussianNoise(0.00001))
    model.add(Dropout(0.3))

    model.add(MaxoutDense(1, input_dim=100))
    model.add(Activation('sigmoid'))

    #ada = Adagrad(lr=0.001)
    ada = SGD(lr=0.0003, momentum=0.9, decay=0.0001, nesterov=True)
    model.compile(optimizer=ada,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    scaler = MaxAbsScaler()
    train_train_scaled = scaler.fit_transform(train_train[features])
    train_test_scaled = scaler.transform(train_test[features])

    model.fit(train_train_scaled, train_train.target.values, nb_epoch=150, batch_size=100)

    train_train_pred = model.predict(train_train_scaled, batch_size=100)
    train_test_pred = model.predict(train_test_scaled, batch_size=100)

    train_score = log_loss(train_train.target.values, train_train_pred)
    test_score = log_loss(train_test.target.values, train_test_pred)

    #test_poly = poly.transform(test[features])
    test_scaled = scaler.transform(test[features])
    test_pred = model.predict(test_scaled, batch_size=100)

    ensemble_train.loc[train_test.index, 'nn'] = train_test_pred
    submission.loc[:, 'm_{}'.format(ind)] = test_pred
def main():
    X, y = get_data('../../data/train.csv')
    sclr = MaxAbsScaler()
    X = sclr.fit_transform(X)

    # pickle.dump(sclr, open('./dumps/scaler_pickle', 'wb+'))
    X_test, y_test = get_data('../../data/val.csv')
    X_test = sclr.transform(X_test)
    X_fin, y_fin = get_data('../../data/test.csv')
    X_fin = sclr.transform(X_fin)
    other, yo = get_data('../../data/other.csv')
    other = sclr.transform(other)

    lin = linear_model.LogisticRegression(
        C=10000,
    )
    # selector = RFE(lin, 21, step=1)
    # selector.fit(X, y)
    # X = selector.transform(X)
    # X_test = selector.transform(X_test)
    # X_fin = selector.transform(X_fin)
    # for i in range(len(selector.support_)):
    #     print i+1, selector.support_[i]

    lin.fit(X, y)
    # pickle.dump(lin, open('./dumps/lin_reg_pickle', 'wb+'))
    x1 = lin.predict_proba(X)
    x1_test = lin.predict_proba(X_test)
    # x1_fin = lin.predict_proba(X_fin)
    # o1 = lin.predict_proba(other)
    print('lin')
    print(metrics.classification_report(y, lin.predict(X)))
    print(metrics.classification_report(y_test, lin.predict(X_test)))
    print(metrics.classification_report(y_fin, lin.predict(X_fin)))
    roc = lin.predict_proba(X_fin)
    # r = lin.predict(X_test)
    # l1 = []
    # l2 = []
    # for i in range(len(roc)):
    #     if max(roc[i]) > 0.5:
    #         l1.append(y_fin[i])
    #         l2.append(r[i])
    # print 'dsfasdfasd'
    # print metrics.classification_report(l1, l2)
    # return

    fpr_grd0, tpr_grd0, _ = metrics.roc_curve(y_fin, roc[:, 0], pos_label=0)
    fpr_grd1, tpr_grd1, _ = metrics.roc_curve(y_fin, roc[:, 1], pos_label=1)
    fpr_grd2, tpr_grd2, _ = metrics.roc_curve(y_fin, roc[:, 2], pos_label=2)
    plt.plot(fpr_grd0, tpr_grd0, label='NRP')
    plt.plot(fpr_grd1, tpr_grd1, label='RiPP')
    plt.plot(fpr_grd2, tpr_grd2, label='Polyketide')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()
    # print lin.coef_

    # print sum(lin.predict_proba(X_test)[0])
    svm_model = SVC(
        C=5000,
        # kernel='linear',
        # degree=2,
        coef0=100,
        # probability=True,
        # shrinking=True,
        # class_weight='balanced',
        probability=True,
        # decision_function_shape='ovr'
    )
    svm_model.fit(X, y)
    x2 = svm_model.predict_proba(X)
    x2_test = svm_model.predict_proba(X_test)
    x2_fin = svm_model.predict_proba(X_fin)
    o2 = svm_model.predict_proba(other)
    print('svm')
    print(metrics.classification_report(y, svm_model.predict(X)))
    print(metrics.classification_report(y_test, svm_model.predict(X_test)))