def test_min_max_scaler_iris():
    """Check MinMaxScaler on the iris data for several feature ranges.

    For each range the per-feature minimum and maximum of the transformed
    data must hit the range bounds, and ``inverse_transform`` must recover
    the original data. An inverted range must be rejected when fitting.
    """
    X = iris.data
    scaler = MinMaxScaler()
    # default params
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 0)
    assert_array_almost_equal(X_trans.max(axis=0), 1)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # not default params: min=1, max=2
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 1)
    assert_array_almost_equal(X_trans.max(axis=0), 2)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # min=-.5, max=.6
    scaler = MinMaxScaler(feature_range=(-.5, .6))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), -.5)
    assert_array_almost_equal(X_trans.max(axis=0), .6)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # raises on invalid range (lower bound above upper bound)
    scaler = MinMaxScaler(feature_range=(2, 1))
    assert_raises(ValueError, scaler.fit, X)
Esempio n. 2
0
def prescale_data(x_train, x_test, method):
    """
    Pre-scales training data and (optionally) test data using the specified method.

    The scaler is fitted on the training data only; the test data, when given, is
    transformed with that already-fitted scaler.

    :param x_train: The training data to be pre-scaled.
    :param x_test: The (optional) test data to be pre-scaled. May be None.
    :param method: The method to be used for prescaling. Allowed values are "minmaxscaler" and "standartscaler"
    (the correctly spelled "standardscaler" is accepted as well). If None, the data is returned unscaled.
    :return: A tuple of the pre-scaled training and test data or only the training data if the test data was set to None
    :raises ValueError: If method is neither None nor one of the allowed values.
    """
    if method is not None:
        # Resolve the scaler class first so that an invalid method fails
        # before anything is imported or fitted.
        if method == "minmaxscaler":
            from sklearn.preprocessing import MinMaxScaler as scaler_cls
        elif method in ("standartscaler", "standardscaler"):
            from sklearn.preprocessing import StandardScaler as scaler_cls
        else:
            raise ValueError("Invalid pre-scaling method: {}".format(method))

        scaler = scaler_cls()
        # Fit once and keep the transformed result (previously the data was
        # fitted+transformed, discarded, and then transformed a second time).
        x_train = scaler.fit_transform(x_train)
        if x_test is not None:
            x_test = scaler.transform(x_test)

    if x_test is not None:
        return x_train, x_test
    return x_train
Esempio n. 3
0
class NMFReducer():
    """Min-max scale a dataset and reduce it with non-negative matrix factorisation.

    The constructor scales ``dataset.data`` with a ``MinMaxScaler`` and keeps
    the result in ``self.data``; ``dataset.target`` is kept as ``self.labels``.
    """

    def __init__(self, dataset, dataset_name, num_components=10):
        """Store the dataset, scale its features and build the NMF model.

        :param dataset: object exposing ``data`` (feature matrix) and ``target``.
        :param dataset_name: label used in the textual reports.
        :param num_components: number of NMF components to keep.
        """
        self.dataset = dataset
        self.dataset_name = dataset_name
        self.labels = dataset.target
        self.scaler = MinMaxScaler()
        self.data = self.scaler.fit_transform(dataset.data)
        self.n_samples, self.n_features = self.data.shape

        self.reducer = NMF(n_components=num_components, max_iter=5000)

    def reduce(self):
        """Fit the NMF on the scaled data and return the min-max scaled reduction.

        Note that this refits ``self.scaler`` on the reduced representation, so
        afterwards the scaler no longer describes the original feature space.
        """
        self.reducer.fit(self.data)
        self.reduced = self.scaler.fit_transform(self.reducer.transform(self.data))
        return self.reduced

    def benchmark(self, estimator, name, data):
        """Fit a clustering estimator on ``data`` and print one line of metrics.

        The line holds the name, the elapsed fit time, ``estimator.inertia_``
        and several clustering scores computed against ``self.labels``.
        """
        t0 = time()
        sample_size = 300
        labels = self.labels

        estimator.fit(data)
        print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
              % (name, (time() - t0), estimator.inertia_,
                 metrics.homogeneity_score(labels, estimator.labels_),
                 metrics.completeness_score(labels, estimator.labels_),
                 metrics.v_measure_score(labels, estimator.labels_),
                 metrics.adjusted_rand_score(labels, estimator.labels_),
                 metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
                 metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size=sample_size)))

    def _print_reduction_report(self, path):
        """Redirect stdout to ``path`` and print the reduction summary.

        Requires ``reduce()`` to have been called (reads ``self.reduced``).
        """
        # NOTE(review): sys.stdout stays pointed at this file after the call
        # and the handle is never closed. Kept as-is because later prints may
        # be expected to land in the same file -- confirm with callers before
        # switching to a scoped redirect.
        sys.stdout = open(path, 'w')
        print("NMF Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print(self.reduced)
        print("\nLength of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0]))
        print(40 * '-')
        print(self.reducer.reconstruction_err_)

    def display_reduced_digits(self):
        """Write the reduction report to ``out/NMFReduceDigitsOutput.txt``."""
        self._print_reduction_report('out/NMFReduceDigitsOutput.txt')

    def display_reduced_iris(self):
        """Write the reduction report to ``out/NMFReduceIrisOutput.txt``."""
        self._print_reduction_report('out/NMFReduceIrisOutput.txt')

    def reduce_crossvalidation_set(self, X_train, X_test):
        """Reduce a train/test split with an NMF fitted on the training part.

        The reduced training set is min-max scaled and the same scaling is
        applied to the reduced test set, mirroring what ``reduce()`` does for
        the full data. Previously the fitted NMF was never applied and the
        inputs were only passed through ``self.scaler``.

        :return: tuple ``(reduced_X_train, reduced_X_test)``.
        """
        self.reducer.fit(X_train)
        # A dedicated scaler keeps self.scaler untouched by this call.
        output_scaler = MinMaxScaler()
        reduced_X_train = output_scaler.fit_transform(self.reducer.transform(X_train))
        reduced_X_test = output_scaler.transform(self.reducer.transform(X_test))
        return reduced_X_train, reduced_X_test
 def _scale(self, y):
     """Min-max scale ``y`` with a freshly fitted MinMaxScaler.

     Returns the scaler's output directly when ``y`` is accepted as given.
     If that attempt fails, ``y`` is converted to a NumPy array, scaled, and
     returned as a nested list instead.
     """
     z = MinMaxScaler()
     try:
         return z.fit_transform(y)
     except Exception:
         # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
         # SystemExit. Retry on an explicit ndarray; errors from the retry
         # propagate to the caller.
         y = np.array(y)
         y = z.fit_transform(y)
         return y.tolist()
def predict_simple_linear(df_train_clean, df_test_clean):
    """Train a logistic regression on order-book features and evaluate it.

    Both frames must contain the eight price/volume feature columns listed
    below and a ``labels`` column.

    :param df_train_clean: training data frame.
    :param df_test_clean: test data frame.
    :return: flat list alternating a name and its value, in this order:
        ``"confusion_matrix"``, ``"classification_report"``, ``"logloss"``,
        ``"miss_err"``, ``"Y_hat"``.
    """
    feature_cols = ['P_1_bid', 'V_1_bid', 'P_1_ask', 'V_1_ask',
                    'P_2_bid', 'V_2_bid', 'P_2_ask', 'V_2_ask']

    X_train = np.array(df_train_clean[feature_cols])
    Y_train = np.array(df_train_clean[['labels']])[:, 0]

    X_test = np.array(df_test_clean[feature_cols])
    Y_test = np.array(df_test_clean[['labels']])[:, 0]

    # Define the labels
    labels = np.unique(Y_train)

    # Scale data: fit on the training set only and reuse those statistics for
    # the test set, so both live in the same feature space. (Previously the
    # scaler was fitted separately on each set.)
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Set up and fit the model
    logreg = linear_model.LogisticRegression(C=1e5)
    logreg.fit(X_train, Y_train)

    # Predict
    Y_hat = logreg.predict(X_test)
    Y_probs = logreg.predict_proba(X_test)

    # Misclassification error rate
    miss_err = 1 - accuracy_score(Y_test, Y_hat)

    # Log loss. The original passed ``eps = 10^(-15)``, which is a bitwise XOR
    # evaluating to -5 and therefore disabled probability clipping. The
    # intended 1e-15 is log_loss's historical default, so the argument is
    # simply omitted (newer scikit-learn releases no longer accept ``eps``).
    logloss = log_loss(Y_test, Y_probs)

    # Confusion matrix, with rows/columns ordered by the training labels
    confusion_matrix1 = confusion_matrix(y_true=Y_test, y_pred=Y_hat,
                                         labels=labels)

    # Classification report
    classification_report1 = classification_report(y_true=Y_test, y_pred=Y_hat)

    # Output results in a list format
    result = [
        "confusion_matrix", confusion_matrix1,
        "classification_report", classification_report1,
        "logloss", logloss,
        "miss_err", miss_err,
        "Y_hat", Y_hat,
    ]

    return result
Esempio n. 6
0
 def feature_scale(self, X_train, X_val, X_test):
     """Min-max normalize all columns of the three splits.

     The scaler is fitted on ``X_train`` only and then applied unchanged to
     ``X_val`` and ``X_test``, so all three share the same scaling.
     (Previously the scaler was re-fitted on every split.)

     :return: tuple ``(X_train_std, X_val_std, X_test_std)``.
     """
     from sklearn.preprocessing import MinMaxScaler

     mms = MinMaxScaler()

     X_train_std = mms.fit_transform(X_train)
     X_val_std = mms.transform(X_val)
     X_test_std = mms.transform(X_test)
     return X_train_std, X_val_std, X_test_std
def rescaleSalAndStockValues():
    """Min-max rescale a fixed salary and a fixed stock value.

    Each value is scaled together with the max/min reported by
    ``findMaxMinValues()`` for its feature, so every returned array has three
    rows: the maximum, the minimum and the given value.

    :return: tuple ``(scaledSal, scaledStock)``.
    """
    from sklearn.preprocessing import MinMaxScaler
    import numpy as np

    stock_hi, sal_hi, stock_lo, sal_lo = findMaxMinValues()

    # the given salary and stock values
    given_salary = 200000.0
    given_stock = 1000000.0

    mm = MinMaxScaler()
    salary_column = [[sal_hi], [sal_lo], [given_salary]]
    stock_column = [[stock_hi], [stock_lo], [given_stock]]

    # fit_transform refits the scaler each time, so one instance serves both.
    rescaled_salary, rescaled_stock = [
        mm.fit_transform(column) for column in (salary_column, stock_column)
    ]
    return rescaled_salary, rescaled_stock
Esempio n. 8
0
def use(method):
    """Grid-search a classifier pipeline, then report its test performance.

    Works on the module-level ``features_train``, ``features_test``,
    ``labels_train``, ``labels_test`` and ``features_list``. Features are
    min-max scaled (scaler fitted on the training features) before the
    search.

    :param method: one of ``'naive bayes'``, ``'svm'`` or ``'decision tree'``.
    :raises ValueError: if ``method`` is not one of the supported names
        (previously this surfaced as a NameError on ``pred``).
    """
    if method == 'naive bayes':
        estimators = [("skb", SelectKBest(score_func=f_classif)),('pca', PCA()),
                      ('bayes',GaussianNB())]
        parameters = {"skb__k":[8,9,10,11,12],
                      "pca__n_components":[2,6,4,8]}
    elif method == 'svm':
        estimators = [('reduce_dim', PCA()), ('svc', SVC())]
        parameters = {'svc__C': [1,10]}
    elif method == 'decision tree':
        estimators = [("skb", SelectKBest(score_func=f_classif)),('pca', PCA()),
                      ('tree', tree.DecisionTreeClassifier())]
        parameters = {"tree__min_samples_split": [2,10],"skb__k":[8,9,10,11,12],
                      "pca__n_components":[2,4,6,8]}
    else:
        raise ValueError("Unknown method: %r" % (method,))

    # Shared by every method: search, scale, fit and predict.
    clf = grid_search.GridSearchCV(Pipeline(estimators), parameters)
    scaler = MinMaxScaler()
    features_train_scaled = scaler.fit_transform(features_train)
    features_test_scaled = scaler.transform(features_test)
    clf.fit(features_train_scaled, labels_train)
    pred = clf.predict(features_test_scaled)

    # Single-argument print(...) calls behave identically on Python 2 and 3.
    if method == 'svm':
        print(clf.best_estimator_)
    else:
        print(clf.best_params_)
        # Refit a stand-alone selector with the winning k to list the chosen
        # features. Both methods now score the scaled features.
        features_k = clf.best_params_['skb__k']
        SKB_k = SelectKBest(f_classif, k = features_k)
        SKB_k.fit(features_train_scaled, labels_train)
        if method == 'naive bayes':
            print("features score: ")
            print(SKB_k.scores_)
        # features_list[0] is skipped -- presumably the label column; verify.
        features_selected = [features_list[1:][i]for i in SKB_k.get_support(indices=True)]
        print(features_selected)

    accuracy = accuracy_score(labels_test, pred)
    print("accuracy score:")
    print(accuracy)
    calculate_precision_recall(pred, labels_test)
Esempio n. 9
0
def featureScale(df):
    """Min-max scale the four numeric loan columns of ``df`` in place.

    Each column is fitted and scaled on its own. The same (mutated) frame is
    returned for convenience.
    """
    mm_scaler = MinMaxScaler()
    numeric_columns = (
        'ApplicantIncome',
        'CoapplicantIncome',
        'LoanAmount',
        'Loan_Amount_Term',
    )
    for column in numeric_columns:
        # Double brackets keep a 2-D frame, which is what the scaler expects.
        df[[column]] = mm_scaler.fit_transform(df[[column]])
    print("Scaling Done")
    return df
Esempio n. 10
0
def test_min_max_scaler():
    """Check that MinMaxScaler maps iris features onto the requested range.

    Float arrays are compared with a tolerance instead of exact equality:
    the scaled extremes are computed in floating point and need not be
    bit-identical to the range bounds.
    """
    from numpy.testing import assert_array_almost_equal

    X = iris.data
    scaler = MinMaxScaler()
    # default params
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 0)
    assert_array_almost_equal(X_trans.max(axis=0), 1)

    # not default params
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 1)
    assert_array_almost_equal(X_trans.max(axis=0), 2)
Esempio n. 11
0
def getDNN(df, random_split=None):
    """Train a dense binary classifier and return it with its input scaler.

    The frame is split into training and validation parts via ``split``. A
    ``MinMaxScaler`` fitted on the training part scales both, the network is
    trained for 30 epochs, and the AUC on both parts is printed. Finally the
    scaler is refitted on the full data (``df`` without its ``validation``
    column) and the same model is trained for another 30 epochs on it.

    :param df: input frame; must contain a ``validation`` column.
    :param random_split: forwarded to ``split`` as ``rand_ratio``.
    :return: tuple ``(model, scaler)``; the scaler is the one fitted on the
        full data.
    """
    df_tr, df_val = split(df, rand_ratio=random_split)

    X, Y = to_array(df.drop("validation", axis=1))
    Xtr, Ytr = to_array(df_tr)
    Xval, Yval = to_array(df_val)

    scaler = MinMaxScaler((0, 1))
    Xtr = scaler.fit_transform(Xtr)
    Xval = scaler.transform(Xval)

    # Start create model
    print("Create a DNN Classifier")
    model = Sequential()

    model.add(Dense(100, input_dim=Xtr.shape[1], activation='tanh'))
    model.add(PReLU())
    model.add(Dropout(0.2))
    model.add(Dense(80, activation='linear'))
    model.add(ELU(alpha=0.3))
    model.add(Dropout(0.2))
    model.add(Dense(60, activation='tanh'))
    model.add(PReLU())
    model.add(Dropout(0.2))
    model.add(Dense(40, activation='linear'))
    model.add(ELU(alpha=0.1))
    model.add(Dropout(0.2))
    model.add(Dense(15, activation='linear'))
    model.add(PReLU())
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))

    # trainer = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    # ``rho`` is Adadelta's decay factor; it was misspelled ``tho``, so the
    # intended 0.98 was never applied.
    trainer = Adadelta(lr=0.1, rho=0.98, epsilon=1e-7)
    model.compile(loss='binary_crossentropy', optimizer=trainer)

    print(Ytr, Yval)
    model.fit(Xtr, Ytr, nb_epoch=30, batch_size=32, verbose=1, validation_data=(Xval, Yval))

    pred_tr = model.predict_proba(Xtr)
    pred = model.predict_proba(Xval)
    print("auc on train: {}".format(roc_auc_score(Ytr, pred_tr)))
    print("auc on validation: {}".format(roc_auc_score(Yval, pred)))

    # Continue training the same model on all data, rescaled with a scaler
    # fitted on the full feature matrix.
    X = scaler.fit_transform(X)
    model.fit(X, Y, nb_epoch=30, batch_size=32)
    return model, scaler
Esempio n. 12
0
def readTestData():
    """Load ``data/test.csv`` and return ``[scaled_features, row_ids]``.

    The header row is skipped. Column 0 is returned as a plain list of ids;
    columns 1..30 are scaled with an ``MMS`` instance fitted on this data.
    """
    raw = np.loadtxt('data/test.csv', delimiter=',', skiprows=1)
    row_ids = list(raw[:, 0])
    # MMS is presumably an alias for sklearn's MinMaxScaler -- verify import.
    scaled_features = MMS().fit_transform(raw[:, 1:31])
    return [scaled_features, row_ids]
Esempio n. 13
0
]

# Drop sparse columns (fewer than 243500 non-null values) and the first nine
# remaining columns, then drop every row that still has a missing value.
df = dataframe.dropna(axis=1, thresh=243500).iloc[:, 9:]
df = df.dropna()
df = df.reset_index(drop=True)

# Randomly select a contiguous chunk of 6000 rows
from random import randint
start = randint(0, len(df) - 6000)
print(start)
dataset = df.iloc[start:start + 6000, ]

# # Normalize and add 5% noise
# Normalize: fit on the first 3600 rows (train), reuse for the rest (test)
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(dataset.iloc[:3600, ].values)
test_scaled = scaler.transform(dataset.iloc[3600:, ].values)

# add 5% noises as anomalies into train and test data in order to evaluate the method
import math

# int() keeps the sample size integral on Python 2, where floor returns a float
qty = int(math.floor(len(train_scaled) * 0.05))
train_anomalies = np.random.choice(train_scaled.shape[0],
                                   size=qty,
                                   replace=False)
print(train_anomalies)
# One Gaussian noise vector (one value per feature) is drawn and added to
# every selected row.
temp_data = train_scaled[train_anomalies, :] + np.random.normal(
    0, 1, size=train_scaled.shape[1])
# Write each noisy row back to its own position. The former loop never
# advanced its index, so every anomaly row received a copy of temp_data[0].
train_scaled[train_anomalies, :] = temp_data
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Download SBIN history from 2015-01-01 up to today.
start = date(2015, 1, 1)
end = date.today()
data = get_history(symbol="SBIN", start=start, end=end)

# The four price columns share one global min/max so their relative levels
# are preserved; Volume gets its own MinMaxScaler.
ohlc = data[['Open', 'High', 'Low', 'Close']]
max_ = ohlc.max().max()
min_ = ohlc.min().min()

scl = MinMaxScaler()

X1 = np.array((ohlc - min_) / (max_ - min_))
X2 = scl.fit_transform(data[['Volume']].values.reshape(-1, 1))

# Overwrite the raw columns with their scaled counterparts.
data = data.assign(
    Open=X1[:, 0],
    High=X1[:, 1],
    Low=X1[:, 2],
    Close=X1[:, 3],
    Volume=X2[:, 0],
)
data.tail()


# Features are the scaled columns; the target is the next row's Last value.
X = data[['Open', 'High', 'Low', 'Close', 'Volume']]
y = data.Last.shift(-1)

timestep = 1
X_list = []
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

if __name__ == '__main__':
    # Load the preprocessed frame.
    df = pd.read_pickle(
        '/Users/USER/Documents/Python/Data Analysis_Practice_GJ/ED_waiting_time/df_processed.pkl'
    )

    # Target: log(1 + waiting_time); features: every other column.
    y = np.log1p(df['waiting_time'])
    features = df.drop('waiting_time', axis=1)

    # Min-max scale the first two feature columns in place.
    scaler = MinMaxScaler()
    leading_columns = features.iloc[:, :2]
    features.iloc[:, :2] = scaler.fit_transform(leading_columns)

    x = features
    # The same scaler object is refitted on the target, so from here on
    # ``scaler`` describes the target, not the features.
    y_scaled = scaler.fit_transform(y.values.reshape(-1, 1))

    X_train, X_test, y_train, y_test = train_test_split(
        x, y_scaled, test_size=0.3, random_state=42)

    regressor_options = dict(
        objective='reg:squarederror',
        n_estimators=100,
        learning_rate=0.05,
        reg_lambda=1.3,
    )
    xgb_regressor = xgb.XGBRegressor(**regressor_options)

    xgb_regressor.fit(X_train.values, y_train)
Esempio n. 16
0
### 3. 2nd EDA
# Explore the continuous variables/features using Seaborn's scatterplot matrix
import seaborn as sns

# Names of the cc_apps columns whose dtype compares equal to float.
cont_features = list(cc_apps.loc[:, cc_apps.dtypes == float].columns)
sns.pairplot(data=cc_apps, hue='ApprovalStatus'
             )  # By default, pairplot() will skip the object data types

# Scale the continuous features
from sklearn.preprocessing import StandardScaler, MinMaxScaler

scaler = MinMaxScaler()

# Start from the target column only, then add one min-max scaled column per
# continuous feature (the scaler is refitted for every column).
cc_apps_scaled = cc_apps[['ApprovalStatus']].copy()
for col in cont_features:
    cc_apps_scaled[col] = scaler.fit_transform(cc_apps[col].values.reshape(
        -1, 1))

sns.pairplot(data=cc_apps_scaled, hue='ApprovalStatus'
             )  # Build a scatterplot matrix with the min-max scaled features
# The scaler does not change the shape of the distributions because it only changes their scale. Let's explore a transformation that does change the shapes: the log.

# Log-transform the continuous features as ln(x + 1)
cc_apps_logged = cc_apps[['ApprovalStatus']].copy()
for col in cont_features:
    cc_apps_logged['ln_{}'.format(col)] = np.log(cc_apps[col] + 1)

sns.pairplot(data=cc_apps_logged, hue='ApprovalStatus')
# This view makes it clearer that age, years of employment, credit score, income and (even) debt tend to be higher for applicants whose credit is approved.

### 4. Baseline model: a logistic classifier
Esempio n. 17
0
# MinMaxScaler (interval scaling): rescales each feature into a fixed interval
# based on its minimum and maximum; with the default arguments used below
# that interval is [0, 1].
# It speeds up model convergence and improves model accuracy.
# Commonly used with neural networks.

# Normalizer: works on the rows of the matrix and converts each sample vector into a unit vector.
# Its purpose is to give sample vectors a uniform scale when similarity is computed with a dot product or another kernel function.
# Commonly used in text classification and clustering; also used with logistic regression, where it is said to help against overfitting.

ss = MinMaxScaler()
# Fit the scaler and transform the data.
## Notes on the scikit-learn model API:
### fit: model training; trains a model from the given training set (X, Y). It is used here for its effect on the object, e.g. after ss.fit(X_train, Y_train) the object ss is trained.
### transform: data conversion; applies the trained model to a given data set X. If the training set is transformed, the test set generally has to be transformed too. This API only appears in feature engineering.
### predict: data conversion / prediction; like transform it operates on a given data set X, but transform returns a new X while predict returns the predicted Y. This API only appears in estimators.
### fit_transform: fit + transform combined; first trains the model on the given data (fit), then transforms that same data X with the trained model.
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)
print("原始数据各个特征属性的调整最小值:", ss.min_)
print("原始数据各个特征属性的缩放数据值:", ss.scale_)

# Feature selection: pick, from the existing features, those that influence the target the most.
# Analogy: L1 regularisation in linear regression, which yields a sparse solution.
# Common scoring methods:
#   - classification: F-statistic, chi-squared, mutual information (mutual_info_classif)
#   - continuous / regression: Pearson correlation, F-statistic, mutual information
#     (the original note names mutual_info_classif; for regression scikit-learn offers mutual_info_regression)

# SelectKBest (chi-squared)

# In this example SelectKBest is used to keep 3 of the 4 original features.
ch2 = SelectKBest(chi2, k=3)
# k defaults to 10
import numpy as np
import seaborn as sns
# Load the housing data. The bare expressions below only display something
# in an interactive/notebook session and are no-ops when run as a script.
housing = pd.read_csv("house_pricing.csv")
housing

housing.describe().transpose()

# Per-column summary statistics, one row per column.
summary = housing.describe()
summary = summary.transpose()
print(summary)
# Features are every column except the target, medianHouseValue.
X = housing.drop(['medianHouseValue'], axis=1)
y = housing['medianHouseValue']

# Min-max scale all features. Note the scaler is fitted on the full data set,
# i.e. before the train/test split below.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_scale = scaler.fit_transform(X)
x_scale

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_scale,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# Standardise using statistics from the training split only.
# NOTE(review): standardising a per-feature linear rescaling should give the
# same values as standardising the raw features, so the min-max step above
# looks redundant -- confirm before removing it.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_test

import keras
Esempio n. 19
0
# Plot the per-sample return rate: each price relative to the previous one, minus 1
rets = close_px / close_px.shift(1) - 1
rets.plot(label='return')
# plt.show()

# Chronological split: rows before 2019-01-01 train, the rest test
data_train = goog[goog['Date'] < '2019-01-01'].copy()
# print(data_train)

data_test = goog[goog['Date'] >= '2019-01-01'].copy()
# print(data_test)

training_data = data_train.drop(['Date'], axis=1)
print(training_data)

# Scale every remaining column with a scaler fitted on the training rows
scaler = MinMaxScaler()
training_data = scaler.fit_transform(training_data)
print(training_data)

# Sliding windows: each sample holds the 30 rows preceding a row, and its
# target is column 0 of that row.
window = 30
window_ends = range(window, training_data.shape[0])
x_train = np.array([training_data[stop - window:stop] for stop in window_ends])
y_train = np.array([training_data[stop, 0] for stop in window_ends])

# Sequential based model
regression = Sequential()

# input shape for the first layer of LSTM model
Esempio n. 20
0
# Target column of the training set, kept 2-D for the scaler
Y_training = training_data_df[['total_earnings']].values

# Load the testing data set from its CSV file
test_data_df = pd.read_csv("sales_data_test.csv", dtype=float)

# Split the test frame into X (inputs) and Y (value to predict)
X_testing = test_data_df.drop('total_earnings', axis=1).values
Y_testing = test_data_df[['total_earnings']].values

# Neural networks work best on small ranges such as 0 to 1, so inputs and
# outputs each get their own scaler, fitted on the training data only.
X_scaler = MinMaxScaler(feature_range=(0, 1)).fit(X_training)
Y_scaler = MinMaxScaler(feature_range=(0, 1)).fit(Y_training)

# Apply the training-fitted scalers to the training data ...
X_scaled_training = X_scaler.transform(X_training)
Y_scaled_training = Y_scaler.transform(Y_training)

# ... and to the test data: both sets must go through the very same scalers.
X_scaled_testing = X_scaler.transform(X_testing)
Y_scaled_testing = Y_scaler.transform(Y_testing)

# Model hyper-parameters
learning_rate, training_epochs, display_step = 0.001, 100, 5

# Number of inputs and outputs of the neural network
number_of_inputs, number_of_outputs = 9, 1
Esempio n. 21
0
# Load the test-set lead features and stack them under the training ones
feattstld2 = pd.read_csv(path + '../features/lead2_tst_ip_device_os_app%s.gz' %
                         (add_),
                         compression='gzip')
featld2 = pd.concat([feattrnld2, feattstld2])
del feattrnld2, feattstld2
featld2.fillna(-1, inplace=True)
featld2 = transform_lead(featld2)
featld2.head()

print('[{}] Load Entropy Features'.format(time.time() - start_time))
featentip = pd.read_csv(path + '../features/entropyip.gz', compression='gzip')
featentip.iloc[:, 1:] = featentip.iloc[:, 1:].astype(np.float32)
featentip.iloc[:, 0] = featentip.iloc[:, 0].astype('uint32')
# Min-max scale every entropy column except the 'ip' join key, then store the
# scaled values as float16.
scaler = MinMaxScaler()
cols_ = [c for c in featentip.columns if c != 'ip']
featentip[cols_] = scaler.fit_transform(featentip[cols_])
featentip[cols_] = featentip[cols_].astype(np.float16)

# Remember where the training rows end before stacking test below train
len_train = len(train_df)
# pd.concat is the supported replacement for DataFrame.append, which was
# removed in pandas 2.0; the row order and index are unchanged.
train_df = pd.concat([train_df, test_df])
del test_df
gc.collect()
print('[{}] Concat Features'.format(time.time() - start_time))
train_df = pd.concat([train_df, featapp, featspl, featctn, featcum, featld2],
                     axis=1)

print('[{}] Add entropy'.format(time.time() - start_time))
train_df = train_df.merge(featentip, on=['ip'], how='left')

print('[{}] hour, day, wday....'.format(time.time() - start_time))
train_df['hour'] = pd.to_datetime(train_df.click_time).dt.hour.astype('uint8')
def main(infolder, outfolder):
    """Train a Random Forest on peptide descriptors and rank a sequence library.

    Reads ``Pos.fasta``, ``Neg.fasta`` and ``Lib.fasta`` from ``infolder``,
    trains a classifier on the standard-scaled, auto-correlated descriptor of
    the positive/negative sequences, and scores every library sequence by
    predicted probability, per-tree prediction variance and mean cosine
    distance of its leaf indices to the training data.

    Files written to ``outfolder``: ``<timestamp>synthesis_selection.csv``
    (top and bottom two predictions per class), ``library_pred.csv`` (all
    library members), ``<timestamp>-scaler.p`` and ``<timestamp>-classifier.p``.

    Relies on the module-level names ``seed``, ``globstart`` and
    ``get_tree_pred``.

    :param infolder: folder holding the three FASTA input files.
    :param outfolder: existing folder that receives the output files.
    """
    import os  # local import: only needed to build the output paths

    descriptor = 'PPCALI'

    # Single-argument print(...) calls behave identically on Python 2 and 3.
    print("RF Peptide Learning Info\n========================\n")
    print(datetime.now().strftime("%Y-%m-%d_%H-%M") + "\n")
    print(
        "INPUT:\nInputfolder is\t%s\nOutputfolder is\t%s\nDescriptor is\t%s , auto-correlated (window 7)\n"
        % (infolder, outfolder, descriptor))

    # -------------------------------- TRAINING --------------------------------
    print("LOG:\nLoading data...")
    Pos = PeptideDescriptor(infolder + '/Pos.fasta', descriptor)
    Pos.filter_duplicates()
    Neg = PeptideDescriptor(infolder + '/Neg.fasta', descriptor)
    Neg.filter_duplicates()
    targets = np.array(len(Pos.sequences) * [1] +
                       len(Neg.sequences) * [0])  # target vector

    # Descriptor calculation
    print("Calculating %s descriptor..." % descriptor)
    Data = PeptideDescriptor(Pos.sequences + Neg.sequences, descriptor)
    Data.calculate_autocorr(7)

    # Standard Scaling
    print("Standard scaling %s descriptor..." % descriptor)
    scaler = StandardScaler()
    Data = scaler.fit_transform(Data.descriptor)

    # Classifier
    clf = RandomForestClassifier(bootstrap=True,
                                 class_weight=None,
                                 criterion='gini',
                                 max_depth=None,
                                 max_features='sqrt',
                                 max_leaf_nodes=None,
                                 min_samples_leaf=1,
                                 min_samples_split=2,
                                 min_weight_fraction_leaf=0.0,
                                 n_estimators=500,
                                 n_jobs=-1,
                                 oob_score=True,
                                 random_state=seed,
                                 verbose=0,
                                 warm_start=False)

    # fitting classifier
    print("Fitting Random Forest classifier...")
    clf.fit(Data, targets)
    fit_leafs = clf.apply(Data)
    print("\tRF out-of-bag score: %.2f" % clf.oob_score_)

    # -------------------------------- LIBRARY --------------------------------
    # Loading library
    print("Loading sequence library...")
    Lib = PeptideDescriptor(infolder + '/Lib.fasta', descriptor)
    # class label = first three characters of each sequence name
    class_labels = [l[:3] for l in Lib.names]

    print("\tLibrary size: %i" % len(Lib.sequences))
    print("\tLibrary composition is:\n\t\thel: %i\n\t\tasy: %i\n\t\tnCM: %i" % (
        class_labels.count('hel'), class_labels.count('asy'),
        class_labels.count('nCM')))

    # Calculating descriptors for library members
    print("Calculating %s descriptor for library..." % descriptor)
    D = PeptideDescriptor(Lib.sequences, descriptor)
    D.calculate_autocorr(7)

    # scale the library descriptor with the scaler fitted on the training data
    print("Standard scaling %s descriptor for library..." % descriptor)
    X = scaler.transform(D.descriptor)

    # -------------------------------- PREDICTING --------------------------------
    # get single tree predictions and calculate their variance
    print("Predicting single tree results, standard deviation and entropy for library...")
    start = time.time()
    preds = get_tree_pred(clf, X)

    print("Predicting class probabilities for library...")
    probas = clf.predict_proba(X)
    probas = probas[:, 1].tolist()
    variance = np.var(preds, axis=1)
    print("\tPredictions took %.1f s" % (time.time() - start))

    # calculate similarity of library members to training data
    print("Calculating Random Forest similarity (cosine)...")
    start = time.time()
    # leaf indices where library samples end up in -> RF intrinsic similarity measure
    lib_leafs = clf.apply(X)
    D_RF = pairwise_distances(lib_leafs, fit_leafs, metric='cosine')
    RF_dist = D_RF.mean(axis=1).tolist()
    print("\tDistance calculation took %.1f s" % (time.time() - start))

    # scaling all output features
    print("Min-Max scaling outputs...")
    sclr = MinMaxScaler()
    # Column vector in, flat list out. ravel() (unlike squeeze) still yields a
    # one-element list when the library holds a single sequence.
    variance = sclr.fit_transform(variance.reshape(-1, 1)).ravel().tolist()
    RF_dist = sclr.fit_transform(np.array(RF_dist).reshape(-1, 1)).ravel().tolist()

    # construct final list with all values (prediction, RF_dist, var, sum)
    print("Creating result dictionaries...")
    sums = [
        x + 0.5 * y + 0.5 * z for x, y, z in zip(probas, RF_dist, variance)
    ]  # weighed [1,0.5,0.5] sum of all values

    # create data frame with all values
    d = pd.DataFrame(
        {
            'Class': class_labels,
            'Prediction': probas,
            'RFDistance': RF_dist,
            'TreeVariance': variance,
            'WeighedSum': sums
        },
        index=Lib.sequences)
    d.index.name = 'Sequence'
    d = d[['Class', 'Prediction', 'RFDistance', 'TreeVariance',
           'WeighedSum']].sort_values('Prediction', ascending=False)

    # get top and bottom two predictions for every class (total 12 sequences = one synthesis)
    selections = []
    for class_name in ('hel', 'asy', 'nCM'):
        members = d.loc[d['Class'] == class_name]
        selections.append(members.sort_values('Prediction', ascending=False)[:2])
        selections.append(members.sort_values('Prediction', ascending=True)[:2])
    synth_sele = pd.concat(selections)

    # writing output
    print("Saving files to output directory...")
    # One timestamp for all files of this run, so they cannot straddle a
    # minute boundary and end up with different prefixes.
    stamp = datetime.now().strftime("%Y-%m-%d_%H-%M")
    synth_sele.to_csv(os.path.join(outfolder, stamp + 'synthesis_selection.csv'))
    d.to_csv(os.path.join(outfolder, 'library_pred.csv'))

    # saving scaler and classifier to pickle file for later usage
    # Binary mode is required for pickles; the ``with`` blocks close the
    # handles. The paths now include the folder separator that was missing.
    # NOTE(review): ``sclr`` is the output MinMaxScaler, last fitted on
    # RF_dist. If the pickle is meant for scaling new descriptors, the
    # StandardScaler ``scaler`` is probably the object to save -- confirm.
    with open(os.path.join(outfolder, stamp + '-scaler.p'), 'wb') as fh:
        pickle.dump(sclr, fh)
    with open(os.path.join(outfolder, stamp + '-classifier.p'), 'wb') as fh:
        pickle.dump(clf, fh)

    print("Total runtime: %.1f s\n" % (time.time() - globstart))
    print("\nALL DONE SUCCESSFULLY")
    print("Look for your results file in %s\nAnd maybe save this terminal output to a logfile ;-)" % outfolder)
Esempio n. 23
0
def preprocess_data(data_params):
    """Build scaled, windowed train/test arrays of daily mean prices.

    Reads the CSV at ``data_params["dataset_path"]``, averages
    ``Weighted_Price`` per calendar day (dates derived from the ``Timestamp``
    column, parsed as seconds), turns the daily series into supervised
    windows via ``to_supervised`` and differences the inputs via
    ``difference``. A ``MinMaxScaler`` with range (-1, 1) is fitted on the
    raw training prices and applied to ``train_x``, ``train_y`` and
    ``test_x``; ``test_y`` is only reshaped.

    Side effect: writes ``training_set_size`` and ``validation_set_size``
    into ``data_params``.

    :param data_params: dict providing ``look_back``, ``train_set_fraction``,
        ``dataset_path``, ``input_num_features`` and ``output_num_features``.
    :return: tuple ``(raw_values, train_x, train_y, test_x, test_y, scaler)``.
    """

    # *************** params ******************
    look_back = data_params['look_back']
    train_set_fraction = data_params['train_set_fraction']  # 0.75
    dataset_path = data_params["dataset_path"]
    # (input feature count, output feature count)
    num_features = data_params['input_num_features'], data_params[
        'output_num_features']

    data = pd.read_csv(dataset_path)
    # print(data.isnull().values.any())
    # print(data.head(10))

    # Collapse the records to one mean price per calendar day.
    data['date'] = pd.to_datetime(data['Timestamp'], unit='s').dt.date
    group = data.groupby('date')
    daily_price = group['Weighted_Price'].mean()

    print(daily_price.head())
    # print(daily_price.tail())
    print(str(len(daily_price.index)))
    print(daily_price.index[0])

    num_samples = len(daily_price.index)

    # The first train_set_fraction of the days is the training set; the
    # remainder is recorded as the validation set size.
    train_start_idx = 0
    train_end_idx = int(train_set_fraction * num_samples)
    data_params['training_set_size'] = train_end_idx - train_start_idx
    data_params[
        'validation_set_size'] = num_samples - data_params['training_set_size']

    # new logic
    raw_values = daily_price.values
    train_set = raw_values[train_start_idx:train_end_idx]

    daily_price_x, daily_price_y = to_supervised(raw_values, look_back,
                                                 num_features)
    print(daily_price_y.shape)
    daily_price_x = difference(daily_price_x,
                               look_back)  # leaving daily_price_y raw
    train_x = daily_price_x[train_start_idx:train_end_idx]
    train_y = daily_price_y[train_start_idx:train_end_idx]
    # NOTE(review): train_Y (capital Y) is never used afterwards -- possibly a
    # typo for train_y, or dead code; confirm the intent.
    train_Y = difference(train_y, look_back)
    test_x = daily_price_x[train_end_idx:]
    test_y = daily_price_y[train_end_idx:]

    print(train_x.shape, test_x.shape)

    scaler = MinMaxScaler(feature_range=(-1, 1))  # feature_range=(-1,1)
    print(train_x.shape, train_y.shape)
    # The scaler is fitted on the raw training prices as a single column.
    # NOTE(review): train_x/test_x hold differenced values while the scaler
    # was fitted on raw prices -- confirm this is intended.
    train_set = np.reshape(train_set, (max(train_set.shape), 1))
    train_set_scaled = scaler.fit_transform(train_set)  # only the fit matters; the result is unused
    train_x = np.reshape(train_x, (max(train_x.shape), 1))
    train_x = scaler.transform(
        train_x)  # scale the training inputs
    train_y = np.reshape(train_y, (max(train_y.shape), 1))
    train_y = scaler.transform(
        train_y)  # scale train's labels, required for loss calculations
    test_x = np.reshape(test_x, (max(test_x.shape), 1))
    test_x = scaler.transform(test_x)  # test inputs are scaled; test labels (test_y) are left unscaled

    # Restore the (samples, look_back, features) layout.
    train_x = train_x.reshape([max(train_x.shape), look_back, num_features[0]])
    test_x = test_x.reshape([max(test_x.shape), look_back, num_features[0]])
    train_y = train_y.reshape([max(train_y.shape), look_back, num_features[1]])
    test_y = test_y.reshape([max(test_y.shape), look_back, num_features[1]])

    return raw_values, train_x, train_y, test_x, test_y, scaler
Esempio n. 24
0
class DataLoader():
    """Load a price CSV and serve sliding windows for the LSTM model.

    After construction the instance exposes:

    * ``data_train`` / ``data_test`` -- 2-D arrays of the selected columns
      (``data_test`` is empty when no split was requested),
    * ``len_train`` / ``len_test`` -- their row counts,
    * ``label_col_indx`` -- position of the label column inside ``cols``,
    * ``scaler`` -- the fitted ``MinMaxScaler`` (only when ``MinMax`` is truthy),
    * ``w_normalisation_p0_train`` / ``w_normalisation_p0_test`` -- the raw
      first row of every window produced so far.
    """
    def __init__(self,
                 path,
                 split,
                 cols,
                 label_col,
                 MinMax,
                 start_from=None,
                 end=None,
                 returns=True):
        """Read ``path`` and prepare the train/test arrays.

        :param path: CSV file; must contain a ``Date`` column (and
            ``Adj Close`` when ``returns`` is true).
        :param split: fraction of rows used for training, or ``None`` to use
            every row for training.
        :param cols: list of column names to keep.
        :param label_col: name of the column (within ``cols``) used as label.
        :param MinMax: when truthy, fit a ``MinMaxScaler`` on the training
            rows and apply it to both partitions.
        :param start_from: keep only rows with ``Date`` strictly after this.
        :param end: keep only rows with ``Date`` strictly before this.
        :param returns: when true, add a ``log_ret`` column of log returns of
            ``Adj Close`` and drop the first row (which has no return).
        """
        dataframe = pd.read_csv(path)
        dataframe = dataframe.dropna(axis=0)
        print(dataframe.isnull().sum())

        if start_from is not None:
            dataframe.Date = pd.to_datetime(dataframe.Date)
            start = pd.to_datetime(start_from)
            dataframe = dataframe.loc[dataframe.Date > start]

        if end is not None:
            dataframe.Date = pd.to_datetime(dataframe.Date)
            end = pd.to_datetime(end)
            dataframe = dataframe.loc[dataframe.Date < end]

        # NOTE(review): when ``returns`` is true the first data row is dropped
        # below but ``self.dates`` is not, so dates are offset by one row
        # relative to ``data_train``. Kept as-is for backward compatibility.
        self.dates = dataframe['Date']

        if returns:
            # ``assign`` builds a new frame, avoiding a write into a slice.
            dataframe = dataframe.assign(
                log_ret=np.log(dataframe['Adj Close'] /
                               dataframe['Adj Close'].shift(1)))
            dataframe = dataframe.iloc[1:]

        dataframe = dataframe.get(cols)
        values = dataframe.values
        if split is not None:
            i_split = int(len(dataframe) * split)
            print(self.dates.values[i_split])
            self.data_train = values[:i_split]
            self.data_test = values[i_split:]
        else:
            # No split requested: everything is training data. Keep an empty
            # test partition so the test-side attributes always exist.
            self.data_train = values
            self.data_test = values[:0]

        self.len_train = len(self.data_train)
        self.len_test = len(self.data_test)

        self.label_col_indx = (dataframe.columns.get_loc(label_col)
                               )  # Get index of label column
        if MinMax:
            self.scaler = MinMaxScaler()
            self.data_train = self.scaler.fit_transform(self.data_train)
            # The scaler rejects empty input, so only transform real test rows.
            if self.len_test:
                self.data_test = self.scaler.transform(self.data_test)

        self.w_normalisation_p0_train = []
        self.w_normalisation_p0_test = []

    def get_train_data(self, seq_len, normalise, num_forward=1):
        '''
        Build every training window.

        seq_len: number of rows in each input sequence.
        normalise: when truthy, each window is rescaled relative to its
            first row (see ``normalise_windows``).
        num_forward: how many rows after the input sequence the label row is.
        Returns ``(x, y)`` as numpy arrays.
        '''
        return self._build_windows('train', seq_len, normalise, num_forward)

    def get_test_data(self, seq_len, normalise, num_forward=1):
        '''
        Build every test window; same contract as ``get_train_data``.

        Returns two empty arrays when there is no test partition.
        '''
        return self._build_windows('test', seq_len, normalise, num_forward)

    def _build_windows(self, split, seq_len, normalise, num_forward):
        """Collect all (x, y) windows of one partition and record first rows."""
        if split == 'train':
            n_rows, first_rows = self.len_train, self.w_normalisation_p0_train
        else:
            n_rows, first_rows = self.len_test, self.w_normalisation_p0_test

        seq_plus_forward = seq_len + num_forward
        data_x = []
        data_y = []
        for i in range(n_rows - seq_plus_forward):
            x, y, first_row = self._next_window(i, seq_plus_forward, split,
                                                normalise, num_forward)
            first_rows.append(first_row)
            data_x.append(x)
            data_y.append(y)
        return np.array(data_x), np.array(data_y)

    def _next_window(self, i, seq_len, split, normalise, num_forward):
        """Generates the next data window from the given index location i

        ``seq_len`` here is the full window length (inputs plus look-ahead).
        Returns ``(x, y, first_row)`` where ``first_row`` is the window's raw
        (un-normalised) first row.
        """
        if split == 'train':
            data = self.data_train
        elif split == 'test':
            data = self.data_test
        else:
            raise ValueError(
                "split must be 'train' or 'test', got %r" % (split, ))

        window = data[i:i + seq_len]
        first_row = window[0, :]
        if normalise:
            window = self.normalise_windows(window, single_window=True)[0]
        x = window[:seq_len - num_forward]
        y = window[-1, [self.label_col_indx]]  # list index keeps y 1-D
        return x, y, first_row

    def normalise_windows(self, window_data, single_window=False):
        '''Normalise window with a base value of zero

        Every column is divided by its value in the window's first row and
        1 is subtracted, so the first row becomes all zeros.

        Raises ZeroDivisionError when a first-row value is 0.
        '''
        windows = [window_data] if single_window else window_data
        normalised_data = []
        for window in windows:
            window = np.asarray(window, dtype=float)
            base = window[0]
            if np.any(base == 0):
                # Fail loudly instead of silently producing inf/nan.
                raise ZeroDivisionError('float division by zero')
            normalised_data.append(window / base - 1)
        return np.array(normalised_data)
Esempio n. 25
0
class LSTM():
    """Stacked-LSTM forecaster trained on ``Dados/new_dataset.csv``.

    Constructing an instance loads the CSV, scales it to [-1, 1], builds the
    sliding-window training set, builds the Keras model and fits it.
    """
    def __init__(self):
        self.df = pd.read_csv('Dados/new_dataset.csv')
        self.NormalizeData()
        self.timesteps = 15
        self.nr_parametos = 2
        self.PrepareData(self.timesteps)
        self.Build(self.timesteps, self.nr_parametos)
        self.Fit()

    def NormalizeData(self):
        """Fit a [-1, 1] MinMaxScaler on the whole frame; store the result."""
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        self.normalized = self.scaler.fit_transform(self.df)

    def Denormalize(self, dfNormalized):
        """Placeholder; not implemented."""
        pass

    def PrepareData(self, timesteps):
        """Build the supervised windows ``self.X`` and labels ``self.Y``.

        Each sample takes ``timesteps`` consecutive rows of columns 1-2 as
        input and column 1 of the following row as label.
        """
        n_rows = len(self.normalized)
        windows = []
        labels = []
        # NOTE(review): the upper bound leaves the very last row unused as a
        # label; this matches the existing behaviour -- confirm it is intended.
        for start in range(n_rows - timesteps - 1):
            stop = start + timesteps
            windows.append(self.normalized[start:stop, 1:3])
            labels.append(self.normalized[stop:stop + 1, 1:2])
        self.X = np.array(windows)
        self.Y = np.array(labels)

    def Build(self, janela, nmr_parametros):
        """Create the network: three stacked LSTMs, dropout, two dense layers.

        :param janela: number of timesteps per input window.
        :param nmr_parametros: number of features per timestep.
        """
        self.model = keras.Sequential()
        self.model.add(
            keras.layers.LSTM(32,
                              input_shape=(janela, nmr_parametros),
                              return_sequences=True))
        self.model.add(keras.layers.LSTM(64, return_sequences=True))
        self.model.add(keras.layers.LSTM(128, return_sequences=False))
        self.model.add(keras.layers.Dropout(0.2))
        self.model.add(
            keras.layers.Dense(32,
                               activation="relu",
                               kernel_initializer="uniform"))
        self.model.add(keras.layers.Dense(1, activation="linear"))

    def RMSE(self, y_true, y_pred):
        """Root-mean-squared error expressed with Keras backend ops."""
        return keras.backend.sqrt(
            keras.backend.mean(keras.backend.square(y_pred - y_true)))

    def Fit(self):
        """Compile the model, load weights from ``model.h5`` and train."""
        self.model.compile(loss=self.RMSE,
                           optimizer=keras.optimizers.Adam(),
                           metrics=['mae', self.RMSE])
        self.model.load_weights("model.h5")
        self.history = self.model.fit(x=self.X,
                                      y=self.Y,
                                      epochs=20,
                                      shuffle=False)
        #self.model.save_weights("model.h5")

    def Predict(self, data):
        """Return the model's prediction for ``data``."""
        result = self.model.predict(data, verbose=True)
        return result

    def forecast(self):
        """Roll the model forward 50 steps and plot the de-normalised result.

        Each step feeds the last ``timesteps`` rows to the model, adds a small
        random perturbation to the prediction, converts it back to the
        original scale and appends it to the input window.
        """
        timesteps = self.timesteps
        multisteps = 50
        # The scaler was fitted on every column, so inverse_transform needs a
        # row of that same width.
        n_columns = self.normalized.shape[1]

        inp = np.asarray(self.normalized)[-timesteps:, 1:3].astype('float32')
        predictions = list()

        for _ in range(multisteps):
            inp = inp.reshape(1, timesteps, 2)

            taxa_Erro = random.uniform(-0.005, 0.005)

            # The model returns a (1, 1) array; extract the scalar explicitly
            # rather than relying on implicit array-to-scalar conversion.
            yhat = float(np.ravel(self.Predict(inp) + taxa_Erro)[0])

            # Only columns 1 and 2 matter; the rest are -1 placeholders.
            Denormalized = np.full((1, n_columns), -1.0)
            Denormalized[0, 1] = yhat
            Denormalized[0, 2] = inp[0][inp.shape[1] - 1][1]
            value = self.scaler.inverse_transform(Denormalized)
            predictions.append(value[0][1])

            # NOTE(review): the second feature of the new row is taken from
            # position ``timesteps - 7`` of the current window -- presumably a
            # 7-step lag; confirm.
            newCase = np.array((yhat, inp[0][inp.shape[1] - 7][1]))
            inp = np.vstack((inp[0], newCase))[-timesteps:, :]

        self.PredictionGraph(predictions)

    def PredictionGraph(self, prediction):
        """Plot the real ``Total_Cases`` series followed by ``prediction``.

        The figure is saved to ``static/lstm.png``.
        """
        fig, ax = plt.subplots(figsize=(20, 10))
        fig.suptitle('Previsao Casos ',
                     fontweight='bold',
                     fontsize=30,
                     color='#0c3c6e')
        plt.xlabel('Dia', fontsize=20)
        plt.ylabel('Previsão', fontsize=20)
        ax.plot(np.arange(len(self.df)), (self.df['Total_Cases']),
                label='Days Gone with real data')
        ax.plot(np.arange(len(self.df),
                          len(self.df) + len(prediction)), (prediction),
                label='50 days LSTM prediction')
        ax.legend(fancybox=True,
                  framealpha=1,
                  shadow=True,
                  borderpad=1,
                  fontsize='15')
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        ax.grid()
        fig.savefig('static/lstm.png')
Esempio n. 26
0
        similar_words.append(
            cosine_value(doc_lines[0], doc_lines[i], doc_lines))
        sen_freq.append(calculate_sentence_freq(doc_lines[i], vocab))
        keywords.append(calculate_top_words(top_words, doc_lines[i]))

    for i in range(len(doc_lines)):
        features.append([
            lengths[i], positions[i], similar_words[i], degrees[i],
            sen_freq[i], keywords[i]
        ])

# Convert the collected per-sentence feature rows and labels to arrays.
features = numpy.array(features)
target_values = numpy.array(target_values)

# Rescale every feature column with MinMaxScaler's default range.
# NOTE(review): the scaler is fitted on all rows before any cross-validation
# split, so fold statistics leak across folds -- confirm this is acceptable.
scalar = MinMaxScaler()
features = scalar.fit_transform(features)

# Disabled single-run training kept for reference:
# model =create_model()
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.fit(features,target_values, batch_size=10, epochs=1000)
# model.save('model5.h5')
# seed =7
# 3-fold shuffled cross-validation with a fixed seed for reproducibility.
kfold = model_selection.KFold(n_splits=3, shuffle=True, random_state=42)
# cvscores = []

# evaluate the model
# scores = model.evaluate(features[test], target_values[test], verbose=2)
# print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
# cvscores.append(scores[1] * 100)

# create model
Esempio n. 27
0
        min_exercise_stock_options = min(min_exercise_stock_options, feature[1])
    if feature[0] != 0:
        max_salary = max(max_salary, feature[0])
        min_salary = min(min_salary, feature[0])

# Report the extremes gathered above (Python 2 print statements).
print 'Max value of "exercised_stock_options": ', max_exercise_stock_options
print 'Min value of "exercised_stock_options": ', min_exercise_stock_options
print 'Max value of "salary": ', max_salary
print 'Min value of "salary": ', min_salary

### apply feature scaling
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Fit the scaler on the finance features and keep the rescaled copy.
scaled_features = scaler.fit_transform(finance_features)
# Show where one hypothetical sample lands on the fitted scale.
# presumably the column order is [salary, exercised_stock_options] -- verify
# against how finance_features is built.
print 'Scaled salary of $200,000 & stock options of $1,000,000: ', \
    scaler.transform([[200000., 1000000.]])

### rename the "name" parameter when you change the number of features
### so that the figure gets saved to a different file
# NOTE(review): the plot receives the unscaled finance_features, not
# scaled_features -- confirm that is intended.
try:
    Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2)
except NameError:
    # Raised when ``pred`` (or another name used above) has not been defined.
    print "no predictions object named pred found, no clusters to plot"



### rename the "name" parameter when you change the number of features
### so that the figure gets saved to a different file
try:
Esempio n. 28
0
# Keep only the "Meter flow rate" readings and expose them as GRN_FLOW.
GRN_MTR = GRN_MTR.loc[GRN_MTR['TI_TAG_DESCRIPTION'] == "Meter flow rate"]
GRN_MTR = GRN_MTR[["TD_TAG_VALUE"]]
GRN_MTR.columns = ["GRN_FLOW"]

### Moving average
DON_MTR_12 = mov_avg(DON_MTR_12, "10s", "1min")
s450_MTR = mov_avg(s450_MTR, "10s", "1min")
GRN_MTR = mov_avg(GRN_MTR, "10s", "1min")

# Align the three meters on a common index; missing readings become 0.
both = DON_MTR_12.join(s450_MTR).fillna(0)
both = both.join(GRN_MTR).fillna(0)
values = both.values
values = values.astype('float32')
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)
# Persist the scaler only AFTER fitting; dumping it earlier would save an
# unfitted object that cannot transform or inverse-transform anything.
joblib.dump(scaler, "/data/scaler.save")

# Frame as supervised learning: 180 lagged steps in, 1 step out.
reframed = series_to_supervised(scaled, 180, 1)

# Drop every lagged copy of the third variable (var3(t-1) .. var3(t-180)).
for i in range(1, 181):
    behind = str(i)
    reframed = reframed.drop('var3(t-' + behind + ')', axis=1)

values = reframed.values
# Chronological 80/20 split (no shuffling).
n_train_hours = int(len(values) * 0.80)
train = values[:n_train_hours, :]
test = values[n_train_hours:, :]
# split into input and outputs
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]
# reshape input to be 3D [samples, timesteps, features]
Esempio n. 29
0
class WGAN(object):
    """Wasserstein GAN that generates KDD99 attack records.

    Construction applies defaults, overrides them from keyword arguments,
    pulls and preprocesses the data (``setup``) and builds the critic,
    generator and combined model (``build``). ``train`` runs the training
    loop and records the outcome through ``SQLConnector``.
    """

    # Keyword arguments accepted by ``_args``; any other key is ignored.
    _OVERRIDABLE = (
        'attack_type',
        'max_epochs',
        'batch_size',
        'sample_size',
        'optimizer_learning_rate',
        'critic',
        'generator_layers',
        'generator_alpha',
        'generator_momentum',
    )

    def __init__(self, **kwargs):
        """ Constructor """
        self._defaults()
        self._args(kwargs)  # override defaults with args passed
        self.setup()
        self.build()

    def _defaults(self):
        """ Sets default variable values """
        self.attack_type = None
        self.critic = None
        self.generator = None
        self.gan = None
        self.evaluator = None

        # saved_states can be used to save states of a GAN, say
        # 5 of them so that the best can be saved when breaking out.
        self.saved_states = []
        self.confusion_matrix = None
        self.classification_report = None
        self.scaler = None

        # NOTE(review): optimizer_learning_rate is stored (and overridable)
        # but the optimizer below is built with a fixed rate and never
        # rebuilt, so the setting currently has no effect.
        self.optimizer_learning_rate = 0.001
        self.optimizer = RMSprop(lr=0.00005)

        self.max_epochs = 7000
        self.batch_size = 255
        self.sample_size = 500
        self.clip_value = 0.01

        self.valid = None
        self.fake = None
        self.X_train = None

        self.generator_alpha = 0.1
        self.generator_momentum = 0.0
        self.generator_layers = [8, 16, 32]

        self.save_file = None

    def _args(self, kwargs):
        """ kwargs handler: copy recognised keys onto the instance. """
        for key, value in kwargs.items():
            if key in self._OVERRIDABLE:
                setattr(self, key, value)

    def setup(self):
        """ Setups the GAN: pulls, encodes and scales the training data and
        loads/splits the evaluator dataset. """
        # TODO new method  called from init opt passed

        print("Attack type: " + self.attack_type)

        conn = SQLConnector()
        data = conn.pull_kdd99(attack=self.attack_type, num=5000)
        dataframe = pd.DataFrame.from_records(
            data=data, columns=conn.pull_kdd99_columns(allQ=True))

        # ==========
        # ENCODING
        # ==========
        # https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn

        d = defaultdict(LabelEncoder)

        # Splitting the data from features and lablels. Want labels to be consistent with evaluator encoding, so
        # we use the utils attack_to_num function
        features = dataframe.iloc[:, :41]
        # Copy so the in-place label rewrite below does not write into a
        # slice of ``dataframe``.
        attack_labels = dataframe.iloc[:, 41:].copy()

        for i in range(0, attack_labels.size):
            attack_labels.at[i, 'attack_type'] = util.attacks_to_num(
                attack_labels.at[i, 'attack_type'])

        features = features.apply(
            lambda x: d[x.name].fit_transform(x))  # fit is encoded dataframe

        # feature scaling, reccomended from github implementation
        self.scaler = MinMaxScaler(feature_range=(-1, 1))
        scaled_features = self.scaler.fit_transform(features.astype(float))
        scaled_df = pd.DataFrame(data=scaled_features)

        # Join the seperately encoded sections back into one dataframe
        dataframe = scaled_df.join(attack_labels)
        dataset = dataframe.values  # transform to ndarray
        print(dataset)

        # TODO: Feature scaling? May be necessary. Has to be on a per-feature basis?

        # Splitting up the evaluation dataset. Should maybe be moved?
        eval_dataset = pd.read_csv('PortsweepAndNonportsweep.csv', header=None)
        eval_dataset = eval_dataset.values

        self.eval_dataset_X = eval_dataset[:, 0:41].astype(int)
        self.eval_dataset_Y = eval_dataset[:, 41]

        # Carve the first 5% off as validation data ...
        validationToTrainRatio = 0.05
        validationSize = int(validationToTrainRatio * len(self.eval_dataset_X))
        self.eval_validation_data = self.eval_dataset_X[:validationSize]
        self.eval_validation_labels = self.eval_dataset_Y[:validationSize]
        self.eval_dataset_X = self.eval_dataset_X[validationSize:]
        self.eval_dataset_Y = self.eval_dataset_Y[validationSize:]

        # ... then 5% of the remainder as test data.
        testToTrainRatio = 0.05
        testSize = int(testToTrainRatio * len(self.eval_dataset_X))
        self.eval_test_data = self.eval_dataset_X[:testSize]
        self.eval_test_labels = self.eval_dataset_Y[:testSize]
        self.eval_dataset_X = self.eval_dataset_X[testSize:]
        self.eval_dataset_Y = self.eval_dataset_Y[testSize:]

        # to visually judge encoded dataset
        print("Real encoded " + self.attack_type + " attacks:")
        print(dataset[:1])

        # Set X as our input data (the label column is not used for training)
        self.X_train = dataset[:, 0:41].astype(float)

        # labels for data. 1 for valid attacks, 0 for fake (generated) attacks
        self.valid = np.ones((self.batch_size, 1))
        self.fake = np.zeros((self.batch_size, 1))

    def build(self):
        """ Build the GAN """
        # build the discriminator portion
        eval_args = {
            'train_data': self.eval_dataset_X,
            'train_labels': self.eval_dataset_Y,
            'validation_data': self.eval_validation_data,
            'validation_labels': self.eval_validation_labels,
            'test_data': self.eval_test_data,
            'test_labels': self.eval_test_labels,
        }

        # Doing this so we can read the data from the evaluator object
        evaluator_object = Evaluator(**eval_args)
        self.evaluator = evaluator_object.get_model()

        print("Evaluator metrics after training:")
        print(evaluator_object.performance)
        # The critic mirrors the generator: same layer sizes, reversed.
        critic_layers = self.generator_layers.copy()
        critic_layers.reverse()
        print(critic_layers)
        critic_args = {
            'layers': critic_layers,
            'alpha': self.generator_alpha,
            'optimizer': self.optimizer,
        }
        self.critic = Critic(
            **critic_args).get_model()  #self.discriminator_layers
        self.critic.compile(loss=self.wasserstein_loss,
                            optimizer=self.optimizer,
                            metrics=['accuracy'])

        # build the generator portion
        gen_args = {
            'layers': self.generator_layers,
            'alpha': self.generator_alpha,
        }
        self.generator = Generator(**gen_args).get_model()  #**gen_args

        # input and output of our combined model
        z = Input(shape=(41, ))
        attack = self.generator(z)
        validity = self.critic(attack)

        # build combined model from generator and discriminator
        self.gan = Model(z, validity)
        self.gan.compile(loss=self.wasserstein_loss, optimizer=self.optimizer)

    def train(self):
        """ Trains the GAN system """
        # break condition for training (when diverging)
        # NOTE(review): neither counter is updated inside the loop, so no
        # early break happens; they only feed the progress print.
        loss_increase_count = 0
        prev_g_loss = 0

        conn = SQLConnector()

        # NOTE(review): every epoch trains on the same first ``batch_size``
        # rows; the random batch selection is commented out below.
        idx = np.arange(self.batch_size)

        for epoch in range(self.max_epochs):
            #selecting batch_size random attacks from our training data
            #idx = np.random.randint(0, X_train.shape[0], batch_size)
            attacks = self.X_train[idx]

            # generate a matrix of noise vectors
            noise = np.random.normal(0, 1, (self.batch_size, 41))

            # create an array of generated attacks
            gen_attacks = self.generator.predict(noise)

            # loss functions, based on what metrics we specify at model compile time
            c_loss_real = self.critic.train_on_batch(attacks, self.valid)
            c_loss_fake = self.critic.train_on_batch(gen_attacks, self.fake)
            d_loss = 0.5 * np.add(c_loss_real, c_loss_fake)

            # Clip the critic's weights after each update.
            for l in self.critic.layers:
                weights = l.get_weights()
                weights = [
                    np.clip(w, -self.clip_value, self.clip_value)
                    for w in weights
                ]
                l.set_weights(weights)

            # generator loss function
            g_loss = self.gan.train_on_batch(noise, self.valid)

            if epoch % 500 == 0:
                print(
                    "%d [D loss: %f, acc.: %.2f%%] [G loss: %f] [Loss change: %.3f, Loss increases: %.0f]"
                    % (epoch, d_loss[0], 100 * d_loss[1], g_loss,
                       g_loss - prev_g_loss, loss_increase_count))

        # Map the last generated batch back to the original feature scale and
        # ask the evaluator to label it.
        gen_attacks = self.scaler.inverse_transform(gen_attacks)
        predicted_gen_attack_labels = self.evaluator.predict(
            gen_attacks).transpose().astype(int)
        gen_attack_labels = np.full(predicted_gen_attack_labels.shape, 1)

        print("Generated attack labels: ")
        print(gen_attack_labels)
        print("Predicted labels of generated attacks: ")
        print(predicted_gen_attack_labels)

        right = (predicted_gen_attack_labels == 1).sum()
        wrong = (predicted_gen_attack_labels != 1).sum()

        accuracy = (right / float(right + wrong))

        print("5 generated attacks: ")
        print(gen_attacks[:5, :])
        print()
        print("Accuracy of evaluator on generated data: %.4f " % accuracy)
        if accuracy > .50:
            conn.write_gens(gen_attacks, util.attacks_to_num(self.attack_type))

        # Serialise all generator layer sizes, however many there are.
        layersstr = ",".join(str(size) for size in self.generator_layers)
        attack_num = util.attacks_to_num(self.attack_type)

        conn.write_hypers(layerstr=layersstr,
                          attack_encoded=attack_num,
                          accuracy=accuracy)

        # TODO: Add foreign key for attack type in hypers table

    def test(self):
        """ A GAN should know how to test itself and save its results into a confusion matrix. """
        # TODO
        pass

    # This functions should only be passed the FEATURES, we don't want to scale the labels
    def feature_scale(self, dataset):
        """Standardise every column of ``dataset`` in place (zero mean,
        unit standard deviation). Returns ``None``.

        Constant columns (standard deviation 0) are only centred, which
        leaves them at 0 instead of producing NaN/inf from a division by
        zero. Pass a float array: values are written back into ``dataset``.
        """
        for i in range(dataset.shape[1]):
            column = dataset[:, i]
            col_avg = np.mean(column)
            col_sd = np.std(column)
            if col_sd == 0:
                col_sd = 1.0
            dataset[:, i] = (column - col_avg) / col_sd

    ##########################################################################################
    # Uses Sklearn's confusion matrix maker
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
    ##########################################################################################
    def make_confusion_matrix(self, y_true, y_pred):
        """Store the confusion matrix and classification report on self."""
        self.confusion_matrix = confusion_matrix(y_true, y_pred)
        self.classification_report = classification_report(y_true, y_pred)

    def wasserstein_loss(self, y_true, y_pred):
        """Loss used by critic and combined model: mean of y_true * y_pred."""
        return K.mean(y_true * y_pred)

    ################################################################################
    # Use these to save instances of a trained network with some desirable settings
    # Suggestion to save and load from the object's __dict__ taken from:
    # https://stackoverflow.com/questions/2709800/how-to-pickle-yourself
    ################################################################################
    def save_this(self, filename):
        '''
            Provide a basic filename to pickle this object for recovery later.
            '.pickle' is appended and the name is remembered in
            ``self.save_file``. An existing file of that name is overwritten.
        '''
        self.save_file = filename + '.pickle'
        with open(self.save_file, 'wb') as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)

    def load_state_from_file(self, filename=None):
        """Replace this object's state with the one pickled in ``filename``.

        Falls back to ``self.save_file`` when no filename is given; prints
        an error and returns if neither is available.
        """
        if not filename:
            if self.save_file:
                filename = self.save_file
            else:
                print("Error: No savefile for this object. \
                        \n Using save_this(filename) will set the save filename."
                      )
                return
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load files this program wrote itself.
        with open(filename, 'rb') as f:
            tmp_dict = pickle.load(f)
        self.__dict__.update(tmp_dict.__dict__)
Esempio n. 30
0
# Shrink column dtypes via the project helper before modelling.
df = downcast_dtypes(df)
# column names
names = list(df.columns)

# training data
train_df = df.iloc[0:test_index]
# testing data, reaching `timesteps` rows back so the first test window is complete
test_df = df.iloc[test_index - timesteps:]
print(train_df.info())

# reset the index to drop the dates
# NOTE(review): train_df/test_df were sliced before this reset, so only `df`
# itself is affected -- confirm that is intended.
df.reset_index(drop=True, inplace=True)
# normalise the data
sc = MinMaxScaler(feature_range=(0, 1))
# training: fit the scaler here only
train_df = sc.fit_transform(train_df)
# testing: reuse the training-set scaling
test_df = sc.transform(test_df)
# reshape into the windowed form expected by the LSTM cells
x_train, y_train = lstm_preparation(train_df, timesteps=timesteps)
x_test, y_test = lstm_preparation(test_df, timesteps=timesteps)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

# model
lstm = tf.keras.Sequential()
# The input shape is taken from axes 1 and 2 of x_train.
lstm.add(
    tf.keras.layers.LSTM(units=512,
                         input_shape=(np.array(x_train).shape[1],
                                      np.array(x_train).shape[2])))
# print(total_lobster_df['Total Lobster'][5110])
print(Lobster_df['Total Lobster'][167])

# Chronological split: first 676 rows for training, last 52 for testing.
train_set = Lobster_df.head(676)
test_set = Lobster_df.tail(52)

# train_set = total_lobster_df.head(4745)
# test_set = total_lobster_df.tail(365)

# Bare expression: only displays something in an interactive session.
train_set.tail()

from sklearn.preprocessing import MinMaxScaler

# Fit on the training rows only, then apply the same scaling to the test rows.
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train_set)
test_scaled = scaler.transform(test_set)

print(train_scaled[:5])

# print(train_scaled)
# for i in train_scaled:
#   print(i)
# print(test_scaled)

#train_scaled
# NOTE(review): y gets every column except the last and x gets only the last
# column -- this looks inverted relative to the usual x/y naming; confirm.
y_train = train_scaled[:,:-1]
x_train = train_scaled[:, -1:]

#test scaled
y_test = test_scaled[:,:-1]
# Stock prices keras
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras import metrics
from sklearn.model_selection import train_test_split

dataset_train = pd.read_csv('NSE-TATAGLOBAL.csv')
# Keep a single column (position 5) as a 2-D array, as the scaler requires.
training_set = dataset_train.iloc[:, 5:6].values
# DataFrame copy of the same values; not used again in this section.
trainig_see = pd.DataFrame(training_set)

from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range=(0, 1))
training_set_scaled = sc.fit_transform(training_set)
'''
LSTMs expect our data to be in a specific format, usually a 3D array. We start 
by creating data in 60 timesteps and converting it into an array using NumPy. 
Next, we convert the data into a 3D dimension array with X_train samples, 60 
timestamps, and one feature at each step.
'''

X_train = []
y_train = []
# Each sample: the previous 60 scaled values as input, the current one as label.
# NOTE(review): 2035 is hard-coded, presumably the CSV's row count; a shorter
# file raises IndexError and a longer one is silently truncated -- confirm.
for i in range(60, 2035):
    X_train.append(training_set_scaled[i - 60:i, 0])
    y_train.append(training_set_scaled[i, 0])
X_train, y_train = np.array(X_train), np.array(y_train)

# Add the trailing feature axis: (samples, 60, 1).
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
                        'chg_per_svc',
                        'denied',
                        'psps_denied_services_cnt'
                        ], inplace=True,axis = 1)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(results_df_sample, y, stratify=y,test_size=0.20, random_state=123)

print(X_train.info())
print(X_test.info())

# Encode with WOEEncoder: fitted on the training split (using its labels),
# then applied unchanged to the test split.
WOE_encoder = WOEEncoder()
X_train_enc = WOE_encoder.fit_transform(X_train, y_train)
X_test_enc = WOE_encoder.transform(X_test)

# Min-max scale, again fitting on the training split only.
scaler = MinMaxScaler()
X_train_enc_scaled = pd.DataFrame(scaler.fit_transform(X_train_enc, y_train))
X_test_enc_scaled = pd.DataFrame(scaler.transform(X_test_enc))

# Seven candidate C values; gamma and kernel are fixed.
param_grid = {'C': [6,7,8,9,10,11,12], 'gamma': [1],'kernel': ['rbf']}

print(param_grid)

# NOTE(review): this rebinds the name SVC from the class to an instance, so
# the class cannot be instantiated again under this name afterwards.
SVC = SVC()
# Exhaustive grid search over param_grid (7 combinations) using 2-fold
# cross validation; n_jobs is not set here.

RFC_CV = GridSearchCV(estimator=SVC, param_grid=param_grid, cv= 2,verbose = 2)
RFC_CV.fit(X_train_enc_scaled,y_train)

print(RFC_CV.best_params_)
print(RFC_CV.best_score_)
Esempio n. 34
0
# Chronological split of the targets at num_train.
y_train = y_data[0:num_train]
y_test = y_data[num_train:]
# print(y_train)
#

# NOTE(review): shape[0] is the length of the first axis; if rows are
# observations this is the sample count, not the number of signals -- confirm
# which is intended.
num_x_signals = x_data.shape[0]
print(num_x_signals)

num_y_signals = y_data.shape[0]
print(num_y_signals)
# #
# # print("Min:",(min(x_train)))
# # print("Max:",(max(x_train)))
#
# Scale inputs: fit on the training part only, then reuse for the test part.
x_scaler = MinMaxScaler()
x_train_scaled = x_scaler.fit_transform(x_train)
# print(x_train_scaled)
print("Min:", np.min(x_train_scaled))
print("Max:", np.max(x_train_scaled))
x_test_scaled = x_scaler.transform(x_test)
#
# Scale targets with their own scaler; reshape(-1, 1) turns them into the
# single-column 2-D form the scaler requires.
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test.reshape(-1, 1))


#  print(x_train_scaled.shape)
# print(y_train_scaled.shape)
#
#
def batch_generator(batch_size, sequence_length):
Esempio n. 35
0
    "attack"
]
# Keep only the selected feature columns (plus the label).
df = df.filter(items=features)

#Split into X (matrix) and y (array)
dataset = df.values
X = dataset[:, 0:8]
#X = pd.DataFrame(X).fillna(0)
y = dataset[:, 8]

#Split training and test data
# NOTE(review): no random_state is given, so the split (and the reported
# accuracies) change from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

#Normalize data (fit on the training split only)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#Start algorithm
n_neighbors = 7
knn = KNeighborsClassifier(n_neighbors, weights='distance')
print("Starting algorithm")
knn.fit(X_train, y_train)
print('Accuracy of K-NN classifier on training set: {:.5f}'.format(
    knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.5f}'.format(
    knn.score(X_test, y_test)))

# Persist the fitted model; the context manager guarantees the file is
# flushed and closed even if pickling fails.
# NOTE(review): the fitted scaler is not saved alongside the model, so new
# data cannot be scaled consistently at prediction time.
with open('KNN_Model', 'wb') as model_file:
    pickle.dump(knn, model_file)

Esempio n. 36
0
 def scale(data):
     """Min-max scale the 'age', 'bmi' and 'avg_glucose_level' columns of ``data``.

     Each column is scaled on its own (the scaler is re-fitted per column) and
     written back into ``data``, which is also returned.
     """
     min_max = MinMaxScaler()
     numeric_columns = ['age', 'bmi', 'avg_glucose_level']
     for column in numeric_columns:
         # The scaler wants a 2-D input, hence the single-column reshape.
         as_column = data[column].values.reshape(-1, 1)
         data[column] = min_max.fit_transform(as_column)
     return data
Esempio n. 37
0
# Silence FutureWarning messages for the rest of the script.
warnings.filterwarnings("ignore", category=FutureWarning)



################################
# K-MEANS
################################

# Load the data set; the first CSV column becomes the index.
df = pd.read_csv("datasets/USArrests.csv", index_col=0)
# The bare expressions below only show output in an interactive session
# (notebook/REPL); run as a script they have no visible effect, except
# df.info(), which prints by itself.
df.head()
df.isnull().sum()
df.info()
df.describe().T

# Scale every column into [0, 1].
# NOTE: fit_transform returns the scaled values, so ``df`` no longer refers to
# the DataFrame that was read above.
sc = MinMaxScaler((0, 1))
df = sc.fit_transform(df)
df[0:5]

# Fit k-means with 4 clusters on the scaled data.
# NOTE(review): no random_state is passed, so results may differ between runs.
kmeans = KMeans(n_clusters=4)
k_fit = kmeans.fit(df)
k_fit

k_fit.n_clusters
k_fit.cluster_centers_

k_fit.labels_

df[0:5]

################################
# Visualization of the clusters
# First figure: the *_time values of the three simple classifiers
# (presumably measured run times -- they are computed outside this snippet).
plt.plot(NC_time, 'o', label='Nearest centroid')
plt.plot(GNB_time, 'o', label='Gaussian Naive Bayes')
plt.plot(DT_time, 'o', label='Decision tree')
plt.legend()

# Second group: the KNN variants and the two grid searches.
plt.plot(KNN_time_simple, 'o', label='K-nearest neighbors (simple)')
plt.plot(KNN_time_with_improvement,
         'o',
         label='K-nearest neighbors (with improvement)')
plt.plot(Grid_KNN_time, 'o', label='Grid search for KNN')
plt.plot(Grid_DT_time, 'o', label='Grid serach for Decision tree')
plt.legend()

tic = timeit.default_timer()
mms = MinMaxScaler()  # normalize the feature vectors
x_train_neural = mms.fit_transform(x_train)
# Apply the mapping learned from the training data. The original called
# fit_transform() here, which re-fitted the scaler on the test set and scaled
# train and test data with different min/max values.
x_test_neural = mms.transform(x_test)

# Three ReLU hidden layers of 32 units, dropout, then a 2-way softmax output.
model = Sequential([
    Dense(32, input_shape=(x_train_neural.shape[1], )),
    Activation('relu'),
    Dense(32),
    Activation('relu'),
    Dense(32),
    Activation('relu'),
    Dropout(0.25),  # Regularization
    Dense(2),
    Activation('softmax'),
])

model.compile(loss='binary_crossentropy',
    model.add(Dense(units=1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


# Script entry point: load one CSV column, scale it, split it chronologically
# into training and validation parts and build the model.
if __name__ == '__main__':

    # set random seed
    np.random.seed(seed)
    # import data: only column 1 is read; the last ``footer`` lines are skipped
    data = read_csv(filename, usecols=[1], engine='python', skipfooter=footer)
    dataset = data.values.astype('float32')

    # scale data with MinMaxScaler (default feature range)
    # NOTE(review): the scaler is fitted on the whole series before the split
    # below, so validation values influence the scaling -- confirm this is intended.
    scaler = MinMaxScaler()
    dataset = scaler.fit_transform(dataset)
    # first 67% of the rows train the model, the remainder validates it
    train_size = int(len(dataset) * 0.67)
    validation_size = len(dataset) - train_size
    train, validation = dataset[0:train_size, :], dataset[
        train_size:len(dataset), :]

    # create_dataset is defined elsewhere in the file; it returns an (X, y) pair
    X_train, y_train = create_dataset(train)
    X_validation, y_validation = create_dataset(validation)

    # transform data as [sample, time step, feature]; a single time step is used
    X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
    X_validation = np.reshape(
        X_validation, (X_validation.shape[0], 1, X_validation.shape[1]))

    # train model
    model = create_model()
Esempio n. 40
0
# plt.figure()
# fig1 = sns.boxplot(df['close'])
# fig1.set_title('Box plot of %s'%stock_code)
# plt.show()

# plt.figure()
# fig2 = sns.lineplot(df['date'], df['close'])
# fig2.set_title('Time series of %s'%stock_code)
# plt.show()

# print(np.array(df[['close','open']]).shape)
# exit()

# Data normalization: scale the closing price into [0, 1].
# NOTE(review): the scaler is fitted on the full series, i.e. before the
# train/validation split below -- confirm that this leakage is acceptable.
scaler = MinMaxScaler(feature_range=(0, 1))
df['scaled_close'] = scaler.fit_transform(np.array(df['close']).reshape(-1, 1))

# print(df['scaled_close'])

# Dataset split: rows dated before 2019-07-01 are used for training, the rest
# for validation.
split_date = datetime(year=2019, month=7, day=1)
df_train = df.loc[df['date'] < split_date]
# Build the re-indexed validation frame in a single expression. The original
# called reset_index(inplace=True) on the .loc selection, which mutates a
# derived frame and can trigger pandas' SettingWithCopyWarning.
df_val = df.loc[df['date'] >= split_date].reset_index(drop=True)

# print(df_train.shape, df_val.shape)
# exit()


def makeXy(df, time_steps):
    # 本函数用于生成训练模型的数组数据
Esempio n. 41
0
    x,y = build_timeseries(x, y_col_index)
    return trim_dataset(x), trim_dataset(y)

log.info('Loading and preprocessing...')
log.info('Loading data...')

# get_data is defined elsewhere; its result is split chronologically
# (shuffle=False keeps the row order).
df_data = get_data(data_sym, dates, data_cols=train_cols, sing=True)
df_train, df_test = train_test_split(df_data, train_size=train_size, test_size=test_size, shuffle=False)

log.info('Data loaded.')
print()
log.info('Preprocessing data...')

# Fit the scaler on the training columns only and reuse it for the test rows.
x = df_train.loc[:,train_cols].values
min_max_scaler = MinMaxScaler()
x_train = min_max_scaler.fit_transform(x)
x_test = min_max_scaler.transform(df_test.loc[:,train_cols])

# preprocess(x, y_col_index) is defined elsewhere; 3 is passed as the target
# column index.
x_t, y_t = preprocess(x_train, 3)

# The held-out data is halved: first half for validation, second half for testing.
# NOTE(review): np.split raises ValueError when the length is not divisible by 2
# -- confirm preprocess() always returns an even number of rows.
x_temp, y_temp = preprocess(x_test, 3)
x_val, x_test_t = np.split(x_temp, 2)
y_val, y_test_t = np.split(y_temp, 2)

log.info('Done preprocessing.')
print()

# NOTE(review): plain string concatenation -- model_dir must already end with a
# path separator for this to point inside the directory.
model_path = model_dir+model_name
if os.path.isfile(model_path):
    log.info('Loading model...')
    
# Flatten each image into one row of 1*270*300 values.
# NOTE(review): the row count 4000 is hard-coded for both sets; reshape raises
# if either array does not hold exactly 4000 * 81000 elements.
trainImages = trainImages.reshape(4000,1*270*300)
testImages = testImages.reshape(4000,1*270*300)

# Check that the image data sets are 2-dimensional
# (bare expressions: they only display a value in an interactive session)
testImages.shape
trainImages.shape

# Check the shape of the label data sets
testLabels.shape
trainLabels.shape

#----------------------------------------------------------------------------------------------------------------------

# Rescale the data with MinMaxScaler(): fitted on the training images, then
# applied unchanged to the test images
scaler = MinMaxScaler()
X_train = scaler.fit_transform(trainImages)
X_test = scaler.transform(testImages)

#----------------------------------------------------------------------------------------------------------------------

'''
 Los autovectores son las direcciones en las que la varianza de los datos es mayor.  
 Recordemos que, en teoría de probabilidad, la varianza  de una variable aleatoria es una medida de dispersión 
 (definida como la esperanza del cuadrado de la desviación de dicha variable respecto a su media). 
 Por tanto, las direcciones en las que la varianza es mayor,  representan la esencia principal de la información 
 contenida en el dataset, por eso se les llama componentes principales. Al igual que un autovector es una dirección, 
 el autovalor es un número, que representa el valor de la varianza sobre ese autovector. Por ello, para encontrar las 
 componentes principales que condensen esa esencia de la información del dataset, calcularemos primero la matriz de 
 covarianza, que nos da la medida de dispersión conjunta entre variables.

 Para ello, usaremos la función Covariance Matrix de la librería Numpy.
Esempio n. 43
-1
def test_min_max_scaler_zero_variance_features():
    """MinMaxScaler must handle constant (zero variance) columns on toy data."""
    fit_rows = [[0.,  1.,  0.5],
                [0.,  1., -0.1],
                [0.,  1.,  1.1]]

    unseen_rows = [[+0.,  2.,  0.5],
                   [-1.,  1.,  0.0],
                   [+0.,  1.,  1.5]]

    # Default feature_range: the two constant columns collapse to 0.
    unit_scaler = MinMaxScaler()
    assert_array_almost_equal(unit_scaler.fit_transform(fit_rows),
                              [[0.,  0.,  0.5],
                               [0.,  0.,  0.0],
                               [0.,  0.,  1.0]])

    # Rows not seen during fit may land outside the fitted range.
    assert_array_almost_equal(unit_scaler.transform(unseen_rows),
                              [[+0.,  1.,  0.500],
                               [-1.,  0.,  0.083],
                               [+0.,  0.,  1.333]],
                              decimal=2)

    # Non-default feature_range=(1, 2): constant columns collapse to 1.
    shifted_scaler = MinMaxScaler(feature_range=(1, 2))
    assert_array_almost_equal(shifted_scaler.fit_transform(fit_rows),
                              [[1.,  1.,  1.5],
                               [1.,  1.,  1.0],
                               [1.,  1.,  2.0]])
Esempio n. 44
-1
    def loaddataset(self,path,module):
        """Read a passenger CSV and store the engineered feature frame on ``self``.

        Parameters
        ----------
        path : path of the CSV file; it must contain the columns PassengerId,
            Pclass, Sex, Age, Embarked, Fare, SibSp and Parch (plus Survived
            when ``module`` is 'train').
        module : 'train' stores ``self.trainlabel`` and ``self.trainset``;
            'test' stores ``self.testset`` (which also keeps PassengerId).
            Any other value stores nothing.

        NOTE(review): the scalers are fitted on whichever file is loaded, so a
        'test' file is scaled with its own min/max rather than the training ones.
        """
        df = pd.read_csv(path)
        subdf = df[['PassengerId','Pclass','Sex','Age','Embarked','Fare','SibSp','Parch']]
        SibSp = subdf['SibSp']
        Parch = subdf['Parch']

        # Missing ages and fares are replaced by the column mean.
        Age = subdf['Age'].fillna(value=subdf.Age.mean())
        Fare = subdf['Fare'].fillna(value=subdf.Fare.mean())

        # One-hot encode the categorical columns.
        dummies_Sex = pd.get_dummies(subdf['Sex'], prefix='Sex')
        dummies_Embarked = pd.get_dummies(subdf['Embarked'], prefix='Embarked')
        dummies_Pclass = pd.get_dummies(subdf['Pclass'], prefix='Pclass')

        PassengerId = subdf['PassengerId']

        # Scale Age and Fare. MinMaxScaler requires a 2-D (n_samples, n_features)
        # input; the original passed the 1-D ``.values`` array, which current
        # scikit-learn rejects with a ValueError.
        scaler = MinMaxScaler()
        age_scaled = scaler.fit_transform(Age.values.reshape(-1, 1))
        fare_scaled = scaler.fit_transform(Fare.values.reshape(-1, 1))

        Age_Scaled = pd.DataFrame(age_scaled, columns=['Age_Scaled'])
        Fare_Scaled = pd.DataFrame(fare_scaled, columns=['Fare_Scaled'])

        if module == 'train':
            self.trainlabel = df.Survived
            self.trainset = pd.concat([dummies_Pclass,dummies_Sex,dummies_Embarked,Age_Scaled,Fare_Scaled,SibSp,Parch],axis=1)
        elif module == 'test':
            self.testset = pd.concat([PassengerId,dummies_Pclass,dummies_Sex,dummies_Embarked,Age_Scaled,Fare_Scaled,SibSp,Parch],axis=1)
Esempio n. 45
-1
def normalize_data(tr_x,ts_x,normz=None,axis=0):
    """Normalize a train/test pair of 2-D arrays and return ``(tr_x, ts_x)``.

    Parameters
    ----------
    tr_x, ts_x : 2-D numpy arrays. The 'minmax' and 'sigmoid' modes write into
        the arrays that are passed in.
    normz : None (return the inputs untouched), 'scale', 'minmax' or 'sigmoid'.
        Any other value also leaves the data untouched.
    axis : 0 to normalize each column, 1 to normalize each row.

    Notes
    -----
    * 'scale' and 'minmax' normalize the test data with its own statistics,
      independently of the training data.
    * 'sigmoid' squashes only the columns (rows) whose training maximum exceeds
      1 into (-0.5, 0.5); the same indices are then applied to the test data.
    """
    # Compare by value: ``is`` tests object identity and only worked for
    # strings the interpreter happened to intern.
    if normz == 'scale':
        tr_x = scale(tr_x,axis=axis)
        ts_x = scale(ts_x,axis=axis)
    elif normz == 'minmax':
        minmax_scaler = MinMaxScaler()
        # MinMaxScaler needs 2-D input, so every slice is scaled as a single
        # column and flattened again before it is written back.
        if axis==0:
            for c_i in range(tr_x.shape[1]):
                tr_x[:,c_i] = minmax_scaler.fit_transform(tr_x[:,c_i].reshape(-1, 1)).ravel()
                ts_x[:,c_i] = minmax_scaler.fit_transform(ts_x[:,c_i].reshape(-1, 1)).ravel()
        elif axis==1:
            # Train and test may hold different numbers of rows, so each array
            # is walked over its own row count.
            for r_i in range(tr_x.shape[0]):
                tr_x[r_i,:] = minmax_scaler.fit_transform(tr_x[r_i,:].reshape(-1, 1)).ravel()
            for r_i in range(ts_x.shape[0]):
                ts_x[r_i,:] = minmax_scaler.fit_transform(ts_x[r_i,:].reshape(-1, 1)).ravel()
    elif normz == 'sigmoid':
        if axis==0:
            col_max = np.max(tr_x,axis=0)
            cols_non_norm = np.argwhere(col_max>1).tolist()
            tr_x[:,cols_non_norm] = -0.5 + (1 / (1 + np.exp(-tr_x[:,cols_non_norm])))
            # TODO: implement col_max col_non_norm for test set
            ts_x[:,cols_non_norm] = -0.5 + (1/(1+np.exp(-ts_x[:,cols_non_norm])))
        elif axis==1:
            row_max = np.max(tr_x,axis=1)
            rows_non_norm = np.argwhere(row_max>1).tolist()
            tr_x[rows_non_norm,:] = -0.5 + (1 / (1 + np.exp(-tr_x[rows_non_norm,:])))
            # TODO: implement row_max row_non_norm for test set
            ts_x[rows_non_norm,:] = -0.5 + (1/(1+np.exp(-ts_x[rows_non_norm,:])))

    return tr_x,ts_x
    def get_training_data_by_category(category, limit=0):
        """Build a scaled, sparse one-vs-rest training set for ``category``.

        Samples of ``category`` are labelled 1, samples of every other category 0.
        With ``limit == 0`` all samples are used; otherwise at most 20% of
        ``limit`` positives and 80% of ``limit`` negatives are taken. If fewer
        positives exist than that share, all of them are used and the negative
        cap becomes five times the positive count.

        Returns ``(csr_matrix of min-max scaled samples, list of labels, fitted scaler)``.
        """
        pos_cap = limit*0.2
        neg_cap = limit*0.8
        available_pos = DataDAO.count_training_data_by_category(category)
        if available_pos < pos_cap:
            pos_cap = available_pos
            neg_cap = available_pos*5

        unlimited = limit == 0
        training_data = []
        training_target = []

        def _collect(samples, cap, label):
            # Append samples with the given label until the cap is reached.
            for position, sample in enumerate(samples):
                if not unlimited and position >= cap:
                    break
                training_data.append(sample)
                training_target.append(label)

        _collect(DataDAO.get_training_data_by_category(category), pos_cap, 1)
        _collect(DataDAO.get_training_data_by_other_categories(category), neg_cap, 0)

        scaler = MinMaxScaler()
        # training_data_scaled = scale(training_data,axis=0)
        tr_data_sparse = csr_matrix(scaler.fit_transform(training_data))

        return tr_data_sparse, training_target, scaler
Esempio n. 47
-1
def test_stratified_shuffle_split(clf, dataset, feature_list, folds = 1000, scale_features = True):
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
 
    # Scale features
    if(scale_features):
        scaler = MinMaxScaler()
        features = scaler.fit_transform(features)

    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print 'Total predictions: '+str(total_predictions)
        print 'Accuracy: '+str(accuracy)
        print 'Precision: '+str(precision)
        print 'Recall: '+str(recall)
        print 'F1: '+str(f1)
        print 'F2: '+str(f2)
        print ""
    except:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predicitons."
def cluster(final_data_dict, cluster_range, list_or_dict):
    """Cluster the respondents for every cluster count in ``cluster_range``.

    final_data_dict -- mapping of respondent id to a dict of feature values
    cluster_range   -- iterable of cluster counts to try
    list_or_dict    -- "dict": return ``(converted dictionary, verdict)``;
                       "list": return ``(data rows, feature names)``;
                       any other value: returns None.

    For each cluster count a column "<n>_clusters" holding the predicted
    membership is appended to the rows; the respondent ids are added as an
    "ids" column (placed with the "before" option).
    """
    final_data_list= clustering_module.convert_to_list(final_data_dict)
    # Explicit lists keep this working on Python 3, where map() and
    # dict.values() are lazy/non-indexable (np.array(map(...)) would wrap the
    # iterator object instead of the ids).
    respondent_IDs = np.array([int(key) for key in final_data_dict.keys()])
    feature_names = list(list(final_data_dict.values())[0].keys())
    final_data_list_imputed = clustering_module.preprocess(final_data_list)
    Scaler = MinMaxScaler()
    final_data_list_scaled = Scaler.fit_transform(final_data_list_imputed)
    #Transformed is distance of each respondent from each cluster center
    #Predicted is the cluster membership of each respondent
    merging_list = clustering_module.convert_to_list(final_data_dict,remove_NaN=0 )
    data = list(merging_list)
    ignore_set_added = set(['ids'])
    for num_clusters in cluster_range:
        transformed, predicted, score = clustering_module.clustering(final_data_list_scaled, num_clusters)
        cluster_name = "%s_clusters" % num_clusters
        ignore_set_added.add(cluster_name)
        data, feature_names = clustering_module.add_new_data_to_rows(predicted, data, feature_names, [cluster_name])
    data, feature_names = clustering_module.add_new_data_to_rows(respondent_IDs, data, feature_names, ["ids"], "before")
    if list_or_dict == "dict":
        temp = dictionary_conversion.create_dictionary(data, feature_names)
        num_converted = dictionary_conversion.convert_values_to_int(temp)
        #Set of features that should be different due to being categorical
        ignore_set_changed = set(['busgrn', 'peopgrn', 'sex', 'race', 'topprob1', 'topprob2'])
        verdict = compare_respondent_dicts(respondent_IDs, num_converted, final_data_dict, ignore_set_changed, ignore_set_added)
        return num_converted, verdict
    elif list_or_dict == "list":
        return data, feature_names
Esempio n. 49
-1
def vary_border(pred_true,y,num_iter=101):
    """Grid-search three cut points that turn scores into the classes 1..4.

    ``pred_true`` is copied and min-max scaled; every triple c1 < c2 < c3 taken
    from a grid of ``num_iter`` evenly spaced values in [0, 1], restricted to
    0.25 < c1 < 0.5 and c3 < 0.9, is tried. Values below c1 become 1,
    [c1, c2) become 2, [c2, c3) become 3 and the rest 4. The triple with the
    highest ``quadratic_weighted_kappa(y, classes)`` wins.

    Returns ``(best_pred, best_coef)``; both are None when no triple scores
    above 0 (the original raised UnboundLocalError in that case).
    """
    mms = MinMaxScaler()
    pred = mms.fit_transform(pred_true.copy())
    best_score = 0
    best_pred = None
    best_coef = None
    # float() keeps the grid fractional under Python 2 integer division as well.
    step = float(num_iter - 1)
    for k1 in range(num_iter):
        c1 = k1 / step
        if not (c1 > 0.25 and c1 < 0.5):
            continue
        # The grid is increasing, so c1 < c2 < c3 is the same as k1 < k2 < k3;
        # starting the inner ranges there skips triples that can never qualify
        # while visiting the remaining ones in the original order.
        for k2 in range(k1 + 1, num_iter):
            c2 = k2 / step
            for k3 in range(k2 + 1, num_iter):
                c3 = k3 / step
                if c3 >= 0.9:
                    break  # every later c3 is larger still
                tmp_pred = pred.copy()
                mask1 = tmp_pred < c1
                mask2 = (tmp_pred >= c1) & (tmp_pred < c2)
                mask3 = (tmp_pred >= c2) & (tmp_pred < c3)
                mask4 = tmp_pred >= c3
                tmp_pred[mask1] = 1
                tmp_pred[mask2] = 2
                tmp_pred[mask3] = 3
                tmp_pred[mask4] = 4
                score = quadratic_weighted_kappa(y,tmp_pred)
                if score > best_score:
                    best_score = score
                    best_coef = [c1,c2,c3]
                    best_pred = tmp_pred.copy()
    #print(best_score,best_coef)
    return best_pred, best_coef
 def minmaxscaling(df):
     """Scale every column of ``df`` into [0, 1].

     Returns ``(scaled_values, fitted_scaler)``.
     """
     # Caveat: min-max scaling between 0 and 1 behaves badly with outliers.
     # https://stats.stackexchange.com/a/10298
     unit_scaler = MinMaxScaler(feature_range=(0, 1))
     # The scaler expects samples in rows and features in columns.
     scaled = unit_scaler.fit_transform(df)
     return scaled, unit_scaler
def getips(conf, net, superpixels_num, layer='inner_product_target'):
    """Run ``net`` forward ``superpixels_num`` times and return scaled per-row sums.

    (Python 2 code: uses ``xrange`` and the print statement.)

    Each forward pass stores the axis-1 sum of the ``layer`` blob in one row of
    ``reps``. The first column of ``reps`` is then min-max scaled (and squared
    when more than one negative is configured) and written back across each row.

    Returns the ``reps`` array of shape
    (superpixels_num * negative_numbers, feature_len).
    """
    # NOTE(review): the ``layer`` argument is overwritten by the command-line
    # option right away, so the value passed by the caller is never used.
    (options, args) = parser.parse_args()
    layer = options.layer
    data = net.blobs[layer].data
    #data = net.blobs['InnerProduct1'].data
    feature_len = data.shape[1]
    # Fall back to a single negative when the configuration lookup fails.
    # NOTE(review): the bare except also hides unrelated errors.
    try:
        negative_numbers = conf.model['number_of_negatives']
    except:
        negative_numbers = 1
    reps = np.zeros((superpixels_num*negative_numbers, feature_len))
    # Only the first superpixels_num rows are filled; with negative_numbers > 1
    # the remaining rows stay at their initial zeros here.
    for i in xrange(superpixels_num):
        if i%1000==1:
            print i
        net.forward()
        reps[i] = np.sum(net.blobs[layer].data, axis=1)
    reps_slice = reps[..., 0]
    from sklearn.preprocessing import MinMaxScaler
    clf = MinMaxScaler()
    # NOTE(review): reps_slice is 1-D here; recent scikit-learn versions require
    # a 2-D input for fit_transform -- confirm against the installed version.
    reps_slice = clf.fit_transform(reps_slice)
    if negative_numbers > 1:
        reps_slice = np.square(reps_slice)
    #reps_slice[reps_slice<np.mean(reps_slice)] = 0
    # Write each scaled value back across its whole row.
    for i in xrange(reps_slice.shape[0]):
        reps[i] = reps_slice[i]
        # print net.blobs['inner_product_target'].data[1:10]
    return reps
Esempio n. 52
-1
 def scale(self):
     """Rescale ``self.vecs`` feature by feature and rebind the attribute.

     Dense data is handed to scikit-learn's MinMaxScaler (working in place,
     ``copy=False``). Sparse data is handled manually: every column of the
     matrix is divided by that column's sum, which leaves zero entries at zero.
     The work is done on the transposed matrix in LIL format so that each
     feature is a row, and the result is converted back to CSR.
     """
     # TODO: Maybe look at profiling to ensure that this strategy really
     # is the least expensive one.
     if not self.sparse:
         dense_scaler = MinMaxScaler(copy = False)
         self.vecs = dense_scaler.fit_transform(self.vecs)
         return

     self.vecs = self.vecs.tolil().transpose()
     feature_count = self.vecs.shape[0]
     for feature_idx in range(feature_count):
         feature_row = self.vecs[feature_idx]
         feature_row /= feature_row.sum()
         self.vecs[feature_idx] = feature_row
     self.vecs = self.vecs.transpose().tocsr()
Esempio n. 53
-1
def runAlgorithm(data, categories, function, iterations = 5, num_partitions = 2):
    """Benchmark a feature-selection ``function`` over repeated random partitions.

    ``data`` is min-max scaled once. In every iteration the data is partitioned
    with ``makePartitions``; each part is used in turn for training while the
    remaining parts form the test set. ``function(training_data, training_categ)``
    must return ``(solution, train_rate)``, where ``solution`` selects the
    feature columns used by a 3-nearest-neighbour classifier.

    Returns an array with one row per (iteration, partition) holding:
    training rate in %, test accuracy in %, reduction rate in %, elapsed seconds.
    """
    results_table = np.empty([iterations*num_partitions,4], dtype=float)
    data = MinMaxScaler().fit_transform(data)

    for i in range(iterations):
        # A fresh random partition is drawn for every iteration
        print("Iteration ", i)
        partition  = makePartitions(data, categories, random_ppio)
        part_data, part_categ = partition[0], partition[1]
        for j in range(num_partitions):
            print("Sub iteration ", j)
            start = time.time()

            training_data = part_data[j]
            training_categ = part_categ[j]

            # Everything outside part j becomes the test set.
            held_out = [k for k in range(num_partitions) if k != j]
            test_data = np.array(
                [part_data[k][l] for k in held_out for l in range(len(part_data[k]))],
                float)
            test_categ = np.array(
                [part_categ[k][l] for k in held_out for l in range(len(part_categ[k]))])

            solution, train_rate = function(training_data, training_categ)

            end = time.time()

            nbrs =  neighbors.KNeighborsClassifier(3)
            nbrs.fit(training_data[:,solution],training_categ)
            rate = 100*nbrs.score(test_data[:,solution], test_categ)

            row = i*num_partitions+j
            results_table[row,0] = train_rate/len(training_data)*100
            results_table[row,1] = rate
            results_table[row,2] = (1 - sum(solution)/len(training_data[0]))*100
            results_table[row,3] = end-start

            print("Rate = " + str(rate) + "\nTime = " + str(end-start) + " s")

    return results_table
Esempio n. 54
-1
def analysis_7(df_Coredata):
	"""Multidimensional polynomial model.

	Standardizes the columns 'd', 'e', 'f', 'g', 'i' of ``df_Coredata``, fits
	degree-2 PolynomialFeatures on them and prints the generated feature names.
	Returns nothing.
	"""

	#https://www.jeremyjordan.me/polynomial-regression/

	# NOTE(review): y is assigned but not used in the visible part of the function.
	X = df_Coredata[['d','e','f','g','i']]
	y = df_Coredata['j']

	# Set the plot style
	sns.set(style = 'whitegrid', context = 'notebook')
	# Plot the pairwise relationships between the variables
	#sns.pairplot(df_Coredata)
	#plt.show()


	#X_train, X_test, y_train, y_test  =  train_test_split(X,y,random_state = 0)
	#lr = linear_model.LinearRegression().fit(X_train, y_train)
	#print("Trainng set score: {:.2f}".format(lr.score(X_train, y_train)))
	#print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))

	### Rescaling of the data
	# Standardization
	std_Scaler = StandardScaler()
	data_std = std_Scaler.fit_transform(X)

	# Min-max scaled copy of the inputs.
	# NOTE(review): X_scaled is not used below; only data_std feeds the polynomial step.
	mmx_Scaler =MinMaxScaler()
	X_scaled = mmx_Scaler.fit_transform(X)
	#X_test_scaled = scaler.transform(X_test)

	#print(X_train_scaled)

	# NOTE(review): newer scikit-learn releases replace get_feature_names() with
	# get_feature_names_out() -- confirm against the installed version.
	poly = PolynomialFeatures(degree = 2).fit(data_std)
	print(poly.get_feature_names())
def plot_prediction_relevance(results, EFA=True, classifier='ridge', 
                              rotate='oblimin', change=False, size=4.6, 
                              dpi=300, ext='png', plot_dir=None):
    """Box-plot the relative importance of each factor across all predicted outcomes.

    The absolute importances of every target are min-max scaled per target,
    converted to proportions that sum to 1 per target, and drawn as one box per
    factor. When ``plot_dir`` is given the figure is saved there as
    'prediction_relevance' and closed. ``size`` and ``ext`` are accepted but
    not used by this function.
    """
    prediction_object = results.load_prediction_object(EFA=EFA,
                                                       change=change,
                                                       classifier=classifier,
                                                       rotate=rotate)
    predictions = prediction_object['data']

    targets = list(predictions.keys())
    predictors = predictions[targets[0]]['predvars']
    stacked = np.vstack([predictions[target]['importances'] for target in targets])
    importances = abs(stacked)

    # Scale each target (row) to 0-1; the scaler works column-wise, hence the
    # transposes.
    per_target = MinMaxScaler().fit_transform(importances.T).T
    # Turn every row into proportions.
    row_totals = np.expand_dims(per_target.sum(1), 1)
    proportions = per_target / row_totals

    # Long-format frame: one (Factor, Importance) pair per row.
    scaled_df = pd.DataFrame(proportions, index=targets, columns=predictors)
    melted = scaled_df.melt(var_name='Factor', value_name='Importance')

    plt.figure(figsize=(8,12))
    f = sns.boxplot(y='Factor', x='Importance', data=melted, width=.5)
    if plot_dir is None:
        return
    filename = 'prediction_relevance'
    save_figure(f, path.join(plot_dir, filename),
                {'bbox_inches': 'tight', 'dpi': dpi})
    plt.close()
Esempio n. 56
-1
def uniform_to_normal(df, continuous_features):
    """Map uniformly distributed features of ``df`` to a normal distribution, in place.

    A feature counts as uniform when the Kolmogorov-Smirnov statistic of its
    min-max scaled values against the uniform CDF is below 0.05. Such features
    are rescaled to (0, 1) and passed through ``norm.ppf``. Afterwards infinite
    values are replaced by NaN and all rows containing NaN are dropped from
    ``df``.

    Returns the set of feature names that were treated as uniform.
    """
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(df[continuous_features].dropna())
    df_scaled = pd.DataFrame(scaled_values, columns=continuous_features)
    uniform = set()
    alpha = 0.05

    uniform_cdf = scipy.stats.uniform().cdf
    for feature in continuous_features:
        statistic, pvalue = kstest(df_scaled[feature], uniform_cdf)
        if statistic < alpha:
            uniform.add(feature)

    def _nearly_spans(feature, upper, low_tol, high_tol):
        # True when the observed range almost fills the open interval (0, upper).
        observed_min = df[feature].min()
        observed_max = df[feature].max()
        return (observed_min > 0 and observed_min < low_tol
                and observed_max < upper and observed_max > high_tol)

    zero_to_one = [f for f in uniform if _nearly_spans(f, 1, 0.001, 0.999)]
    zero_to_ten = [f for f in uniform if _nearly_spans(f, 10, 0.01, 9.99)]
    zero_to_hundred = [f for f in uniform if _nearly_spans(f, 100, 0.1, 99.9)]

    for f in uniform:
        # Prefer the presumed true bounds over the observed ones when the
        # feature nearly fills one of the known intervals; df_scaled could be
        # used instead, but the presumed bounds should give better results.
        if f in zero_to_one:
            lower, upper = 0, 1
        elif f in zero_to_ten:
            lower, upper = 0, 10
        elif f in zero_to_hundred:
            lower, upper = 0, 100
        else:
            lower, upper = df[f].min(), df[f].max()
        df[f] = df[f].map(lambda x: norm.ppf((x - lower) / (upper - lower)))

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    return uniform
Esempio n. 57
-1
def sdae_syn(X_s,P,h_layer,activations,noise,epoch,loss,batch_size):
	"""Generate synthetic samples using stacked de-noising autoencoders.

	Parameters
	----------
	X_s: positive class samples (numpy array); input must be within the range 0 to 1
	P: over-sampling percentage; int(X_s.shape[0] * P / 100) samples are generated
	h_layer: hidden layer sizes (list)
	activations: list of activation functions (same length as h_layer)
	noise: [None, Gaussian, mask]
	epoch: epochs for each layer (list with the same size as h_layer)
	loss: 'rmse' or 'cross-entropy'
	batch_size: mini-batch size

	Returns
	-------
	The output of the fitted model's transform() applied to min-max scaled
	standard-normal noise.

	For more details on the input parameters see https://github.com/rajarsheem/libsdae
	"""
	n_samples=int(X_s.shape[0]*P/100)
	# Parenthesised single-argument print behaves the same on Python 2 and 3;
	# the original print statement is a syntax error on Python 3.
	print("generating %d samples" % n_samples)
	# Random starting points, rescaled into [0, 1] per feature.
	X_init=np.random.standard_normal(size=(n_samples,X_s.shape[1]))
	scaler=MinMaxScaler()
	X_init=scaler.fit_transform(X_init)
	model = StackedAutoEncoder(dims=h_layer, activations=activations, noise=noise, 
		epoch=epoch,loss=loss, 
		batch_size=batch_size, lr=0.007, print_step=2000)
	model.fit(X_s)
	syn_Z=model.transform(X_init)
	return syn_Z
Esempio n. 58
-1
def rank_to_dict(ranks, names, order=1, ratio=1):
	"""Min-max scale ``ranks`` and return them as a ``{name: score}`` dict.

	``ranks`` is multiplied by ``order`` (use -1 to invert the ranking) before
	scaling; scores are rounded to two decimals. If all scaled scores are 0,
	every score is raised to 1. ``ratio`` is accepted but not used.
	"""
	as_column = order*np.array([ranks]).T
	scaled = MinMaxScaler().fit_transform(as_column).T[0]
	if np.mean(scaled) == 0:
		scaled += 1
	rounded = [round(score, 2) for score in scaled]
	return dict(zip(names, rounded))
def train_model(feats_csv):
	"""Train a 1-nearest-neighbour classifier on a feature CSV and save it.

	``feats_csv`` is read with pandas; the first column is dropped, the last
	column is the label and the remaining columns are the features. The 15 best
	features (ANOVA F-value) are kept, min-max scaled and used to fit a
	brute-force Manhattan-distance 1-NN model, which is written to
	'../kNN_clfr/kNN.pkl'. Returns nothing.

	NOTE(review): only the classifier is persisted -- the fitted feature
	selector and scaler are not, so prediction code has to rebuild them.
	"""
	df = pd.read_csv(feats_csv).iloc[:,1:]

	y = np.ravel(df.iloc[:,-1:])
	X = np.array(df.iloc[:,:-1])

	############ 15 Best selected features using ANOVA F-value score function ###############
	# Fit the selector once; the original fitted a second, identical selector
	# only to compute support indices that were never used.
	selector = SelectKBest(f_classif, k=15)
	X_new = selector.fit_transform(X, y)

	############ KNN manhattan ###############
	##### preprocessing: data scaling ########
	min_max_scaler = MinMaxScaler()
	X_new = min_max_scaler.fit_transform(X_new)

	model = KNeighborsClassifier(n_neighbors = 1,algorithm = 'brute',metric = 'manhattan',weights = 'uniform')
	model.fit(X_new,y)

	# Create the output directory only when it is missing, so that re-running
	# the training does not fail because the directory already exists.
	newdir = '../kNN_clfr'
	if not os.path.isdir(newdir):
		os.mkdir(newdir)

	joblib.dump(model, os.path.join(newdir,'kNN.pkl'))
Esempio n. 60
-1
def readTrainingData():
    """Load 'data/training.csv' and split it into train / cross-validation / test parts.

    Column 32 is the label (converted to 1 for 's', 0 otherwise), column 31 the
    sample weight and columns 1-30 the features, which are scaled with ``MMS``
    before splitting. With the random generator seeded at 42, rows whose draw is
    <= 0.4 form the training set and rows whose draw is > 0.7 the validation
    pool; rows in between are unused. The pool is then halved by a second
    random draw.

    Returns ``[xTrain, yTrain, wTrain, xCrossValid, yCrossValid, wCrossValid,
    xTestValid, yTestValid, wTestValid]``.
    """
    label_converter = {32: lambda x:int(x=='s'.encode('utf-8')) }
    data = np.loadtxt( 'data/training.csv', delimiter=',', skiprows=1, converters=label_converter)
    allY = data[:, 32]
    allW = data[:, 31]

    scale = MMS()
    allX = scale.fit_transform(data[:, 1:31])

    np.random.seed(42)
    r = np.random.rand(allY.shape[0])
    train_mask = r <= 0.4
    valid_mask = r > 0.7

    xTrain, yTrain, wTrain = allX[train_mask], allY[train_mask], allW[train_mask]
    xValid, yValid, wValid = allX[valid_mask], allY[valid_mask], allW[valid_mask]

    # Second draw: split the validation pool into two halves.
    v = np.random.rand(yValid.shape[0])
    cross_mask = v <= 0.5
    test_mask = v > 0.5

    xCrossValid, yCrossValid, wCrossValid = xValid[cross_mask], yValid[cross_mask], wValid[cross_mask]
    xTestValid, yTestValid, wTestValid = xValid[test_mask], yValid[test_mask], wValid[test_mask]

    return [xTrain, yTrain, wTrain, xCrossValid, yCrossValid, wCrossValid, xTestValid, yTestValid, wTestValid]