Example #1
def readTestData():
    testData  = np.loadtxt('data/test.csv', delimiter=',', skiprows=1)
    xTest     = testData[:,1:31]
    scale = MMS()
    allX = scale.fit_transform(xTest)
    indexTest = list(testData[:,0])
    return [allX, indexTest]
def test_min_max_scaler_iris():
    X = iris.data
    scaler = MinMaxScaler()
    # default params
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 0)
    assert_array_almost_equal(X_trans.max(axis=0), 1)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # not default params: min=1, max=2
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), 1)
    assert_array_almost_equal(X_trans.max(axis=0), 2)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # min=-.5, max=.6
    scaler = MinMaxScaler(feature_range=(-.5, .6))
    X_trans = scaler.fit_transform(X)
    assert_array_almost_equal(X_trans.min(axis=0), -.5)
    assert_array_almost_equal(X_trans.max(axis=0), .6)
    X_trans_inv = scaler.inverse_transform(X_trans)
    assert_array_almost_equal(X, X_trans_inv)

    # raises on invalid range
    scaler = MinMaxScaler(feature_range=(2, 1))
    assert_raises(ValueError, scaler.fit, X)
def sample_from_generator(history, nb_samples, latent_dim=12, 
                          valid_split=0.3, random_split=True,
                          hidden_dims=None, **kwargs):
    scaler = MinMaxScaler()
    scaler.fit(history)
    scaled = scaler.transform(history)
    
    nb_train = history.shape[0]    
    if not valid_split:
        nb_valid = 0
    elif isinstance(valid_split, float):
        nb_valid = nb_train - int(np.floor(nb_train*valid_split))
    else:
        nb_valid = valid_split
        
    if nb_valid > 0:
        if random_split:
            ind = np.arange(nb_train)
            np.random.shuffle(ind)
            x_valid = scaled[ind[-nb_valid:], :]
            x_train = scaled[ind[:-nb_valid], :]
        else:
            x_valid = scaled[-nb_valid:, :]
            x_train = scaled[:-nb_valid, :]
    else:
        x_valid = None
        x_train = scaled
    
    _, generator = build_model(latent_dim, x_train, x_valid=x_valid, 
                               hidden_dims=hidden_dims, **kwargs)
    
    normal_sample = np.random.standard_normal((nb_samples, latent_dim))
    draws = generator.predict(normal_sample)
    return scaler.inverse_transform(draws)
Example #4
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rc('figure', figsize=(8, 7))
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 10
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier as MLP
import pandas_datareader.data as web
from pandas import Series, DataFrame
from sklearn.linear_model import LinearRegression
import datetime, math
from sklearn.neighbors import KNeighborsRegressor as knn
import matplotlib.dates as mdates
clf = LinearRegression()  #n_jobs=-1)

days = 240
sight = 480
scaler = MinMaxScaler(feature_range=(0, 1))
start = datetime.datetime(2010, 1, 1)

end = datetime.datetime.today() + datetime.timedelta(days=days)
dayss = (end - start).days
predicted_list = [end - datetime.timedelta(days=x) for x in range(days)]
predicted_list.reverse()

stock = input("Stock: ").upper()
df = web.DataReader(stock, 'yahoo', start, end)
data = df['Adj Close']
X, y = [], []
fig, ax = plt.subplots()
formatter = mdates.DateFormatter("%Y")
date_list = list(data.reset_index()["Date"])
for i in range(0, len(data) - (sight + 1)):
Example #5
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

data = pd.read_csv('monthly-milk-production-pounds.csv', index_col='Month')
print(data.head())
data.plot()
plt.show()

data.index = pd.to_datetime(data.index)
train_data = data.head(156)
test_data = data.tail(12)
scl = MinMaxScaler()
train_scaled = scl.fit_transform(train_data)
test_scaled = scl.transform(test_data)
print(train_scaled)
print(test_scaled)


def next_batch(training_data, steps):
    random_start = np.random.randint(0, len(training_data) - steps)
    y_data = np.array(training_data[random_start:random_start + steps +
                                    1]).reshape(1, steps + 1)
    return y_data[:, :-1].reshape(-1, steps,
                                  1), y_data[:, 1:].reshape(-1, steps, 1)


num_inputs = 1
num_outputs = 1
Example #6
import matplotlib.pyplot as plt
import pandas as pd

train_date = pd.Timestamp('2015-06-20')

train = naver2.loc[:train_date, ['Close']]
test = naver2.loc[train_date:, ['Close']]

ax = train.plot()
test.plot(ax=ax)
plt.legend(['train', 'val','test'])
plt.show()

from sklearn.preprocessing import MinMaxScaler

sc = MinMaxScaler()

train_sc = sc.fit_transform(train)
test_sc = sc.transform(test)

train_sc.shape
train_sc[:5]  # train_sc is a NumPy array, so preview it by slicing rather than .head()

train_sc_df = pd.DataFrame(train_sc, columns=['Scaled'], index=train.index)
test_sc_df = pd.DataFrame(test_sc, columns=['Scaled'], index=test.index)
train_sc_df.head()

for s in range(1, 13):
    train_sc_df['shift_{}'.format(s)] = train_sc_df['Scaled'].shift(s)
    test_sc_df['shift_{}'.format(s)] = test_sc_df['Scaled'].shift(s)
Example #7
sel_rows = df_merged_volume[lambda r:
                            ((r.timeofday >= 6) & (r.timeofday < 10)) | (
                                (r.timeofday >= 15) & (r.timeofday < 19))]
sel_rows = sel_rows[useful_cols]

#split to train and test set
train_rows = sel_rows[:-24 * 7]
test_rows = sel_rows[-24 * 7:]  #reserve 1 week for test

#get numpy array from panda dataframe
train_arr = train_rows.values
test_arr = test_rows.values

#scale feature array to range -1 to 1
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_arr)
train_scaled_arr = scaler.transform(train_arr)
test_scaled_arr = scaler.transform(test_arr)

#sample subsequence from the time series
train_seqs = []
nSegments = train_arr.shape[
    0] // 12  # each segment holds 4hr data (12 datapoints, 20min each)
for segment in range(nSegments):
    for t in range(6):
        startIdx = segment * 12 + t
        train_seqs.append(train_scaled_arr[startIdx:startIdx + 7])
train_seqs = np.stack(train_seqs)

test_seqs = []
Example #8
data.describe()

# Create dummy variables for categorical feature
data['area code'] = data['area code'].astype(str)
categorical_columns = ['state', 'area code']
df_dummies = pd.get_dummies(data[categorical_columns])
data = pd.merge(data,
                df_dummies,
                how="inner",
                left_index=True,
                right_index=True).drop(columns=categorical_columns)

# Min-Max Scaling
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
column_list_for_scaling = data.columns.tolist()[:17]
data[column_list_for_scaling] = min_max_scaler.fit_transform(
    data[column_list_for_scaling])

X = data.drop(['churn'], axis=1)
y = data.churn

# CLASSIFIER SELECTION PIPELINE
# https://www.kaggle.com/sandipdatta/customer-churn-analysis
# from "Stratified Cross Validation - Since the Response values are not balanced" on

# ensemble.GradientBoostingClassifier
# svm.SVC
# ensemble.RandomForestClassifier
# neighbors.KNeighborsClassifier
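# The classifier short-list above can be compared with stratified cross-validation, as the
# comments suggest. A minimal sketch, assuming X and y are the scaled features and churn
# labels prepared above (the fold count, scoring metric and default hyperparameters are
# illustrative, not taken from the referenced notebook):
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# stratified folds keep the churn/no-churn ratio in every split,
# which matters because the response values are not balanced
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
candidates = {
    'GradientBoosting': GradientBoostingClassifier(),
    'SVC': SVC(),
    'RandomForest': RandomForestClassifier(),
    'KNN': KNeighborsClassifier(),
}
for name, clf in candidates.items():
    scores = cross_val_score(clf, X, y, cv=cv, scoring='roc_auc')
    print('{}: mean ROC AUC = {:.3f}'.format(name, scores.mean()))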
Example #9
class PrepareDataset:
    def __init__(self, path_to_data,
                 dataset_name='full_dataset.csv'):
        self.path_to_data = path_to_data
        self.dataset_name = dataset_name
        # self.pred_template_name = pred_template_name
        self.initial_df = pd.read_csv(os.path.join(self.path_to_data, self.dataset_name))
        # self.pred_template = pd.read_csv(os.path.join(self.path_to_data, self.pred_template_name))
        self.time_periods, self.time_periods_index = np.unique(self.initial_df['time_period'], return_index=True)
        self.transformed_df = None
        self.scaler = None
        self.transformed_scaled_df = None
        self.pca = None

    def collapse_basic_indicators(self, df=None, indicators_range=range(1, 71)):
        if df is None:
            df = copy.copy(self.initial_df)

        print('Begin to collapse basic indicators')
        print('{} indicators were selected'.format(len(indicators_range)))

        variable_list = ["X" + str(i) + '_' for i in indicators_range]

        for var in variable_list:
            df[var + 'avg'] = df.filter(regex=var).mean(axis=1)

        for var in variable_list:
            df[var + 'std'] = df.filter(regex=var).std(axis=1)

        for var in variable_list:
            df[var + 'avg' + '_pctile'] = stat.rankdata(df[var + 'avg']) / df[var + 'avg'].shape[0]

        for var in variable_list:
            df[var + 'std' + '_pctile'] = stat.rankdata(df[var + 'std']) / df[var + 'std'].shape[0]

        model_data_new = pd.concat([df.iloc[:, 0:5],
                                    df.filter(regex='avg'),
                                    df.filter(regex='std'),
                                    ], axis=1)

        model_data_new['is_second_half'] = model_data_new['time_period'].apply(lambda x: 1 if x.endswith('1') else 0)

        model_data_new.fillna(0, inplace=True)

        self.transformed_df = model_data_new
        print('Basic indicators collapsed')

    def drop_outliers(self, quantile = None):
        if quantile is None:
            quantile = 0.001

        print('Start to drop rows with outliers. {} quantile will be removed from each side'.format(quantile/2.))
        idx_to_drop = []
        features_list = list(self.transformed_df.filter(regex='avg$').columns)

        for col in features_list:
            condition1 = self.transformed_df.Train == 1
            condition2 = self.transformed_df[col] > self.transformed_df[col].quantile(1. - quantile/2.)
            condition3 = self.transformed_df[col] < self.transformed_df[col].quantile(quantile/2.)
            to_drop = list(self.transformed_df[(condition1) & (condition2)].index)
            idx_to_drop += to_drop
            to_drop = list(self.transformed_df[(condition1) & (condition3)].index)
            idx_to_drop += to_drop
        self.transformed_df = self.transformed_df.drop(list(set(idx_to_drop)))
        print('Done. {} rows were removed'.format(len(list(set(idx_to_drop)))))

    def add_tech_indicators(self, df=None):
        if df is None:
            df = self.transformed_df if self.transformed_df is not None else self.initial_df

        print('Start to form basic indicators')
        data_agg_avg = df.groupby(['time_period']).mean()
        period_agg_df = pd.DataFrame(index=self.time_periods)
        period_agg_df['Close'] = data_agg_avg['Norm_Ret_F6M']

        # Shift Norm_Ret_F6M to the next period to avoid future looking
        period_agg_df['Close'] = period_agg_df['Close'].shift(1)
        period_agg_df['Close'].fillna(method='bfill', inplace=True)

        period_agg_df = ti.MA(period_agg_df, 2)
        period_agg_df = ti.MA(period_agg_df, 3)

        period_agg_df = ti.EMA(period_agg_df, 2)
        period_agg_df = ti.EMA(period_agg_df, 3)

        period_agg_df = ti.MOM(period_agg_df, 2)
        period_agg_df = ti.MOM(period_agg_df, 3)

        period_agg_df = ti.ROC(period_agg_df, 2)
        period_agg_df = ti.ROC(period_agg_df, 3)

        period_agg_df = ti.MACD(period_agg_df, 2, 3)

        period_agg_df = ti.KST(period_agg_df, 1, 2, 3, 4, 1, 2, 3, 4)

        period_agg_df = ti.TSI(period_agg_df, 2, 2)

        period_agg_df = ti.COPP(period_agg_df, 2)
        period_agg_df = ti.COPP(period_agg_df, 3)

        period_agg_df = ti.STDDEV(period_agg_df, 2)
        period_agg_df = ti.STDDEV(period_agg_df, 3)

        # concatenate technical indicator with the transformed dataset
        df = df.join(period_agg_df, on='time_period')
        df.fillna(method='ffill', inplace=True)
        df.fillna(method='bfill', inplace=True)

        self.transformed_df = df
        print('Done')

    def generate_synthetic_indicators(self, types=None):
        features_list = list(self.transformed_df.filter(regex='avg$').columns)

        if types is None:
            types = ['substract', 'multiply']

        for i1, col1 in enumerate(features_list):
            print('Processing column {}'.format(col1))
            for i2, col2 in enumerate(features_list):
                if 'substract' in types:
                    self.transformed_df['%s_%s_1' % (col1, col2)] = self.transformed_df[col1] - \
                                                                    self.transformed_df[col2]
                if 'add' in types:
                    self.transformed_df['%s_%s_2' % (col1, col2)] = self.transformed_df[col1] + \
                                                                    self.transformed_df[col2]

                if 'divide' in types:
                    self.transformed_df['%s_%s_3' % (col1, col2)] = self.transformed_df[col1] / \
                                                                    (self.transformed_df[col2]+0.01)

                if 'multiply' in types:
                    self.transformed_df['%s_%s_4' % (col1, col2)] = self.transformed_df[col1] * \
                                                                    self.transformed_df[col2]


        print('Done')

    def scale_df(self, scaled_columns=None):
        if scaled_columns is None:
            scaled_columns = [x for x in self.transformed_df.columns if x.endswith('pctile')]
            columns_for_scale = set(self.transformed_df.iloc[:, 5:].columns) - set(scaled_columns)
            self.scaler = MinMaxScaler()
            df_for_scale = copy.copy(self.transformed_df)
            df_for_scale.loc[:, columns_for_scale] = self.scaler.fit_transform(df_for_scale.loc[:, columns_for_scale])
            self.transformed_scaled_df = df_for_scale

    def apply_pca_to_scaled_df(self, n_components=0.99, svd_solver='full'):
        self.pca = PCA(n_components=n_components, svd_solver=svd_solver, random_state=17)
        self.pca.fit(self.transformed_scaled_df.iloc[:, 5:])
        pca_arr = self.pca.transform(self.transformed_scaled_df.iloc[:, 5:])
        columns_pca = ['pca_' + str(x) for x in range(1, pca_arr.shape[1]+1)]
        pca_df = pd.DataFrame(pca_arr, columns=columns_pca)
        self.transformed_scaled_df = pd.concat([self.transformed_scaled_df.iloc[:, :5], pca_df], axis=1)
    idmax = int(0.8 * ndat)
    a = mfrw.fitrw([dat[:idmax, 0]], [dat[:idmax, 1]], [sig[:idmax]],
                   floin=1. / 200,
                   fhiin=2.0,
                   ploton=1,
                   dtresin=-1,
                   nits=1,
                   tplotlims=[-10.0, 120.0, 0.1])
# load the dataset
else:
    dataframe = read_csv(tit_input, usecols=[1], engine='python', skipfooter=3)
    dataset = dataframe.values
    dataset = dataset.astype('float32')

# normalize the dataset
scaler = MinMaxScaler(feature_range=(0.0, 1.0))
dataset = scaler.fit_transform(dataset)

# split into train and test sets
train_size = idtrain_end  #int(len(dataset) * 0.09)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size, :], dataset[train_size:len(dataset), :]

# reshape into X=t and Y=t+1
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)

# reshape input to be [samples, time steps, features]
trainX = numpy.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = numpy.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
Example #11
x = tf.placeholder(tf.float32, shape=[None, 10])
y = tf.placeholder(tf.float32, shape=[None, 1])

# hands-on exercise
dataset = load_diabetes()
x_data = dataset.data
y_data = dataset.target.reshape(-1, 1)
print(x_data.shape)
print(y_data.shape)

x_train, x_test, y_train, y_test = train_test_split(x_data,
                                                    y_data,
                                                    test_size=0.1,
                                                    random_state=42)

scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

w = tf.Variable(tf.random_normal([10, 1]), name='weight')
b = tf.Variable(tf.random_normal([1]), name='bias')

hypothesis = tf.matmul(x, w) + b

cost = tf.reduce_mean(tf.square(hypothesis - y))

train = tf.train.GradientDescentOptimizer(learning_rate=0.3475).minimize(cost)

with tf.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
Example #12
from keras.models import Sequential
from keras.utils import np_utils
from keras.layers.core import Dense, Activation, Dropout
from sklearn.preprocessing import  MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from keras.utils.vis_utils import plot_model
import csv
sc= MinMaxScaler()

import pandas as pd
import numpy as np

# Read data
data = pd.read_csv('./desharnais.csv')
Y = data.iloc[:, 6].values
data = data.drop('YearEnd', axis=1)
data = data.drop('Effort', axis=1)
X = data.iloc[:,2:]
Y = Y.reshape(-1, 1)

X_normalised = sc.fit_transform(X)
Y_normalised = sc.fit_transform(Y)

total_length = len(data)
train_length = int(0.8*total_length)
test_length = int(0.2*total_length)

X_train = X_normalised[:train_length]
X_test = X_normalised[train_length:]
Y_train = Y_normalised[:train_length]
Y_test = Y_normalised[train_length:]
Example #13
def normalize(dataset):
    scaler = MinMaxScaler(feature_range=(0,1))
    scaled_data = scaler.fit_transform(dataset)
    
    return scaled_data
Example #14
    df = web.DataReader('AAPL', data_source='yahoo', start='2012-01-01', end='2020-08-19')  ## pulling data
    plt.figure(figsize=(16,8))
    plt.plot(df['Close'])
    plt.title('Close Price of AAPL')
    plt.xlabel('Date', fontsize=18)
    plt.ylabel('Close Price', fontsize=18)
    # plt.show()

    ## process data
    data = df.filter(['Close'])
    dataset = data.values

    trainDataLength = math.ceil(len(dataset) * 0.8)

    ## scale data
    scaler = MinMaxScaler(feature_range=(0,1))
    scaled_data = scaler.fit_transform(dataset)
    trainData = scaled_data[0:trainDataLength , :]
    
    ## separate into xtrain and ytrain
    xtrain = []
    ytrain = []
    for i in range(60, len(trainData)): # past 60 days
        xtrain.append(trainData[i-60:i,0])
        ytrain.append(trainData[i,0])

    ## convert to numpy arrays
    xtrain, ytrain = np.array(xtrain), np.array(ytrain)

    ## reshape data to 3D for LSTM model
    xtrain = np.reshape(xtrain, (xtrain.shape[0], xtrain.shape[1], 1))
    This CV training loop standardizes X and standardizes Y every iteration for each CV fold.
    """
    fold_id = str(i+1)
    print('fold: ', fold_id)
    cv_train, cv_test = training.iloc[cv_train_idx[i],
                                      :].copy(), training.iloc[cv_test_idx[i], :].copy()

    # below: X standardization
    cv_train_scaler_X = StandardScaler()
    cv_train[cv_train.columns[~cv_train.columns.isin(['subject', 'PCL', 'group'])]] = cv_train_scaler_X.fit_transform(
        cv_train[cv_train.columns[~cv_train.columns.isin(['subject', 'PCL', 'group'])]])
    cv_test[cv_test.columns[~cv_test.columns.isin(['subject', 'PCL', 'group'])]] = cv_train_scaler_X.transform(
        cv_test[cv_test.columns[~cv_test.columns.isin(['subject', 'PCL', 'group'])]])

    # below: Y min-max scaling
    cv_train_scaler_Y = MinMaxScaler(feature_range=(0, 1))
    cv_train[cv_train.columns[cv_train.columns.isin(['PCL'])]] = cv_train_scaler_Y.fit_transform(
        cv_train[cv_train.columns[cv_train.columns.isin(['PCL'])]])
    cv_test[cv_test.columns[cv_test.columns.isin(['PCL'])]] = cv_train_scaler_Y.transform(
        cv_test[cv_test.columns[cv_test.columns.isin(['PCL'])]])  # reuse the scaler fitted on the training fold

    # transform into numpy arrays
    cv_train_X, cv_train_Y = longitudinal_cv_xy_array(input=cv_train, Y_colnames=['PCL'],
                                                      remove_colnames=['subject', 'group'], n_features=n_features)
    cv_test_X, cv_test_Y = longitudinal_cv_xy_array(input=cv_test, Y_colnames=['PCL'],
                                                    remove_colnames=['subject', 'group'], n_features=n_features)

    # train
    cv_m, cv_m_history, cv_pred, cv_m_test_rmse, cv_m_test_rsq = lstm_cv_train(trainX=cv_train_X, trainY=cv_train_Y,
                                                                               testX=cv_test_X, testY=cv_test_Y,
                                                                               lstm_model='stacked',
Example #16
def write_submission(preds, output):
    sample = pd.read_csv('sampleSubmission.csv')
    preds = pd.DataFrame(preds,
                         index=sample.id.values,
                         columns=sample.columns[1:])
    preds.to_csv(output, index_label='id')


def load_test():
    test = pd.read_csv('test.csv')
    test = test.drop('id', axis=1)
    return test.values


X, y = load_train_data()
scaler = MinMaxScaler()  #Tested with and without
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=123)
X_FinalTest = load_test()

model = Sequential()
#Input layer
model.add(
    Dense(93, input_dim=93, kernel_initializer='normal', activation='relu')
)  #Tested with Neurons: 100,93: PRELU activation, RELU Activation
model.add(Dropout(0.13))  #Tested with: 0.0, 0.1, 0.13
model.add(BatchNormalization())  # with and without
employee_df['BusinessTravel'].unique()

X_numerical = employee_df[['Age', 'DailyRate', 'DistanceFromHome', 'Education',
                           'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement',
                           'JobLevel', 'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate',
                           'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating',
                           'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
                           'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
                           'YearsSinceLastPromotion', 'YearsWithCurrManager']]

X_all = pd.concat([X_cat, X_numerical], axis = 1)

"NORMALIZAÇÃO DOS DADOS para não considerar um dado mais importante que o outro"
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X_all)  # predictor attributes

y = employee_df['Attrition']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25) # 25% of the data for testing, the rest for the algorithm to learn from
# X_train holds the data used as the basis for prediction
# y_train holds the target values the algorithm learns to predict
X_train.shape, y_train

#X_test -> predictor attributes
#y_test -> class labels

X_test.shape, y_test
Example #18
# EmployeeNumber is a unique identifier, so drop it
df_train.drop(
    ['Over18', 'StandardHours', 'EmployeeNumber'], axis=1, inplace=True)
df_test.drop(
    ['Over18', 'StandardHours', 'EmployeeNumber'], axis=1, inplace=True)

# target variable
target_var = 'Attrition'

# continuous variables
continuous_var = [
    'Age', 'MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany',
    'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager'
]

scaler = MinMaxScaler()

df_train[continuous_var] = scaler.fit_transform(np.log(df_train[continuous_var]+0.001))

pca_result = pca.fit_transform(df_train[continuous_var])
##print(pca.components_)
#print(pca_result)
label = df_train[target_var].map({0:'red', 1:'green'})

first_pc = pca.components_[0]
second_pc = pca.components_[1]

fig = plt.figure()
for ii, jj in pca_result:
    # ii and jj are the first and second principal-component scores of each sample
    plt.scatter(first_pc[0]*ii, first_pc[1]*ii, c='r')
    plt.scatter(second_pc[0]*jj, second_pc[1]*jj, c='c')
modelName = "naiveLstm"
resultDicmainStr = dataSrc + "_" + modelName + "_resultDicmain_4Time3InOut100ep_200_100"

baseAddress = "lstm_multi_stacked\\"
dataFileBir = 'finalBirminghamDataArrDF.csv'
resultGraphBase = "\\multiLstm\\multiExpResul\\"
# load the new file
dataset = read_csv(dataFileBir,
                   header=0,
                   infer_datetime_format=True,
                   parse_dates=['date'],
                   index_col=0).fillna(0)
values = dataset.iloc[:, :-18].values
print("values.shape: ", values.shape)

scaler = MinMaxScaler(feature_range=(0, 1))
scaledValues = scaler.fit_transform(values)
print(scaledValues.shape)
# div factor is week
n_train = 1187 + 1
#n_train+8064
trainFrom, trainTo, testFrom, testTo, divFactor = 0, n_train, n_train, len(
    values), 1
train, test = split_dataset2(values, trainFrom, trainTo, testFrom, testTo,
                             divFactor)
print("train.shape: ", train.shape)
print("test.shape: ", test.shape)

#def  RunGetResult(dataSrc,expNum,inN,outN,train,test):

for expNum in expNumRange:
Example #20
nt = int(T / dt)

# data = y1 and y2; target = slope = (y^(n+1)-y^n)/dt
# y1(t) = cos(wt), y2(t) = -w*sin(wt)
data = [[np.cos(omega * dt * i), -omega * np.sin(omega * dt * i)]
        for i in range(nt)]
data = np.array(data, dtype=np.float64)

target = [[(data[i, 0] - data[i - 1, 0]) / dt,
           (data[i, 1] - data[i - 1, 1]) / dt] for i in range(1, nt)]
target = np.array(target, dtype=np.float64)

# normalization
scaler = MinMaxScaler(feature_range=(0, 1))
#data = scaler.fit_transform(data)
#target = scaler.fit_transform(target)

# data for training

# data_train = data[1:100].reshape(99,1,2)
# target_train = target.reshape(99,1,2)
data_train = data[1:100].reshape(99, 1, 2)
target_train = target  #.reshape(1,99,2)

model = Sequential()
model.add(LSTM(4, input_shape=(1, 2), activation='tanh'))
model.add(Dense(2))

# compile the model
Example #21
 def __init__(self, limits):
     self._fit_inner = False
     self.limits = np.array(limits)
     self.cdf = norm(0, 1).cdf
     self.icdf = norm(0, 1).ppf
     self.scaler = MinMaxScaler()
Example #22
data.loc[data["Embarked"] == "S", "Embarked"] = 0
data.loc[data["Embarked"] == "C", "Embarked"] = 1
data.loc[data["Embarked"] == "Q", "Embarked"] = 2
data["Embarked"] = data["Embarked"].fillna(3)
# data.loc[data["Embarked"]==None,"Embarked"]=3
print(data["Embarked"].describe())
print(data["Embarked"].unique())
print(data["Embarked"].value_counts())

print("--------------追加特征---------------")
print(data["Ticket"].describe())
print(data["Ticket"].unique())
print(data["Ticket"].value_counts())

print("--------------Fare 归一化---------------")
data_scaler = MinMaxScaler(feature_range=(0, 1))
data_Fare = np.array(data["Fare"].values)
lenInt = len(data_Fare)
arr = []
for i in range(0, lenInt):
    temp = []
    temp.append(data_Fare[i])
    arr.append(temp)
data_rescaledX = data_scaler.fit_transform(arr)
data["Fare_scaler"] = data_rescaledX

print("--------------Name 特征处理---------------")
data["NameLength"] = data["Name"].apply(lambda x: len(x))


def getTitle(name):
Example #23
    def fit_modele(self, config, final=False):
        """
            Performs a quick fit (100 iterations by default) of an LSTM using the
            parameters contained in config (h, n, f, d)
        """
        resultat = None
        key = str(config)

        iter = 500 if final else 100  # final training run for the selected model

        nbre_couches = config.get("nbre_couches")
        taille = config.get("taille_entree")
        nbre_neurones = config.get("nbre_neurones")
        activation = config.get("activation")
        dropout = config.get("dropout")

        nbre_retards = np.count_nonzero(
            np.isnan(self.serie.data['Série stationnarisée']))

        # MinMaxScaler
        donnees_brutes = self.serie.data['Série stationnarisée'][0:self.serie.index_fin_entrainement].dropna(
        ).values
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaler = scaler.fit(np.array(donnees_brutes).reshape(-1, 1))
        serie_reduite = scaler.transform(np.array(
            self.serie.data['Série stationnarisée'].dropna().values).reshape(-1, 1))
        a = np.empty((1, nbre_retards))
        a[:] = np.nan
        serie_reduite = np.concatenate((a, np.array(serie_reduite)), axis=0)

        self.serie.data['Série stationnarisée réduite'] = serie_reduite

        X_train, y_train = decouper_serie_apprentissage_supervise(
            self.serie.data['Série stationnarisée réduite'][0:self.serie.index_fin_entrainement].dropna().values, taille)

        X_test, y_test = decouper_serie_apprentissage_supervise(
            self.serie.data['Série stationnarisée réduite'][self.serie.index_fin_entrainement:self.serie.index_fin_test].dropna().values, taille)

        n_features = 1  # a single explanatory variable
        X_train = X_train.reshape(
            (X_train.shape[0], X_train.shape[1], n_features))
        X_test = X_test.reshape(
            (X_test.shape[0], X_test.shape[1], n_features))

        # Build the neural network
        model = Sequential()

        # input layers
        for i in range(0, nbre_couches):
            model.add(kLSTM(nbre_neurones, activation=activation, return_sequences=True,
                            input_shape=(taille, n_features)))
            model.add(Dropout(dropout))  # add dropout

        # last LSTM layer (no return_sequences)
        model.add(kLSTM(nbre_neurones, activation=activation))
        model.add(Dropout(dropout))  # add dropout

        # output layer (1 unit)
        model.add(Dense(1))

        methode_optimisation = optimizers.Nadam()

        model.compile(optimizer=methode_optimisation,
                      loss='mse')

        # Early-stopping criterion: no improvement on the test set
        # for more than 20 epochs
        critere_stop = EarlyStopping(
            monitor='val_loss', min_delta=0, patience=20)

        # Fit the model
        historique = model.fit(
            X_train, y_train, validation_data=(X_test, y_test), epochs=iter, verbose=final, callbacks=[critere_stop], shuffle=False)

        if final:  # save the training history if this is the final model
            self.historique = historique

        # Prediction on the test + validation sets
        serie_predite = []
        serie_predite_temp = []  # stores the scaled predictions
        serie_predite_dynamique = []

        # walk-forward validation (one step ahead)
        for i in range(0, len(self.serie.data['Test'].dropna())+len(self.serie.data['Validation'].dropna())):

            x_input = self.serie.data['Série stationnarisée réduite'][self.serie.index_fin_entrainement -
                                                                      taille+i:self.serie.index_fin_entrainement+i].values
            x_input = x_input.reshape((1, taille, n_features))
            yhat = model.predict(x_input, verbose=0)[0][0]
            serie_predite_temp.append(yhat)

            # invert the scaling
            padding = np.zeros(taille-1).reshape(1, taille - 1)
            yhat = np.append(padding, [yhat]).reshape(1, -1)
            yhat = scaler.inverse_transform(yhat)
            yhat = yhat[0][-1]

            # undo the differencing (restore the original level)
            yhat = yhat + \
                self.serie.data['Série'][self.serie.index_fin_entrainement+i-nbre_retards]
            serie_predite.append(yhat)

        # dynamic forecast
        if final:
            anciennes_predictions = []
            for i in range(0, len(self.serie.data['Test'].dropna())+len(self.serie.data['Validation'].dropna())):
                decoupe = -taille + i

                if decoupe < 0:
                    x_input_dynamique = np.append(
                    self.serie.data['Série stationnarisée réduite'][decoupe:].values, anciennes_predictions)
                
                else:
                    x_input_dynamique = np.array(anciennes_predictions)[-taille:]

                x_input_dynamique = x_input_dynamique.reshape(
                    (1, taille, n_features))

                yhat_dynamique = model.predict(
                    x_input_dynamique, verbose=0)[0][0]

                anciennes_predictions.append(yhat_dynamique)

                # invert the scaling
                padding = np.zeros(taille-1).reshape(1, taille - 1)
                yhat_dynamique = np.append(
                    padding, [yhat_dynamique]).reshape(1, -1)
                yhat_dynamique = scaler.inverse_transform(yhat_dynamique)
                yhat_dynamique = yhat_dynamique[0][-1]

                # undo the differencing (restore the original level)
                yhat_dynamique = yhat_dynamique + \
                    self.serie.data['Série'][self.serie.index_fin_entrainement+i-nbre_retards]
                serie_predite_dynamique.append(yhat_dynamique)

        # pad with NaNs
        a = np.empty((1, len(self.serie.data['Entraînement'].dropna())))
        a[:] = np.nan
        serie_predite = np.concatenate((a[0], np.array(serie_predite)), axis=0)

        # compute the MSE on the test set only
        resultat = mean_squared_error(
            serie_predite[self.serie.index_fin_entrainement:self.serie.index_fin_test], self.serie.data['Test'].dropna())

        if final:
            self.serie.data[self.__class__.__name__] = serie_predite

            # pad with NaNs
            a = np.empty((1, len(self.serie.data['Entraînement'].dropna())))
            a[:] = np.nan
            serie_predite_dynamique = np.concatenate(
                (a[0], np.array(serie_predite_dynamique)), axis=0)

            self.serie.data[self.__class__.__name__ +
                            "_dynamique"] = serie_predite_dynamique
            self.modele = model

        print("Fit du modèle " + key + " : " + str(resultat))

        return (key, resultat)
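# A minimal usage sketch for fit_modele above, assuming `modele` is an instance of the
# surrounding class whose `serie` attribute has already been prepared; the config values
# below are illustrative only.
config = {
    "nbre_couches": 1,     # number of stacked LSTM layers
    "taille_entree": 12,   # size of the input window (number of lags)
    "nbre_neurones": 50,   # units per LSTM layer
    "activation": "tanh",
    "dropout": 0.2,
}
key, mse_test = modele.fit_modele(config, final=False)  # quick fit (100 epochs)
print(key, mse_test)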
Example #24

# Separate the variables for training

# In[279]:


training_vars = [var for var in X_train.columns if var not in ['PassengerId', 'Survived']]
training_vars


# In[280]:


# fit scaler
scaler = MinMaxScaler() # create an instance
scaler.fit(X_train[training_vars])  # fit the scaler to the train set; the transform is applied later


# ### Phase #4 Modeling & #5 Evaluation

# ##### Machine Learning algorithm building

# #### xgboost

# In[281]:


xgb_model = xgb.XGBClassifier()

eval_set = [(X_test[training_vars], y_test)]
#=============================================================================#
############################# import functions ################################
#=============================================================================#
import numpy as np
import os
import pandas as pd
from scipy import signal
from scipy.signal import resample
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
from Utils.utils_aug_ft import augment_train_set_ft, segment_signal
from Utils.FeatureExtraction import ExtraFeatures
#=============================================================================#
########################## read data: simulation ##############################
#=============================================================================#
## 20 mm wheel flat simulation data
FolderPath = os.path.abspath(
    os.path.join(os.getcwd(), "./Data/Augmentation/input_data/"))
sim_normal = pd.read_csv(FolderPath + '/sim_norm_20mm.txt',
                         engine='python')  # can also be 30mm or 50mm WF
sim_WF = pd.read_csv(FolderPath + '/sim_WF_20mm.txt', engine='python')

# time shifting for more data samples
FolderPath1 = ['v70']  # choose the speed from dataframe
FolderPath2 = ['normal', 'WF']
FolderPath3 = ['Good', 'Bad']
for i in range(len(FolderPath1)):
    for j in range(len(FolderPath2)):
        a_varname = 'sim_' + FolderPath2[j] + '_' + FolderPath1[i]
        df_varname = 'sim_' + FolderPath2[j]
        globals()[a_varname] = segment_signal(globals()[df_varname], nosym=True)


c = FixAtoms(indices=[atom.index for atom in atoms if atom.position[2]<8])
atoms.set_constraint(c)

identified_images = Trajectory('identified_images.traj','a', properties = ['energy','forces'])
traj_md = Trajectory('md.traj','a', properties=['energy','forces'])

path = ['/work/common/hxin_lab/jiamin/non_adiabatic/Langevin/Training_3nd/sym70_2L70/newriver/amp-checkpoint.amp',
        '/work/common/hxin_lab/jiamin/non_adiabatic/Langevin/Training_3nd/sym70_2L65/newriver/amp-checkpoint.amp',
        '/work/common/hxin_lab/jiamin/non_adiabatic/Langevin/Training_3nd/sym70_2L60/newriver/amp-checkpoint.amp',
        ]

fingerprints = np.loadtxt('/work/common/hxin_lab/jiamin/non_adiabatic/Langevin/Training_3nd/trajs/fingerprints/total_fingerp.txt')
scaler_fp = MinMaxScaler(feature_range=(-1,1), copy=True)
scaler_fp.fit(fingerprints)
scaled_fp = scaler_fp.transform(fingerprints)

atoms_chg = io.read('/work/common/hxin_lab/jiamin/non_adiabatic/Langevin/Training_3nd/trajs/fingerprints/identified.traj',index=':')
chg = np.zeros(len(atoms_chg)*2)
i = 0
for atom in atoms_chg:
    for index in range(12,14):
        chg[i]=atom.get_charges()[index]
        i += 1
scaler_chg = MinMaxScaler(feature_range=(-1,1), copy=True)
scaler_chg.fit(chg.reshape(-1,1))
scaled_chg = scaler_chg.transform(chg.reshape(-1,1))

X_train, X_test, Y_train, Y_test = train_test_split(scaled_fp[:], scaled_chg[:], test_size = 0.2, random_state = None)
Example #27
def create_segments(df, cluster_num=5, do_minmax=False, do_pca=True, generate_report=True):
    '''
    Creates cluster segments and generates report
    Input:
        df: the final dataset that is ready for clustering (e.g. starbucks_imputed)
        cluster_num: number of clusters to be created
        do_minmax (bool): flag condition to use MinMax Scaler if True, otherwise use Standard Scaler
        do_pca (bool): flag condition to perform PCA if True
        generate_report (bool): whether to generate report 
                                (i.e. plot segments interpreter, customer segments by size, metrics)
    '''
    #one-hot encoding
    starbucks_ohe = df.copy()
    starbucks_ohe.drop('person', axis=1, inplace=True)
    categorical_col = starbucks_ohe.columns[(starbucks_ohe.dtypes == 'category') | (starbucks_ohe.dtypes == 'object')]
    starbucks_ohe = pd.get_dummies(starbucks_ohe, columns=categorical_col)
    
    # feature scaling
    if do_minmax == False:
        scaler = StandardScaler().fit(starbucks_ohe)
    else:
        scaler = MinMaxScaler().fit(starbucks_ohe)
    starbucks_scaled = scaler.transform(starbucks_ohe) 

    # PCA
    if do_pca == True:
        pca = PCA()
        X_pca = pca.fit_transform(starbucks_scaled)
        cum_expl_var_ratio = np.cumsum(pca.explained_variance_ratio_)

        #choose number of components that explain ~80% of variance
        components_num = len(cum_expl_var_ratio[cum_expl_var_ratio <= 0.805])
        print(f"number of pca components that explain 80%: {components_num}")
        pca = PCA(components_num).fit(starbucks_scaled)
        starbucks_pca = pca.transform(starbucks_scaled)
    
        # clustering
        clusterer = KMeans(n_clusters=cluster_num, n_init=10, init='k-means++').fit(starbucks_pca)
        starbucks_preds = clusterer.predict(starbucks_pca)
        print(f"silhouette_score for {cluster_num} clusters: {metrics.silhouette_score(starbucks_pca, clusterer.labels_, metric='euclidean'):.3f}")
        print(82 * '_')
        plot_elbow_curve(starbucks_pca)
        
    else:
        pca = None
        starbucks_pca = None
        # clustering
        clusterer = KMeans(n_clusters=cluster_num, n_init=10, init='k-means++').fit(starbucks_scaled) 
        starbucks_preds = clusterer.predict(starbucks_scaled)
        print("silhouette_score:", metrics.silhouette_score(starbucks_scaled, clusterer.labels_, metric='euclidean'))
        print(82 * '_')
        plot_elbow_curve(starbucks_scaled)
    
    # assign customer segments to data
    starbucks_predicted = df.copy()
    starbucks_predicted['segments'] = starbucks_preds
     
    #generate report
    if generate_report == True:
        cluster_df = create_cluster_df(pca, clusterer, scaler, cluster_num, starbucks_ohe, do_minmax, do_pca)
        plot_segments_interpreter(cluster_df)
        plot_customer_segments(starbucks_predicted, cluster_num)
        plot_metrics(starbucks_predicted)
        
    #return starbucks_ohe, scaler, starbucks_scaled, pca, starbucks_pca, clusterer, starbucks_preds
    return starbucks_predicted
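# A minimal usage sketch for create_segments above, assuming `starbucks_imputed` is the
# cleaned dataframe mentioned in the docstring (the file name and cluster count are
# illustrative, not from the original project).
starbucks_imputed = pd.read_csv('starbucks_imputed.csv')
segmented = create_segments(starbucks_imputed, cluster_num=5, do_minmax=True, generate_report=False)
print(segmented['segments'].value_counts())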
        '/Users/yaofeifan/Documents/Tsinghua/Lesson/模式识别/Project2/data/北京空气质量/2019/All.csv',
        header=0,
        index_col=0)
    dataset2020 = read_csv(
        '/Users/yaofeifan/Documents/Tsinghua/Lesson/模式识别/Project2/data/北京空气质量/2020/All.csv',
        header=0,
        index_col=0)
    dataset = pd.concat([dataset2019], axis=0)

    dataset = dataset.drop(columns=["hour", "AQI", "PM10", 'grade'])
    order = ['PM2.5', 'CO', 'NO2', 'O3', 'SO2']
    dataset = dataset[order]
    values = dataset.values
    values = values.astype('float32')
    # normalize features
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled = scaler.fit_transform(values)
    reframed = series_to_supervised(scaled, 1, 1)
    reframed.drop(reframed.columns[[6, 7, 8, 9]], axis=1, inplace=True)
    thetrain = reframed
    print(thetrain.head())

    # load test dataset
    dataset2 = read_csv(
        '/Users/yaofeifan/Documents/Tsinghua/Lesson/模式识别/Project2/data/北京空气质量/2014/All.csv',
        header=0,
        index_col=0)
    dataset2 = dataset2.drop(columns=["hour", "AQI", "PM10", 'grade'])
    order = ['PM2.5', 'CO', 'NO2', 'O3', 'SO2']
    dataset2 = dataset2[order]
    values2 = dataset2.values
# --------------
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Code starts here
df=pd.read_csv(path)
#print(df.head())
print(df.attr1089.value_counts())
X=df.iloc[:,0:len(df.columns)-1]
y=df.iloc[:,-1]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=4)
scaler=MinMaxScaler()
scaler.fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)
# Code ends here


# --------------
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
lr=LogisticRegression()
lr.fit(X_train,y_train)
y_pred=lr.predict(X_test)
roc_score=roc_auc_score(y_test,y_pred)
print(roc_score)


# --------------
Example #30
# Recurrent Neural Network

# Part 1 - Data Preprocessing

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the training set
dataset_train = pd.read_csv('trainingset.csv')
training_set = dataset_train.iloc[:, 1:2].values

# Feature Scaling
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range=(0, 1))
training_set_scaled = sc.fit_transform(training_set)

# Creating a data structure with 60 timesteps and 1 output
X_train = []
y_train = []
for i in range(60, 639):
    X_train.append(training_set_scaled[i - 60:i, 0])
    y_train.append(training_set_scaled[i, 0])
X_train, y_train = np.array(X_train), np.array(y_train)

# Reshaping
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

# Part 2 - Building the RNN
Example #31
		dataX.append(a)
		dataY.append(dataset[i + look_back, 0])
	return np.array(dataX), np.array(dataY)

# fix random seed for reproducibility
np.random.seed(5)

# load the dataset
df = read_csv(input_file, header=None, index_col=None, delimiter=',')

# take close price column[5]
all_y = df[5].values
dataset=all_y.reshape(-1, 1)

# normalize the dataset
scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)

# split into train and test sets, 50% test data, 50% training data
train_size = int(len(dataset) * 0.5)
test_size = len(dataset) - train_size
train, test = dataset[0:train_size,:], dataset[train_size:len(dataset),:]

# reshape into X=t and Y=t+1, timestep 240
look_back = 240
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)

# reshape input to be [samples, time steps, features]
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))
Example #32
def preprocess_prediciton(iq):
    Actives = ['EURUSD','GBPUSD','EURJPY','AUDUSD']
    active = 'EURUSD'
    main = pd.DataFrame()
    current = pd.DataFrame()
    for active in Actives:
        if active == 'EURUSD':
            main = fast_data(iq,active).drop(columns = {'from','to'})
        else:
            current = fast_data(iq,active)
            current = current.drop(columns = {'from','to','open','min','max'})
            current.columns = [f'close_{active}',f'volume_{active}']
            main = main.join(current)
    
    df = main
    
    """
    graphical analysis components
    """
    
    df.isnull().sum().sum() # there are no nans
    df.fillna(method="ffill", inplace=True)
    df = df.loc[~df.index.duplicated(keep = 'first')]
    
    df['MA_20'] = df['close'].rolling(window = 20).mean()
    df['MA_50'] = df['close'].rolling(window = 50).mean()
    
    
    df['L14'] = df['min'].rolling(window=14).min()
    df['H14'] = df['max'].rolling(window=14).max()
    df['%K'] = 100*((df['close'] - df['L14']) / (df['H14'] - df['L14']) )
    df['%D'] = df['%K'].rolling(window=3).mean()
    
    df['EMA_20'] = df['close'].ewm(span = 20, adjust = False).mean()
    df['EMA_50'] = df['close'].ewm(span = 50, adjust = False).mean()
    
    rsi_period = 14 
    chg = df['close'].diff(1)
    gain = chg.mask(chg<0,0)
    df['gain'] = gain
    loss = chg.mask(chg>0,0)
    df['loss'] = loss
    avg_gain = gain.ewm(com = rsi_period - 1, min_periods = rsi_period).mean()
    avg_loss = loss.ewm(com = rsi_period - 1, min_periods = rsi_period).mean()
    
    df['avg_gain'] = avg_gain
    df['avg_loss'] = avg_loss
    rs = abs(avg_gain/avg_loss)
    df['rsi'] = 100-(100/(1+rs))
    
    """
    Finishing preprocessing
    """
    df = df.drop(columns = {'open','min','max','avg_gain','avg_loss','L14','H14','gain','loss'})
    
    df = df.dropna()
    df = df.fillna(method="ffill")
    df = df.dropna()
    
    df.sort_index(inplace = True)
    
    scaler = MinMaxScaler()
    indexes = df.index
    df_scaled = scaler.fit_transform(df)
    
    pred = pd.DataFrame(df_scaled,index = indexes)

    sequential_data = []
    prev_days = deque(maxlen = SEQ_LEN)            
    
    for i in pred.iloc[len(pred) -SEQ_LEN :len(pred)   , :].values:
        prev_days.append([n for n in i[:]])
        if len(prev_days) == SEQ_LEN:
            sequential_data.append([np.array(prev_days)])

    X = []

    for seq in sequential_data:
        X.append(seq)
    
    
    return np.array(X)
del train['ID']
#%%
# Create independent and dependent variable
y = train.iloc[:, -1]
x = train.iloc[:, 0:-1]
#%%
import matplotlib.pyplot as plt
import collections
cy = collections.Counter(y)
plt.bar(cy.keys(), cy.values())
plt.show()
#%%
y = pd.get_dummies(y).values
#%%
# Preprocessing: Minmax scaling
x_scaling = MinMaxScaler().fit_transform(x)
# By scaling operation score increase from 0.49045 to score=0.52721
#%%
x_train, x_test, y_train, y_test = train_test_split(x_scaling,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=10)
#%%
model = XGBClassifier()
model.fit(x_train, y_train)
# score=0.52721
#%%
prediction = model.predict(x_test)
#%%
score = f1_score(y_test, prediction, average='weighted')
#score=0.52721
Example #34
class NumericColumn(BaseEstimator, TransformerMixin):
    '''
    Take a numeric value column and standardize it.
    '''

    def __init__(self):
        '''
        Set up the internal transformation.
        '''
        self._transformer = MinMaxScaler()

    def fit(self, X, y=None):
        '''
        Fit the standardization.
        '''
        zeroed = pd.DataFrame(np.array(X).reshape(-1, 1)).fillna(0)
        self._transformer.fit(zeroed)
        return self

    def transform(self, X):
        '''
        Transform a column of data into numerical percentage values.

        Parameters
        ----------
        X : pandas series or numpy array
        '''
        zeroed = pd.DataFrame(np.array(X).reshape(-1, 1)).fillna(0)
        return self._transformer.transform(zeroed).astype(np.float32)
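# A minimal usage sketch for NumericColumn above (the sample values are illustrative).
import pandas as pd

ages = pd.Series([22.0, None, 35.0, 58.0])
col = NumericColumn().fit(ages)
print(col.transform(ages))  # missing values become 0, then everything is scaled to [0, 1]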
Example #35
 def predict_new(self, input):
     model = self.train_model()
     assert len(input) == 5 and type(input) == list
     scaler = MinMaxScaler(feature_range=(0, 1))
     scaler.fit(self.data)
     inp = scaler.transform([input])
     print(scaler.inverse_transform(model.predict(numpy.array(inp).reshape(1, 1, 5))))
Example #36
    def loaddataset(self,path,module):                
       df=pd.read_csv(path)
       subdf = df[['PassengerId','Pclass','Sex','Age','Embarked','Fare','SibSp','Parch']]
       SibSp=subdf['SibSp']
       Parch=subdf['Parch']
#      supplement Age
       Age=subdf['Age'].fillna(value=subdf.Age.mean())
             
       Fare=subdf['Fare'].fillna(value=subdf.Fare.mean())
       
       dummies_Sex=pd.get_dummies(subdf['Sex'],prefix='Sex')
       
       dummies_Embarked = pd.get_dummies(subdf['Embarked'], prefix= 'Embarked')     
       
       dummies_Pclass = pd.get_dummies(subdf['Pclass'], prefix= 'Pclass')
       
       PassengerId=subdf['PassengerId']
       
#      Age&Fare to Scaler
       scaler=MinMaxScaler()
       age_scaled=scaler.fit_transform(Age.values.reshape(-1, 1))    # MinMaxScaler expects a 2-D array
       fare_scaled=scaler.fit_transform(Fare.values.reshape(-1, 1))
       
       Age_Scaled=pd.DataFrame(age_scaled,columns=['Age_Scaled'])
       Fare_Scaled=pd.DataFrame(fare_scaled,columns=['Fare_Scaled'])
       
       if module=='train':
          self.trainlabel=df.Survived
          self.trainset=pd.concat([dummies_Pclass,dummies_Sex,dummies_Embarked,Age_Scaled,Fare_Scaled,SibSp,Parch],axis=1)
       elif module=='test':
          self.testset=pd.concat([PassengerId,dummies_Pclass,dummies_Sex,dummies_Embarked,Age_Scaled,Fare_Scaled,SibSp,Parch],axis=1)
def cluster(final_data_dict, cluster_range, list_or_dict):
    final_data_list= clustering_module.convert_to_list(final_data_dict) 
    respondent_IDs = np.array(list(map(int, final_data_dict.keys())))
    feature_names = list(list(final_data_dict.values())[0].keys())
    final_data_list_imputed = clustering_module.preprocess(final_data_list)
    Scaler = MinMaxScaler()    
    final_data_list_scaled = Scaler.fit_transform(final_data_list_imputed)
    #Transformed is distance of each respondent from each cluster center
    #Predicted is the cluster membership of each respondent
    merging_list = clustering_module.convert_to_list(final_data_dict,remove_NaN=0 )
    data = list(merging_list)
    ignore_set_added = set(['ids'])
    for num_clusters in cluster_range:    
        transformed, predicted, score = clustering_module.clustering(final_data_list_scaled, num_clusters)
        cluster_name = "%s_clusters" % num_clusters
        ignore_set_added.add(cluster_name)    
        data, feature_names = clustering_module.add_new_data_to_rows(predicted, data, feature_names, [cluster_name])
    data, feature_names = clustering_module.add_new_data_to_rows(respondent_IDs, data, feature_names, ["ids"], "before")
    if list_or_dict == "dict":        
        temp = dictionary_conversion.create_dictionary(data, feature_names)    
        num_converted = dictionary_conversion.convert_values_to_int(temp)    
        #Set of features that should be different due to being categorical
        ignore_set_changed = set(['busgrn', 'peopgrn', 'sex', 'race', 'topprob1', 'topprob2'])    
        verdict = compare_respondent_dicts(respondent_IDs, num_converted, final_data_dict, ignore_set_changed, ignore_set_added)
        return num_converted, verdict
    elif list_or_dict == "list":
        return data, feature_names
Example #38
def rank_to_dict(ranks, names, order=1, ratio=1):
	minmax = MinMaxScaler()
	ranks = minmax.fit_transform(order*np.array([ranks]).T).T[0]
	if np.mean(ranks) == 0:
		ranks+=1
	ranks = map(lambda x: round(x, 2), ranks)
	return dict(zip(names, ranks ))
def vary_border(pred_true,y,num_iter=101):
    mms = MinMaxScaler()
    pred=pred_true.copy()
    pred=mms.fit_transform(pred)
    best_score = 0
    for k1 in range(num_iter):
        c1 = k1/(num_iter-1)
        for k2 in range(num_iter):
            c2 = k2/(num_iter-1)
            for k3 in range(num_iter):
                c3 = k3/(num_iter-1)
                if c1 < c2 and c1 < c3 and c2 < c3 and c1 > 0.25 and c1 < 0.5 and c3 < 0.9:
                    tmp_pred = pred.copy()
                    mask1 = tmp_pred < c1
                    mask2 = (tmp_pred >=c1) * (tmp_pred < c2)
                    mask3 = (tmp_pred >=c2) * (tmp_pred < c3)
                    mask4 = tmp_pred >=c3
                    tmp_pred[mask1] = 1
                    tmp_pred[mask2] = 2
                    tmp_pred[mask3] = 3
                    tmp_pred[mask4] = 4
                    score = quadratic_weighted_kappa(y,tmp_pred)
                    if score > best_score:
                        best_score = score
                        best_coef = [c1,c2,c3]
                        best_pred = tmp_pred.copy()
    #print(best_score,best_coef)
    return best_pred, best_coef
def Iris(training_size, test_size, n, PLOT_DATA):
    class_labels = [r'A', r'B', r'C']
    data, target = datasets.load_iris(True)
    sample_train, sample_test, label_train, label_test = train_test_split(data, target, test_size=1, random_state=42)

    # Now we standarize for gaussian around 0 with unit variance
    std_scale = StandardScaler().fit(sample_train)
    sample_train = std_scale.transform(sample_train)
    sample_test = std_scale.transform(sample_test)

    # Scale to the range (-1,+1)
    samples = np.append(sample_train, sample_test, axis=0)
    minmax_scale = MinMaxScaler((-1, 1)).fit(samples)
    sample_train = minmax_scale.transform(sample_train)
    sample_test = minmax_scale.transform(sample_test)

    # Pick training size number of samples from each distro
    training_input = {key: (sample_train[label_train == k, :])[:training_size] for k, key in enumerate(class_labels)}
    test_input = {key: (sample_train[label_train == k, :])[training_size:(
        training_size+test_size)] for k, key in enumerate(class_labels)}

    if PLOT_DATA:
        for k in range(0, 3):
            plt.scatter(sample_train[label_train == k, 0][:training_size],
                        sample_train[label_train == k, 1][:training_size])

        plt.title("Iris dataset")
        plt.show()

    return sample_train, training_input, test_input, class_labels
Example #41
 def scale(self):
     # Scaling is an important part of this process: many of our algorithms
     # require our data to be scaled or otherwise standardized. We 
     # do this by scaling features to values between [0,1]. This preserves
     # zero entries in our sparse matrix which is always a desirable 
     # quality when working with this sort of data.
     # Scaling is sort of a convoluted process because Scipy/Scikit
     # doesn't offer a way to do this natively. We transpose the matrix, 
     # convert it to LIL format (which isn't inefficient in this operation),
     # and divide each row (column in the original matrix) by the row's
     # sum before transposing and converting back to CSR. 
     # However, if the matrix is not sparse, we don't have to worry about
     # this and can simply use one of Scikit's utility methods.
     # TODO: Maybe look at profiling to ensure that this strategy really
     # is the least expensive one.
     if self.sparse:
         self.vecs = self.vecs.tolil()
         self.vecs = self.vecs.transpose()
         num_features, _ = self.vecs.shape
         for i in range(num_features):
             self.vecs[i] /= self.vecs[i].sum()
         self.vecs = self.vecs.transpose()
         self.vecs = self.vecs.tocsr()
     else:
         mms = MinMaxScaler(copy = False)
         self.vecs = mms.fit_transform(self.vecs)
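     # The column-wise normalization described in the comment above can also be done
     # without leaving CSR format; a hedged alternative sketch (not the author's code),
     # equivalent when all entries are non-negative:
     #
     #   from sklearn.preprocessing import normalize
     #   self.vecs = normalize(self.vecs, norm='l1', axis=0)  # divide each column by its sum, sparse-safe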
def NB_coefficients(year=2010):
    poi_dist = getFourSquarePOIDistribution(useRatio=False)
    F_taxi = getTaxiFlow(normalization="bydestination")
    W2 = generate_geographical_SpatialLag_ca()
    Y = retrieve_crime_count(year=year)
    C = generate_corina_features()
    D = C[1]

    popul = C[1][:,0].reshape(C[1].shape[0],1)
    Y = np.divide(Y, popul) * 10000
    
    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)
    
    f = np.concatenate( (D, f2, ftaxi, poi_dist), axis=1 )
    mms = MinMaxScaler(copy=False)
    mms.fit(f)
    mms.transform(f)
    header = C[0] + [ 'spatiallag', 'taxiflow'] + \
        ['POI food', 'POI residence', 'POI travel', 'POI arts entertainment', 
                       'POI outdoors recreation', 'POI education', 'POI nightlife', 
                       'POI professional', 'POI shops', 'POI event']
    df = pd.DataFrame(f, columns=header)
    
    np.savetxt("Y.csv", Y, delimiter=",")
    df.to_csv("f.csv", sep=",", index=False)
    
    # NB permute
    nbres = subprocess.check_output(['Rscript', 'nbr_eval.R', 'ca', 'coefficient'])
    nbres = nbres.decode('utf-8')  # check_output returns bytes in Python 3
    print(nbres)

    ls = nbres.strip().split(" ")
    coef = [float(e) for e in ls]
    print(coef)
    return coef, header
def Breast_cancer(training_size, test_size, n, PLOT_DATA):
    class_labels = [r'A', r'B']
    data, target = datasets.load_breast_cancer(return_X_y=True)
    sample_train, sample_test, label_train, label_test = train_test_split(data, target, test_size=0.3, random_state=12)

    # Now we standardize for a Gaussian around 0 with unit variance
    std_scale = StandardScaler().fit(sample_train)
    sample_train = std_scale.transform(sample_train)
    sample_test = std_scale.transform(sample_test)

    # Now reduce number of features to number of qubits
    pca = PCA(n_components=n).fit(sample_train)
    sample_train = pca.transform(sample_train)
    sample_test = pca.transform(sample_test)

    # Scale to the range (-1,+1)
    samples = np.append(sample_train, sample_test, axis=0)
    minmax_scale = MinMaxScaler((-1, 1)).fit(samples)
    sample_train = minmax_scale.transform(sample_train)
    sample_test = minmax_scale.transform(sample_test)

    # Pick training size number of samples from each distro
    training_input = {key: (sample_train[label_train == k, :])[:training_size] for k, key in enumerate(class_labels)}
    test_input = {key: (sample_train[label_train == k, :])[training_size:(
        training_size+test_size)] for k, key in enumerate(class_labels)}

    if PLOT_DATA:
        for k in range(0, 2):
            plt.scatter(sample_train[label_train == k, 0][:training_size],
                        sample_train[label_train == k, 1][:training_size])

        plt.title("PCA dim. reduced Breast cancer dataset")
        plt.show()

    return sample_train, training_input, test_input, class_labels
Exemple #44
def readTrainingData():
    data = np.loadtxt( 'data/training.csv', delimiter=',', skiprows=1, converters={32: lambda x:int(x=='s'.encode('utf-8')) })
    allY = data[:, 32]
    allX = data[:, 1:31]
    allW = data[:, 31]
    
    scale = MMS()
    allX = scale.fit_transform(allX)
    np.random.seed(42)
    r = np.random.rand(allY.shape[0])

    xTrain = allX[r<=0.4]
    yTrain = allY[r<=0.4]
    wTrain = allW[r<=0.4]

    xValid = allX[r>0.7]
    yValid = allY[r>0.7]
    wValid = allW[r>0.7]

    v = np.random.rand(yValid.shape[0])
    xCrossValid = xValid[v<=0.5]
    yCrossValid = yValid[v<=0.5]
    wCrossValid = wValid[v<=0.5]

    xTestValid  = xValid[v>0.5]
    yTestValid  = yValid[v>0.5]
    wTestValid  = wValid[v>0.5]
    
    return [xTrain, yTrain, wTrain, xCrossValid, yCrossValid, wCrossValid, xTestValid, yTestValid, wTestValid]
def getips(conf, net, superpixels_num, layer='inner_product_target'):
    (options, args) = parser.parse_args()
    layer = options.layer
    data = net.blobs[layer].data
    #data = net.blobs['InnerProduct1'].data
    feature_len = data.shape[1]
    try:
        negative_numbers = conf.model['number_of_negatives']
    except:
        negative_numbers = 1
    reps = np.zeros((superpixels_num*negative_numbers, feature_len))
    for i in range(superpixels_num):
        if i % 1000 == 1:
            print(i)
        net.forward()
        reps[i] = np.sum(net.blobs[layer].data, axis=1)
    reps_slice = reps[..., 0]
    from sklearn.preprocessing import MinMaxScaler
    clf = MinMaxScaler()
    # scikit-learn scalers expect 2-D input, so reshape the 1-D slice before scaling
    reps_slice = clf.fit_transform(reps_slice.reshape(-1, 1)).ravel()
    if negative_numbers > 1:
        reps_slice = np.square(reps_slice)
    #reps_slice[reps_slice<np.mean(reps_slice)] = 0
    for i in range(reps_slice.shape[0]):
        reps[i] = reps_slice[i]
        # print net.blobs['inner_product_target'].data[1:10]
    return reps
Exemple #46
    def fit(self, X, y):
        X = np.matrix(X)
        y = np.matrix(y)
        self._outputNormalizer = MinMaxScaler()
        self._inputNormalizer = MinMaxScaler()
        self._outputNormalizer = self._outputNormalizer.fit(y)
        self._inputNormalizer = self._inputNormalizer.fit(X)
        self._inputDimension = X.shape[1]
        self._outputDimension = y.shape[1]#For now, hardcoded to 1-dimensional regression problems.
        if not self._warmStart or self._weights is None:
            self._initializeWeights()
            self._lastDelta = None
        batchFeatures, batchTargets = self._batchify(np.matrix(self._inputNormalizer.transform(X)), self._batchSize,
                                                     np.matrix(self._outputNormalizer.transform(y)))


        #do for each step until the maximum steps:
        for i in range(self._maxSteps):
            reducedLearningRate = self._learningRate * self._shrinkage ** self._step
            for j in range(len(batchFeatures)):
                deltaW = self._learnFromBatch(batchFeatures[j], batchTargets[j])
                if self._lastDelta is None:
                    self._lastDelta = deltaW
                for k in range(len(self._weights)):
                    self._lastDelta[k] = ((1-self._momentum) * deltaW[k] + self._momentum * self._lastDelta[k])
                    self._weights[k] = self._weights[k] + reducedLearningRate * self._lastDelta[k]
                #self._positifyWeights()
            self._step += 1
        #print(step)
        return self
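The inner loop above is a momentum update: each step blends the freshly computed batch delta with the previous step before moving the weights. A small standalone numpy sketch of the same rule, with illustrative names that are not part of the class:

import numpy as np

def momentum_step(weights, delta_w, last_delta, learning_rate=0.1, momentum=0.9):
    """Blend the new batch delta with the previous step, then move the weights."""
    if last_delta is None:
        last_delta = delta_w
    last_delta = (1 - momentum) * delta_w + momentum * last_delta
    weights = weights + learning_rate * last_delta
    return weights, last_delta

w = np.zeros(3)
last = None
for delta in (np.array([1.0, 0.5, -0.2]), np.array([0.8, 0.4, -0.1])):
    w, last = momentum_step(w, delta, last)
print(w)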
Exemple #47
def analysis_7(df_Coredata):
	""" 多次元多項式モデル """

	#https://www.jeremyjordan.me/polynomial-regression/

	X = df_Coredata[['d','e','f','g','i']]
	y = df_Coredata['j']

	# Set the plot style
	sns.set(style = 'whitegrid', context = 'notebook')
	# Plot pairwise relationships between the variables
	#sns.pairplot(df_Coredata)
	#plt.show()


	#X_train, X_test, y_train, y_test  =  train_test_split(X,y,random_state = 0)
	#lr = linear_model.LinearRegression().fit(X_train, y_train)
	#print("Trainng set score: {:.2f}".format(lr.score(X_train, y_train)))
	#print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))

	### Rescale the data
	# Standardization
	std_Scaler = StandardScaler()
	data_std = std_Scaler.fit_transform(X)

	mmx_Scaler = MinMaxScaler()
	X_scaled = mmx_Scaler.fit_transform(X)
	#X_test_scaled = scaler.transform(X_test)

	#print(X_train_scaled)

	poly = PolynomialFeatures(degree = 2).fit(data_std)
	print(poly.get_feature_names())
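The function above stops after constructing the degree-2 polynomial features. A minimal sketch, under the assumption that the goal is the usual polynomial-regression workflow, of fitting a linear model on those expanded features with a Pipeline (the toy data is illustrative):

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Toy data: y is a noisy quadratic function of two inputs.
rng = np.random.RandomState(0)
X = rng.uniform(-2, 2, size=(200, 2))
y = 1.5 * X[:, 0] ** 2 - 0.5 * X[:, 0] * X[:, 1] + X[:, 1] + rng.normal(0, 0.1, 200)

model = make_pipeline(StandardScaler(),
                      PolynomialFeatures(degree=2),
                      LinearRegression())
model.fit(X, y)
print("R^2 on the training data:", model.score(X, y))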
def minmaxscaling(df):
    # MinMaxScaling between 0 and 1 is bad when you have outliers.
    # https://stats.stackexchange.com/a/10298
    scaler = MinMaxScaler(feature_range=(0, 1))
    # MinMaxScaler wants features in the columns and samples in the rows -> ok
    df = scaler.fit_transform(df)
    return df, scaler
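As the comment notes, min-max scaling is sensitive to outliers because a single extreme value stretches the whole range. A small sketch contrasting it with RobustScaler, which centers on the median and scales by the interquartile range (toy data, illustrative only):

import numpy as np
from sklearn.preprocessing import MinMaxScaler, RobustScaler

# One column with a single large outlier.
x = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])

print(MinMaxScaler().fit_transform(x).ravel())
# The inliers are squeezed into roughly [0, 0.03]; the outlier dominates the range.

print(RobustScaler().fit_transform(x).ravel())
# Median/IQR scaling keeps the inliers spread out; the outlier is simply large.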
def runAlgorithm(data, categories, function, iterations = 5, num_partitions = 2):
    results_table = np.empty([iterations*num_partitions,4], dtype=float)
    scaler = MinMaxScaler()
    data = scaler.fit_transform(data)

    for i in range(iterations):
        # Perform a random partition
        print("Iteration ", i)
        partition  = makePartitions(data, categories, random_ppio)
        for j in range(num_partitions):
            print("Sub iteration ", j)
            start = time.time()

            training_data = partition[0][j]
            training_categ = partition[1][j]

            test_data = np.array([partition[0][k][l] for k in range(num_partitions) if k!=j for l in range(len(partition[0][k]))], float)
            test_categ = np.array([partition[1][k][l] for k in range(num_partitions) if k!=j for l in range(len(partition[1][k]))])

            solution, train_rate = function(training_data, training_categ)

            end = time.time()

            nbrs =  neighbors.KNeighborsClassifier(3)
            nbrs.fit(training_data[:,solution],training_categ)
            rate = 100*nbrs.score(test_data[:,solution], test_categ)

            results_table[i*num_partitions+j,0] = train_rate/len(training_data)*100
            results_table[i*num_partitions+j,1] = rate
            results_table[i*num_partitions+j,2] = (1 - sum(solution)/len(training_data[0]))*100
            results_table[i*num_partitions+j,3] = end-start

            print("Rate = " + str(rate) + "\nTime = " + str(end-start) + " s")

    return results_table
def cal_result(model,year):
    """
    计算1个模型的各个统计量
    :param model: 模型
    :return: 统计量列表
    """
    X = load_data(year)[0]
    y1 = load_data(year)[1][0]  # 票房
    y2= load_data(year)[1][1]  # 微博评分
    y3= load_data(year)[1][2]  # 豆瓣评分
    y4 = load_data(year)[1][3]  # 时光网评分
    scaler = MinMaxScaler().fit(X)
    X = scaler.transform(X)
    # print model(X, y1)[0]
    # print model(X, y2)[0]
    # print model(X, y3)[0]
    # print model(X, y4)[0]
    result = cal_one_model(model(X, y1)[0], cal_avg(model(X, y2)[0], model(X, y3)[0], model(X, y4)[0]))
    result1 = []
    result1.append(model(X, y1)[1])
    result1.append(model(X, y2)[1])
    result1.append(model(X, y3)[1])
    result1.append(model(X, y4)[1])
    # print result1
    # scaler = StandardScaler().fit(result1)
    # result1 = scaler.transform(result1)
    return result, result1
def train(mode):
    if mode == "NextWeek":
        DATA = "MLprojectOutput/week34567to8Formated/part-00000"
    else:
        DATA = "MLprojectOutput/week34567to9Formated/part-00000"
    X, Y = readData(DATA, 10000, -1)
    X_Scaler = MinMaxScaler().fit(X)
    joblib.dump(X_Scaler, 'Predict{0}_Scaler.pkl'.format(mode))
    X = X_Scaler.transform(X)
    dtrain = xgb.DMatrix(X, label = Y)
    param = { 'booster':"gbtree",
              'eta':0.3,
              'max_depth':6,
              'subsample':0.85,
              'colsample_bytree':0.7,
              'silent':0,
              'objective':'reg:linear',
              'nthread':10,
              'eval_metric':'rmse'}
    __model = xgb.train(param.items(), dtrain)
    __model.save_model('Predict{0}.model'.format(mode))
    X_TEST, Y_TEST = readData(DATA, 0, 10000)
    X_TEST = X_Scaler.transform(X_TEST)
    dtest = xgb.DMatrix(X_TEST)
    Y_pred = list(map(lambda x: int(x), __model.predict(dtest)))
    evaluate(Y_TEST,Y_pred)
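Since the function above persists both the fitted scaler and the booster, a hedged sketch of how the two artifacts might be reloaded later for prediction; the file names mirror the format strings above with mode="NextWeek", and new_X is a hypothetical array of fresh feature rows:

import joblib
import xgboost as xgb

# Reload the scaler and the trained booster saved by train("NextWeek").
scaler = joblib.load('PredictNextWeek_Scaler.pkl')
booster = xgb.Booster()
booster.load_model('PredictNextWeek.model')

def predict_next_week(new_X):
    # new_X: raw feature rows with the same columns as the training data.
    scaled = scaler.transform(new_X)
    return booster.predict(xgb.DMatrix(scaled))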
def uniform_to_normal(df, continuous_features):
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(scaler.fit_transform(df[continuous_features].dropna()), columns=continuous_features)
    uniform = set()
    alpha = 0.05

    for c in continuous_features:
        statistic, pvalue = kstest(df_scaled[c], scipy.stats.uniform().cdf)
        # Keep features whose KS test does not reject uniformity at level alpha.
        if pvalue > alpha:
            uniform.add(c)

    zero_to_one = [f for f in uniform if
                   df[f].min() > 0 and df[f].min() < 0.001 and df[f].max() < 1 and df[f].max() > 0.999]
    zero_to_ten = [f for f in uniform if
                   df[f].min() > 0 and df[f].min() < 0.01 and df[f].max() < 10 and df[f].max() > 9.99]
    zero_to_hundred = [f for f in uniform if
                       df[f].min() > 0 and df[f].min() < 0.1 and df[f].max() < 100 and df[f].max() > 99.9]
    for f in uniform:
        f_min = 0 if f in zero_to_one or f in zero_to_ten or f in zero_to_hundred else df[f].min()
        f_max = 1 if f in zero_to_one else (10 if f in zero_to_ten else 100 if f in zero_to_hundred else df[f].max())
        # We could use df_scaled here, but mapping with what we believe are the true
        # min and max (rather than the observed ones) should give better results.
        df[f] = df[f].map(lambda x: norm.ppf((x - f_min) / (f_max - f_min)))

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    return uniform
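The function above maps (approximately) uniform features to normal ones through the inverse normal CDF. A related, more automatic alternative, shown here only as a sketch for comparison, is scikit-learn's QuantileTransformer with a normal output distribution, which estimates the empirical CDF instead of assuming known bounds:

import numpy as np
from sklearn.preprocessing import QuantileTransformer

rng = np.random.RandomState(0)
X = rng.uniform(0, 10, size=(1000, 1))  # roughly uniform on (0, 10)

qt = QuantileTransformer(output_distribution='normal', n_quantiles=100, random_state=0)
X_gauss = qt.fit_transform(X)

print(X_gauss.mean(), X_gauss.std())  # close to 0 and 1 after the transform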
    def _scaled_data(self):
        """Load scaled data.

        Args:
            None

        Returns:
            (scaler, train, test): Tuple of list of train and test data

        """
        # Initialize key variables
        (_train, _test) = self._data()

        # Fit scaler
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaler = scaler.fit(_train)

        # Transform train
        train = _train.reshape(_train.shape[0], _train.shape[1])
        train_scaled = scaler.transform(train)

        # Transform test
        test = _test.reshape(_test.shape[0], _test.shape[1])
        test_scaled = scaler.transform(test)

        # Return
        return scaler, train_scaled, test_scaled
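Because the method returns the fitted scaler along with the scaled splits, downstream code can undo the scaling on model output. A small sketch of that round trip with a (-1, 1) MinMaxScaler, using toy arrays in place of the class's _data() output:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

train = np.array([[10.0, 200.0], [20.0, 400.0], [30.0, 600.0]])

scaler = MinMaxScaler(feature_range=(-1, 1)).fit(train)
train_scaled = scaler.transform(train)

# Suppose a model produced predictions in scaled space; map them back.
scaled_predictions = np.array([[0.0, 0.5]])
print(scaler.inverse_transform(scaled_predictions))  # values in the original units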
Exemple #54
def normalize_data(tr_x,ts_x,normz=None,axis=0):
    if normz == 'scale':
        tr_x = scale(tr_x, axis=axis)
        ts_x = scale(ts_x, axis=axis)
    elif normz == 'minmax':
        minmax_scaler = MinMaxScaler()
        if axis == 0:
            for c_i in range(tr_x.shape[1]):
                # scikit-learn scalers expect 2-D input, so reshape each 1-D slice
                tr_x[:, c_i] = minmax_scaler.fit_transform(tr_x[:, c_i].reshape(-1, 1)).ravel()
                ts_x[:, c_i] = minmax_scaler.fit_transform(ts_x[:, c_i].reshape(-1, 1)).ravel()
        elif axis == 1:
            for r_i in range(tr_x.shape[0]):
                tr_x[r_i, :] = minmax_scaler.fit_transform(tr_x[r_i, :].reshape(-1, 1)).ravel()
                ts_x[r_i, :] = minmax_scaler.fit_transform(ts_x[r_i, :].reshape(-1, 1)).ravel()
    elif normz == 'sigmoid':
        if axis==0:
            col_max = np.max(tr_x,axis=0)
            cols_non_norm = np.argwhere(col_max>1).tolist()
            tr_x[:,cols_non_norm] = -0.5 + (1 / (1 + np.exp(-tr_x[:,cols_non_norm])))
            # TODO: implement col_max col_non_norm for test set
            ts_x[:,cols_non_norm] = -0.5 + (1/(1+np.exp(-ts_x[:,cols_non_norm])))
        elif axis==1:
            row_max = np.max(tr_x,axis=1)
            rows_non_norm = np.argwhere(row_max>1).tolist()
            tr_x[rows_non_norm,:] = -0.5 + (1 / (1 + np.exp(-tr_x[rows_non_norm,:])))
            # TODO: implement row_max row_non_norm for test set
            ts_x[rows_non_norm,:] = -0.5 + (1/(1+np.exp(-ts_x[rows_non_norm,:])))

    return tr_x,ts_x
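For the 'minmax' branch, the per-column loop is not strictly necessary: MinMaxScaler already scales each column independently, and fitting it once on the training set lets the same parameters be reused for the test set. A short sketch of that pattern (toy arrays, illustrative only):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

tr_x = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
ts_x = np.array([[1.5, 25.0], [2.5, 5.0]])

scaler = MinMaxScaler()
tr_scaled = scaler.fit_transform(tr_x)   # learn per-column min/max on the training data
ts_scaled = scaler.transform(ts_x)       # apply the same parameters to the test data

print(tr_scaled)
print(ts_scaled)  # test values may fall outside [0, 1] if they exceed the training range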
def test_stratified_shuffle_split(clf, dataset, feature_list, folds = 1000, scale_features = True):
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
 
    # Scale features
    if(scale_features):
        scaler = MinMaxScaler()
        features = scaler.fit_transform(features)

    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print('Total predictions: ' + str(total_predictions))
        print('Accuracy: ' + str(accuracy))
        print('Precision: ' + str(precision))
        print('Recall: ' + str(recall))
        print('F1: ' + str(f1))
        print('F2: ' + str(f2))
        print("")
    except:
        print("Got a divide by zero when trying out:", clf)
        print("Precision or recall may be undefined due to a lack of true positive predictions.")
def plot_prediction_relevance(results, EFA=True, classifier='ridge', 
                              rotate='oblimin', change=False, size=4.6, 
                              dpi=300, ext='png', plot_dir=None):
    """ Plots the relevant relevance of each factor for predicting all outcomes """
    predictions = results.load_prediction_object(EFA=EFA, 
                                                 change=change,
                                                 classifier=classifier,
                                                 rotate=rotate)['data']

    targets = list(predictions.keys())
    predictors = predictions[targets[0]]['predvars']
    importances = abs(np.vstack([predictions[k]['importances'] for k in targets]))
    # scale to 0-1 
    scaler = MinMaxScaler()
    scaled_importances = scaler.fit_transform(importances.T).T
    # make proportion
    scaled_importances = scaled_importances/np.expand_dims(scaled_importances.sum(1),1)
    # convert to dataframe
    scaled_df = pd.DataFrame(scaled_importances, index=targets, columns=predictors)
    melted = scaled_df.melt(var_name='Factor', value_name='Importance')
    plt.figure(figsize=(8,12))
    f=sns.boxplot(y='Factor', x='Importance',  data=melted,
                  width=.5)
    if plot_dir is not None:
        filename = 'prediction_relevance'
        save_figure(f, path.join(plot_dir, filename), 
                    {'bbox_inches': 'tight', 'dpi': dpi})
        plt.close()
def test_min_max_scaler_zero_variance_features():
    """Check min max scaler on toy data with zero variance features"""
    X = [[0.,  1.,  0.5],
         [0.,  1., -0.1],
         [0.,  1.,  1.1]]

    X_new = [[+0.,  2.,  0.5],
             [-1.,  1.,  0.0],
             [+0.,  1.,  1.5]]

    # default params
    scaler = MinMaxScaler()
    X_trans = scaler.fit_transform(X)
    X_expected_0_1 = [[0.,  0.,  0.5],
                      [0.,  0.,  0.0],
                      [0.,  0.,  1.0]]
    assert_array_almost_equal(X_trans, X_expected_0_1)

    X_trans_new = scaler.transform(X_new)
    X_expected_0_1_new = [[+0.,  1.,  0.500],
                          [-1.,  0.,  0.083],
                          [+0.,  0.,  1.333]]
    assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2)

    # not default params
    scaler = MinMaxScaler(feature_range=(1, 2))
    X_trans = scaler.fit_transform(X)
    X_expected_1_2 = [[1.,  1.,  1.5],
                      [1.,  1.,  1.0],
                      [1.,  1.,  2.0]]
    assert_array_almost_equal(X_trans, X_expected_1_2)
    def get_training_data_by_category(category, limit=0):
        limit_pos = limit*0.2
        limit_neg = limit*0.8
        N_pos = DataDAO.count_training_data_by_category(category)
        if N_pos < limit_pos:
            limit_pos = N_pos
            limit_neg = N_pos*5

        training_data = []
        training_target = []
        positive = DataDAO.get_training_data_by_category(category)
        for ind, sample in enumerate(positive):
            if limit != 0 and ind >= limit_pos:
                break
            training_data.append(sample)
            training_target.append(1)
        negative = DataDAO.get_training_data_by_other_categories(category)
        for ind, sample in enumerate(negative):
            if limit != 0 and ind >= limit_neg:
                break
            training_data.append(sample)
            training_target.append(0)

        scaler = MinMaxScaler()
        training_data_scaled = scaler.fit_transform(training_data)

        # training_data_scaled = scale(training_data,axis=0)
        tr_data_sparse = csr_matrix(training_data_scaled)

        return tr_data_sparse, training_target, scaler
def train_model(feats_csv):

	df = pd.DataFrame()
	df = pd.read_csv(feats_csv).iloc[:,1:]

	y = np.ravel(df.iloc[:,-1:])
	X = np.array(df.iloc[:,:-1])

	############ 15 Best selected features using ANOVA F-value score function ###############
	selector = SelectKBest(f_classif, k=15).fit(X, y)
	X_new = selector.transform(X)
	selected_features = selector.get_support(indices=True)

	############ KNN manhattan ###############
	##### preprocessing: data scaling######## 
	min_max_scaler = MinMaxScaler()
	X_new = min_max_scaler.fit_transform(X_new)

	model = KNeighborsClassifier(n_neighbors = 1,algorithm = 'brute',metric = 'manhattan',weights = 'uniform')
	model.fit(X_new,y)

	newdir = '../kNN_clfr'
	os.makedirs(newdir, exist_ok=True)  # don't fail if the output directory already exists

	joblib.dump(model, os.path.join(newdir,'kNN.pkl')) 

	return
Exemple #60
def sdae_syn(X_s,P,h_layer,activations,noise,epoch,loss,batch_size):
	"""Generate synthetic samples using stacked De-noising Encoders
	Parameters
	----------
	X_s: positive class sample (Numpy Array) (Input Must be in within range of 0 to 1)
	P: Over Sampling Percentage
	h_layer: hidden layer (list)
	activation: activation functions list (same length as hidden layer)
	noise : [None,Gaussian,mask]
	epoch: epoch for each layer (list with same size as hidden layer)
	loss: 'rmse' or 'cross-entropy'
	batch_size = mini_batch size

	For more detaisl on input parameters https://github.com/rajarsheem/libsdae 
	"""
	n_samples=int(X_s.shape[0]*P/100)
	print "generating %d samples" %(n_samples)
	X_init=np.random.standard_normal(size=(n_samples,X_s.shape[1]))
	scaler=MinMaxScaler()
	X_init=scaler.fit_transform(X_init)
	model = StackedAutoEncoder(dims=h_layer, activations=activations, noise=noise, 
		epoch=epoch,loss=loss, 
		batch_size=batch_size, lr=0.007, print_step=2000)
	model.fit(X_s)
	syn_Z=model.transform(X_init)
	return syn_Z
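A hypothetical call to sdae_syn is sketched below; the layer sizes, noise type and other argument values are illustrative placeholders, and X_pos stands for a minority-class matrix already scaled into [0, 1] as the docstring requires:

# Illustrative only: oversample the positive class by 200% with two hidden layers.
synthetic = sdae_syn(X_pos, P=200,
                     h_layer=[64, 32],
                     activations=['sigmoid', 'sigmoid'],
                     noise='gaussian',
                     epoch=[1000, 1000],
                     loss='rmse',
                     batch_size=64)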