Example #1
 def fit(self, X, y, lam_fpc):
     n, self.p = X.shape
     if self.standardize:
         self.enc = SS().fit(X)
     else:
         self.enc = SS()
         self.enc.mean_ = np.repeat(0, self.p)
         self.enc.scale_ = np.repeat(1, self.p)
     Xtil = self.enc.transform(X)
     ybar = y.mean()
     lmax = max(np.abs(Xtil.T.dot(y - ybar) / n))
     lmin = lmax * 0.001
     lseq = np.exp(np.linspace(np.log(lmax), np.log(lmin), 100))
     self.l1 = Lasso(fit_intercept=True,
                     copy_X=False,
                     warm_start=True)
     e2 = np.repeat(0.0, len(lseq))
     ll2 = e2.copy()
     for ii, ll in enumerate(lseq):
         self.l1.alpha = ll
         self.l1.fit(Xtil, y)
         r2 = np.sqrt(sum((y - self.l1.predict(Xtil))**2))
         e2[ii], ll2[ii] = r2, n * ll / r2
         if ll2[ii] < lam_fpc:
             print('Found solution!')
             self.supp = np.where(~(self.l1.coef_ == 0))[0]
             self.l1 = LogisticRegression(penalty='l2',
                                          C=1000,
                                          fit_intercept=True,
                                          solver='lbfgs',
                                          max_iter=1000)
             self.l1.fit(Xtil[:, self.supp], y)
             break
Example #2
    def fit(self, Xtrain, ytrain, validation_data=None, **vae_kwargs):
        """
        Fits a VAE oversampler

        Arguments:
            Xtrain: training data
            ytrain: training labels
            validation_data: optional tuple (Xtest, ytest)
            **vae_kwargs: passed through to the Keras variational autoencoder

        Returns: None
        """
        if validation_data is not None:
            Xtest, ytest = validation_data
        if self.rescale:
            self.ss = SS()
            self.ss.fit(Xtrain[ytrain == self.minority_class_id])
            X = self.ss.transform(Xtrain[ytrain == self.minority_class_id])
            if validation_data is not None:
                x_test = self.ss.transform(
                    Xtest[ytest == self.minority_class_id])
        else:
            X = Xtrain[ytrain == self.minority_class_id]
            if validation_data is not None:
                x_test = Xtest[ytest == self.minority_class_id]
        if validation_data is not None:
            self.build_train(X, x_test=x_test, **vae_kwargs)
        else:
            self.build_train(X, **vae_kwargs)
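
# The core preprocessing pattern above (fitting the scaler on the minority-class rows
# only) can be checked in isolation. A minimal, self-contained sketch with synthetic
# data; the array shapes and class ratio are made up for illustration:
import numpy as np
from sklearn.preprocessing import StandardScaler as SS

Xtrain = np.random.randn(500, 4) * 3.0 + 1.0
ytrain = np.random.binomial(1, 0.2, size=500)    # class 1 plays the minority role
minority_class_id = 1

ss = SS()
ss.fit(Xtrain[ytrain == minority_class_id])       # statistics come from minority rows only
X_minority_scaled = ss.transform(Xtrain[ytrain == minority_class_id])
print(X_minority_scaled.mean(axis=0).round(3), X_minority_scaled.std(axis=0).round(3))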
Example #3
def normalise_numeric_features(X,
                               standardisation=False,
                               means=True,
                               stdev=True):
    """ Normalisation for numeric features

    :param X: A numpy matrix of the data. First axis corresponding to instances, second axis corresponding to samples
    :param standardisation: Whether standardisation needs to be done instead of normalisation. Default: False
    :param means: Whether the mean should be normalised. Default: True
    :param stdev: Whether the standard devation should be normalised. Default: True
    :return: X and features with numeric features normalised.
    """

    column_types = column_types_dataset(X, categorical=False)

    for i in range(len(column_types)):
        if column_types[i]:

            if standardisation:
                # Standardisation: min-max scaling to the [0, 1] range
                scaler = MMS(feature_range=(0, 1))
                X[:, i:i + 1] = scaler.fit_transform(X[:, i:i + 1])
            else:
                # Normalisation: remove the mean and/or scale to unit variance
                scaler = SS(with_mean=means, with_std=stdev)
                X[:, i:i + 1] = scaler.fit_transform(X[:, i:i + 1])

    return X
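
# The per-column loop above can be checked in isolation. A minimal sketch, using an
# explicit boolean mask in place of column_types_dataset (which is defined elsewhere
# in the source):
import numpy as np
from sklearn.preprocessing import StandardScaler as SS, MinMaxScaler as MMS

X_demo = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 40.0]])
numeric_mask = [True, True]        # stands in for column_types_dataset(X_demo)

for i, is_numeric in enumerate(numeric_mask):
    if is_numeric:
        # column-wise scaling; swap SS for MMS(feature_range=(0, 1)) to min-max scale instead
        X_demo[:, i:i + 1] = SS().fit_transform(X_demo[:, i:i + 1])

print(X_demo.round(3))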
Example #4
 def __init__(self, mva=30, data=None):
     self.mva = mva
     self.scaler = SS()
     self.orig_data = data
     self.datasets = {}
     self.mvas = {}
     self.datasetcount = 0
     if data is not None:
         self.datasets['orig'] = data
Example #5
 def standardizeData(self, df_orig):
     '''
     standardize data with the help of sklearn's StandardScaler() class
     '''
     scaler = SS()
     scaled_columns = scaler.fit_transform(df_orig[self.column_selection])
     df_cp = df_orig[self.column_selection].copy()
     for num, column in enumerate(self.column_selection):
         df_cp[column + "_scaled"] = scaled_columns[:, num]
     return df_cp.iloc[:, len(self.column_selection):]
Example #6
def scale_it(dat, tq=True):
    sh0, sh2 = dat.shape[0], dat.shape[2]
    s = SS(
        copy=False
    )  # copy=False does the scaling inplace, so we don't have to make a new list
    if tq:
        it = tqdm(range(sh0))
    else:
        it = range(sh0)
    for j in it:  # timesteps
        for i in range(sh2):  # number of indicators/etc
            _ = s.fit_transform(dat[j, :, i].reshape(-1, 1))[:, 0]
Example #7
def scale_pcts(df):
    feats = [
        'close-open_pct', 'high-low_pct', 'close-close_pct', 'open-open_pct',
        'high-high_pct', 'low-low_pct', 'vol-vol_pct'
    ]

    scalers = []
    for f in feats:
        sc = SS()
        df[f + '_scaled'] = sc.fit_transform(df[f].values.reshape(-1, 1))
        scalers.append(sc)

    return df, scalers
Example #8
    def fit(self, X=None, y=None):
        """Pass.

        Parameters
        ----------
        X
            Ignored

        y
            Ignored

        """
        self._ss = SS(with_mean=self.norm_mean, with_std=self.norm_std)
        return self
Example #9
def _calculate_component(XA: np.ndarray):
    """
    Calculate each component to calculate dissimilarity for subsequences
    """
    # distance component
    mu_XA = XA.mean(axis=0)

    # rotation component
    pca_XA = PCA()
    pca_XA.fit(SS().fit_transform(XA))
    e_vector_XA = pca_XA.components_

    # variance component
    p_XA = pca_XA.explained_variance_

    return mu_XA, e_vector_XA, p_XA
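
# A small usage sketch, assuming the imports used by the snippet (numpy as np,
# sklearn.decomposition.PCA, sklearn.preprocessing.StandardScaler as SS) and a
# subsequence shaped (time steps, variables); the toy data is made up for illustration:
import numpy as np

rng = np.random.default_rng(0)
XA_demo = rng.normal(size=(50, 3))                  # toy subsequence: 50 steps, 3 variables

mu, e_vectors, variances = _calculate_component(XA_demo)
print(mu.shape, e_vectors.shape, variances.shape)   # (3,), (3, 3), (3,)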
Example #10
    def fit_resample(self, Xtrain, ytrain, validation_data=None, **vae_kwargs):
        """
        Fits a VAE oversampler and returns the resampled dataset

        Arguments:
            Xtrain: training data
            ytrain: training labels
            validation_data: optional tuple (Xtest, ytest)
            **vae_kwargs: passed through to the Keras variational autoencoder

        Returns:
            Xres, yres: resampled data and labels;
            attempts to balance the dataset to 50% minority class
        """
        if validation_data is not None:
            Xtest, ytest = validation_data
        num_samples_to_generate = max(
            Xtrain[ytrain != self.minority_class_id].shape[0] -
            Xtrain[ytrain == self.minority_class_id].shape[0], 100)
        if self.rescale:
            self.ss = SS()
            self.ss.fit(Xtrain[ytrain == self.minority_class_id])
            X = self.ss.transform(Xtrain[ytrain == self.minority_class_id])
            if validation_data is not None:
                x_test = self.ss.transform(
                    Xtest[ytest == self.minority_class_id])
        else:
            X = Xtrain[ytrain == self.minority_class_id]
            if validation_data is not None:
                x_test = Xtest[ytest == self.minority_class_id]
        if validation_data is not None:
            self.build_train(X, x_test=x_test, **vae_kwargs)
        else:
            self.build_train(X, **vae_kwargs)
        z_sample = np.random.normal(0, 1,
                                    (num_samples_to_generate, self.latent_dim))
        outputs = self.decoder.predict(z_sample)
        if self.rescale:
            oversampled_X = self.ss.inverse_transform(outputs)
        else:
            oversampled_X = outputs
        oversampled_y = np.ones(num_samples_to_generate)\
            * self.minority_class_id
        X_all = np.concatenate((Xtrain, oversampled_X))
        y_all = np.concatenate((ytrain, oversampled_y))
        return (X_all, y_all)
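
# The balancing arithmetic above can be checked in isolation. A minimal,
# self-contained sketch; the class counts are made up for illustration:
import numpy as np

ytrain_demo = np.array([0] * 900 + [1] * 100)           # class 1 is the minority
minority_class_id = 1

n_majority = (ytrain_demo != minority_class_id).sum()    # 900
n_minority = (ytrain_demo == minority_class_id).sum()    # 100
num_samples_to_generate = max(n_majority - n_minority, 100)   # 800 here

# after oversampling, the minority class is (100 + 800) of 1800 rows, i.e. 50%
print(num_samples_to_generate,
      (n_minority + num_samples_to_generate) / (len(ytrain_demo) + num_samples_to_generate))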
Example #11
def rf():
    X, y = make_classification(n_samples=1000,
                               n_features=4,
                               n_informative=2,
                               n_redundant=0,
                               random_state=0,
                               shuffle=False)
    with open(
            r'D:\work\DL_Predicting_Pharmacological\data\result_file\Paper_result\random_data_notd.txt',
            'r') as mesh_file:
        all_list = []
        all_data = []
        for mesh_lines in mesh_file:
            mesh_lines = mesh_lines.strip().split('\t')
            all_list.append(mesh_lines)
    label = []
    all_data = np.array(all_list, dtype=float)  # parsed strings -> numeric matrix
    all_data = SS().fit_transform(all_data)
    test_data = all_data[:16000]
    with open(
            r'D:\work\DL_Predicting_Pharmacological\data\result_file\Paper_result\random_label.txt',
            'r') as label_file:
        for label_line in label_file:
            label_line = label_line.strip()
            label.append(int(label_line))
    non_label = label
    #label=np_utils.to_categorical(label)
    vali_data = all_data[17001:17368]
    test_label = label[:16000]
    clf = RandomForestClassifier(max_depth=35, random_state=0)
    clf.fit(test_data, test_label)
    y = clf.predict(vali_data)
    pre_label = []
    #print(y)
    # for i in y:
    #     #i=list(i)
    #     pre_label.append(i.index(max(i)))
    print(y, 'predict')
    print(non_label[17001:17368])
    c = 0
    for i in range(len(y)):
        if y[i] == non_label[17001:17368][i]:
            c += 1
    print(c)
    print(c / len(y), 'rate')
Example #12
def gen_log_problem_obj_uniq_ss():
    df = utils.load_enroll()
    log_df = utils.load_log()
    arr = []
    log_sz = len(log_df.groupby('enrollment_id'))

    for i, (eid, part_df) in enumerate(log_df.groupby('enrollment_id')):
        if i % 1000 == 0:
            l.info("{0} of {1}".format(i, log_sz))

        ev = part_df[part_df['event'] == 'problem']
        part_d = {'enrollment_id': eid}
        part_d['evuniq'] = len(ev['object'].unique())
        arr.append(part_d)

    feat_df = pd.DataFrame(arr)
    df = df.merge(feat_df, how='left', on='enrollment_id').fillna(0)
    return {'X': SS().fit_transform(utils.reshape(df['evuniq']))}
Example #13
def pca_plot(col_nums=None, ann_cutoff=1 / 3):
    pca = PCA(whiten=False)
    ss = SS()
    X = pca.fit_transform(ss.fit_transform(zcta_df.fillna(zcta_df.median())))
    ann_factor = 20
    cols = zcta_df.columns
    if col_nums is None:
        col_nums = list(range(len(cols)))
    N = len(col_nums) + 1
    for coli in col_nums:
        for colj in col_nums:
            ploti = col_nums.index(coli)
            plotj = col_nums.index(colj)
            print(ploti, plotj, (plotj * N) + (ploti % N) + 1)
            plt.subplot(N, N, (plotj * N) + (ploti % N) + 1)
            plt.scatter(X[:, coli], X[:, colj], s=0.1)
            plt.xlabel("PC %s" % coli)
            plt.ylabel("PC %s" % colj)
            for i, col in enumerate(cols):
                arr = np.zeros(len(cols))
                arr[i] = 1
                proj = pca.transform([arr])
                x, y = proj[0][[coli, colj]]
                norm = sqrt(x**2 + y**2)
                print("col", i, "has norm", norm, "in graph", coli, colj)
                if norm > ann_cutoff:
                    plt.plot([0, ann_factor * x], [0, ann_factor * y])
                    plt.annotate(col, (ann_factor * x, ann_factor * y))
    plt.subplot(N, N, (N)**2)
    plt.plot(pca.explained_variance_ratio_)
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    for comp in range(len(pca.explained_variance_ratio_)):
        print("-" * 80)
        print(comp)
        print("explained variance ratio:", pca.explained_variance_ratio_[comp])
        sorted_loadings = sorted(zip(zcta_df.columns, pca.components_[comp]),
                                 key=lambda xy: xy[1],
                                 reverse=True)
        for col, load in sorted_loadings:
            print(col, load)
    plt.show()
Example #14
# In[8]:

Y = data['class']  # actual output
X = data[data.columns[:-1]]  # input data features
data, target = X, Y
from sklearn.model_selection import train_test_split as SPLIT
X_train, X_test, Y_train, Y_test = SPLIT(X, Y, test_size=0.3, random_state=4)
# 70% Data for Training, 30% Data for Testing

# ### Scale the Data

# In[9]:

from sklearn.preprocessing import StandardScaler as SS

X = SS().fit_transform(X)

# ## Train the Support Vector Classifier

# In[10]:

from sklearn.svm import SVC

# Hyperparameters
kernel = 'rbf'
C = 13
gamma = 0.325

from time import time as T
start = T()
model = SVC(kernel=kernel, C=C, gamma=gamma)
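
# The cell stops before the classifier is trained. A minimal continuation sketch,
# assuming the scaler should be fit on the training split only (the line above
# scales the full X after splitting, so X_train and X_test are still unscaled):
scaler = SS()
X_train_sc = scaler.fit_transform(X_train)   # fit the scaler on the training data only
X_test_sc = scaler.transform(X_test)

model.fit(X_train_sc, Y_train)
print(f'Training time: {T() - start:.3f}s')
print('Test accuracy:', model.score(X_test_sc, Y_test))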
Example #15
imp.fit(X[:, 1:3])
X[:, 1:3] = imp.transform(X[:, 1:3])

#Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
Labelx = LabelEncoder()
X[:, 0] = Labelx.fit_transform(X[:, 0])
print(X)
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough')

X = np.array(ct.fit_transform(X), dtype=float)
Labely = LabelEncoder()
Y = Labely.fit_transform(Y)
print(Y)

#Splitting the data into training and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler as SS
sc_X = SS()
x_train = sc_X.fit_transform(x_train)
x_test = sc_X.transform(x_test)
Example #16

df_full = df_full.fillna(-999) #-999

df_full['CR_AMB_Drop_EOP_1'] = df_full.CR_AMB_Drop_Build_1 * df_full.EOP_prev1
df_full['I_CR_AQB_EOP_1'] = df_full.I_CR_AQB_PrevQ1 / df_full.EOP_prev1
df_full['I_AQB_EOP_1'] = df_full.I_AQB_PrevQ1 / df_full.EOP_prev1
df_full['D_Prev1_EOP_1'] = df_full.D_prev1 / df_full.EOP_prev1
df_full['CR_AMB_Drop_1_D_prev1'] = df_full.CR_AMB_Drop_Build_1 / df_full.D_prev1
df_full['CR_AMB_Drop_2_D_prev1'] = df_full.CR_AMB_Drop_Build_2 / df_full.D_prev1
df_full['CR_AMB_Drop_1_vintage'] = df_full.CR_AMB_Drop_Build_1 / df_full.vintage


df_full = df_full.replace([np.inf, -np.inf], np.nan)
df_full = df_full.fillna(-999)
df_full = SS().fit_transform(df_full)

df_train = df_full[:300000]
df_test = df_full[300000:]

gc.collect()


lgb_train = lgb.Dataset(df_train, Y)

lgb_params = {
    'boosting_type': 'gbdt', 'objective': 'binary',
    'nthread': -1, 'silent': True,
    'num_leaves': 2**8 -1, 'learning_rate': 0.02, 'max_depth': 8,
    'max_bin': 2**8 -1, 'metric': 'auc',
    'colsample_bytree': 0.33, #0.4
Example #17
Y = encoder.transform(Y)

#new_Y = pd.DataFrame(encoded_Y)

'''print('\n X.head(10): after 1 hot')
print(X.head(10))
print('\n new_Y.head(10)')
print(new_Y.head(10))'''

# convert X and new_Y to numpy arrays
X = X.values
print('\n new Y.shape')
print(Y.shape)

# standardize X
scaler = SS().fit(X)
rescaledX = scaler.transform(X)

# split into train test sets using t_t_s
# because we combined the datasets to apply uniform
# one hot and label encoding, we set 'shuffle' parameter as false
# we also know that there should be 15060 rows in the test sets
test_set_size = test_dataset_nomissing.shape[0]
print('\n test_set_size...')
print(test_set_size)
X_train, X_test, Y_train, Y_test = t_t_s(rescaledX, Y, test_size=test_set_size, random_state=seed, shuffle=False)

# instantiate XGBC class using defaults
model = XGBC()

# evaluate the model against the training dataset using stratified kfold
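
# A minimal sketch of the stratified k-fold evaluation the comment above announces,
# assuming seed is defined earlier in the script (as used in the split above):
from sklearn.model_selection import StratifiedKFold, cross_val_score

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
cv_scores = cross_val_score(model, X_train, Y_train, cv=kfold)
print('CV accuracy: %.3f (+/- %.3f)' % (cv_scores.mean(), cv_scores.std()))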
Example #18
# Importing the dataset
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
Y = dataset.iloc[:, 4].values

#Splitting the data into training data and test data
from sklearn.model_selection import train_test_split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

#Feature scaling is important for the K-nearest neighbors algorithm
#Feature Scaling
from sklearn.preprocessing import StandardScaler as SS
sc = SS()
X_Train = sc.fit_transform(X_Train)
X_Test = sc.transform(X_Test)

#Creating the model and fitting it to data
from sklearn.neighbors import KNeighborsClassifier as KNC
classifier = KNC(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_Train, Y_Train)

#Predicting the result
y_pred = classifier.predict(X_Test)

#Creating the confusion matrix to evaluate the accuracy of the model
from sklearn.metrics import confusion_matrix  #confusion_matrix is a function, not a class
cm = confusion_matrix(Y_Test, y_pred)
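
# The accuracy the comment above refers to can be read off the confusion matrix
# directly (or computed with accuracy_score); a short follow-up check:
import numpy as np
from sklearn.metrics import accuracy_score

acc_from_cm = np.trace(cm) / cm.sum()               # correct predictions / all predictions
print(acc_from_cm, accuracy_score(Y_Test, y_pred))  # the two values agree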
Example #19
 def __init__(self):
     self._scaler = SS()
Example #20
def make_zip_pca(zcta_df):
    pca = PCA(whiten=False)
    ss = SS()
    X = pca.fit_transform(ss.fit_transform(zcta_df.fillna(zcta_df.median())))
    return X
Example #21
# In[8]:

datax.columns = (datax.iloc[0])
datax.drop(0, inplace=True, axis=1)
datax.drop('!series_matrix_table_end', axis=1, inplace=True)
datax.drop('Probe_ID', inplace=True)
datax.head()
# datax.to_csv('Alzh_Features_Wrangled.csv')
# datay.to_csv('Alzh_Labels_Wrangled.csv')

# In[ ]:

from sklearn.preprocessing import StandardScaler as SS
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV as GSCV

scaler = SS()  # keep the SS alias free for the class itself
clf = MLP()
#print(clf.get_params().keys())
pipe = Pipeline(steps=[('scaler', scaler), ('MLP', clf)])
params = {
    'MLP__hidden_layer_sizes': list(range(1000, 30000, 1000)),
    'MLP__activation': ['logistic', 'tanh', 'relu']
}

grid_search = GSCV(pipe, params, cv=8, scoring='accuracy')
grid_search.fit(datax, datay)
best_act = grid_search.best_params_.get('MLP__activation')
best_hl = grid_search.best_params_.get('MLP__hidden_layer_sizes')
print('Best Parameters:', grid_search.best_params_)
print("Accuracy:", grid_search.best_score_)
Example #22
def Standard_norm(arr):
    scalerSS = SS()
    scalerSS.fit(arr)
    arrSS = scalerSS.transform(arr)
    
    return arrSS
Example #23
targets = data.Survived; targets = targets.map({'Died': 0, 'Survived':1})
data.drop(columns=['Survived'],inplace=True)
data.Pclass = data.Pclass.map({'Poor':1, 'Medium':2, 'Upper':3})
data_model = pd.get_dummies(data=data, drop_first=True)
X_train, X_test, y_train, y_test = train_test_split(data_model, targets,
                test_size=0.15, random_state=seed, stratify=targets)
#Model Instantiation
logit = LR(random_state=seed,solver='lbfgs',max_iter=300)
rf = RFC(n_estimators=250, random_state=seed)
gb = GBC(n_estimators=250, random_state=seed)
xgb = xgb.XGBClassifier(objective='reg:logistic', n_estimators=250, seed=42)
svm = SVC(random_state=seed,probability=True)

models = [logit, rf,gb,xgb, svm]
labels = ['Died', 'Survived']
scaler = SS(); X_train = scaler.fit_transform(X_train); X_test = scaler.transform(X_test)  #FOR SVM

def fit_metrics(model, Xtr, ytr, Xts, yts, labels):
    print(model.__class__.__name__ + ' Results:')
    model.fit(Xtr, ytr)
    cm = m.confusion_matrix(yts, model.predict(Xts))
    plot_matrix(cm, classes=labels, normalize=True,
    title='Confusion Matrix for Titanic Test Data'); plt.show()
    plot_roc_auc(yts, model.predict_proba(Xts)[:, 1])
    plot_precision_recall(yts, model.predict_proba(Xts)[:, 1])
    classification_metrics(yts, model.predict(Xts))
    
#need to add Cross-Validation for more reliable results, here we get Bird's Eye view
for model in models:
    print('*'*25)
    fit_metrics(model, X_train, y_train, X_test, y_test, labels)
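
# A minimal sketch of the cross-validation the comment above calls for, scoring each
# model on the scaled training data (cross_val_score refits internal clones, so the
# models fitted above are left untouched):
from sklearn.model_selection import cross_val_score

for model in models:
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print('{}: {:.3f} +/- {:.3f}'.format(model.__class__.__name__,
                                         cv_scores.mean(), cv_scores.std()))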
Example #24
CLR = list(range(len(Y)))
for i in range(len(Y)):
    if (Y[i] == 0):
        CLR[i] = 'a'
    elif (Y[i] == 0.19):
        CLR[i] = 'b'
    elif (Y[i] == 2.5):
        CLR[i] = 'c'
    elif (Y[i] == 4.5):
        CLR[i] = 'd'

Y = CLR

#Standard Scaling the data.
from sklearn.preprocessing import StandardScaler as SS
ss = SS()
X = ss.fit_transform(X)

#train_test splitting for analysis of optimal number of parameters.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)
X_cv, X_test, Y_cv, Y_test = train_test_split(X_test,
                                              Y_test,
                                              test_size=0.5,
                                              random_state=0)

#Modelling
Example #25
with open(r'/Users/libingrui/Desktop/Work_file/Data/0610_path_data.txt',
          'r') as mesh_file:
    all_list = []
    all_data = []
    label = []
    for mesh_lines in mesh_file:
        mesh_lines = mesh_lines.strip().split('\t')
        all_list.append(mesh_lines)
with open(r'/Users/libingrui/Desktop/Work_file/Data/0610_label.txt',
          'r') as label_file:
    for label_line in label_file:
        label_line = label_line.strip()
        label.append(int(label_line))
# X=dt
# y=label
all_data = np.array(all_list, dtype=float)  # parsed strings -> numeric matrix
all_data = SS().fit_transform(all_data)
#label=np_utils.to_categorical(label)
X_train = all_data[:3000]
X_test = all_data[3001:3985]
y_train = label[:3000]
y_test = label[3001:3985]
#print(y_train)
# 算法参数
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',
    'num_class': 6,
    'gamma': 0.1,
    'max_depth': 6,
    'lambda': 2,
    'subsample': 0.7,
Example #26
    "matrix",
    type=str,
    help="the kmer frequency matrix file in csv format, with header and index")
parser.add_argument("label", type=str, help="the label of matrix")
args = parser.parse_args()

# load the data matrix and labels from text files
allmatrix = pd.read_csv(args.matrix, header=0, index_col=0,
                        low_memory=True).values
target = np.loadtxt(args.label)

print("allmatrix shape: {};label shape: {}".format(allmatrix.shape,
                                                   target.shape))

# standardize the data
allmatrix = SS().fit_transform(allmatrix)

# transform your data to tensor
x = tf.convert_to_tensor(allmatrix, dtype=tf.float32)
y = tf.convert_to_tensor(target, dtype=tf.int32)

# split train, validation, and test data with ratio 7:1:2
idx = tf.range(allmatrix.shape[0])
idx = tf.random.shuffle(idx)
x_train, y_train = tf.gather(x, idx[:int(0.7 * len(idx))]), tf.gather(
    y, idx[:int(0.7 * len(idx))])
x_val, y_val = tf.gather(
    x, idx[int(0.7 * len(idx)):int(0.8 * len(idx))]), tf.gather(
        y, idx[int(0.7 * len(idx)):int(0.8 * len(idx))])
x_test, y_test = tf.gather(x, idx[int(0.8 * len(idx)):]), tf.gather(
    y, idx[int(0.8 * len(idx)):])
Example #27
resultant = sorted(resultant, key=lambda x: x[1], reverse=True)
print("\n**************************\n")
resultant = [x for x, _ in resultant]
#resultant.sort(reverse = True)
print(resultant)
vectors = []
for d, _ in enumerate(chapters):
    vectors = vectors + [[]]
    for j in resultant:
        if j in chapters[d]:
            vectors[d] = vectors[d] + [chapters[d][j]]
        else:
            vectors[d] = vectors[d] + [0]
vectors = np.array(vectors)
print(vectors)
vectors = SS().fit_transform(vectors)
print(vectors)
pca = PCA(n_components=2)
pca.fit(vectors)
print(pca.components_)
scores = pca.transform(vectors)

labels = []
for x in range(comeco, fim):
    labels += [x]

plt.scatter(scores[:, 0], scores[:, 1])

for i, l in enumerate(labels):
    plt.annotate(l,
                 xy=(scores[i, 0], scores[i, 1]),
Example #28
X_train, X_test, y_train, y_test = SPLIT(wine.data,
                                         wine.target,
                                         test_size=0.25,
                                         stratify=wine.target,
                                         random_state=123)

# printing class distribution of test dataset
print(f'Classes: {np.unique(y_test)}')
print(f'Class distribution for test data: {np.bincount(y_test)}')

# MLP is sensitive to feature scaling, hence performing scaling
# Options: MinmaxScaler and Standardscaler
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
from sklearn.preprocessing import StandardScaler as SS
scaler = SS()
X_train_stdsc = scaler.fit_transform(X_train)
X_test_stdsc = scaler.transform(X_test)  # reuse the training statistics; do not refit on the test set

# Setting of hyperparameters of the network
from sklearn.neural_network import MLPClassifier as MLP
mlp = MLP(hidden_layer_sizes=(10, ), learning_rate_init=0.001, max_iter=5000)

# Calculating Training Time : more neurons, more time
from time import time
start = time()
# Train the model using the scaled training sets
mlp.fit(X_train_stdsc, y_train)
end = time()
print(f'Training Time: {(end-start)*1000:.3f}ms')

# Predict the response for test dataset
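
# A minimal continuation sketch for the prediction step the comment above announces:
from sklearn.metrics import accuracy_score

y_pred = mlp.predict(X_test_stdsc)
print(f'Test accuracy: {accuracy_score(y_test, y_pred):.3f}')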
Example #29
good_up = resample(good, replace=True, n_samples=550, random_state=seed)

#Dimension reduction vs w/out Dimension reduction
from helper_funcs import model_reduce, plot_roc_auc, plot_precision_recall, fit_metrics
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler as SS
from sklearn.model_selection import RandomizedSearchCV as RSCV

qualities = wines['quality']
wines.drop(columns=['quality'], inplace=True)
X_train, X_test, y_train, y_test = tts(wines,
                                       qualities,
                                       test_size=0.2,
                                       random_state=seed,
                                       stratify=qualities)
scaler = SS()

X_train.loc[:, X_train.columns] = scaler.fit_transform(
    X_train.loc[:, X_train.columns])
X_test.loc[:, X_test.columns] = scaler.transform(X_test.loc[:, X_test.columns])

from sklearn.ensemble import RandomForestClassifier as RFC, GradientBoostingClassifier as GBC
from sklearn.linear_model import LogisticRegression as LR
import xgboost as xgb
from sklearn.metrics import r2_score

logit = LR(random_state=seed, solver='lbfgs', max_iter=300, multi_class='auto')
rf = RFC(n_estimators=250, random_state=seed)
gb = GBC(n_estimators=250, random_state=seed)
xgb = xgb.XGBClassifier(objective='reg:logistic', n_estimators=250, seed=42)
models = [logit, rf, gb, xgb]
Example #30
b=pd.DataFrame(pr.normalize(y,norm='l1',axis=0))
b.iloc[:,0].apply(lambda x:abs(x)).sum()

#l2-norm is unit euclidean distance. Consider a vector in a space. To convert it into unit distance, we use l2-norm i.e. divide it by sqrt(a^2+b^2)
#l1-norm is unit Manhattan distance. So in this case, the vector is divided by abs(a)+abs(b)
#Standardization is centering with unit variance
#Axis=0 implies a feature(Column) while Axis =1 implies a sample(Row)  
#http://scikit-learn.org/stable/modules/preprocessing.html

c=pd.DataFrame(pr.scale(y,axis=0))
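
# A quick numeric check of the comments above: an l1-normalised column has absolute
# values summing to 1, an l2-normalised column has unit Euclidean length, and
# pr.scale gives zero mean and unit variance per column. This assumes the pd, np and
# pr aliases already used in this snippet; the toy column is made up for illustration:
v = pd.DataFrame({'v': [3.0, -4.0, 5.0]})
l1 = pr.normalize(v, norm='l1', axis=0)
l2 = pr.normalize(v, norm='l2', axis=0)
z = pr.scale(v, axis=0)
print(round(float(np.abs(l1).sum()), 12), round(float(np.sqrt((l2 ** 2).sum())), 12))  # 1.0 1.0
print(round(float(z.mean()), 12), round(float(z.std()), 12))                           # 0.0 1.0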

#How to standardise training and test data
x=pd.DataFrame({'a':[randint(-10,10) for i in range(20)]})
y=pd.DataFrame({'b':[randint(-10,10) for i in range(20)]})

scalar=SS().fit(x)  # fit the scaler on the training data only
x_scalar=scalar.transform(x)
y_scalar=scalar.transform(y.values)  # reuse the training statistics on the test data (pass an array, since the column names differ)
scalar.mean_
scalar.var_

state=list(np.repeat('PA',5))
total=[randint(10000,500000) for i in range(5)]
Obama=[round(randint(0,300)/3.0,3) for i in range(5)]
Romney=[round(randint(0,300)/3.0,3) for i in range(5)]
winner=[ randint(0,1) for i in range(5) ]

election=pd.DataFrame({'state':state,'total':total,'Obama':Obama,'Romney':Romney,'winner':winner})
election['winner']= election['winner'].apply(lambda x: 'Romney' if x==1 else 'Obama')
election['country']=['Adams','Allegheny','Armstrong','Beaver','Bedford']
election.set_index('country',inplace=True)