Code example #1
File: baselines.py Project: IlyasAzeem/PRs_project
def feature_selection_LR():
    # Note: despite the _LR suffix, this baseline wraps a random forest in RFE.
    # X_train, X_train_scaled, y_train, X_test_scaled, y_test and predictors
    # are module-level globals.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import RFE
    from sklearn import metrics

    rfe_selector = RFE(estimator=RandomForestClassifier(),
                       n_features_to_select=30,
                       step=5,
                       verbose=5)
    rfe_selector.fit(X_train_scaled, y_train)

    y_pred = rfe_selector.predict(X_test_scaled)
    y_predprob = rfe_selector.predict_proba(X_test_scaled)[:, 1]

    rfe_support = rfe_selector.get_support()
    rfe_feature = X_train[predictors].loc[:, rfe_support].columns.tolist()
    print(str(len(rfe_feature)), 'selected features')
    print('RFE features')
    print(rfe_feature)
    # Print model report:
    print("\nModel Report")
    #print("Train Accuracy : %.4g" % metrics.accuracy_score(y_train, y_pred_train))
    print("Test Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred))
    #print('Train error: {:.3f}'.format(1 - metrics.accuracy_score(y_train, y_pred_train)))
    print('Test error: {:.3f}'.format(1 -
                                      metrics.accuracy_score(y_test, y_pred)))
    print("AUC Score : %f" % metrics.roc_auc_score(y_test, y_predprob))
    print("Recall : %f" % metrics.recall_score(y_test, y_pred))
    print("Precision : %f" % metrics.precision_score(y_test, y_pred))
    print("F-measure : %f" % metrics.f1_score(y_test, y_pred))
    c_matrix = metrics.confusion_matrix(y_test, y_pred)
    print('========Confusion Matrix==========')
    print("          Rejected    Accepted")
    print('Rejected     {}      {}'.format(c_matrix[0][0], c_matrix[0][1]))
    print('Accepted     {}      {}'.format(c_matrix[1][0], c_matrix[1][1]))
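
Note: several examples below construct RFE with a positional second argument (e.g. RFE(model, 30)). Recent scikit-learn releases require keyword arguments here; a minimal sketch of the current constructor on synthetic data:

# Minimal sketch (synthetic data, assumes a recent scikit-learn) of the
# keyword-only RFE constructor; older snippets pass these positionally.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

X, y = make_classification(n_samples=200, n_features=25, random_state=0)
rfe = RFE(estimator=RandomForestClassifier(random_state=0),
          n_features_to_select=10,
          step=5).fit(X, y)
print(rfe.get_support(indices=True))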
Code example #2
def perform_rfe(model, train, test, filename, to_remove=None):
    # build_results_csv, send_submission and TARGET are assumed defined elsewhere.
    from math import floor
    from time import sleep

    from sklearn.feature_selection import RFE

    if to_remove is None:
        to_remove = floor(0.3 * len(train.columns))

    X = train.drop(TARGET, axis=1)
    y = train[TARGET]

    model.fit(X, y)
    preds = model.predict_proba(test)[:, 1]
    build_results_csv(filename,
                      X.columns,
                      send_submission("doesnt_matter.csv", preds),
                      create_file=True)
    sleep(3)

    for i in range(to_remove):
        rfe = RFE(model, n_features_to_select=len(X.columns) - 1).fit(X, y)

        preds = rfe.predict_proba(test)[:, 1]

        X = X.iloc[:, rfe.get_support()]
        test = test.iloc[:, rfe.get_support()]

        results = build_results_csv(
            filename, X.columns, send_submission("doesnt_matter.csv", preds))
        sleep(3)

    return results
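
Aside: leaving the per-step submission scoring out, the one-feature-per-iteration loop above is what a single RFE with step=1 performs internally. A compact sketch on synthetic data:

# Compact sketch (synthetic data): one RFE fit with step=1 reproduces the
# manual one-feature-at-a-time elimination above (minus the per-step scoring).
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=300, n_features=30, random_state=0)
n_to_remove = 9  # mirrors floor(0.3 * n_features)

rfe = RFE(LogisticRegression(max_iter=1000),
          n_features_to_select=X.shape[1] - n_to_remove,
          step=1).fit(X, y)
print(rfe.n_features_)  # 21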
Code example #3
def train_recursive_feature_elimination(x_train,
                                        y_train,
                                        x_test,
                                        y_test,
                                        feature_num=10):
    print("-------------RFE Model-------------")
    class_weight = {0: 1, 1: 1}
    model = LogisticRegression(solver='sag', class_weight=class_weight)
    # model = RandomForestClassifier(n_estimators=100)
    # model = SVC(gamma='scale', probability=True, kernel='poly')
    rfe = RFE(model, n_features_to_select=feature_num)
    # RFE Fit
    rfe.fit(x_train, y_train)
    # RFE Predict
    y_predicted = rfe.predict(x_test)
    y_prob = rfe.predict_proba(x_test)
    print(rfe.support_)
    return y_predicted, y_prob
Code example #4
from sklearn.feature_selection import RFE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
import pickle
import pandas as pd

train_data = pickle.load(open('xwd/train_feat.pkl', 'rb'))
train_label = pickle.load(open('xwd/train_label.pkl', 'rb'))
'''
train_data = train_data[:100]
train_label =  train_label[:100]
train_data.fillna(0,inplace=True)
train_label.fillna(0,inplace=True)
'''
valid_data = pickle.load(open('xwd/valid_feat.pkl', 'rb'))
valid_label = pickle.load(open('xwd/valid_label.pkl', 'rb'))
test = pickle.load(open('xwd/test_feat.pkl', 'rb'))

# Note: this LightGBM classifier is defined but never used below;
# the RFE wraps LogisticRegression.
gbm = lgb.LGBMClassifier(
    objective='binary',
    num_leaves=200,  #600W
    learning_rate=0.05,
    min_child_samples=100,
    n_estimators=1)

model = RFE(estimator=LogisticRegression(),
            n_features_to_select=70).fit(train_data, train_label)

proba_test = model.predict_proba(test)
Code example #5
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_curve, accuracy_score, log_loss, auc

# final_train is assumed to be loaded earlier in the script
cols = [
    "Age", "Fare", "TravelAlone", "Pclass_1", "Pclass_2", "Embarked_C",
    "Embarked_S", "Sex_male", "IsMinor"
]
X = final_train[cols]
Y = final_train["Survived"]
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

logistic = LogisticRegression()
rfe = RFE(logistic, n_features_to_select=8)
rfe.fit(x_train, y_train)

# summarize the selection of the attributes
print('Selected features: %s' % list(X.columns[rfe.support_]))

print("=========================")
y_pred = rfe.predict(x_test)
print("========================")
y_pred_proba = rfe.predict_proba(x_test)[:, 1]
[fpr, tpr, thr] = roc_curve(y_test, y_pred_proba)
print('Train/Test split results:')
print(rfe.__class__.__name__ +
      " accuracy is %2.3f" % accuracy_score(y_test, y_pred))
print(rfe.__class__.__name__ +
      " log_loss is %2.3f" % log_loss(y_test, y_pred_proba))
print(rfe.__class__.__name__ + " auc is %2.3f" % auc(fpr, tpr))
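
Rather than fixing the feature count at 8 up front, RFECV can choose it by cross-validation. A small sketch reusing this example's names (x_train, y_train, X):

# Small sketch: RFECV picks the number of features by cross-validated score
# instead of a hand-chosen count like the 8 above.
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

rfecv = RFECV(LogisticRegression(max_iter=1000), step=1, cv=5, scoring='accuracy')
rfecv.fit(x_train, y_train)
print('Optimal number of features:', rfecv.n_features_)
print('Selected features: %s' % list(X.columns[rfecv.support_]))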
Code example #6
# import modules
import numpy as np
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_selection import RFE

# read data
data = pd.read_csv('data_cleaned.csv')

# make data ready
data_x = data.drop(['Target', 'Client_ID'], axis = 1)
data_y = data.Target

# set train and test
train_x = data_x[data_x['X2006'] == 0].values
train_y = data_y[data_x['X2006'] == 0].values
test_x = data_x[data_x['X2006'] == 1].values
test_y = data_y[data_x['X2006'] == 1].values

# set model (alpha=250 is an unusually strong smoothing prior; the keyword
# avoids the positional-argument form that newer scikit-learn rejects)
nb = BernoulliNB(alpha=250)

# select features
# Note: RFE ranks features via coef_, which older scikit-learn exposed on
# naive Bayes models; newer versions need an explicit importance_getter
# (see the sketch after this example).
rfe = RFE(nb, n_features_to_select=39)
rfe.fit(train_x, train_y)

# make predictions
pred = rfe.predict_proba(data_x)

# create submission
pd.DataFrame({'Client_ID':data.Client_ID, 'Cross_Sell':pred[:, 1]}).to_csv('sub_final.csv', index=False)
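
As noted above, RFE needs per-feature importances, and naive Bayes models no longer expose coef_. On a recent scikit-learn the example above therefore needs an importance_getter. A minimal sketch with one reasonable importance definition (an assumption, not the original author's choice):

# Minimal sketch (synthetic data): RFE with an importance_getter callable for
# an estimator without coef_. Ranking by the gap between the two classes'
# per-feature log probabilities is an assumed, illustrative choice.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.naive_bayes import BernoulliNB

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
X = (X > 0).astype(int)  # BernoulliNB expects binary features

def nb_importance(estimator):
    return np.abs(estimator.feature_log_prob_[1] - estimator.feature_log_prob_[0])

rfe = RFE(BernoulliNB(), n_features_to_select=5, importance_getter=nb_importance)
rfe.fit(X, y)
print(rfe.get_support(indices=True))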
Code example #7
    return target

nullmod = nullmod(df, target, other)
y210 = getTargetDf(df, target, other)
ypred = nullmod.predict(y210)
y210[target] = ypred

# Recursive feature elimination
from sklearn.feature_selection import RFE

estimator = xgb.XGBClassifier(**params)
# step=0.1 removes 10% of the remaining features at each iteration
selector = RFE(estimator, n_features_to_select=200, step=0.1)

selector = selector.fit(xtrain, ytrain)

p = selector.predict_proba(xvalid)

roc_auc_score(yvalid, p[:, 1])

#-------------------------------------------
# KNN
KNeighborsClassifier(n_neighbors=5,
                     weights='uniform',
                     algorithm='auto',
                     leaf_size=30,
                     p=2,  # power; Minkowski with p=2 is the Euclidean distance
                     metric='minkowski')
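
The step=0.1 used in this example and the integer step values used elsewhere mean different things. A small sketch contrasting the two:

# Small sketch (synthetic data): an int step >= 1 removes that many features
# per RFE iteration; a float in (0.0, 1.0) removes that fraction of the
# remaining features per iteration. Both stop at n_features_to_select.
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=300, n_features=50, random_state=0)

by_count = RFE(LogisticRegression(max_iter=1000), n_features_to_select=10, step=5)
by_fraction = RFE(LogisticRegression(max_iter=1000), n_features_to_select=10, step=0.1)
print(by_count.fit(X, y).n_features_, by_fraction.fit(X, y).n_features_)  # 10 10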
Code example #8
y_pred_prob_train = model.predict_proba(X_train)[:, 1]
logreg_roc_auc = roc_auc_score(y_train, y_pred_prob_train)

from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_train, y_pred_prob_train)
len(thresholds)
thresholds[thresholds > 0.8][-1]
plt.plot(fpr, tpr, label="area=%0.4f" % logreg_roc_auc)
plt.legend(loc="0")
from sklearn.feature_selection import RFE
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=30)
rfe.fit(X_train, y_train)
rank = list(rfe.ranking_)
col = list(X_train.columns)
feature = [c for c, r in zip(col, rank) if r == 1]
feature
x_train_new = X_train[feature].copy()
x_test_new = X_test[feature].copy()
rfe = rfe.fit(x_train_new, y_train)
y_pred = rfe.predict(x_test_new)
y_pred_prob = rfe.predict_proba(x_test_new)[:, 1]
score = roc_auc_score(y_test, y_pred_prob)
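
The rank == 1 filtering above is what RFE already exposes: support_ is the boolean mask of kept features, and transform applies it. A minimal sketch on synthetic data:

# Minimal sketch (synthetic data): support_ / transform replace the manual
# ranking_ == 1 column loop used above.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, n_features=12, random_state=0)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(12)])

rfe = RFE(LogisticRegression(max_iter=1000), n_features_to_select=4).fit(X, y)
selected = X.loc[:, rfe.support_]  # the columns whose ranking_ == 1
assert (rfe.transform(X) == selected.to_numpy()).all()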
Code example #9
print(rfe.support_)
print(rfe.ranking_)
col = list(X_train.columns)
rank = list(rfe.ranking_)
new_list = [c for c, r in zip(col, rank) if r == 1]

x_new = X_train[new_list].copy()

rfe = rfe.fit(x_new, y_train)
y_pred_new = rfe.predict(x_new)
new_accuracy = rfe.score(x_new, y_train)
y_new_pred_prob = rfe.predict_proba(x_new)
new_roc_auc_score = roc_auc_score(y_train, y_new_pred_prob[:, 1])

#New model for test data
model = LogisticRegression(class_weight="balanced", penalty="l2")
model.fit(X_train, y_train)
y_pred_test = model.predict(X_test)
y_pred_test = y_pred_test.astype(np.int16)
df3 = df2["PassengerId"]
df3 = pd.DataFrame(df3)

df3.set_index("PassengerId", inplace=True)
df3["Survived"] = y_pred_test
df3.to_csv(r'C:\Users\Admin\Desktop\ml practice\kaggle_titanic.csv')
print(matchups['home_team'])

# Remove the 'week' 'home_team' and 'away_team' columns from matchups as they are not used in the algorithm
matchups.drop(['week', 'home_team', 'away_team'], axis=1, inplace=True)
'''You'll likely want to use a pickled model from a previous regression predicting 2015 results'''

for feat in range(1, len(matchups.columns)):
    for c in C_vec:
        # Create the classifier and check the score
        # clf = LogisticRegression()
        clf = linear_model.LogisticRegression(C=c, random_state=42)
        # Select `feat` features (the original built RFE(clf) and left the
        # loop variable unused, which always kept the default half)
        selector = RFE(clf, n_features_to_select=feat)
        selector = selector.fit(X_train, y_train)

        # Calculate probabilities using the predict_proba method for logistic regression
        probabilities = selector.predict_proba(scaler.transform(matchups))

        # Vectorize the spread_conversion function and apply it to the probabilities vector
        vfunc = np.vectorize(spread_conversion)
        predicted_spreads = np.apply_along_axis(vfunc, 0, probabilities[:, 0])

        # If the actual line for the home team is lower than the predicted line,
        # take the away team; otherwise take the home team
        bet_vector = np.array(np.where(predicted_spreads > spreads, 0, 1))

        # Create the actual result vector where a tie counts as a loss for the
        # home team (iloc replaces the long-removed DataFrame.ix)
        game_result = np.array(
            np.where(
                home_score.iloc[:, 0] + predicted_spreads[:] >
                away_score.iloc[:, 0], 1, 0))

        # Check where bet_vector equals the actual game result with the spread included
        result = np.array(np.where(bet_vector == game_result, 1, 0))

        prob_result = float(np.sum(result)) / len(result)
from var_clus import VarClus

demo = VarClus(max_eigenvalue=1.35, max_tries=5)
demo.decompose(dfc)
demo.print_cluster_structure()

#%%stepwise
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(class_weight='balanced',
                            random_state=11,
                            solver='lbfgs')
rfe = RFE(logreg, n_features_to_select=44)
rfe.fit(dfc, y)
rfe.predict_proba(dfc)

selectX = rfe.transform(dfc)
## find those selected variables (dfc.columns[rfe.support_] is the direct route)
for i in range(44):
    temp = selectX[:, i]
    for name in dfc.columns:
        temp1 = dfc[name]
        if (temp1 == temp).all():
            print(name)

prob = rfe.predict_proba(dfc)
odds = prob[:, 0] / prob[:, 1]

#%%
import matplotlib.pyplot as plt
Code example #13
def Model(Label, Parameters=None):
    global filepath, filename, fixed_seed_num, sequence_window, number_class, hidden_units, input_dim, learning_rate, epoch, is_multi_scale, training_level, cross_cv, is_add_noise, noise_ratio
    # The original mutable default ([]) indexed with string keys always raised
    # and was swallowed by a bare except; a dict default makes the intent clear.
    if Parameters is None:
        Parameters = {}
    try:
        filepath = Parameters["filepath"]
        filename = Parameters["filename"]
        sequence_window = Parameters["sequence_window"]
        number_class = Parameters["number_class"]
        hidden_units = Parameters["hidden_units"]
        input_dim = Parameters["input_dim"]
        learning_rate = Parameters["learning_rate"]
        epoch = Parameters["epoch"]
        is_multi_scale = Parameters["is_multi_scale"]
        training_level = Parameters["training_level"]
        cross_cv = Parameters["cross_cv"]
        fixed_seed_num = Parameters["fixed_seed_num"]
        is_add_noise = Parameters["is_add_noise"]
        noise_ratio = Parameters["noise_ratio"]
    except KeyError:
        # fall back to the module-level globals when a key is missing
        pass


    result_list_dict = defaultdict(list)
    evaluation_list = ["ACCURACY","F1_SCORE","AUC","G_MEAN"]
    for each in evaluation_list:
        result_list_dict[each] = []
    np.random.seed(fixed_seed_num)  # for reproducibility
    #num_selected_features = 30
    #num_selected_features = 25  # AS leak tab=0
    #num_selected_features = 32  # Slammer tab=0
    num_selected_features = 33  # Nimda tab=1
    for tab_cv in range(cross_cv):

        if tab_cv != 0:
            continue
        epoch_training_loss_list = []
        epoch_val_loss_list = []
        #print(is_multi_scale)

        #using MLP to train
        if Label == "SVM":
            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename,
                                                                                            sequence_window, tab_cv,
                                                                                            cross_cv,
                                                                                            Multi_Scale=is_multi_scale,
                                                                                            Wave_Let_Scale=training_level,
                                                                                            Normalize=0)

            print(Label+" is running..............................................")
            y_train = y_train0
            clf = svm.SVC(kernel="rbf", gamma=0.00001, C=100000,probability=True)
            print(x_train.shape)
            clf.fit(x_train, y_train)
            result = clf.predict_proba(x_test)
            #return Evaluation.Evaluation(y_test, result)
            #results = Evaluation.Evaluation(y_test, result)

        elif Label == "SVMF":
            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename,
                                                                                            sequence_window, tab_cv,
                                                                                            cross_cv,
                                                                                            Multi_Scale=is_multi_scale,
                                                                                            Wave_Let_Scale=training_level,
                                                                                            Normalize=5)

            print(Label+" is running..............................................")
            clf = svm.SVC(kernel="rbf", gamma=0.00001, C=100000,probability=True)
            print(x_train.shape)
            #x_train_new = SelectKBest(f_classif, k=num_selected_features).fit_transform(x_train, y_train0)
            #x_test_new = SelectKBest(f_classif, k=num_selected_features).fit_transform(x_test, y_test0)

            clf.fit(x_train, y_train0)
            result = clf.predict_proba(x_test)
            #return Evaluation.Evaluation(y_test, result)
            #results = Evaluation.Evaluation(y_test, result)
        elif Label == "SVMW":
            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename,
                                                                                            sequence_window, tab_cv,
                                                                                            cross_cv,
                                                                                            Multi_Scale=is_multi_scale,
                                                                                            Wave_Let_Scale=training_level,
                                                                                            Normalize=6)

            print(Label + " is running..............................................")
            #SVR(kernel="linear") = svm.SVC(kernel="rbf", gamma=0.00001, C=100000, probability=True)
            estimator = svm.SVC(kernel="linear",probability=True)
            selector = RFE(estimator, n_features_to_select=num_selected_features, step=1)
            selector = selector.fit(x_train, y_train0)

            result = selector.predict_proba(x_test)
            # return Evaluation.Evaluation(y_test, result)
            # results = Evaluation.Evaluation(y_test, result)
        elif Label == "NBF":

            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename,
                                                                                            sequence_window, tab_cv,
                                                                                            cross_cv,
                                                                                            Multi_Scale=is_multi_scale,
                                                                                            Wave_Let_Scale=training_level,
                                                                                            Normalize=10)

            print(Label + " is running..............................................")
            clf = MultinomialNB()
            clf.fit(x_train, y_train0)
            result = clf.predict_proba(x_test)


        elif Label == "NBW":
            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename,
                                                                                            sequence_window, tab_cv,
                                                                                            cross_cv,
                                                                                            Multi_Scale=is_multi_scale,
                                                                                            Wave_Let_Scale=training_level,
                                                                                            Normalize=11)

            print(Label + " is running..............................................")
            #SVR(kernel="linear") = svm.SVC(kernel="rbf", gamma=0.00001, C=100000, probability=True)
            estimator = MultinomialNB()
            selector = RFE(estimator, n_features_to_select=num_selected_features, step=1)
            selector = selector.fit(x_train, y_train0)

            result = selector.predict_proba(x_test)
            # return Evaluation.Evaluation(y_test, result)
            # results = Evaluation.Evaluation(y_test, result)
        elif Label == "NB":
            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename,
                                                                                            sequence_window, tab_cv,
                                                                                            cross_cv,
                                                                                            Multi_Scale=is_multi_scale,
                                                                                            Wave_Let_Scale=training_level,
                                                                                            Normalize=1)

            print(Label+" is running..............................................")
            y_train = y_train0
            clf = MultinomialNB()
            clf.fit(x_train, y_train)
            result = clf.predict_proba(x_test)

            #return Evaluation.Evaluation(y_test, result)
            #results = Evaluation.Evaluation(y_test, result)

        elif Label == "DT":
            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename,
                                                                                            sequence_window, tab_cv,
                                                                                            cross_cv,
                                                                                            Multi_Scale=is_multi_scale,
                                                                                            Wave_Let_Scale=training_level,
                                                                                            Normalize=2)

            print(Label+" is running.............................................."+str(x_train.shape))
            y_train = y_train0
            clf = tree.DecisionTreeClassifier()
            clf.fit(x_train, y_train)
            result = clf.predict_proba(x_test)

            #return Evaluation.Evaluation(y_test, result)
            #results = Evaluation.Evaluation(y_test, result)
        elif Label == "Ada.Boost":
            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename,
                                                                                            sequence_window, tab_cv,
                                                                                            cross_cv,
                                                                                            Multi_Scale=is_multi_scale,
                                                                                            Wave_Let_Scale=training_level,
                                                                                            Normalize=0)

            print(Label+" is running.............................................."+str(x_train.shape))
            y_train = y_train0
            #clf = AdaBoostClassifier(n_estimators=10) #Nimda tab=1
            clf = AdaBoostClassifier(n_estimators=10)

            clf.fit(x_train, y_train)
            result = clf.predict_proba(x_test)

            #return Evaluation.Evaluation(y_test, result)
            #results = Evaluation.Evaluation(y_test, result)
        elif Label == "MLP":
            x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData_WithoutS(is_add_noise,noise_ratio,filepath, filename,
                                                                                            sequence_window, tab_cv,
                                                                                            cross_cv,
                                                                                            Multi_Scale=is_multi_scale,
                                                                                            Wave_Let_Scale=training_level,
                                                                                            Normalize=0)

            print(Label+" is running..............................................")
            batch_size = len(y_train)
            start = time.perf_counter()  # time.clock() was removed in Python 3.8
            model = Sequential()
            model.add(Dense(hidden_units, activation="relu", input_dim=33))

            model.add(Dense(number_class))  # Dense(output_dim=...) is legacy Keras 1 syntax
            model.add(Activation("sigmoid"))
            # model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
            model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

            model.fit(x_train, y_train, batch_size=batch_size, epochs=epoch)  # nb_epoch was renamed to epochs
            #result = model.predict(X_Testing, batch_size=batch_size)
            result = model.predict(x_test)
            end = time.perf_counter()
            print("The Time For MLP is " + str(end - start))

            #return Evaluation.Evaluation(y_test, result)
            #results = Evaluation.Evaluation(y_test, result)

        #elif Label == "SVM-S":
            #x_train, y_train, y_train0, x_test, y_test, y_test0 = LoadData.GetData('Attention',filepath,filename,sequence_window,tab_cv,cross_cv)
            #x_train,y_train = Manipulation(x_train,y_train0,sequence_window)
            #x_test, y_test = Manipulation(x_test, y_test0, sequence_window)
            #clf = svm.SVC(kernel="rbf")
            #clf.fit(x_train, y_train)
            #result = clf.predict(x_test)
            #results = Evaluation.Evaluation_WithoutS(y_test, result)
        elif Label == "RNN":
            print(Label+" is running..............................................")
            start = time.perf_counter()
            x_train_multi_list, x_train, y_train, x_testing_multi_list, x_test, y_test = LoadData.GetData(is_add_noise,noise_ratio,'Attention',
                                                                                                          filepath,
                                                                                                          filename,
                                                                                                          sequence_window,
                                                                                                          tab_cv,
                                                                                                          cross_cv,
                                                                                                          Multi_Scale=is_multi_scale,
                                                                                                          Wave_Let_Scale=training_level)

            batch_size = len(y_train)
            rnn_object = SimpleRNN(hidden_units, input_length=len(x_train[0]), input_dim=input_dim)
            model = Sequential()

            model.add(rnn_object)  # X.shape is (samples, timesteps, dimension)
            #model.add(Dense(30, activation="relu"))
            #model.add(Dropout(0.2))
            model.add(Dense(30, activation="sigmoid"))
            #model.add(Dropout(0.3))
            # model.add(Dense(5,activation="tanh"))

            model.add(Dense(number_class))
            model.add(Activation("sigmoid"))
            # model.add(Activation("softmax"))

            # model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
            model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

            model.fit(x_train, y_train, batch_size=batch_size, epochs=epoch)

            #result = model.predict(X_Testing, batch_size=batch_size)

            result = model.predict(x_test)

            #return Evaluation.Evaluation(y_test, result)
            #results = Evaluation.Evaluation(y_test, result)

            end = time.perf_counter()
            print("The Time For RNN is " + str(end - start))

            # print(result)
        elif Label == "LSTM":
            print(Label+" is running..............................................")
            start = time.perf_counter()
            x_train_multi_list, x_train, y_train, x_testing_multi_list, x_test, y_test = LoadData.GetData(is_add_noise,noise_ratio,'Attention',filepath,
                                                                                                          filename,
                                                                                                          sequence_window,
                                                                                                          tab_cv,
                                                                                                          cross_cv,
                                                                                                          Multi_Scale=is_multi_scale,
                                                                                                          Wave_Let_Scale=training_level)

            batch_size = len(y_train)

            lstm_object = LSTM(hidden_units, input_length=len(x_train[0]), input_dim=input_dim)
            model = Sequential()

            model.add(lstm_object)  # X.shape is (samples, timesteps, dimension)
            # model.add(LSTM(lstm_size,return_sequences=True,input_shape=(len(X_Training[0]),33)))
            # model.add(LSTM(100,return_sequences=True))
            # model.add(Dense(10, activation="tanh"))
            # model.add(Dense(5,activation="tanh"))
            model.add(Dense(30, activation="relu"))
            #model.add(Dropout(0.2))

            #model.add(Dense(30, activation="sigmoid"))
            #model.add(Dropout(0.3))
            # model.add(Dense(5,activation="tanh"))

            model.add(Dense(number_class))
            model.add(Activation("sigmoid"))
            #model.add(Activation("softmax"))

            # model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
            model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

            model.fit(x_train, y_train, batch_size=batch_size, epochs=epoch)

            #result = model.predict(X_Testing, batch_size=batch_size)

            result = model.predict(x_test)

            end = time.perf_counter()
            print("The Time For LSTM is " + str(end - start))

        if len(Parameters) > 0:
            return Evaluation.Evaluation(y_test, result)#Plotting AUC

        results = Evaluation.Evaluation(y_test, result)# Computing ACCURACY,F1-score,..,etc
        print(results)
        y_test2 = np.array(Evaluation.ReverseEncoder(y_test))
        result2 = np.array(Evaluation.ReverseEncoder(result))
        print("---------------------------1111111111111111")
        with open("StatFalseAlarm_"+filename+"_True.txt","w") as fout:
            for tab in range(len(y_test2)):
                fout.write(str(int(y_test2[tab]))+'\n')
        with open("StatFalseAlarm_"+filename+"_"+Label+"_"+"_Predict.txt","w") as fout:
            for tab in range(len(result2)):
                fout.write(str(int(result2[tab]))+'\n')
        print(result2.shape)
        print("---------------------------22222222222222222")

        for each_eval, each_result in results.items():
            result_list_dict[each_eval].append(each_result)

    for eachk, eachv in result_list_dict.items():
        result_list_dict[eachk] = np.average(eachv)
    #print(result_list_dict)
    if not is_add_noise:
        with open(os.path.join(os.getcwd(), "Comparison_Log_" + filename + ".txt"), "a") as fout:
            outfileline = Label+":__"
            fout.write(outfileline)
            for eachk,eachv in result_list_dict.items():
                fout.write(eachk+": "+str(round(eachv,3))+",\t")
            fout.write('\n')
    else:
        with open(os.path.join(os.getcwd(), "Comparison_Log_Adding_Noise_" + filename + ".txt"), "a") as fout:
            outfileline = Label+":__"+"Noise_Ratio_:"+str(noise_ratio)
            fout.write(outfileline)
            for eachk,eachv in result_list_dict.items():
                fout.write(eachk+": "+str(round(eachv,3))+",\t")
            fout.write('\n')

    return results
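
The SVMW and NBW branches above share one pattern: wrap the base classifier in RFE, fit the selector, and call predict_proba on it. A distilled sketch on synthetic data:

# Distilled sketch (synthetic data) of the SVMW branch: RFE around a
# linear-kernel SVC with probability=True, then predict_proba on the selector.
from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE

X_tr, y_tr = make_classification(n_samples=200, n_features=40, random_state=0)
X_te, _ = make_classification(n_samples=50, n_features=40, random_state=1)

estimator = svm.SVC(kernel="linear", probability=True)
selector = RFE(estimator, n_features_to_select=33, step=1).fit(X_tr, y_tr)
result = selector.predict_proba(X_te)  # shape (n_samples, 2)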
Code example #14
File: donot.py Project: seculayer/AI_Competitions_2
    'penalty': ['l2', 'l1'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

grid = GridSearchCV(estimator=log_clf,
                    param_grid=param_grid,
                    scoring='roc_auc',
                    verbose=1,
                    n_jobs=-1)

grid.fit(X, y)

print("Best Score:" + str(grid.best_score_))
print("Best Parameters: " + str(grid.best_params_))

best_parameters = grid.best_params_

#Recursive feature elimination
log_clf = LogisticRegression(**best_parameters)
log_clf.fit(X, y)

selector = RFE(log_clf, n_features_to_select=25, step=1)
selector.fit(X, y)
scores_table(selector, 'selector_clf')

#submission
submission = pd.read_csv('../input/dont-overfit-ii/sample_submission.csv')
X_test = test
submission['target'] = selector.predict_proba(X_test)[:, 1]  # positive-class column; predict_proba returns one column per class
submission.to_csv('submission.csv', index=False)
Code example #15
pca = PCA(n_components=2)
pc = pca.fit_transform(x)
pc_t = pca.transform(t)  # reuse the PCA fitted on x; refitting on t would yield misaligned components
pcdf = pd.DataFrame(data=pc, columns=['pc1', 'pc2'])
pcdf_t = pd.DataFrame(data=pc_t, columns=['pc1', 'pc2'])
print(pcdf.shape)
print(pcdf_t.shape)
finalDf = pd.concat([pcdf, df[['target']]], axis=1)
print(finalDf.head())
print('Explained variance ratio of the principal components: ', pca.explained_variance_ratio_)

# Apply RFE with Random Forest
# StandardScaler for the RF
ss = StandardScaler()
x_train_ss = ss.fit_transform(x_train)
x_test_ss = ss.transform(x_test)  # transform only: fit the scaler on training data alone
test_ss = ss.transform(test)
print(train.shape, test.shape)

#sub model 1 : RandomForest
forest = RandomForestClassifier(n_estimators=500, random_state=7)
select = RFE(forest, n_features_to_select=77)
x_train_rf = select.fit_transform(x_train_ss, y_train)
x_test_rf = select.transform(x_test_ss)
test_rf = select.transform(test_ss)
print(x_train_rf.shape)
# refitting on the already-reduced matrix keeps all 77 features; it only yields a score
score = select.fit(x_train_rf, y_train).score(x_test_rf, y_test)
print('acc after RFE: {:.3f}'.format(score))

rf_y_pred = select.predict_proba(test_rf)
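
A safer layout for the scaler + RFE pair above is a Pipeline, which fits the scaler on training data only and applies the same transform order to new data. A minimal sketch reusing this example's names (x_train, y_train, test):

# Minimal sketch: bundle StandardScaler and RFE so test data is always scaled
# with training statistics; mirrors the forest/77-feature setup above.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(
    StandardScaler(),
    RFE(RandomForestClassifier(n_estimators=500, random_state=7),
        n_features_to_select=77),
)
pipe.fit(x_train, y_train)            # scaler and RFE fit on training data only
rf_y_pred = pipe.predict_proba(test)  # test scaled with the training statistics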
Code example #16
attributes = attributes_balance.drop('fusao', axis=1)
print(attributes)

# Build a dictionary and map it over the sex column
d = {'F': 0, 'M': 1}
attributes['Sexo'] = attributes['Sexo'].map(d).astype(int)

attributes = pd.get_dummies(attributes)
print(attributes)

# Randomly split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(attributes,
                                                    classes,
                                                    test_size=0.20)

# Create and train the regression model
logreg = LogisticRegression(solver='liblinear')
classifier = RFE(logreg, n_features_to_select=20)
classifier = classifier.fit(X_train, y_train)

jl.dump(classifier, 'models/diabetes_logistic_regression.joblib')

y_pred = classifier.predict(X_test)
print(y_pred)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# predict probabilities
probs = classifier.predict_proba(X_test)
print(probs)
Code example #17
    for f in range(n_features):
        print("%d. feature %d (%f)" %
              (f + 1, indices[f], importances[indices[f]]))

if (feat_roc):
    half = int(n_samples / 2)
    x, y = shuffle(x, y, random_state=random_state)
    X_train, X_test = x[0:half], x[half:-1]
    y_train, y_test = y[0:half], y[half:-1]

    rf_feat_sel = RandomForestClassifierWithCoef(n_estimators=n_trees)

    for i in range(n_features):
        print(i)
        rfe = RFE(rf_feat_sel, n_features_to_select=i + 1)
        rfe = rfe.fit(X_train, y_train)
        probas_ = rfe.predict_proba(X_test)
        fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1])
        roc_auc = auc(fpr, tpr)
        if (i == 0 or i == 18):
            print("auc: ", roc_auc)
        pl.plot(fpr, tpr, lw=1)

    pl.xlim([-0.001, 1.001])
    pl.ylim([-0.001, 1.001])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title('Receiver operating characteristic example')
    pl.show()
Code example #18
# Try the RFE method
# Create the RFE model and select number of attributes
# We checked the appropriate number of attributes through the confusion matrix
rfe = RFE(model, n_features_to_select=7).fit(x_train, y_train)

# summarize the selection of the attributes
print('Selected features: %s' % list(x_train.columns[rfe.support_]))

# Create a confusion matrix in the form of an array
y_pred = rfe.predict(x_train)
cnf_matrix = metrics.confusion_matrix(y_train, y_pred)
cnf_matrix

# Plot of predicted probabilities of survival vs actual survival
y_pred_prob = rfe.predict_proba(x_train)[:, 1]
plt.figure(7)
plt.scatter(y_pred_prob, y_train, s=10)
plt.xlabel('Predicted Chance Of Survival')
plt.ylabel('Actual Survival')
plt.tight_layout()

plt.show()

print("Accuracy:", metrics.accuracy_score(y_train, y_pred))

#####################################################
# Test our model against dataset_test
# Fill out the missing values of age with the mean
# Fill out the missing values of fare with the median
mean_age = dataset_test.loc[:, 'Age'].mean()
Code example #19
cfm = confusion_matrix(Y_test, Y_pred)
print(cfm)

#[[7397   26]
#[ 142 2204]]

print("Classification Report")
print(classification_report(Y_test, Y_pred))
acc = accuracy_score(Y_test, Y_pred)  # avoid shadowing the accuracy_score function
print("Accuracy of the Model:", acc)

# Accuracy of the Model: 0.982802743372
#%%

# Adjusting The Threshold
Y_pred_prob = rfe.predict_proba(X_test)
print(Y_pred_prob)

Y_pred_class = []
for value in Y_pred_prob[:, 0]:
    if value < 0.72:
        Y_pred_class.append(1)
    else:
        Y_pred_class.append(0)
print(Y_pred_class)
#%%
from sklearn.metrics import confusion_matrix, accuracy_score

cfm = confusion_matrix(Y_test.tolist(), Y_pred_class)
print(cfm)
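
The thresholding loop above can be a single vectorized comparison; a short numpy-only sketch using the same names:

# Short sketch: same 0.72 cutoff on the class-0 probability, without the loop.
import numpy as np

Y_pred_class = (Y_pred_prob[:, 0] < 0.72).astype(int)  # 1 where P(class 0) < 0.72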
Code example #20
File: random forest.py Project: dewalg/lab
	for f in range(n_features):
	    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

if(feat_roc):
	half = int(n_samples / 2)
	x,y = shuffle(x,y,random_state=random_state)
	X_train, X_test = x[0:half], x[half:-1]
	y_train, y_test = y[0:half], y[half:-1]

	rf_feat_sel = RandomForestClassifierWithCoef(n_estimators=n_trees)

	for i in range(n_features):
		print(i)
		rfe = RFE(rf_feat_sel, n_features_to_select=i+1)
		rfe = rfe.fit(X_train,y_train)
		probas_ = rfe.predict_proba(X_test)
		fpr, tpr, thresholds = roc_curve(y_test, probas_[:,1])
		roc_auc = auc(fpr, tpr)
		if (i == 0 or i == 18):
			print("auc: ", roc_auc)
		pl.plot(fpr, tpr, lw=1)
	pl.xlim([-0.001, 1.001])
	pl.ylim([-0.001, 1.001])
	pl.xlabel('False Positive Rate')
	pl.ylabel('True Positive Rate')
	pl.title('Receiver operating characteristic example')
	pl.show()
Code example #21
            # separate features from labels
            train_y = train_data['exclusion']
            train_x = train_data.drop(columns=['exclusion'])
            test_y = test_data['exclusion']
            test_x = test_data.drop(columns=['exclusion'])

            start = timeit.default_timer()
            rf_model = RandomForestClassifier(n_jobs=-1,
                                              n_estimators=tree_count)
            rfe = RFE(estimator=rf_model,
                      n_features_to_select=subset_size,
                      step=5)
            rfe.fit(train_x, train_y)

            logger.log_message(
                f'Using features {train_x.columns[rfe.support_].values}')

            # record performance
            posteriors = rfe.predict_proba(test_x)
            roc_auc = roc_auc_score(test_y, posteriors[:, 1])

            # record results
            time_elapsed = timeit.default_timer() - start
            output += f'{roc_auc},{time_elapsed}\n'
            logger.log_message(f'Ending run {run}')
            logger.log_message(f'Results {output}')
            with open(results_file, 'a') as outfile:
                outfile.write(output)

            counter += 1
Code example #22
# ROC curve
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()

# In[113]:

#Logistic regression with RFE
log_clf = LogisticRegression(C=best_parameters['C'],
                             penalty=best_parameters['penalty'],
                             random_state=random_state)

selector = RFE(log_clf)
selector = selector.fit(X_train, y_train)

y_pred = selector.predict(X_test)
y_score = selector.predict_proba(X_test)[:, 1]

# Confusion maxtrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0, 1]

plt.figure()
plot_confusion_matrix(cm,
                      classes=class_names,
                      title='Logistic Confusion matrix')
plt.xlim(-0.5, len(np.unique(y)) - 0.5)  # keep matplotlib from cropping the heatmap edges
plt.ylim(len(np.unique(y)) - 0.5, -0.5)
plt.show()

show_metrics()
Code example #23
File: XGB_Feature_RFE.py Project: Shurooo/gumgum
num_round = 250   # Number of rounds of training, increasing this increases the range of output values
clf = xgbw.XGBWrapper(param, num_round, verbose_eval=0)

k = 500
result_all = []

for step in [400, 200, 100, 50, 25]:
    selector = RFE(clf, step=step, n_features_to_select=k, verbose=2)

    print "Fitting Selector: k = {}, step = {}".format(k, step)
    start = time.time()
    selector = selector.fit(X_train, y_train)
    train_time = time.time() - start

    support = selector.get_support(indices=True)
    file_name = str(data[0]).rjust(2, "0") + str(data[1]).rjust(2, "0") + "_k" + str(k) + "_s" + str(step)
    addr_out = os.path.join("/home/ubuntu/Weiyi/RFE_Select", file_name)
    np.save(addr_out, support)

    start = time.time()
    prob = selector.predict_proba(X_test)
    test_time = round(time.time() - start, 2)

    score, recall, filter_rate, cut, net_savings = search_cut(prob)
    result_all.append([k, train_time, test_time, score, recall, filter_rate, cut, net_savings, step])

data = pd.DataFrame(np.array(result_all), columns=["k", "train time", "test time", "score", "recall", "filter rate", "cut", "net savings", "step"])
data.to_csv("/home/ubuntu/Weiyi/RFE_Select/RFE_0604.csv")