コード例 #1
0
ファイル: test_smote.py プロジェクト: chkoar/imbalanced-learn
def test_sample_with_nn_svm():
    """SMOTE with ``kind='svm'`` accepts estimator objects.

    A ``NearestNeighbors`` instance is passed as ``k_neighbors`` and an
    ``SVC`` instance as ``svm_estimator``; the resampled ``X``/``Y`` data
    (defined outside this function) must match the stored reference arrays.
    """
    expected_X = np.array([
        [0.11622591, -0.0317206], [0.77481731, 0.60935141],
        [1.25192108, -0.22367336], [0.53366841, -0.30312976],
        [1.52091956, -0.49283504], [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342], [0.3084254, 0.33299982],
        [0.70472253, -0.73309052], [0.28893132, -0.38761769],
        [1.15514042, 0.0129463], [0.88407872, 0.35454207],
        [1.31301027, -0.92648734], [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484], [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049], [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929], [1.70580611, -0.11219234],
        [0.47436887, -0.2645749], [1.07844562, -0.19435291],
        [1.44228238, -1.31256615], [1.25636713, -1.04463226],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    sampler = SMOTE(
        random_state=RND_SEED,
        kind='svm',
        k_neighbors=NearestNeighbors(n_neighbors=6),
        svm_estimator=SVC(gamma='scale', random_state=RND_SEED),
    )
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
コード例 #2
0
ファイル: test_smote.py プロジェクト: chkoar/imbalanced-learn
def test_fit_resample_nn_obj():
    """SMOTE with ``kind='borderline1'`` accepts NearestNeighbors objects.

    Both ``k_neighbors`` and ``m_neighbors`` are given as estimator
    instances; the resampled ``X``/``Y`` data (defined outside this
    function) must match the stored reference arrays.
    """
    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.3765279, -0.2009615],
        [0.55276636, -0.10550373],
        [0.45413452, -0.08883319],
        [1.21118683, -0.22817957],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    m_estimator = NearestNeighbors(n_neighbors=11)
    k_estimator = NearestNeighbors(n_neighbors=6)
    sampler = SMOTE(
        random_state=RND_SEED,
        kind='borderline1',
        k_neighbors=k_estimator,
        m_neighbors=m_estimator,
    )
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
コード例 #3
0
ファイル: decision_tree.py プロジェクト: softlang/wikionto
def train_decisiontree_with(configurationname, train_data, k, score_function, undersam=False, oversam=False,
                            export=False):
    """Select the ``k`` best features, optionally resample, and fit a tree.

    Args:
        configurationname: label used in the log line and in the exported
            ``.dot`` file name.
        train_data: 3-tuple ``(X_train, y_train, id_to_a_train)``; the third
            element is unpacked but not used here.
        k: number of features kept by ``SelectKBest`` (asserted ``> 0``).
        score_function: scoring function handed to ``SelectKBest``.
        undersam: apply ``RepeatedEditedNearestNeighbours`` (alone) or,
            together with ``oversam``, ``SMOTEENN``.
        oversam: apply ``SMOTE`` (alone) or, together with ``undersam``,
            ``SMOTEENN``.
        export: when true, write the tree with ``export_graphviz`` and call
            ``transform`` with the selected feature indices.

    Returns:
        ``(selector, dtc)``: the fitted selector and the fitted classifier.
    """
    assert k > 0
    print("Training with configuration " + configurationname)
    features, labels, id_to_a_train = train_data
    tree = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    selector = SelectKBest(score_function, k=k)
    fitted_selector = selector.fit(features, labels)
    features = selector.transform(features)

    # Indices of the columns that survived the selection.
    fitted_ids = list(fitted_selector.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(labels))
    # Exactly one resampler (or none) applies for a given flag combination.
    if undersam and oversam:
        resampler = SMOTEENN(random_state=0)
    elif undersam:
        resampler = RepeatedEditedNearestNeighbours()
    elif oversam:
        resampler = SMOTE(random_state=42)
    else:
        resampler = None
    if resampler is not None:
        features, labels = resampler.fit_resample(features, labels)
    print(Counter(labels))

    print("Train Classifier")
    tree = tree.fit(features, labels, check_input=True)

    if export:
        dot_path = DATAP + "/temp/trees/sltree_" + configurationname + ".dot"
        export_graphviz(tree, out_file=dot_path, filled=True)
        transform(fitted_ids, configurationname)

    # Accuracy on the (possibly resampled) training data itself.
    print("Self Accuracy: " + str(tree.score(features, labels)))

    return selector, tree
コード例 #4
0
ファイル: test_smote.py プロジェクト: chkoar/imbalanced-learn
def test_sample_borderline2():
    """SMOTE with ``kind='borderline2'`` reproduces the reference output.

    Resamples the ``X``/``Y`` data defined outside this function and
    compares features and labels with the stored arrays.
    """
    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.47436888, -0.2645749],
        [1.07844561, -0.19435291],
        [0.33339622, 0.49870937],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
    ])

    sampler = SMOTE(random_state=RND_SEED, kind='borderline2')
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
コード例 #5
0
ファイル: test_smote.py プロジェクト: chkoar/imbalanced-learn
def test_sample_regular_half():
    """Regular SMOTE honours a dict ``sampling_strategy``.

    With targets ``{0: 9, 1: 12}`` the resampled ``X``/``Y`` data (defined
    outside this function) must match the stored reference arrays.
    """
    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.36784496, -0.1953161],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0,
    ])

    class_targets = {0: 9, 1: 12}
    sampler = SMOTE(sampling_strategy=class_targets, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
コード例 #6
0
ファイル: test_smote.py プロジェクト: chkoar/imbalanced-learn
def test_sample_regular_with_nn():
    """Regular SMOTE accepts a NearestNeighbors object as ``k_neighbors``.

    Resamples the ``X``/``Y`` data defined outside this function and
    compares features and labels with the stored arrays.
    """
    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.29307743, -0.14670439],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571668],
        [0.66052536, -0.28246517],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    sampler = SMOTE(
        random_state=RND_SEED,
        k_neighbors=NearestNeighbors(n_neighbors=6),
    )
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
コード例 #7
0
#     if y[i] == 1:
#         print(i)
#         count += 1
# print(count)

# Classifier with default settings and a SMOTE oversampler
# (sampling_strategy=0.2, 10 neighbours). `x` and `y` are defined before
# this fragment.
dt = DecisionTreeClassifier()
sm = SMOTE(sampling_strategy=0.2, k_neighbors=10)
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# Ordered split instead of a shuffled one: the first 80% of the rows are used
# for training, the remaining rows for testing.
train_size: int = int(0.8 * len(x))
test_size: int = len(x) - train_size
x_train = x[:train_size]
x_test = x[train_size:]
y_train = y[:train_size]
y_test = y[train_size:]
# SMOTE resampling and sort by time
x_train, y_train = sm.fit_resample(x_train, y_train)
# Pair every resampled row with its label and sort the pairs by the row
# itself (in_[0] is the whole row, not a single value).
# NOTE(review): presumably the rows are lists whose first element is a
# timestamp, so this orders by time — confirm against how `x` is built.
entries = list(zip(x_train, y_train))
entries.sort(key=lambda in_: in_[0])
x_train, y_train = zip(*entries)
x_train = list(x_train)
y_train = list(y_train)
# Drop the first element of every row in place; this requires each row to
# support pop(0) (e.g. a list).
for i in x_train:
    i.pop(0)
for i in x_test:
    i.pop(0)
# x_train = x_train[:, 1:]
# x_test = x_test[:, 1:]
# train_size = len(x_train)

window_size = DEFAULT_INIT_WINDOW_SIZE  # initialize the window size
min_pos_por = 0.2  # minimum positive classified proportion
コード例 #8
0
# Repeated k-fold evaluation of an SVC trained on SMOTE-resampled folds.
# PPVSVMLinear[i, j] holds tp / (tp + fp) for repeat i, fold j.
# RepeatSVMLinear (tp / (tp + fn)) is filled below but is created outside the
# lines visible here, as are X, y, nobs, nrepeat and nfold.
PPVSVMLinear = np.zeros([nrepeat,nfold])

for i in range(nrepeat):
    # Shuffle the positions 0..nobs-1 and cut them into nfold groups.
    indexes = list(range(nobs))
    random.shuffle(indexes)
    dfs = np.array_split(indexes,nfold)
    for j in range(nfold):
        
        # Rows whose index label is in the current group form the test fold.
        # NOTE(review): this matches index *labels* against positions, so it
        # assumes X.index is 0..nobs-1 — confirm.
        index_bad = X.index.isin(dfs[j])
        X_test = X[index_bad]
        y_test = y[index_bad]
        X_train = X[~index_bad]
        y_train = y[~index_bad]
        #SMOTE
        # Oversample the training fold only; the test fold is left untouched.
        oversample = SMOTE(k_neighbors=7)
        X_train,y_train = oversample.fit_resample(X_train,y_train)

        # NOTE(review): the variable and the printed label say "linear" but
        # the kernel is 'rbf' — confirm which one is intended.
        linear_svc = svm.SVC(kernel='rbf',random_state=0, tol=1e-5, C = 1)
        linear_svc.fit(X_train, y_train)

        y_predict = linear_svc.predict(X_test)
        # Unpacking four values assumes a binary (2x2) confusion matrix.
        tn, fp, fn, tp = confusion_matrix(y_test.ravel(),y_predict).ravel()
        # NOTE(review): both ratios have a zero denominator when a fold has
        # no positive labels / no positive predictions.
        RepeatSVMLinear[i,j] = tp/(tp+fn)
        PPVSVMLinear[i,j] = tp/(tp+fp)

# Mean and population standard deviation, in percent, over all repeats and
# folds (`mean` / `pstdev` are presumably from `statistics` — TODO confirm).
print('Linear')
print(mean(100*RepeatSVMLinear.ravel()))
print(pstdev(100*RepeatSVMLinear.ravel()))
print(mean(100*PPVSVMLinear.ravel()))
print(pstdev(100*PPVSVMLinear.ravel()))
コード例 #9
0
    def fitness(self, particle, metric):
        """Score the feature subset encoded by ``particle``.

        A classifier is picked from ``self.mode`` and evaluated with 5-fold
        stratified cross-validation on the data prepared by
        ``utility.ready``. The summed confusion matrix is stored through
        ``tab.add`` and the requested metric is returned as ``score``.

        Args:
            particle: object whose ``position`` selects the features to use.
            metric: name of the metric returned as ``score``. Recognised
                names are 'accuracy'/'exactitude', 'recall'/'rappel',
                'precision'/'précision' and 'fscore'; any other value falls
                back to accuracy.

        Returns:
            Tuple ``(score, accuracy, recall, precision, fscore, cols,
            matrix)``.
        """
        matrix_length = len(np.unique(self.data[self.target]))

        if self.mode == 'sgd':
            model = SGDClassifier(class_weight='balanced',
                                  loss='modified_huber',
                                  random_state=1)
        elif self.mode == 'svr':
            model = SVC(kernel='linear',
                        class_weight='balanced',
                        probability=True)
        elif self.mode == 'rdf':
            model = SVC(kernel='rbf',
                        class_weight='balanced',
                        probability=True)
        elif self.mode == 'pol':
            model = SVC(kernel='poly',
                        class_weight='balanced',
                        probability=True)
        elif self.mode == 'rdc':
            model = RandomForestClassifier(n_estimators=10,
                                           class_weight='balanced',
                                           random_state=1)
        elif self.mode == 'dtc':
            model = DecisionTreeClassifier(class_weight='balanced',
                                           random_state=1)
        elif self.mode == 'gdc':
            model = GradientBoostingClassifier(random_state=1)
        elif self.mode == 'etc':
            model = ExtraTreesClassifier(class_weight='balanced',
                                         random_state=1)
        elif self.mode == 'adc':
            model = AdaBoostClassifier(random_state=1)
        elif self.mode == 'bac':
            model = BaggingClassifier(random_state=1)
        elif self.mode == 'lda':
            model = LinearDiscriminantAnalysis()
        elif self.mode == 'qda':
            model = QuadraticDiscriminantAnalysis()
        elif self.mode == 'gnb':
            model = GaussianNB()
        elif self.mode == 'rrc':
            model = RidgeClassifier(class_weight='balanced')
        else:
            # Also reached for mode 'knn'; that model is built per fold below
            # because its neighbour count depends on the fold size.
            model = LogisticRegression(solver='liblinear',
                                       C=10.0,
                                       class_weight='balanced')
        k = model_selection.StratifiedKFold(5)
        try:
            # NOTE(review): `posiion` looks like a typo for `position`. As
            # written the attribute lookup raises AttributeError, so unless
            # particles really define `posiion` this lookup branch is never
            # completed and control always goes to the cross-validation
            # branch below. Left unchanged because the behaviour of
            # `tab.get` on a miss is not visible here; confirm before fixing.
            tab_data, tab_val = tab.get([int(x) for x in particle.posiion],
                                        self.tab_data, self.tab_vals)
            tab_val = np.array(tab_val)
            accuracy = (utility.getTotalTruePositive(tab_val) + utility.getTotalTrueNegative(tab_val)) / \
                       (utility.getTotalTruePositive(tab_val) + utility.getTotalTrueNegative(tab_val) +
                        utility.getTotalFalsePositive(tab_val) + utility.getTotalFalseNegative(tab_val))
            precision_tab = []
            recall_tab = []
            for i in range(len(tab_val)):
                a = utility.getTruePositive(
                    tab_val, i) / (utility.getFalsePositive(tab_val, i) +
                                   utility.getTruePositive(tab_val, i))
                b = utility.getTruePositive(
                    tab_val, i) / (utility.getFalseNegative(tab_val, i) +
                                   utility.getTruePositive(tab_val, i))
                precision_tab.append(a)
                recall_tab.append(b)
            # Unweighted mean over classes.
            precision = sum(precision_tab) / len(precision_tab)
            recall = sum(recall_tab) / len(recall_tab)
            fscore = 2 * (1 / ((1 / precision) + (1 / recall)))
            matrix = tab_val
            tmp = self.data.drop([self.target], axis=1)
            tmp = tmp.iloc[:, particle.position]
            cols = tmp.columns
            self.tab_find = self.tab_find + 1
        except AttributeError:
            matrix = np.zeros((matrix_length, matrix_length), dtype=int)
            X, y, cols = utility.ready(self, particle.position, self.data,
                                       self.dummiesList, self.createDummies,
                                       self.normalize)
            originalclass = []
            predictedclass = []
            for train_index, test_index in k.split(X, y):  # Split in X
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                # Membership test: the previous
                # `self.mode == ('knn' or 'dct' or ...)` only ever compared
                # against 'knn'.
                # NOTE(review): 'dct' and 'gbc' do not match any mode handled
                # above ('dtc', 'gdc'); kept as written — confirm the
                # intended names.
                if self.mode in ('knn', 'dct', 'gbc', 'lda', 'qda',
                                 'adc', 'bac'):
                    if self.mode == 'knn':
                        model = KNeighborsClassifier(
                            n_neighbors=int(len(X_train)**(1 / 2)))
                    # Oversample the training fold only.
                    sm = SMOTE(sampling_strategy='auto')
                    X_train, y_train = sm.fit_resample(X_train, y_train)

                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                originalclass.extend(y_test)
                predictedclass.extend(y_pred)

                matrix += confusion_matrix(y_test, y_pred)

            accuracy = (utility.getTotalTruePositive(matrix) + utility.getTotalTrueNegative(matrix)) / \
                       (utility.getTotalTruePositive(matrix) + utility.getTotalTrueNegative(matrix) +
                        utility.getTotalFalsePositive(matrix) + utility.getTotalFalseNegative(matrix))

            precision, recall, fscore, _ = s(originalclass,
                                             predictedclass,
                                             average='macro')
            self.tab_data, self.tab_vals = tab.add(
                [int(x) for x in particle.position], matrix.tolist(),
                self.tab_data, self.tab_vals)
            self.tab_insert = self.tab_insert + 1

        # Membership tests: `metric == 'accuracy' or 'exactitude'` was always
        # truthy, so every metric name used to yield the accuracy.
        if metric in ('accuracy', 'exactitude'):
            score = accuracy
        elif metric in ('recall', 'rappel'):
            score = recall
        elif metric in ('precision', 'précision'):
            score = precision
        elif metric == 'fscore':
            score = fscore
        else:
            score = accuracy

        return score, accuracy, recall, precision, fscore, cols, matrix
コード例 #10
0
                    X, y)


# We now know the gap between the training and test loss. We can probably still improve on it: the target is imbalanced, so balancing it with SMOTE may give a better evaluation.

# In[167]:


from imblearn.over_sampling import SMOTE


# In[169]:


# Balance the classes with SMOTE (fixed seed for reproducibility).
smote = SMOTE(random_state = 402)
X_smote, Y_smote = smote.fit_resample(X,y)


# Class counts after resampling. The labels are passed as `x=` because recent
# seaborn releases no longer accept the data vector positionally.
sns.countplot(x=Y_smote, edgecolor = 'black')


# In[171]:


# max_features='sqrt' is what 'auto' meant for forest classifiers; 'auto'
# itself was removed in scikit-learn 1.3.
plot_learning_curve(RandomForestClassifier(
        bootstrap=True, max_depth=10, max_features='sqrt', min_samples_leaf=2, min_samples_split=5, n_estimators=100),
                    X_smote, Y_smote)


# In[ ]:
コード例 #11
0
def initialise():
    """Load the player data, train a decision tree and return it with its data.

    Feature rows come from the ``pre_*`` tables and label rows from the
    matching tournament tables of the ``crickml`` database. The features are
    passed through ``scale_features`` together with the maxima of their first
    four columns, split 70/30, and a ``DecisionTreeClassifier`` is fitted on
    the SMOTE-oversampled training part.

    The split is done *before* oversampling so that synthetic samples are
    generated from training rows only and the test part contains only real
    observations.

    Returns:
        Tuple ``(connection, desT, max_home, max_away, max_recent,
        max_career, feature_train, feature_test, target_train,
        target_test)``. ``connection`` is the open database connection (the
        caller is responsible for closing it); ``feature_train`` /
        ``target_train`` are the oversampled training data.
    """
    # Connect to the database
    connection = pymysql.connect(host='localhost',
                                 user='******',
                                 password='',
                                 db='crickml',
                                 charset='utf8mb4',
                                 cursorclass=pymysql.cursors.DictCursor)

    # Fetch data for features
    print("Fetching player data for Features.....")

    player_list_2011 = fetch_data_pre('SELECT * FROM pre_wc_2011', connection)
    player_list_2015 = fetch_data_pre('SELECT * FROM pre_wc_2015', connection)
    player_list_2013 = fetch_data_pre('SELECT * FROM pre_ct_2013', connection)
    player_list_2017 = fetch_data_pre('SELECT * FROM pre_ct_2017', connection)

    # Fetch data for labels
    print("Fetching player data for Labels.....")

    performance_list_2011 = fetch_data_post(
        'SELECT * FROM wc_2011', connection)
    performance_list_2015 = fetch_data_post(
        'SELECT * FROM wc_2015', connection)
    performance_list_2013 = fetch_data_post(
        'SELECT * FROM ct_2013', connection)
    performance_list_2017 = fetch_data_post(
        'SELECT * FROM ct_2017', connection)

    # Stack the tournaments in the same (chronological) order for features
    # and labels so the rows stay aligned.
    np_players = np.concatenate(
        (player_list_2011, player_list_2013, player_list_2015, player_list_2017), axis=0)
    np_performances = np.concatenate(
        (performance_list_2011, performance_list_2013, performance_list_2015, performance_list_2017), axis=0)

    # Column maxima handed to scale_features and returned to the caller
    # (columns 0-3 are presumably career / recent / away / home figures —
    # TODO confirm against fetch_data_pre).
    max_career = np.max(np_players[:, 0])
    max_recent = np.max(np_players[:, 1])
    max_away = np.max(np_players[:, 2])
    max_home = np.max(np_players[:, 3])

    np_players = scale_features(
        np_players, max_career, max_recent, max_away, max_home)

    # Split first ...
    feature_train, feature_test, target_train, target_test = train_test_split(
        np_players, np_performances, test_size=0.30, random_state=42)

    # ... then oversample the training part only. Resampling before the
    # split would let synthetic points interpolated from test rows leak into
    # the training data and would put synthetic rows into the test set.
    sm = SMOTE(random_state=41)
    feature_train, target_train = sm.fit_resample(feature_train, target_train)

    # Train Decision Tree model
    desT = DecisionTreeClassifier(max_depth=11)
    desT.fit(feature_train, target_train)

    return connection, desT, max_home, max_away, max_recent, max_career, feature_train, feature_test, target_train, target_test
コード例 #12
0
def execute(project):
    """Train, select and score the traditional classifiers for one project.

    Reads ``training.csv`` / ``testing.csv`` from the project's dataset
    directory, oversamples the scaled training data with SMOTE, runs
    ``EstimatorSelectionHelper`` with F1 scoring, refits the ten best
    configurations of its summary and evaluates them on the test data.
    Scores, the data actually used, the predictions and the selection
    summary are written as CSV files to the project's scores directory.

    Args:
        project: object whose ``github()`` value names the project
            sub-directory.
    """
    dataset_dir = Config.get_work_dir_path(
        os.path.join("paper", "datasets", "traditional_designite",
                     project.github()))
    Path(dataset_dir).mkdir(parents=True, exist_ok=True)
    training_path = os.path.join(dataset_dir, "training.csv")
    testing_path = os.path.join(dataset_dir, "testing.csv")

    # Boolean columns read as the strings 'True' / 'False' become 1 / 0.
    bool_codes = {'True': 1, 'False': 0}
    training_df = pd.read_csv(training_path).dropna().replace(bool_codes)
    testing_df = pd.read_csv(testing_path).dropna().replace(bool_codes)

    training_y = training_df.pop('Bugged').values
    training_X = training_df.values
    training_X = preprocessing.scale(training_X)

    oversample = SMOTE()
    training_X, training_y = oversample.fit_resample(training_X, training_y)

    models = {
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'LogisticRegression': LogisticRegression(),
        'BernoulliNaiveBayes': BernoulliNB(),
        'K-NearestNeighbor': KNeighborsClassifier(),
        'DecisionTree': DecisionTreeClassifier(),
        'RandomForest': RandomForestClassifier(),
        'SupportVectorMachine': SVC(),
        # 'MultilayerPerceptron': MLPClassifier()
    }
    params = {
        'LinearDiscriminantAnalysis': {},
        'QuadraticDiscriminantAnalysis': {},
        'LogisticRegression': {
            'C': list(np.logspace(-4, 4, 3))
        },
        'BernoulliNaiveBayes': {},
        'K-NearestNeighbor': {},
        'DecisionTree': {
            'criterion': ['gini', 'entropy'],
        },
        'RandomForest': {
            'n_estimators': [10, 100]
        },
        'SupportVectorMachine': {
            'C': [0.1, 100]
        },
        # 'MultilayerPerceptron': {'hidden_layer_sizes': [(55, 27, 55)],
        #                         'activation': ['tanh', 'relu']}
    }

    helper = EstimatorSelectionHelper(models, params)
    helper.fit(training_X, training_y, scoring='f1')
    summary = helper.score_summary()
    top_summary = summary[:10]
    # Drop the score columns and turn missing values into None so that only
    # the estimator name and its parameter values remain per row.
    top_summary_iter = top_summary.drop(EstimatorSelectionHelper.get_scores_info(), axis=1)\
                                  .where(pd.notnull(top_summary), None)\
                                  .iterrows()

    testing_y = testing_df.pop('Bugged').values
    # NOTE(review): the test features are standardised with their own mean
    # and variance rather than with the training statistics — confirm this
    # is intended.
    testing_X = preprocessing.scale(testing_df.values)
    models_info = [row.to_dict() for _, row in top_summary_iter]

    score_columns = [
        'estimator', 'configuration', 'precision', 'recall', 'f1-measure',
        'auc-roc', 'brier score'
    ]
    # Rows are collected in a list and the frame is built once:
    # DataFrame.append was removed in pandas 2.0.
    score_rows = []
    predictions = []
    for model_info in models_info:
        estimator = models[model_info['estimator']]
        # Every non-None entry except the estimator name is a parameter.
        chosen_params = {
            key: val
            for key, val in model_info.items()
            if not (val is None or key == 'estimator')
        }
        estimator.set_params(**chosen_params)
        estimator.fit(training_X, training_y)
        prediction_y = estimator.predict(testing_X)
        predictions.append(prediction_y)
        score_rows.append({
            'estimator': model_info['estimator'],
            'configuration': str(chosen_params),
            'precision': precision_score(testing_y, prediction_y),
            'recall': recall_score(testing_y, prediction_y),
            'f1-measure': f1_score(testing_y, prediction_y),
            'auc-roc': roc_auc_score(testing_y, prediction_y),
            'brier score': brier_score_loss(testing_y, prediction_y)
        })
    scores = pd.DataFrame(score_rows, columns=score_columns)

    scores_dir = Config.get_work_dir_path(
        os.path.join("paper", "scores", "traditional_designite",
                     project.github()))
    Path(scores_dir).mkdir(parents=True, exist_ok=True)
    scores_path = os.path.join(scores_dir, "scores.csv")
    training_x_path = os.path.join(scores_dir, "training_x.csv")
    training_y_path = os.path.join(scores_dir, "training_y.csv")
    testing_x_path = os.path.join(scores_dir, "testing_x.csv")
    testing_y_path = os.path.join(scores_dir, "testing_y.csv")
    prediction_y_path = os.path.join(scores_dir, "prediction_y.csv")
    prediction_real_y_path = os.path.join(scores_dir, "prediction_real_y.csv")
    summary_path = os.path.join(scores_dir, "summary.csv")
    scores.to_csv(scores_path, index=False)
    pd.DataFrame(data=training_X,
                 columns=training_df.columns).to_csv(training_x_path,
                                                     index=False)
    pd.DataFrame(data=training_y, columns=['Bugged']).to_csv(training_y_path,
                                                             index=False)
    pd.DataFrame(data=testing_X,
                 columns=training_df.columns).to_csv(testing_x_path,
                                                     index=False)
    pd.DataFrame(data=testing_y, columns=['Bugged']).to_csv(testing_y_path,
                                                            index=False)
    # One prediction column per evaluated configuration ...
    prediction_columns = [str(info) for info in models_info]
    pd.DataFrame(data=np.array(predictions).transpose(),
                 columns=prediction_columns).to_csv(prediction_y_path, index=False)
    # ... and the same table again with the true labels as the last column.
    predictions.append(testing_y)
    prediction_columns.append("real")
    pd.DataFrame(data=np.array(predictions).transpose(),
                 columns=prediction_columns).to_csv(prediction_real_y_path, index=False)
    summary.to_csv(summary_path, index=False)
コード例 #13
0
ファイル: model.py プロジェクト: matteoca/bmi_app
def train_resampled(X, y):
    """Oversample ``X``/``y`` with SMOTE and return the resampled pair.

    Despite the name no model is trained here; the function only applies
    ``SMOTE(random_state=42).fit_resample``.

    Returns:
        ``(X_res, y_res)``: the resampled features and labels.
    """
    oversampler = SMOTE(random_state=42)
    resampled_X, resampled_y = oversampler.fit_resample(X, y)
    return resampled_X, resampled_y
コード例 #14
0
def get_data(plot=True):
    """Fit a gradient-boosting classifier on the 'FRAC' label and return ROC data.

    The function reads ``mros_1103snps.xlsx``, drops four columns, adds
    zero-initialised ``FRAC``/``STATUS``/``DAYS`` columns, runs the project
    helper ``fill_empty_cell`` over every column, one-hot encodes three
    categorical columns and takes ``FRAC`` as the target. The feature
    matrix is read separately from ``norma_continu_var.xlsx``. The data is
    split 80/20, the training part is balanced with SMOTE, and the project
    helper ``RandomSearch`` is used to obtain the model that is then fitted
    on the balanced training data.

    Args:
        plot: When true, draw the ROC curve of class 1 with scikit-plot and
            show it with ``pyplot.show()``.

    Returns:
        Tuple ``(fpr, tpr, thresh, auc)``: the outputs of ``roc_curve`` and
        ``roc_auc_score`` computed on the held-out split.
    """
    data = pandas.read_excel('mros_1103snps.xlsx')
    # drop HA_SLDFXFU where only 10% is filled, plus three further columns
    data.drop(['HA_SLDFXFU', 'TURSMOKE', 'HA_SLDFX', 'HA_WRSTFX'],
              axis=1,
              inplace=True)
    data['FRAC'] = 0
    data['STATUS'] = 0
    data['DAYS'] = 0
    # make the fractures into 1 variable
    for attribute in data.keys():
        data[attribute] = data.apply(
            lambda sample: fill_empty_cell(sample, attribute, data), axis=1)
    # encode the categorical data
    data = pandas.get_dummies(data,
                              columns=['GIERACE', 'PHYS_MROS', 'NFWLKSPD'])

    # setting Y and X
    # NOTE(review): X comes from a different spreadsheet than Y; this
    # presumably relies on both files having the same row order - confirm.
    Y = data['FRAC']
    X = pandas.read_excel('norma_continu_var.xlsx')

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    # Balance only the training split. ``sampling_strategy`` replaces the
    # ``ratio`` keyword, which was deprecated when ``fit_resample`` was
    # introduced and has since been removed from imbalanced-learn.
    sm = SMOTE(random_state=2, sampling_strategy=1.0)
    x_train_s, y_train_s = sm.fit_resample(x_train, y_train)

    # NOTE(review): 'deviance' and 'auto' are rejected by recent
    # scikit-learn releases - confirm against the pinned version.
    parameters = {
        'loss': ['deviance', 'exponential'],
        'n_estimators': [200, 500, 800, 1000, 1200],
        'learning_rate': [0.001, 0.003, 0.005],
        # <1.0 results in reduction of variance and increase in bias
        'subsample': [0.3, 0.5, 0.7],
        'min_samples_split': [2, 5, 8],
        'max_features': ['auto', 'log2', 'sqrt', 0.2],
        'random_state': [42],
        'max_depth': [2, 3, 5],
        'min_impurity_decrease': [0.15, 0.1, 0.08, 0.05]
    }
    # parameter sweep on the balanced training data
    model = RandomSearch(estimator=GradientBoostingClassifier(),
                         modelName='Gradient Boosting Classifier',
                         params=parameters,
                         Xtrain=x_train_s,
                         ytrain=y_train_s,
                         Xtest=x_test,
                         ytest=y_test,
                         score='roc_auc')
    model.fit(x_train_s, y_train_s)
    print(model.feature_importances_)

    # The second column of predict_proba is used as the score.
    yscore_raw = model.predict_proba(x_test)
    yscore = [s[1] for s in yscore_raw]
    fpr, tpr, thresh = roc_curve(y_test, yscore)
    auc = roc_auc_score(y_test, yscore)

    # plot roc curve
    if plot:
        # Reuse the probabilities computed above instead of predicting twice.
        scikitplot.metrics.plot_roc(
            numpy.array(y_test),
            yscore_raw,
            plot_macro=False,
            plot_micro=False,
            classes_to_plot=[1],
            title='ROC Curve by Gradient Boosting Model')
        pyplot.show()
    return fpr, tpr, thresh, auc
コード例 #15
0
def svm_func(train_A, words_of_tweets, extra_features, feature_selection, encoding, print_file):
    """Fit a Gaussian naive Bayes model on the first cross-validation fold.

    Despite the name, the estimator built here is ``naive_bayes.GaussianNB``.
    The function sets up a shuffled 10-fold ``KFold`` split, encodes the
    training and test part of the first fold through ``reading.get_enc``,
    fits the model on that training part and returns it immediately.

    NOTE(review): because of the ``return model`` inside the loop, every
    statement after it (oversampling, prediction, per-fold metrics) is
    unreachable, and the averaged-metrics section after the loop is never
    reached once a fold has been produced. The ``MinMaxScaler`` and ``SMOTE``
    objects are created but not applied before the fit. Confirm whether the
    early return is a leftover debugging shortcut.

    Args:
        train_A: Object indexed with ``'label'`` to obtain the targets.
        words_of_tweets: Sequence converted to a NumPy array and split by fold.
        extra_features: Passed through to ``reading.get_enc``.
        feature_selection: Passed through to ``reading.get_enc``.
        encoding: Passed through to ``reading.get_enc``.
        print_file: Path of a log file opened in append mode.

    Returns:
        The ``GaussianNB`` model fitted on the first fold's training split.
    """
    reading = Twitter_Depression_Detection.Reader()  # Reader from Twitter_Depression_Detection.py, used below to get the encoding
    print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    print(words_of_tweets)
    x = np.array(words_of_tweets)
    y = train_A['label']

    # Initialize the roc-auc score running average list
    # Initialize a count to print the number of folds
    # Initialize metrics to print their average
    av_roc = 0.
    count = 0
    precision = 0
    accuracy = 0
    recall = 0
    f1score = 0
    # Below 3 variables are used for ROC-AUC curve
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    # Initialize the 10-fold cross validation
    # Set shuffle equals True to randomize the splits on the training data
    kf = KFold(n_splits=10, random_state=41, shuffle=True)

    # Loop over the cross-validation folds defined above
    for train_index, test_index in kf.split(x):
        count += 1
        print('Fold #: ', count)


        with open(print_file, "a") as myfile: # Write above print into output file
            myfile.write('Fold #: ' + str(count) + '\n')

        # Index the train and test data for this fold (the order is random because shuffle=True) and encode both parts.
        # The second argument to get_enc is 1 for the training part and 0 for the test part.
        x_train, x_test = reading.get_enc(x[train_index], 1, y[train_index], train_index, extra_features, feature_selection, encoding, print_file), reading.get_enc(x[test_index], 0, y[test_index], test_index, extra_features, feature_selection, encoding, print_file)
        y_train, y_test = y[train_index], y[test_index]

        # NOTE(review): the older comments here described an SVM (C / gamma); the model actually created is Gaussian naive Bayes.
        model = naive_bayes.GaussianNB()


#######################################################################################################################
        # Feature Scaling (the scaler is created but its use is commented out)
        minMaxScaler = MinMaxScaler(feature_range=(0, 1))
        # Get points and discard classification labels
        #x_train = minMaxScaler.fit_transform(x_train)
        #x_test = minMaxScaler.transform(x_test)
#######################################################################################################################
        oversample = SMOTE(sampling_strategy='minority', k_neighbors=10, random_state=0)
        model.fit(x_train, y_train)
        # Early exit on the first fold: nothing below this line in the loop body runs.
        return model
#######################################################################################################################
        # Visualization of normal and oversampled data

        '''visualize_data(x_train, y_train, "Normal Dataset")'''

        # 'minority': resample only the minority class;
        x_train, y_train = oversample.fit_resample(x_train, y_train)
        '''visualize_data(x_train, y_train, "Oversampled Dataset")'''

#######################################################################################################################

        model.score(x_train, y_train)
        # Predict Output
        y_pred = model.predict(x_test)
        #return model
#######################################################################################################################

        # The model is fit. Time to predict the output and evaluate it
        print("Evaluating model...")

        with open(print_file, "a") as myfile: # Write above print into output file
            myfile.write("Evaluating model..." + '\n')

        #roc = roc_auc_score(y_test, y_pred)

        # Print your ROC-AUC score for your kfold, and the running score average
        #print('ROC: ', roc)
        #av_roc += roc
        #print('Continued Avg: ', av_roc / count)

        #with open(print_file, "a") as myfile: # Write above print into output file
            #myfile.write('ROC: ' + str(Continued Avg: ' + str(av_roc / count) + '\n')

        #y_pred = (y_pred > 0.5)

# -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        '''
        # Compute ROC curve and area the curve

        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (count - 1, roc_auc))
        '''
# -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

        '''
        # Creating the Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        print(cm)


        with open(print_file, "a") as myfile: # Write above print into output file
            myfile.write(str(cm) + '\n')
        '''
        print(y_pred)
        temp_accuracy = accuracy_score(y_test, y_pred)
        temp_precision, temp_recall, temp_f1_score, _ = precision_recall_fscore_support(y_test, y_pred,
                                                                                        average='macro')

        accuracy += temp_accuracy
        precision += temp_precision
        recall += temp_recall
        f1score += temp_f1_score

        print("Accuracy: ", temp_accuracy)
        print("Precision: ", temp_precision)
        print("Recall: ", temp_recall)
        print("F1 score: ", temp_f1_score)

        print(metrics.classification_report(y_test,y_pred))


    # Create ROC-AUC curve
#    compute_ROC_Curve(tprs, mean_fpr, aucs)


##########################################################################################################################


    # Print average of metrics (the divisor 10 matches n_splits above)
    print("Average Precision: ", precision / 10)
    print("Average Accuracy: ", accuracy / 10)
    print("Average Recall: ", recall / 10)
    print("Average F1-score: ", f1score / 10)

    # Print your final average ROC-AUC score and organize your models predictions in a dataframe
    #print('Average ROC:', av_roc / 10)

    with open(print_file, "a") as myfile:  # Write above print into output file
        myfile.write("Average Precision: " + str(precision / 10) + '\n' + "Average Accuracy: " + str(accuracy / 10) + '\n' + "Average Recall: " + str(recall / 10) + '\n' + "Average F1-score: " + str(f1score / 10) + '\n' + 'Average ROC:' + str(av_roc / 10) + '\n')
コード例 #16
0
# Notebook-style inspection of the first rows of the dataframe.
df.head()

# Target: the "bi_popularity" column.
y = df["bi_popularity"]
# Features: every column except the target, the raw popularity score and
# the listed identifier / metadata columns.
X = df.drop(columns=[
    "popularity", "track_id", "year", "bi_popularity", "genre", "artist_name",
    "track_name", "release_date"
])

# One-hot encode the mode, key and time_signature columns
X = pd.get_dummies(X, columns=["mode", "key", "time_signature"])
X.head()

from imblearn.over_sampling import SMOTE

# NOTE(review): the name ``os`` shadows the standard-library module of the
# same name for the rest of the script - confirm nothing below needs it.
# NOTE(review): SMOTE runs on the full dataset before the train/test split
# below, so synthetic rows interpolated from rows that later land in the
# test set can end up in the training set - consider resampling only the
# training split.
os = SMOTE()
X_oversample, y_oversample = os.fit_resample(X, y)

# Shape of the oversampled feature matrix
X_oversample.shape
"""Split and Train"""

# split the (already oversampled) dataset into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_oversample,
                                                    y_oversample,
                                                    random_state=1)
X_train.shape

# XGBoost model
from xgboost import XGBClassifier
コード例 #17
0
# Baseline: 80/20 split with a fixed seed, random forest on the raw data.
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

Classifier = RandomForestClassifier(n_estimators=1000)
Classifier = Classifier.fit(X_train, Y_train)
Y_pred = Classifier.predict(X_test)

ModelPerformanceMetrics(Y_test, Y_pred)

# Try SMOTE
# NOTE(review): SMOTE is applied to the whole dataset before splitting, so
# the test split contains synthetic rows and rows that influenced training
# samples - scores from this section are likely optimistic.
# NOTE(review): this section also switches the estimator to logistic
# regression, so it is not a like-for-like comparison with the baseline.
X = df.drop(['Bankrupt?'], axis=1)
Y = df['Bankrupt?']
sm = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=0)
X_smote, Y_smote = sm.fit_resample(X, Y)
X_train, X_test, Y_train, Y_test = train_test_split(X_smote,
                                                    Y_smote,
                                                    test_size=0.2,
                                                    random_state=0)

Classifier = LogisticRegression(max_iter=1000)
Classifier = Classifier.fit(X_train, Y_train)
Y_pred = Classifier.predict(X_test)

ModelPerformanceMetrics(Y_test, Y_pred)

# Try XGBoost: rebuild features and target from the original dataframe
X = df.drop(['Bankrupt?'], axis=1)
Y = df['Bankrupt?']
X_train, X_test, Y_train, Y_test = train_test_split(X,
コード例 #18
0
    if IS_SMOTE:

        # num_y = []
        # for i in range(2):
        #     num_y.append(np.sum(y == i))

        # max_num_y = max(num_y)

        # _ratio = {0: max_num_y, 1: max_num_y}

        # SMOTE
        # smote = SMOTE(sampling_strategy=_ratio, random_state=71)

        smote = SMOTE(random_state=42)
        X, y = smote.fit_resample(X, y)

        for i in range(2):
            print("Data number resampled => " + str(i) + ": " +
                  str(np.sum(y == i)))

    Xs = []
    ys = []

    # bootstrap sampling
    for i in range(L2_ENSEMBLE_NUMBER):

        resampled_X, resampled_y = resample(X, y, n_samples=len(X))
        Xs.append(resampled_X)
        ys.append(resampled_y)
        print("l2_ensemble_{} was created.".format(i + 1))
コード例 #19
0
ファイル: test_smote.py プロジェクト: chkoar/imbalanced-learn
def test_smote_error_passing_estimator(smote_params, err_msg):
    """Check that resampling with the given SMOTE parameters is rejected.

    The sampler is built outside the ``pytest.raises`` block on purpose:
    only ``fit_resample`` is expected to raise the ``ValueError`` whose
    message matches ``err_msg``.
    """
    sampler = SMOTE(**smote_params)
    with pytest.raises(ValueError, match=err_msg):
        sampler.fit_resample(X, Y)
コード例 #20
0
def fix_imbalance(self,
                  data,
                  target,
                  threshold=10.0,
                  oversample=True,
                  smote=False):
    """
    Method Name: fix_imbalance
    Description: Rebalance a dataframe whose target classes are unevenly
                 represented, using SMOTE, random oversampling or random
                 undersampling.
    Input Description: data: the input dataframe with target column.
                       target: name of the target column in ``data``.
                       threshold: for exactly two classes, resampling is
                                  triggered when any class holds less than
                                  this percentage of the rows. Otherwise it
                                  is repeated while the gap between the
                                  largest and smallest class share exceeds
                                  ``100 - threshold`` percentage points.
                       oversample: when ``smote`` is false, choose
                                   RandomOverSampler (True) or
                                   RandomUnderSampler (False).
                       smote: when True, resample with SMOTE regardless of
                              ``oversample``.

    Output: A dataframe made of the feature columns followed by the target
            column; its content is unchanged when no resampling is triggered.
    On Failure: Raise Exception (the original error is chained as its cause)

    Written By: Punit Nanda
    Version: 1.2
    Revisions: None

    """

    def _class_shares(labels):
        # Percentage of all rows held by each class.
        return labels.value_counts() / float(len(labels)) * 100

    try:
        X = data.drop(target, axis=1)
        y = data[target]

        if y.nunique() == 2:
            # Compare each class share with the threshold first and only
            # then reduce with any(). The previous form compared the boolean
            # result of any() with the threshold, which was true for almost
            # every input.
            if (_class_shares(y) < threshold).any():
                if smote:
                    sampler = SMOTE()
                elif oversample:
                    sampler = RandomOverSampler(sampling_strategy='auto',
                                                random_state=42)
                else:
                    sampler = RandomUnderSampler(sampling_strategy='auto',
                                                 random_state=42)
                # fit_resample is the current name of the removed fit_sample.
                X, y = sampler.fit_resample(X, y)
        else:
            shares = _class_shares(y)
            while shares.max() - shares.min() > 100.0 - threshold:
                # The sampler lives in its own local so that the ``smote``
                # flag is not overwritten between iterations.
                if smote:
                    sampler = SMOTE(sampling_strategy='minority')
                elif oversample:
                    sampler = RandomOverSampler(sampling_strategy='minority',
                                                random_state=42)
                else:
                    sampler = RandomUnderSampler(sampling_strategy='auto',
                                                 random_state=42)
                X, y = sampler.fit_resample(X, y)
                shares = _class_shares(y)

        return pd.concat([X, y], axis=1)

    except Exception as e:
        # Keep the generic exception type callers may rely on, but preserve
        # the message and the cause instead of raising an empty Exception.
        raise Exception("fix_imbalance failed: {}".format(e)) from e
コード例 #21
0
    for i in range(9):
        img = cv.imread(
            "F:\\360MoveData\\Users\\1\\Desktop\\zhinengzhizao\\cv\\cv-img\\test/ex-2/2/"
            + str(i) + ".tiff", 0)
        img_new = img.ravel()
        train_label[i] = 2
        for j in range(29241):
            train_data[i, j] = img_new[j]

    for m in range(9, 13):
        img = cv.imread(
            "F:\\360MoveData\\Users\\1\\Desktop\\zhinengzhizao\\cv\\cv-img\\test/ex-2/3/"
            + str(m - 9) + ".tiff", 0)
        img_new = img.ravel()
        train_label[m] = 3
        for n in range(29241):
            train_data[m, n] = img_new[n]

    return train_data, train_label


# Load the flattened training images and their labels, then oversample
# them with SMOTE using 3 nearest neighbours.
train_data, train_label = load_picture()
model_smote = SMOTE(k_neighbors=3)
new_data, new_label = model_smote.fit_resample(train_data, train_label)

# Write rows 13..17 of the resampled data back to disk as 171x171 images.
# NOTE(review): presumably these indices are the synthetic samples appended
# after the 13 original rows - confirm against load_picture.
for i in range(13, 18):
    img = new_data[i].reshape(171, 171)
    cv.imwrite(
        "F:\\360MoveData\\Users\\1\\Desktop\\zhinengzhizao\\cv\\cv-img\\test/ex-2/3-sup/"
        + str(i) + ".tiff", img)
def imbalance_handler(
        mode: str,
        parent_class_label: str):
    """
    Purpose
    -------
    The purpose of this function is to provide the user a tool that
    allows them to easily manipulate their training and/or test dataset
    so that it is significantly more balanced between its classes. One
    would want to do this in order to improve the reliability of the
    classifier that will get trained on this dataset (see 1. in the
    References section for more information about this).

    **Note, however, that if a class has only 6 or fewer article
    instances that belong to it, it will be dropped completely, because
    the SMOTE and ENN algorithms used in this function rely on enough
    nearest neighbors of a class existing. If this class label is
    particularly important and you would like to keep it around, then
    obtain more data for it.**

    Parameters
    ----------
    mode : str
        This string allows the user to specify how they would like the
        imbalancing of the dataset to be handled. Matching is
        case-insensitive and ignores hyphens. The available options
        include:
            1. "smote" - In this mode, the only algorithm that will be
                         implemented to make the dataset more balanced
                         is the over-sampling algorithm SMOTE. See 1., 3.,
                         4., and 5. in the References section for more
                         information about this algorithm.
            2. "enn" - In this mode, the only algorithm that will be
                       implemented to make the dataset more balanced is
                       the under-sampling algorithm Edited-Nearest Neighbors
                       (ENN). See 1. and 6. for more information about
                       this algorithm.
            3. "smote-enn" - In this mode, this function will implement
                             both the SMOTE and ENN algorithms; SMOTE
                             will oversample to make the classes balanced
                             and ENN will under-sample to remove any newly
                             generated samples in the minority class(es)
                             that are not helpful. See 1. and 7. for more
                             information about the benefits of using
                             this 2-step process and for how this is
                             implemented in the imbalanced-learn module.
    parent_class_label : str
        This string represents the class label that is the Parent class
        of all of the sub-classes that will be distinguished and predicted
        by a classifier that you wish to build. I.e., if you want to build
        a classifier for the children of the "Auto Type" class (which
        includes "Budget Cars", "Concept Cars", and "Luxury Cars" to name
        a few), then you simply have to pass in the "Auto Type" string to
        this parameter.

    Returns
    -------
    to_return : tuple
        The former element is the resampled feature matrix (where some
        rows correspond to the article instances that were synthetically
        generated if the user specified for over-sampling to occur) and the
        latter element is the resampled class labels. The number of rows
        in both objects is the same since each row of the two corresponds
        to the same (real or synthetic) article instance.

    Raises
    ------
    ValueError
        If `mode` is not one of the options listed above.

    References
    ----------
    1. https://towardsdatascience.com/guide-to-classification-on-imbalanced-datasets-d6653aa5fa23
    2. https://imbalanced-learn.readthedocs.io/en/stable/index.html
    3. https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
    4. https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html
    5. https://www.kite.com/blog/python/smote-python-imbalanced-learn-for-oversampling/
    6. https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.EditedNearestNeighbours.html#imblearn.under_sampling.EditedNearestNeighbours
    7. https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.combine.SMOTEENN.html
    """
    # Normalize the mode and reject unknown values up front; previously an
    # unknown mode only failed at the very end with an UnboundLocalError,
    # after all the data had been loaded and vectorized.
    normalized_mode = "".join(mode.lower().split("-"))
    if normalized_mode not in ("smote", "enn", "smoteenn"):
        raise ValueError(
            "Unknown mode {!r}; expected 'smote', 'enn' or 'smote-enn'."
            .format(mode))

    child_tier_lvl, raw_articles_df = class_data_retrival(
        parent_class_label, give_child_tier_lvl=True)

    # Before performing any transformations on our data, drop every class
    # that is too small for the nearest-neighbor based resamplers.
    # NOTE(review): the cut-off keeps the original `<= 6` comparison, while
    # older documentation spoke of "5 or fewer" - confirm which is intended.
    child_tier_label = "Tier{}".format(child_tier_lvl)
    counts_of_classes = raw_articles_df[child_tier_label].value_counts()
    counts_checker = counts_of_classes.values <= 6
    if counts_checker.any():
        labels_with_less = counts_of_classes.index.values[counts_checker]
        # One membership mask handles one, two or many labels alike.
        rows_to_keep = ~raw_articles_df[child_tier_label].isin(
            labels_with_less)
        articles_df = raw_articles_df[rows_to_keep]
    else:
        # All the article counts for each class pass the test :).
        articles_df = raw_articles_df

    # Next, obtain your X (features) matrix and your y (labels) vector.
    _, feature_matrix = bag_of_words_converter(mode="tfidf",
                                               parent_class_label=None,
                                               articles_df=articles_df,
                                               upper_n_gram=2,
                                               upper_features=300,
                                               apply_PCA=True)
    labels_arr = np.array(
        articles_df[child_tier_label].tolist())

    # Next, build the resampler the user has specified.
    if normalized_mode == "smote":
        # Over-sample with the SMOTE algorithm.
        resampler = SMOTE(random_state=169,
                          n_jobs=3)
    elif normalized_mode == "enn":
        # Under-sample with the Edited Nearest Neighbours algorithm.
        resampler = EditedNearestNeighbours(sampling_strategy="auto",
                                            n_jobs=3)
    else:
        # "smoteenn": over-sample with SMOTE, then clean the result by
        # under-sampling with ENN (imblearn's combined SMOTEENN object).
        resampler = SMOTEENN(random_state=169,
                             n_jobs=3)

    final_feature_matrix, final_labels_arr = resampler.fit_resample(
        feature_matrix, labels_arr)

    to_return = (final_feature_matrix, final_labels_arr)

    return to_return
コード例 #23
0
                                                  if x == 'Positive' else 0)

# Fit the TF-IDF vocabulary on all reviews.
# NOTE(review): the vocabulary is learned from the full dataset, including
# rows that later land in the test split - confirm this is acceptable.
vectorizer = TfidfVectorizer()
tfidf_model = vectorizer.fit_transform(df['review'].values)

X = df['review']
y = df['user_sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

#Transforming the X_train and X_test using the tf-idf model
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

# Balance only the training split.
smt = SMOTE()
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

# The grid below searches both 'l1' and 'l2' penalties. scikit-learn's
# default solver does not support 'l1', so every 'l1' candidate would fail
# to fit; 'liblinear' supports both penalties.
log_model = LogisticRegression(solver='liblinear')

params = {
    'C': np.logspace(-10, 1, 15),
    'class_weight': [None, 'balanced'],
    'penalty': ['l1', 'l2']
}

cv = StratifiedKFold(n_splits=5, random_state=100, shuffle=True)

# Create grid search using 5-fold cross validation
clf_LR = GridSearchCV(log_model, params, cv=cv, scoring='roc_auc', n_jobs=-1)

clf_LR.fit(X_train_sm, y_train_sm)
コード例 #24
0
ファイル: DT.py プロジェクト: yeimsf/HD-DataSet
sns.heatmap(
    abs(df.corr()), annot=True
)  # Shows the absolute feature correlations. ps: there's a bug on matplotlib #3.1.1.
plt.show()
# IN [6]: split the data (80/20, fixed seed) and balance the training part.
X_train, X_test, y_train, y_test = train_test_split(df.drop(['target'],
                                                            axis=1),
                                                    df['target'],
                                                    test_size=0.2,
                                                    random_state=2)
X_train.head()
X_train.shape
# Check the class balance: mean of the training target.
y_train.mean()
# Oversample the training split only; the test split is left untouched.
sm = SMOTE(random_state=1)
X_train_balanced, y_train_balanced = sm.fit_resample(X_train, y_train)
# Inspect the balanced training data
X_train_balanced.shape
y_train_balanced.shape
y_train_balanced.mean()
# IN [7]: Building models Decision Tree and Random Forest.
# NOTE(review): cross-validating on already-oversampled data lets synthetic
# neighbours of a validation row sit in the training folds - these CV
# scores are likely optimistic.
dt = DecisionTreeClassifier(random_state=2)
print('CV score:',
      cross_val_score(dt, X_train_balanced, y_train_balanced, cv=3).mean())
rf = RandomForestClassifier(n_estimators=100, random_state=2)
print('CV score:',
      cross_val_score(rf, X_train_balanced, y_train_balanced, cv=3).mean())
rf_grid = RandomForestClassifier(random_state=2)  # creates a new estimator
# IN [8]: Random Forest.
# Create the parameter grid based on the results of random search
rf_param_grid = {
コード例 #25
0
# Apply the fitted RFE selector to the test set and keep the names of the
# selected columns.
X_test_RFE = pd.DataFrame(rfe.transform(X_test),
                          columns=X_test.columns[rfe.support_])

#X_train_RFE = X_train[rfe.support_]

# Compare the shapes before and after feature selection.
print(X_train.shape, X_train_RFE.shape)

#%%
from imblearn.over_sampling import ADASYN, SMOTE, SVMSMOTE

# OVERSAMPLING: three alternative samplers, each fitted on the training split.
# NOTE(review): the samplers are fed X_train, not the RFE-reduced
# X_train_RFE built above - confirm which feature set is intended.
sampler_smote = SMOTE(n_jobs=-1)
sampler_svm = SVMSMOTE(n_jobs=-1)
sampler_adasyn = ADASYN(n_jobs=-1)

X_smote, y_smote = sampler_smote.fit_resample(X=X_train, y=y_train.ravel())
X_svm, y_svm = sampler_svm.fit_resample(X=X_train, y=y_train.ravel())
X_adasyn, y_adasyn = sampler_adasyn.fit_resample(X=X_train, y=y_train.ravel())

#%%
# baseline: random forest trained on the original (not oversampled) data
rf = ensemble.RandomForestClassifier(n_estimators=100,
                                     max_depth=8,
                                     criterion="entropy",
                                     n_jobs=-1)
rf.fit(X=X_train, y=y_train.ravel())

#FROM NOW ON, USE THE TUNED VERSION ALTHOUGH WE SHOULD RE-TUNE

#weight classes
rf_balanced = ensemble.RandomForestClassifier(
コード例 #26
0
ファイル: train.py プロジェクト: PengJia6/msisensor-rna
def train_model(paras):
    """Train an MSI classifier on SMOTE-balanced data and pickle it.

    The input CSV is read with its first column as index; rows containing
    missing values are dropped. The ``msi`` column holds the status labels
    ("MSI", "MSI-H" and "MSI_H", case-insensitive, are the positive class);
    all remaining columns are used as features. The classifier is fitted on
    the SMOTE-balanced data and evaluated on the original, unbalanced data.

    The output file contains three consecutive pickles: the fitted
    classifier, the description dict, and the list of feature (gene) names.

    Args:
        paras: Parsed options. Every attribute read here (``input``,
            ``model``, ``cancer_type``, ``classifier``, ``input_description``,
            ``model_description``, ``positive_num``, ``author``, ``email``)
            is indexed with ``[0]``.
    """
    # `input_path` instead of `input` so the builtin is not shadowed.
    input_path = paras.input[0]
    model_path = paras.model[0]
    cancer_type = paras.cancer_type[0]
    classifier_name = paras.classifier[0]
    input_description = paras.input_description[0]
    model_description = paras.model_description[0]
    positive_num = paras.positive_num[0]
    author = paras.author[0]
    email = paras.email[0]

    input_df = pd.read_csv(input_path, index_col=0).dropna()
    genes = input_df.columns.to_list()
    genes.remove("msi")
    msi_status = input_df["msi"].to_list()
    positive_labels = ("MSI", "MSI-H", "MSI_H")
    y_label = [1 if status.upper() in positive_labels else 0
               for status in msi_status]
    class_num = Counter(y_label)
    if class_num[1] < positive_num:
        # Only reported: training still proceeds, as in the original code.
        logger.error(
            "The No. of MSI sample lower than the minimum values, Please set with -p."
        )

    # Balance the classes with SMOTE before fitting.
    x = input_df[genes]
    smo = SMOTE(random_state=1)
    x_balanced, y_label_balanced = smo.fit_resample(x, y_label)
    classifier = build_classrfier(type=classifier_name)
    classifier.fit(x_balanced, y_label_balanced)

    # Evaluate on the original (unbalanced) samples.
    y_pre = classifier.predict(x)
    y_pre_pro = classifier.predict_proba(x)[:, 1]
    roc_auc = roc_auc_score(y_label, y_pre_pro)
    conf_matrix = confusion_matrix(y_label, y_pre)
    # Rows are true labels, columns are predicted labels.
    TN = conf_matrix[0][0]
    FN = conf_matrix[1][0]
    TP = conf_matrix[1][1]
    FP = conf_matrix[0][1]
    sensitivity = TP / (TP + FN)
    precision = TP / (TP + FP)
    specificity = TN / (TN + FP)
    f_score1 = f1_score(y_label, y_pre)
    # Named `accuracy` so a module-level `accuracy_score` import is not hidden.
    accuracy = (TP + TN) / (FN + FP + TN + TP)

    description = {
        "Cancer Type": cancer_type,
        "Classifier": classifier_name,
        "Input Description": input_description,
        "Model Description": model_description,
        "Available Trained Sample": len(input_df),
        "Positive Sample": class_num[1],
        "Negative Sample": class_num[0],
        "Accuracy": accuracy,
        "AUC": roc_auc,
        "F1 Score": f_score1,
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "Precision": precision,
        "Model Path": os.path.abspath(model_path),
        # Author and e-mail are plain text, not filesystem paths; the
        # previous os.path.abspath() call prefixed them with the CWD.
        "Author": author,
        "Email": email,
    }
    for item, value in description.items():
        logger.info(item + ": {}".format(value))

    # Three consecutive pickles; a reader must load them in the same order.
    with open(model_path, 'wb') as f:
        pickle.dump(classifier, f)
        pickle.dump(description, f)
        pickle.dump(genes, f)
コード例 #27
0
# Slice inputs and outputs
[input_data_train, output_data_train] = slice_data(train_data_frame)
[input_data_test, sample_ids] = slice_data(test_data_frame)

# Original class distribution
count_per_class(output_data_train)

# If resample flag is True, we need to resample the training dataset by generating new synthetic samples
if resample:
    # `n_jobs` is no longer accepted by SMOTE in current imbalanced-learn
    # releases (deprecated, then removed), so it is not passed here.
    resampler = SMOTE(sampling_strategy='auto',
                      random_state=42,
                      k_neighbors=5)
    print("Resampling data")
    [input_data_train, output_data_train] = resampler.fit_resample(
        input_data_train, output_data_train)
    print("Done resampling")
    # Resampled class distribution
    count_per_class(output_data_train)

# Train the classifier
print("Started training")
clf = clf.fit(input_data_train, output_data_train)
print("Finished training")

# Predict labels and class probabilities for the test inputs
print("Prediction")
predicted = clf.predict(input_data_test)
proba = clf.predict_proba(input_data_test)
print("Finished prediction")
コード例 #28
0
ファイル: app.py プロジェクト: DataNinjas-code/prudential
def predict():
    """Retrain three classifiers on the selected features and show metrics.

    On a POST request whose ``features`` form field is a decimal integer:
    reads ``train.csv`` and ``features.csv``, keeps the features whose
    ``rank`` is below the requested number (the kept names are also written
    to ``final_features.csv``), splits 80/20, oversamples the training split
    with SMOTE, then fits XGBoost, Random Forest and AdaBoost. Each fitted
    model is dumped with joblib (``xgb.pkl``, ``rf.pkl``, ``ab.pkl``) and
    ``metrics.html`` is rendered with the test-set accuracies and
    macro-averaged precision/recall/F-score.

    Returns:
        The rendered ``metrics.html`` response; a redirect to ``/model``
        (after flashing a message) when the field is not a number; ``None``
        for non-POST requests.
    """
    if request.method != "POST":
        # Non-POST requests produce no response, as before.
        return None

    # isdecimal() only accepts strings int() can parse; isnumeric() also
    # lets through characters such as '½' or '²' that make int() raise.
    if not request.form['features'].isdecimal():
        flash("Please enter a digit for the number of features to use!")
        return redirect("/model")

    start = dt.datetime.now()
    train = pd.read_csv("train.csv")
    # Keyword form: the positional `axis` argument was removed in pandas 2.0.
    X = train.drop(columns='labels')
    y = train['labels']

    # Features: keep those ranked below the requested count.
    num_features = int(request.form['features'])
    feature_ranks = pd.read_csv("features.csv")
    features_name_to_keep = feature_ranks[
        feature_ranks['rank'] < num_features]["col"].to_list()
    final_features = pd.DataFrame(features_name_to_keep,
                                  columns=['final_features'])
    final_features.to_csv("final_features.csv")
    X = X[features_name_to_keep]

    # Split into train and test sets, then oversample the training split only.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    oversample = SMOTE()
    X_train, y_train = oversample.fit_resample(X_train, y_train)

    def fit_and_score(model, dump_path):
        """Fit `model`, persist it, and return (accuracy, macro metrics)."""
        model.fit(X_train, y_train)
        accuracy = model.score(X_test, y_test)
        metrics = precision_recall_fscore_support(
            y_test, model.predict(X_test), average='macro')
        joblib.dump(model, dump_path)
        return accuracy, metrics

    # XGBoost
    clf_xgb = XGBClassifier(colsample_bytree=0.3,
                            learning_rate=0.1,
                            max_depth=6,
                            reg_alpha=0.8)
    accuracy_xgb, metrics_xgb = fit_and_score(clf_xgb, 'xgb.pkl')

    # Random Forest
    clf_rf = RandomForestClassifier(n_estimators=400,
                                    min_samples_split=5,
                                    min_samples_leaf=1,
                                    max_features='sqrt',
                                    max_depth=None,
                                    bootstrap=False)
    accuracy_rf, metrics_rf = fit_and_score(clf_rf, 'rf.pkl')

    # AdaBoost
    clf_ab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                                n_estimators=225,
                                learning_rate=0.3)
    accuracy_ab, metrics_ab = fit_and_score(clf_ab, 'ab.pkl')

    # End timer
    time_taken = str(dt.datetime.now() - start)

    # Pick the true maximum of the three models. The previous if/elif chain
    # only compared pairs and could report XGBoost even when AdaBoost scored
    # higher. On ties the first entry in the dict wins.
    accuracies = {
        "Random Forest": accuracy_rf,
        "XGBoost": accuracy_xgb,
        "AdaBoost": accuracy_ab,
    }
    # Index 2 of precision_recall_fscore_support is the F-score.
    fscores = {
        "Random Forest": metrics_rf[2],
        "XGBoost": metrics_xgb[2],
        "AdaBoost": metrics_ab[2],
    }
    highest_accuracy_model = max(accuracies, key=accuracies.get)
    highest_fscore_model = max(fscores, key=fscores.get)

    return render_template(
        "metrics.html",
        metrics_xgb=metrics_xgb,
        accuracy_xgb=accuracy_xgb,
        metrics_rf=metrics_rf,
        accuracy_rf=accuracy_rf,
        metrics_ab=metrics_ab,
        accuracy_ab=accuracy_ab,
        time_taken=time_taken,
        highest_accuracy_model=highest_accuracy_model,
        highest_fscore_model=highest_fscore_model)
コード例 #29
0
    def __init__(self, getfile, test_num):
        """Train the hybrid logistic-regression model and publish its results.

        Reads the CSV at ``getfile`` (columns 5 and 6 are the features,
        column 7 the label), splits it, standardizes the features,
        oversamples the training split (RandomOverSampler, then SMOTE), fits
        a ``LogisticRegression`` and predicts the test split. The predictions
        and the raw CSV rows are written to the ``logitregression_data``
        MySQL database, the confusion matrix is plotted, and the results are
        exposed through module-level globals for the user interface.

        Args:
            getfile: Path of the CSV file to load.
            test_num: Passed to ``train_test_split`` as ``test_size``.
        """
        #-----SPLIT DATASETS-------
        self.getfile = getfile
        self.test_num = test_num

        tested = pd.read_csv(self.getfile)
        x = tested.iloc[:, [5, 6]].values
        # output
        y = tested.iloc[:, 7].values
        xtrain, xtest, ytrain, ytest = train_test_split(
            x, y, test_size=self.test_num, random_state=42)
        print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

        # Feature scaling: the scaler is fitted on the training split only
        # and then applied unchanged to the test split.
        sc_x = StandardScaler()
        xtrain = sc_x.fit_transform(np.asarray(xtrain))
        xtest = sc_x.transform(np.asarray(xtest))

        #---------------SMOTE ALGORITHM--------------------------

        print("Before OverSampling, counts of label '1': {}\n".format(
            sum(ytrain == 1)))
        print("Before OverSampling, counts of label '-1': {} \n".format(
            sum(ytrain == -1)))
        print('WITH SMOTE')

        # Resample the *scaled training split* only. The previous code
        # resampled the full, unscaled `x, y`, which leaked the test rows
        # into training and fitted the model on a different scale than the
        # standardized `xtest` it is evaluated on. `fit_resample` replaces
        # the removed `fit_sample` alias, and the sampler is no longer named
        # `os` so it cannot hide the standard-library module.
        ros = RandomOverSampler(sampling_strategy='minority')
        xtrain_res, ytrain_res = ros.fit_resample(xtrain, ytrain)
        oversample = SMOTE()

        xtrain, ytrain = oversample.fit_resample(xtrain_res,
                                                 ytrain_res.ravel())

        class_counts = Counter(ytrain)
        print(class_counts)
        print('After OverSampling, the shape of train_X: {}'.format(
            xtrain.shape))
        print('After OverSampling, the shape of train_y: {} \n'.format(
            ytrain.shape))

        print("After OverSampling, counts of label '1': {}".format(
            sum(ytrain == 1)))
        print("After OverSampling, counts of label '-1': {}".format(
            sum(ytrain == -1)))

        #---------------LOGISTIC REGRESSION----------------------
        # Fit the classifier on the balanced training data, then predict the
        # held-out test split.
        classifier = LogisticRegression()
        classifier.fit(xtrain, ytrain)
        y_pred = classifier.predict(xtest)

        # NOTE(review): the three tallies start at 1, not 0 -- kept as in the
        # original; confirm whether this offset is intentional.
        posed = 1
        neued = 1
        neged = 1

        import MySQLdb

        mydb = MySQLdb.connect(host="127.0.0.1",
                               user="******",
                               password="",
                               database="logitregression_data")
        # try/finally so the connection is closed even if a query fails.
        try:
            mycursor = mydb.cursor()
            logit = []
            with open('temp_file.csv', 'r') as tempo:
                read = csv.reader(tempo, delimiter=',')

                for tem in read:
                    logit.append(tem)

            with open(getfile, 'r') as file:
                reader = csv.reader(file, delimiter=',')
                row_id = 0

                mycursor.execute("DELETE FROM hybrid_logitval")

                # Store one row per prediction and tally the sentiment counts.
                query2 = "INSERT INTO `hybrid_logitval`(`HYB_ID`, `HYB_VALUE`, `HYB_SENTIMENT`, `HYB_RESULT`) VALUES (%s,%s,%s,%s)"
                for over in y_pred:
                    row_id += 1
                    if over == 1:
                        posed += 1

                        resu = 'Positive'
                        regval = 1
                    elif over == 0:
                        neued += 1

                        resu = 'Neutral'
                        regval = 0
                    else:
                        neged += 1

                        resu = 'Negative'
                        regval = -1

                    # row_id is 1-based, so logit[0] is never used --
                    # presumably the header row of temp_file.csv; verify.
                    mycursor.execute(query2,
                                     (row_id, logit[row_id], regval, resu))

                # First eight fields of every CSV row, for the baseline table.
                all_value = [(row[0], row[1], row[2], row[3], row[4], row[5],
                              row[6], row[7]) for row in reader]

            mycursor.execute("DELETE FROM `baseline`")

            query = "INSERT INTO `baseline`(`ID`, `TWEETS`, `TOKENIZED`, `STOP_WORDS`, `STEMMED`, `POLARITY`, `SUBJECTIVITY`, `SENTIMENT`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)"

            mycursor.executemany(query, all_value)
            mycursor.execute("DELETE FROM `baseline` WHERE `baseline`.`ID` = 0")

            mydb.commit()
        finally:
            mydb.close()

        #---------------CONFUSION MATRIX----------------------
        # Correct and incorrect predictions of the model on the test split.
        cm = confusion_matrix(ytest, y_pred)
        import warnings
        # Silences all warnings process-wide from this point on.
        warnings.filterwarnings("ignore")
        cr = classification_report(ytest, y_pred)
        print(ytest)

        print("Confusion Matrix : \n", cm)
        print(cr)

        from mlxtend.plotting import plot_confusion_matrix
        class_names = ['-1', '0', '1']
        fig, ax = plot_confusion_matrix(conf_mat=cm,
                                        colorbar=True,
                                        class_names=class_names)
        fig.canvas.set_window_title('HYBRID LOGISTIC REGRESSION')
        plt.ylabel('Actual label')
        plt.xlabel('Predicted label')
        plt.show()

        #-------SENDS ALL VALUES TO APPEAR ON THE USER INTERFACE----------------
        global accurate, confuse, posi, neut, nega, overall, plots, replot, percentage, reports
        accurate = accuracy_score(ytest, y_pred)
        print(accurate)
        percentage = "{:.0%}".format(accurate)
        confuse = cm
        print(percentage)
        posi = posed
        neut = neued
        nega = neged
        plots = y_pred
        replot = plt
        reports = cr

        # Overall sentiment is the largest tally; ties favour NEUTRAL, then
        # POSITIVE.
        if (neut >= posi) and (neut >= nega):
            overall = 'NEUTRAL'
        elif (posi >= neut) and (posi >= nega):
            overall = 'POSITIVE'
        else:
            overall = 'NEGATIVE'

        print(overall)
コード例 #30
0
ファイル: Model.py プロジェクト: nrohith55/Heroku_New
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer()

# Labels come from the first column; features are the TF-IDF matrix of the
# `Posts` column. (A positional `X = new_data.iloc[:, 1]` assignment that was
# immediately overwritten has been dropped.)
y = new_data.iloc[:, 0]

X = tv.fit_transform(new_data.Posts)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
from imblearn.over_sampling import SMOTE

# Oversample the training split only.
sm = SMOTE(random_state=444)

X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
# Bare shape expressions: displayed in a notebook, no effect in a script.
X_train_res.shape
y_train_res.shape
X_test.shape
y_test.shape

model = LogisticRegression()
model.fit(X_train_res, y_train_res)

y_pred = model.predict(X_test)

# Persist the fitted vectorizer and model. The context managers make sure
# both files are flushed and closed (the handles were previously leaked).
with open('Transform.pkl', 'wb') as transform_file:
    pickle.dump(tv, transform_file)

with open('Model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
コード例 #31
0
# Resample with the `rus` sampler configured earlier in the script, then
# inspect the per-class counts and plot the first two feature columns
# coloured by class.
X_resampled, y_resampled = rus.fit_resample(X,y)

np.bincount(y_resampled)

plt.scatter(X_resampled[:,0],X_resampled[:,1],c=y_resampled)


# Under-sample again with explicit per-class targets: class 1 -> 128 samples,
# class 2 -> 640 samples.
rus = RandomUnderSampler(random_state=0, sampling_strategy={1:64*2,2:64*10}) 
X_resampled, y_resampled = rus.fit_resample(X,y)

np.bincount(y_resampled)

from imblearn.over_sampling import SMOTE

# Over-sample the original X, y with SMOTE (5 nearest neighbours, seeded).
sm=SMOTE(k_neighbors=5, random_state=0)
X_resampled, y_resampled = sm.fit_resample(X,y)

np.bincount(y_resampled)

plt.scatter(X_resampled[:,0], X_resampled[:,1], c=y_resampled)

from imblearn.over_sampling import ADASYN

# Same comparison with ADASYN (5 neighbours, seeded).
ada=ADASYN(random_state=0, n_neighbors=5)

X_resampled, y_resampled = ada.fit_resample(X,y) 

np.bincount(y_resampled)

plt.scatter(X_resampled[:,0], X_resampled[:,1], c=y_resampled)
コード例 #32
0
ファイル: classify.py プロジェクト: cosmictransients/FLEET
# Feature columns selectable through ``feature_set``. Tuples, so the shared
# table cannot be mutated when the 'double' model appends its extra columns.
_FEATURE_SETS = {
    0: ('red_amplitude', 'green_amplitude', 'normal_separation',
        'deltamag_red', 'deltamag_green'),
    1: ('red_amplitude', 'green_amplitude', 'delta_time',
        'normal_separation', 'deltamag_red', 'deltamag_green'),
    2: ('red_amplitude', 'green_amplitude', 'delta_time', 'deltamag_red',
        'deltamag_green'),
    3: ('red_amplitude', 'green_amplitude', 'delta_time',
        'normal_separation'),
    4: ('red_amplitude', 'green_amplitude', 'delta_time'),
    5: ('red_amplitude', 'green_amplitude'),
    6: ('normal_separation', 'deltamag_red', 'deltamag_green'),
    7: ('red_amplitude', 'green_amplitude', 'normal_separation'),
    8: ('red_amplitude', 'green_amplitude', 'delta_time',
        'normal_separation', 'color'),
    9: ('red_amplitude', 'green_amplitude', 'delta_time',
        'normal_separation', 'deltamag_red', 'deltamag_green', 'color'),
    10: ('red_amplitude', 'green_amplitude', 'delta_time',
         'normal_separation', 'Pcc'),
    11: ('red_amplitude', 'green_amplitude', 'delta_time',
         'normal_separation', 'redshift'),
    12: ('red_amplitude', 'green_amplitude', 'delta_time',
         'normal_separation', 'absmag'),
    13: ('red_amplitude', 'green_amplitude', 'delta_time',
         'normal_separation', 'model_color'),
    14: ('red_amplitude', 'green_amplitude', 'delta_time',
         'normal_separation', 'deltamag_red', 'deltamag_green',
         'model_color'),
    15: ('red_amplitude', 'green_amplitude', 'normal_separation',
         'deltamag_red', 'deltamag_green', 'model_color'),
    16: ('red_amplitude', 'green_amplitude', 'delta_time',
         'normal_separation', 'model_color', 'redshift'),
}

# Fine-grained labels that are merged into a coarser training class.
_CLASS_GROUPS = {
    'LBV': 'Star',
    'Varstar': 'Star',
    'CV': 'Star',
    'SNIbn': 'SNIbc',
    'SNIb': 'SNIbc',
    'SNIc': 'SNIbc',
    'SNIc-BL': 'SNIbc',
    'SNIIP': 'SNII',
    'TDE': 'Nuclear',
    'AGN': 'Nuclear',
}

# Integer label assigned to each training class.
_CLASS_IDS = {
    'Nuclear': 0,
    'SLSN-I': 1,
    'SLSN-II': 2,
    'SNII': 3,
    'SNIIb': 4,
    'SNIIn': 5,
    'SNIa': 6,
    'SNIbc': 7,
    'Star': 8
}


def create_training_testing(object_name,
                            features_table,
                            training_days=20,
                            model='single',
                            clean=0,
                            feature_set=13,
                            sorting_state=42,
                            SMOTE_state=42,
                            clf_state=42,
                            n_estimators=100,
                            max_depth=7,
                            hostless_cut=0.1):
    '''
    Import the training set, train a SMOTE-balanced random forest on it and
    classify the transient described by features_table.

    Parameters
    -------------
    object_name    : Name of the object to exclude from training set
    features_table : Astropy table with all the features of the new transient
    training_days  : What data set to use for training
    model          : Which model to use for training, single or double
    clean          : Clean hostless transients? (0 or 1)
    feature_set    : Which feature set to use (0 to 16)
    sorting_state  : Seed number for list sorter
    SMOTE_state    : Seed number for SMOTE
    clf_state      : Seed number for classifier
    n_estimators   : Number of trees
    max_depth      : Depth of trees
    hostless_cut   : Only consider hosts with a Pcc lower than this

    Return
    ---------------
    Predicted Probability (in percent) to be ['Nuclear','SLSN-I','SLSN-II','SNII','SNIIb','SNIIn','SNIa','SNIbc','Star']

    Raises
    ---------------
    ValueError if clean or feature_set is not one of the supported values
    (previously this surfaced later as an unbound-variable error).
    KeyError if the training table contains a class with no known label.
    '''

    # Validate the selectors before touching any data.
    if feature_set not in _FEATURE_SETS:
        raise ValueError('feature_set must be an integer from 0 to 16, got %r'
                         % (feature_set, ))
    if clean not in (0, 1):
        raise ValueError('clean must be 0 or 1, got %r' % (clean, ))

    # Import Data
    table_name = pkg_resources.resource_filename(
        __name__,
        'training_set/center_table_%s_%s.txt' % (training_days, model))
    training_table_in = table.Table.read(table_name, format='ascii')

    # Remove bad objects (and the object being classified) from the
    # training sample
    bad = {
        '2020cui', '2019lwy', '2019cvi', '2018jsc', '2005bf', '2005gi',
        '2007ib', '2008aq', '2008ax', '2009N', '2010id', '2012aw', '2013ai',
        '2013am', '2013bu', '2013ej', '2013fs', '2016X', '2018cyg', '2018epm',
        '2018fjw', '2018fii', '2018fuw', '2018gvt', '2018imj', '2018lcd',
        '2019B', '2019bvq', '2019cda', '2019ci', '2019dok', '2019gaf',
        '2019gqk', '2019hau', '2019iex', '2019keo', '2019lkw', '2019oa',
        '2019otb', '2019pjs', '2019sjx', '2019tqb', '2019wbg', '2020ekk',
        object_name
    }
    good = [i not in bad for i in training_table_in['mod_object_name']]
    training_table_in = training_table_in[good]

    # Shuffle Order of Table (seeds numpy's global generator, as before)
    order = np.arange(len(training_table_in))
    np.random.seed(sorting_state)
    np.random.shuffle(order)
    training_table = training_table_in[order]

    # Select Only Clean Data: finite amplitude and Pcc, optionally also a
    # host association at least as good as hostless_cut
    keep = (np.isfinite(training_table['red_amplitude'])
            & np.isfinite(training_table['Pcc']))
    if clean == 1:
        keep = keep & (training_table['Pcc'] <= hostless_cut)
    clean_training = training_table[keep]

    # Select Features (copied to a list so the shared table is never modified)
    use_features = list(_FEATURE_SETS[feature_set])

    # If using the 'double' model, add the W2 parameter
    if model == 'double':
        use_features += ['red_amplitude2', 'green_amplitude2']

    # Create array with Training and Testing data
    training_data = np.array(clean_training[use_features].to_pandas())
    testing_data = np.array(features_table[use_features].to_pandas())

    # Group transients into the training classes and convert them to integer
    # labels. The mapping is done on Python strings rather than by assigning
    # into the fixed-width numpy string array, which could silently truncate
    # a group name longer than the longest label present in the table.
    training_class_in = np.array(clean_training['class'])
    training_class = np.array(
        [_CLASS_IDS[_CLASS_GROUPS.get(str(i), str(i))]
         for i in training_class_in],
        dtype=int)

    # SMOTE the data
    sampler = SMOTE(random_state=SMOTE_state)
    data_train_smote, class_train_smote = sampler.fit_resample(
        training_data, training_class)

    # Train Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 max_depth=max_depth,
                                 random_state=clf_state)
    clf.fit(data_train_smote, class_train_smote)

    # Predict Excluded Object (probabilities scaled to percent)
    predicted_probability = 100 * clf.predict_proba(testing_data)

    return predicted_probability
コード例 #33
0
ファイル: CDSMOTE-Bin.py プロジェクト: heyad/CDSMOTE-BINARY
 # Find the index of the entry (last one excluded) whose count is closest to
 # `average`; `c` holds the smallest distance seen so far and is initialised
 # before this fragment.
 for i,j in enumerate(newclassdist_count[0:-1]):
     if abs(j-average)<c:
         c = abs(j-average)
         newmaj = i
 # Labels of the selected majority sub-class and of the minority class.
 majority_class_new = majority_class+'_c'+str(newmaj)
 minority_class_new = minority_class+'_c0'
 ## 3. Create the dataset that only contains the new majority and minority classes
 data_majmin = []
 target_majmin = []
 for m, label in enumerate(target_cd):
     if label == majority_class_new or label == minority_class_new:
         data_majmin.append(data[m])
         target_majmin.append(label)
 # Oversample the two-class subset with the requested algorithm
 # (name matched case-insensitively); any other name aborts the program.
 if oversampler.lower() == 'smote':
     sm = SMOTE()
     data_over, target_over = sm.fit_resample(data_majmin, target_majmin) 
 elif oversampler.lower() == 'adasyn':
     ada = ADASYN()
     data_over, target_over = ada.fit_resample(data_majmin, target_majmin)
 else:
     print('Invalid oversampling algorithm.')
     sys.exit() 
 ## 4. Combine this with the remaining classes
 # NOTE(review): the `.append` calls below require `data_over`/`target_over`
 # to be lists -- confirm that fit_resample returns lists for list input.
 data_cdsmote = data_over.copy()
 target_cdsmote = target_over.copy()
 for m, label in enumerate(target_cd):
     if label != minority_class_new and label != majority_class_new:
         data_cdsmote.append(data[m])
         target_cdsmote.append(label)
 
 
コード例 #34
0
## Once the bag of words is prepared, the dataset is divided into training and test sets:

# NOTE(review): X and y are not used in the statements below (the split
# takes `bow_data` and `amazon2['sent']` directly) -- check later code
# before removing them.
X = amazon2['Reviews1']
y = amazon2['sent']
X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split(
    bow_data,  # Features
    amazon2['sent'],  # Target variable
    test_size=0.2,  # 20% test size
    random_state=0)  # random state for replication purposes

# # SAMPLING

# In[11]:

# Oversample the training split only with SMOTE (seeded).
sm = SMOTE(random_state=42)
X_train_smo, y_train_smo = sm.fit_resample(X_train_bow, y_train_bow)

# Class counts before and after resampling (displayed in a notebook only).
Counter(y_train_bow)
Counter(y_train_smo)

# # TRAINING & TESTING

# In[12]:

### 1. logistic regression

lr_model = LogisticRegression()

# Fit the model on the SMOTE-resampled training data

lr_model.fit(X_train_smo, y_train_smo)
コード例 #35
0
    
    # Read the data and remember the column names
    dfs = pd.read_csv(params['data_path'])
    col = list(dfs.columns)
    # Features are every column but the last; labels come from 'label'.
    # NOTE(review): this assumes 'label' is the last column -- confirm.
    data = np.array(dfs.iloc[:,:-1])
    labels = np.array(dfs['label'])
    
    # Count how many samples each label has in the original data
    c1 = Counter(labels)
    # Build the `ra` dict used as SMOTE's sampling_strategy: every class whose
    # size relative to the largest class is below params['ratio'] is raised
    # to int(largest * ratio) samples
    m = max(c1.values())
    ra = {}
    for i in c1.keys():
        if (c1[i]/m) < params['ratio']:
            ra[i] = int(m*params['ratio'])

    # Oversample the imbalanced classes
    smo = SMOTE(sampling_strategy=ra,
                k_neighbors=params['kneighbors'],
                random_state=42)
    data_smote,labels_smote = smo.fit_resample(data,labels)
    #c2_smote = Counter(labels_smote)

    # Store the oversampled data: append the labels as the last column and
    # write a CSV with the original column names
    labels_smote = np.array(labels_smote).reshape(-1,1)
    data_label = np.hstack((data_smote,labels_smote))
    
    result = pd.DataFrame(data_label, columns = col)
    result.to_csv(params['save_path'], sep=',', header=True, index=False)
except Exception as e:
    print(e)
コード例 #36
0
# Under sampling: draw `qtdeConfirmados` rows from dfDescartados and combine
# them with dfConfirmados.
dfDescartadosUnder = dfDescartados.sample(qtdeConfirmados)
dfUnder = pd.concat([dfDescartadosUnder, dfConfirmados], axis=0)
xTreino = dfUnder[features].values
yTreino = dfUnder['RESULTADO'].values

# Over sampling: draw `qtdeDescartados` rows from dfConfirmados with
# replacement. NOTE: this overwrites xTreino/yTreino from the step above.
print('\nOver Sampling')
dfConfirmadosOver = dfConfirmados.sample(qtdeDescartados, replace=True)
dfOver = pd.concat([dfDescartados, dfConfirmadosOver], axis=0)
xTreino = dfOver[features].values
yTreino = dfOver['RESULTADO'].values

# SMOTE sampling on the `treino` frame. NOTE: overwrites xTreino/yTreino
# again, so only this result reaches the classifier below.
print('\nSmote Sampling')
oversample = SMOTE()
xTreino, yTreino = oversample.fit_resample(treino[features], treino['RESULTADO'])
xTreino = xTreino.values
yTreino = yTreino.values

# Random Forest classifier
classifierRF = RandomForestClassifier(random_state=1986, criterion='gini', max_depth=10, n_estimators=50, n_jobs=-1)

# Train on all records
classifierRF.fit(xTreino, yTreino) 

# Feature selection: print the 30 highest importances as percentages
print('\nFeature Selection')
featuresSelection = zip(classifierRF.feature_importances_, features)
for importance, feature in sorted(featuresSelection, reverse=True)[:30]:
    print('%s: %f%%' % (feature, importance*100))
コード例 #37
0
ファイル: AT11.py プロジェクト: newmarwegner/positvo
def balance_data(final_data, class_column):
    """Oversample ``final_data`` with SMOTE so its classes are balanced.

    Args:
        final_data: DataFrame holding the feature columns and the class column.
        class_column: Name of the column that contains the class labels.

    Returns:
        The ``(features, labels)`` pair returned by ``SMOTE.fit_resample``.
    """
    y = final_data[class_column]
    # Select the features by name. The previous positional slice
    # (`iloc[:, :-1]`) was only correct when the class column happened to be
    # the last one; otherwise it kept the target among the features and
    # dropped an unrelated column.
    x = final_data.drop(columns=[class_column])
    oversample = SMOTE()

    return oversample.fit_resample(x, y)