def test_sample_with_nn_svm():
    """Check SMOTE with kind='svm' when neighbour and SVM estimators are passed as objects."""
    neighbours = NearestNeighbors(n_neighbors=6)
    svm_clf = SVC(gamma='scale', random_state=RND_SEED)
    sampler = SMOTE(
        random_state=RND_SEED,
        kind='svm',
        k_neighbors=neighbours,
        svm_estimator=svm_clf,
    )
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    # Expected resampled features, one sample per row.
    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.47436887, -0.2645749],
        [1.07844562, -0.19435291],
        [1.44228238, -1.31256615],
        [1.25636713, -1.04463226],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def test_fit_resample_nn_obj():
    """Check SMOTE with kind='borderline1' when both neighbour searches are passed as objects."""
    danger_neighbours = NearestNeighbors(n_neighbors=11)
    sample_neighbours = NearestNeighbors(n_neighbors=6)
    sampler = SMOTE(
        random_state=RND_SEED,
        kind='borderline1',
        k_neighbors=sample_neighbours,
        m_neighbors=danger_neighbours,
    )
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    # Expected resampled features, one sample per row.
    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.3765279, -0.2009615],
        [0.55276636, -0.10550373],
        [0.45413452, -0.08883319],
        [1.21118683, -0.22817957],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def train_decisiontree_with(configurationname, train_data, k, score_function,
                            undersam=False, oversam=False, export=False):
    """Select the k best features, optionally resample, and fit a decision tree.

    Args:
        configurationname: Label used in log output and in the exported
            ``.dot`` file name.
        train_data: Three-item sequence ``(X_train, y_train, id_to_a_train)``;
            the third item is unpacked but not used here.
        k: Number of features kept by ``SelectKBest``; must be positive.
        score_function: Scoring function handed to ``SelectKBest``.
        undersam: Apply under-sampling (RepeatedEditedNearestNeighbours).
        oversam: Apply over-sampling (SMOTE). When both flags are set,
            SMOTEENN is used instead.
        export: When true, write the fitted tree with ``export_graphviz``
            and post-process it with ``transform``.

    Returns:
        Tuple ``(selector, classifier)`` of the fitted feature selector and
        the fitted decision tree.
    """
    assert k > 0
    print("Training with configuration " + configurationname)
    X_train, y_train, _id_to_a_train = train_data
    classifier = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    selector = SelectKBest(score_function, k=k)
    fitted_selector = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = list(fitted_selector.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(y_train))
    # The two flags pick exactly one resampler (or none at all).
    if undersam and oversam:
        resampler = SMOTEENN(random_state=0)
    elif undersam:
        resampler = RepeatedEditedNearestNeighbours()
    elif oversam:
        resampler = SMOTE(random_state=42)
    else:
        resampler = None
    if resampler is not None:
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    classifier = classifier.fit(X_train, y_train, check_input=True)
    if export:
        dot_path = DATAP + "/temp/trees/sltree_" + configurationname + ".dot"
        export_graphviz(classifier, out_file=dot_path, filled=True)
        transform(fitted_ids, configurationname)

    print("Self Accuracy: " + str(classifier.score(X_train, y_train)))
    return selector, classifier
def test_sample_borderline2():
    """Check SMOTE with kind='borderline2' and default neighbour settings."""
    sampler = SMOTE(random_state=RND_SEED, kind='borderline2')
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    # Expected resampled features, one sample per row.
    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.47436888, -0.2645749],
        [1.07844561, -0.19435291],
        [0.33339622, 0.49870937],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
    ])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def test_sample_regular_half():
    """Check regular SMOTE when per-class target counts are given as a dict."""
    target_counts = {0: 9, 1: 12}
    sampler = SMOTE(sampling_strategy=target_counts, random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    # Expected resampled features, one sample per row.
    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.36784496, -0.1953161],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0,
    ])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
def test_sample_regular_with_nn():
    """Check regular SMOTE when ``k_neighbors`` is a NearestNeighbors object."""
    neighbours = NearestNeighbors(n_neighbors=6)
    sampler = SMOTE(random_state=RND_SEED, k_neighbors=neighbours)
    X_resampled, y_resampled = sampler.fit_resample(X, Y)

    # Expected resampled features, one sample per row.
    expected_X = np.array([
        [0.11622591, -0.0317206],
        [0.77481731, 0.60935141],
        [1.25192108, -0.22367336],
        [0.53366841, -0.30312976],
        [1.52091956, -0.49283504],
        [-0.28162401, -2.10400981],
        [0.83680821, 1.72827342],
        [0.3084254, 0.33299982],
        [0.70472253, -0.73309052],
        [0.28893132, -0.38761769],
        [1.15514042, 0.0129463],
        [0.88407872, 0.35454207],
        [1.31301027, -0.92648734],
        [-1.11515198, -0.93689695],
        [-0.18410027, -0.45194484],
        [0.9281014, 0.53085498],
        [-0.14374509, 0.27370049],
        [-0.41635887, -0.38299653],
        [0.08711622, 0.93259929],
        [1.70580611, -0.11219234],
        [0.29307743, -0.14670439],
        [0.84976473, -0.15570176],
        [0.61319159, -0.11571668],
        [0.66052536, -0.28246517],
    ])
    expected_y = np.array([
        0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
    ])

    assert_allclose(X_resampled, expected_X, rtol=R_TOL)
    assert_array_equal(y_resampled, expected_y)
# if y[i] == 1: # print(i) # count += 1 # print(count) dt = DecisionTreeClassifier() sm = SMOTE(sampling_strategy=0.2, k_neighbors=10) # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42) train_size: int = int(0.8 * len(x)) test_size: int = len(x) - train_size x_train = x[:train_size] x_test = x[train_size:] y_train = y[:train_size] y_test = y[train_size:] # SMOTE resampling and sort by time x_train, y_train = sm.fit_resample(x_train, y_train) entries = list(zip(x_train, y_train)) entries.sort(key=lambda in_: in_[0]) x_train, y_train = zip(*entries) x_train = list(x_train) y_train = list(y_train) for i in x_train: i.pop(0) for i in x_test: i.pop(0) # x_train = x_train[:, 1:] # x_test = x_test[:, 1:] # train_size = len(x_train) window_size = DEFAULT_INIT_WINDOW_SIZE # initialize the window size min_pos_por = 0.2 # minimum positive classified proportion
# Repeated k-fold evaluation of an SVC trained on SMOTE-oversampled folds.
# Per-fold sensitivity goes into RepeatSVMLinear and per-fold positive
# predictive value into PPVSVMLinear.
# NOTE(review): RepeatSVMLinear is not created in this span -- presumably
# allocated earlier with the same (nrepeat, nfold) shape; confirm.
PPVSVMLinear = np.zeros([nrepeat,nfold])
for i in range(nrepeat):
    # Fresh random partition of the observation numbers for every repeat.
    indexes = list(range(nobs))
    random.shuffle(indexes)
    dfs = np.array_split(indexes,nfold)
    for j in range(nfold):
        # Rows whose index label is in the j-th chunk form the test fold.
        # NOTE(review): this matches index labels, not positions -- assumes
        # X has a 0..nobs-1 index; confirm.
        index_bad = X.index.isin(dfs[j])
        X_test = X[index_bad]
        y_test = y[index_bad]
        X_train = X[~index_bad]
        y_train = y[~index_bad]
        #SMOTE
        # Only the training fold is oversampled; the test fold is left untouched.
        oversample = SMOTE(k_neighbors=7)
        X_train,y_train = oversample.fit_resample(X_train,y_train)
        # NOTE(review): the variable names and the printed label say "Linear",
        # but the classifier is built with kernel='rbf' -- confirm which is intended.
        linear_svc = svm.SVC(kernel='rbf',random_state=0, tol=1e-5, C = 1)
        linear_svc.fit(X_train, y_train)
        y_predict = linear_svc.predict(X_test)
        tn, fp, fn, tp = confusion_matrix(y_test.ravel(),y_predict).ravel()
        RepeatSVMLinear[i,j] = tp/(tp+fn)  # sensitivity (recall) of the fold
        PPVSVMLinear[i,j] = tp/(tp+fp)  # positive predictive value (precision) of the fold
# Report mean and population standard deviation, in percent, over all folds.
print('Linear')
print(mean(100*RepeatSVMLinear.ravel()))
print(pstdev(100*RepeatSVMLinear.ravel()))
print(mean(100*PPVSVMLinear.ravel()))
print(pstdev(100*PPVSVMLinear.ravel()))
def fitness(self, particle, metric):
    """Score the feature subset described by ``particle``.

    A classifier is chosen from ``self.mode``. The method first tries to
    reuse a confusion matrix stored through ``tab``; when that attempt
    raises ``AttributeError`` it runs a stratified 5-fold cross-validation,
    accumulates the confusion matrix over the folds and stores it with
    ``tab.add``.

    Args:
        particle: Object exposing ``position``, the selected feature columns.
        metric: Name of the value returned as ``score``. Accepts
            ``'accuracy'``/``'exactitude'``, ``'recall'``/``'rappel'``,
            ``'precision'``/``'précision'`` or ``'fscore'``; any other
            value falls back to accuracy.

    Returns:
        Tuple ``(score, accuracy, recall, precision, fscore, cols, matrix)``.
    """
    matrix_length = len(np.unique(self.data[self.target]))
    if self.mode == 'sgd':
        model = SGDClassifier(class_weight='balanced', loss='modified_huber', random_state=1)
    elif self.mode == 'svr':
        model = SVC(kernel='linear', class_weight='balanced', probability=True)
    elif self.mode == 'rdf':
        model = SVC(kernel='rbf', class_weight='balanced', probability=True)
    elif self.mode == 'pol':
        model = SVC(kernel='poly', class_weight='balanced', probability=True)
    elif self.mode == 'rdc':
        model = RandomForestClassifier(n_estimators=10, class_weight='balanced', random_state=1)
    elif self.mode == 'dtc':
        model = DecisionTreeClassifier(class_weight='balanced', random_state=1)
    elif self.mode == 'gdc':
        model = GradientBoostingClassifier(random_state=1)
    elif self.mode == 'etc':
        model = ExtraTreesClassifier(class_weight='balanced', random_state=1)
    elif self.mode == 'adc':
        model = AdaBoostClassifier(random_state=1)
    elif self.mode == 'bac':
        model = BaggingClassifier(random_state=1)
    elif self.mode == 'lda':
        model = LinearDiscriminantAnalysis()
    elif self.mode == 'qda':
        model = QuadraticDiscriminantAnalysis()
    elif self.mode == 'gnb':
        model = GaussianNB()
    elif self.mode == 'rrc':
        model = RidgeClassifier(class_weight='balanced')
    else:
        model = LogisticRegression(solver='liblinear', C=10.0, class_weight='balanced')
    k = model_selection.StratifiedKFold(5)
    try:
        # NOTE(review): ``posiion`` looks like a misspelling of ``position``.
        # Unless particles really define ``posiion``, this lookup raises
        # AttributeError and the cross-validation branch below always runs.
        # Left unchanged because the behaviour of ``tab.get`` on a miss is
        # not visible here -- confirm before correcting the spelling.
        tab_data, tab_val = tab.get([int(x) for x in particle.posiion],
                                    self.tab_data, self.tab_vals)
        tab_val = np.array(tab_val)
        correct = (utility.getTotalTruePositive(tab_val) +
                   utility.getTotalTrueNegative(tab_val))
        accuracy = correct / (correct +
                              utility.getTotalFalsePositive(tab_val) +
                              utility.getTotalFalseNegative(tab_val))
        precision_tab = []
        recall_tab = []
        for i in range(len(tab_val)):
            true_pos = utility.getTruePositive(tab_val, i)
            precision_tab.append(
                true_pos / (utility.getFalsePositive(tab_val, i) + true_pos))
            recall_tab.append(
                true_pos / (utility.getFalseNegative(tab_val, i) + true_pos))
        # Unweighted mean over the classes.
        precision = sum(precision_tab) / len(precision_tab)
        recall = sum(recall_tab) / len(recall_tab)
        fscore = 2 * (1 / ((1 / precision) + (1 / recall)))
        matrix = tab_val
        tmp = self.data.drop([self.target], axis=1)
        tmp = tmp.iloc[:, particle.position]
        cols = tmp.columns
        self.tab_find = self.tab_find + 1
    except AttributeError:
        matrix = np.zeros((matrix_length, matrix_length), dtype=int)
        X, y, cols = utility.ready(self, particle.position, self.data,
                                   self.dummiesList, self.createDummies,
                                   self.normalize)
        originalclass = []
        predictedclass = []
        for train_index, test_index in k.split(X, y):
            # Split in X
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            # Membership test: the previous ``== ('knn' or 'dct' or ...)``
            # only ever compared against 'knn'.
            # NOTE(review): 'dct' and 'gbc' match none of the mode keys
            # handled above ('dtc', 'gdc') -- confirm the intended names.
            if self.mode in ('knn', 'dct', 'gbc', 'lda', 'qda', 'adc', 'bac'):
                if self.mode == 'knn':
                    model = KNeighborsClassifier(
                        n_neighbors=int(len(X_train)**(1 / 2)))
                sm = SMOTE(sampling_strategy='auto')
                X_train, y_train = sm.fit_resample(X_train, y_train)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            originalclass.extend(y_test)
            predictedclass.extend(y_pred)
            matrix += confusion_matrix(y_test, y_pred)
        correct = (utility.getTotalTruePositive(matrix) +
                   utility.getTotalTrueNegative(matrix))
        accuracy = correct / (correct +
                              utility.getTotalFalsePositive(matrix) +
                              utility.getTotalFalseNegative(matrix))
        precision, recall, fscore, support = s(originalclass, predictedclass,
                                               average='macro')
        self.tab_data, self.tab_vals = tab.add(
            [int(x) for x in particle.position], matrix.tolist(),
            self.tab_data, self.tab_vals)
        self.tab_insert = self.tab_insert + 1
    # Membership tests: the previous ``metric == 'accuracy' or 'exactitude'``
    # was always truthy, so accuracy was returned for every metric.
    if metric in ('accuracy', 'exactitude'):
        score = accuracy
    elif metric in ('recall', 'rappel'):
        score = recall
    elif metric in ('precision', 'précision'):
        score = precision
    elif metric == 'fscore':
        score = fscore
    else:
        score = accuracy
    return score, accuracy, recall, precision, fscore, cols, matrix
X, y)

# We now know the gap between the training loss and the test loss. We may
# still be able to improve on it: the target is imbalanced, so balancing it
# with SMOTE may give a better evaluation.

# In[167]:
from imblearn.over_sampling import SMOTE

# In[169]:
# Oversample so the classes are balanced, then plot the resulting class counts.
smote = SMOTE(random_state = 402)
X_smote, Y_smote = smote.fit_resample(X,y)
sns.countplot(Y_smote, edgecolor = 'black')

# In[171]:
# Plot the learning curve again, this time on the SMOTE-balanced data.
plot_learning_curve(RandomForestClassifier( bootstrap=True, max_depth=10, max_features='auto', min_samples_leaf=2, min_samples_split=5, n_estimators=100), X_smote, Y_smote)

# In[ ]:
def initialise():
    """Load player data, balance it with SMOTE and fit a decision tree.

    Features are read from the ``pre_*`` tables and labels from the matching
    tournament tables, stacked over the four tournaments, scaled with
    ``scale_features``, oversampled with SMOTE, and split 70/30 before a
    ``DecisionTreeClassifier(max_depth=11)`` is fitted on the training part.

    Returns:
        Tuple ``(connection, desT, max_home, max_away, max_recent,
        max_career, feature_train, feature_test, target_train, target_test)``.
        The connection is returned open.
    """
    # Connect to the database
    connection = pymysql.connect(
        host='localhost',
        user='******',
        password='',
        db='crickml',
        charset='utf8mb4',
        cursorclass=pymysql.cursors.DictCursor,
    )

    # Fetch data for features (queries run in the order listed).
    print("Fetching player data for Features.....")
    feature_queries = (
        'SELECT * FROM pre_wc_2011',
        'SELECT * FROM pre_wc_2015',
        'SELECT * FROM pre_ct_2013',
        'SELECT * FROM pre_ct_2017',
    )
    players_2011, players_2015, players_2013, players_2017 = [
        fetch_data_pre(query, connection) for query in feature_queries
    ]

    # Fetch data for labels (queries run in the order listed).
    print("Fetching player data for Labels.....")
    label_queries = (
        'SELECT * FROM wc_2011',
        'SELECT * FROM wc_2015',
        'SELECT * FROM ct_2013',
        'SELECT * FROM ct_2017',
    )
    perf_2011, perf_2015, perf_2013, perf_2017 = [
        fetch_data_post(query, connection) for query in label_queries
    ]

    # Stack the tournaments in chronological order.
    np_players = np.concatenate(
        (players_2011, players_2013, players_2015, players_2017), axis=0)
    np_performances = np.concatenate(
        (perf_2011, perf_2013, perf_2015, perf_2017), axis=0)

    # The column maxima are passed to scale_features and also returned.
    max_career, max_recent, max_away, max_home = (
        np.max(np_players[:, column]) for column in range(4)
    )
    np_players = scale_features(
        np_players, max_career, max_recent, max_away, max_home)

    # NOTE(review): SMOTE is applied before the train/test split, so the test
    # part can contain synthetic samples -- confirm this is intended.
    oversampler = SMOTE(random_state=41)
    players_balanced, performances_balanced = oversampler.fit_resample(
        np_players, np_performances)

    # Train/test split using scikit-learn.
    feature_train, feature_test, target_train, target_test = train_test_split(
        players_balanced, performances_balanced,
        test_size=0.30, random_state=42)

    # Train Decision Tree model
    desT = DecisionTreeClassifier(max_depth=11)
    desT.fit(feature_train, target_train)

    return (connection, desT, max_home, max_away, max_recent, max_career,
            feature_train, feature_test, target_train, target_test)
def execute(project):
    """Select, refit and score classical classifiers on one project's dataset.

    Reads ``training.csv`` and ``testing.csv`` from the project's dataset
    directory, scales the features, balances the training set with SMOTE and
    runs ``EstimatorSelectionHelper`` with ``scoring='f1'``. The ten best
    configurations in the summary are refitted on the training set and
    scored on the test set. Scores, the data used, the predictions and the
    full summary are written as CSV files to the project's scores directory.

    Args:
        project: Object whose ``github()`` value names the per-project
            sub-directory.
    """
    dataset_dir = Config.get_work_dir_path(
        os.path.join("paper", "datasets", "traditional_designite", project.github()))
    Path(dataset_dir).mkdir(parents=True, exist_ok=True)
    training_path = os.path.join(dataset_dir, "training.csv")
    testing_path = os.path.join(dataset_dir, "testing.csv")

    # Textual booleans in the CSV files are mapped to integers.
    bool_as_int = {'True': 1, 'False': 0}
    training_df = pd.read_csv(training_path).dropna().replace(bool_as_int)
    testing_df = pd.read_csv(testing_path).dropna().replace(bool_as_int)

    training_y = training_df.pop('Bugged').values
    training_X = preprocessing.scale(training_df.values)
    oversample = SMOTE()
    training_X, training_y = oversample.fit_resample(training_X, training_y)

    models = {
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'LogisticRegression': LogisticRegression(),
        'BernoulliNaiveBayes': BernoulliNB(),
        'K-NearestNeighbor': KNeighborsClassifier(),
        'DecisionTree': DecisionTreeClassifier(),
        'RandomForest': RandomForestClassifier(),
        'SupportVectorMachine': SVC(),
    }
    params = {
        'LinearDiscriminantAnalysis': {},
        'QuadraticDiscriminantAnalysis': {},
        'LogisticRegression': {
            'C': list(np.logspace(-4, 4, 3))
        },
        'BernoulliNaiveBayes': {},
        'K-NearestNeighbor': {},
        'DecisionTree': {
            'criterion': ['gini', 'entropy'],
        },
        'RandomForest': {
            'n_estimators': [10, 100]
        },
        'SupportVectorMachine': {
            'C': [0.1, 100]
        },
    }

    helper = EstimatorSelectionHelper(models, params)
    helper.fit(training_X, training_y, scoring='f1')
    summary = helper.score_summary()
    top_summary = summary[:10]
    # Drop the score columns and turn missing values into None so that
    # unset parameters can be filtered out below.
    top_summary_iter = top_summary.drop(EstimatorSelectionHelper.get_scores_info(), axis=1)\
        .where(pd.notnull(top_summary), None)\
        .iterrows()

    testing_y = testing_df.pop('Bugged').values
    # NOTE(review): the test set is standardised with its own statistics,
    # not those of the training set -- confirm this is intended.
    testing_X = preprocessing.scale(testing_df.values)

    models_info = [row.to_dict() for _, row in top_summary_iter]
    score_columns = [
        'estimator', 'configuration', 'precision', 'recall', 'f1-measure',
        'auc-roc', 'brier score'
    ]
    # Rows are collected and the frame is built once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the frame on every call.
    score_rows = []
    predictions = []
    for model_info in models_info:
        estimator = models[model_info['estimator']]
        estimator_params = {
            key: val
            for key, val in model_info.items()
            if not (val is None or key == 'estimator')
        }
        estimator.set_params(**estimator_params)
        estimator.fit(training_X, training_y)
        prediction_y = estimator.predict(testing_X)
        predictions.append(prediction_y)
        # NOTE(review): auc-roc and brier score are computed from hard class
        # predictions, not probabilities -- confirm this is intended.
        score_rows.append({
            'estimator': model_info['estimator'],
            'configuration': str(estimator_params),
            'precision': precision_score(testing_y, prediction_y),
            'recall': recall_score(testing_y, prediction_y),
            'f1-measure': f1_score(testing_y, prediction_y),
            'auc-roc': roc_auc_score(testing_y, prediction_y),
            'brier score': brier_score_loss(testing_y, prediction_y)
        })
    scores = pd.DataFrame(score_rows, columns=score_columns)

    scores_dir = Config.get_work_dir_path(
        os.path.join("paper", "scores", "traditional_designite", project.github()))
    Path(scores_dir).mkdir(parents=True, exist_ok=True)
    scores_path = os.path.join(scores_dir, "scores.csv")
    training_x_path = os.path.join(scores_dir, "training_x.csv")
    training_y_path = os.path.join(scores_dir, "training_y.csv")
    testing_x_path = os.path.join(scores_dir, "testing_x.csv")
    testing_y_path = os.path.join(scores_dir, "testing_y.csv")
    prediction_y_path = os.path.join(scores_dir, "prediction_y.csv")
    prediction_real_y_path = os.path.join(scores_dir, "prediction_real_y.csv")
    summary_path = os.path.join(scores_dir, "summary.csv")

    scores.to_csv(scores_path, index=False)
    pd.DataFrame(data=training_X, columns=training_df.columns).to_csv(training_x_path, index=False)
    pd.DataFrame(data=training_y, columns=['Bugged']).to_csv(training_y_path, index=False)
    pd.DataFrame(data=testing_X, columns=training_df.columns).to_csv(testing_x_path, index=False)
    pd.DataFrame(data=testing_y, columns=['Bugged']).to_csv(testing_y_path, index=False)

    # One prediction column per evaluated configuration, labelled with the
    # configuration's dict representation.
    prediction_columns = [str(model_info) for model_info in models_info]
    pd.DataFrame(data=np.array(predictions).transpose(),
                 columns=prediction_columns).to_csv(prediction_y_path, index=False)
    # Same table again with the true labels as a final "real" column.
    predictions.append(testing_y)
    prediction_columns.append("real")
    pd.DataFrame(data=np.array(predictions).transpose(),
                 columns=prediction_columns).to_csv(prediction_real_y_path, index=False)
    summary.to_csv(summary_path, index=False)
def train_resampled(X, y):
    """Oversample ``(X, y)`` with SMOTE (fixed seed 42) and return the resampled pair."""
    oversampler = SMOTE(random_state=42)
    resampled_X, resampled_y = oversampler.fit_resample(X, y)
    return resampled_X, resampled_y
def get_data(plot=True):
    """Train a gradient-boosting fracture classifier and return its ROC data.

    The label ``FRAC`` is derived from ``mros_1103snps.xlsx`` through
    ``fill_empty_cell``; the features are read from
    ``norma_continu_var.xlsx``. The data is split 80/20, the training part
    is balanced with SMOTE, and the model returned by ``RandomSearch`` is
    fitted on it.

    Args:
        plot: When true, draw the ROC curve of class 1 with scikit-plot and
            show it.

    Returns:
        Tuple ``(fpr, tpr, thresh, auc)`` computed on the held-out test part.
    """
    data = pandas.read_excel('mros_1103snps.xlsx')
    # drop HA_SLDFXFU where only 10% is filled, drop subjectid,
    data.drop(['HA_SLDFXFU', 'TURSMOKE', 'HA_SLDFX', 'HA_WRSTFX'],
              axis=1,
              inplace=True)
    data['FRAC'] = 0
    data['STATUS'] = 0
    data['DAYS'] = 0
    # make the fractures into 1 variable
    for attribute in data.keys():
        data[attribute] = data.apply(
            lambda sample: fill_empty_cell(sample, attribute, data), axis=1)
    # encode the categorical data
    data = pandas.DataFrame(
        pandas.get_dummies(data, columns=['GIERACE', 'PHYS_MROS', 'NFWLKSPD']))

    # setting Y and X
    # NOTE(review): X comes from a separate workbook -- assumes its rows line
    # up with ``data``; confirm.
    Y = data['FRAC']
    X = pandas.read_excel('norma_continu_var.xlsx')

    # smote: only the training part is oversampled.
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    # ``sampling_strategy`` replaces the ``ratio`` keyword, which
    # imbalanced-learn no longer accepts.
    sm = SMOTE(random_state=2, sampling_strategy=1.0)
    x_train_s, y_train_s = sm.fit_resample(x_train, y_train)

    parameters = {
        'loss': ['deviance', 'exponential'],
        'n_estimators': [200, 500, 800, 1000, 1200],
        'learning_rate': [0.001, 0.003, 0.005],
        'subsample': [0.3, 0.5, 0.7],  # <1.0 results in reduction of variance and increase in bias
        'min_samples_split': [2, 5, 8],
        'max_features': ['auto', 'log2', 'sqrt', 0.2],
        'random_state': [42],
        'max_depth': [2, 3, 5],
        'min_impurity_decrease': [0.15, 0.1, 0.08, 0.05]
    }
    # split data for parameter sweep
    model = RandomSearch(estimator=GradientBoostingClassifier(),
                         modelName='Gradient Boosting Classifier',
                         params=parameters,
                         Xtrain=x_train_s,
                         ytrain=y_train_s,
                         Xtest=x_test,
                         ytest=y_test,
                         score='roc_auc')
    model.fit(x_train_s, y_train_s)
    print(model.feature_importances_)

    # Class probabilities are computed once and reused for the plot.
    y_probas = model.predict_proba(x_test)
    yscore = [row[1] for row in y_probas]  # probability of class 1
    fpr, tpr, thresh = roc_curve(y_test, yscore)
    auc = roc_auc_score(y_test, yscore)

    # plot roc curve
    if plot:
        ytest = numpy.array(y_test)
        scikitplot.metrics.plot_roc(
            ytest,
            y_probas,
            plot_macro=False,
            plot_micro=False,
            classes_to_plot=[1],
            title='ROC Curve by Gradient Boosting Model')
        pyplot.show()
    return fpr, tpr, thresh, auc
def svm_func(train_A, words_of_tweets, extra_features, feature_selection, encoding, print_file):
    """Fit a Gaussian naive Bayes model on the first K-fold training split.

    Despite the name, no SVM is built here: the estimator is
    ``naive_bayes.GaussianNB``.

    NOTE(review): there is an unconditional ``return model`` directly after
    ``model.fit`` inside the fold loop.  As written, the function therefore
    returns during the first fold and everything below that statement (SMOTE
    resampling, prediction, per-fold metrics, the averaged report) never
    runs.  Confirm whether this early return is a deliberate shortcut or a
    leftover from debugging before relying on the evaluation code.

    Parameters
    ----------
    train_A :
        Object indexed with ``train_A['label']`` to obtain the targets.
    words_of_tweets :
        Sequence converted with ``np.array`` and split by ``KFold``.
    extra_features, feature_selection, encoding :
        Forwarded unchanged to ``reading.get_enc``.
    print_file :
        Path opened in append mode for the textual log; also forwarded to
        ``reading.get_enc``.

    Returns
    -------
    The ``GaussianNB`` instance fitted on the first fold's training data.
    """
    reading = Twitter_Depression_Detection.Reader()  # Import the Twitter_Depression_Detection.py file, to get the encoding
    print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    print(words_of_tweets)
    x = np.array(words_of_tweets)
    y = train_A['label']

    # Initialize the roc-auc score running average list
    # Initialize a count to print the number of folds
    # Initialize metrics to print their average
    av_roc = 0.
    count = 0
    precision = 0
    accuracy = 0
    recall = 0
    f1score = 0
    # Below 3 variables are used for ROC-AUC curve
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    # Initialize your 10-fold cross validation
    # Set shuffle equals True to randomize your splits on your training data
    kf = KFold(n_splits=10, random_state=41, shuffle=True)

    # Set up for loop to run for the number of cross vals you defined in your parameter
    for train_index, test_index in kf.split(x):
        count += 1
        print('Fold #: ', count)

        with open(print_file, "a") as myfile:  # Write above print into output file
            myfile.write('Fold #: ' + str(count) + '\n')

        # This indexes your train and test data for your cross validation and sorts them in random order,
        # since we used shuffle equals True.
        # The second positional argument of get_enc is 1 for the training split and 0 for the test split.
        x_train, x_test = reading.get_enc(x[train_index], 1, y[train_index], train_index, extra_features,
                                          feature_selection, encoding, print_file), reading.get_enc(
            x[test_index], 0, y[test_index], test_index, extra_features, feature_selection, encoding, print_file)
        y_train, y_test = y[train_index], y[test_index]

        # Assumed you have, X (predictor) and Y (target) for training data set and x_test(predictor) of test_dataset
        # Create SVM classification object
        # For very large C, the margin is hard, and points cannot lie in it. For smaller C, the margin is softer, and can grow to encompass some points.
        # gamma: Higher the value of gamma, will try to exact fit the training data set i.e.generalization error and cause over-fitting problem.
        # NOTE(review): the comments above describe an SVM, but the estimator actually created is GaussianNB.
        model = naive_bayes.GaussianNB()

        #######################################################################################################################
        # Feature Scaling
        # The scaler is instantiated but never applied: both transform calls below are commented out.
        minMaxScaler = MinMaxScaler(feature_range=(0, 1))
        # Get points and discard classification labels
        #x_train = minMaxScaler.fit_transform(x_train)
        #x_test = minMaxScaler.transform(x_test)
        #######################################################################################################################

        # The sampler is only used after the `return` below, so the model is fitted on the un-resampled split.
        oversample = SMOTE(sampling_strategy='minority', k_neighbors=10, random_state=0)
        model.fit(x_train, y_train)
        # NOTE(review): early exit on the first fold; every statement after this line is unreachable.
        return model
        #######################################################################################################################
        # Visualization of normal and oversampled data
        '''visualize_data(x_train, y_train, "Normal Dataset")'''
        # 'minority': resample only the minority class;
        x_train, y_train = oversample.fit_resample(x_train, y_train)
        '''visualize_data(x_train, y_train, "Oversampled Dataset")'''
        #######################################################################################################################

        model.score(x_train, y_train)
        # Predict Output
        y_pred = model.predict(x_test)
        #return model

        #######################################################################################################################

        # Your model is fit. Time to predict our output and test our training data
        print("Evaluating model...")

        with open(print_file, "a") as myfile:  # Write above print into output file
            myfile.write("Evaluating model..." + '\n')

        #roc = roc_auc_score(y_test, y_pred)

        # Print your ROC-AUC score for your kfold, and the running score average
        #print('ROC: ', roc)
        #av_roc += roc
        #print('Continued Avg: ', av_roc / count)

        #with open(print_file, "a") as myfile: # Write above print into output file
        #myfile.write('ROC: ' + str(Continued Avg: ' + str(av_roc / count) + '\n')

        #y_pred = (y_pred > 0.5)

        # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        '''
        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (count - 1, roc_auc))
        '''
        # -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
        '''
        # Creating the Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        print(cm)

        with open(print_file, "a") as myfile: # Write above print into output file
            myfile.write(str(cm) + '\n')
        '''
        print(y_pred)
        temp_accuracy = accuracy_score(y_test, y_pred)
        temp_precision, temp_recall, temp_f1_score, _ = precision_recall_fscore_support(y_test, y_pred,
                                                                                        average='macro')

        # Accumulate the per-fold metrics; they are divided by 10 after the loop.
        accuracy += temp_accuracy
        precision += temp_precision
        recall += temp_recall
        f1score += temp_f1_score

        print("Accuracy: ", temp_accuracy)
        print("Precision: ", temp_precision)
        print("Recall: ", temp_recall)
        print("F1 score: ", temp_f1_score)
        print(metrics.classification_report(y_test,y_pred))

    # Create ROC-AUC curve
    # compute_ROC_Curve(tprs, mean_fpr, aucs)

    ##########################################################################################################################

    # Print average of metrics (the divisor 10 matches n_splits above)
    print("Average Precision: ", precision / 10)
    print("Average Accuracy: ", accuracy / 10)
    print("Average Recall: ", recall / 10)
    print("Average F1-score: ", f1score / 10)

    # Print your final average ROC-AUC score and organize your models predictions in a dataframe
    #print('Average ROC:', av_roc / 10)

    with open(print_file, "a") as myfile:  # Write above print into output file
        # av_roc is never updated (its accumulation is commented out), so 'Average ROC:' would always log 0.0.
        myfile.write("Average Precision: " + str(precision / 10) + '\n' + "Average Accuracy: " + str(accuracy / 10) + '\n' + "Average Recall: " + str(recall / 10) + '\n' + "Average F1-score: " + str(f1score / 10) + '\n' + 'Average ROC:' + str(av_roc / 10) + '\n')
df.head()

# Target: the binarised popularity flag.
y = df["bi_popularity"]

# Everything that is either the target, an identifier or free text is
# removed from the feature matrix.
non_feature_columns = [
    "popularity", "track_id", "year", "bi_popularity", "genre",
    "artist_name", "track_name", "release_date"
]
categorical_columns = ["mode", "key", "time_signature"]

# One-hot encode the categorical audio descriptors.
X = pd.get_dummies(df.drop(columns=non_feature_columns),
                   columns=categorical_columns)
X.head()

from imblearn.over_sampling import SMOTE

# NOTE(review): the data is oversampled *before* the train/test split below,
# so synthetic rows can end up in the test set — confirm this is intended.
os = SMOTE()
X_oversample, y_oversample = os.fit_resample(X, y)

# Features shape
X_oversample.shape
"""Split and Train"""

# Split the oversampled dataset into training and testing sets.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_oversample,
    y_oversample,
    random_state=1,
)
X_train.shape

# XGBoost model
from xgboost import XGBClassifier
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0) Classifier = RandomForestClassifier(n_estimators=1000) Classifier = Classifier.fit(X_train, Y_train) Y_pred = Classifier.predict(X_test) ModelPerformanceMetrics(Y_test, Y_pred) # Try SMOTE X = df.drop(['Bankrupt?'], axis=1) Y = df['Bankrupt?'] sm = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=0) X_smote, Y_smote = sm.fit_resample(X, Y) X_train, X_test, Y_train, Y_test = train_test_split(X_smote, Y_smote, test_size=0.2, random_state=0) Classifier = LogisticRegression(max_iter=1000) Classifier = Classifier.fit(X_train, Y_train) Y_pred = Classifier.predict(X_test) ModelPerformanceMetrics(Y_test, Y_pred) # Try XGBoost X = df.drop(['Bankrupt?'], axis=1) Y = df['Bankrupt?'] X_train, X_test, Y_train, Y_test = train_test_split(X,
if IS_SMOTE:
    # Earlier variant that built an explicit per-class target ratio:
    # num_y = []
    # for i in range(2):
    #     num_y.append(np.sum(y == i))
    # max_num_y = max(num_y)
    # _ratio = {0: max_num_y, 1: max_num_y}
    # smote = SMOTE(sampling_strategy=_ratio, random_state=71)

    # Oversample with SMOTE's default sampling strategy.
    smote = SMOTE(random_state=42)
    X, y = smote.fit_resample(X, y)

    # Report how many rows each of the two labels has after resampling.
    for label in range(2):
        print(f"Data number resampled => {label}: {np.sum(y == label)}")

Xs = []
ys = []

# Bootstrap sampling: one resample (same size as X) per level-2 ensemble
# member.
for i in range(L2_ENSEMBLE_NUMBER):
    boot_X, boot_y = resample(X, y, n_samples=len(X))
    Xs.append(boot_X)
    ys.append(boot_y)
    print("l2_ensemble_{} was created.".format(i + 1))
def test_smote_error_passing_estimator(smote_params, err_msg):
    """Check that ``fit_resample`` raises ``ValueError`` matching ``err_msg``.

    ``smote_params`` is expanded as keyword arguments of ``SMOTE``; the
    sampler is built outside the ``raises`` context so only the resampling
    call itself is expected to fail.
    """
    sampler = SMOTE(**smote_params)
    expected_failure = pytest.raises(ValueError, match=err_msg)
    with expected_failure:
        sampler.fit_resample(X, Y)
def fix_imbalance(self, data, target, threshold=10.0, oversample=True, smote=False):
    """
    Method Name: fix_imbalance
    Description: This method will be used to handle unbalanced datasets (rare classes)
                 through oversampling / undersampling techniques.
    Input Description:
        data: the input dataframe with the target column.
        target: name of the target column inside ``data``.
        threshold: the threshold (in percent) of mismatch between the target values
                   to perform balancing. For two classes, balancing happens when the
                   rarest class makes up less than ``threshold`` percent of the rows;
                   for any other number of classes, it is repeated while the spread
                   between the most and least frequent class exceeds
                   ``100 - threshold`` percentage points.
        oversample: when True (and ``smote`` is False) use RandomOverSampler,
                    otherwise RandomUnderSampler.
        smote: when True use SMOTE; takes precedence over ``oversample``.
    Output: A dataframe made of the (possibly resampled) features and target.
    On Failure: Raise Exception (chained to the original error).

    Written By: Punit Nanda
    Version: 1.2
    Revisions: None
    """

    def class_percentages(labels):
        # Share of every class, expressed in percent of all rows.
        return labels.value_counts() / float(len(labels)) * 100

    def make_sampler(oversampling_strategy):
        # The flags are read here instead of rebinding `smote`, so the boolean
        # parameter is never overwritten by a sampler instance.
        if smote:
            return SMOTE(sampling_strategy=oversampling_strategy)
        if oversample:
            return RandomOverSampler(sampling_strategy=oversampling_strategy, random_state=42)
        return RandomUnderSampler(sampling_strategy='auto', random_state=42)

    try:
        X = data.drop(target, axis=1)
        y = data[target]

        if y.nunique() == 2:
            # The previous `(...).any() < threshold` compared a boolean with
            # the threshold and was therefore always true; what matters is
            # whether the rarest class falls below the threshold.
            if class_percentages(y).min() < threshold:
                X, y = make_sampler('auto').fit_resample(X, y)
        else:
            def spread_too_large(labels):
                pct = class_percentages(labels)
                return pct.max() - pct.min() > 100.0 - threshold

            # Each pass lifts the current minority class (or undersamples all
            # classes); repeat until the spread is acceptable.
            while spread_too_large(y):
                X, y = make_sampler('minority').fit_resample(X, y)

        return pd.concat([X, y], axis=1)

    except Exception as e:
        # Keep the historical exception type but preserve the root cause.
        raise Exception("fix_imbalance failed: {}".format(e)) from e
for i in range(9): img = cv.imread( "F:\\360MoveData\\Users\\1\\Desktop\\zhinengzhizao\\cv\\cv-img\\test/ex-2/2/" + str(i) + ".tiff", 0) img_new = img.ravel() train_label[i] = 2 for j in range(29241): train_data[i, j] = img_new[j] for m in range(9, 13): img = cv.imread( "F:\\360MoveData\\Users\\1\\Desktop\\zhinengzhizao\\cv\\cv-img\\test/ex-2/3/" + str(m - 9) + ".tiff", 0) img_new = img.ravel() train_label[m] = 3 for n in range(29241): train_data[m, n] = img_new[n] return train_data, train_label train_data, train_label = load_picture() model_smote = SMOTE(k_neighbors=3) new_data, new_label = model_smote.fit_resample(train_data, train_label) for i in range(13, 18): img = new_data[i].reshape(171, 171) cv.imwrite( "F:\\360MoveData\\Users\\1\\Desktop\\zhinengzhizao\\cv\\cv-img\\test/ex-2/3-sup/" + str(i) + ".tiff", img)
def imbalance_handler(
        mode: str,
        parent_class_label: str):
    """
    Purpose
    -------
    The purpose of this function is to provide the user a tool that allows
    them to easily manipulate their training and/or test dataset so that
    it is significantly more balanced between its classes. One would want
    to do this in order to improve the reliability of their classifier that
    will get trained on this dataset (see 1. in the References section for
    more information about this).

    **Note, however, that if a class has only 5 or fewer article instances
    that belong to it, it will be dropped completely due to the fact that
    the SMOTE and ENN algorithms used in this function rely on at least 6
    nearest-neighbors of a class to exist. If this class label is
    particularly important and you would like to keep it around, then
    obtain more data for it.**

    Parameters
    ----------
    mode : str
        This string allows the user to specify how they would like the
        imbalancing of the dataset to be handled. Matching is
        case-insensitive and hyphens are ignored. The available options
        include:
            1. "smote" - In this mode, the only algorithm that will be
                         implemented to make the dataset more balanced is
                         the over-sampling algorithm SMOTE. See 1., 3., 4.,
                         and 5. in the References section for more
                         information about this algorithm.
            2. "enn" - In this mode, the only algorithm that will be
                       implemented to make the dataset more balanced is the
                       under-sampling algorithm Edited-Nearest Neighbors
                       (ENN). See 1. and 6. for more information about this
                       algorithm.
            3. "smote-enn" - In this mode, this function will implement
                             both the SMOTE and ENN algorithms; SMOTE will
                             oversample to make the classes balanced and
                             ENN will under-sample to remove any newly
                             generated samples in the minority class(es)
                             that are not helpful. See 1. and 7. for more
                             information about the benefits of using this
                             2-step process and for how this is implemented
                             in the imbalanced-learn module.

    parent_class_label : str
        This string represents the class label that is the Parent class of
        all of the sub-classes that will be distinguished and predicted by
        a classifier that you wish to build. I.e., if you want to build a
        classifier for the children of the "Auto Type" class (which
        includes "Budget Cars", "Concept Cars", and "Luxury Cars" to name
        a few), then you simply have to pass in the "Auto Type" string to
        this parameter.

    Returns
    -------
    to_return : (Sparse Numpy Array, Numpy Array)
        The former element represents the new feature matrix (where some
        rows correspond to the article instances that were synthetically
        generated if the user specified for over-sampling to occur) and the
        latter element represents the new class labels. Note that the
        number of rows in both these array objects are the same since each
        row of the two correspond to the same (real or synthetic) article
        instance.

    Raises
    ------
    ValueError
        If `mode` is not one of the options listed above.

    References
    ----------
    1. https://towardsdatascience.com/guide-to-classification-on-imbalanced-datasets-d6653aa5fa23
    2. https://imbalanced-learn.readthedocs.io/en/stable/index.html
    3. https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
    4. https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html
    5. https://www.kite.com/blog/python/smote-python-imbalanced-learn-for-oversampling/
    6. https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.under_sampling.EditedNearestNeighbours.html#imblearn.under_sampling.EditedNearestNeighbours
    7. https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.combine.SMOTEENN.html
    """
    # First, get values for the parameters that we will need to use for
    # the rest of the function.
    normalized_mode = "".join(mode.lower().split("-"))

    # Reject unknown modes before any data is loaded; previously such a
    # call only failed at the very end with an unbound-variable error.
    if normalized_mode not in ("smote", "enn", "smoteenn"):
        raise ValueError(
            "Unsupported mode {!r}; expected 'smote', 'enn' or "
            "'smote-enn'.".format(mode))

    child_tier_lvl, raw_articles_df = class_data_retrival(
        parent_class_label,
        give_child_tier_lvl=True)

    # Before performing any transformations on our data, we need to
    # double check that it is suitable for the BOWs and balance
    # correcting model. If it is not, then perform any corrections
    # necessary.
    child_tier_label = "Tier{}".format(child_tier_lvl)
    counts_of_classes = raw_articles_df[child_tier_label].value_counts()

    # NOTE(review): the docstring promises to drop classes with 5 or fewer
    # articles, but `<= 6` also drops classes with exactly 6 — confirm
    # which of the two is intended.
    counts_checker = counts_of_classes.values <= 6
    labels_with_less = counts_of_classes.index.values[counts_checker]

    if labels_with_less.size > 0:
        # Classes that are too small cannot be handled by the over/under-
        # sampling algorithms that inspect nearest neighbours, so every
        # article belonging to one of them is dropped, however many such
        # classes there are.
        rows_to_keep = ~raw_articles_df[child_tier_label].isin(
            labels_with_less)
        articles_df = raw_articles_df[rows_to_keep]
    else:
        # All the article counts for each class pass the test :).
        articles_df = raw_articles_df

    # Next, obtain your X (features) matrix and your y (labels) vector.
    _, featue_matrix = bag_of_words_converter(mode="tfidf",
                                              parent_class_label=None,
                                              articles_df=articles_df,
                                              upper_n_gram=2,
                                              upper_features=300,
                                              apply_PCA=True)
    labels_arr = np.array(
        articles_df[child_tier_label].tolist())

    # Next, implement the algorithm the user has specified.
    if normalized_mode == "smote":
        # Oversample with the SMOTE algorithm only.
        sampler = SMOTE(random_state=169, n_jobs=3)
    elif normalized_mode == "enn":
        # Undersample with the Edited Nearest Neighbours algorithm only.
        sampler = EditedNearestNeighbours(sampling_strategy="auto",
                                          n_jobs=3)
    else:
        # "smoteenn": oversample with SMOTE and then clean the new set of
        # samples by undersampling with ENN, using imblearn's combined
        # sampler.
        sampler = SMOTEENN(random_state=169, n_jobs=3)

    final_feature_matrix, final_labels_arr = sampler.fit_resample(
        featue_matrix, labels_arr)

    to_return = (final_feature_matrix, final_labels_arr)
    return to_return
if x == 'Positive' else 0) vectorizer = TfidfVectorizer() tfidf_model = vectorizer.fit_transform(df['review'].values) X = df['review'] y = df['user_sentiment'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) #Transforming the X_train and X_test using the tf-idf model X_train = vectorizer.transform(X_train) X_test = vectorizer.transform(X_test) smt = SMOTE() X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train) log_model = LogisticRegression() params = { 'C': np.logspace(-10, 1, 15), 'class_weight': [None, 'balanced'], 'penalty': ['l1', 'l2'] } cv = StratifiedKFold(n_splits=5, random_state=100, shuffle=True) # Create grid search using 5-fold cross validation clf_LR = GridSearchCV(log_model, params, cv=cv, scoring='roc_auc', n_jobs=-1) clf_LR.fit(X_train_sm, y_train_sm)
sns.heatmap( abs(df.corr()), annot=True ) #Shows the features correlation. ps: there's a bug on matplotlib #3.1.1. plt.show() # IN [6]: to split the data and balance it. X_train, X_test, y_train, y_test = train_test_split(df.drop(['target'], axis=1), df['target'], test_size=0.2, random_state=2) X_train.head() X_train.shape # checking if balanced? y_train.mean() sm = SMOTE(random_state=1) X_train_balanced, y_train_balanced = sm.fit_resample(X_train, y_train) # Balancing it X_train_balanced.shape y_train_balanced.shape y_train_balanced.mean() # IN [7]: Building models Decision Tree and Random Forest. dt = DecisionTreeClassifier(random_state=2) print('CV score:', cross_val_score(dt, X_train_balanced, y_train_balanced, cv=3).mean()) rf = RandomForestClassifier(n_estimators=100, random_state=2) print('CV score:', cross_val_score(rf, X_train_balanced, y_train_balanced, cv=3).mean()) rf_grid = RandomForestClassifier(random_state=2) # creates a new estimator # IN [8]: Random Forest. # Create the parameter grid based on the results of random search rf_param_grid = {
X_test_RFE = pd.DataFrame(rfe.transform(X_test), columns=X_test.columns[rfe.support_]) #X_train_RFE = X_train[rfe.support_] print(X_train.shape, X_train_RFE.shape) #%% from imblearn.over_sampling import ADASYN, SMOTE, SVMSMOTE #OVERSAMPLING sampler_smote = SMOTE(n_jobs=-1) sampler_svm = SVMSMOTE(n_jobs=-1) sampler_adasyn = ADASYN(n_jobs=-1) X_smote, y_smote = sampler_smote.fit_resample(X=X_train, y=y_train.ravel()) X_svm, y_svm = sampler_svm.fit_resample(X=X_train, y=y_train.ravel()) X_adasyn, y_adasyn = sampler_adasyn.fit_resample(X=X_train, y=y_train.ravel()) #%% #baseline rf = ensemble.RandomForestClassifier(n_estimators=100, max_depth=8, criterion="entropy", n_jobs=-1) rf.fit(X=X_train, y=y_train.ravel()) #FROM NOW ON, USE THE TUNED VERSION ALTHOUGH WE SHOULD RE-TUNE #weight classes rf_balanced = ensemble.RandomForestClassifier(
def train_model(paras):
    """Train an MSI classifier on SMOTE-balanced data and pickle it.

    The CSV named by ``paras.input[0]`` is loaded (first column as index,
    rows with missing values dropped).  Its ``msi`` column is the label —
    ``MSI``, ``MSI-H`` and ``MSI_H`` (case-insensitive) count as positive —
    and every other column is used as a feature.  The classifier returned by
    ``build_classrfier`` is fitted on the SMOTE-resampled data; the reported
    metrics are computed on the original (un-resampled) input rows.

    Three objects are pickled one after another into ``paras.model[0]``:
    the fitted classifier, the description dict and the feature name list.

    Parameters
    ----------
    paras :
        Object whose attributes ``input``, ``model``, ``cancer_type``,
        ``classifier``, ``input_description``, ``model_description``,
        ``positive_num``, ``author`` and ``email`` are each indexed with
        ``[0]``.
    """
    input_path = paras.input[0]
    model_path = paras.model[0]
    cancer_type = paras.cancer_type[0]
    classifier_name = paras.classifier[0]
    input_description = paras.input_description[0]
    model_description = paras.model_description[0]
    positive_num = paras.positive_num[0]
    author = paras.author[0]
    email = paras.email[0]

    input_df = pd.read_csv(input_path, index_col=0).dropna()
    genes = input_df.columns.to_list()
    genes.remove("msi")
    msi_status = input_df["msi"].to_list()
    positive_labels = ("MSI", "MSI-H", "MSI_H")
    y_label = [1 if status.upper() in positive_labels else 0
               for status in msi_status]
    class_num = Counter(y_label)
    if class_num[1] < positive_num:
        # NOTE(review): this only logs; training still continues below.
        # Confirm whether the function should stop here instead.
        logger.error(
            "The No. of MSI sample lower than the minimum values, Please set with -p."
        )

    x = input_df[genes]
    smo = SMOTE(random_state=1)
    x_balanced, y_label_balanced = smo.fit_resample(x, y_label)
    classifier = build_classrfier(type=classifier_name)
    classifier.fit(x_balanced, y_label_balanced)

    # Evaluation is done on the original rows, i.e. on training data.
    y_pre = classifier.predict(x)
    y_pre_pro = classifier.predict_proba(x)[:, 1]
    roc_auc = roc_auc_score(y_label, y_pre_pro)
    conf_matrix = confusion_matrix(y_label,
                                   y_pre,
                                   labels=None,
                                   sample_weight=None)
    TN = conf_matrix[0][0]
    FN = conf_matrix[1][0]
    TP = conf_matrix[1][1]
    FP = conf_matrix[0][1]
    sensitivity = TP / (TP + FN)
    precision = TP / (TP + FP)
    specificity = TN / (TN + FP)
    f_score1 = f1_score(y_label, y_pre)
    accuracy = (TP + TN) / (FN + FP + TN + TP)

    description = {
        "Cancer Type": cancer_type,
        "Classifier": classifier_name,
        "Input Description": input_description,
        "Model Description": model_description,
        "Available Trained Sample": len(input_df),
        "Positive Sample": class_num[1],
        "Negative Sample": class_num[0],
        "Accuracy": accuracy,
        "AUC": roc_auc,
        "F1 Score": f_score1,
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "Precision": precision,
        "Model Path": os.path.abspath(model_path),
        # Author and e-mail are plain text, not filesystem paths, so they
        # are stored verbatim instead of being run through abspath().
        "Author": author,
        "Email": email,
    }
    for item, value in description.items():
        logger.info(item + ": {}".format(value))

    # The reader must call pickle.load three times, in this order.
    with open(model_path, 'wb') as f:
        pickle.dump(classifier, f)
        pickle.dump(description, f)
        pickle.dump(genes, f)
# Split both frames into model inputs and their companion column
# (targets for the training frame, sample ids for the test frame).
input_data_train, output_data_train = slice_data(train_data_frame)
input_data_test, sample_ids = slice_data(test_data_frame)

# Class distribution before any resampling.
count_per_class(output_data_train)

# When the resample flag is set, the training set is extended with
# synthetic samples generated by SMOTE.
if resample:
    resampler = SMOTE(
        sampling_strategy='auto',
        random_state=42,
        k_neighbors=5,
        n_jobs=4,
    )
    print("Resampling data")
    input_data_train, output_data_train = resampler.fit_resample(
        input_data_train, output_data_train)
    print("Done resampling")

    # Class distribution after resampling.
    count_per_class(output_data_train)

# Train the classifier.
print("Started training")
clf = clf.fit(input_data_train, output_data_train)
print("Finished training")

# Predict hard labels and class probabilities for the test inputs.
print("Prediction")
predicted = clf.predict(input_data_test)
proba = clf.predict_proba(input_data_test)
print("Finished prediction")
def predict():
    """Train three classifiers on the selected features and render metrics.

    On a POST whose ``features`` form field is a decimal number, the
    function:

    1. loads ``train.csv`` and keeps the columns from ``features.csv`` whose
       ``rank`` is below the requested number (the selection is also written
       to ``final_features.csv``);
    2. splits 80/20, oversamples the training part with SMOTE;
    3. fits XGBoost, Random Forest and AdaBoost, dumps each model with
       joblib and collects accuracy and macro precision/recall/F-score;
    4. renders ``metrics.html`` with the results and the names of the
       models with the highest accuracy and the highest F-score.

    A POST with a non-decimal ``features`` value flashes a message and
    redirects to ``/model``.
    """
    if request.method != "POST":
        # NOTE(review): same outcome as before (the view yields None for
        # non-POST requests) — confirm the route only accepts POST.
        return None

    requested_features = request.form['features']
    # isdecimal() only accepts strings that int() can parse; isnumeric()
    # also lets through characters such as superscripts that int() rejects.
    if not requested_features.isdecimal():
        flash("Please enter a digit for the number of features to use!")
        return redirect("/model")

    start = dt.datetime.now()
    train = pd.read_csv("train.csv")
    # Keyword form: the positional `axis` argument of DataFrame.drop is no
    # longer accepted by pandas 2.x.
    X = train.drop(columns='labels')
    y = train['labels']

    # features
    num_features = int(requested_features)
    feature_ranking = pd.read_csv("features.csv")
    features_name_to_keep = feature_ranking[
        feature_ranking['rank'] < num_features]["col"].to_list()
    final_features = pd.DataFrame(features_name_to_keep,
                                  columns=['final_features'])
    final_features.to_csv("final_features.csv")

    # only keep selected features from feature selection
    X = X[features_name_to_keep]

    # Split into Train and Test dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # SMOTE (training part only)
    oversample = SMOTE()
    X_train, y_train = oversample.fit_resample(X_train, y_train)

    # model training: (display name, estimator, dump path), fitted in this
    # order.
    candidates = [
        ("XGBoost",
         XGBClassifier(colsample_bytree=0.3,
                       learning_rate=0.1,
                       max_depth=6,
                       reg_alpha=0.8),
         'xgb.pkl'),
        ("Random Forest",
         RandomForestClassifier(n_estimators=400,
                                min_samples_split=5,
                                min_samples_leaf=1,
                                max_features='sqrt',
                                max_depth=None,
                                bootstrap=False),
         'rf.pkl'),
        ("AdaBoost",
         AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                            n_estimators=225,
                            learning_rate=0.3),
         'ab.pkl'),
    ]
    accuracy = {}
    metrics = {}
    for name, clf, dump_path in candidates:
        clf.fit(X_train, y_train)
        accuracy[name] = clf.score(X_test, y_test)
        metrics[name] = precision_recall_fscore_support(
            y_test, clf.predict(X_test), average='macro')
        joblib.dump(clf, dump_path)

    # End timer
    time_taken = str(dt.datetime.now() - start)

    # Pick the true maximum over all three models.  The former if/elif
    # chain reported XGBoost whenever it beat Random Forest, even when
    # AdaBoost scored higher still.  Ties go to the earliest name here.
    preference = ("Random Forest", "XGBoost", "AdaBoost")
    highest_accuracy_model = max(preference, key=lambda n: accuracy[n])
    # Index 2 of the precision_recall_fscore_support tuple is the F-score.
    highest_fscore_model = max(preference, key=lambda n: metrics[n][2])

    return render_template(
        "metrics.html",
        metrics_xgb=metrics["XGBoost"],
        accuracy_xgb=accuracy["XGBoost"],
        metrics_rf=metrics["Random Forest"],
        accuracy_rf=accuracy["Random Forest"],
        metrics_ab=metrics["AdaBoost"],
        accuracy_ab=accuracy["AdaBoost"],
        time_taken=time_taken,
        highest_accuracy_model=highest_accuracy_model,
        highest_fscore_model=highest_fscore_model)
def __init__(self, getfile, test_num):
    """Run the hybrid logistic-regression pipeline end to end.

    Steps, in order:

    1. Read ``getfile`` as CSV; columns 5 and 6 are the features, column 7
       the label. Split with ``test_size=test_num`` and standardise.
    2. Balance the *training split* with RandomOverSampler followed by
       SMOTE, then fit a LogisticRegression and predict the test split.
    3. Rewrite the MySQL tables ``hybrid_logitval`` (one row per
       prediction) and ``baseline`` (the rows of ``getfile``).
    4. Print and plot the confusion matrix and publish the results through
       module-level globals for the user interface.

    Parameters
    ----------
    getfile :
        Path of the CSV file to analyse.
    test_num :
        Forwarded to ``train_test_split`` as ``test_size``.
    """
    global accurate, confuse, posi, neut, nega, overall, plots, replot, percentage, reports

    #-----SPLIT DATASETS-------
    self.getfile = getfile
    self.test_num = test_num
    tested = pd.read_csv(self.getfile)
    x = tested.iloc[:, [5, 6]].values
    # output
    y = tested.iloc[:, 7].values
    xtrain, xtest, ytrain, ytest = train_test_split(
        x, y, test_size=self.test_num, random_state=42)
    print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)

    #4 Feature Scaling
    # Standardisation is fitted on the training split only and then applied
    # to the test split.
    sc_x = StandardScaler()
    xtrain = sc_x.fit_transform(np.asarray(xtrain))
    xtest = sc_x.transform(np.asarray(xtest))
    counter = Counter(y)

    #---------------SMOTE ALGORITHM--------------------------
    print("Before OverSampling, counts of label '1': {}\n".format(
        sum(ytrain == 1)))
    print("Before OverSampling, counts of label '-1': {} \n".format(
        sum(ytrain == -1)))
    print('WITH SMOTE')
    # Resample the scaled training split.  The samplers used to be fitted on
    # the full, unscaled `x, y`, which leaked the test rows into training
    # and trained on a different scale than the one used for prediction.
    # `fit_resample` replaces `fit_sample`, which current imbalanced-learn
    # releases no longer provide.
    ros = RandomOverSampler(sampling_strategy='minority')
    xtrain_res, ytrain_res = ros.fit_resample(xtrain, ytrain)
    oversample = SMOTE()
    xtrain, ytrain = oversample.fit_resample(xtrain_res, ytrain_res.ravel())
    counter = Counter(ytrain)
    print(counter)
    print('After OverSampling, the shape of train_X: {}'.format(
        xtrain.shape))
    print('After OverSampling, the shape of train_y: {} \n'.format(
        ytrain.shape))
    print("After OverSampling, counts of label '1': {}".format(
        sum(ytrain == 1)))
    print("After OverSampling, counts of label '-1': {}".format(
        sum(ytrain == -1)))

    #---------------LOGISTIC REGRESSION----------------------
    #5 Fit the logistic regression to the (balanced) training set
    classifier = LogisticRegression()
    classifier.fit(xtrain, ytrain)

    #6 Predict the test set results
    y_pred = classifier.predict(xtest)

    # NOTE(review): the three tallies start at 1, not 0 — presumably
    # deliberate for the UI; confirm before changing.
    posed = 1
    neued = 1
    neged = 1

    import MySQLdb
    mydb = MySQLdb.connect(host="127.0.0.1",
                           user="******",
                           password="",
                           database="logitregression_data")
    try:
        mycursor = mydb.cursor()

        logit = []
        with open('temp_file.csv', 'r') as tempo:
            read = csv.reader(tempo, delimiter=',')
            for tem in read:
                logit.append(tem)

        with open(getfile, 'r') as file:
            reader = csv.reader(file, delimiter=',')
            all_value = []
            counter = 0
            mycursor.execute("DELETE FROM hybrid_logitval")

            #-----------Store every prediction together with its sentiment--------------
            query2 = "INSERT INTO `hybrid_logitval`(`HYB_ID`, `HYB_VALUE`, `HYB_SENTIMENT`, `HYB_RESULT`) VALUES (%s,%s,%s,%s)"
            for over in y_pred:
                counter += 1
                if over == 1:
                    posed += 1
                    resu = 'Positive'
                    regval = 1
                elif over == 0:
                    neued += 1
                    resu = 'Neutral'
                    regval = 0
                else:
                    neged += 1
                    resu = 'Negative'
                    regval = -1
                # `counter` starts at 1, so row 0 of temp_file.csv is never
                # used (presumably a header — verify).
                mycursor.execute(query2,
                                 (counter, logit[counter], regval, resu))

            for row in reader:
                value = (row[0], row[1], row[2], row[3], row[4], row[5],
                         row[6], row[7])
                all_value.append(value)

            mycursor.execute("DELETE FROM `baseline`")
            query = "INSERT INTO `baseline`(`ID`, `TWEETS`, `TOKENIZED`, `STOP_WORDS`, `STEMMED`, `POLARITY`, `SUBJECTIVITY`, `SENTIMENT`) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)"
            mycursor.executemany(query, all_value)
            mycursor.execute(
                "DELETE FROM `baseline` WHERE `baseline`.`ID` = 0")
        mydb.commit()
    finally:
        # Always release the connection, even when a statement fails.
        mydb.close()

    #---------------CONFUSION MATRIX----------------------
    #7 The confusion matrix holds the correct and incorrect predictions
    cm = confusion_matrix(ytest, y_pred)

    import warnings
    warnings.filterwarnings("ignore")
    cr = classification_report(ytest, y_pred)
    print(ytest)
    print("Confusion Matrix : \n", cm)
    print(cr)

    import mlxtend.plotting
    from mlxtend.plotting import plot_confusion_matrix
    class_names = ['-1', '0', '1']
    fig, ax = plot_confusion_matrix(conf_mat=cm,
                                    colorbar=True,
                                    class_names=class_names)
    # The canvas-level set_window_title was removed from Matplotlib; the
    # figure manager provides the same call.
    fig.canvas.manager.set_window_title('HYBRID LOGISTIC REGRESSION')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    #-------SENDS ALL VALUES TO APPEAR ON THE USER INTERFACE----------------
    accurate = accuracy_score(ytest, y_pred)
    print(accurate)
    percentage = "{:.0%}".format(accurate)
    confuse = cm
    print(percentage)
    posi = posed
    neut = neued
    nega = neged
    plots = y_pred
    replot = plt
    reports = cr

    # Overall sentiment: neutral wins ties, then positive, then negative.
    if (neut >= posi) and (neut >= nega):
        overall = 'NEUTRAL'
    elif (posi >= neut) and (posi >= nega):
        overall = 'POSITIVE'
    else:
        overall = 'NEGATIVE'
    print(overall)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

# Train a logistic-regression classifier on TF-IDF features of
# ``new_data.Posts`` and persist both the fitted vectoriser and the model.

# Column 0 of ``new_data`` is used as the target; the features are the
# TF-IDF matrix built from the ``Posts`` column.
tv = TfidfVectorizer()
y = new_data.iloc[:, 0]
X = tv.fit_transform(new_data.Posts)

# Hold out 30% of the rows for testing (no fixed seed, so the split differs
# from run to run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Over-sample the training split only, so the test split keeps its original
# class distribution.
sm = SMOTE(random_state=444)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

model = LogisticRegression()
model.fit(X_train_res, y_train_res)
y_pred = model.predict(X_test)

# Use context managers so the pickle files are flushed and closed
# deterministically instead of relying on garbage collection.
with open('Transform.pkl', 'wb') as transform_file:
    pickle.dump(tv, transform_file)
with open('Model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
# Compare several resampling strategies on the same (X, y) data set.
# After each resampling step the class counts are computed with np.bincount
# (the value is not stored) and, for most steps, the first two feature
# columns are scatter-plotted coloured by class label.

# Under-sample with the sampler currently bound to ``rus``.
# NOTE(review): ``rus``, ``X`` and ``y`` are defined outside this view.
X_resampled, y_resampled = rus.fit_resample(X,y)
np.bincount(y_resampled)
plt.scatter(X_resampled[:,0],X_resampled[:,1],c=y_resampled)

# Under-sample again with an explicit per-class strategy
# ({1: 128, 2: 640} once the products are evaluated).
rus = RandomUnderSampler(random_state=0, sampling_strategy={1:64*2,2:64*10})
X_resampled, y_resampled = rus.fit_resample(X,y)
np.bincount(y_resampled)

# Over-sample with SMOTE using 5 nearest neighbours.
from imblearn.over_sampling import SMOTE
sm=SMOTE(k_neighbors=5, random_state=0)
X_resampled, y_resampled = sm.fit_resample(X,y)
np.bincount(y_resampled)
plt.scatter(X_resampled[:,0], X_resampled[:,1], c=y_resampled)

# Over-sample with ADASYN using 5 nearest neighbours.
from imblearn.over_sampling import ADASYN
ada=ADASYN(random_state=0, n_neighbors=5)
X_resampled, y_resampled = ada.fit_resample(X,y)
np.bincount(y_resampled)
plt.scatter(X_resampled[:,0], X_resampled[:,1], c=y_resampled)
# Feature columns selectable through the ``feature_set`` argument.
_FEATURE_SETS = {
    0: ('red_amplitude', 'green_amplitude', 'normal_separation',
        'deltamag_red', 'deltamag_green'),
    1: ('red_amplitude', 'green_amplitude', 'delta_time',
        'normal_separation', 'deltamag_red', 'deltamag_green'),
    2: ('red_amplitude', 'green_amplitude', 'delta_time', 'deltamag_red',
        'deltamag_green'),
    3: ('red_amplitude', 'green_amplitude', 'delta_time',
        'normal_separation'),
    4: ('red_amplitude', 'green_amplitude', 'delta_time'),
    5: ('red_amplitude', 'green_amplitude'),
    6: ('normal_separation', 'deltamag_red', 'deltamag_green'),
    7: ('red_amplitude', 'green_amplitude', 'normal_separation'),
    8: ('red_amplitude', 'green_amplitude', 'delta_time',
        'normal_separation', 'color'),
    9: ('red_amplitude', 'green_amplitude', 'delta_time',
        'normal_separation', 'deltamag_red', 'deltamag_green', 'color'),
    10: ('red_amplitude', 'green_amplitude', 'delta_time',
         'normal_separation', 'Pcc'),
    11: ('red_amplitude', 'green_amplitude', 'delta_time',
         'normal_separation', 'redshift'),
    12: ('red_amplitude', 'green_amplitude', 'delta_time',
         'normal_separation', 'absmag'),
    13: ('red_amplitude', 'green_amplitude', 'delta_time',
         'normal_separation', 'model_color'),
    14: ('red_amplitude', 'green_amplitude', 'delta_time',
         'normal_separation', 'deltamag_red', 'deltamag_green',
         'model_color'),
    15: ('red_amplitude', 'green_amplitude', 'normal_separation',
         'deltamag_red', 'deltamag_green', 'model_color'),
    16: ('red_amplitude', 'green_amplitude', 'delta_time',
         'normal_separation', 'model_color', 'redshift'),
}

# Objects that are always removed from the training sample.
_BAD_TRAINING_OBJECTS = frozenset([
    '2020cui', '2019lwy', '2019cvi', '2018jsc', '2005bf', '2005gi',
    '2007ib', '2008aq', '2008ax', '2009N', '2010id', '2012aw', '2013ai',
    '2013am', '2013bu', '2013ej', '2013fs', '2016X', '2018cyg', '2018epm',
    '2018fjw', '2018fii', '2018fuw', '2018gvt', '2018imj', '2018lcd',
    '2019B', '2019bvq', '2019cda', '2019ci', '2019dok', '2019gaf',
    '2019gqk', '2019hau', '2019iex', '2019keo', '2019lkw', '2019oa',
    '2019otb', '2019pjs', '2019sjx', '2019tqb', '2019wbg', '2020ekk',
])

# Fine-grained labels that are merged into a broader class before training.
_CLASS_GROUPS = {
    'LBV': 'Star',
    'Varstar': 'Star',
    'CV': 'Star',
    'SNIbn': 'SNIbc',
    'SNIb': 'SNIbc',
    'SNIc': 'SNIbc',
    'SNIc-BL': 'SNIbc',
    'SNIIP': 'SNII',
    'TDE': 'Nuclear',
    'AGN': 'Nuclear',
}

# Integer code assigned to each (grouped) class.
_CLASS_CODES = {
    'Nuclear': 0,
    'SLSN-I': 1,
    'SLSN-II': 2,
    'SNII': 3,
    'SNIIb': 4,
    'SNIIn': 5,
    'SNIa': 6,
    'SNIbc': 7,
    'Star': 8,
}


def create_training_testing(object_name,
                            features_table,
                            training_days=20,
                            model='single',
                            clean=0,
                            feature_set=13,
                            sorting_state=42,
                            SMOTE_state=42,
                            clf_state=42,
                            n_estimators=100,
                            max_depth=7,
                            hostless_cut=0.1):
    '''
    Train a random forest on the packaged training set and classify the
    transient(s) described by features_table.

    The training table is read from
    'training_set/center_table_<training_days>_<model>.txt', a fixed list of
    bad objects plus object_name is removed, the rows are shuffled, rows with
    non-finite 'red_amplitude' or 'Pcc' are dropped, the classes are
    rebalanced with SMOTE and a RandomForestClassifier is fitted.

    Parameters
    -------------
    object_name    : Name of the object to exclude from training set
    features_table : Astropy table with all the features of the new transient
    training_days  : What data set to use for training
    model          : Which model to use for training, single or double
    clean          : Clean hostless transients? 0 keeps every host, 1 keeps
                     only rows with Pcc <= hostless_cut
    feature_set    : Which feature set to use (integer 0 to 16)
    sorting_state  : Seed number for list sorter
    SMOTE_state    : Seed number for SMOTE
    clf_state      : Seed number for classifier
    n_estimators   : Number of trees
    max_depth      : Depth of trees
    hostless_cut   : Only consider hosts with a Pcc lower than this

    Return
    ---------------
    Predicted Probability (in percent, i.e. 100 * predict_proba) to be
    ['Nuclear','SLSN-I','SLSN-II','SNII','SNIIb','SNIIn','SNIa','SNIbc','Star']

    Raises
    ---------------
    ValueError : If feature_set or clean is not a supported value
    KeyError   : If the training table contains a class with no integer code
    '''

    # Validate the selectors first so a bad value fails immediately with a
    # clear message instead of a NameError after the table has been read.
    if clean not in (0, 1):
        raise ValueError('clean must be 0 or 1, got %r' % (clean,))
    if feature_set not in _FEATURE_SETS:
        raise ValueError('feature_set must be an integer from 0 to 16, '
                         'got %r' % (feature_set,))

    # Copy into a fresh list so the shared table is never mutated below.
    use_features = list(_FEATURE_SETS[feature_set])

    # If using the 'double' model, add the W2 parameter
    if model == 'double':
        use_features += ['red_amplitude2', 'green_amplitude2']

    # Import Data
    table_name = pkg_resources.resource_filename(
        __name__,
        'training_set/center_table_%s_%s.txt' % (training_days, model))
    training_table_in = table.Table.read(table_name, format='ascii')

    # Remove bad objects (and the object being classified) from the
    # training sample
    good = [
        name not in _BAD_TRAINING_OBJECTS and name != object_name
        for name in training_table_in['mod_object_name']
    ]
    training_table_in = training_table_in[good]

    # Shuffle Order of Table.
    # NOTE: this reseeds NumPy's global random generator, as the original
    # implementation did.
    order = np.arange(len(training_table_in))
    np.random.seed(sorting_state)
    np.random.shuffle(order)
    training_table = training_table_in[order]

    # Select Only Clean Data
    keep = (np.isfinite(training_table['red_amplitude'])
            & np.isfinite(training_table['Pcc']))
    if clean == 1:
        keep = keep & (training_table['Pcc'] <= hostless_cut)
    clean_training = training_table[keep]

    # Create array with Training and Testing data
    training_data = np.array(clean_training[use_features].to_pandas())
    testing_data = np.array(features_table[use_features].to_pandas())

    # Group Transients into groups and convert the labels to integer codes.
    # Mapping through a dict (rather than assigning into the fixed-width
    # string array) avoids truncating a longer replacement label.
    training_class = np.array([
        _CLASS_CODES[_CLASS_GROUPS.get(label, label)]
        for label in np.array(clean_training['class'])
    ]).astype(int)

    # SMOTE the data
    sampler = SMOTE(random_state=SMOTE_state)
    data_train_smote, class_train_smote = sampler.fit_resample(
        training_data, training_class)

    # Train Random Forest Classifier
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 max_depth=max_depth,
                                 random_state=clf_state)
    clf.fit(data_train_smote, class_train_smote)

    # Predict Excluded Object
    predicted_probability = 100 * clf.predict_proba(testing_data)

    return predicted_probability
# Pick the majority-class cluster whose size is closest to ``average``.
# The last entry of ``newclassdist_count`` is excluded from the search.
# NOTE(review): ``c`` and ``newmaj`` must be initialised before this loop;
# that code is outside this view.
for i, j in enumerate(newclassdist_count[0:-1]):
    if abs(j - average) < c:
        c = abs(j - average)
        newmaj = i
majority_class_new = majority_class + '_c' + str(newmaj)
minority_class_new = minority_class + '_c0'

## 3. Create the dataset that only contains the new majority and minority classes
data_majmin = []
target_majmin = []
for m, label in enumerate(target_cd):
    if label == majority_class_new or label == minority_class_new:
        data_majmin.append(data[m])
        target_majmin.append(label)

# Over-sample the two selected classes with the requested algorithm
# (the name is matched case-insensitively).
if oversampler.lower() == 'smote':
    sm = SMOTE()
    data_over, target_over = sm.fit_resample(data_majmin, target_majmin)
elif oversampler.lower() == 'adasyn':
    ada = ADASYN()
    data_over, target_over = ada.fit_resample(data_majmin, target_majmin)
else:
    print('Invalid oversampling algorithm.')
    # Exit with a non-zero status so callers can detect the failure; a bare
    # sys.exit() would report success.
    sys.exit(1)

## 4. combine this with the remaining classes
# NOTE(review): the .append() calls below require fit_resample to hand back
# lists here (the inputs above are lists) - confirm with the installed
# imbalanced-learn version.
data_cdsmote = data_over.copy()
target_cdsmote = target_over.copy()
for m, label in enumerate(target_cd):
    if label != minority_class_new and label != majority_class_new:
        data_cdsmote.append(data[m])
        target_cdsmote.append(label)
##Once the bag of words is prepared, the dataset should be divided into training and test sets: X = amazon2['Reviews1'] y = amazon2['sent'] X_train_bow, X_test_bow, y_train_bow, y_test_bow = train_test_split( bow_data, # Features amazon2['sent'], # Target variable test_size=0.2, # 20% test size random_state=0) # random state for replication purposes # # SAMPLING # In[11]: sm = SMOTE(random_state=42) X_train_smo, y_train_smo = sm.fit_resample(X_train_bow, y_train_bow) Counter(y_train_bow) Counter(y_train_smo) # # TRAINING & TESTING # In[12]: ### 1. logistic regression lr_model = LogisticRegression() #Fit train and test into the model lr_model.fit(X_train_smo, y_train_smo)
#读取数据,存属性 dfs = pd.read_csv(params['data_path']) col = list(dfs.columns) data = np.array(dfs.iloc[:,:-1]) labels = np.array(dfs['label']) #统计原数据的标签数量 c1 = Counter(labels) #根据指定的ratio(少数样本/多数样本),来生成ra集合,用以在SMOTE的指定创建 m = max(c1.values()) ra = {} for i in c1.keys(): if (c1[i]/m) < params['ratio']: ra[i] = int(m*params['ratio']) #对样本进行非均衡处理 smo = SMOTE(sampling_strategy=ra, k_neighbors=params['kneighbors'], random_state=42) data_smote,labels_smote = smo.fit_resample(data,labels) #c2_smote = Counter(labels_smote) #存储过采样后的数据 labels_smote = np.array(labels_smote).reshape(-1,1) data_label = np.hstack((data_smote,labels_smote)) result = pd.DataFrame(data_label, columns = col) result.to_csv(params['save_path'], sep=',', header=True, index=False) except Exception as e: print(e)
# Build the training arrays with three sampling strategies in turn, then fit
# a random forest and print its feature importances.
# NOTE(review): xTreino / yTreino are reassigned by every strategy, so only
# the last one (SMOTE) reaches the classifier.

# Under sampling: draw qtdeConfirmados rows from dfDescartados and stack them
# with dfConfirmados.
dfDescartadosUnder = dfDescartados.sample(qtdeConfirmados)
dfUnder = pd.concat([dfDescartadosUnder, dfConfirmados], axis=0)
xTreino = dfUnder[features].values
yTreino = dfUnder['RESULTADO'].values

# Over sampling: draw qtdeDescartados rows from dfConfirmados with
# replacement and stack them with dfDescartados.
print('\nOver Sampling')
dfConfirmadosOver = dfConfirmados.sample(qtdeDescartados, replace=True)
dfOver = pd.concat([dfDescartados, dfConfirmadosOver], axis=0)
xTreino = dfOver[features].values
yTreino = dfOver['RESULTADO'].values

# SMOTE sampling: resamples ``treino`` directly, not dfUnder / dfOver.
print('\nSmote Sampling')
oversample = SMOTE()
xTreino, yTreino = oversample.fit_resample(treino[features], treino['RESULTADO'])
xTreino = xTreino.values
yTreino = yTreino.values

# Random Forest classifier
classifierRF = RandomForestClassifier(random_state=1986, criterion='gini', max_depth=10, n_estimators=50, n_jobs=-1)

# Train on all records
classifierRF.fit(xTreino, yTreino)

# Feature Selection: print up to 30 features with the highest importance,
# each shown as a percentage.
print('\nFeature Selection')
featuresSelection = zip(classifierRF.feature_importances_, features)
for importance, feature in sorted(featuresSelection, reverse=True)[:30]:
    print('%s: %f%%' % (feature, importance*100))
def balance_data(final_data, class_column):
    """Over-sample ``final_data`` with SMOTE.

    Parameters
    ----------
    final_data : pandas.DataFrame
        Table holding the feature columns and the target column.
    class_column : label
        Name of the column in ``final_data`` that holds the class labels.

    Returns
    -------
    tuple
        ``(x_resampled, y_resampled)`` as returned by ``SMOTE.fit_resample``.

    Raises
    ------
    KeyError
        If ``class_column`` is not a column of ``final_data``.
    """
    y = final_data[class_column]
    # Use every column except the target as a feature. The previous
    # ``iloc[:, :-1]`` slice was only correct when the target was the last
    # column; otherwise it leaked the target into the features and dropped
    # the real last column.
    x = final_data.drop(columns=class_column)
    oversample = SMOTE()
    return oversample.fit_resample(x, y)