###### Receiver Operating Characteristic Curve (ROC curve) ######
###############################################################################
from sklearn.metrics import roc_curve, auc

# Standardise, project onto two principal components, then fit an
# L2-regularised logistic regression.
pipe_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(solver='liblinear',
                       penalty='l2',
                       random_state=1,
                       C=100.0))

# Only feature columns 4 and 14 are used for this ROC example.
X_train2 = X_train[:, [4, 14]]

# random_state only has an effect together with shuffle=True, and recent
# scikit-learn releases raise ValueError when it is set without shuffling.
# The unshuffled splits are deterministic, so the argument is simply dropped.
cv = list(StratifiedKFold(n_splits=3).split(X_train, y_train))

fig = plt.figure(figsize=(7, 5))

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for i, (train, test) in enumerate(cv):
    # Refit the pipeline on this fold and score the held-out samples.
    probas = pipe_lr.fit(X_train2[train],
                         y_train[train]).predict_proba(X_train2[test])

    fpr, tpr, thresholds = roc_curve(y_train[test],
                                     probas[:, 1],
                                     pos_label=1)
    # np.interp is the function the deprecated scipy.interp alias pointed to;
    # it resamples this fold's TPR onto the common FPR grid.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
sample_weight = df.values[:, 0] model = RFRegressorFeatureOutOfFold(name='rf', sample_weight=sample_weight) model.fit(df, y) for clf, (idx_train, idx_valid) in zip(model._fitted_models, model.get_fold_splitting(df.values, y)): assert np.array_equal(clf.fit_params_.get('sample_weight', None), sample_weight[idx_train]) @pytest.mark.parametrize( 'cv', [ KFold(n_splits=2), StratifiedKFold(n_splits=10, shuffle=True, random_state=71), # Must set random state ]) def test_custom_cv_class(cv, binary_data): df, y = binary_data clf = boosting.XGBoostClassifierOutOfFold(name='xgb', cv=cv) for origin, model in zip(cv.split(df.values, y), clf.get_fold_splitting(df.values, y)): assert np.array_equal(origin[0], model[0]) assert np.array_equal(origin[1], model[1]) def test_custom_cv_as_list(): """can set custom cv as list of train / test indexes""" cv = [[1, 2, 3], [4, 5], [2, 4, 5], [1, 3]] clf = boosting.XGBoostClassifierOutOfFold(name='xgb', cv=cv)
data_type + '.csv', delimiter=',') emt_y_coord = np.loadtxt(path + '/data_' + str(max_length) + '_y_' + data_type + '.csv', delimiter=',') emt_label = np.loadtxt(path + '/data_' + str(max_length) + '_label_' + data_type + '.csv', delimiter=',') emt_x_coord = np.reshape(emt_x_coord, (-1, max_length, 1)) emt_y_coord = np.reshape(emt_y_coord, (-1, max_length, 1)) emt_x = np.concatenate((emt_x_coord, emt_y_coord), axis=2) emt_ground = emt_label # split data into folds skf = StratifiedKFold(n_splits=folds, shuffle=True) # early stopping options. Stop training (patience) steps after val_loss starts to increase. early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, verbose=1, mode='min', baseline=None, restore_best_weights=True) ########################################## # Folds ########################################## cvscores = [] # blank matrix
# # # make a list of train dataset list[path,cry] tmp_list = [] for label in os.listdir(config.TRAIN): path = config.TRAIN + label + '/' for wavefile in os.listdir(path): path_tmp='' path_tmp = path + wavefile tmp_list.append([path_tmp,label]) train_file = pd.DataFrame(tmp_list,columns=['file_path','label']) #print(train_file) del tmp_list # # # split skf = StratifiedKFold(**config.split) train_file['fold']=-1 for fold_id, (tr_ind, val_ind) in enumerate(skf.split(train_file, train_file['label'])): train_file.iloc[val_ind,-1] = fold_id #print(train_file['fold']) use_fold = config.globals["use_fold"] train_file_list = train_file.query("fold != @use_fold")[["file_path", "label"]].values.tolist() val_file_list = train_file.query("fold == @use_fold")[["file_path", "label"]].values.tolist() #print("[fold {}] train: {}, val: {}".format(use_fold, len(train_file_list), len(val_file_list))) engine.set_seed(config.globals["seed"]) device = torch.device(config.globals["device"]) # # # get loader train_loader, val_loader = dataset.get_loaders_for_training(
metrics=[auc]) return model earlystopping = callbacks.EarlyStopping(monitor='val_auc', min_delta=0,\ patience=5, verbose=0, mode='max') checkpoint = callbacks.ModelCheckpoint('bestmodel.h5', monitor='val_auc', verbose=0, \ save_best_only=False, period=1) rlr = callbacks.ReduceLROnPlateau( monitor='val_auc',\ factor=0.1, patience=3, verbose=0, \ cooldown=0, min_lr=0) CALLBACKS = [earlystopping, checkpoint, rlr] NFOLDS = 10 EPOCHS = 10 BATCHSIZE = 64 skf = StratifiedKFold(n_splits=NFOLDS) predictions = np.zeros((len(test), )) validations = np.zeros((len(train), )) for train_index, valid_index in skf.split(X, y): X_train, X_valid = X[train_index], X[valid_index] y_train, y_valid = y[train_index], y[valid_index] model = create_model() model.fit(list(np.transpose(X_train)),y_train,validation_data=(list(np.transpose(X_valid)),y_valid),\ epochs=EPOCHS,batch_size=BATCHSIZE,verbose=2,callbacks=CALLBACKS) validations[valid_index] = model.predict(list( np.transpose(X_valid))).flatten() predictions += model.predict(list(np.transpose(X_test))).flatten() / NFOLDS submission = pd.DataFrame(predictions, columns=target_col)
reg_alpha=3, reg_lambda=5, max_depth=-1, n_estimators=5000, objective='binary', subsample=0.9, colsample_bytree=0.77, subsample_freq=1, learning_rate=0.05, random_state=1000, n_jobs=4, min_child_weight=4, min_child_samples=5, min_split_gain=0) skf = StratifiedKFold(n_splits=5, random_state=2018, shuffle=True) oof_preds = np.zeros(train.shape[0]) sub_preds = np.zeros(test_id.shape[0]) best_score = [] for index, (train_index, test_index) in enumerate(skf.split(train, label)): lgb_model.fit(train.iloc[train_index], label.iloc[train_index], verbose=50, eval_set=[(train.iloc[train_index], label.iloc[train_index]), (train.iloc[test_index], label.iloc[test_index])], early_stopping_rounds=30) best_score.append(lgb_model.best_score_['valid_1']['binary_logloss']) print(best_score)
def build_poi_id_model(features, labels, names):
    """Tune a POI classifier and prepare classifier and data for the tester.

    A feature union (email text, email subjects, financial features and
    email features) followed by scaling and a k-nearest-neighbours
    classifier is grid-searched with 10-fold cross-validation on the F1
    score. The best-ranked parameter set is applied to the pipeline,
    cross-validated predictions are reported, and the pipeline object is
    pickled to ``full_classifier.pkl``. Finally the selected subject
    features are converted to a data dictionary for the tester.

    Parameters
    ----------
    features: list of dictionaries per dataset
    labels: list of boolean labels
    names: dataset names, forwarded to ``VectorToDict`` as ``dataset_names``

    Returns
    -------
    clf: Pipeline of StandardScaler and KNeighborsClassifier (not fitted here)
    select_sub_features: list of the selected subject feature names
    data_dict: output of ``VectorToDict.fit_transform`` on those features
    """
    # Email body features: keep the "word_*" entries, then the 250 best by chi2.
    pipeline_email_text = Pipeline([
        ("GetEmailText", SelectMatchFeatures(feature_match="word_.*")),
        ("SelectPercentile", SelectKBest(score_func=chi2, k=250)),
    ])

    # Email subject features: keep the "sub_*" entries, then the 100 best.
    pipeline_subjects = Pipeline([
        ("GetEmailText", SelectMatchFeatures(feature_match="sub_.*")),
        ("SelectPercentile", SelectKBest(score_func=chi2, k=100)),
    ])

    # Process financial features
    pipeline_financial = Pipeline([
        ("Selector",
         SelectFeatureList(selected_feature_list=FEATURES_FINANCIAL,
                           convert_to_numeric=True)),
        ("ConvertToVector", DictVectorizer(sparse=False)),
        ("Impute", ImputeOrZero(strategy="zero")),
        ("Log1P", FunctionTransformer(func=log_trans)),
    ])

    # Process other (email) features: select them, then convert the
    # dictionaries to a dense vector.
    pipeline_email = Pipeline([
        ("Selector",
         SelectFeatureList(selected_feature_list=FEATURES_EMAIL,
                           convert_to_numeric=True)),
        ("ConvertToVector", DictVectorizer(sparse=False)),
        ("Log1P", FunctionTransformer(func=log_trans)),
    ])

    feature_union = FeatureUnion(transformer_list=[
        ("email_text", pipeline_email_text),
        ("subjects", pipeline_subjects),
        ("financial", pipeline_financial),
        ("email", pipeline_email),
    ])

    # Combine all feature blocks, then run the classifier on them.
    pipeline_union = Pipeline([
        ("union", feature_union),
        ("Scale", StandardScaler()),
        ("KNeighborsClassifier",
         KNeighborsClassifier(n_neighbors=1,
                              metric='minkowski',
                              weights='distance')),
    ])

    # Only the subject block is weighted in; the kNN hyper-parameters are
    # the ones actually searched.
    param_grid_union = {
        "union__transformer_weights": [
            {
                'email_text': 0,
                'subjects': 1,
                'financial': 0,
                'email': 0
            },
        ],
        "KNeighborsClassifier__n_neighbors": [1, 3, 5],
        "KNeighborsClassifier__weights": ["uniform", "distance"],
        "KNeighborsClassifier__metric": ["minkowski", "manhattan"]
    }

    grid_search_union = GridSearchCV(pipeline_union,
                                     param_grid=param_grid_union,
                                     cv=10,
                                     scoring="f1")
    start = time()
    np.random.seed(42)
    grid_search_union.fit(features, labels)

    print(
        "GridSearchCV took %.2f seconds for %d candidate parameter settings."
        % (time() - start, len(grid_search_union.cv_results_['params'])))
    report(grid_search_union.cv_results_)

    np.random.seed(42)
    # First parameter set that is ranked best by the grid search.
    best_est = np.flatnonzero(
        grid_search_union.cv_results_['rank_test_score'] == 1)[0]
    best_params = grid_search_union.cv_results_['params'][best_est]
    print(best_params)
    pipeline_union.set_params(**best_params)

    pred = cross_val_predict(pipeline_union, features, labels, cv=10)
    # Single-argument print(...) calls behave identically under Python 2
    # (print statement) and Python 3 (print function).
    print(confusion_matrix(labels, pred))
    print(classification_report(labels, pred))
    print("Accuracy:  %s" % accuracy_score(labels, pred))

    # Pickle data is binary: open in "wb" and close the handle deterministically.
    # NOTE(review): pipeline_union itself is never fitted in this function
    # (only set_params is applied to it) - confirm that pickling the
    # configured but unfitted pipeline is intended.
    with open("full_classifier.pkl", "wb") as pickle_file:
        pickle.dump(pipeline_union, pickle_file)

    # Prepare data for tester: only the subject features are exported.
    feature_select = FeatureUnion(transformer_list=[("subjects",
                                                     pipeline_subjects)])
    feature_transformed = feature_select.fit_transform(features, labels)

    # Extract the names of the subject features that survived selection.
    subjects_steps = feature_select.transformer_list[0][1].named_steps
    sub_features = subjects_steps["GetEmailText"].get_feature_names()
    select_sub_features_idx = subjects_steps["SelectPercentile"].get_support(
        indices=True)
    select_sub_features = np.take(sub_features,
                                  select_sub_features_idx).tolist()

    data_dict = VectorToDict(feature_names=select_sub_features,
                             dataset_names=names).fit_transform(
                                 feature_transformed, labels)

    # Prepare classifier for tester
    clf = Pipeline([
        ("Scale", StandardScaler()),
        ("KNeighborsClassifier",
         KNeighborsClassifier(n_neighbors=3,
                              metric="manhattan",
                              weights="distance")),
    ])

    # Return classifier, names of features used and data
    return clf, select_sub_features, data_dict
def setup_kfold(X, Y, n_splits):
    """Return the stratified K-fold splitter used for cross-validation.

    ``X`` and ``Y`` are accepted for interface compatibility only; the
    splitter is not bound to any data until ``split`` is called on it.

    ``random_state`` is deliberately not passed: it only has an effect
    together with ``shuffle=True``, and recent scikit-learn releases raise
    ``ValueError`` when it is set on an unshuffled splitter. The unshuffled
    folds are deterministic, so no seed is needed for reproducibility.

    Parameters
    ----------
    X: unused
    Y: unused
    n_splits: number of folds

    Returns
    -------
    StratifiedKFold configured with ``n_splits`` folds and no shuffling.
    """
    return StratifiedKFold(n_splits=n_splits)
def cross_validate_models(X,
                          y,
                          clf_models,
                          seen_index,
                          n_splits=10,
                          classes=None,
                          upsample=False,
                          roundup=False,
                          df=None,
                          stratified_k=False,
                          test_index=None,
                          p_threshold=None):
    """Cross-validate every model in ``clf_models`` and record its scores.

    The rows listed in ``seen_index`` are split into ``n_splits`` folds. On
    each fold every model is fitted on the training part and scored on the
    validation part (or on ``test_index`` when that is given). Scores are
    appended to lists stored on the model dictionaries themselves, keyed by
    the short metric name and, for multi-label runs, additionally by
    ``"<metric>\\n<class>"``.

    Parameters
    ----------
    X, y: feature matrix and targets, indexed by row position
    clf_models: list of dicts; ``model['model']`` is either an estimator or
        a callable invoked as ``model['model'](X.shape[1], y.shape[1])``
    seen_index: row indices that take part in the cross-validation
    n_splits: number of folds
    classes: class names for multi-label evaluation; falsy for single output
    upsample: oversample the training fold with ``RandomOverSampler``
    roundup: multi-label only; force the highest-probability class of each
        row to at least 0.51 before rounding
    df: optional DataFrame that receives the per-row predictions
    stratified_k: use ``StratifiedKFold`` on label-encoded ``y``
    test_index: if given, every fold is evaluated on these rows instead
    p_threshold: if given, binarise ``predict_proba`` output at this value

    Returns
    -------
    With ``classes``: ``(clf_models, metrics)`` or
    ``(clf_models, metrics, df)`` when ``df`` is given.
    Without ``classes``: ``(clf_models, df)`` when ``df`` is given, else
    ``(clf_models, np.array(test_preds))`` when ``test_index`` is given,
    else ``clf_models``.
    """
    if stratified_k:
        label_encoder = LabelEncoder()
        kf = StratifiedKFold(n_splits=n_splits)
        kfs = kf.split(X[seen_index],
                       label_encoder.fit_transform(y[seen_index]))
    else:
        kf = KFold(n_splits=n_splits)
        kfs = kf.split(X[seen_index], y[seen_index])

    i = 0

    def tpr(y_true, y_pred):
        return roc_curve(y_true, y_pred)[1]

    def fpr(y_true, y_pred):
        return roc_curve(y_true, y_pred)[0]

    def prec(y_true, y_pred):
        return precision_recall_curve(y_true, y_pred)[0]

    def rec(y_true, y_pred):
        return precision_recall_curve(y_true, y_pred)[1]

    scores = [
        # name, function, on y when multiclass, on each y when multiclass,
        # proba
        ('p', precision_score, True, True, False),
        ('r', recall_score, True, True, False),
        ('f1', f1_score, True, True, False),
        ('e', accuracy_score, True, True, False),
        ('i', None, False, False, False),
        ('auc', roc_auc_score, True, True, True),
        ('tpr', tpr, False, True, True),
        ('fpr', fpr, False, True, True),
        ('prec', prec, False, True, True),
        ('rec', rec, False, True, True)
    ]
    if classes:
        scores += [('cov_err', coverage_error, True, False, False),
                   ('LRAP', label_ranking_average_precision_score, True,
                    False, False),
                   ('LRL', label_ranking_loss, True, False, False)]

    # Names of the metrics reported back to the caller; built independently
    # of clf_models so it is defined even when no model is passed.
    metrics = ['e']
    if classes:
        for y_class in classes:
            metrics += [f'p\n{y_class}', f'r\n{y_class}']

    # Every model gets an empty result list per metric (and per class).
    for model in clf_models:
        for m in scores:
            model[m[0]] = []
        if classes:
            for y_class in classes:
                for m in scores:
                    if m[1]:
                        model[f'{m[0]}\n{y_class}'] = []

    if test_index is not None:
        test_preds = []

    for k_train, k_test in kfs:
        # The splitter yields positions within seen_index; map them back to
        # row indices of X / y.
        k_train = seen_index[k_train]
        k_test = seen_index[k_test]
        if test_index is not None:
            k_test = test_index

        if upsample:
            ros = RandomOverSampler(random_state=42)
            if classes:
                # Oversample on the label-powerset encoding, then decode.
                lp = LabelPowerset()
                yt = lp.transform(y)
                X_train, y_resampled = ros.fit_resample(
                    X[k_train], yt[k_train])
                y_train = lp.inverse_transform(y_resampled).todense()
            else:
                X_train, y_train = ros.fit_resample(X[k_train],
                                                    y[k_train].todense())
        else:
            X_train = X[k_train]
            y_train = y[k_train]

        i += 1
        print(i)

        for model in clf_models:
            if callable(model['model']):
                clf = model['model'](X.shape[1], y.shape[1])
            else:
                clf = model['model']
            model['i'].append(i)

            if hasattr(clf, "epochs"):
                # Only dereference custom_weights when it is set; calling
                # .values() on None used to raise AttributeError here.
                class_weight = None
                if clf.custom_weights:
                    class_weight = clf.custom_weights.values()
                clf.fit(X_train,
                        y_train,
                        epochs=clf.epochs,
                        class_weight=class_weight,
                        verbose=clf.verbose,
                        batch_size=20)
            else:
                clf.fit(X_train, y_train)

            predictions = clf.predict(X[k_test])
            if np.ravel(predictions)[0] not in [1, 0]:
                predictions = predictions.round()
            try:
                predictions_proba = clf.predict_proba(X[k_test])
                if p_threshold is not None:
                    predictions = np.where(predictions_proba >= p_threshold,
                                           1, 0)[:, ]
            except Exception:
                # Best effort: fall back to the hard predictions, but let
                # KeyboardInterrupt / SystemExit propagate.
                predictions_proba = predictions
                print(
                    "WARNING! Can't predict probabilities with this model, just using binary predictions"
                )
            if hasattr(predictions_proba, "todense"):
                predictions_proba = predictions_proba.todense()
            if hasattr(predictions, "todense"):
                predictions = predictions.todense()
            if test_index is not None:
                test_preds.append(predictions_proba)

            if classes:
                if roundup:
                    # Raise the top-scoring class of each row to at least
                    # 0.51 (in place) so that rounding selects it.
                    y_pred_arr = predictions_proba
                    ai = np.expand_dims(np.argmax(y_pred_arr, axis=1), axis=1)
                    maximums = np.maximum(y_pred_arr.max(1), 0.51)
                    np.put_along_axis(y_pred_arr,
                                      ai,
                                      maximums.reshape(ai.shape),
                                      axis=1)
                    predictions = np.round(predictions_proba)

                # Metrics computed over all classes at once.
                for m in scores:
                    if m[4]:
                        y_pred = predictions_proba
                    else:
                        y_pred = predictions
                    if not m[1] or not m[2]:
                        continue
                    try:
                        model[m[0]].append(m[1](y[k_test],
                                                y_pred,
                                                average="weighted"))
                    except TypeError:
                        # Metric does not accept an ``average`` argument.
                        model[m[0]].append(m[1](y[k_test], y_pred))
                    except ValueError:
                        pass

                # Metrics computed separately for every class.
                for j, y_class in enumerate(classes):
                    for m in scores:
                        if not m[1]:
                            continue
                        if m[3]:  # if do this metric on each class
                            if m[4]:  # if use probabilities
                                y_pred = predictions_proba
                            else:
                                y_pred = predictions
                            try:
                                model[f'{m[0]}\n{y_class}'].append(m[1](
                                    y[k_test, j], y_pred[:, j]))
                            except Exception:
                                # Keep the per-fold lists aligned when a
                                # metric cannot be computed for this class.
                                model[f'{m[0]}\n{y_class}'].append(None)
                    if df is not None:
                        df.loc[
                            k_test,
                            f"{y_class} - k_prediction"] = predictions_proba[:, j]
                        df.loc[
                            k_test,
                            f"{y_class} - k_prediction_binary"] = predictions[:, j]
            else:
                for m in scores:
                    if not m[1]:
                        continue
                    model[m[0]].append(m[1](y[k_test], predictions))
                if df is not None:
                    df.loc[k_test, "y_k_prediction"] = predictions_proba[:, 1]

    if classes:
        if df is not None:
            return clf_models, metrics, df
        return clf_models, metrics
    else:
        if df is not None:
            return clf_models, df
        elif test_index is not None:
            return clf_models, np.array(test_preds)
        return clf_models
def grid_search(data_folder, folds_count, **kwargs):
    """
    Performs grid search of all possible combinations of given parameters with logarithmic ranges.
    Saves results in formatted file in location pointed by get_grid_search_results_path method

    For every word-embedding / sentence-embedding pair the first split of a
    5-fold stratified splitter provides a train part and a held-out part.
    Each classifier is grid-searched on the train part with ``folds_count``
    inner folds and then scored on both parts.

    Keyword arguments
    -----------------
    sentence_embeddings: iterable of sentence-embedding classes
    word_embeddings: iterable of ``(class, constructor_args)`` pairs
    classifiers: iterable of ``(classifier_class, param_grid)`` pairs
    n_jobs: parallel jobs for GridSearchCV (default 1)
    """
    sentence_embeddings = kwargs['sentence_embeddings']
    word_embeddings = kwargs['word_embeddings']
    classifiers = kwargs['classifiers']
    n_jobs = kwargs.get('n_jobs', 1)

    # prepare output files
    for classifier_class, _ in classifiers:
        our_classifier_wrapper = CLASSIFIERS_WRAPPERS[classifier_class]
        result_paths = (
            get_grid_search_results_path(data_folder, our_classifier_wrapper),
            get_evaluation_path(data_folder, our_classifier_wrapper),
            get_train_set_evaluation_path(data_folder,
                                          our_classifier_wrapper),
        )
        for result_path in result_paths:
            _reset_output_file(result_path)

    skf = StratifiedKFold(n_splits=5)

    for word_emb_class, word_emb_params in word_embeddings:
        word_embedding = word_emb_class(*word_emb_params)
        word_embedding.build()

        for sen_emb_class in sentence_embeddings:
            sen_emb = sen_emb_class()
            feature_builder = FeatureBuilder()

            str_word_emb_params = ','.join(map(str, word_emb_params))
            embedding_desc = ';'.join([
                word_emb_class.__name__, str_word_emb_params,
                sen_emb_class.__name__
            ])
            print("Testing embedding: {0}".format(embedding_desc))

            sen_emb.build(word_embedding)
            feature_builder.build(sen_emb, LABELS, SENTENCES)

            # Train and test indices for double cross-validation
            # (only the first of the five splits is used).
            train_index, test_index = next(
                skf.split(feature_builder.features, feature_builder.labels))
            train_features = feature_builder.features[train_index]
            train_labels = feature_builder.labels[train_index]

            for classifier_class, tested_params in classifiers:
                our_classifier_wrapper = CLASSIFIERS_WRAPPERS[classifier_class]
                output_path = get_grid_search_results_path(
                    data_folder, our_classifier_wrapper)
                eval_output_path = get_evaluation_path(data_folder,
                                                       our_classifier_wrapper)
                t_eval_output_path = get_train_set_evaluation_path(
                    data_folder, our_classifier_wrapper)

                # Grid size = product of the number of candidates of every
                # parameter (plain loop works on Python 2 and 3 alike).
                combs = 1
                for candidate_values in tested_params.values():
                    combs *= len(candidate_values)
                print(
                    "Testing {0} hyperparameters ({1} combinations)...".format(
                        classifier_class.__name__, combs))

                # for keras we need to create a sklearn wrapper to use GridSearchCV
                if classifier_class == KerasNeuralNetworkClassifier:
                    model = KerasClassifier(
                        build_fn=create_keras_model,
                        features_count=feature_builder.features.shape[1],
                        verbose=0)
                else:
                    model = classifier_class()

                # use 1 job because of high memory usage of these classifiers
                if classifier_class in (RandomForestClassifier,
                                        KerasNeuralNetworkClassifier):
                    search_jobs = 1
                else:
                    search_jobs = n_jobs
                clf = GridSearchCV(estimator=model,
                                   param_grid=tested_params,
                                   n_jobs=search_jobs,
                                   cv=folds_count)

                clf.fit(train_features, train_labels)
                evaluation = clf.score(feature_builder.features[test_index],
                                       feature_builder.labels[test_index])
                t_evaluation = clf.score(train_features, train_labels)

                with open(output_path, 'a') as output_file:
                    for mean_score, params in zip(
                            clf.cv_results_['mean_test_score'],
                            clf.cv_results_['params']):
                        output_file.write('{:s};{:s};{:4.2f}\n'.format(
                            embedding_desc, str(params), mean_score * 100))

                with open(eval_output_path, 'a') as output_file:
                    output_file.write('{:s};{:4.2f}\n'.format(
                        embedding_desc, evaluation * 100))

                with open(t_eval_output_path, 'a') as output_file:
                    output_file.write('{:s};{:4.2f}\n'.format(
                        embedding_desc, t_evaluation * 100))


def _reset_output_file(path):
    """Create the parent directory of ``path`` if missing, else truncate ``path``."""
    directory = os.path.dirname(path)
    # An empty dirname means the current directory, which needs no creation.
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    else:
        with open(path, 'w'):
            pass
def parameterSearch(self,
                    paramSets,
                    X,
                    Y,
                    numSplits=2,
                    valSplit=0.0,
                    epochs=1,
                    batchSize=None,
                    saveModel=False,
                    visualize=False,
                    saveLoc=''):
    """Cross-validate one model per parameter set and log the results.

    For every entry of ``paramSets`` a model is built with
    ``self.convModel`` and, for each stratified fold of ``(X, Y)``, trained
    on the training part and evaluated on the held-out part. The parameter
    set is written to ``fileModel.csv`` and one row of confusion counts and
    derived metrics per fold is written to ``fileResult.csv``; both files
    are created under ``self.outputPath``. Column 0 of ``Y`` is treated as
    the positive class (labelled "REM" in the result header).

    Parameters
    ----------
    paramSets: iterable of JSON-serialisable parameter sets
    X: samples, indexable by an array of row indices
    Y: per-class label columns; ``argmax`` over axis 1 gives the class
    numSplits: number of stratified folds
    valSplit: ``validation_split`` forwarded to ``model.fit``
    epochs: training epochs per fold
    batchSize: ``batch_size`` forwarded to ``model.fit``
    saveModel: save the weights of every fold under ``saveLoc``
    visualize: additionally log to TensorBoard under ``./logs``
    saveLoc: path prefix for the saved weight files

    An exception raised while building or training a model is recorded as
    an "error" row and the search continues with the next parameter set.
    The search stops early when ``self.killer.kill_now`` is set.
    """
    # create CV dat LOOV
    Kf = StratifiedKFold(n_splits=numSplits)

    callBacks = [
        EarlyStopping(monitor='val_loss',
                      patience=3,
                      restore_best_weights=True)
    ]
    if visualize:
        callBacks.append(
            TensorBoard(log_dir='./logs',
                        histogram_freq=3,
                        write_graph=False,
                        write_images=False,
                        update_freq='epoch',
                        profile_batch=2,
                        embeddings_freq=0,
                        embeddings_metadata=None))

    # ``with`` guarantees both files are closed even if an exception escapes
    # the loop below (e.g. a parameter set that cannot be serialised).
    with open(self.outputPath + "fileModel.csv", 'w') as modelFile, \
            open(self.outputPath + "fileResult.csv", 'w') as resultFile:
        resultFile.write(
            "modelNum|True REM|False REM|False NonREM|True NonREM|Acc|Sens|Spec|Recall|Precision|f1score|finalLoss\n"
        )

        modelNum = 0
        for paramSet in paramSets:
            modelFile.write(str(modelNum) + "|")
            json.dump(paramSet, modelFile)
            modelFile.write("\n")
            print("\n\n=================\nTesting Model " + str(modelNum) +
                  "\n=================\n")

            try:
                model = self.convModel(paramSet)
                print(model.summary())

                folds = Kf.split(X, np.argmax(Y, axis=1))
                for j, (trainInd, testInd) in enumerate(folds):
                    fitHistory = model.fit(X[trainInd],
                                           Y[trainInd],
                                           batch_size=batchSize,
                                           verbose=0,
                                           validation_split=valSplit,
                                           epochs=epochs,
                                           callbacks=callBacks)
                    if saveModel:
                        modelWeightFile = saveLoc + f'{modelNum}.{j}.weights.h5'
                        model.save_weights(modelWeightFile)

                    # One-hot encode the predicted class of every test row.
                    predClasses = np.argmax(model.predict(X[testInd],
                                                          batch_size=None),
                                            axis=1)
                    Ypred = np.zeros((testInd.shape[0], Y.shape[1]))
                    Ypred[np.arange(testInd.shape[0]), predClasses] = 1

                    # Confusion counts with column 0 as the positive class.
                    tp = tn = fn = fp = 0
                    for predRow, y in zip(Ypred, Y[testInd]):
                        tp += predRow[0] * y[0]
                        fp += max(predRow[0] - y[0], 0)
                        tn += predRow[1] * y[1]
                        fn += max(predRow[1] - y[1], 0)

                    # Ratios stay 0 when their denominator would be 0.
                    sens = spec = prec = rec = f1 = 0
                    acc = (tp + tn) / (tp + tn + fp + fn)
                    if tp + fn > 0:
                        sens = tp / (tp + fn)
                    if tn + fp > 0:
                        spec = tn / (tn + fp)
                    if tp + fp > 0:
                        prec = tp / (tp + fp)
                    if tp + fn > 0:
                        rec = tp / (tp + fn)
                    if prec + rec > 0:
                        f1 = 2 * ((prec * rec) / (prec + rec))

                    resultFile.write(
                        f"{modelNum}|{tp:.3f}|{fp:.3f}|{fn:.3f}|{tn:.3f}|{acc:.3f}|{sens:.3f}|{spec:.3f}|{rec:.3f}|{prec:.3f}|{f1:.3f}|{fitHistory.history['loss'][-1]:10.3f}\n"
                    )
                    print(
                        f"{'Validate':10s}|{'modelNum':10s}|{'tp':10s}|{'fp':10s}|{'fn':10s}|{'tn':10s}|{'acc':10s}|{'sens':10s}|{'spec':10s}|{'rec':10s}|{'prec':10s}|{'f1':10s}|{'loss':10s}\n"
                    )
                    print(
                        f"{j:10d}|{modelNum:10d}|{tp:10.3f}|{fp:10.3f}|{fn:10.3f}|{tn:10.3f}|{acc:10.3f}|{sens:10.3f}|{spec:10.3f}|{rec:10.3f}|{prec:10.3f}|{f1:10.3f}|{fitHistory.history['loss'][-1]:10.3f}\n",
                        flush=True)

                    # Start the next fold from freshly initialised weights.
                    self.reset_weights(model)
            except Exception as e:
                # Best effort: record the failure and move on to the next
                # parameter set.
                resultFile.write("error\n")
                print(str(e))

            K.clear_session()
            modelNum += 1
            if self.killer.kill_now:
                resultFile.write("killed\n")
                print("killed")
                break
dataset_exp = numpy.loadtxt(os.path.join(path, fileexp), delimiter="\t")  # Change the path to your local system
dataset_cnv = numpy.loadtxt(os.path.join(path, filecnv), delimiter="\t")  # Change the path to your local system

# split into input (X) and output (Y) variables: the last column of each
# matrix is the label, everything before it is the feature block.
X_clinical = dataset_clinical[:, 0:25]
Y_clinical = dataset_clinical[:, 25]

# split into input (X) and output (Y) variables
X_exp = dataset_exp[:, 0:400]
Y_exp = dataset_exp[:, 400]

# split into input (X) and output (Y) variables
X_cnv = dataset_cnv[:, 0:200]
Y_cnv = dataset_cnv[:, 200]

print('*********************************Training the Clinical CNN *****************************************')

# kfold_value fold cross validation.
# random_state is not passed: it has no effect with shuffle=False, and recent
# scikit-learn releases raise ValueError for that combination. The folds
# produced are the same unshuffled, deterministic ones as before.
kfold = StratifiedKFold(n_splits=kfold_value, shuffle=False)

# Per-fold metric lists for the clinical model.
acc_clinical = []
Pr_clinical = []
Sn_clinical = []
Mcc_clinical = []

i = 1
for train_index, test_index in kfold.split(X_clinical, Y_clinical):
    print(i, "th Fold *****************************************")
    i = i + 1
    x_train_clinical, x_test_clinical = X_clinical[train_index], X_clinical[test_index]
    y_train_clinical, y_test_clinical = Y_clinical[train_index], Y_clinical[test_index]

    # Append a trailing axis of length 1 to the feature arrays.
    x_train_clinical = numpy.expand_dims(x_train_clinical, axis=2)
    x_test_clinical = numpy.expand_dims(x_test_clinical, axis=2)

    # first Clinical CNN Model
    init = initializers.glorot_normal(seed=1)
    bias_init = initializers.Constant(value=0.1)
# NOTE(review): if ``pca`` is a scikit-learn PCA, fit() returns the fitted
# estimator itself, so V (and full_dict['V']) holds the PCA object rather
# than a matrix of eigenvectors - confirm this is what consumers expect.
V = pca.fit(X)
varPC = (pca.explained_variance_ratio_)
lambdas = pca.singular_values_

full_dict['full_mat'] = X
full_dict['labels'] = y
full_dict['V'] = V
full_dict['varPC'] = varPC

# X is your cell gene matrix, y is your class labels

#%%
# Split into train/test
kCV = 5
skf = StratifiedKFold(n_splits=kCV, shuffle=True)

# Per-fold containers, keyed by fold number 0..kCV-1.
Atrain = {}
Atest = {}
ytest = {}
ytrain = {}
proprestest = {}
proprestrain = {}

folds_dict = {'trainmat': {}, 'trainlabel': {}, 'eigenvectors': {}, 'eigvals': {}, 'meanvec': {}}

# One pass over a single split: fold i gets the i-th train/test partition.
# (Previously a full split was generated inside ``for i in range(kCV)``, so
# every slot was overwritten until only the last fold of a fresh shuffle
# remained and the test sets of different i could overlap.)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    Atrain[i] = X.iloc[train_index, :]
    Atest[i] = X.iloc[test_index, :]
    ytest[i] = y[test_index]
    ytrain[i] = y[train_index]
    # Mean of the test labels of this fold.
    proprestest[i] = sum(ytest[i])/len(ytest[i])
def main(config: DictConfig) -> None:
    """Stack first-level predictions with a linear SVR per target.

    Out-of-fold predictions of the first-level models (train) and their
    per-fold predictions (test) are collected into feature tables together
    with the output of ``make_domain_feature``. For every label column the
    SVR regularisation strength ``C`` is chosen by 5-fold cross-validation
    (folds stratified on age decade), the model is refitted per fold with
    the best ``C``, and the resulting train/test predictions and the
    submission file are written to ``config.store.result_path``.
    """
    prepair_dir(config)
    set_seed(config.data.seed)
    label_cols = [
        "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"
    ]
    train_dfs, names = load_train_data(config.store.workdir)
    test_dfs = load_test_data(config.store.workdir)

    # First-level predictions ("<model>_<label>_pred") excluded from stacking.
    remove_cols = [
        "knn_age_pred",
        "knn_domain1_var1",
        "densenet121_age_pred",
        "densenet121_domain1_var1_pred",
        "densenet121_domain1_var2_pred",
        "densenet121_domain2_var2_pred",
        "3dcnn_resnet18_domain1_var2_pred",
        "3dcnn_resnet18_domain2_var1_pred",
        "3dcnn_resnet18_domain2_var2_pred",
        "1dresnet18_domain1_var1_pred",
        "1dresnet18_domain1_var2_pred",
        "1dresnet18_domain2_var2_pred",
        "simple_3dcnn_domain1_var1_pred",
        "simple_3dcnn_domain1_var2_pred",
        "simple_3dcnn_domain2_var2_pred",
        "transformer_domain2_var1_pred",
        "transformer_domain2_var2_pred",
        "transformer_domain1_var1_pred",
        "transformer_domain1_var2_pred",
        "lgbm_gnn_feature_domain1_var2_pred",
        "lgbm_gnn_feature_domain2_var2_pred",
        "lgbm_gnn_featured_domain1_var2_pred",
        "lgbm_gnn_featured_domain2_var2_pred",
        "lgbm_cnn_feature_domain1_var2_pred",
        "lgbm_cnn_feature_domain2_var2_pred",
        "lgbm_2plus1dcnn_feature_domain1_var2_pred",
        "lgbm_2plus1dcnn_feature_domain2_var2_pred",
        "xgb_2plus1dcnn_feature_age_pred",
        "xgb_2plus1dcnn_feature_domain1_var2_pred",
        "xgb_2plus1dcnn_feature_domain2_var2_pred",
        "simple_3dcnn_domain2_var1_pred",
        "simple_3dcnn_3label_domain1_var2_pred",
        "gin_domain1_var1_pred",
        "gin_domain2_var1_pred",
        "2plus1dcnn_resnet10_domain1_var2_pred",
        "resnest14d_domain1_var1_pred",
        "resnest14d_domain1_var2_pred",
        "resnest14d_domain2_var2_pred",
    ]

    train_ft_dict = {}
    test_ft_dict = {}
    feature_cols = []
    train_ft_dict["Id"] = train_dfs[0]["Id"]
    test_ft_dict["Id"] = test_dfs[0]["Id"]
    for label_col in label_cols:
        train_ft_dict[label_col] = train_dfs[0][label_col]

    # Train side: one out-of-fold prediction column per (model, label).
    for name, df in zip(names, train_dfs):
        for label_col in label_cols:
            pred_col = f"{label_col}_pred"
            stacked_col = f"{name}_{label_col}_pred"
            if pred_col in df.columns and stacked_col not in remove_cols:
                train_ft_dict[stacked_col] = df[pred_col]
                feature_cols += [stacked_col]
            elif stacked_col in remove_cols and pred_col in df.columns:
                # Same column-exists guard as on the test side below, so an
                # excluded model that lacks the column no longer raises
                # KeyError.
                df.drop(pred_col, axis=1, inplace=True)
        feat_dict = make_domain_feature(df, mode="train", name=name)
        train_ft_dict.update(feat_dict)
        feature_cols += list(feat_dict.keys())

    # Test side: one prediction column per (model, label, fold).
    for name, df in zip(names, test_dfs):
        for label_col in label_cols:
            for i in range(5):
                fold_col = f"{label_col}_pred_fold{i}"
                if (fold_col in df.columns
                        and f"{name}_{label_col}_pred" not in remove_cols):
                    test_ft_dict[f"{name}_{label_col}_pred_fold{i}"] = df[
                        fold_col]
                elif (f"{name}_{label_col}_pred" in remove_cols
                      and fold_col in df.columns):
                    df.drop(fold_col, axis=1, inplace=True)
        feat_dict = make_domain_feature(df, mode="test", name=name)
        test_ft_dict.update(feat_dict)

    train_df = pd.DataFrame(train_ft_dict)
    test_df = pd.DataFrame(test_ft_dict)
    train_df["age"] = (
        pd.read_csv(f"{config.store.workdir}/input/train_scores.csv"
                    ).sort_values("Id").reset_index(drop=True)["age"])
    # Age floored to the decade; used only as the stratification target.
    age_rank = train_df["age"].values // 10 * 10
    skf = StratifiedKFold(n_splits=5, random_state=777, shuffle=True)

    train_df, test_df = preprocess(train_df, test_df, feature_cols)
    for feature_col in feature_cols:
        # Assign the filled column back: ``fillna(inplace=True)`` on a column
        # selection acts on a temporary under pandas Copy-on-Write and would
        # leave the frame unchanged.
        train_df[feature_col] = train_df[feature_col].fillna(0)
        test_df[feature_col] = test_df[feature_col].fillna(0)

    train_df = cudf.from_pandas(train_df)
    test_df = cudf.from_pandas(test_df)
    if config.randomize_age:
        set_seed(777_777_777)
        train_df["age"] += np.array(
            [randomize_age(age) for age in train_df["age"].values])
    train_df = train_df.reset_index(drop=True)

    logger.info("=" * 10 + "parameter search" + "=" * 10)
    best_c = {}
    for label_col in label_cols:
        best = np.inf
        if label_col == "age":
            # Age is stacked only from features whose name contains "age".
            feature_cols_ = [
                col for col in feature_cols if f"{label_col}" in col
            ]
        else:
            feature_cols_ = feature_cols
        for c in [2**(i) for i in range(-14, 1)]:
            y_oof = np.zeros(train_df.shape[0])
            for n_fold, (train_index,
                         val_index) in enumerate(skf.split(age_rank,
                                                           age_rank)):
                train_df_fold = train_df.iloc[train_index]
                valid_df_fold = train_df.iloc[val_index]
                # Rows without a target cannot be used for fitting.
                train_df_fold = train_df_fold[
                    train_df_fold[label_col].notnull()]
                model = SVR(kernel="linear", C=c, cache_size=3000.0)
                model.fit(train_df_fold[feature_cols_],
                          train_df_fold[label_col])
                y_oof[val_index] = model.predict(
                    valid_df_fold[feature_cols_]).to_array()
                # No test-set prediction here: only the out-of-fold score
                # decides C, and the test columns are written for every
                # label and fold by the final fit below.
            train_df[f"{label_col}_pred"] = y_oof
            notnull_idx = train_df[label_col].notnull()
            score = normalized_absolute_errors(
                train_df[notnull_idx][label_col].values,
                train_df[notnull_idx][f"{label_col}_pred"].values,
            )
            logger.info(f"c={c}, {label_col}: {score}")
            if score <= best:
                best = score
                best_c[label_col] = c

    logger.info("=" * 10 + "prediction" + "=" * 10)
    for label_col in label_cols:
        y_oof = np.zeros(train_df.shape[0])
        if label_col == "age":
            feature_cols_ = [
                col for col in feature_cols if f"{label_col}" in col
            ]
        else:
            feature_cols_ = feature_cols
        for n_fold, (train_index,
                     val_index) in enumerate(skf.split(age_rank, age_rank)):
            train_df_fold = train_df.iloc[train_index]
            valid_df_fold = train_df.iloc[val_index]
            train_df_fold = train_df_fold[train_df_fold[label_col].notnull()]
            model = SVR(kernel="linear",
                        C=best_c[label_col],
                        cache_size=3000.0)
            model.fit(train_df_fold[feature_cols_], train_df_fold[label_col])
            y_oof[val_index] = model.predict(
                valid_df_fold[feature_cols_]).to_array()
            test_df[f"{label_col}_pred_fold{n_fold}"] = model.predict(
                test_df[feature_cols_])
        train_df[f"{label_col}_pred"] = y_oof
        notnull_idx = train_df[label_col].notnull()
        score = normalized_absolute_errors(
            train_df[notnull_idx][label_col].values,
            train_df[notnull_idx][f"{label_col}_pred"].values,
        )
        # Log the C actually used for this label, not the loop variable left
        # over from the search above.
        logger.info(f"c={best_c[label_col]}, {label_col}: {score}")

    # Weighted sum of the per-label errors.
    score = 0
    for label_col, weight in zip(label_cols,
                                 [0.3, 0.175, 0.175, 0.175, 0.175]):
        notnull_idx = train_df[label_col].notnull()
        score += (normalized_absolute_errors(
            train_df[notnull_idx][label_col].to_array(),
            train_df[notnull_idx][f"{label_col}_pred"].to_array(),
        ) * weight)
    logger.info(f"all: {score}")

    train_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_train.csv"),
        index=False,
    )
    test_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_test.csv"),
        index=False,
    )
    if config.store.gcs_project is not None:
        upload_directory(config.store)

    sub_df = make_submission(test_df)
    sub_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_submission.csv"),
        index=False,
    )
x = range(1, min(len_max, n_pre_subs) + 1) plt.cla() plt.plot(x, means, label='mean blend') plt.plot(x, medians, label='median blend', color='r') plt.text(x[-1], means[-1], str(means[-1])) plt.text(x[-1], medians[-1], str(medians[-1])) plt.legend() plt.xlabel('# pre-subs') plt.ylabel('score') plt.title('score vs # pre-subs') plt.savefig('output/validation.png') folds = StratifiedKFold(n_splits=N_VALIDATION_SPLITS, shuffle=True, random_state=int(time())) for N_VALIDATION_SPLITS_iter, (index_train, index_valid) in enumerate(folds.split(X, y)): X_train, y_train = X.iloc[index_train], y.iloc[index_train] X_valid, y_valid = X.iloc[index_valid], y.iloc[index_valid] print('\n====== validation {}/{} ======='.format( N_VALIDATION_SPLITS_iter + 1, N_VALIDATION_SPLITS)) print('#presubs|\tscore (mean / median)') predictions_list = [] scores_val_mean = [] scores_val_median = []
# Convert the DataFrame to an array for model training.
x_train = x_train.values

#################
# LOGISTIC REGRESSION
#################
print('\nLOGISTIC REGRESSION')

# Grid search
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

CV_acc = 0
best_c = -1000
C = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]
kfolds = StratifiedKFold(n_splits = 10, shuffle = True)

# Try every candidate value of C.
for c in C:
    # One slot per training row. Predictions are written back at the row's
    # own position: the folds are shuffled and stratified, so appending them
    # in fold order would not line up with y_train when scoring below.
    predicted = [None] * len(y_train)
    lr_gs = LogisticRegression(solver = 'liblinear', C = c)

    # 10-fold cross validation: each row is predicted exactly once, by the
    # model trained on the other nine folds.
    for training, testing in kfolds.split(x_train, y_train):
        lr_gs.fit(x_train[training], y_train[training])
        pred = lr_gs.predict(x_train[testing])
        for row, p in zip(testing, pred):
            predicted[row] = p
    score = accuracy_score(y_train, predicted)
def run_imli(DATASET_NAME, ID_HOLDOUT, CATEGORICAL_FEATURES):
    """Grid-search IMLI hyper-parameters by 10-fold CV, retrain, and report.

    Reads ``<name>.csv``, ``<name>-<id>-train.csv`` and ``<name>-<id>-test.csv``
    from the hard-coded hold-out directory (last column is the label, all
    other columns are features). Every combination of lambda, clause count,
    rule type and partition size is cross-validated on the train/validation
    file. The combination with the highest mean validation accuracy is
    retrained on the whole train/validation file and evaluated on the test
    file. All results are written to stdout; line prefixes ``l:``, ``r:``
    and ``t:`` mark per-fold, per-configuration and final records.

    Args:
        DATASET_NAME: dataset name used to build the directory and file names.
        ID_HOLDOUT: integer id of the hold-out split (formatted with ``%d``).
        CATEGORICAL_FEATURES: iterable of column ids; each is cast with ``int``.

    Returns:
        None.

    Raises:
        RuntimeError: if the search did not evaluate any configuration.
    """
    import time

    import numpy as np
    import pandas as pd
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.model_selection import StratifiedKFold

    from IMLI import IMLI

    def _split_xy(frame):
        # Last column is the label, everything before it is a feature.
        matrix = frame.values
        return matrix[:, :-1], matrix[:, -1]

    def _partition_bounds(n_samples, n_parts):
        # [smallest, largest] partition size when n_samples rows are spread
        # over n_parts partitions.
        return [n_samples // n_parts, (n_samples + n_parts - 1) // n_parts]

    # Load Dataset
    HOLDOUT_DIR = "/root/skripsi/imli/experiment/%s/holdout/" % DATASET_NAME
    df_total = pd.read_csv(HOLDOUT_DIR + ("%s.csv" % DATASET_NAME))
    df_train_val = pd.read_csv(HOLDOUT_DIR + ("%s-%d-train.csv" % (DATASET_NAME, ID_HOLDOUT)))
    df_test = pd.read_csv(HOLDOUT_DIR + ("%s-%d-test.csv" % (DATASET_NAME, ID_HOLDOUT)))

    X_total, y_total = _split_xy(df_total)
    X_train_val, y_train_val = _split_xy(df_train_val)
    X_test, y_test = _split_xy(df_test)

    # Parameters
    k_cv = 10
    lambdas = [5, 10]
    n_clauses = [1, 2, 3]
    rule_types = ["CNF", "DNF"]
    partition_sizes = [8, 16, 32, 64, 128]
    is_floors = [True, True, True, True, True]
    solver = "/root/open-wbo/open-wbo"
    timeout = 1000

    # Features are generated from the complete dataset file.
    features = IMLI().generate_features(
        X_total, y_total,
        categorical_features_id=[int(u) for u in CATEGORICAL_FEATURES],
        discretizer="entropy"
        # n_thresholds=9
    )

    # Testing
    print("l: lamda, n_clause, rule_type, id_cv, train_size, val_size, n_partitions, real_partition_size, n_features, training_time, val_accuracy, rule_size, rule, classification_report_train, classification_report_val")

    # Starts below any attainable accuracy so the first evaluated
    # configuration is always recorded, even if its accuracy is 0.
    best_mean_val_acc = float("-inf")  # to choose hyperparameter (lamda, n_clause, rule_type)
    chosen_val_accs = []               # list of cv val accs from chosen hyperparameter
    chosen_params = {}                 # chosen hyperparameter
    chosen_training_times = []         # list of training time from chosen hyperparameter

    # finding best hyperparameter
    for lamda in lambdas:
        for n_clause in n_clauses:
            for rule_type in rule_types:
                for partition_size, is_floor in zip(partition_sizes, is_floors):
                    skf = StratifiedKFold(n_splits=k_cv, shuffle=True, random_state=42)
                    val_accs = []
                    training_times = []
                    n_partitions_s = []

                    # cross validation start
                    for id_cv, (train_id, val_id) in enumerate(skf.split(X_train_val, y_train_val)):
                        N = train_id.shape[0]
                        if is_floor:
                            # partition_size is the minimum allowed partition_size
                            n_partitions = N // partition_size
                        else:
                            n_partitions = (N + partition_size - 1) // partition_size
                        # A fold smaller than partition_size would give zero
                        # partitions and a division by zero just below.
                        n_partitions = max(1, n_partitions)
                        n_partitions_s.append(n_partitions)

                        train_size = N
                        val_size = val_id.shape[0]
                        real_partition_size = _partition_bounds(N, n_partitions)

                        imli = IMLI(
                            n_clauses=n_clause,
                            lamda=lamda,
                            rule_type=rule_type,
                            solver=solver,
                            n_partitions=n_partitions,
                            timeout=timeout)

                        start_time = time.time()
                        imli.fit(X_train_val[train_id], y_train_val[train_id], features)
                        end_time = time.time()
                        training_time = end_time - start_time
                        training_times.append(training_time)

                        # report validation data
                        y_true = y_train_val[val_id]
                        y_pred = imli.predict(X_train_val[val_id])
                        accuracy = accuracy_score(y_true, y_pred)
                        val_accs.append(accuracy)
                        classification_report_val = classification_report(
                            y_true, y_pred, output_dict=True, zero_division=0)

                        # report training data
                        y_true_train = y_train_val[train_id]
                        y_pred_train = imli.predict(X_train_val[train_id])
                        classification_report_train = classification_report(
                            y_true_train, y_pred_train, output_dict=True, zero_division=0)

                        n_features = imli.n_features
                        rule = imli.get_rule()
                        rule_size = imli.get_rule_size()

                        # lamda, n_clause, rule_type, id_cv, train_size, val_size, n_partitions, real_partition_size, n_features, training_time, val_accuracy, rule_size, rule, classification_report_train, classification_report_val
                        # Commas inside values are replaced so the record
                        # stays comma-separated.
                        print("l: %d,%d,%s,%d,%d,%d,%d,%s,%d,%f,%f,%d,%s,%s,%s" % (
                            lamda, n_clause, rule_type, id_cv,
                            train_size, val_size, n_partitions,
                            str(real_partition_size).replace(',', ';'),
                            n_features, training_time, accuracy, rule_size, rule,
                            str(classification_report_train).replace(',', ';'),
                            str(classification_report_val).replace(',', ';')
                        ))
                    # cross validation done

                    mean_val_acc_cv = np.mean(val_accs)
                    std_val_acc_cv = np.std(val_accs)
                    mean_training_time = np.mean(training_times)
                    std_training_time = np.std(training_times)

                    # n_partitions: keep the value used by most folds
                    (values, counts) = np.unique(n_partitions_s, return_counts=True)
                    n_partitions = values[counts.argmax()]

                    params = {
                        'lamda': lamda,
                        'n_clause': n_clause,
                        'rule_type': rule_type,
                        # taken from the last fold of this configuration
                        'real_partition_size': real_partition_size,
                        'n_partitions': n_partitions
                    }
                    print("r: ---> Params: %s, (mean val acc cv) %f (+- %f), (mean training time) %f (+- %f)" % (
                        str(params),
                        mean_val_acc_cv, std_val_acc_cv,
                        mean_training_time, std_training_time
                    ))

                    if mean_val_acc_cv > best_mean_val_acc:
                        best_mean_val_acc = mean_val_acc_cv
                        chosen_val_accs = val_accs
                        chosen_params = params
                        chosen_training_times = training_times

    if not chosen_params:
        raise RuntimeError("hyperparameter search did not evaluate any configuration")

    # retrain using best hyperparameter
    best_model = IMLI(
        n_clauses=chosen_params['n_clause'],
        lamda=chosen_params['lamda'],
        rule_type=chosen_params['rule_type'],
        solver=solver,
        n_partitions=chosen_params['n_partitions'],
        timeout=timeout)
    start_time = time.time()
    best_model.fit(X_train_val, y_train_val, features)
    end_time = time.time()
    retrain_training_time = end_time - start_time

    # report test
    y_pred_test = best_model.predict(X_test)
    y_true_test = y_test
    test_acc_holdout = accuracy_score(y_true_test, y_pred_test)
    test_classification_report_holdout = classification_report(
        y_true_test, y_pred_test, output_dict=True, zero_division=0)

    # report train val
    y_pred_train_val = best_model.predict(X_train_val)
    y_true_train_val = y_train_val
    train_val_classification_report_holdout = classification_report(
        y_true_train_val, y_pred_train_val, output_dict=True, zero_division=0)

    N = X_train_val.shape[0]
    n_partitions = chosen_params['n_partitions']
    real_partition_size = _partition_bounds(N, n_partitions)

    print("t: ------------------")
    print("t: HOLDOUT %d" % ID_HOLDOUT)
    print("t: size training:", X_train_val.shape)
    print("t: size testing:", X_test.shape)
    print("t: test_acc_holdout: %f" % test_acc_holdout)
    print("t: ")
    print("t: best_val_accs:", chosen_val_accs)
    print("t: best_mean_val_acc: %f (+- %f)" % (np.mean(chosen_val_accs), np.std(chosen_val_accs)))
    print("t: chosen_params: {}".format(chosen_params))
    print("t: ")
    print("t: chosen_cv_training_time: %s" % chosen_training_times)
    print("t: chosen_mean_cv_training_time: %f (+- %f)" % (np.mean(chosen_training_times), np.std(chosen_training_times)))
    print("t: retrain_training_time: %f" % retrain_training_time)
    print("t: ")
    print("t: best_model_rule_size (retrained with params): %d" % best_model.get_rule_size())
    print("t: best_model_rule (retrained with params):")
    print("t: " + best_model.get_rule())
    print("t: ")
    print("t: n_features (retrained with params): %d" % best_model.n_features)
    print("t: real_partition_size (retrained with params): %s" % real_partition_size)
    print("t: ")
    print("t: test_classification_report_holdout: %s" % str(test_classification_report_holdout).replace(',', ';'))
    print("t: ")
    print("t: train_val_classification_report_holdout: %s" % str(train_val_classification_report_holdout).replace(',', ';'))
# Report class balance and feature shapes before resampling.
print('Total train data len: ' + str(len(train_labels)) + ' | Positive samples: ' + str(sum(train_labels)))
print('Total test data len: ' + str(len(test_labels)) + ' | Positive samples: ' + str(sum(test_labels)))
print('Train Features shape ', train_features.shape)
print('Test Features shape ', test_features.shape)

# Oversample the training set with SMOTE; the test set is left untouched.
# NOTE(review): resampling happens before the folds are built below, so
# synthetic rows can end up in a different fold than the rows they were
# generated from - confirm this is acceptable for the reported CV metrics.
oversample = SMOTE()
train_features, train_labels = oversample.fit_resample(train_features, train_labels)
print('After Up sampling')
print('Total train data len: ' + str(len(train_labels)) + ' | Positive samples: ' + str(sum(train_labels)))
print('Train Features shape ', train_features.shape)

# Run the same stratified 5-fold evaluation once per SVM kernel.
for kernel_ in ["poly", "rbf"]:
    print('***********************************', kernel_, '***********************************')
    k = 5
    # Unshuffled folds: the split is the same for both kernels.
    kf = StratifiedKFold(n_splits=k, random_state=None, shuffle=False)
    # One estimator per kernel; it is refitted from scratch on every fold.
    model = svm.SVC(kernel=kernel_, gamma='auto')
    for e, (train_idx, test_idx) in enumerate(kf.split(train_features, train_labels)):
        print(' ---------- KFOLD ', e)
        tr_features, tr_labels = train_features[train_idx], train_labels[train_idx]
        te_features, te_labels = train_features[test_idx], train_labels[test_idx]
        model.fit(tr_features, tr_labels)

        # Metrics on the rows the model was just fitted on (training metrics).
        predictions = model.predict(tr_features)
        train_metrics = accuracy_fn(predictions, tr_labels, threshold=threshold)
        # Prefix every metric name with 'train_'. The comprehension's `k` is
        # local to the comprehension and does not overwrite the fold count.
        train_metrics = {'train_' + k: v for k, v in train_metrics.items()}
        print(f'***** Train Metrics ***** ')
        print(
            f"Accuracy: {'%.5f' % train_metrics['train_accuracy']} "
            f"| UAR: {'%.5f' % train_metrics['train_uar']}| F1:{'%.5f' % train_metrics['train_f1']} "
            f"| Precision:{'%.5f' % train_metrics['train_precision']} "
            f"| Recall:{'%.5f' % train_metrics['train_recall']} | AUC:{'%.5f' % train_metrics['train_auc']}")
# Inspect the raw "Embarked" column before encoding it.
print(data["Embarked"].unique())
print(data["Embarked"].value_counts())

# Encode the embarkation port as an integer code; missing values become 3.
data.loc[data["Embarked"] == "S", "Embarked"] = 0
data.loc[data["Embarked"] == "C", "Embarked"] = 1
data.loc[data["Embarked"] == "Q", "Embarked"] = 2
data["Embarked"] = data["Embarked"].fillna(3)
# data.loc[data["Embarked"]==None,"Embarked"]=3

# Inspect the column again after encoding.
print(data["Embarked"].describe())
print(data["Embarked"].unique())
print(data["Embarked"].value_counts())

print("--------------LogisticRegression---------------")
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
train = data[predictors].values
y = data["Survived"]
print(train)
# print(train.describe())

# Stratified split. `random_state` is only meaningful together with
# `shuffle=True`; passing it alone raises ValueError on scikit-learn >= 0.24
# and was silently ignored before that. Dropping it keeps the same
# deterministic, unshuffled folds on every version. Use
# `shuffle=True, random_state=1` instead if seeded shuffling is wanted.
kfold = StratifiedKFold(n_splits=3)
scores = []
# for train_index, test_index in kfold.split(train,y):
#     # print("Train Index:", train_index, ",Test Index:", test_index)
#     X_train,X_test=train[train_index],train[test_index]
#     y_train,y_test=y[train_index],y[test_index]
#     lr = LogisticRegression()
#     lr.fit(X_train,y_train)
#     score = lr.score(X_test,y_test)
#     scores.append(score)
#     print(score)

print(data["Age"])
# Show "Age" cut into 5 quantile-based bins.
print(pd.qcut(data['Age'], 5))
(rows, cols) = (local_labels == i).nonzero() samples = len(rows) for p in range(samples): data[cont, :] = img[rows[p], cols[p], :].flatten() labels[cont] = i - 1 cont += 1 data /= 255 crossval_splits = 5 accuracy = numpy.zeros(crossval_splits) sensitivity = numpy.zeros(crossval_splits) specificity = numpy.zeros(crossval_splits) cont = 0 skf = StratifiedKFold(n_splits=crossval_splits, shuffle=True, random_state=123) skf.get_n_splits(data, labels) for train_index, test_index in skf.split(data, labels): train_data, test_data = data[train_index], data[test_index] train_labels, test_labels = labels[train_index], labels[test_index] #XGB Classifier model = XGBClassifier(use_label_encoder=False, booster='gbtree', random_state=123) model.fit(train_data, train_labels) #Compute scores pred = model.predict(test_data) predictions = [round(value) for value in pred] predictions = numpy.asarray(predictions)
random_state=1, shuffle=True) # Spot Check Algorithms models = [] models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr'))) models.append(('LDA', LinearDiscriminantAnalysis())) models.append(('KNN', KNeighborsClassifier())) models.append(('CART', DecisionTreeClassifier())) models.append(('NB', GaussianNB())) models.append(('SVM', SVC(gamma='auto'))) # evaluate each model in turn results = [] names = [] for name, model in models: kfold = StratifiedKFold(n_splits=10, random_state=1) cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy') results.append(cv_results) names.append(name) print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std())) # Compare Algorithms pyplot.boxplot(results, labels=names) pyplot.title('Algorithm Comparison') pyplot.show() # Make predictions on validation dataset model = SVC(gamma='auto')
test_size=0.25, random_state=100) # In[13]: clf = RandomForestClassifier(n_estimators=200, class_weight='balanced', max_depth=16, max_features="auto", random_state=25) #clf = LogisticRegression(max_iter=1000, random_state=42) clf.fit(X_train, y_train) # In[14]: scores = cross_val_score(clf, X_train, y_train, cv=StratifiedKFold(5)) print("Train Acc: ", scores.mean()) # In[15]: print("Test Acc:", clf.score(X_test, y_test)) y_pred = clf.predict(X_test) y_proba = clf.predict_proba(X_test) print("Confusion Matrix") print(confusion_matrix(y_test, y_pred)) # In[16]: print(classification_report(y_test, y_pred)) fpr1, tpr1, thresholds = metrics.roc_curve(y_test, y_proba[:, 1]) plt.plot(fpr1, tpr1)
else: return 7 def histcoverage(coverage): histall = np.zeros((1,8)) for c in coverage: histall[0,c] += 1 return histall train_df["coverage"] = train_df.masks.map(np.sum) / pow(img_size_target, 2) train_df["coverage_class"] = train_df.masks.map(get_mask_type) train_all = [] evaluate_all = [] skf = StratifiedKFold(n_splits=cv_total, random_state=1234, shuffle=True) for train_index, evaluate_index in skf.split(train_df.index.values, train_df.coverage_class): train_all.append(train_index) evaluate_all.append(evaluate_index) print(train_index.shape,evaluate_index.shape) # the shape is slightly different in different cv, it's OK def get_cv_data(cv_index): train_index = train_all[cv_index-1] evaluate_index = evaluate_all[cv_index-1] x_train = np.array(train_df.images[train_index].map(upsample).tolist()).reshape(-1, img_size_target, img_size_target, 1) y_train = np.array(train_df.masks[train_index].map(upsample).tolist()).reshape(-1, img_size_target, img_size_target, 1) x_valid = np.array(train_df.images[evaluate_index].map(upsample).tolist()).reshape(-1, img_size_target, img_size_target, 1) y_valid = np.array(train_df.masks[evaluate_index].map(upsample).tolist()).reshape(-1, img_size_target, img_size_target, 1) return x_train,y_train,x_valid,y_valid """#### Show some examples of different mask"""
def model_evaluate(model_name, x, y, epoch_num):
    """Cross-validate a model signal by signal and return the mean ROC AUC.

    Rows of ``x``/``y`` are treated as consecutive groups of ``epoch_num``
    rows; the folds are built on the first row of each group and expanded
    back to all rows of the group, so a group is never split between train
    and test.
    NOTE(review): this assumes len(y) is a multiple of epoch_num and that
    all rows of a group share one label - confirm against the caller.

    Args:
        model_name: passed to ``model_create`` to build a fresh classifier
            for every fold.
        x: feature array, indexable by an integer index array.
        y: label array aligned with ``x``.
        epoch_num: number of consecutive rows per group.

    Returns:
        Tuple ``(mean_auc, best_classifier, probabilities, y_r,
        probabilities_epoch)``: AUC of the fold-averaged ROC curve, the
        fitted classifier of the fold with the highest AUC (``None`` if no
        fold scored above 0), the per-group out-of-fold values returned by
        ``prob_decide`` (first and second output), and the per-group labels.
    """
    cv = StratifiedKFold(n_splits=6, shuffle=False)
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)

    # Single-argument print calls produce the same output under the Python 2
    # print statement and the Python 3 print function.
    print('Cross validation')

    # One representative row (the first) per group.
    i_reduced = np.arange(0, len(y), epoch_num)
    x_r = x[i_reduced]
    y_r = y[i_reduced]

    probabilities = np.empty(len(y_r))
    probabilities_epoch = np.empty([len(y_r), epoch_num])

    # Row offsets inside one group; group g owns rows g*epoch_num + offsets.
    offsets = np.arange(epoch_num)

    roc_auc_max = 0
    best_classifier = None
    for i, (train, test) in enumerate(cv.split(x_r, y_r)):
        # Expand group indexes to row indexes so that all observations from
        # one signal are present in either train or test.
        train_full = (train[:, None] * epoch_num + offsets).ravel()
        test_full = (test[:, None] * epoch_num + offsets).ravel()

        classifier = model_create(model_name)
        classifier = model_fit(classifier, x[train_full], y[train_full])

        probas = model_predict(classifier, x[test_full])
        p, p_x = prob_decide(probas[:, 1], epoch_num)
        probabilities[test] = p
        probabilities_epoch[test, :] = p_x

        # ROC curve of this fold, resampled onto the shared FPR grid so the
        # folds can be averaged. np.interp is what the removed scipy.interp
        # alias pointed to.
        fpr, tpr, thresholds = roc_curve(y_r[test], p)
        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        if roc_auc > roc_auc_max:
            roc_auc_max = roc_auc
            best_classifier = classifier
        print('Iteration #' + str(i) + ': AUC = ' + str(roc_auc))

    mean_tpr /= cv.get_n_splits(x, y)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    # Two spaces after '=' reproduce the original two-item print statement,
    # which put a separator space after the label's own trailing space.
    print('Mean AUC =  ' + str(mean_auc))
    return mean_auc, best_classifier, probabilities, y_r, probabilities_epoch
# Seed NumPy's global generator so repeated runs are comparable.
seed = 7
numpy.random.seed(seed)

# Read the sonar data: 60 numeric feature columns followed by the class label.
dataframe = read_csv("sonar.csv", header=None)
dataset = dataframe.values
X = dataset[:, :60].astype(float)
Y = dataset[:, 60]

# Map the class labels to integer codes.
encoder = LabelEncoder()
encoded_Y = encoder.fit(Y).transform(Y)


def create_baseline():
    """Return a compiled 60-60-1 fully connected binary classifier."""
    # NOTE(review): `init=` (and `nb_epoch=` below) are Keras 1 argument
    # names; Keras 2 spells them `kernel_initializer=` / `epochs=` - confirm
    # against the installed Keras version.
    model = Sequential([
        Dense(60, input_dim=60, init='normal', activation='relu'),
        Dense(1, init='normal', activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# Standardize inside the pipeline so the scaler is fitted on each training
# fold only, then score the network with stratified 10-fold CV.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(build_fn=create_baseline, nb_epoch=100, batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
mean_pct = results.mean()*100
std_pct = results.std()*100
print("Standardized: %.2f%% (%.2f%%)" % (mean_pct, std_pct))
# Rows whose first column is 'S' are overwritten with 1, then the single
# column is renamed to 'targets' and the rows are reordered by `ls2`.
GDSCR.loc[GDSCR.iloc[:, 0] == 'S'] = 1
GDSCR.columns = ['targets']
GDSCR = GDSCR.loc[ls2, :]

# Candidate values sampled by the random search below.
ls_mb_size = [13, 30, 64]
ls_h_dim = [1023, 512, 256, 128, 64, 32, 16]
ls_marg = [0.5, 1, 1.5, 2, 2.5]
ls_lr = [0.5, 0.1, 0.05, 0.01, 0.001, 0.005, 0.0005, 0.0001, 0.00005, 0.00001]
ls_epoch = [20, 50, 10, 15, 30, 40, 60, 70, 80, 90, 100]
ls_rate = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
ls_wd = [0.01, 0.001, 0.1, 0.0001]
ls_lam = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]

Y = GDSCR['targets'].values

# `random_state` without `shuffle=True` raises ValueError on
# scikit-learn >= 0.24 (it was ignored before), so it is dropped; the folds
# stay deterministic and unshuffled, i.e. identical for every iteration.
skf = StratifiedKFold(n_splits=5)

# Each iteration draws one random hyper-parameter configuration.
for iters in range(max_iter):
    k = 0
    mbs = random.choice(ls_mb_size)
    hdm1 = random.choice(ls_h_dim)
    hdm2 = random.choice(ls_h_dim)
    hdm3 = random.choice(ls_h_dim)
    mrg = random.choice(ls_marg)
    lre = random.choice(ls_lr)
    lrm = random.choice(ls_lr)
    lrc = random.choice(ls_lr)
    lrCL = random.choice(ls_lr)
    epch = random.choice(ls_epoch)
    rate1 = random.choice(ls_rate)
    rate2 = random.choice(ls_rate)
"コンロ3口", "コンロ4口以上", "コンロ設置可(コンロ1口)", "コンロ設置可(コンロ2口)", "コンロ設置可(コンロ3口)", "コンロ設置可(コンロ4口以上)", "コンロ設置可(口数不明)", "システムキッチン", "冷蔵庫あり", "独立キッチン", "給湯", "電気コンロ", "BSアンテナ", "CATV", "CSアンテナ", "インターネット使用料無料", "インターネット対応", "光ファイバー", "有線放送", "高速インターネット", "24時間換気システム", "2面採光", "3面採光", "ウォークインクローゼット", "エアコン付", "エレベーター", "オール電化", "ガスその他", "ガス暖房", "クッションフロア", "シューズボックス", "タイル張り", "トランクルーム", "バリアフリー", "バルコニー", "フローリング", "プロパンガス", "ペアガラス", "ルーフバルコニー", "ロフト付き", "下水", "二世帯住宅", "二重サッシ", "井戸", "公営水道", "冷房", "出窓", "地下室", "室内洗濯機置場", "室外洗濯機置場", "専用庭", "床下収納", "床暖房", "排水その他", "敷地内ごみ置き場", "水道その他", "汲み取り", "洗濯機置場なし", "浄化槽", "石油暖房", "都市ガス", "防音室", "bicycle_parking", "car_parking", "bike_parking", "structure", "fixed_term" ] #################### ## Train model #################### folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) oof = np.zeros(len(train)) predictions = np.zeros(len(test)) feature_importance_df = pd.DataFrame() for fold, (train_idx, val_idx) in enumerate(folds.split(train, train["town_cleaned"])): print(f"Fold {fold+1}") train_data = lgb.Dataset(train.iloc[train_idx][use_cols], label=log_target[train_idx], categorical_feature=categorical_cols) val_data = lgb.Dataset(train.iloc[val_idx][use_cols], label=log_target[val_idx], categorical_feature=categorical_cols) num_round = N_ROUNDS callbacks = [log_evaluation(logger, period=100)]
def loadDataset():
    """Load the feature matrix and the flat label vector from ./data.

    Returns:
        Tuple ``(X, y)``: the contents of ``data_0.csv`` as a 2-D array and
        the contents of ``labels.csv`` flattened to 1-D, which is the layout
        the classifiers expect.
    """
    # data used for the predictions
    dfData = read_csv("./data/data_0.csv", header=None, sep=',')
    dfLabels = read_csv("./data/labels.csv", header=None)
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and exists in every pandas version.
    return dfData.values, dfLabels.values.ravel()


plt.figure(figsize=(12, 12))
X, y = loadDataset()

numberOfFolds = 10
skf = StratifiedKFold(n_splits=numberOfFolds, shuffle=True)
# Materialize the (train, test) index pairs once so they can be reused.
indexes = list(skf.split(X, y))

# Number of classes, taking labels to be 0..max(y).
# NOTE(review): assumes integer labels starting at 0 - confirm.
labels = np.max(y) + 1
yTest = []
yNew = []
cMatrix = np.zeros((labels, labels))
countCorrect = 0
for train_index, test_index in indexes:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # let's normalize, anyway
    # MinMaxScaler StandardScaler Normalizer
    scaler = StandardScaler()
start_time = time.time()

estimator = lgb.LGBMClassifier(nthread=3, silent=True)

# Candidate values per hyper-parameter; only n_estimators and num_leaves
# actually vary, everything else is pinned to a single value.
param_grid = dict(
    learning_rate=[0.002],
    n_estimators=[4000, 5000],
    num_leaves=[20, 30, 40],
    # max_depth=[-1],
    boosting_type=['gbdt'],
    objective=['binary'],
    seed=[777],
    colsample_bytree=[0.8],
    subsample=[0.8],
    reg_alpha=[0],
    reg_lambda=[0],
)

# Try 2 sampled configurations, each scored by ROC AUC on 4 stratified folds.
# NOTE(review): the search itself has no random_state, so which
# configurations get sampled can differ between runs - confirm this is
# intended.
cv = StratifiedKFold(n_splits=4)
gbm = RandomizedSearchCV(
    estimator,
    param_distributions=param_grid,
    cv=cv,
    scoring='roc_auc',
    n_iter=2,
)
gbm.fit(train_data_x, train_data_y)

# Wall-clock seconds spent on the whole search.
elapsed = time.time() - start_time
print(elapsed)

# In[ ]:

print("best params are : ", gbm.best_params_)
def run_KNN(X, y):
    """Evaluate k-nearest-neighbours with nested cross-validation.

    The outer 5-fold split measures accuracy. Inside each outer training
    fold, a 3-fold split grid-searches the number of neighbours K; a model
    with the winning K is then refitted on the whole outer training fold and
    scored on the outer test fold.

    Args:
        X: feature array, indexable by an integer index array.
        y: label array aligned with ``X``.

    Returns:
        Tuple ``(best_score, mean_score, best_model)``: the highest outer-fold
        accuracy, the mean accuracy over the outer folds, and the fitted
        classifier that achieved the highest accuracy (``None`` if no fold
        scored above 0).
    """
    parametrosK = [1, 5, 11, 15, 21, 25]
    n_folds_externos = 5

    somaScores = 0
    melhorScoreGeral = 0
    melhorKGeral = 0
    melhorModeloGeral = None

    # Outer validation uses 5-fold.
    fold_5 = StratifiedKFold(n_splits=n_folds_externos)
    for indices_treino, indices_teste in fold_5.split(X, y):
        # Build the outer train/test sets from the selected indices.
        X_treino, X_teste = X[indices_treino], X[indices_teste]
        y_treino, y_teste = y[indices_treino], y[indices_teste]

        fold_3 = StratifiedKFold(n_splits=3)
        melhorScore = 0
        melhorK = 1

        for indices_treino_2, indices_teste_2 in fold_3.split(X_treino, y_treino):
            # The inner indices are positions within the outer training
            # fold, so they must index X_treino / y_treino. Indexing the
            # full X / y would pick unrelated rows, including rows of the
            # outer test fold.
            X_treino2 = X_treino[indices_treino_2]
            X_teste2 = X_treino[indices_teste_2]
            y_treino2 = y_treino[indices_treino_2]
            y_teste2 = y_treino[indices_teste_2]

            # Grid search over K on this inner split.
            for k in parametrosK:
                knn = KNeighborsClassifier(n_neighbors=k)
                knn.fit(X_treino2, y_treino2)
                score = knn.score(X_teste2, y_teste2)
                # Keep the K with the best single inner-split accuracy.
                if score > melhorScore:
                    melhorScore = score
                    melhorK = k

        # Refit KNN on the whole outer training fold with the best K found.
        knn = KNeighborsClassifier(n_neighbors=melhorK)
        knn.fit(X_treino, y_treino)
        score = knn.score(X_teste, y_teste)

        if score > melhorScoreGeral:
            melhorScoreGeral = score
            melhorKGeral = melhorK
            # Keep the model that actually produced this score, not the
            # inner-split model fitted on a subset of the fold.
            melhorModeloGeral = knn

        # Accumulate the accuracy for the mean.
        somaScores += score

    # Mean accuracy over the outer folds.
    scoreMedio = (1.0 * somaScores) / n_folds_externos
    print("[KNN] Media acuracia do KNN eh: ", scoreMedio)
    print("[KNN] Melhor acuracia alcancada pelo KNN eh: ", melhorScoreGeral, ", Hiperparametros: K= ", melhorKGeral)

    return melhorScoreGeral, scoreMedio, melhorModeloGeral