def study_fergus(features, target, preprocessing=StandardScaler(), grid=True,
                 random_seed=42, output_file='fergus_results.json'):
    """Compare four evaluation protocols for an MLP classifier with SMOTE.

    The protocols, in execution order, are:

    1. stratified 10-fold cross-validation without oversampling;
    2. stratified 10-fold cross-validation with the classifier wrapped in
       ``OversamplingClassifier(SMOTE(), ...)`` ("correct" oversampling);
    3. in-sample evaluation: fit on all data and score on the same data
       (no preprocessing and no grid search are applied here);
    4. stratified 10-fold cross-validation on a dataset that was oversampled
       with SMOTE up front ("incorrect" oversampling).

    Args:
        features: pandas DataFrame of input features (``.values`` and
            ``.columns`` are used).
        target: label container exposing ``.values`` (a pandas Series is
            presumably expected).
        preprocessing: transformer placed in front of the classifier inside
            a ``Pipeline``; pass a falsy value to disable preprocessing.
            NOTE: the default instance is created once at definition time
            and is therefore shared between calls.
        grid: if truthy, the base classifier is wrapped in a ``GridSearchCV``
            scored by ``'roc_auc'``.
        random_seed: kept for interface compatibility. The folds are built
            without shuffling, so the seed does not influence them.
        output_file: path of the JSON file the results are written to.

    Returns:
        dict with ``<protocol>_auc`` (ROC AUC) and ``<protocol>_details``
        (``to_dict()`` of the prediction table) for each protocol prefix:
        ``without_oversampling``, ``with_oversampling``, ``in_sample`` and
        ``incorrect_oversampling``.
    """
    from sklearn.neural_network import MLPClassifier

    results = {}

    # NOTE: a RadialBasisNeuralNetworkClassifier was previously considered as
    # the base classifier; the MLP is what is actually used.
    base_classifier = MLPClassifier()
    grid_search_params = {
        # every entry is a tuple of layer sizes (the original had a bare
        # ``(200)``, which is just the int 200)
        'hidden_layer_sizes': [(100,), (50,), (200,)],
        'activation': ['logistic', 'tanh', 'relu'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
    }

    def make_classifier():
        """Return the base classifier, grid-searched when ``grid`` is set."""
        if not grid:
            return base_classifier
        return GridSearchCV(base_classifier, grid_search_params, scoring='roc_auc')

    def make_pipeline(estimator):
        """Prepend the preprocessing step to ``estimator`` when enabled."""
        if not preprocessing:
            return estimator
        return Pipeline([('preprocessing', preprocessing), ('classifier', estimator)])

    def make_validator():
        """Build the unshuffled stratified splitter.

        ``random_state`` is deliberately not passed: without ``shuffle=True``
        it has no effect, and recent scikit-learn releases reject the
        combination with a ValueError.
        """
        return StratifiedKFold(n_splits=10)

    def record(prefix, message, preds):
        """Store score and details under ``prefix`` and report the score."""
        score_key = prefix + '_auc'
        results[score_key] = roc_auc_score(preds['label'].values, preds['prediction'].values)
        results[prefix + '_details'] = preds.to_dict()
        print(message, results[score_key])

    # without oversampling
    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, features, target, make_validator())
    record('without_oversampling', 'without oversampling: ', preds)

    # with correct oversampling
    classifier = OversamplingClassifier(SMOTE(), make_classifier())
    pipeline = make_pipeline(classifier)
    # Evaluate the pipeline (not the bare classifier) so that this protocol
    # gets the same preprocessing as the other cross-validated protocols.
    preds = evaluate(pipeline, features, target, make_validator())
    record('with_oversampling', 'with oversampling: ', preds)

    # in-sample evaluation: train and predict on the very same samples
    in_sample_scores = base_classifier.fit(
        features.values, target.values).predict_proba(features.values)[:, 1]
    preds = pd.DataFrame({'fold': 0, 'label': target.values, 'prediction': in_sample_scores})
    record('in_sample', 'in sample: ', preds)

    # with incorrect oversampling: the whole dataset is oversampled before
    # it is split into folds
    X, y = SMOTE().sample(features.values, target.values)
    X = pd.DataFrame(X, columns=features.columns)
    y = pd.Series(y)
    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, X, y, make_validator())
    record('incorrect_oversampling', 'incorrect oversampling: ', preds)

    # context manager guarantees the file is flushed and closed
    with open(output_file, 'w') as handle:
        json.dump(results, handle)

    return results
def study_khan(features, target, preprocessing=StandardScaler(), grid=True,
               random_seed=42, output_file='khan.json'):
    """Compare four evaluation protocols for an RBF-kernel SVC with ADASYN.

    The protocols, in execution order, are:

    1. stratified 5-fold cross-validation without oversampling;
    2. stratified 5-fold cross-validation with the classifier wrapped in
       ``OversamplingClassifier(ADASYN(...), ...)`` ("correct" oversampling);
    3. in-sample evaluation: fit on all data and score on the same data
       (no preprocessing and no grid search are applied here);
    4. stratified 5-fold cross-validation on a dataset that was oversampled
       with ADASYN up front ("incorrect" oversampling).

    Predictions are thresholded at 0.5 and scored with accuracy.

    Args:
        features: pandas DataFrame of input features (``.values`` and
            ``.columns`` are used).
        target: label container exposing ``.values`` (a pandas Series is
            presumably expected).
        preprocessing: transformer placed in front of the classifier inside
            a ``Pipeline``; pass a falsy value to disable preprocessing.
            NOTE: the default instance is created once at definition time
            and is therefore shared between calls.
        grid: if truthy, the base classifier is wrapped in a ``GridSearchCV``
            scored by ``'accuracy'``.
        random_seed: seed for the SVC, for ADASYN and for numpy's global
            random generator. The folds are built without shuffling, so the
            seed does not influence them.
        output_file: path of the JSON file the results are written to.

    Returns:
        dict with ``<protocol>_auc`` and ``<protocol>_details`` for each
        protocol prefix: ``without_oversampling``, ``with_oversampling``,
        ``in_sample`` and ``incorrect_oversampling``. Despite the ``_auc``
        suffix, the stored score is an accuracy; the key names are kept so
        that existing consumers of the result files keep working.
    """
    results = {}

    base_classifier = SVC(kernel='rbf', probability=True, random_state=random_seed)
    grid_search_params = {'C': [10**i for i in range(-4, 5)]}

    np.random.seed(random_seed)

    def make_classifier():
        """Return the base classifier, grid-searched when ``grid`` is set."""
        if not grid:
            return base_classifier
        return GridSearchCV(base_classifier, grid_search_params, scoring='accuracy')

    def make_pipeline(estimator):
        """Prepend the preprocessing step to ``estimator`` when enabled."""
        if not preprocessing:
            return estimator
        return Pipeline([('preprocessing', preprocessing), ('classifier', estimator)])

    def make_validator():
        """Build the unshuffled stratified splitter.

        ``random_state`` is deliberately not passed: without ``shuffle=True``
        it has no effect, and recent scikit-learn releases reject the
        combination with a ValueError.
        """
        return StratifiedKFold(n_splits=5)

    def record(prefix, message, preds):
        """Store score and details under ``prefix`` and report the score."""
        score_key = prefix + '_auc'
        # scores above 0.5 count as the positive class
        results[score_key] = accuracy_score(preds['label'].values,
                                            preds['prediction'].values > 0.5)
        results[prefix + '_details'] = preds.to_dict()
        print(message, results[score_key])

    # without oversampling
    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, features, target, make_validator())
    record('without_oversampling', 'without oversampling: ', preds)

    # with correct oversampling
    classifier = OversamplingClassifier(ADASYN(random_state=random_seed), make_classifier())
    pipeline = make_pipeline(classifier)
    # Evaluate the pipeline (not the bare classifier) so that this protocol
    # gets the same preprocessing as the other cross-validated protocols.
    preds = evaluate(pipeline, features, target, make_validator())
    record('with_oversampling', 'with oversampling: ', preds)

    # in-sample evaluation: train and predict on the very same samples
    in_sample_scores = base_classifier.fit(
        features.values, target.values).predict_proba(features.values)[:, 1]
    preds = pd.DataFrame({'fold': 0, 'label': target.values, 'prediction': in_sample_scores})
    record('in_sample', 'in sample: ', preds)

    # with incorrect oversampling: the whole dataset is oversampled before
    # it is split into folds
    X, y = ADASYN(random_state=random_seed).sample(features.values, target.values)
    X = pd.DataFrame(X, columns=features.columns)
    y = pd.Series(y)
    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, X, y, make_validator())
    record('incorrect_oversampling', 'incorrect oversampling: ', preds)

    # context manager guarantees the file is flushed and closed
    with open(output_file, 'w') as handle:
        json.dump(results, handle)

    return results
def study_jagerlibensek(features, target, preprocessing=StandardScaler(), grid=True,
                        random_seed=42, output_file='acharya_results.json'):
    """Compare four evaluation protocols for a QDA classifier with ADASYN.

    The protocols, in execution order, are:

    1. stratified 10-fold cross-validation without oversampling;
    2. stratified 10-fold cross-validation with the classifier wrapped in
       ``OversamplingClassifier(ADASYN(), ...)`` ("correct" oversampling);
    3. in-sample evaluation: fit on all data and score on the same data
       (no preprocessing and no grid search are applied here);
    4. stratified 10-fold cross-validation on a dataset that was oversampled
       with ADASYN up front ("incorrect" oversampling).

    Args:
        features: pandas DataFrame of input features (``.values`` and
            ``.columns`` are used).
        target: label container exposing ``.values`` (a pandas Series is
            presumably expected).
        preprocessing: transformer placed in front of the classifier inside
            a ``Pipeline``; pass a falsy value to disable preprocessing.
            NOTE: the default instance is created once at definition time
            and is therefore shared between calls.
        grid: if truthy, the base classifier is wrapped in a ``GridSearchCV``
            scored by ``'roc_auc'``.
        random_seed: kept for interface compatibility. The folds are built
            without shuffling, so the seed does not influence them.
        output_file: path of the JSON file the results are written to.
            NOTE(review): the default name refers to "acharya" rather than
            this study; it looks like a copy-paste leftover but is kept
            because callers may rely on it - confirm before renaming.

    Returns:
        dict with ``<protocol>_auc`` (ROC AUC) and ``<protocol>_details``
        (``to_dict()`` of the prediction table) for each protocol prefix:
        ``without_oversampling``, ``with_oversampling``, ``in_sample`` and
        ``incorrect_oversampling``.
    """
    results = {}

    base_classifier = QuadraticDiscriminantAnalysis()
    grid_search_params = {'reg_param': [0.01, 0.1, 0.5, 0.9, 0.99]}

    def make_classifier():
        """Return the base classifier, grid-searched when ``grid`` is set."""
        if not grid:
            return base_classifier
        return GridSearchCV(base_classifier, grid_search_params, scoring='roc_auc')

    def make_pipeline(estimator):
        """Prepend the preprocessing step to ``estimator`` when enabled."""
        if not preprocessing:
            return estimator
        return Pipeline([('preprocessing', preprocessing), ('classifier', estimator)])

    def make_validator():
        """Build the unshuffled stratified splitter.

        ``random_state`` is deliberately not passed: without ``shuffle=True``
        it has no effect, and recent scikit-learn releases reject the
        combination with a ValueError.
        """
        return StratifiedKFold(n_splits=10)

    def record(prefix, message, preds):
        """Store score and details under ``prefix`` and report the score."""
        score_key = prefix + '_auc'
        results[score_key] = roc_auc_score(preds['label'].values, preds['prediction'].values)
        results[prefix + '_details'] = preds.to_dict()
        print(message, results[score_key])

    # without oversampling
    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, features, target, make_validator())
    record('without_oversampling', 'without oversampling: ', preds)

    # with correct oversampling
    classifier = OversamplingClassifier(ADASYN(), make_classifier())
    pipeline = make_pipeline(classifier)
    # Evaluate the pipeline (not the bare classifier) so that this protocol
    # gets the same preprocessing as the other cross-validated protocols.
    preds = evaluate(pipeline, features, target, make_validator())
    record('with_oversampling', 'with oversampling: ', preds)

    # in-sample evaluation: train and predict on the very same samples
    in_sample_scores = base_classifier.fit(
        features.values, target.values).predict_proba(features.values)[:, 1]
    preds = pd.DataFrame({'fold': 0, 'label': target.values, 'prediction': in_sample_scores})
    record('in_sample', 'in sample: ', preds)

    # with incorrect oversampling: the whole dataset is oversampled before
    # it is split into folds
    X, y = ADASYN().sample(features.values, target.values)
    X = pd.DataFrame(X, columns=features.columns)
    y = pd.Series(y)
    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, X, y, make_validator())
    record('incorrect_oversampling', 'incorrect oversampling: ', preds)

    # context manager guarantees the file is flushed and closed
    with open(output_file, 'w') as handle:
        json.dump(results, handle)

    return results
def study_sadiahmed(features, target, preprocessing=StandardScaler(), grid=True,
                    random_seed=42, output_file='sadiahmed_results.json'):
    """Compare four evaluation protocols for a linear SVC with ADASYN.

    Before the comparison the data is reduced to a 30-record subsample:
    only the ``sadiahmed_features`` columns are kept, records are restricted
    to ``27 <= Rectime <= 32``, and 15 records of each class are picked by
    their rank of ``Gestation - Rectime``.

    The protocols, in execution order, are:

    1. stratified 10-fold cross-validation without oversampling;
    2. stratified 10-fold cross-validation with the classifier wrapped in
       ``OversamplingClassifier(ADASYN(), ...)`` ("correct" oversampling);
    3. in-sample evaluation: fit on all data and score on the same data
       (no preprocessing and no grid search are applied here);
    4. stratified 10-fold cross-validation on a dataset that was oversampled
       with ADASYN up front ("incorrect" oversampling).

    Args:
        features: pandas DataFrame that contains the ``sadiahmed_features``
            columns as well as ``'Rectime'`` and ``'Gestation'``.
        target: pandas Series of labels aligned with ``features``; the code
            treats 1 as "term" and 0 as "preterm".
        preprocessing: transformer placed in front of the classifier inside
            a ``Pipeline``; pass a falsy value to disable preprocessing.
            NOTE: the default instance is created once at definition time
            and is therefore shared between calls.
        grid: if truthy, the base classifier is wrapped in a ``GridSearchCV``
            scored by ``'roc_auc'``.
        random_seed: seed for the SVC. The folds are built without
            shuffling, so the seed does not influence them.
        output_file: path of the JSON file the results are written to.

    Returns:
        dict with ``<protocol>_auc`` (ROC AUC) and ``<protocol>_details``
        (``to_dict()`` of the prediction table) for each protocol prefix:
        ``without_oversampling``, ``with_oversampling``, ``in_sample`` and
        ``incorrect_oversampling``.
    """
    # keep the study's features plus the two columns needed for subsampling
    features = features.loc[:, sadiahmed_features + ['Rectime', 'Gestation']]

    mask = (features['Rectime'] >= 27) & (features['Rectime'] <= 32)
    features = features.loc[mask, :]
    target = target.loc[mask]

    # positional indices of the two classes within the filtered data
    term_ix = np.arange(len(target), dtype=int)[target == 1]
    preterm_ix = np.arange(len(target), dtype=int)[target == 0]

    term_diffs = list(features.iloc[term_ix]['Gestation'] - features.iloc[term_ix]['Rectime'])
    preterm_diffs = list(features.iloc[preterm_ix]['Gestation'] - features.iloc[preterm_ix]['Rectime'])

    # the helper columns must not be used as predictors
    features = features.drop(['Rectime', 'Gestation'], axis=1)

    # These indices seem to best resemble the reported mean and stddev...
    term_sample = list(term_ix[np.argsort(term_diffs)[10:25]])
    preterm_sample = list(preterm_ix[np.argsort(preterm_diffs)[2:17]])

    features = features.iloc[term_sample + preterm_sample, :]
    target = target.iloc[term_sample + preterm_sample]

    results = {}

    base_classifier = SVC(kernel='linear', random_state=random_seed, probability=True)
    grid_search_params = {
        'kernel': ['linear'],
        'C': [10**i for i in range(-4, 5)],
        'probability': [True],
        'random_state': [random_seed],
    }

    def make_classifier():
        """Return the base classifier, grid-searched when ``grid`` is set."""
        if not grid:
            return base_classifier
        return GridSearchCV(base_classifier, grid_search_params, scoring='roc_auc')

    def make_pipeline(estimator):
        """Prepend the preprocessing step to ``estimator`` when enabled."""
        if not preprocessing:
            return estimator
        return Pipeline([('preprocessing', preprocessing), ('classifier', estimator)])

    def make_validator():
        """Build the unshuffled stratified splitter.

        ``random_state`` is deliberately not passed: without ``shuffle=True``
        it has no effect, and recent scikit-learn releases reject the
        combination with a ValueError.
        """
        return StratifiedKFold(n_splits=10)

    def record(prefix, message, preds):
        """Store score and details under ``prefix`` and report the score."""
        score_key = prefix + '_auc'
        results[score_key] = roc_auc_score(preds['label'].values, preds['prediction'].values)
        results[prefix + '_details'] = preds.to_dict()
        print(message, results[score_key])

    # without oversampling
    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, features, target, make_validator())
    record('without_oversampling', 'without oversampling: ', preds)

    # with correct oversampling
    classifier = OversamplingClassifier(ADASYN(), make_classifier())
    pipeline = make_pipeline(classifier)
    # Evaluate the pipeline (not the bare classifier) so that this protocol
    # gets the same preprocessing as the other cross-validated protocols.
    preds = evaluate(pipeline, features, target, make_validator())
    record('with_oversampling', 'with oversampling: ', preds)

    # in-sample evaluation: train and predict on the very same samples
    in_sample_scores = base_classifier.fit(
        features.values, target.values).predict_proba(features.values)[:, 1]
    preds = pd.DataFrame({
        'fold': 0,
        'label': target.values,
        'prediction': in_sample_scores
    })
    record('in_sample', 'in sample: ', preds)

    # with incorrect oversampling: the whole dataset is oversampled before
    # it is split into folds
    X, y = ADASYN().sample(features.values, target.values)
    X = pd.DataFrame(X, columns=features.columns)
    y = pd.Series(y)
    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, X, y, make_validator())
    record('incorrect_oversampling', 'incorrect oversampling: ', preds)

    # context manager guarantees the file is flushed and closed
    with open(output_file, 'w') as handle:
        json.dump(results, handle)

    return results
def study_idowu(features, target, preprocessing=StandardScaler(), grid=True,
                random_seed=42, output_file='idowu.json'):
    """Compare four evaluation protocols for a random forest with SMOTE.

    The protocols, in execution order, are:

    1. stratified 5-fold cross-validation without oversampling;
    2. stratified 5-fold cross-validation with the classifier wrapped in
       ``OversamplingClassifier(SMOTE(...), ...)`` ("correct" oversampling);
    3. in-sample evaluation: fit on all data and score on the same data
       (no preprocessing and no grid search are applied here);
    4. stratified 5-fold cross-validation on a dataset that was oversampled
       with SMOTE up front ("incorrect" oversampling).

    Args:
        features: pandas DataFrame of input features (``.values`` and
            ``.columns`` are used).
        target: label container exposing ``.values`` (a pandas Series is
            presumably expected).
        preprocessing: transformer placed in front of the classifier inside
            a ``Pipeline``; pass a falsy value to disable preprocessing.
            NOTE: the default instance is created once at definition time
            and is therefore shared between calls.
        grid: if truthy, the base classifier is wrapped in a ``GridSearchCV``
            scored by ``'roc_auc'``.
        random_seed: seed for the random forest, for SMOTE and for numpy's
            global random generator. The folds are built without shuffling,
            so the seed does not influence them.
        output_file: path of the JSON file the results are written to.

    Returns:
        dict with ``<protocol>_auc`` (ROC AUC) and ``<protocol>_details``
        (``to_dict()`` of the prediction table) for each protocol prefix:
        ``without_oversampling``, ``with_oversampling``, ``in_sample`` and
        ``incorrect_oversampling``.
    """
    results = {}

    base_classifier = RandomForestClassifier(random_state=random_seed)
    grid_search_params = {
        'max_depth': [3, 5, 10, None],
        'min_samples_leaf': [1, 3, 5],
    }

    np.random.seed(random_seed)

    def make_classifier():
        """Return the base classifier, grid-searched when ``grid`` is set."""
        if not grid:
            return base_classifier
        return GridSearchCV(base_classifier, grid_search_params, scoring='roc_auc')

    def make_pipeline(estimator):
        """Prepend the preprocessing step to ``estimator`` when enabled."""
        if not preprocessing:
            return estimator
        return Pipeline([('preprocessing', preprocessing), ('classifier', estimator)])

    def make_validator():
        """Build the unshuffled stratified splitter.

        ``random_state`` is deliberately not passed: without ``shuffle=True``
        it has no effect, and recent scikit-learn releases reject the
        combination with a ValueError.
        """
        return StratifiedKFold(n_splits=5)

    def record(prefix, message, preds):
        """Store score and details under ``prefix`` and report the score."""
        score_key = prefix + '_auc'
        results[score_key] = roc_auc_score(preds['label'].values, preds['prediction'].values)
        results[prefix + '_details'] = preds.to_dict()
        print(message, results[score_key])

    # without oversampling
    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, features, target, make_validator())
    record('without_oversampling', 'without oversampling: ', preds)

    # with correct oversampling
    classifier = OversamplingClassifier(SMOTE(random_state=random_seed), make_classifier())
    pipeline = make_pipeline(classifier)
    # Evaluate the pipeline (not the bare classifier) so that this protocol
    # gets the same preprocessing as the other cross-validated protocols.
    preds = evaluate(pipeline, features, target, make_validator())
    record('with_oversampling', 'with oversampling: ', preds)

    # in-sample evaluation: train and predict on the very same samples
    in_sample_scores = base_classifier.fit(
        features.values, target.values).predict_proba(features.values)[:, 1]
    preds = pd.DataFrame({
        'fold': 0,
        'label': target.values,
        'prediction': in_sample_scores
    })
    record('in_sample', 'in sample: ', preds)

    # with incorrect oversampling: the whole dataset is oversampled before
    # it is split into folds
    X, y = SMOTE(random_state=random_seed).sample(features.values, target.values)
    X = pd.DataFrame(X, columns=features.columns)
    y = pd.Series(y)
    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, X, y, make_validator())
    record('incorrect_oversampling', 'incorrect oversampling: ', preds)

    # context manager guarantees the file is flushed and closed
    with open(output_file, 'w') as handle:
        json.dump(results, handle)

    return results