コード例 #1
0
ファイル: khan.py プロジェクト: yyht/EHG-Oversampling
def study_khan(features, target, preprocessing=StandardScaler(), grid=True, random_seed=42, output_file='khan.json'):
    """Score an RBF-kernel SVC under four evaluation protocols and save the results.

    The protocols are: cross-validation without oversampling, cross-validation
    with ADASYN wrapped around the classifier, in-sample evaluation of the
    plain SVC, and cross-validation on data that was oversampled with ADASYN
    before splitting (labelled "incorrect").

    Args:
        features: table of predictors; ``.values`` and ``.columns`` are read.
        target: labels aligned with ``features``; ``.values`` is read.
        preprocessing: transformer put in front of the classifier in a
            ``Pipeline``; a falsy value disables preprocessing.
        grid: when truthy, ``C`` is tuned with ``GridSearchCV`` (accuracy).
        random_seed: seed passed to NumPy, the SVC and ADASYN.
        output_file: path of the JSON file the results are written to.

    Returns:
        dict holding, for each protocol, a score and the ``to_dict()`` dump of
        the predictions. NOTE: the ``*_auc`` keys of this function hold
        accuracies (predictions thresholded at 0.5), not ROC AUC values; the
        key names are kept for compatibility with existing consumers.
    """
    results = {}
    base_classifier = SVC(kernel='rbf', probability=True, random_state=random_seed)
    grid_search_params = {'C': [10**i for i in range(-4, 5)]}

    def make_classifier():
        # Plain SVC, or a grid search over C around it.
        if not grid:
            return base_classifier
        return GridSearchCV(base_classifier, grid_search_params, scoring='accuracy')

    def make_pipeline(estimator):
        # Prepend the preprocessing step unless it is disabled.
        if not preprocessing:
            return estimator
        return Pipeline([('preprocessing', preprocessing), ('classifier', estimator)])

    def make_validator():
        # random_state only has an effect together with shuffle=True, and
        # recent scikit-learn rejects it otherwise; the unshuffled folds are
        # deterministic without it.
        return StratifiedKFold(n_splits=5)

    def accuracy(preds):
        return accuracy_score(preds['label'].values, preds['prediction'].values > 0.5)

    np.random.seed(random_seed)

    # without oversampling
    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, features, target, make_validator())
    results['without_oversampling_auc'] = accuracy(preds)
    results['without_oversampling_details'] = preds.to_dict()
    print('without oversampling: ', results['without_oversampling_auc'])

    # with correct oversampling
    classifier = OversamplingClassifier(ADASYN(random_state=random_seed), make_classifier())
    pipeline = make_pipeline(classifier)
    # Evaluate the pipeline (not the bare classifier) so that preprocessing is
    # applied exactly as in the other cross-validated protocols.
    preds = evaluate(pipeline, features, target, make_validator())
    results['with_oversampling_auc'] = accuracy(preds)
    results['with_oversampling_details'] = preds.to_dict()
    print('with oversampling: ', results['with_oversampling_auc'])

    # in-sample evaluation: fit and score the plain classifier on the same data
    fitted = base_classifier.fit(features.values, target.values)
    scores = fitted.predict_proba(features.values)[:, 1]
    preds = pd.DataFrame({'fold': 0, 'label': target.values, 'prediction': scores})
    results['in_sample_auc'] = accuracy(preds)
    results['in_sample_details'] = preds.to_dict()
    print('in sample: ', results['in_sample_auc'])

    # with incorrect oversampling: the whole dataset is oversampled before
    # the cross-validation split
    X, y = ADASYN(random_state=random_seed).sample(features.values, target.values)
    X = pd.DataFrame(X, columns=features.columns)
    y = pd.Series(y)

    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, X, y, make_validator())
    results['incorrect_oversampling_auc'] = accuracy(preds)
    results['incorrect_oversampling_details'] = preds.to_dict()
    print('incorrect oversampling: ', results['incorrect_oversampling_auc'])

    # Context manager guarantees the file is flushed and closed.
    with open(output_file, 'w') as handle:
        json.dump(results, handle)
    return results
コード例 #2
0
def study_jagerlibensek(features, target, preprocessing=StandardScaler(), grid=True, random_seed=42, output_file='acharya_results.json'):
    """Score a QDA classifier under four evaluation protocols and save the results.

    The protocols are: cross-validation without oversampling, cross-validation
    with ADASYN wrapped around the classifier, in-sample evaluation of the
    plain classifier, and cross-validation on data that was oversampled with
    ADASYN before splitting (labelled "incorrect"). Every score is a ROC AUC.

    Args:
        features: table of predictors; ``.values`` and ``.columns`` are read.
        target: labels aligned with ``features``; ``.values`` is read.
        preprocessing: transformer put in front of the classifier in a
            ``Pipeline``; a falsy value disables preprocessing.
        grid: when truthy, ``reg_param`` is tuned with ``GridSearchCV``.
        random_seed: accepted for signature parity with the sibling studies;
            nothing in this function consumes it.
        output_file: path of the JSON file the results are written to.
            NOTE(review): the default is 'acharya_results.json', which looks
            copied from another study; it is kept for backward compatibility.

    Returns:
        dict holding, for each protocol, the ROC AUC and the ``to_dict()``
        dump of the predictions.
    """
    results = {}
    base_classifier = QuadraticDiscriminantAnalysis()
    grid_search_params = {'reg_param': [0.01, 0.1, 0.5, 0.9, 0.99]}

    def make_classifier():
        # Plain QDA, or a grid search over reg_param around it.
        if not grid:
            return base_classifier
        return GridSearchCV(base_classifier, grid_search_params, scoring='roc_auc')

    def make_pipeline(estimator):
        # Prepend the preprocessing step unless it is disabled.
        if not preprocessing:
            return estimator
        return Pipeline([('preprocessing', preprocessing), ('classifier', estimator)])

    def make_validator():
        # random_state only has an effect together with shuffle=True, and
        # recent scikit-learn rejects it otherwise; the unshuffled folds are
        # deterministic without it.
        return StratifiedKFold(n_splits=10)

    def auc(preds):
        return roc_auc_score(preds['label'].values, preds['prediction'].values)

    # without oversampling
    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, features, target, make_validator())
    results['without_oversampling_auc'] = auc(preds)
    results['without_oversampling_details'] = preds.to_dict()
    print('without oversampling: ', results['without_oversampling_auc'])

    # with correct oversampling
    classifier = OversamplingClassifier(ADASYN(), make_classifier())
    pipeline = make_pipeline(classifier)
    # Evaluate the pipeline (not the bare classifier) so that preprocessing is
    # applied exactly as in the other cross-validated protocols.
    preds = evaluate(pipeline, features, target, make_validator())
    results['with_oversampling_auc'] = auc(preds)
    results['with_oversampling_details'] = preds.to_dict()
    print('with oversampling: ', results['with_oversampling_auc'])

    # in-sample evaluation: fit and score the plain classifier on the same data
    fitted = base_classifier.fit(features.values, target.values)
    scores = fitted.predict_proba(features.values)[:, 1]
    preds = pd.DataFrame({'fold': 0, 'label': target.values, 'prediction': scores})
    results['in_sample_auc'] = auc(preds)
    results['in_sample_details'] = preds.to_dict()
    print('in sample: ', results['in_sample_auc'])

    # with incorrect oversampling: the whole dataset is oversampled before
    # the cross-validation split
    X, y = ADASYN().sample(features.values, target.values)
    X = pd.DataFrame(X, columns=features.columns)
    y = pd.Series(y)

    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, X, y, make_validator())
    results['incorrect_oversampling_auc'] = auc(preds)
    results['incorrect_oversampling_details'] = preds.to_dict()
    print('incorrect oversampling: ', results['incorrect_oversampling_auc'])

    # Context manager guarantees the file is flushed and closed.
    with open(output_file, 'w') as handle:
        json.dump(results, handle)
    return results
コード例 #3
0
ファイル: khan.py プロジェクト: yyht/EHG-Oversampling
    def fit(self, X, y):
        """Assemble the ADASYN + RBF-SVC model and fit it on ``X`` and ``y``.

        When ``self.grid`` is truthy the SVC is wrapped in a ``GridSearchCV``
        over ``C`` (accuracy scoring). When ``self.preprocessing`` is truthy
        it becomes the first step of a ``Pipeline``. The fitted model is
        stored in ``self.pipeline``.

        Returns:
            self, to allow call chaining.
        """
        svc = SVC(kernel='rbf', probability=True, random_state=self.random_state)
        if self.grid:
            c_values = {'C': [10**i for i in range(-4, 5)]}
            estimator = GridSearchCV(svc, c_values, scoring='accuracy')
        else:
            estimator = svc

        # Oversampling is wrapped around the estimator itself.
        sampler = ADASYN(random_state=self.random_state)
        model = OversamplingClassifier(sampler, estimator)
        if self.preprocessing:
            model = Pipeline([('preprocessing', self.preprocessing), ('classifier', model)])

        self.pipeline = model
        self.pipeline.fit(X, y)
        return self
コード例 #4
0
    def fit(self, X, y):
        """Assemble the ADASYN + QDA model and fit it on ``X`` and ``y``.

        When ``self.grid`` is truthy the classifier is wrapped in a
        ``GridSearchCV`` over ``reg_param`` (ROC AUC scoring). When
        ``self.preprocessing`` is truthy it becomes the first step of a
        ``Pipeline``. The fitted model is stored in ``self.pipeline``.

        Returns:
            self, to allow call chaining.
        """
        qda = QuadraticDiscriminantAnalysis()
        if self.grid:
            reg_values = {'reg_param': [0.01, 0.1, 0.5, 0.9, 0.99]}
            estimator = GridSearchCV(qda, reg_values, scoring='roc_auc')
        else:
            estimator = qda

        # Oversampling is wrapped around the estimator itself.
        model = OversamplingClassifier(ADASYN(), estimator)
        if self.preprocessing:
            model = Pipeline([('preprocessing', self.preprocessing), ('classifier', model)])

        self.pipeline = model
        self.pipeline.fit(X, y)
        return self
コード例 #5
0
    def fit(self, X, y):
        """Assemble the ADASYN + random-forest model and fit it on ``X`` and ``y``.

        When ``self.grid`` is truthy the forest is wrapped in a
        ``GridSearchCV`` over ``max_depth`` and ``min_samples_leaf`` (ROC AUC
        scoring). When ``self.preprocessing`` is truthy it becomes the first
        step of a ``Pipeline``. The fitted model is stored in
        ``self.pipeline``.

        Returns:
            self, to allow call chaining.
        """
        forest = RandomForestClassifier(random_state=self.random_state)
        if self.grid:
            search_space = {
                'max_depth': [3, 5, 10, None],
                'min_samples_leaf': [1, 3, 5],
            }
            estimator = GridSearchCV(forest, search_space, scoring='roc_auc')
        else:
            estimator = forest

        # Oversampling is wrapped around the estimator itself.
        sampler = ADASYN(random_state=self.random_state)
        model = OversamplingClassifier(sampler, estimator)
        if self.preprocessing:
            steps = [('preprocessing', self.preprocessing), ('classifier', model)]
            model = Pipeline(steps)

        self.pipeline = model
        self.pipeline.fit(X, y)
        return self
コード例 #6
0
    def fit(self, X, y):
        """Assemble the ADASYN + linear-SVC model and fit it on ``X`` and ``y``.

        When ``self.grid`` is truthy the SVC is wrapped in a ``GridSearchCV``
        (ROC AUC scoring) whose grid varies ``C`` and pins the kernel,
        probability and random_state settings. When ``self.preprocessing`` is
        truthy it becomes the first step of a ``Pipeline``. The fitted model
        is stored in ``self.pipeline``.

        Returns:
            self, to allow call chaining.
        """
        svc = SVC(kernel='linear', random_state=self.random_state, probability=True)
        if self.grid:
            search_space = {
                'kernel': ['linear'],
                'C': [10**i for i in range(-4, 5)],
                'probability': [True],
                'random_state': [self.random_state],
            }
            estimator = GridSearchCV(svc, search_space, scoring='roc_auc')
        else:
            estimator = svc

        # Oversampling is wrapped around the estimator itself.
        model = OversamplingClassifier(ADASYN(), estimator)
        if self.preprocessing:
            steps = [('preprocessing', self.preprocessing), ('classifier', model)]
            model = Pipeline(steps)

        self.pipeline = model
        self.pipeline.fit(X, y)
        return self
コード例 #7
0
def study_sadiahmed(features,
                    target,
                    preprocessing=StandardScaler(),
                    grid=True,
                    random_seed=42,
                    output_file='sadiahmed_results.json'):
    """Score a linear SVC on a 30-record subsample under four protocols.

    The data is first restricted to the ``sadiahmed_features`` columns and to
    records with ``27 <= Rectime <= 32``. Then 15 records with ``target == 1``
    and 15 with ``target == 0`` are picked by rank of ``Gestation - Rectime``.
    The protocols are: cross-validation without oversampling,
    cross-validation with ADASYN wrapped around the classifier, in-sample
    evaluation of the plain SVC, and cross-validation on data oversampled
    with ADASYN before splitting (labelled "incorrect"). Every score is a
    ROC AUC.

    Args:
        features: pandas DataFrame containing ``sadiahmed_features`` plus the
            'Rectime' and 'Gestation' columns.
        target: pandas Series of labels aligned with ``features``.
        preprocessing: transformer put in front of the classifier in a
            ``Pipeline``; a falsy value disables preprocessing.
        grid: when truthy, ``C`` is tuned with ``GridSearchCV``.
        random_seed: seed passed to the SVC (directly and through the grid).
        output_file: path of the JSON file the results are written to.

    Returns:
        dict holding, for each protocol, the ROC AUC and the ``to_dict()``
        dump of the predictions.
    """
    features = features.loc[:, sadiahmed_features + ['Rectime', 'Gestation']]
    mask = (features['Rectime'] >= 27) & (features['Rectime'] <= 32)
    features = features.loc[mask, :]
    target = target.loc[mask]

    # Positional indices of the two classes within the filtered data.
    positions = np.arange(len(target), dtype=int)
    term_ix = positions[target == 1]
    preterm_ix = positions[target == 0]

    term_diffs = list(features.iloc[term_ix]['Gestation'] -
                      features.iloc[term_ix]['Rectime'])
    preterm_diffs = list(features.iloc[preterm_ix]['Gestation'] -
                         features.iloc[preterm_ix]['Rectime'])

    features = features.drop(['Rectime', 'Gestation'], axis=1)

    # These indices seem to best resemble the reported mean and stddev...
    term_sample = list(term_ix[np.argsort(term_diffs)[10:25]])
    preterm_sample = list(preterm_ix[np.argsort(preterm_diffs)[2:17]])

    selected = term_sample + preterm_sample
    features = features.iloc[selected, :]
    target = target.iloc[selected]

    results = {}
    base_classifier = SVC(kernel='linear',
                          random_state=random_seed,
                          probability=True)
    grid_search_params = {
        'kernel': ['linear'],
        'C': [10**i for i in range(-4, 5)],
        'probability': [True],
        'random_state': [random_seed]
    }

    def make_classifier():
        # Plain SVC, or a grid search over C around it.
        if not grid:
            return base_classifier
        return GridSearchCV(base_classifier, grid_search_params,
                            scoring='roc_auc')

    def make_pipeline(estimator):
        # Prepend the preprocessing step unless it is disabled.
        if not preprocessing:
            return estimator
        return Pipeline([('preprocessing', preprocessing),
                         ('classifier', estimator)])

    def make_validator():
        # random_state only has an effect together with shuffle=True, and
        # recent scikit-learn rejects it otherwise; the unshuffled folds are
        # deterministic without it.
        return StratifiedKFold(n_splits=10)

    def auc(preds):
        return roc_auc_score(preds['label'].values,
                             preds['prediction'].values)

    # without oversampling
    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, features, target, make_validator())
    results['without_oversampling_auc'] = auc(preds)
    results['without_oversampling_details'] = preds.to_dict()
    print('without oversampling: ', results['without_oversampling_auc'])

    # with correct oversampling
    classifier = OversamplingClassifier(ADASYN(), make_classifier())
    pipeline = make_pipeline(classifier)
    # Evaluate the pipeline (not the bare classifier) so that preprocessing is
    # applied exactly as in the other cross-validated protocols.
    preds = evaluate(pipeline, features, target, make_validator())
    results['with_oversampling_auc'] = auc(preds)
    results['with_oversampling_details'] = preds.to_dict()
    print('with oversampling: ', results['with_oversampling_auc'])

    # in-sample evaluation: fit and score the plain classifier on the same data
    fitted = base_classifier.fit(features.values, target.values)
    scores = fitted.predict_proba(features.values)[:, 1]
    preds = pd.DataFrame({
        'fold': 0,
        'label': target.values,
        'prediction': scores
    })
    results['in_sample_auc'] = auc(preds)
    results['in_sample_details'] = preds.to_dict()
    print('in sample: ', results['in_sample_auc'])

    # with incorrect oversampling: the whole dataset is oversampled before
    # the cross-validation split
    X, y = ADASYN().sample(features.values, target.values)
    X = pd.DataFrame(X, columns=features.columns)
    y = pd.Series(y)

    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, X, y, make_validator())
    results['incorrect_oversampling_auc'] = auc(preds)
    results['incorrect_oversampling_details'] = preds.to_dict()
    print('incorrect oversampling: ', results['incorrect_oversampling_auc'])

    # Context manager guarantees the file is flushed and closed.
    with open(output_file, 'w') as handle:
        json.dump(results, handle)
    return results
コード例 #8
0
def study_acharya(features,
                  target,
                  preprocessing=StandardScaler(),
                  grid=True,
                  random_seed=42,
                  output_file='acharya_results.json'):
    """Score an RBF-kernel SVC under three evaluation protocols and save the results.

    The protocols are: cross-validation without oversampling, cross-validation
    with ADASYN wrapped around the classifier, and cross-validation on data
    that was oversampled with ADASYN before splitting (labelled "incorrect").
    For each protocol both the accuracy (predictions thresholded at 0.5) and
    the ROC AUC are recorded.

    Args:
        features: table of predictors; ``.values`` and ``.columns`` are read.
        target: labels aligned with ``features``; ``.values`` is read.
        preprocessing: transformer put in front of the classifier in a
            ``Pipeline``; a falsy value disables preprocessing.
        grid: when truthy, ``C`` is tuned with ``GridSearchCV`` (accuracy).
        random_seed: seed passed to the SVC (directly and through the grid).
        output_file: path of the JSON file the results are written to.

    Returns:
        dict holding, for each protocol, the ``*_acc`` and ``*_auc`` scores
        and the ``to_dict()`` dump of the predictions.
    """
    # NOTE: all supplied feature columns are used; restricting them to an
    # ``acharya_features`` subset is intentionally not done here.

    results = {}
    base_classifier = SVC(kernel='rbf',
                          random_state=random_seed,
                          probability=True)
    grid_search_params = {
        'kernel': ['rbf'],
        'C': [10**i for i in range(-4, 5)],
        'probability': [True],
        'random_state': [random_seed]
    }

    def make_classifier():
        # Plain SVC, or a grid search over C around it.
        if not grid:
            return base_classifier
        return GridSearchCV(base_classifier, grid_search_params,
                            scoring='accuracy')

    def make_pipeline(estimator):
        # Prepend the preprocessing step unless it is disabled.
        if not preprocessing:
            return estimator
        return Pipeline([('preprocessing', preprocessing),
                         ('classifier', estimator)])

    def make_validator():
        # random_state only has an effect together with shuffle=True, and
        # recent scikit-learn rejects it otherwise; the unshuffled folds are
        # deterministic without it.
        return StratifiedKFold(n_splits=10)

    def record(prefix, preds):
        # Store accuracy, ROC AUC and the raw predictions under one prefix.
        labels = preds['label'].values
        scores = preds['prediction'].values
        results[prefix + '_acc'] = accuracy_score(labels, scores > 0.5)
        results[prefix + '_auc'] = roc_auc_score(labels, scores)
        results[prefix + '_details'] = preds.to_dict()

    # without oversampling
    pipeline = make_pipeline(make_classifier())
    record('without_oversampling',
           evaluate(pipeline, features, target, make_validator()))
    print('without oversampling: ', results['without_oversampling_auc'])

    # with correct oversampling
    classifier = OversamplingClassifier(ADASYN(), make_classifier())
    pipeline = make_pipeline(classifier)
    # Evaluate the pipeline (not the bare classifier) so that preprocessing is
    # applied exactly as in the other cross-validated protocols.
    record('with_oversampling',
           evaluate(pipeline, features, target, make_validator()))
    print('with oversampling: ', results['with_oversampling_auc'])

    # with incorrect oversampling: the whole dataset is oversampled before
    # the cross-validation split
    X, y = ADASYN().sample(features.values, target.values)
    X = pd.DataFrame(X, columns=features.columns)
    y = pd.Series(y)

    pipeline = make_pipeline(make_classifier())
    record('incorrect_oversampling',
           evaluate(pipeline, X, y, make_validator()))
    print('incorrect oversampling: ', results['incorrect_oversampling_auc'])

    # Context manager guarantees the file is flushed and closed.
    with open(output_file, 'w') as handle:
        json.dump(results, handle)

    return results
コード例 #9
0
def study_peng(features,
               target,
               preprocessing=StandardScaler(),
               grid=True,
               random_seed=42,
               output_file='idowu.json'):
    """Score a random forest under four evaluation protocols and save the results.

    Only records with ``Rectime <= 26`` are kept, after which the 'Rectime'
    column is dropped. The protocols are: cross-validation without
    oversampling, cross-validation with ADASYN wrapped around the classifier,
    in-sample evaluation of the plain forest, and cross-validation on data
    oversampled with ADASYN before splitting (labelled "incorrect"). Every
    score is a ROC AUC.

    Args:
        features: pandas DataFrame of predictors including a 'Rectime' column.
        target: pandas Series of labels aligned with ``features``.
        preprocessing: transformer put in front of the classifier in a
            ``Pipeline``; a falsy value disables preprocessing.
        grid: when truthy, ``max_depth`` and ``min_samples_leaf`` are tuned
            with ``GridSearchCV``.
        random_seed: seed passed to NumPy, the forest and ADASYN.
        output_file: path of the JSON file the results are written to.
            NOTE(review): the default is 'idowu.json', which looks copied
            from another study; it is kept for backward compatibility.

    Returns:
        dict holding, for each protocol, the ROC AUC and the ``to_dict()``
        dump of the predictions.
    """
    results = {}
    base_classifier = RandomForestClassifier(random_state=random_seed)
    grid_search_params = {
        'max_depth': [3, 5, 10, None],
        'min_samples_leaf': [1, 3, 5]
    }

    mask = features['Rectime'] <= 26
    features = features.loc[mask, :]
    target = target.loc[mask]

    features = features.drop('Rectime', axis=1)

    def make_classifier():
        # Plain forest, or a grid search around it.
        if not grid:
            return base_classifier
        return GridSearchCV(base_classifier, grid_search_params,
                            scoring='roc_auc')

    def make_pipeline(estimator):
        # Prepend the preprocessing step unless it is disabled.
        if not preprocessing:
            return estimator
        return Pipeline([('preprocessing', preprocessing),
                         ('classifier', estimator)])

    def make_validator():
        # random_state only has an effect together with shuffle=True, and
        # recent scikit-learn rejects it otherwise; the unshuffled folds are
        # deterministic without it.
        return StratifiedKFold(n_splits=5)

    def auc(preds):
        return roc_auc_score(preds['label'].values,
                             preds['prediction'].values)

    np.random.seed(random_seed)

    # without oversampling
    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, features, target, make_validator())
    results['without_oversampling_auc'] = auc(preds)
    results['without_oversampling_details'] = preds.to_dict()
    print('without oversampling: ', results['without_oversampling_auc'])

    # with correct oversampling
    classifier = OversamplingClassifier(ADASYN(random_state=random_seed),
                                        make_classifier())
    pipeline = make_pipeline(classifier)
    # Evaluate the pipeline (not the bare classifier) so that preprocessing is
    # applied exactly as in the other cross-validated protocols.
    preds = evaluate(pipeline, features, target, make_validator())
    results['with_oversampling_auc'] = auc(preds)
    results['with_oversampling_details'] = preds.to_dict()
    print('with oversampling: ', results['with_oversampling_auc'])

    # in-sample evaluation: fit and score the plain classifier on the same data
    fitted = base_classifier.fit(features.values, target.values)
    scores = fitted.predict_proba(features.values)[:, 1]
    preds = pd.DataFrame({
        'fold': 0,
        'label': target.values,
        'prediction': scores
    })
    results['in_sample_auc'] = auc(preds)
    results['in_sample_details'] = preds.to_dict()
    print('in sample: ', results['in_sample_auc'])

    # with incorrect oversampling: the whole dataset is oversampled before
    # the cross-validation split
    X, y = ADASYN(random_state=random_seed).sample(features.values,
                                                   target.values)
    X = pd.DataFrame(X, columns=features.columns)
    y = pd.Series(y)

    pipeline = make_pipeline(make_classifier())
    preds = evaluate(pipeline, X, y, make_validator())
    results['incorrect_oversampling_auc'] = auc(preds)
    results['incorrect_oversampling_details'] = preds.to_dict()
    print('incorrect oversampling: ', results['incorrect_oversampling_auc'])

    # Context manager guarantees the file is flushed and closed.
    with open(output_file, 'w') as handle:
        json.dump(results, handle)
    return results