Example #1
###### Receiver Operating Characteristic Curve (ROC curve)               ######
###############################################################################
from sklearn.metrics import roc_curve, auc
from numpy import interp  # scipy's re-exported interp was deprecated and removed; numpy's is equivalent

pipe_lr = make_pipeline(
    StandardScaler(), PCA(n_components=2),
    LogisticRegression(solver='liblinear',
                       penalty='l2',
                       random_state=1,
                       C=100.0))

X_train2 = X_train[:, [4, 14]]
X_train.shape
X_train2.shape
cv = list(StratifiedKFold(n_splits=3, shuffle=True,
                          random_state=1).split(X_train, y_train))  # random_state requires shuffle=True

fig = plt.figure(figsize=(7, 5))

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for i, (train, test) in enumerate(cv):
    probas = pipe_lr.fit(X_train2[train],
                         y_train[train]).predict_proba(X_train2[test])

    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
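    # --- typical finish (not part of the truncated source listing): plot this
    # fold's curve, then the interpolated mean ROC across all folds.
    plt.plot(fpr, tpr, label='ROC fold %d (area = %0.2f)' % (i + 1, roc_auc))

mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='mean ROC (area = %0.2f)' % mean_auc)
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend(loc='lower right')
plt.show()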
Example #2
    sample_weight = df.values[:, 0]
    model = RFRegressorFeatureOutOfFold(name='rf', sample_weight=sample_weight)
    model.fit(df, y)

    for clf, (idx_train,
              idx_valid) in zip(model._fitted_models,
                                model.get_fold_splitting(df.values, y)):
        assert np.array_equal(clf.fit_params_.get('sample_weight', None),
                              sample_weight[idx_train])


@pytest.mark.parametrize(
    'cv',
    [
        KFold(n_splits=2),
        StratifiedKFold(n_splits=10, shuffle=True,
                        random_state=71),  # Must set random state
    ])
def test_custom_cv_class(cv, binary_data):
    df, y = binary_data
    clf = boosting.XGBoostClassifierOutOfFold(name='xgb', cv=cv)

    for origin, model in zip(cv.split(df.values, y),
                             clf.get_fold_splitting(df.values, y)):
        assert np.array_equal(origin[0], model[0])
        assert np.array_equal(origin[1], model[1])


def test_custom_cv_as_list():
    """can set custom cv as list of train / test indexes"""
    cv = [[1, 2, 3], [4, 5], [2, 4, 5], [1, 3]]
    clf = boosting.XGBoostClassifierOutOfFold(name='xgb', cv=cv)
Example #3
emt_x_coord = np.loadtxt(path + '/data_' + str(max_length) + '_x_' +
                         data_type + '.csv',
                         delimiter=',')
emt_y_coord = np.loadtxt(path + '/data_' + str(max_length) + '_y_' +
                         data_type + '.csv',
                         delimiter=',')
emt_label = np.loadtxt(path + '/data_' + str(max_length) + '_label_' +
                       data_type + '.csv',
                       delimiter=',')

emt_x_coord = np.reshape(emt_x_coord, (-1, max_length, 1))
emt_y_coord = np.reshape(emt_y_coord, (-1, max_length, 1))
emt_x = np.concatenate((emt_x_coord, emt_y_coord), axis=2)
emt_ground = emt_label

# split data into folds
skf = StratifiedKFold(n_splits=folds, shuffle=True)

# early stopping options. Stop training (patience) steps after val_loss starts to increase.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss',
                                               patience=patience,
                                               verbose=1,
                                               mode='min',
                                               baseline=None,
                                               restore_best_weights=True)

##########################################
# Folds
##########################################
cvscores = []

# blank matrix
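# --- hedged sketch: the source listing is cut off here; a fold loop matching
# the "Folds" banner above would typically look like this. `build_model` is a
# hypothetical helper returning a Keras model compiled with an accuracy metric.
for train_idx, test_idx in skf.split(emt_x, emt_ground):
    model = build_model()
    model.fit(emt_x[train_idx], emt_ground[train_idx],
              validation_split=0.1, epochs=100,
              callbacks=[early_stopping], verbose=0)
    _, acc = model.evaluate(emt_x[test_idx], emt_ground[test_idx], verbose=0)
    cvscores.append(acc)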
Example #4
    
# # # make a list of the train dataset: list of [file_path, label]
    tmp_list = []

    for label in os.listdir(config.TRAIN):
        path = config.TRAIN + label + '/'
        for wavefile in os.listdir(path):
            path_tmp = path + wavefile
            tmp_list.append([path_tmp, label])
    train_file = pd.DataFrame(tmp_list, columns=['file_path', 'label'])
    #print(train_file)
    del tmp_list
    
# # # split
    skf = StratifiedKFold(**config.split)
    train_file['fold']=-1
    for fold_id, (tr_ind, val_ind) in enumerate(skf.split(train_file, train_file['label'])):
        train_file.iloc[val_ind,-1] = fold_id
    #print(train_file['fold'])
    use_fold = config.globals["use_fold"]
    train_file_list = train_file.query("fold != @use_fold")[["file_path", "label"]].values.tolist()
    val_file_list = train_file.query("fold == @use_fold")[["file_path", "label"]].values.tolist()

    #print("[fold {}] train: {}, val: {}".format(use_fold, len(train_file_list), len(val_file_list)))

    engine.set_seed(config.globals["seed"])
    device = torch.device(config.globals["device"])
    
# # # get loader
    train_loader, val_loader = dataset.get_loaders_for_training(
        ...)  # the remaining arguments are truncated in the source listing
Example #5
# (the start of this snippet, apparently a create_model() builder, is
# truncated in the source; only the tail of its compile call survives)
                  metrics=[auc])
    return model

earlystopping = callbacks.EarlyStopping(monitor='val_auc', min_delta=0,
                                        patience=5, verbose=0, mode='max')
checkpoint = callbacks.ModelCheckpoint('bestmodel.h5', monitor='val_auc',
                                       verbose=0, save_best_only=False,
                                       period=1)

rlr = callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.1, patience=3,
                                  verbose=0, cooldown=0, min_lr=0)
CALLBACKS = [earlystopping, checkpoint, rlr]
NFOLDS = 10
EPOCHS = 10
BATCHSIZE = 64
skf = StratifiedKFold(n_splits=NFOLDS)

predictions = np.zeros((len(test), ))
validations = np.zeros((len(train), ))
for train_index, valid_index in skf.split(X, y):
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    model = create_model()
    model.fit(list(np.transpose(X_train)), y_train,
              validation_data=(list(np.transpose(X_valid)), y_valid),
              epochs=EPOCHS, batch_size=BATCHSIZE, verbose=2,
              callbacks=CALLBACKS)
    validations[valid_index] = model.predict(list(
        np.transpose(X_valid))).flatten()
    predictions += model.predict(list(np.transpose(X_test))).flatten() / NFOLDS

submission = pd.DataFrame(predictions, columns=target_col)
Example #6
lgb_model = lgb.LGBMClassifier(  # head of this call is truncated in the source
                               reg_alpha=3,
                               reg_lambda=5,
                               max_depth=-1,
                               n_estimators=5000,
                               objective='binary',
                               subsample=0.9,
                               colsample_bytree=0.77,
                               subsample_freq=1,
                               learning_rate=0.05,
                               random_state=1000,
                               n_jobs=4,
                               min_child_weight=4,
                               min_child_samples=5,
                               min_split_gain=0)

skf = StratifiedKFold(n_splits=5, random_state=2018, shuffle=True)

oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test_id.shape[0])

best_score = []
for index, (train_index, test_index) in enumerate(skf.split(train, label)):
    lgb_model.fit(train.iloc[train_index],
                  label.iloc[train_index],
                  verbose=50,
                  eval_set=[(train.iloc[train_index], label.iloc[train_index]),
                            (train.iloc[test_index], label.iloc[test_index])],
                  early_stopping_rounds=30)
    best_score.append(lgb_model.best_score_['valid_1']['binary_logloss'])
    print(best_score)
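    # --- hedged continuation (the source listing stops above): fill the
    # out-of-fold and averaged test predictions declared earlier. `test` is a
    # hypothetical frame holding the same feature columns as `train`.
    oof_preds[test_index] = lgb_model.predict_proba(
        train.iloc[test_index], num_iteration=lgb_model.best_iteration_)[:, 1]
    sub_preds += lgb_model.predict_proba(
        test, num_iteration=lgb_model.best_iteration_)[:, 1] / skf.n_splits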
Example #7
def build_poi_id_model(features, labels, names):
    """
    Function to train classifier to predict labels given features

    Parameters
    ----------
    features:   list of dictionaries per dataset
    labels:     list of boolean labels

    Return values
    clf, features_list
    clf:            trained classifier
    features_list:  list of features used by the classifier
    """
    # Split into training and testing
    splitter = StratifiedKFold(n_splits=10)

    features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.05,
                         random_state=123456,
                         stratify=labels
                        )

    # Settings for persistence
    # In the persistence run, texts from emails are extracted for the
    # training and testing data sets and the results are persisted
    # to files.
    # If not in a persistence run, these files are only loaded and
    # processing of the emails is skipped

    # Pipeline to process email texts
    # First, extract texts from each person
    # then, optionally persist those texts
    # then, vectorize the texts
    # then, select only the percentile with the most separating power
    # then, convert result to dense array (needed for some classifiers)
    pipeline_email_text = Pipeline([
        ("GetEmailText", SelectMatchFeatures(feature_match="word_.*")),
        #("SelectPercentile", SelectPercentile(score_func=f_classif, percentile=10)),
        ("SelectPercentile", SelectKBest(score_func=chi2, k=250)),
        #("SVC", SelectFromModel(LinearSVC(class_weight="balanced", C=0.7), threshold=0.25)),
        # ("NaiveBayes", SelectFromModel(MultinomialNB(alpha=.5, fit_prior=False), threshold=0.5)),
        #("Scale", StandardScaler()),
    ])

    pipeline_subjects = Pipeline([
        ("GetEmailText", SelectMatchFeatures(feature_match="sub_.*")),
        ("SelectPercentile", SelectKBest(score_func=chi2, k=100)),
        # ("NaiveBayes", SelectFromModel(MultinomialNB(alpha=1, fit_prior=False))),
        #("Scale", StandardScaler())
    ])
    # Process financial features
    pipeline_financial = Pipeline([
        ("Selector",
         SelectFeatureList(selected_feature_list=FEATURES_FINANCIAL,
                           convert_to_numeric=True)),
        ("ConvertToVector", DictVectorizer(sparse=False)),
        ("Impute", ImputeOrZero(strategy="zero")),
        ("Log1P", FunctionTransformer(func=log_trans)),
    ])

    # Process other features
    # First, drop the email_address feature, which is only needed to
    # load the email texts
    # then, convert dictionary to dense vector
    pipeline_email = Pipeline([
        ("Selector",
         SelectFeatureList(selected_feature_list=FEATURES_EMAIL,
                           convert_to_numeric=True)),
        ("ConvertToVector", DictVectorizer(sparse=False)),
        ("Log1P", FunctionTransformer(func=log_trans)),
    ])

    feature_union = FeatureUnion(
        transformer_list=[
            ("email_text", pipeline_email_text),
            ("subjects", pipeline_subjects),
            ("financial", pipeline_financial),
            ("email", pipeline_email),
        ],
        #transformer_weights={'email_text': 0, 'subjects': 1, 'financial': 1, 'email': 1},
    )
    # Combine email text features and other features
    # then run classifier on these features
    pipeline_union = Pipeline([
        ("union", feature_union),
        ("Scale", StandardScaler()),
        #("Select", SelectKBest(score_func=f_classif, k=10)),
        # ("KNeighborsClassifier", KNeighborsClassifier()),
        ("KNeighborsClassifier",
         KNeighborsClassifier(n_neighbors=1,
                              metric='minkowski',
                              weights='distance')),
        # ("SVC", SVC(class_weight='balanced')),
        #  ("SVC", SVC(C=0.8, kernel='rbf', class_weight='balanced')),
        #  ("DecisionTree", RandomForestClassifier()),
        # ("DecisionTree", RandomForestClassifier(n_estimators=10, min_samples_split=6, min_samples_leaf=1, class_weight=None)),
        #("NaiveBayes", MultinomialNB(alpha=1, fit_prior=False)),
    ])

    # Fit the complete pipeline
    # Test accuracy of model
    param_grid_union = {
        "union__transformer_weights": [
            #    {'email_text': 1, 'subjects': 1, 'financial': 1, 'email': 1},
            #    {'email_text': 0, 'subjects': 1, 'financial': 1, 'email': 1},
            #    {'email_text': 1, 'subjects': 0, 'financial': 1, 'email': 1},
            #    {'email_text': 1, 'subjects': 1, 'financial': 0, 'email': 1},
            #    {'email_text': 1, 'subjects': 1, 'financial': 1, 'email': 0},
            #    {'email_text': 0, 'subjects': 0, 'financial': 1, 'email': 1},
            #    {'email_text': 0, 'subjects': 1, 'financial': 0, 'email': 1},
            #    {'email_text': 0, 'subjects': 1, 'financial': 1, 'email': 0},
            #    {'email_text': 1, 'subjects': 0, 'financial': 0, 'email': 1},
            #    {'email_text': 1, 'subjects': 0, 'financial': 1, 'email': 0},
            #    {'email_text': 1, 'subjects': 1, 'financial': 0, 'email': 0},
            #    {'email_text': 0, 'subjects': 0, 'financial': 0, 'email': 1},
            #    {'email_text': 0, 'subjects': 0, 'financial': 1, 'email': 0},
            {
                'email_text': 0,
                'subjects': 1,
                'financial': 0,
                'email': 0
            },
            #    {'email_text': 1, 'subjects': 0, 'financial': 0, 'email': 0},
        ],
        # "union__email_text__SelectPercentile__k": [10, 50, 100, 250, 500],
        # "union__email_text__SelectPercentile__score_func": [chi2, f_classif],
        # "union__subjects__SelectPercentile__k": [2, 3, 5, 10, 100, 200],
        # "union__subjects__SelectPercentile__score_func": [chi2, f_classif],
        # "union__financial__Impute__strategy": ["median", "zero"],
        # "DecisionTree__min_samples_split": [2,4,6],
        # "DecisionTree__min_samples_leaf": [1,2,4],
        # "DecisionTree__n_estimators": [5, 10, 20],
        # "NaiveBayes__alpha": [.5, .8, 1],
        #  "SVC__C": [.2, .5, .8, 1],
        #  "SVC__kernel": ['rbf', 'sigmoid', 'linear'],
        #  "SVC__class_weight": [None, 'balanced'],
        #  "SVC__probability": [False, True],
        "KNeighborsClassifier__n_neighbors": [1, 3, 5],
        "KNeighborsClassifier__weights": ["uniform", "distance"],
        "KNeighborsClassifier__metric": ["minkowski", "manhattan"]
    }

    grid_search_union = GridSearchCV(pipeline_union,
                                     param_grid=param_grid_union,
                                     cv=10,
                                     scoring="f1")
    start = time()
    np.random.seed(42)
    grid_search_union.fit(features, labels)

    print(
        "GridSearchCV took %.2f seconds for %d candidate parameter settings." %
        (time() - start, len(grid_search_union.cv_results_['params'])))
    report(grid_search_union.cv_results_)

    np.random.seed(42)
    best_est = np.flatnonzero(
        grid_search_union.cv_results_['rank_test_score'] == 1)[0]
    print(grid_search_union.cv_results_['params'][best_est])
    pipeline_union.set_params(
        **grid_search_union.cv_results_['params'][best_est])

    pred = cross_val_predict(pipeline_union, features, labels, cv=10)
    print(confusion_matrix(labels, pred))
    print(classification_report(labels, pred))
    print("Accuracy: ", accuracy_score(labels, pred))

    pickle.dump(pipeline_union, open("full_classifier.pkl", "wb"))

    # Prepare data for tester
    feature_select = FeatureUnion(transformer_list=[("subjects",
                                                     pipeline_subjects)])
    feature_transformed = feature_select.fit_transform(features, labels)
    # extract names of subject features
    sub_features = feature_select.transformer_list[0][1].named_steps[
        "GetEmailText"].get_feature_names()
    select_sub_features_idx = feature_select.transformer_list[0][
        1].named_steps["SelectPercentile"].get_support(indices=True)
    select_sub_features = np.take(sub_features,
                                  select_sub_features_idx).tolist()

    data_dict = VectorToDict(feature_names=select_sub_features,
                             dataset_names=names).fit_transform(
                                 feature_transformed, labels)

    # Prepare classifier for tester
    clf = Pipeline([
        ("Scale", StandardScaler()),
        ("KNeighborsClassifier",
         KNeighborsClassifier(n_neighbors=3,
                              metric="manhattan",
                              weights="distance")),
    ])
    # Return classifier, names of features used and data
    return clf, select_sub_features, data_dict
Example #8
def setup_kfold(X, Y, n_splits):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True,
                         random_state=SEED)  # random_state requires shuffle=True
    kf.get_n_splits(X)

    return kf
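# --- hedged usage sketch (not in the original): X and Y are assumed to be the
# feature matrix and label vector defined by the surrounding script.
kf = setup_kfold(X, Y, n_splits=5)
for fold, (train_idx, test_idx) in enumerate(kf.split(X, Y)):
    print('fold %d: %d train / %d test samples' %
          (fold, len(train_idx), len(test_idx)))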
Example #9
def cross_validate_models(X,
                          y,
                          clf_models,
                          seen_index,
                          n_splits=10,
                          classes=None,
                          upsample=False,
                          roundup=False,
                          df=None,
                          stratified_k=False,
                          test_index=None,
                          p_threshold=None):

    if stratified_k:
        label_encoder = LabelEncoder()
        kf = StratifiedKFold(n_splits=n_splits)
        kfs = kf.split(X[seen_index],
                       label_encoder.fit_transform(y[seen_index]))
    else:
        kf = KFold(n_splits=n_splits)
        kfs = kf.split(X[seen_index], y[seen_index])
    i = 0

    def tpr(y_true, y_pred):
        return roc_curve(y_true, y_pred)[1]

    def fpr(y_true, y_pred):
        return roc_curve(y_true, y_pred)[0]

    def prec(y_true, y_pred):
        return precision_recall_curve(y_true, y_pred)[0]

    def rec(y_true, y_pred):
        return precision_recall_curve(y_true, y_pred)[1]

    scores = [
        # name, function, on y when multiclass, on each class when multiclass, uses probabilities
        ('p', precision_score, True, True, False),
        ('r', recall_score, True, True, False),
        ('f1', f1_score, True, True, False),
        ('e', accuracy_score, True, True, False),
        ('i', None, False, False, False),
        ('auc', roc_auc_score, True, True, True),
        ('tpr', tpr, False, True, True),
        ('fpr', fpr, False, True, True),
        ('prec', prec, False, True, True),
        ('rec', rec, False, True, True)
    ]

    if classes:
        scores += [('cov_err', coverage_error, True, False, False),
                   ('LRAP', label_ranking_average_precision_score, True, False,
                    False), ('LRL', label_ranking_loss, True, False, False)]

    for model in clf_models:
        for m in scores:
            model[m[0]] = []
        metrics = ['e']
        if classes:
            for j, y_class in enumerate(classes):
                for m in scores:
                    if m[1]:
                        model[f'{m[0]}\n{y_class}'] = []

                metrics += [f'p\n{y_class}', f'r\n{y_class}']

    if test_index is not None:
        test_preds = []

    for k_train, k_test in kfs:
        k_train = seen_index[k_train]
        k_test = seen_index[k_test]
        if test_index is not None:
            k_test = test_index
        if upsample:
            ros = RandomOverSampler(random_state=42)
            if classes:
                lp = LabelPowerset()
                yt = lp.transform(y)
                X_train, y_resampled = ros.fit_resample(
                    X[k_train], yt[k_train])
                y_train = lp.inverse_transform(y_resampled).todense()
            else:
                X_train, y_train = ros.fit_resample(X[k_train],
                                                    y[k_train].todense())
        else:
            X_train = X[k_train]
            y_train = y[k_train]
        i += 1
        print(i)
        for model in clf_models:
            if callable(model['model']):
                clf = model['model'](X.shape[1], y.shape[1])
            else:
                clf = model['model']
            model['i'].append(i)

            if hasattr(clf, "epochs"):
                weights = None
                if clf.custom_weights:
                    weights = clf.custom_weights
                    # weights = {}
                    # for i,c in enumerate(classes):
                    #     weights[i] =  round((1-y[seen_index,i].sum()/len(seen_index))*50)
                    # print(weights)
                clf.fit(X_train,
                        y_train,
                        epochs=clf.epochs,
                        class_weight=weights,  # Keras expects a dict (or None), not dict values
                        verbose=clf.verbose,
                        batch_size=20)
            else:
                clf.fit(X_train, y_train)
            predictions = clf.predict(X[k_test])
            if np.ravel(predictions)[0] not in [1, 0]:
                predictions = predictions.round()
            try:
                predictions_proba = clf.predict_proba(X[k_test])
                if p_threshold is not None:
                    predictions = np.where(predictions_proba >= p_threshold, 1,
                                           0)[:, ]
            except Exception:
                predictions_proba = predictions
                print(
                    "WARNING! Can't predict probabilities with this model, just using binary predictions"
                )
            if hasattr(predictions_proba, "todense"):
                predictions_proba = predictions_proba.todense()
            if hasattr(predictions, "todense"):
                predictions = predictions.todense()
            if test_index is not None:
                test_preds.append(predictions_proba)

            if classes:
                if roundup:
                    # for j, c in enumerate(predictions_proba.argmax(axis=1)):
                    #     predictions[j,c] = 1
                    y_pred_arr = predictions_proba
                    ai = np.expand_dims(np.argmax(y_pred_arr, axis=1), axis=1)
                    maximums = np.maximum(y_pred_arr.max(1), 0.51)
                    np.put_along_axis(y_pred_arr,
                                      ai,
                                      maximums.reshape(ai.shape),
                                      axis=1)

                    predictions = np.round(predictions_proba)

                for m in scores:
                    if m[4]:
                        y_pred = predictions_proba
                    else:
                        y_pred = predictions

                    if not m[1] or not m[2]:
                        continue
                    try:
                        model[m[0]].append(m[1](y[k_test],
                                                y_pred,
                                                average="weighted"))
                    except TypeError:
                        model[m[0]].append(m[1](y[k_test], y_pred))
                    except ValueError:
                        pass
                for j, y_class in enumerate(classes):
                    # if y[k_train,i].sum() == 0:
                    #     print("no labels for {y_class}")
                    for m in scores:
                        if not m[1]:
                            continue
                        if m[3]:  # if do this metric on each class
                            if m[4]:  # if use probabilities
                                y_pred = predictions_proba
                            else:
                                y_pred = predictions
                            try:
                                model[f'{m[0]}\n{y_class}'].append(m[1](
                                    y[k_test, j], y_pred[:, j]))
                            except Exception:
                                model[f'{m[0]}\n{y_class}'].append(None)
                    if df is not None:
                        df.loc[
                            k_test,
                            f"{y_class} - k_prediction"] = predictions_proba[:,
                                                                             j]
                        df.loc[
                            k_test,
                            f"{y_class} - k_prediction_binary"] = predictions[:,
                                                                              j]
            else:
                for m in scores:
                    if not m[1]:
                        continue
                    model[m[0]].append(m[1](y[k_test], predictions))
                if df is not None:
                    df.loc[k_test, "y_k_prediction"] = predictions_proba[:, 1]

    if classes:
        if df is not None:
            return clf_models, metrics, df
        return clf_models, metrics
    else:
        if df is not None:
            return clf_models, df
        elif test_index is not None:
            return clf_models, np.array(test_preds)
        return clf_models
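# --- hedged usage sketch (assumed, not from the source): score two standard
# classifiers with stratified 5-fold CV over all labelled rows.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

models = [{'model': LogisticRegression(max_iter=1000)},
          {'model': RandomForestClassifier(n_estimators=100)}]
models = cross_validate_models(X, y, models, seen_index=np.arange(len(y)),
                               n_splits=5, stratified_k=True)
for m in models:
    print(type(m['model']).__name__, 'mean f1: %.3f' % np.mean(m['f1']))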
Example #10
def grid_search(data_folder, folds_count, **kwargs):
    """ Performs grid search of all possible combinations of given parameters with logarithmic ranges.
        Saves results in formatted file in location pointed by get_grid_search_results_path method """

    sentence_embeddings = kwargs['sentence_embeddings']
    word_embeddings = kwargs['word_embeddings']
    classifiers = kwargs['classifiers']
    n_jobs = kwargs['n_jobs'] if 'n_jobs' in kwargs else 1

    # prepare output files
    for classifier_class, _ in classifiers:
        our_classifier_wrapper = CLASSIFIERS_WRAPPERS[classifier_class]
        output_path = get_grid_search_results_path(data_folder,
                                                   our_classifier_wrapper)
        eval_output_path = get_evaluation_path(data_folder,
                                               our_classifier_wrapper)
        t_eval_output_path = get_train_set_evaluation_path(
            data_folder, our_classifier_wrapper)
        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))
        else:  # clear output file
            with open(output_path, 'w'):
                pass
        if not os.path.exists(os.path.dirname(eval_output_path)):
            os.makedirs(os.path.dirname(eval_output_path))
        else:  # clear evaluation output file
            with open(eval_output_path, 'w'):
                pass
        if not os.path.exists(os.path.dirname(t_eval_output_path)):
            os.makedirs(os.path.dirname(t_eval_output_path))
        else:  # clear train evaluation output file
            with open(t_eval_output_path, 'w'):
                pass

    skf = StratifiedKFold(n_splits=5)

    for word_emb_class, word_emb_params in word_embeddings:
        word_embedding = word_emb_class(*word_emb_params)
        word_embedding.build()
        for sen_emb_class in sentence_embeddings:
            sen_emb = sen_emb_class()
            feature_builder = FeatureBuilder()
            str_word_emb_params = ','.join(map(str, word_emb_params))
            embedding_desc = ';'.join([
                word_emb_class.__name__, str_word_emb_params,
                sen_emb_class.__name__
            ])
            print("Testing embedding: {0}".format(embedding_desc))

            sen_emb.build(word_embedding)
            feature_builder.build(sen_emb, LABELS, SENTENCES)

            # Train and test indices for double cross-validation
            train_index, test_index = next(
                skf.split(feature_builder.features, feature_builder.labels))

            for classifier_class, tested_params in classifiers:
                our_classifier_wrapper = CLASSIFIERS_WRAPPERS[classifier_class]

                output_path = get_grid_search_results_path(
                    data_folder, our_classifier_wrapper)
                eval_output_path = get_evaluation_path(data_folder,
                                                       our_classifier_wrapper)
                t_eval_output_path = get_train_set_evaluation_path(
                    data_folder, our_classifier_wrapper)

                combs = reduce(operator.mul,
                               map(len, tested_params.values()), 1)
                print(
                    "Testing {0} hyperparameters ({1} combinations)...".format(
                        classifier_class.__name__, combs))

                # for keras we need to create a sklearn wrapper to use GridSearchCV
                if classifier_class == KerasNeuralNetworkClassifier:
                    model = KerasClassifier(
                        build_fn=create_keras_model,
                        features_count=feature_builder.features.shape[1],
                        verbose=0)
                else:
                    model = classifier_class()

                if classifier_class == RandomForestClassifier or classifier_class == KerasNeuralNetworkClassifier:
                    # use 1 job because of high memory usage of these classifiers
                    clf = GridSearchCV(estimator=model,
                                       param_grid=tested_params,
                                       n_jobs=1,
                                       cv=folds_count)
                else:
                    clf = GridSearchCV(estimator=model,
                                       param_grid=tested_params,
                                       n_jobs=n_jobs,
                                       cv=folds_count)

                clf.fit(feature_builder.features[train_index],
                        feature_builder.labels[train_index])

                evaluation = clf.score(feature_builder.features[test_index],
                                       feature_builder.labels[test_index])
                t_evaluation = clf.score(feature_builder.features[train_index],
                                         feature_builder.labels[train_index])

                with open(output_path, 'a') as output_file:
                    for mean_score, params in zip(
                            clf.cv_results_['mean_test_score'],
                            clf.cv_results_['params']):
                        output_file.write('{:s};{:s};{:4.2f}\n'.format(
                            embedding_desc, str(params), mean_score * 100))

                with open(eval_output_path, 'a') as output_file:
                    output_file.write('{:s};{:4.2f}\n'.format(
                        embedding_desc, evaluation * 100))

                with open(t_eval_output_path, 'a') as output_file:
                    output_file.write('{:s};{:4.2f}\n'.format(
                        embedding_desc, t_evaluation * 100))
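# --- hedged usage sketch: every class name below (embeddings, classifier) is a
# hypothetical stand-in for whatever CLASSIFIERS_WRAPPERS actually supports.
grid_search('data/tweets',
            folds_count=5,
            word_embeddings=[(Word2VecEmbedding, [100])],
            sentence_embeddings=[AverageSentenceEmbedding],
            classifiers=[(SVC, {'C': [0.1, 1.0, 10.0]})],
            n_jobs=4)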
Example #11
    def parameterSearch(self,
                        paramSets,
                        X,
                        Y,
                        numSplits=2,
                        valSplit=0.0,
                        epochs=1,
                        batchSize=None,
                        saveModel=False,
                        visualize=False,
                        saveLoc=''):
        # create CV data (leave-one-out-style validation)
        #numSplits = 2
        Kf = StratifiedKFold(n_splits=numSplits)
        callBacks = [
            EarlyStopping(monitor='val_loss',
                          patience=3,
                          restore_best_weights=True)
        ]
        if (visualize):
            callBacks.append(
                TensorBoard(log_dir='./logs',
                            histogram_freq=3,
                            write_graph=False,
                            write_images=False,
                            update_freq='epoch',
                            profile_batch=2,
                            embeddings_freq=0,
                            embeddings_metadata=None))

        #for each parameter set
        # make a model
        #
        #X = [0,1,2,3,4,5,6,7,8,9]
        modelFile = open(self.outputPath + "fileModel.csv", 'w')
        resultFile = open(self.outputPath + "fileResult.csv", 'w')
        resultFile.write(
            "modelNum|True REM|False REM|False NonREM|True NonREM|Acc|Sens|Spec|Recall|Precision|f1score|finalLoss\n"
        )
        modelNum = 0
        for paramSet in paramSets:

            modelFile.write(str(modelNum) + "|")
            json.dump(paramSet, modelFile)
            modelFile.write("\n")
            print("\n\n=================\nTesting Model " + str(modelNum) +
                  "\n=================\n")
            #print(paramSet, flush=True)
            try:
                model = self.convModel(paramSet)
                print(model.summary())
                #model.save_weights('temp_weights.h5')
                j = 0
                for trainInd, testInd in Kf.split(X, np.argmax(Y, axis=1)):

                    fitHistory = model.fit(X[trainInd],
                                           Y[trainInd],
                                           batch_size=batchSize,
                                           verbose=0,
                                           validation_split=valSplit,
                                           epochs=epochs,
                                           callbacks=callBacks)
                    if (saveModel):
                        modelWeightFile = saveLoc + f'{modelNum}.{j}.weights.h5'
                        model.save_weights(modelWeightFile)
                        #model.save(modelWeightFile)
                    Ypred = np.zeros((testInd.shape[0], Y.shape[1]))
                    Yi = 0
                    for pred in np.argmax(model.predict(X[testInd],
                                                        batch_size=None),
                                          axis=1):
                        Ypred[Yi][pred] = 1
                        Yi += 1

                    #NOTE:
                    #confusionMatrix = multilabel_confusion_matrix(Y[testInd], Ypred)[0]
                    ##print(confusionMatrix)
                    ##confusionMatrix = confusion_matrix(np.argmax(Y[testInd], axis=1), np.argmax(Ypred, axis=1))
                    ##print(confusionMatrix)
                    ##print('f1_score:',f1_score(Y[testInd], Ypred, average='macro'))
                    #resultFile.write(str(modelNum) + "|")
                    ##for row in confusionMatrix:
                    ##    for el in row:
                    ##        resultFile.write(str(el) + "|")
                    ##"modelNum|True REM|False NonREM|False REM|True NonREM|Acc|Sens|Spec|Recall|Precision|f1score\n"
                    #
                    #tn = confusionMatrix[0][0]
                    #fn = confusionMatrix[1][0]
                    #tp = confusionMatrix[1][1]
                    #fp = confusionMatrix[0][1]

                    tp = tn = fn = fp = 0
                    Yi = 0
                    for y in Y[testInd]:
                        tp += Ypred[Yi][0] * y[0]
                        fp += max(Ypred[Yi][0] - y[0], 0)
                        tn += Ypred[Yi][1] * y[1]
                        fn += max(Ypred[Yi][1] - y[1], 0)
                        Yi += 1

                    acc = sens = spec = prec = rec = f1 = 0
                    acc = (tp + tn) / (tp + tn + fp + fn)
                    if (tp + fn > 0):
                        sens = tp / (tp + fn)
                    if (tn + fp > 0):
                        spec = tn / (tn + fp)
                    if (tp + fp > 0):
                        prec = tp / (tp + fp)
                    if (tp + fn > 0):
                        rec = tp / (tp + fn)
                    if (prec + rec > 0):
                        f1 = 2 * ((prec * rec) / (prec + rec))
                    resultFile.write(
                        f"{modelNum}|{tp:.3f}|{fp:.3f}|{fn:.3f}|{tn:.3f}|{acc:.3f}|{sens:.3f}|{spec:.3f}|{rec:.3f}|{prec:.3f}|{f1:.3f}|{fitHistory.history['loss'][-1]:10.3f}\n"
                    )
                    print(
                        f"{'Validate':10s}|{'modelNum':10s}|{'tp':10s}|{'fp':10s}|{'fn':10s}|{'tn':10s}|{'acc':10s}|{'sens':10s}|{'spec':10s}|{'rec':10s}|{'prec':10s}|{'f1':10s}|{'loss':10s}\n"
                    )
                    print(
                        f"{j:10d}|{modelNum:10d}|{tp:10.3f}|{fp:10.3f}|{fn:10.3f}|{tn:10.3f}|{acc:10.3f}|{sens:10.3f}|{spec:10.3f}|{rec:10.3f}|{prec:10.3f}|{f1:10.3f}|{fitHistory.history['loss'][-1]:10.3f}\n",
                        flush=True)

                    #resultFile.write(str(f1_score(Y[testInd], Ypred, average='macro')) + "|\n")
                    #model.load_weights('temp_weights.h5')
                    self.reset_weights(model)
                    j += 1

            except Exception as e:
                resultFile.write("error\n")
                print(str(e))

            K.clear_session()
            modelNum += 1

            if self.killer.kill_now:
                resultFile.write("killed\n")
                print("killed")
                break

        modelFile.close()
        resultFile.close()
Example #12
# (a preceding numpy.loadtxt call for `dataset_clinical` is truncated here)
dataset_exp = numpy.loadtxt(os.path.join(path, fileexp), delimiter="\t")# Change the path to your local system
dataset_cnv = numpy.loadtxt(os.path.join(path, filecnv), delimiter="\t")# Change the path to your local system

# split into input (X) and output (Y) variables
X_clinical = dataset_clinical[:,0:25]
Y_clinical = dataset_clinical[:,25]
# split into input (X) and output (Y) variables
X_exp = dataset_exp[:,0:400]
Y_exp = dataset_exp[:,400]
# split into input (X) and output (Y) variables
X_cnv = dataset_cnv[:,0:200]
Y_cnv = dataset_cnv[:,200]

print('*********************************Training the Clinical CNN *****************************************')
# kfold_value fold cross validation
kfold = StratifiedKFold(n_splits=kfold_value, shuffle=False)  # random_state only applies when shuffle=True
acc_clinical = []
Pr_clinical = []
Sn_clinical = []
Mcc_clinical = []
i=1  
for train_index, test_index in kfold.split(X_clinical, Y_clinical):
	print(i,"th Fold *****************************************")
	i=i+1
	x_train_clinical, x_test_clinical=X_clinical[train_index],X_clinical[test_index]	
	y_train_clinical, y_test_clinical = Y_clinical[train_index],Y_clinical[test_index] 	
	x_train_clinical = numpy.expand_dims(x_train_clinical, axis=2)
	x_test_clinical = numpy.expand_dims(x_test_clinical, axis=2)
	# first Clinical CNN Model
	init =initializers.glorot_normal(seed=1)
	bias_init =initializers.Constant(value=0.1)
Example #13
V = pca.fit(X)  # note: fit() returns the fitted estimator, not the components
varPC = pca.explained_variance_ratio_

lambdas = pca.singular_values_
full_dict['full_mat'] = X
full_dict['labels'] = y
full_dict['V'] = V
full_dict['varPC'] = varPC



# X is your cell gene matrix, y is your class labels
#%%
# Split into train/test
kCV = 5
skf = StratifiedKFold(n_splits=kCV, shuffle= True)
Atrain = {}
Atest = {}
ytest = {}
ytrain = {}
proprestest = {}
proprestrain = {}

folds_dict = {'trainmat':{}, 'trainlabel':{}, 'eigenvectors':{}, 'eigvals':{}, 'meanvec':{}}
for i, (train_index, test_index) in enumerate(skf.split(X, y)):  # one entry per fold
    Atrain[i] = X.iloc[train_index, :]
    Atest[i] = X.iloc[test_index, :]
    ytest[i] = y[test_index]
    ytrain[i] = y[train_index]
    proprestest[i] = sum(ytest[i]) / len(ytest[i])
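    # --- hedged continuation (assumed): fill the per-fold entries of the
    # folds_dict declared above, using scikit-learn's PCA for the
    # eigenvector/eigenvalue/mean summaries of the training matrix.
    pca_fold = PCA().fit(Atrain[i])
    folds_dict['trainmat'][i] = Atrain[i]
    folds_dict['trainlabel'][i] = ytrain[i]
    folds_dict['eigenvectors'][i] = pca_fold.components_
    folds_dict['eigvals'][i] = pca_fold.explained_variance_
    folds_dict['meanvec'][i] = pca_fold.mean_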
Example #14
def main(config: DictConfig) -> None:
    prepair_dir(config)
    set_seed(config.data.seed)
    label_cols = [
        "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"
    ]
    train_dfs, names = load_train_data(config.store.workdir)
    test_dfs = load_test_data(config.store.workdir)
    remove_cols = [
        "knn_age_pred",
        "knn_domain1_var1",
        "densenet121_age_pred",
        "densenet121_domain1_var1_pred",
        "densenet121_domain1_var2_pred",
        "densenet121_domain2_var2_pred",
        "3dcnn_resnet18_domain1_var2_pred",
        "3dcnn_resnet18_domain2_var1_pred",
        "3dcnn_resnet18_domain2_var2_pred",
        "1dresnet18_domain1_var1_pred",
        "1dresnet18_domain1_var2_pred",
        "1dresnet18_domain2_var2_pred",
        "simple_3dcnn_domain1_var1_pred",
        "simple_3dcnn_domain1_var2_pred",
        "simple_3dcnn_domain2_var2_pred",
        "transformer_domain2_var1_pred",
        "transformer_domain2_var2_pred",
        "transformer_domain1_var1_pred",
        "transformer_domain1_var2_pred",
        "lgbm_gnn_feature_domain1_var2_pred",
        "lgbm_gnn_feature_domain2_var2_pred",
        "lgbm_gnn_featured_domain1_var2_pred",
        "lgbm_gnn_featured_domain2_var2_pred",
        "lgbm_cnn_feature_domain1_var2_pred",
        "lgbm_cnn_feature_domain2_var2_pred",
        "lgbm_2plus1dcnn_feature_domain1_var2_pred",
        "lgbm_2plus1dcnn_feature_domain2_var2_pred",
        "xgb_2plus1dcnn_feature_age_pred",
        "xgb_2plus1dcnn_feature_domain1_var2_pred",
        "xgb_2plus1dcnn_feature_domain2_var2_pred",
        "simple_3dcnn_domain2_var1_pred",
        "simple_3dcnn_3label_domain1_var2_pred",
        "gin_domain1_var1_pred",
        "gin_domain2_var1_pred",
        "2plus1dcnn_resnet10_domain1_var2_pred",
        "resnest14d_domain1_var1_pred",
        "resnest14d_domain1_var2_pred",
        "resnest14d_domain2_var2_pred",
    ]
    train_ft_dict = {}
    test_ft_dict = {}
    feature_cols = []
    train_ft_dict["Id"] = train_dfs[0]["Id"]
    test_ft_dict["Id"] = test_dfs[0]["Id"]
    for label_col in label_cols:
        train_ft_dict[label_col] = train_dfs[0][label_col]
    for name, df in zip(names, train_dfs):
        for label_col in label_cols:
            if (f"{label_col}_pred" in df.columns
                    and f"{name}_{label_col}_pred" not in remove_cols):
                train_ft_dict[f"{name}_{label_col}_pred"] = df[
                    f"{label_col}_pred"]
                feature_cols += [f"{name}_{label_col}_pred"]
            elif f"{name}_{label_col}_pred" in remove_cols:
                df.drop(f"{label_col}_pred", axis=1, inplace=True)

        feat_dict = make_domain_feature(df, mode="train", name=name)
        train_ft_dict.update(feat_dict)
        feature_cols += list(feat_dict.keys())

    for name, df in zip(names, test_dfs):
        for label_col in label_cols:
            for i in range(5):
                if (f"{label_col}_pred_fold{i}" in df.columns
                        and f"{name}_{label_col}_pred" not in remove_cols):
                    test_ft_dict[f"{name}_{label_col}_pred_fold{i}"] = df[
                        f"{label_col}_pred_fold{i}"]
                elif (f"{name}_{label_col}_pred" in remove_cols
                      and f"{label_col}_pred_fold{i}" in df.columns):
                    df.drop(f"{label_col}_pred_fold{i}", axis=1, inplace=True)
        feat_dict = make_domain_feature(df, mode="test", name=name)
        test_ft_dict.update(feat_dict)
    train_df = pd.DataFrame(train_ft_dict)
    test_df = pd.DataFrame(test_ft_dict)
    train_df["age"] = (
        pd.read_csv(f"{config.store.workdir}/input/train_scores.csv"
                    ).sort_values("Id").reset_index(drop=True)["age"])
    age_rank = train_df["age"].values // 10 * 10
    skf = StratifiedKFold(n_splits=5, random_state=777, shuffle=True)

    train_df, test_df = preprocess(train_df, test_df, feature_cols)
    for feature_col in feature_cols:
        train_df[feature_col].fillna(0, inplace=True)
        test_df[feature_col].fillna(0, inplace=True)
    train_df = cudf.from_pandas(train_df)
    test_df = cudf.from_pandas(test_df)
    if config.randomize_age:
        set_seed(777_777_777)
        train_df["age"] += np.array(
            [randomize_age(age) for age in train_df["age"].values])
    skf = StratifiedKFold(n_splits=5, random_state=777, shuffle=True)
    train_df = train_df.reset_index(drop=True)
    logger.info("=" * 10 + "parameter search" + "=" * 10)
    best_c = {}
    for label_col in label_cols:
        best = np.inf
        if label_col == "age":
            feature_cols_ = [
                col for col in feature_cols if f"{label_col}" in col
            ]
        else:
            feature_cols_ = feature_cols
        for c in [2**(i) for i in range(-14, 1)]:
            y_oof = np.zeros(train_df.shape[0])
            for n_fold, (train_index,
                         val_index) in enumerate(skf.split(age_rank,
                                                           age_rank)):
                train_df_fold = train_df.iloc[train_index]
                valid_df_fold = train_df.iloc[val_index]
                train_df_fold = train_df_fold[
                    train_df_fold[label_col].notnull()]
                model = SVR(kernel="linear", C=c, cache_size=3000.0)
                model.fit(train_df_fold[feature_cols_],
                          train_df_fold[label_col])
                y_oof[val_index] = model.predict(
                    valid_df_fold[feature_cols_]).to_array()
                test_df[f"{label_col}_pred_fold{n_fold}"] = model.predict(
                    test_df[feature_cols_])
            train_df[f"{label_col}_pred"] = y_oof
            notnull_idx = train_df[label_col].notnull()
            score = normalized_absolute_errors(
                train_df[notnull_idx][label_col].values,
                train_df[notnull_idx][f"{label_col}_pred"].values,
            )
            logger.info(f"c={c}, {label_col}: {score}")
            if score <= best:
                best = score
                best_c[label_col] = c
    logger.info("=" * 10 + "prediction" + "=" * 10)
    for label_col in label_cols:
        y_oof = np.zeros(train_df.shape[0])
        if label_col == "age":
            feature_cols_ = [
                col for col in feature_cols if f"{label_col}" in col
            ]
        else:
            feature_cols_ = feature_cols
        for n_fold, (train_index,
                     val_index) in enumerate(skf.split(age_rank, age_rank)):
            train_df_fold = train_df.iloc[train_index]
            valid_df_fold = train_df.iloc[val_index]
            train_df_fold = train_df_fold[train_df_fold[label_col].notnull()]
            model = SVR(kernel="linear",
                        C=best_c[label_col],
                        cache_size=3000.0)
            model.fit(train_df_fold[feature_cols_], train_df_fold[label_col])
            y_oof[val_index] = model.predict(
                valid_df_fold[feature_cols_]).to_array()
            test_df[f"{label_col}_pred_fold{n_fold}"] = model.predict(
                test_df[feature_cols_])
        train_df[f"{label_col}_pred"] = y_oof
        notnull_idx = train_df[label_col].notnull()
        score = normalized_absolute_errors(
            train_df[notnull_idx][label_col].values,
            train_df[notnull_idx][f"{label_col}_pred"].values,
        )
        logger.info(f"c={best_c[label_col]}, {label_col}: {score}")
    score = 0
    for label_col, weight in zip(label_cols,
                                 [0.3, 0.175, 0.175, 0.175, 0.175]):
        notnull_idx = train_df[label_col].notnull()
        score += (normalized_absolute_errors(
            train_df[notnull_idx][label_col].to_array(),
            train_df[notnull_idx][f"{label_col}_pred"].to_array(),
        ) * weight)
    logger.info(f"all: {score}")
    train_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_train.csv"),
        index=False,
    )
    test_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_test.csv"),
        index=False,
    )
    if config.store.gcs_project is not None:
        upload_directory(config.store)

    sub_df = make_submission(test_df)
    sub_df.to_csv(
        os.path.join(config.store.result_path,
                     f"{config.store.model_name}_submission.csv"),
        index=False,
    )
Example #15
    x = range(1, min(len_max, n_pre_subs) + 1)

    plt.cla()
    plt.plot(x, means, label='mean blend')
    plt.plot(x, medians, label='median blend', color='r')
    plt.text(x[-1], means[-1], str(means[-1]))
    plt.text(x[-1], medians[-1], str(medians[-1]))
    plt.legend()
    plt.xlabel('# pre-subs')
    plt.ylabel('score')
    plt.title('score vs # pre-subs')
    plt.savefig('output/validation.png')


folds = StratifiedKFold(n_splits=N_VALIDATION_SPLITS,
                        shuffle=True,
                        random_state=int(time()))
for N_VALIDATION_SPLITS_iter, (index_train,
                               index_valid) in enumerate(folds.split(X, y)):
    X_train, y_train = X.iloc[index_train], y.iloc[index_train]
    X_valid, y_valid = X.iloc[index_valid], y.iloc[index_valid]

    print('\n====== validation {}/{} ======='.format(
        N_VALIDATION_SPLITS_iter + 1, N_VALIDATION_SPLITS))
    print('#presubs|\tscore (mean / median)')

    predictions_list = []

    scores_val_mean = []
    scores_val_median = []
Example #16
#convert Dataframe to Array for Model Training
x_train = x_train.values
        
#################
#LOGISTIC REGRESSION
#################
print('\nLOGISTIC REGRESSION')

#Grid Search
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

CV_acc = 0
best_c = -1000
C = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]
kfolds = StratifiedKFold(n_splits = 10, shuffle = True)

#Testing Different Values for C
for c in C:
    predicted = []
    lr_gs = LogisticRegression(solver = 'liblinear', C = c)
    
    #10-Fold Cross Validation
    for training, testing in kfolds.split(x_train, y_train):
        lr_gs.fit(x_train[training], y_train[training])
        pred = lr_gs.predict(x_train[testing])
        
        for p in pred:
            predicted.append(p)
    
    score = accuracy_score(y_train, predicted)
    # track the best C (the source listing is cut off here; CV_acc and best_c
    # are otherwise never updated)
    if score > CV_acc:
        CV_acc = score
        best_c = c
Example #17
def run_imli(DATASET_NAME, ID_HOLDOUT, CATEGORICAL_FEATURES):
    # Load Dataset
    import pandas as pd
    HOLDOUT_DIR = "/root/skripsi/imli/experiment/%s/holdout/" % DATASET_NAME
    df_total = pd.read_csv(HOLDOUT_DIR + ("%s.csv" % DATASET_NAME))
    df_train_val = pd.read_csv(HOLDOUT_DIR + ("%s-%d-train.csv" % (DATASET_NAME, ID_HOLDOUT)))
    df_test = pd.read_csv(HOLDOUT_DIR + ("%s-%d-test.csv" % (DATASET_NAME, ID_HOLDOUT)))

    import numpy as np
    
    df_total_np = df_total.values
    X_total = df_total_np[:, :-1]
    y_total = df_total_np[:, -1]

    df_train_val_np = df_train_val.values
    X_train_val = df_train_val_np[:, :-1]
    y_train_val = df_train_val_np[:, -1]

    df_test_np = df_test.values
    X_test = df_test_np[:, :-1]
    y_test = df_test_np[:, -1]

    # Parameters
    from sklearn.model_selection import StratifiedKFold
    k_cv = 10
    lambdas = [5, 10]
    n_clauses = [1, 2, 3]
    rule_types = ["CNF", "DNF"]
    partition_sizes = [8, 16, 32, 64, 128]
    is_floors = [True, True, True, True, True]
    solver="/root/open-wbo/open-wbo"
    timeout=1000
    
    from IMLI import IMLI
    features = IMLI().generate_features(
        X_total,
        y_total,
        categorical_features_id=[int(u) for u in CATEGORICAL_FEATURES],
        discretizer="entropy"
#         n_thresholds=9
    )

    # Testing
    import time

    print("l: lamda, n_clause, rule_type, id_cv, train_size, val_size, n_partitions, real_partition_size, n_features, training_time, val_accuracy, rule_size, rule, classification_report_train, classification_report_val")
    
    best_mean_val_acc = 0 # to choose hyperparameter (lamda, n_clause, rule_type)
    chosen_val_accs = [] # list of cv val accs from chosen hyperparameter
    chosen_params = {} # chosen hyperparameter
    chosen_training_time = [] # list of training times from the chosen hyperparameter
    
    # finding best hyperparameter
    for lamda in lambdas:
        for n_clause in n_clauses:
            for rule_type in rule_types:
                for partition_size, is_floor in zip(partition_sizes, is_floors):
                    
                    skf = StratifiedKFold(n_splits=k_cv, shuffle=True, random_state=42)
                    id_cv = 0
                    
                    val_accs = []
                    training_times = []
                    n_partitions_s = []
                    
                    # cross validation start
                    for train_id, val_id in skf.split(X_train_val, y_train_val):
                        N = train_id.shape[0]
                        
                        if is_floor: # partition_size is the minimum allowed partition_size
                            n_partitions = N // partition_size
                        else:
                            n_partitions = (N + partition_size - 1) // partition_size
                            
                        n_partitions_s.append(n_partitions)
                        
                        train_size = N
                        val_size = val_id.shape[0]
                        
                        real_partition_size = [N // n_partitions, (N + n_partitions - 1) // n_partitions]
                        
                        imli = IMLI(
                            n_clauses=n_clause,
                            lamda=lamda,
                            rule_type=rule_type,
                            solver=solver,
                            n_partitions=n_partitions,
                            timeout=timeout)

                        start_time = time.time()
                        imli.fit(X_train_val[train_id], y_train_val[train_id], features)
                        end_time = time.time()
                        training_time = end_time - start_time
                        
                        training_times.append(training_time)
                        
                        # report validation data
                        y_true = y_train_val[val_id]
                        y_pred = imli.predict(X_train_val[val_id])

                        from sklearn.metrics import accuracy_score, classification_report
                        accuracy = accuracy_score(y_true, y_pred)
                        val_accs.append(accuracy)
                        
                        classification_report_val = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
                        
                        # report training data
                        y_true_train = y_train_val[train_id]
                        y_pred_train = imli.predict(X_train_val[train_id])
                        
                        classification_report_train = classification_report(y_true_train, y_pred_train, output_dict=True, zero_division=0)
                        
                        n_features = imli.n_features
                        rule = imli.get_rule()
                        rule_size = imli.get_rule_size()
                        
                        # lamda, n_clause, rule_type, id_cv, train_size, val_size, n_partitions, real_partition_size, n_features, training_time, val_accuracy, rule_size, rule, classification_report_train, classification_report_val
                        
                        print("l: %d,%d,%s,%d,%d,%d,%d,%s,%d,%f,%f,%d,%s,%s,%s" % (
                            lamda,
                            n_clause,
                            rule_type,
                            id_cv,
                            train_size,
                            val_size,
                            n_partitions,
                            str(real_partition_size).replace(',', ';'),
                            n_features,
                            training_time,
                            accuracy,
                            rule_size,
                            rule,
                            str(classification_report_train).replace(',', ';'),
                            str(classification_report_val).replace(',', ';')
                        ))
                        
                        id_cv += 1
                        
                    # cross validation done
                    mean_val_acc_cv = np.mean(val_accs)
                    std_val_acc_cv = np.std(val_accs)
                    mean_training_time = np.mean(training_times)
                    std_training_time = np.std(training_times)
                    
                    # n_partitions
                    (values, counts) = np.unique(n_partitions_s, return_counts=True)
                    n_partitions = values[counts.argmax()]
                    
                    params = {
                        'lamda': lamda,
                        'n_clause': n_clause,
                        'rule_type': rule_type,
                        'real_partition_size': real_partition_size,
                        'n_partitions': n_partitions
                    }
                    
                    print("r: ---> Params: %s, (mean val acc cv) %f (+- %f), (mean training time) %f (+- %f)" % (
                        str(params),
                        mean_val_acc_cv,
                        std_val_acc_cv,
                        mean_training_time,
                        std_training_time
                    ))

                    if (mean_val_acc_cv > best_mean_val_acc):
                        best_mean_val_acc = mean_val_acc_cv
                        chosen_val_accs = val_accs
                        chosen_params = params
                        chosen_training_time = training_times
                        
    # retrain using best hyperparameter
    best_model = IMLI(
        n_clauses=chosen_params['n_clause'],
        lamda=chosen_params['lamda'],
        rule_type=chosen_params['rule_type'],
        solver=solver,
        n_partitions=chosen_params['n_partitions'],
        timeout=timeout)
    
    start_time = time.time()
    best_model.fit(X_train_val, y_train_val, features)
    end_time = time.time()
    retrain_training_time = end_time - start_time
    
    # report test
    y_pred_test = best_model.predict(X_test)
    y_true_test = y_test

    from sklearn.metrics import accuracy_score, classification_report
    test_acc_holdout = accuracy_score(y_true_test, y_pred_test)
    test_classification_report_holdout = classification_report(y_true_test, y_pred_test, output_dict=True, zero_division=0)
    
    # report train val
    y_pred_train_val = best_model.predict(X_train_val)
    y_true_train_val = y_train_val
    train_val_classification_report_holdout = classification_report(y_true_train_val, y_pred_train_val, output_dict=True, zero_division=0)
    
    
    N = X_train_val.shape[0]
    n_partitions = chosen_params['n_partitions']
    real_partition_size = [N // n_partitions, (N + n_partitions - 1) // n_partitions]
        
    print("t: ------------------")
    print("t: HOLDOUT %d" % ID_HOLDOUT)
    print("t: size training:", X_train_val.shape)
    print("t: size testing:", X_test.shape)
    print("t: test_acc_holdout: %f" % test_acc_holdout)
    print("t: ")
    print("t: best_val_accs:", chosen_val_accs)
    print("t: best_mean_val_acc: %f (+- %f)" % (np.mean(chosen_val_accs), np.std(chosen_val_accs)))
    print("t: chosen_params: {}".format(chosen_params))
    print("t: ")
    print("t: chosen_cv_training_time: %s" % chosen_training_time)
    print("t: chosen_mean_cv_training_time: %f (+- %f)" % (np.mean(chosen_training_time), np.std(chosen_training_time)))
    print("t: retrain_training_time: %f" % retrain_training_time)
    print("t: ")
    print("t: best_model_rule_size (retrained with params): %d" % best_model.get_rule_size())
    print("t: best_model_rule (retrained with params):")
    print("t: " + best_model.get_rule())
    print("t: ")
    print("t: n_features (retrained with params): %d" % best_model.n_features)
    print("t: real_partition_size (retrained with params): %s" % real_partition_size)
    print("t: ")
    print("t: test_classification_report_holdout: %s" % str(test_classification_report_holdout).replace(',', ';'))
    print("t: ")
    print("t: train_val_classification_report_holdout: %s" % str(train_val_classification_report_holdout).replace(',', ';'))
Example #18
0
print('Total train data len: ' + str(len(train_labels)) + ' | Positive samples: ' + str(sum(train_labels)))
print('Total test data len: ' + str(len(test_labels)) + ' | Positive samples: ' + str(sum(test_labels)))
print('Train Features shape ', train_features.shape)
print('Test Features shape ', test_features.shape)

oversample = SMOTE()
train_features, train_labels = oversample.fit_resample(train_features, train_labels)
print('After Up sampling')
print('Total train data len: ' + str(len(train_labels)) + ' | Positive samples: ' + str(sum(train_labels)))
print('Train Features shape ', train_features.shape)

for kernel_ in ["poly", "rbf"]:
    print('***********************************', kernel_, '***********************************')
    k = 5
    kf = StratifiedKFold(n_splits=k, shuffle=False)
    model = svm.SVC(kernel=kernel_, gamma='auto')
    for e, (train_idx, test_idx) in enumerate(kf.split(train_features, train_labels)):
        print(' ---------- KFOLD ', e)
        tr_features, tr_labels = train_features[train_idx], train_labels[train_idx]
        te_features, te_labels = train_features[test_idx], train_labels[test_idx]
        model.fit(tr_features, tr_labels)
        predictions = model.predict(tr_features)
        train_metrics = accuracy_fn(predictions, tr_labels, threshold=threshold)
        train_metrics = {'train_' + k: v for k, v in train_metrics.items()}
        print(f'***** Train Metrics ***** ')
        print(
                f"Accuracy: {'%.5f' % train_metrics['train_accuracy']} "
                f"| UAR: {'%.5f' % train_metrics['train_uar']}| F1:{'%.5f' % train_metrics['train_f1']} "
                f"| Precision:{'%.5f' % train_metrics['train_precision']} "
                f"| Recall:{'%.5f' % train_metrics['train_recall']} | AUC:{'%.5f' % train_metrics['train_auc']}")
Example #19
0
print(data["Embarked"].unique())
print(data["Embarked"].value_counts())
data.loc[data["Embarked"] == "S", "Embarked"] = 0
data.loc[data["Embarked"] == "C", "Embarked"] = 1
data.loc[data["Embarked"] == "Q", "Embarked"] = 2
data["Embarked"] = data["Embarked"].fillna(3)
# data.loc[data["Embarked"]==None,"Embarked"]=3
print(data["Embarked"].describe())
print(data["Embarked"].unique())
print(data["Embarked"].value_counts())

print("--------------LogisticRegression---------------")
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
train = data[predictors].values
y = data["Survived"]
print(train)
# print(train.describe())
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)  # stratified split; random_state requires shuffle=True in current sklearn
scores = []
# for train_index, test_index in kfold.split(train,y):
#     # print("Train Index:", train_index, ",Test Index:", test_index)
#     X_train,X_test=train[train_index],train[test_index]
#     y_train,y_test=y[train_index],y[test_index]
#     lr = LogisticRegression()
#     lr.fit(X_train,y_train)
#     score = lr.score(X_test,y_test)
#     scores.append(score)
#     print(score)
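
# A compact, live equivalent of the commented-out loop above; a sketch assuming
# LogisticRegression and cross_val_score are imported from sklearn:
scores = cross_val_score(LogisticRegression(solver='liblinear'), train, y, cv=kfold)
print("mean CV accuracy:", scores.mean())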

print(data["Age"])
print(pd.qcut(data['Age'], 5))
Example #20
0
            (rows, cols) = (local_labels == i).nonzero()
            samples = len(rows)
            for p in range(samples):
                data[cont, :] = img[rows[p], cols[p], :].flatten()
                labels[cont] = i - 1
                cont += 1

data /= 255

crossval_splits = 5
accuracy = numpy.zeros(crossval_splits)
sensitivity = numpy.zeros(crossval_splits)
specificity = numpy.zeros(crossval_splits)
cont = 0

skf = StratifiedKFold(n_splits=crossval_splits, shuffle=True, random_state=123)
skf.get_n_splits(data, labels)

for train_index, test_index in skf.split(data, labels):
    train_data, test_data = data[train_index], data[test_index]
    train_labels, test_labels = labels[train_index], labels[test_index]
    #XGB Classifier
    model = XGBClassifier(use_label_encoder=False,
                          booster='gbtree',
                          random_state=123)
    model.fit(train_data, train_labels)

    #Compute scores
    pred = model.predict(test_data)
    predictions = [round(value) for value in pred]
    predictions = numpy.asarray(predictions)
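
    # A sketch of the per-fold scoring the metric arrays above suggest, assuming
    # binary labels and an imported sklearn.metrics.confusion_matrix:
    tn, fp, fn, tp = confusion_matrix(test_labels, predictions).ravel()
    accuracy[cont] = (tp + tn) / (tp + tn + fp + fn)
    sensitivity[cont] = tp / (tp + fn)
    specificity[cont] = tn / (tn + fp)
    cont += 1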
# Reconstructed head of this truncated snippet: a train/validation split
# (the 0.20 test_size, variable names, and train_test_split import are assumptions)
X_train, X_validation, Y_train, Y_validation = train_test_split(X, y,
                                                                test_size=0.20,
                                                                random_state=1,
                                                                shuffle=True)
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression(solver='liblinear',
                                        multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    cv_results = cross_val_score(model,
                                 X_train,
                                 Y_train,
                                 cv=kfold,
                                 scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
# Compare Algorithms
pyplot.boxplot(results, labels=names)
pyplot.title('Algorithm Comparison')
pyplot.show()

# Make predictions on validation dataset
model = SVC(gamma='auto')
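
# A sketch of the fit/evaluate step the comment above announces, assuming the
# earlier split also produced X_validation / Y_validation and that
# accuracy_score is imported from sklearn.metrics:
model.fit(X_train, Y_train)
predictions = model.predict(X_validation)
print(accuracy_score(Y_validation, predictions))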
# Reconstructed head of this truncated snippet: a 75/25 train/test split
# (variable names X/y and the train_test_split import are assumptions)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=100)

# In[13]:

clf = RandomForestClassifier(n_estimators=200,
                             class_weight='balanced',
                             max_depth=16,
                             max_features="auto",
                             random_state=25)
#clf = LogisticRegression(max_iter=1000, random_state=42)
clf.fit(X_train, y_train)

# In[14]:

scores = cross_val_score(clf, X_train, y_train, cv=StratifiedKFold(5))
print("Train Acc: ", scores.mean())

# In[15]:

print("Test Acc:", clf.score(X_test, y_test))
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))

# In[16]:

print(classification_report(y_test, y_pred))
fpr1, tpr1, thresholds = metrics.roc_curve(y_test, y_proba[:, 1])
plt.plot(fpr1, tpr1)
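
# The ROC curve above is plotted without its summary statistic; the AUC follows
# directly from the already-imported metrics module:
print("ROC AUC:", metrics.auc(fpr1, tpr1))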
    else:
        return 7

def histcoverage(coverage):
    histall = np.zeros((1,8))
    for c in coverage:
        histall[0,c] += 1
    return histall

train_df["coverage"] = train_df.masks.map(np.sum) / pow(img_size_target, 2)

train_df["coverage_class"] = train_df.masks.map(get_mask_type)

train_all = []
evaluate_all = []
skf = StratifiedKFold(n_splits=cv_total, random_state=1234, shuffle=True)
for train_index, evaluate_index in skf.split(train_df.index.values, train_df.coverage_class):
    train_all.append(train_index)
    evaluate_all.append(evaluate_index)
    print(train_index.shape, evaluate_index.shape)  # fold sizes differ slightly across splits; that's expected

def get_cv_data(cv_index):
    train_index = train_all[cv_index-1]
    evaluate_index = evaluate_all[cv_index-1]
    x_train = np.array(train_df.images[train_index].map(upsample).tolist()).reshape(-1, img_size_target, img_size_target, 1)
    y_train = np.array(train_df.masks[train_index].map(upsample).tolist()).reshape(-1, img_size_target, img_size_target, 1)
    x_valid = np.array(train_df.images[evaluate_index].map(upsample).tolist()).reshape(-1, img_size_target, img_size_target, 1)
    y_valid = np.array(train_df.masks[evaluate_index].map(upsample).tolist()).reshape(-1, img_size_target, img_size_target, 1)
    return x_train,y_train,x_valid,y_valid
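
# Usage sketch for one fold; cv_index counts from 1, as in the function above:
x_train, y_train, x_valid, y_valid = get_cv_data(1)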

"""#### Show  some examples of different mask"""
def model_evaluate(model_name, x, y, epoch_num):
    cv = StratifiedKFold(n_splits=6, shuffle=False)

    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)

    colors = cycle(
        ['cyan', 'indigo', 'seagreen', 'yellow', 'blue', 'darkorange'])
    lw = 2

    print('Cross validation')
    i = 0

    i_reduced = range(0, len(y), epoch_num)
    x_r = x[i_reduced]
    y_r = y[i_reduced]

    probabilities = np.empty(len(y_r))
    probabilities_epoch = np.empty([len(y_r), epoch_num])

    # plt.figure()
    roc_auc_max = 0
    for (train, test), color in zip(cv.split(x_r, y_r), colors):
        # Expand fold indices so all observations from one signal land entirely
        # in train or entirely in test (a vectorized sketch follows this function)
        train_full = np.zeros(len(train) * epoch_num, dtype='int')
        for k in range(0, len(train), 1):
            for j in range(0, epoch_num, 1):
                train_full[k * epoch_num + j] = train[k] * epoch_num + j

        test_full = np.zeros(len(test) * epoch_num, dtype='int')
        for k in range(0, len(test), 1):
            for j in range(0, epoch_num, 1):
                test_full[k * epoch_num + j] = test[k] * epoch_num + j

        # print('Model fitting...')
        classifier = model_create(model_name)
        classifier = model_fit(classifier, x[train_full], y[train_full])

        # print('Predicting...')
        probas = model_predict(classifier, x[test_full])

        p, p_x = prob_decide(probas[:, 1], epoch_num)

        probabilities[test] = p
        probabilities_epoch[test, :] = p_x

        # Compute ROC curve and area the curve
        fpr, tpr, thresholds = roc_curve(y_r[test], p)
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        if roc_auc > roc_auc_max:
            roc_auc_max = roc_auc
            best_classifier = classifier
            # print('Best classifier found!')
        print('Iteration #' + str(i) + ': AUC = ' + str(roc_auc))
        # plt.plot(fpr, tpr, lw=lw, color=color,
        #          label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

        i += 1

    # plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k',
    #          label='Luck')

    mean_tpr /= cv.get_n_splits(x, y)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    print('Mean AUC =', mean_auc)

    # plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--',
    #          label='Mean ROC (area = %0.2f)' % mean_auc, lw=lw)
    # plt.xlim([-0.05, 1.05])
    # plt.ylim([-0.05, 1.05])
    # plt.xlabel('False Positive Rate')
    # plt.ylabel('True Positive Rate')
    # plt.title('Receiver operating characteristic example')
    # plt.legend(loc="lower right")
    # # plt.show()
    # plt.draw()

    return mean_auc, best_classifier, probabilities, y_r, probabilities_epoch
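
# The two index-expansion loops in model_evaluate above have a one-line
# vectorized equivalent; a sketch, assuming the fold indices arrive as
# NumPy integer arrays:
def expand_epoch_indices(fold_indices, epoch_num):
    # map each signal index to the contiguous block of its epoch_num observations
    return (fold_indices[:, None] * epoch_num + np.arange(epoch_num)).ravel()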
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
# load dataset
dataframe = read_csv("sonar.csv", header=None)
dataset = dataframe.values
# split into input (X) and output (Y) variables
X = dataset[:,0:60].astype(float)
Y = dataset[:,60]
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# baseline model
def create_baseline():
	# create model
	model = Sequential()
	model.add(Dense(60, input_dim=60, kernel_initializer='normal', activation='relu'))
	model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
	# Compile model
	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model
# evaluate baseline model with standardized dataset
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Standardized: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
Example #26
0
GDSCR.loc[GDSCR.iloc[:, 0] == 'S'] = 1
GDSCR.columns = ['targets']
GDSCR = GDSCR.loc[ls2, :]

ls_mb_size = [13, 30, 64]
ls_h_dim = [1023, 512, 256, 128, 64, 32, 16]
ls_marg = [0.5, 1, 1.5, 2, 2.5]
ls_lr = [0.5, 0.1, 0.05, 0.01, 0.001, 0.005, 0.0005, 0.0001, 0.00005, 0.00001]
ls_epoch = [20, 50, 10, 15, 30, 40, 60, 70, 80, 90, 100]
ls_rate = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
ls_wd = [0.01, 0.001, 0.1, 0.0001]
ls_lam = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]

Y = GDSCR['targets'].values

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # random_state takes effect only with shuffle=True

for iters in range(max_iter):
    k = 0
    mbs = random.choice(ls_mb_size)
    hdm1 = random.choice(ls_h_dim)
    hdm2 = random.choice(ls_h_dim)
    hdm3 = random.choice(ls_h_dim)
    mrg = random.choice(ls_marg)
    lre = random.choice(ls_lr)
    lrm = random.choice(ls_lr)
    lrc = random.choice(ls_lr)
    lrCL = random.choice(ls_lr)
    epch = random.choice(ls_epoch)
    rate1 = random.choice(ls_rate)
    rate2 = random.choice(ls_rate)
    "コンロ3口", "コンロ4口以上", "コンロ設置可(コンロ1口)", "コンロ設置可(コンロ2口)", "コンロ設置可(コンロ3口)",
    "コンロ設置可(コンロ4口以上)", "コンロ設置可(口数不明)", "システムキッチン", "冷蔵庫あり", "独立キッチン", "給湯",
    "電気コンロ", "BSアンテナ", "CATV", "CSアンテナ", "インターネット使用料無料", "インターネット対応", "光ファイバー",
    "有線放送", "高速インターネット", "24時間換気システム", "2面採光", "3面採光", "ウォークインクローゼット", "エアコン付",
    "エレベーター", "オール電化", "ガスその他", "ガス暖房", "クッションフロア", "シューズボックス", "タイル張り",
    "トランクルーム", "バリアフリー", "バルコニー", "フローリング", "プロパンガス", "ペアガラス", "ルーフバルコニー",
    "ロフト付き", "下水", "二世帯住宅", "二重サッシ", "井戸", "公営水道", "冷房", "出窓", "地下室",
    "室内洗濯機置場", "室外洗濯機置場", "専用庭", "床下収納", "床暖房", "排水その他", "敷地内ごみ置き場", "水道その他",
    "汲み取り", "洗濯機置場なし", "浄化槽", "石油暖房", "都市ガス", "防音室", "bicycle_parking",
    "car_parking", "bike_parking", "structure", "fixed_term"
]

####################
## Train model
####################
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))
feature_importance_df = pd.DataFrame()

for fold, (train_idx,
           val_idx) in enumerate(folds.split(train, train["town_cleaned"])):
    print(f"Fold {fold+1}")
    train_data = lgb.Dataset(train.iloc[train_idx][use_cols],
                             label=log_target[train_idx],
                             categorical_feature=categorical_cols)
    val_data = lgb.Dataset(train.iloc[val_idx][use_cols],
                           label=log_target[val_idx],
                           categorical_feature=categorical_cols)
    num_round = N_ROUNDS
    callbacks = [log_evaluation(logger, period=100)]
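    # A sketch of the training/prediction step this loop builds toward; lgb_params
    # is a hypothetical, pre-built parameter dict, and the usual lgb.train flow
    # fills the oof/predictions arrays declared above:
    model = lgb.train(lgb_params, train_data, num_round,
                      valid_sets=[train_data, val_data],
                      callbacks=callbacks)
    oof[val_idx] = model.predict(train.iloc[val_idx][use_cols])
    predictions += model.predict(test[use_cols]) / folds.n_splits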
Example #28
0
def loadDataset():

    # data used for the predictions
    dfData = read_csv("./data/data_0.csv", header=None, sep=',')
    dfLabels = read_csv("./data/labels.csv", header=None)

    # return NumPy arrays in the format the classifiers expect
    # (.as_matrix() was removed from pandas; .to_numpy() is its replacement)
    return dfData.to_numpy(), dfLabels.to_numpy().ravel()


plt.figure(figsize=(12, 12))

X, y = loadDataset()
numberOfFolds = 10
skf = StratifiedKFold(n_splits=numberOfFolds, shuffle=True)
indexes = [(training, test) for training, test in skf.split(X, y)]

labels = np.max(y) + 1
yTest = []
yNew = []
cMatrix = np.zeros((labels, labels))
countCorrect = 0
for train_index, test_index in indexes:

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # let's normalize, anyway
    # MinMaxScaler StandardScaler Normalizer
    scaler = StandardScaler()
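
    # A sketch of how this loop might continue; the RandomForestClassifier and
    # confusion_matrix imports are assumptions, not from the original snippet:
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    classifier = RandomForestClassifier()
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    yTest.extend(y_test)
    yNew.extend(y_pred)
    cMatrix += confusion_matrix(y_test, y_pred, labels=np.arange(labels))
    countCorrect += np.sum(y_pred == y_test)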
Example #29
0
start_time = time.time()
estimator = lgb.LGBMClassifier(n_jobs=3, verbose=-1)  # modern equivalents of the deprecated nthread/silent

param_grid = {
    'learning_rate': [0.002],
    'n_estimators': [4000, 5000],
    'num_leaves': [20, 30, 40],
    #     'max_depth': [-1],
    'boosting_type': ['gbdt'],
    'objective': ['binary'],
    'seed': [777],
    'colsample_bytree': [0.8],
    'subsample': [0.8],
    'reg_alpha': [0],
    'reg_lambda': [0]
}
cv = StratifiedKFold(n_splits=4)
gbm = RandomizedSearchCV(estimator,
                         param_distributions=param_grid,
                         cv=cv,
                         scoring='roc_auc',
                         n_iter=2)

gbm.fit(train_data_x, train_data_y)

print(time.time() - start_time)

# In[ ]:

print("best params are : ", gbm.best_params_)
def run_KNN(X, y):
	parametrosK = [1, 5, 11, 15, 21, 25]

	scoreMedio = 0
	somaScores = 0
	melhorScoreGeral = 0
	melhorKGeral = 0

	melhorModeloGeral = None

	# use 5-fold CV for the outer validation
	fold_5 = StratifiedKFold(n_splits=5)

	for indices_treino, indices_teste in fold_5.split(X, y):

		# build the outer train/test sets from the selected indices
		X_treino = X[indices_treino]
		X_teste = X[indices_teste]
		y_treino = y[indices_treino]
		y_teste = y[indices_teste]

		fold_3 = StratifiedKFold(n_splits=3)

		melhorScore = 0
		melhorK = 1
		melhorModelo = None

		for indices_treino_2, indices_teste_2 in fold_3.split(X_treino, y_treino):

			# index into the outer training fold, not the full X/y
			X_treino2 = X_treino[indices_treino_2]
			X_teste2 = X_treino[indices_teste_2]
			y_treino2 = y_treino[indices_treino_2]
			y_teste2 = y_treino[indices_teste_2]

			# inner train/test sets created

			# grid search over the parameter K
			for k in parametrosK:
				# initialize the KNN
				knn = KNeighborsClassifier(n_neighbors=k)
				# train the KNN
				knn.fit(X_treino2, y_treino2)
				# measure the KNN's accuracy
				score = knn.score(X_teste2, y_teste2)

				# keep the best parameters found so far
				if score > melhorScore:
					melhorScore = score
					melhorK = k
					melhorModelo = knn

		# retrain the KNN on the whole outer training fold with the best K found
		knn = KNeighborsClassifier(n_neighbors=melhorK)
		knn.fit(X_treino, y_treino)
		score = knn.score(X_teste, y_teste)
		if score > melhorScoreGeral:
			melhorScoreGeral = score
			melhorKGeral = melhorK
			melhorModeloGeral = melhorModelo

		# accumulate the score for the mean-accuracy computation
		somaScores += score

	# compute and print the mean accuracy
	scoreMedio = (1.0 * somaScores) / 5.0
	print("[KNN] Mean KNN accuracy: ", scoreMedio)
	print("[KNN] Best KNN accuracy: ", melhorScoreGeral, ", hyperparameters: K = ", melhorKGeral)
	return melhorScoreGeral, scoreMedio, melhorModeloGeral
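
# Usage sketch for the nested-CV helper above, assuming X (features) and
# y (labels) already exist as NumPy arrays:
best_score, mean_score, best_model = run_KNN(X, y)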