def find_weights(X_orig, y_orig, test_subject):
    '''
    Computes a weight for each data point, proportional to the
    probability of it belonging to the testing data.
    '''
    clf = RFC(n_estimators=10)
    # clf = LR(solver='lbfgs')
    X = X_orig.reshape(X_orig.shape[0], X_orig.shape[1] * X_orig.shape[2])
    y = y_orig.reshape(y_orig.shape[0])
    predictions = np.zeros(y.shape)
    kf = SKF(n_splits=10, shuffle=True, random_state=1234)
    # kf = KFold(n_splits=10, shuffle=True)
    for train_idx, test_idx in kf.split(X, y):
        # print('Training discriminator model for fold {}'.format(fold))
        X_train, X_test = X[train_idx], X[test_idx]
        y_train = y[train_idx]
        X_train, y_train = RandomUnderSampler().fit_resample(X_train, y_train)
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        clf.fit(X_train, y_train)
        probs = clf.predict_proba(X_test)[:, 1]
        predictions[test_idx] = probs
    print(
        f'{test_subject}: ROC-AUC for train and test distributions: {AUC(y, predictions)}'
    )
    weights = predictions  # alternative: (1 / predictions_test) - 1
    # normalize the weights to mean 1 so the scale of the weighted
    # log-loss stays comparable across subjects
    weights /= np.mean(weights)
    np.save('../data/cs/CS_weights_{}.npy'.format(test_subject), weights)
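
# Usage sketch (not from the source): the subject name below is hypothetical,
# and the synthetic features merely stand in for real training data with one
# row per saved weight.
import numpy as np
from sklearn.linear_model import LogisticRegression

weights = np.load('../data/cs/CS_weights_subject01.npy')
rng = np.random.default_rng(0)
X_train = rng.normal(size=(len(weights), 8))
y_train = rng.integers(0, 2, size=len(weights))
# estimators that accept sample_weight will down-weight rows that
# look unlike the test distribution
LogisticRegression(max_iter=1000).fit(X_train, y_train, sample_weight=weights)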
    def stratified_splits_single_label(self):

        self.dataset_split = {
            'train_X': [],
            'train_y': [],
            'valid_X': [],
            'valid_y': [],
        }

        # NOTE: StratifiedKFold has no test_size and requires n_splits >= 2;
        # a single stratified 80/20 split calls for StratifiedShuffleSplit
        # (from sklearn.model_selection import StratifiedShuffleSplit)
        skf = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

        data = self.train_df['file_path']
        labels = self.train_df['skill']
        for train_index, test_index in skf.split(data, labels):

            # `d in data` / `d in labels` guard on the Series index
            self.dataset_split['train_X'].append(
                [data[d] for d in train_index if d in data])
            self.dataset_split['valid_X'].append(
                [data[d] for d in test_index if d in data])

            self.dataset_split['train_y'].append(
                [[self.train_df['skill'][d]] for d in train_index
                 if d in labels])

            self.dataset_split['valid_y'].append(
                [self.train_df['skill'][d] for d in test_index if d in labels])
Example #3
def main():
    rfc = RFC(n_estimators=100, n_jobs=-1)
    fs = SelectFromModel(rfc)
    pca = PCA()
    svm = SVC()
    # Pipeline iterates over its steps more than once, so materialize the zip
    estimators = list(zip(["feature_selection", "pca", "svm"], [fs, pca, svm]))
    pl = Pipeline(estimators)
    parameters = {
        "feature_selection__threshold": ["mean", "median"],
        "pca__n_components": [0.8, 0.5],
        "svm__gamma": [0.001, 0.01, 0.05],
        "svm__C": [1, 10],
    }
    gclf = GridSearchCV(pl, parameters, n_jobs=-1, verbose=2)
    
    digits = load_digits()
    X = digits.data
    y = digits.target
    first_fold = True
    trues = []
    preds = []
    for train_index, test_index in SKF().split(X, y):
        if first_fold:
            gclf.fit(X[train_index], y[train_index])
            clf = gclf.best_estimator_
            first_fold = False
        clf.fit(X[train_index], y[train_index])
        trues.append(y[test_index])
        preds.append(clf.predict(X[test_index]))
    
        # running precision/recall/F1 over the folds seen so far
        true_labels = np.hstack(trues)
        pred_labels = np.hstack(preds)
        print("p:{0:.6f} r:{1:.6f} f1:{2:.6f}".format(
            *prf(true_labels, pred_labels, average="macro")))
Example #4
def make_setup(
    n_bins, max_depth, random_state, n_folds,
    datasets_folder, experiments_folder, methods, datasets
):  
    try:
        os.mkdir(datasets_folder)
    except FileExistsError:
        # wipe and recreate so each run starts from a clean folder
        rmtree(datasets_folder)
        os.mkdir(datasets_folder)
        
    for dataset in datasets:
        dataset_folder = datasets_folder + "/" + dataset
        try:
            os.mkdir(dataset_folder)
        except FileExistsError:
            pass
        X, y, s = globals()["get_"+dataset](show=False)
        joblib.dump(X, dataset_folder+"/X.pkl")
        joblib.dump(y, dataset_folder+"/y.pkl")
        joblib.dump(s, dataset_folder+"/s.pkl")
        # stratifying splits w.r.t [y, s]
        strata = []
        for i in range(len(X)):
            row = str(y[i])
            if len(s.shape) == 1:  # only a single binary sensitive attribute
                row += str(s[i])

            else:
                for j in range(s.shape[1]):
                    row += str(s[i,j])
            strata.append(row)
        fold = 0
        splitter = SKF(n_splits=n_folds, shuffle=True, random_state=random_state)
        for train_idx, test_idx in splitter.split(X, strata):
            joblib.dump(test_idx, dataset_folder+"/"+str(fold)+"_test_idx.pkl")
            joblib.dump(train_idx, dataset_folder+"/"+str(fold)+"_train_idx.pkl")
            fold += 1
    try:
        os.mkdir(experiments_folder)
    except FileExistsError:
        rmtree(experiments_folder)
        os.mkdir(experiments_folder)

    for method in methods:
        method_folder = experiments_folder + "/" + method
        try:
            os.mkdir(method_folder)
        except FileExistsError:
            rmtree(method_folder)
            os.mkdir(method_folder)

        for dataset in datasets:
            exp_dataset_folder = method_folder + "/" + dataset
            try:
                os.mkdir(exp_dataset_folder)
            except FileExistsError:
                rmtree(exp_dataset_folder)
                os.mkdir(exp_dataset_folder)
    def __init__(self, X, y, classifier, init_style, fratio_weight):
        Problem.__init__(self, minimized=True)
        self.X = X
        self.y = y
        self.no_instances, self.no_features = self.X.shape
        self.threshold = 0.6
        self.dim = self.no_features
        self.clf = classifier
        self.init_style = init_style
        self.f_weight = fratio_weight

        # stratified CV is only applicable when there are enough instances of each class
        k = 10
        labels, counts = np.unique(self.y, return_counts=True)
        label_min = np.min(counts)
        if label_min < k:
            self.skf = KFold(n_splits=k, shuffle=True, random_state=1617)
            self.skf_valid = KFold(n_splits=k, shuffle=True, random_state=1990)
        else:
            self.skf = SKF(n_splits=k, shuffle=True, random_state=1617)
            self.skf_valid = SKF(n_splits=k, shuffle=True, random_state=1990)

        self.scores = reliefF(self.X, self.y, k=1)
        self.scores = self.scores / np.sum(self.scores)

        # from Orange.data import Domain, Table
        # from Orange.preprocess.discretize import EntropyMDL
        # from Orange.preprocess import Discretize
        # from skfeature.utility.mutual_information import su_calculation
        # domain = Domain.from_numpy(X=X, Y=y)
        # table = Table.from_numpy(domain=domain, X=X, Y=y)
        # disc = Discretize()
        # disc.method = EntropyMDL(force=True)
        # table_dis = disc(table)
        # X_dis = table_dis.X
        # test_scores = []
        # for i in range(self.no_features):
        #     test_scores.append(su_calculation(X_dis[:, i], y))
        # test_scores = np.array(test_scores)
        # test_scores = test_scores/np.sum(test_scores)
        # self.scores = test_scores

        self.surrogate_clf = SVC(random_state=1617)
def SVM_gridsearch(parameters, data_train, labels_train, number_splits,
                   num_threads):
    svm_clf = svm.SVC(gamma="scale", probability=True)
    # multiprocessing.cpu_count()
    clf = GSCV(svm_clf,
               parameters,
               cv=SKF(n_splits=number_splits),
               verbose=2,
               n_jobs=num_threads)
    clf.fit(data_train, labels_train)
    return clf
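
# Quick self-contained check of SVM_gridsearch (the toy data and grid are
# illustrative assumptions, not from the source):
from sklearn.datasets import load_digits

X_demo, y_demo = load_digits(return_X_y=True)
demo_grid = {'C': [0.1, 1, 10]}
best = SVM_gridsearch(demo_grid, X_demo, y_demo, number_splits=3, num_threads=-1)
print(best.best_params_, best.best_score_)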
Example #7
def param_selector(**kwargs):
    skf = SKF(n_splits=5, shuffle=True)
    skf.get_n_splits(X, y)
    y_pred = y.copy()
    flag = False
    # Iterate through folds
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
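
# param_selector reads X, y and clf_class from the enclosing scope; a minimal
# wiring sketch under that assumption (toy data, illustrative classifier):
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

X, y = load_iris(return_X_y=True)
clf_class = RandomForestClassifier
y_pred = param_selector(n_estimators=50)  # one out-of-fold prediction per row
print(accuracy_score(y, y_pred))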
Example #8
    def SVM_KCross(self):
        """
        SVM, 10交差検証で精度と結果を出力
        参照 https://qiita.com/kazuki_hayakawa/items/18b7017da9a6f73eba77
             https://qiita.com/nittaryo1977/items/44553b9f555fe7932cca
             https://hayataka2049.hatenablog.jp/entry/2018/03/12/213524
             https://qiita.com/yhyhyhjp/items/c81f7cea72a44a7bfd3a
        """
        # Cross-validation
        skf = SKF(n_splits=10, random_state=0, shuffle=True)
        trues = []
        preds = []
        test_files = []
        for train_index, test_index in skf.split(self.features, self.targets):
            # Standardization (fit the scaler on the training fold only)
            sc = StandardScaler()
            sc.fit(self.features[train_index])
            X_train_std = sc.transform(self.features[train_index])
            X_test_std = sc.transform(self.features[test_index])

            # Define the model (RBF kernel by default); class 1 is weighted by
            # the class-0 share of the training fold to counter class imbalance
            num_m = self.targets[train_index].size
            num_c = np.sum(self.targets[train_index] == 0)
            svm = SVC(random_state=None,
                      probability=True,
                      class_weight={
                          0: 1,
                          1: num_c / num_m
                      })

            # Train
            svm.fit(X_train_std, self.targets[train_index])
            trues.append(self.targets[test_index])

            # Predict
            preds.append(svm.predict(X_test_std))
            test_files.append(np.hstack(self.files[test_index]))

        # Print accuracy
        print(
            classification_report(np.hstack(trues),
                                  np.hstack(preds),
                                  target_names=["bad", "good"]))
        # Write out the prediction results
        self.__makeResultDir(np.hstack(test_files), np.hstack(trues),
                             np.hstack(preds))
Example #9
    def __init__(self,
                 n_splits=5,
                 n_repeats=None,
                 shuffle=False,
                 random_state=None):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state
        self.n_repeats = n_repeats

        if self.n_repeats is not None:
            self.cvcls = RSKF(n_splits=self.n_splits,
                              n_repeats=self.n_repeats,
                              random_state=self.random_state)
        else:
            self.cvcls = SKF(n_splits=self.n_splits,
                             shuffle=self.shuffle,
                             random_state=self.random_state)
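
# The practical difference between the two branches, shown with the sklearn
# classes the SKF/RSKF aliases point to (sketch, not from the source):
import numpy as np
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold

Xd = np.zeros((20, 2))
yd = np.array([0, 1] * 10)
print(StratifiedKFold(n_splits=5).get_n_splits(Xd, yd))                       # 5
print(RepeatedStratifiedKFold(n_splits=5, n_repeats=3).get_n_splits(Xd, yd))  # 15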
Example #10
    def split(self, n_splits=5, info=False):
        skf = SKF(n_splits=n_splits,
                  shuffle=True).split(self.images, self.labels)
        folds = []
        for train_idx, test_idx in skf:
            # shallow-copy the wrapped dataset and swap in this fold's samples
            train_dataset = copy(self.dataset)
            train_dataset.__dict__["images"] = [self.images[i] for i in train_idx]
            train_dataset.__dict__["labels"] = [self.labels[i] for i in train_idx]

            test_dataset = copy(self.dataset)
            test_dataset.__dict__["images"] = [self.images[i] for i in test_idx]
            test_dataset.__dict__["labels"] = [self.labels[i] for i in test_idx]

            folds.append((train_dataset, test_dataset))

        if info:
            folds_info(folds)
        return folds
Example #11
    def __init__(self,
                 split_type='holdout',
                 partitions=2,
                 partition=0,
                 test_size=0.3,
                 seed=0,
                 fields=None):
        if fields is None:
            fields = ['X', 'Y']
        config = self._to_config(locals())

        # Using 'self.algorithm' here to avoid 'algorithm' inside config.
        if split_type == "cv":
            self.algorithm = SKF(shuffle=True,
                                 n_splits=partitions,
                                 random_state=seed)
            del config['test_size']
        elif split_type == "loo":
            self.algorithm = LOO()
            del config['partitions']
            del config['partition']
            del config['test_size']
            del config['seed']
        elif split_type == 'holdout':
            self.algorithm = HO(n_splits=partitions,
                                test_size=test_size,
                                random_state=seed)
        else:
            raise ValueError(f'Wrong split_type: {split_type}')

        super().__init__(config)

        self.partitions = partitions
        self.partition = partition
        self.test_size = test_size
        self.seed = seed
        self.fields = fields
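
# The three split_type branches spelled out with plain sklearn splitters; a
# sketch assuming SKF/LOO/HO alias StratifiedKFold, LeaveOneOut and ShuffleSplit:
import numpy as np
from sklearn.model_selection import StratifiedKFold, LeaveOneOut, ShuffleSplit

Xd = np.arange(10).reshape(-1, 1)
yd = np.array([0, 1] * 5)
for splitter in (StratifiedKFold(shuffle=True, n_splits=2, random_state=0),  # "cv"
                 LeaveOneOut(),                                              # "loo"
                 ShuffleSplit(n_splits=2, test_size=0.3, random_state=0)):   # "holdout"
    print(type(splitter).__name__, sum(1 for _ in splitter.split(Xd, yd)), "splits")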
    def stratified_splits(self):

        self.dataset_split = {
            'train_X': [], 'train_y': [],
            'valid_X': [], 'valid_y': [],
        }
        

        skf = SKF(
            n_splits=self.config.KFolds, shuffle=True, random_state=42
            )

        data = self.df['file_path']
        labels = self.df['isbeauty']
        for train_index, test_index in skf.split(data, labels):

            self.dataset_split['train_X'].append(
                [data[d] for d in train_index if d in data])
            self.dataset_split['valid_X'].append(
                [data[d] for d in test_index if d in data])

            self.dataset_split['train_y'].append(
                [[self.df['isbeauty'][d], self.df['skill'][d]]
                 for d in train_index if d in labels])

            self.dataset_split['valid_y'].append(
                [[self.df['isbeauty'][d], self.df['skill'][d]]
                 for d in test_index if d in labels])
Example #13
#      classif.fit(X_tr, Result[u[:n_train]])
#      Pred = classif.predict(X_te)
#      print(np.sum(Pred == Result[u[n_train:]]) / (n_total - n_train))
#      L.append(np.sum(Pred == Result[u[n_train:]]) / (n_total - n_train))

#  plt.figure()
#  plt.plot(L, 'k+')
#  plt.show()

clf = PCA(n_components=25)
classif = RFC(n_estimators=500, n_jobs=-1, class_weight='balanced')
# classif = ADA()

S = []
Confusion = []
skf = SKF(n_splits=10, shuffle=True)
for train, test in skf.split(mf, Result):
    X_train = mf[train]
    X_test = mf[test]
    label_train = Result[train]
    label_test = Result[test]
    X_tr = clf.fit_transform(X_train)
    X_te = clf.transform(X_test)
    classif.fit(X_tr, label_train)
    Pred = classif.predict(X_te)
    C = CM(Pred, label_test)
    s = score(Pred, label_test)
    S.append(s)
    Confusion.append(C)
    print(s, C)
Example #14
def save_predictions(t, filename, rs):

    # create RandomForest classifier with parameters given in _conf.py
    clf = RandomForestClassifier(random_state=rs,
                                 verbose=verbosity,
                                 class_weight='balanced',
                                 n_estimators=conf.n_estimators,
                                 n_jobs=conf.max_n_jobs,
                                 max_features=conf.tree_max_features,
                                 max_depth=conf.tree_max_depth)

    # use ground truth to create folds for outer cross validation in a stratified
    # way, i.e. so that each fold preserves the overall label proportions

    participant_scores = np.genfromtxt('Data/Binned_Personality.csv',
                                       skip_header=1,
                                       delimiter=',').astype(int)[:, t + 1]
    # random_state expects an int or RandomState instance (a bare bool would
    # silently seed with 0/1), so reuse the rs seed passed to this function
    outer_cv = SKF(conf.n_outer_folds, shuffle=True, random_state=rs)
    len_outer_cv = outer_cv.get_n_splits(participant_scores)

    # initialise arrays to save information
    feat_imp = np.zeros((len_outer_cv, conf.max_n_feat))  # feature importance
    preds = np.zeros((conf.n_participants),
                     dtype=int)  # predictions on participant level
    x = np.zeros(31)  # placeholder for X instead of actual training data
    for outer_i, (outer_train_participants,
                  outer_test_participants) in enumerate(
                      outer_cv.split(x, participant_scores)):
        print(str(outer_i + 1) + '/' + str(conf.n_outer_folds))

        # find best window size in inner cv, and discard unimportant features
        inner_performance = np.zeros((conf.n_inner_folds, 1))
        inner_feat_importances = np.zeros((conf.max_n_feat, 1))

        # load all the extracted features
        x_all, y_all, ids_all = load_data(t)
        if shuffle_labels:
            np.random.seed(316588 + 111 * t + rs)
            perm = np.random.permutation(len(y_all))
            y_all = y_all[perm]
            ids_all = ids_all[perm]

        # cut out the outer train samples
        outer_train_samples = np.array(
            [p in outer_train_participants for p in ids_all])
        outer_train_x = x_all[outer_train_samples, :]
        outer_train_y = y_all[outer_train_samples]
        outer_train_y_ids = ids_all[outer_train_samples]

        # build inner cross validation such that all samples of one person are either in training or testing
        inner_cv = LKF(n_splits=conf.n_inner_folds)
        for inner_i, (inner_train_indices, inner_test_indices) in enumerate(
                inner_cv.split(outer_train_y_ids)):

            # create inner train and test samples. Note: both are taken from outer train samples!
            inner_x_train = outer_train_x[inner_train_indices, :]
            inner_y_train = outer_train_y[inner_train_indices]
            inner_x_test = outer_train_x[inner_test_indices, :]
            inner_y_test = outer_train_y[inner_test_indices]

            # fit Random Forest
            clf.fit(inner_x_train, np.ravel(inner_y_train))

            # save predictions and feature importance
            inner_pred = clf.predict(inner_x_test)
            inner_pred = inner_pred.reshape(-1, 1)
            inner_feat_importances[:, 0] += clf.feature_importances_

            # compute and save performance in terms of accuracy
            innerpreds = []
            innertruth = []
            inner_test_ids = outer_train_y_ids[inner_test_indices]
            for testp in np.unique(inner_test_ids):
                (values,
                 counts) = np.unique(inner_pred[inner_test_ids == testp],
                                     return_counts=True)
                ind = np.argmax(counts)
                innerpreds.append(values[ind])
                innertruth.append(inner_y_test[inner_test_ids == testp][0])
            inner_performance[inner_i,
                              0] = accuracy_score(np.array(innertruth),
                                                  np.array(innerpreds))
            print('ACC: ', '%.2f' % (inner_performance[inner_i, 0] * 100))

        # evaluate classifier on outer cv using the most informative features
        chosen_i = np.argmax(np.mean(inner_performance, axis=0))
        chosen_features = (inner_feat_importances[:, chosen_i] /
                           float(conf.n_inner_folds)) > 0.005

        # reload all data
        x, y, ids = load_data(t, chosen_features=chosen_features)
        if shuffle_labels:
            np.random.seed(316588 + 111 * t + rs + 435786)
            perm = np.random.permutation(len(y))
            y = y[perm]
            ids = ids[perm]
        outer_train_samples = np.array(
            [p in outer_train_participants for p in ids])
        outer_test_samples = np.array(
            [p in outer_test_participants for p in ids])
        # .size counts all entries of the boolean masks; any() checks for a True
        if outer_train_samples.any() and outer_test_samples.any():
            x_train = x[outer_train_samples, :]
            y_train = y[outer_train_samples]
            x_test = x[outer_test_samples, :]
            y_test = y[outer_test_samples]

            # fit Random Forest
            clf.fit(x_train, np.ravel(y_train))
            pred = clf.predict(x_test)
            pred = pred.reshape(-1, 1)
            for testp in outer_test_participants:
                if testp in ids[outer_test_samples]:
                    # majority voting over all samples that belong to participant testp
                    (values, counts) = np.unique(
                        pred[ids[outer_test_samples] == testp],
                        return_counts=True)
                    ind = np.argmax(counts)
                    preds[testp] = values[ind]
                else:
                    # participant does not occur in the outer test set
                    preds[testp] = -1
            # save the resulting feature importance
            feat_imp[outer_i, chosen_features] = clf.feature_importances_
        else:
            for testp in outer_test_participants:
                preds[testp] = -1
            # mark this fold's feature importances as invalid as well
            feat_imp[outer_i, chosen_features] = -1

    # compute resulting F1 score and save to file
    nonzero_preds = preds[preds > -1]
    nonzero_truth = participant_scores[preds > -1]
    f1 = f1_score(nonzero_truth, nonzero_preds, average='macro')
    accuracy = accuracy_score(nonzero_truth, nonzero_preds)
    np.savez(filename,
             f1=f1,
             accuracy=accuracy,
             feature_importances=feat_imp,
             inner_feat_importances=inner_feat_importances)
from sklearn.metrics import classification_report as CR

print("Classification Report:\n", CR(Y_test, pred, zero_division=0))

# ### Cross Validation

# In[12]:

from sklearn.model_selection import StratifiedKFold as SKF
from sklearn.model_selection import cross_val_score as CVS

model = SVC(kernel='rbf', C=13, gamma=0.325)
folds = 5

start = T()
cross_val = SKF(n_splits=folds, shuffle=True, random_state=4)
scores = CVS(model, X, Y, scoring='accuracy', cv=cross_val)
end = T()

accuracy = scores.mean() * 100
print(f"SVC has mean accuracy of {accuracy:.3f}%\n" +
      f"Cross Validation took {(end-start)*1000:.3f}ms")

# ### Calculate F1-Score of the model

# In[13]:

from sklearn.metrics import f1_score as F1

f1score = F1(Y_test, pred, average='weighted')
print(f"SVC has F1-Score = {f1score * 100:.3f}%")
    embed_text.append(vec_embed)

embed_text = np.asarray(embed_text)
print(embed_text.shape)  # embeddings are 3-D; sklearn expects 2-D input
flat_embeds = np.reshape(embed_text, (embed_text.shape[0], -1))
print(flat_embeds.shape)  # flattened 2-D shape

NB = MultinomialNB()
pc = Perceptron()
svm = LinearSVC()
lr = LogisticRegression()
random_forest = rf()
KNN = knn(n_neighbors=3)
CNN = cnn()

from sklearn.model_selection import StratifiedKFold as SKF
skf = SKF(n_splits=5)
X = flat_embeds
y = label

for clf in [lr]:
    #for clf in [pc, svm, lr, KNN, CNN, random_forest]:
    acc = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        acc.append(clf.score(X_test, y_test))
    acc = np.asarray(acc)
    print(clf, acc.mean())
Example #17
        print("[!] using adv_training")
        adversarial_training(model, adv_layer_names, 0.5)
    evaluator = Evaluate(filename=cfg["filename"] + "_fold{}".format(fold_id), data=dev_data)
    model.fit_generator(train_D.__iter__(),
                        steps_per_epoch=len(train_D),
                        epochs=RUN_EPOCH,
                        callbacks=[evaluator],
                        shuffle=True)
    del model, train_data, dev_data
    gc.collect()
    print("[!] finish fold_id =", fold_id)
    print("-" * 81)
    

# random_state has no effect (and recent sklearn forbids it) when shuffle=False
skf = SKF(FOLD_NUM, shuffle=False)

print(all_data.shape)
_t0 = time()
for fold_id, (trn_ind, val_ind) in enumerate(skf.split(range(len(all_data)), all_data["label"])):
    if fold_id not in FOLD_ID:
        continue
    t0 = time()
    dev_data = all_data.iloc[val_ind].reset_index(drop=True)
    train_data = all_data.iloc[trn_ind].reset_index(drop=True)
    cfg["num_example"] = len(train_data)
    print("-" * 81)
    print("[!] start fold_id =", fold_id, train_data.shape, dev_data.shape)
    print(cfg)
    K.clear_session()
    gc.collect()
Example #18
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j,
                 i,
                 format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


clf_class = XGBC  # Replace the classifier here
to_add = 'XGBoost Classifier'  # Replace the name here
# kwargs
# Construct a kfolds object
skf = SKF(n_splits=5, shuffle=True)
skf.get_n_splits(X, y)
y_pred = y.copy()
flag = False
# Iterate through folds
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train = y[train_index]
    # Initialize a classifier with key word arguments
    clf = clf_class()
    clf.fit(X_train, y_train)
    y_pred[test_index] = clf.predict(X_test)
    # if (not flag):
    #   flag = True
    #   feature_importance = clf.feature_importances_
    #   # make importances relative to max importance
Example #19
    # valid_sets must be a list of Datasets
    clf = lgb.train(params=params, train_set=dtrain, valid_sets=[dval])
    imp_df = pd.DataFrame()
    imp_df['feature'] = featurename
    imp_df['importance_gain'] = clf.feature_importance(importance_type='gain')
    imp_df['importance_split'] = clf.feature_importance(importance_type='split')
    return imp_df, clf

def impdf_all_fu(imp_df_all, threshold):
    imp_df_all_normal = imp_df_all[['feature', 'importance_split']].groupby(
        ['feature'], as_index=False).mean()
    imp_df_all_normal = imp_df_all_normal.sort_values(
        'importance_split', ascending=False).reset_index(drop=True)
    imp_df_all_normal['normalized_importance'] = (
        imp_df_all_normal['importance_split'] /
        imp_df_all_normal['importance_split'].sum())
    imp_df_all_normal['cumulative_importance'] = np.cumsum(
        imp_df_all_normal['normalized_importance'])
    imp_df_all_normal = imp_df_all_normal.sort_values('cumulative_importance')
    record_low_importance = imp_df_all_normal[
        imp_df_all_normal['cumulative_importance'] > threshold]
    return imp_df_all_normal, record_low_importance
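
# Hedged usage sketch for impdf_all_fu (the toy importance frame below is
# illustrative, not from the source): features in the cumulative-importance
# tail beyond the threshold are the candidates to drop.
demo_imp = pd.DataFrame({'feature': ['f1', 'f2', 'f3'] * 2,
                         'importance_split': [50, 30, 1, 48, 32, 2]})
_, low_importance = impdf_all_fu(demo_imp, threshold=0.99)
print(list(low_importance['feature']))  # -> ['f3']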
skf = SKF(n_splits=500, random_state=3, shuffle=True)
lightpara = {
    'objective': 'binary', 'n_estimators': 1000, 'learning_rate': 0.05,
    'num_leaves': 50, 'tree_learner': 'data', 'num_threads': 8,
    'bagging_fraction': 0.8, 'feature_fraction': 0.8, 'metric': 'auc',
}
clfreportall = pd.DataFrame()
otherreportall = pd.DataFrame()
inpathlb = ['365lgb', '730lgb', '1095lgb', '1460lgb', '1825lgb']
aucresult1 = []
accresult1 = []
mccresult1 = []
for k in range(len(inpath)):
    print("start reading")
    table = pd.read_csv(inpath[k])
    table = table[['Uid', 'variable', 'status']]
    table = table.drop_duplicates()
    patientclass = pd.read_csv(classpath[k])
    patientclass = patientclass[['Uid', 'class', 't2dmclass', 'controlclass', 'classweight']]
    totaluid = patientclass['Uid'].values
Example #20
set_rf_samples(60000)
# reset_rf_samples() to revert back to default behavior

# ### Building a classifier

# In[23]:

m = RandomForestClassifier(n_jobs=-1, max_depth=5)
predictions = np.zeros(y.shape)

# We use a stratified k-fold (20 splits below) so the class percentages are preserved in every fold and every row is held out exactly once.
# For each row the classifier will estimate the probability of it belonging to the train set.

# In[24]:

skf = SKF(n_splits=20, shuffle=True, random_state=100)
for fold, (train_idx, test_idx) in enumerate(skf.split(x, y)):

    X_train, X_test = x[train_idx], x[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    m.fit(X_train, y_train)
    probs = m.predict_proba(X_test)[:, 1]
    predictions[test_idx] = probs

# ### Results

# We'll report the ROC-AUC of this classifier as an estimate of how much covariate shift the data has. The AUC is very close to 0.5, which means the classifier cannot tell whether a row belongs to train or test. In other words, the bulk of the observations come from a feature space that is not particular to either set.

# In[25]:
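
# The cell body is missing from this excerpt; a minimal reconstruction of what
# it presumably computed, given the markdown above (assumes sklearn):

from sklearn.metrics import roc_auc_score

print('ROC-AUC for train/test discrimination:', roc_auc_score(y, predictions))
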
def train_model(train_data, valid_data, feat_cols, fold=1):
    #scores_=[]
    hold_score = {}
    train_score = {}

    train_data, valid_data = train_data.reset_index(), valid_data.reset_index()

    train = train_data
    test = valid_data

    X_train_1, y_train_1 = train.drop('isFraud', axis=1), train['isFraud']
    X_test, y_test = test.drop('isFraud', axis=1), test['isFraud']  # important row

    X = X_train_1
    y = y_train_1
    kf = SKF(n_splits=5, shuffle=True)
    kf.get_n_splits(X,y)
    kfold_train_1=[]
    kfold_test_1=[]


    for train_index, test_index in kf.split(X,y):
        #print("TRAIN:", train_index, "TEST:", test_index)
        kfold_train_1.append([train_index])
        kfold_test_1.append([test_index])

    # hidden_units=[16, 16, 16, 16, 16, 16]
    dnn_model = tf.estimator.DNNClassifier(
        hidden_units=[16, 16, 16, 16],
        feature_columns=feat_cols,
        model_dir='/home/edgar/Desktop/Fraud/saved_models_{}/'.format(fold),
        n_classes=2,
        optimizer=lambda: tf.train.AdamOptimizer(
            learning_rate=tf.train.exponential_decay(
                learning_rate=0.001,
                global_step=tf.train.get_global_step(),
                decay_steps=5000,
                decay_rate=0.86)))

    for i in range(0,5):

        train = train_data.iloc[list(kfold_train_1[:][i][0])]
        test = valid_data.iloc[list(kfold_test_1[:][i][0])]


        X_train_2,y_train_2 = train.drop('isFraud',axis=1),train['isFraud']
        X_test_2,y_test_2 = test.drop('isFraud',axis=1),test['isFraud']

        input_func = tf.estimator.inputs.pandas_input_fn(x=X_train_2,y=y_train_2,batch_size=800,num_epochs=1500,shuffle=True)


        dnn_model.train(input_fn=input_func,steps=15000)

        hold_score['Kfold_{}_sub_{}'.format(fold,i)] = eval_input_func(x_ = X_train_2, y_ =y_train_2,model = dnn_model)
        train_score['Kfold_{}_sub_{}'.format(fold,i)] = eval_input_func(x_ = X_test, y_ = y_test,model = dnn_model)


    train_score_,hold_score_ = [],[]

    for key in hold_score.keys():
        hold_score_.append(hold_score[key])
        train_score_.append(train_score[key])

    dftocsv = pd.DataFrame({'train_score': train_score_, 'hold_score': hold_score_})
    dftocsv.to_csv('scores_{}.csv'.format(fold))

    b = eval_input_func(x_=X_test, y_=y_test, model=dnn_model)


    print('\n')
    print('*********************************************************')
    print('*********************************************************')
    print('\n')
    #print(i)
    print('Fold {}, Accuracy: {}'.format((fold), b))
    print('\n')
    print('*********************************************************')
    print('*********************************************************')
    print('\n')
Example #22
    'max_bin': 2**8 -1, 'metric': 'auc',
    'colsample_bytree': 0.33, #0.4
    'bagging_fraction': 0.9, 
    'bagging_freq': 10, 
    'scale_pos_weight': 1.02, 
    'bagging_seed': 619, #619
    'feature_fraction_seed': 619 #619
    }
    
nrounds = 2000
kfolds = 5
oof_train = pd.DataFrame({'UCIC_ID': tr_ids, 'Responders': 0})
best = []
score = []

skf = SKF(n_splits=kfolds, shuffle=True, random_state=123)
for i, (train_index, test_index) in enumerate(skf.split(df_train, Y)):
    # enumerate supplies the fold counter, which was never incremented before
    print('Fold {0}'.format(i + 1))
    X_train, X_val = df_train[train_index], df_train[test_index]
    y_train, y_val = Y[train_index], Y[test_index]

    ltrain = lgb.Dataset(X_train, y_train)
    lval = lgb.Dataset(X_val, y_val, reference=ltrain)

    gbdt = lgb.train(lgb_params, ltrain, nrounds, valid_sets=lval,
                     verbose_eval=100,
                     early_stopping_rounds=30)
    bst = gbdt.best_iteration
    pred = gbdt.predict(X_val, num_iteration=bst)
    oof_train.loc[test_index, "Responders"] = pred
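
# The OOF predictions are typically scored once after the loop; a minimal
# sketch (assumes sklearn's roc_auc_score; not in the original excerpt):
from sklearn.metrics import roc_auc_score
print('OOF AUC:', roc_auc_score(Y, oof_train['Responders'].values))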
Example #23
                      "pca-gnn f1", "lda-gnn precision", "lda-gnn recall",
                      "lda-gnn f1"
                  ])
for n_components in [5, 10, 15, 20, 25, 30, 40]:
    pca = PCA(n_components=n_components)  # principal component analysis
    # linear discriminant analysis; note that LDA caps n_components at
    # n_classes - 1 (9 for digits), so the larger values here will error
    lda = LDA(n_components=n_components)

    # zip pairs each step name with its transformer/estimator
    steps1 = list(zip(["pca", "gnb"], [pca, gnb]))
    steps2 = list(zip(["lda", "gnb"], [lda, gnb]))

    p1 = Pipeline(steps1)
    p2 = Pipeline(steps2)

    score_lst = []
    for decomp_name, clf in zip(["pca", "lda"], [p1, p2]):
        trues = []
        preds = []
        for train_index, test_index in SKF(shuffle=True, random_state=0).split(
                digits.data, digits.target):
            clf.fit(digits.data[train_index], digits.target[train_index])
            trues.append(digits.target[test_index])
            preds.append(clf.predict(digits.data[test_index]))
        scores = prf(np.hstack(trues), np.hstack(preds), average="macro")
        score_lst.extend(scores[:-1])
    # DataFrame.append was removed in pandas 2.0; insert the row via loc instead
    df.loc[len(df)] = [n_components, *score_lst]
print(df)
df.plot(x="n_components", y=["pca-gnn f1", "lda-gnn f1"])
plt.savefig("判別成分分析_参考.png")