Exemple #1
0
    def validate(self):
        '''
        Evaluate a freshly created model on ten stratified shuffle splits.

        For every split a new model is obtained from ``self.create_model()``,
        trained on the training rows of ``self.data`` / ``self.labels`` and
        scored on the held-out rows. The mean accuracy, precision, recall
        and F1 over the ten splits are printed; nothing is returned.
        '''
        print('Validating new model: {}()'.format(self.__class__.__name__))

        # (printed label, scoring function) pairs, in the order they are reported.
        metrics = (
            ('Accuracy', accuracy_score),
            ('Precision', precision_score),
            ('Recall', recall_score),
            ('F1-measure', f1_score),
        )
        collected = {label: [] for label, _ in metrics}

        splitter = StratifiedShuffleSplit(n_splits=10)
        for fit_rows, eval_rows in splitter.split(self.data, self.labels):
            fit_x, eval_x = self.data[fit_rows], self.data[eval_rows]
            fit_y, eval_y = self.labels[fit_rows], self.labels[eval_rows]

            # A brand-new model per split so no weights leak between splits.
            net = self.create_model()
            net.fit(fit_x, fit_y, epochs=100, batch_size=128,
                    class_weight=self.class_weight)
            guessed = net.predict_classes(eval_x, batch_size=128)

            for label, scorer in metrics:
                collected[label].append(scorer(eval_y, guessed))

        print('')
        for label, _ in metrics:
            print('{}: {}'.format(label, np.mean(collected[label])))
Exemple #2
0
def simple_classification(n_samples=100, n_features=10, random_state=33):
    """
    Build a small synthetic classification problem, already split.

    Parameters
    ----------
    n_samples : int
        Number of samples in dataset.
    n_features : int
        Number of features for each sample.
    random_state : int
        Seed used both for generating the data and for the stratified
        split, so results are reproducible.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``: input train, input test,
        target train and target test, respectively.
    """
    features, targets = datasets.make_classification(
        n_samples=n_samples,
        n_features=n_features,
        random_state=random_state,
    )

    splitter = StratifiedShuffleSplit(
        n_splits=1,
        train_size=0.6,
        random_state=random_state,
    )
    # Only one split is generated, so the first pair of index arrays is it.
    train_rows, test_rows = next(splitter.split(features, targets))

    return (
        features[train_rows],
        features[test_rows],
        targets[train_rows],
        targets[test_rows],
    )
Exemple #3
0
    def fit_model(self, X, y):
        """
        Fit an L1 logistic-regression pipeline on 400 stratified shuffle
        splits and pickle the collected results.

        X::pd.DataFrame: Input data
        y::np.ndarray: response for input data

        For every split the coefficient vector and the ROC AUC of the hard
        class predictions on the held-out rows are recorded in ``self.res``
        together with the (last fitted) pipeline. Returns whatever
        ``self.save_pickle(self.res, self.out)`` returns.
        """
        # Append the response as a last column so that one in-place shuffle
        # keeps every row paired with its label.
        stacked = np.hstack((X.values, y[:, None]))
        np.random.shuffle(stacked)
        X, y = stacked[:, :-1], stacked[:, -1]

        outer_splits = StratifiedShuffleSplit(n_splits=400)
        inner_folds = StratifiedKFold(n_splits=5)
        logit = linear_model.LogisticRegressionCV(penalty='l1',
                                                  solver='liblinear',
                                                  cv=inner_folds)
        clf = Pipeline([('scaler', StandardScaler()), ('lg', logit)])

        coef_log, auc_log = [], []
        self.res = {'coef': coef_log, 'auc': auc_log, 'model': 0}

        for idx, (train, test) in enumerate(outer_splits.split(X, y)):
            clf.fit(X[train], y[train])
            predicted = clf.predict(X[test])
            coef_log.append((idx, clf.named_steps['lg'].coef_[0]))
            auc_log.append((idx, roc_auc_score(y[test], predicted)))

        self.res['model'] = clf

        return self.save_pickle(self.res, self.out)
Exemple #4
0
    def fit(self, X, y, X_test=None, y_test=None):
        """
        Train ``self.model`` on ``X`` / ``y`` and store the training history.

        When both ``X_test`` and ``y_test`` are given, a ``TestLossHistory``
        callback is attached and kept on ``self.test_loss``. When
        ``self.early_stop`` is truthy, 10% of the data is held out with a
        stratified split (fixed seed 0) and used as validation data for an
        ``EarlyStopping`` callback monitoring ``val_loss``.

        Returns ``self``; the history dict is stored on ``self.history``.
        """
        super(MLP, self).fit(X, y)

        callbacks = []
        if X_test is not None and y_test is not None:
            self.test_loss = TestLossHistory(X_test, y_test)
            callbacks.append(self.test_loss)

        if self.n_class == 1 and self.n_label > 2:
            # NOTE(review): the unrolled labels are computed but never passed
            # to the model below -- confirm whether training should use them.
            yr = unroll(y)

        if not self.early_stop:
            history = self.model.fit(X, y, nb_epoch=self.max_epoch, verbose=self.verbose, callbacks=callbacks)
        else:
            holdout = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
            train_index, val_index = next(iter(holdout.split(X, y)))
            x_train, y_train = X[train_index], y[train_index]
            x_val, y_val = X[val_index], y[val_index]

            callbacks.append(
                EarlyStopping(monitor="val_loss", patience=self.patience, verbose=self.verbose)
            )

            fit_options = dict(
                nb_epoch=self.max_epoch,
                verbose=self.verbose,
                callbacks=callbacks,
                validation_data=(x_val, y_val),
            )
            history = self.model.fit(x_train, y_train, **fit_options)

        self.history = history.history
        return self
def split(data, test_size):
    """
    Split a dataset object into stratified train and test arrays.

    ``data`` must expose ``.data`` (samples) and ``.target`` (labels); both
    are converted with ``np.array``. ``test_size`` is forwarded to
    ``StratifiedShuffleSplit``.

    Returns ``(X_train, y_train, X_test, y_test)``.
    """
    X, y = np.array(data.data), np.array(data.target)

    # The splitter that offers ``.split(X, y)`` takes ``n_splits``; the old
    # ``n_iter`` keyword is rejected by its constructor.
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size)
    train, test = next(splitter.split(X, y))

    return X[train], y[train], X[test], y[test]
    def _get_validation_split(self):
        """
        Build train/validation row indices for a multi-label tag dataset.

        Reads ``self.train_csv_file`` (rows are unpacked as two values, the
        second being a space-separated tag string from the ``tags`` column),
        one-hot encodes the tags, and then, label by label, runs a stratified
        shuffle split on that label's column. Rows from each label's split
        are added to the running train / validation lists only if they are
        not already in either list.

        Returns ``(train_indices, validation_indices)`` as numpy arrays.
        """
        train = pd.read_csv(self.train_csv_file)

        # mapping labels to integer classes
        unique_tags = set()
        for tag_string in train['tags'].values:
            unique_tags.update(tag_string.split(' '))
        label_map = {tag: pos for pos, tag in enumerate(unique_tags)}

        encoded_rows = []
        for _, tags in train.values:
            row = np.zeros(len(label_map))
            for tag in tags.split(' '):
                row[label_map[tag]] = 1
            encoded_rows.append(row)
        y_train = np.array(encoded_rows, np.uint8)

        trn_index = []
        val_index = []
        positions = np.arange(len(train))
        for col in range(len(label_map)):
            sss = StratifiedShuffleSplit(n_splits=2, test_size=self.validation_split, random_state=col)
            # Two splits are generated per label; only the last one is used.
            train_index, test_index = list(sss.split(positions, y_train[:, col]))[-1]
            X_train, X_test = positions[train_index], positions[test_index]
            # to ensure there is no repetition within each split and between the splits
            trn_index = trn_index + list(set(X_train) - set(trn_index) - set(val_index))
            val_index = val_index + list(set(X_test) - set(val_index) - set(trn_index))
        return np.array(trn_index), np.array(val_index)
def outer_cv_loop(Xdata,Ydata,clf,parameters=[],
                    n_splits=10,test_size=0.25):
    """
    Score ``clf`` over repeated stratified shuffle splits.

    For each split the train and test features are imputed separately with
    ``fancyimpute.SoftImpute``; the training set is resampled with
    ``SMOTETomek`` when the mean of its labels is more than 0.2 away from
    0.5. ``clf`` is then fitted and its predictions on the test rows scored
    with ROC AUC.

    ``parameters`` is accepted but not used here.

    Returns ``(rocscores, importances)``. ``rocscores`` has one entry per
    split (``nan`` when the test labels or the predictions have zero
    variance). ``importances`` has one entry per split that was actually
    fitted, so it can be shorter than ``rocscores``.
    """
    splitter = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size)
    rocscores = []
    importances = []

    for train, test in splitter.split(Xdata, Ydata):
        Ytest = Ydata[test]
        if numpy.var(Ytest) == 0:
            # NOTE(review): `varname` is not defined in this function; it is
            # presumably a module-level global -- confirm it exists.
            print('zero variance',varname)
            rocscores.append(numpy.nan)
            continue

        Ytrain = Ydata[train]
        Xtrain = fancyimpute.SoftImpute(verbose=False).complete(Xdata[train, :])
        Xtest = fancyimpute.SoftImpute(verbose=False).complete(Xdata[test, :])

        imbalance = numpy.abs(numpy.mean(Ytrain) - 0.5)
        if imbalance > 0.2:
            resampler = SMOTETomek()
            Xtrain, Ytrain = resampler.fit_sample(Xtrain.copy(), Ytrain)

        clf.fit(Xtrain, Ytrain)
        pred = clf.predict(Xtest)

        # filter out bad folds: constant predictions give no usable AUC
        score = roc_auc_score(Ytest, pred) if numpy.var(pred) > 0 else numpy.nan
        rocscores.append(score)
        importances.append(clf.feature_importances_)

    return rocscores,importances
def _read_uuid_column(filepath):
    """Return the sorted, de-duplicated second-column values of a CSV file."""
    # ``with`` closes the handle; the original left it to the garbage collector.
    with open(filepath) as f:
        return sorted({r[1] for r in csv.reader(f)})


def _print_class_counts(title, labels, cls_list, cls_uuids):
    """Print, per class, how many of ``labels`` belong to it and the share of that class."""
    print(title)
    label_list = labels.tolist()
    for cls_label in cls_list:
        cnt = label_list.count(cls_label)
        print("- %s:\t%d\t(~%.2f %% of total class examples)"
              % (cls_label, cnt, float(cnt) / len(cls_uuids[cls_label]) * 100))


def _write_uuid_label_csv(path, uuids, labels):
    """Write one ``uuid,label`` row per pair to ``path``."""
    with open(path, 'w') as f:
        w = csv.writer(f)
        # Builtin ``zip`` works on both Python 2 and 3; ``itertools.izip``
        # does not exist on Python 3.
        for uuid, label in zip(uuids, labels):
            w.writerow([uuid, label])


def main():
    """
    Create one stratified train/test split of class-labelled UUIDs.

    Reads the command-line options from ``cli_parser()``, loads the UUIDs
    (second CSV column) for every class, performs a single stratified
    shuffle split, prints the per-class counts of both partitions and writes
    ``<output_base>.all_uuids.csv``, ``.train_uuids.csv`` and
    ``.test_uuids.csv``.
    """
    args = cli_parser().parse_args()

    TEST_PERCENT = args.test_percent
    RAND_STATE = args.rand_state
    OUTPUT_BASE = args.output_base
    CLS_TO_FILEPATH = args.cls_to_cmdProcessedCsv

    # Parse CSV files associated to classes
    cls_uuids = {}
    for cls, filepath in six.iteritems(CLS_TO_FILEPATH):
        cls_uuids[cls] = _read_uuid_column(filepath)

    cls_list = sorted(cls_uuids)
    all_label, all_uuids = \
        zip(*[(cls_name, uuid)
              for cls_name in cls_list
              for uuid in cls_uuids[cls_name]])
    # Transform into numpy array for multi-index access later
    all_label = numpy.array(all_label)
    all_uuids = numpy.array(all_uuids)

    # ``n_splits=1``  -- Only make one train/test split
    sss = StratifiedShuffleSplit(n_splits=1, test_size=TEST_PERCENT,
                                 random_state=RAND_STATE)

    # Get array of index position values of ``all_uuids`` of uuids to use for
    # train and test sets, respectively. The features are irrelevant for the
    # split, so a zero placeholder of the right length is passed.
    # ``next(...)`` replaces the Python-2-only ``.next()`` method call.
    train_index, test_index = \
        next(iter(sss.split(numpy.zeros(len(all_label)), all_label)))
    uuids_train, uuids_test = all_uuids[train_index], all_uuids[test_index]
    label_train, label_test = all_label[train_index], all_label[test_index]

    _print_class_counts("Train:", label_train, cls_list, cls_uuids)
    _print_class_counts("Test:", label_test, cls_list, cls_uuids)

    # Save out files for use with ``classifier_model_validation``
    _write_uuid_label_csv('%s.all_uuids.csv' % OUTPUT_BASE,
                          all_uuids, all_label)
    _write_uuid_label_csv('%s.train_uuids.csv' % OUTPUT_BASE,
                          uuids_train, label_train)
    _write_uuid_label_csv('%s.test_uuids.csv' % OUTPUT_BASE,
                          uuids_test, label_test)
 def __init__(self, fm_decoder, n_iter=5, test_size=0.2, train_size=None,
              random_state=None):
     """
     Store ``fm_decoder`` and forward the split settings to
     ``StratifiedShuffleSplit.__init__``.
     """
     self.fm_decoder = fm_decoder
     # NOTE(review): the base initializer is called with ``n_iter``; confirm
     # the StratifiedShuffleSplit in scope accepts that keyword.
     split_options = dict(
         n_iter=n_iter,
         test_size=test_size,
         train_size=train_size,
         random_state=random_state,
     )
     StratifiedShuffleSplit.__init__(self, **split_options)
Exemple #10
0
 def robust_coef(self,xwl2,hm_y,n_iter=100):
     """
     Replace ``self.clf2``'s coefficients by their mean over many refits.

     ``self.clf2`` is fitted on the training part of ``n_iter`` stratified
     shuffle splits (20% held out, seed 1); afterwards its ``coef_`` and
     ``intercept_`` are overwritten with the element-wise means of the
     values obtained across those fits. Returns nothing.
     """
     splitter = StratifiedShuffleSplit(n_splits=n_iter, test_size=.2, random_state=1)
     fitted = []
     for train_rows, _ in splitter.split(xwl2, hm_y):
         self.clf2.fit(xwl2[train_rows, :], hm_y[train_rows])
         fitted.append((self.clf2.coef_, self.clf2.intercept_))

     self.clf2.coef_ = np.stack([coef for coef, _ in fitted]).mean(0)
     self.clf2.intercept_ = np.stack([bias for _, bias in fitted]).mean(0)
Exemple #11
0
def load_titanic(test_size=.25, feature_skip_tuple=(), random_state=1999):
    """
    Load ``datasets/titanic/titanic3.csv`` and return a stratified split.

    Parameters
    ----------
    test_size : float
        Forwarded to ``StratifiedShuffleSplit``.
    feature_skip_tuple : tuple of str
        Feature names to leave out of both the numeric and string features.
    random_state : int
        Currently unused: the split is seeded with the fixed value 12, as
        before. Kept for interface compatibility.

    Returns
    -------
    tuple
        ``(keys, train_data, test_data, train_labels, test_labels)`` where
        ``keys`` is the numeric keys followed by the string keys, and each
        data matrix is the numeric columns followed by the one-hot encoded
        string features.
    """
    path = os.path.join('datasets', 'titanic', 'titanic3.csv')
    with open(path) as f:
        # The header row is consumed and discarded; the column names used
        # below are fixed.
        f.readline()
        lines = f.readlines()

    string_keys = ['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat',
                   'homedest']
    string_keys = [s for s in string_keys if s not in feature_skip_tuple]
    numeric_keys = ['pclass', 'age', 'sibsp', 'parch', 'fare']
    numeric_keys = [n for n in numeric_keys if n not in feature_skip_tuple]

    n_samples = len(lines)
    numeric_data = np.zeros((n_samples, len(numeric_keys)))
    numeric_labels = np.zeros((n_samples,), dtype=int)
    string_features = []

    # Parse every line once, keeping all per-row pieces aligned by position.
    for n, l in enumerate(lines):
        line_dict = process_titanic_line(l)
        numeric_labels[n] = line_dict["survived"]
        numeric_data[n] = np.asarray([line_dict[k]
                                      for k in numeric_keys])
        string_features.append({k: line_dict[k] for k in string_keys})

    # ``n_splits`` is the keyword accepted by the splitter that provides
    # ``.split``. Only the labels drive the split, so a zero placeholder is
    # passed as X.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                 random_state=12)
    train_idx, test_idx = next(sss.split(np.zeros(n_samples), numeric_labels))

    # Select the string features with the same (shuffled) index arrays as
    # the numeric data and labels so that rows stay aligned when the two
    # feature blocks are concatenated below.
    train_vectorizer_list = [string_features[i] for i in train_idx]
    test_vectorizer_list = [string_features[i] for i in test_idx]

    train_numeric = numeric_data[train_idx]
    test_numeric = numeric_data[test_idx]
    train_labels = numeric_labels[train_idx]
    test_labels = numeric_labels[test_idx]

    vec = DictVectorizer()
    # .toarray() due to returning a scipy sparse array
    train_categorical = vec.fit_transform(train_vectorizer_list).toarray()
    test_categorical = vec.transform(test_vectorizer_list).toarray()
    train_data = np.concatenate([train_numeric, train_categorical], axis=1)
    test_data = np.concatenate([test_numeric, test_categorical], axis=1)
    keys = numeric_keys + string_keys
    return keys, train_data, test_data, train_labels, test_labels
Exemple #12
0
def shuffled_split(housing):
  """
  Return an 80/20 train/test split of ``housing`` stratified on income.

  ``add_income_category(housing)`` is called first and the split is
  stratified on the resulting ``income_cat`` column (fixed seed 42). The
  helper column is removed from the two returned frames.
  """
  add_income_category(housing)
  split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
  train_index, test_index = next(split.split(housing, housing["income_cat"]))

  # The splitter yields row positions, not index labels, so select with
  # ``iloc``; ``loc`` is only equivalent for a default 0..n-1 index.
  strat_train_set = housing.iloc[train_index].drop("income_cat", axis=1)
  strat_test_set = housing.iloc[test_index].drop("income_cat", axis=1)

  return strat_train_set, strat_test_set
def test_stratified_shuffle_split_overlap_train_test_bug():
    # Regression test: train and test indices of a split must be disjoint.
    # Original bug report:
    # https://github.com/scikit-learn/scikit-learn/issues/6121
    labels = [0, 1, 2, 3] * 3 + [4, 5] * 5
    features = np.ones_like(labels)

    splitter = StratifiedShuffleSplit(n_splits=1,
                                      test_size=0.5, random_state=0)
    train, test = next(iter(splitter.split(X=features, y=labels)))

    shared = np.intersect1d(train, test)
    assert_array_equal(shared, [])
def test_stratifiedshufflesplit_list_input():
    # Labels given as a list of strings or a plain list of floats must
    # produce the same splits as the equivalent float array.
    sss = StratifiedShuffleSplit(test_size=2, random_state=42)
    X = np.ones(7)
    y_array = np.hstack((np.ones(4), np.zeros(3)))
    y_strings = ['1'] * 4 + ['0'] * 3
    y_list = y_array.tolist()

    for y_variant in (y_strings, y_list):
        np.testing.assert_equal(list(sss.split(X, y_variant)),
                                list(sss.split(X, y_array)))
def _split_data(X, y, p_train=0.5, seed=None):
    """
    Make one stratified train/test split of ``X`` (data) and ``y`` (labels).

    ``p_train`` is passed as ``train_size`` and ``seed`` as ``random_state``
    to ``StratifiedShuffleSplit``.

    Returns ``((X_train, y_train), (X_test, y_test))``.
    """
    splitter = StratifiedShuffleSplit(
        n_splits=1,
        test_size=None,
        train_size=p_train,
        random_state=seed,
    )
    train_rows, test_rows = next(iter(splitter.split(X, y)))

    train_part = (X[train_rows], y[train_rows])
    test_part = (X[test_rows], y[test_rows])
    return train_part, test_part
def test_stratified_shuffle_split_even():
    # Test the StratifiedShuffleSplit, indices are drawn with a
    # equal chance
    n_folds = 5
    n_iter = 1000

    def assert_counts_are_ok(idx_counts, p):
        # Here we test that the distribution of the counts
        # per index is close enough to a binomial.
        # ``n_iter`` is the number of splits requested; the loop below
        # asserts that exactly that many were produced.
        threshold = 0.05 / n_iter
        bf = stats.binom(n_iter, p)
        for count in idx_counts:
            prob = bf.pmf(count)
            assert_true(prob > threshold,
                        "An index is not drawn with chance corresponding "
                        "to even draws")

    for n_samples in (6, 22):
        labels = np.array((n_samples // 2) * [0, 1])
        # The splitter exposing ``.split`` takes ``n_splits``, not ``n_iter``.
        splits = StratifiedShuffleSplit(n_splits=n_iter,
                                        test_size=1. / n_folds,
                                        random_state=0)

        train_counts = [0] * n_samples
        test_counts = [0] * n_samples
        n_splits_actual = 0
        for train, test in splits.split(X=np.ones(n_samples), y=labels):
            n_splits_actual += 1
            for counter, ids in [(train_counts, train), (test_counts, test)]:
                for idx in ids:
                    counter[idx] += 1
        assert_equal(n_splits_actual, n_iter)

        n_train, n_test = _validate_shuffle_split(n_samples,
                                                  test_size=1./n_folds,
                                                  train_size=1.-(1./n_folds))

        # ``train`` / ``test`` still hold the index arrays of the last split.
        assert_equal(len(train), n_train)
        assert_equal(len(test), n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        label_counts = np.unique(labels)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(n_train + n_test, len(labels))
        assert_equal(len(label_counts), 2)
        ex_test_p = float(n_test) / n_samples
        ex_train_p = float(n_train) / n_samples

        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p)
Exemple #17
0
    def gen_sample_array(self):
        """
        Return sample indices from one stratified shuffle split.

        A ``StratifiedShuffleSplit`` with ``self.n_splits`` splits and a 50%
        test share is built over ``self.class_vector``; the train and test
        index arrays of its first split are concatenated and returned as a
        single numpy array.

        Raises
        ------
        ImportError
            If scikit-learn is not installed.
        """
        try:
            from sklearn.model_selection import StratifiedShuffleSplit
        except ImportError:
            # Fail here with a clear message instead of printing and then
            # hitting a NameError on the first use of the class below.
            raise ImportError('Need scikit-learn for this functionality')
        import numpy as np

        s = StratifiedShuffleSplit(n_splits=self.n_splits, test_size=0.5)
        # Only the number of rows of X matters for the split; the labels in
        # ``y`` drive the stratification.
        X = th.randn(self.class_vector.size(0),2).numpy()
        y = self.class_vector.numpy()

        train_index, test_index = next(s.split(X, y))
        return np.hstack([train_index, test_index])
Exemple #18
0
def test_classifier(clf, dataset, feature_list, folds = 1000):
    """
    Evaluate ``clf`` on ``folds`` stratified shuffle splits and print metrics.

    ``dataset`` and ``feature_list`` are turned into labels and features via
    ``featureFormat`` / ``targetFeatureSplit``. For every split the
    classifier is refitted and its 0/1 predictions on the test rows are
    tallied into one overall confusion matrix, from which accuracy,
    precision, recall, F1 and F2 are printed. Nothing is returned.

    If a metric's denominator is zero, a diagnostic message is printed
    instead of the metrics.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Stops tallying the current split only; later splits are
                # still evaluated.
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
    except ZeroDivisionError:
        # Only the undefined-metric case is handled here; any other error
        # (e.g. a broken format string) is a real bug and must surface.
        print("Got a divide by zero when trying out:", clf)
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
    else:
        print(clf)
        print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
        print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
        print("")
def main_cv_loop(Xdata,Ydata,clf,parameters,
                n_folds=4,oversample_thresh=0.1,verbose=False):
    """
    Outer evaluation loop around ``inner_cv_loop``.

    ``n_folds`` stratified shuffle splits (20% test) are drawn. For each,
    the training part is optionally resampled with ``SMOTETomek`` (when the
    mean training label is more than 0.2 away from 0.5), the best estimator
    is obtained from ``inner_cv_loop`` and its predictions for the test rows
    are written into full-length prediction vectors.

    ``oversample_thresh`` is accepted for compatibility but does not
    influence the computation.

    Returns ``(roc_auc, Ydata, pred, pred_proba)`` where ``roc_auc`` is the
    weighted ROC AUC of ``pred`` against ``Ydata``.
    """
    # Honour the requested number of splits (previously hard-coded to 4,
    # which is also the default, so default callers are unaffected).
    kf=StratifiedShuffleSplit(n_splits=n_folds,test_size=0.2)

    # variables to store outputs
    # NOTE(review): shuffle splits may overlap and need not cover every
    # sample; rows never drawn into a test set keep the initial 0.
    pred=numpy.zeros(len(Ydata))  # predicted values
    pred_proba=numpy.zeros(len(Ydata))  # predicted probabilities
    kernel=[]
    C=[]

    for train,test in kf.split(Xdata,Ydata):
        Xtrain=Xdata[train,:]
        Xtest=Xdata[test,:]
        Ytrain=Ydata[train]
        if numpy.abs(numpy.mean(Ytrain)-0.5)>0.2:
            if verbose:
                print('oversampling using SMOTETomek')
            sm = SMOTETomek()
            Xtrain, Ytrain = sm.fit_sample(Xtrain, Ytrain)

        best_estimator_,bestroc,fa=inner_cv_loop(Xtrain,Ytrain,clf,
                    parameters,verbose=True)
        if fa is not None:
            # The inner loop fitted a feature transform; apply the same
            # transform to the held-out rows before predicting.
            if verbose:
                print('transforming using fa')
                print(fa)
            Xtest=fa.transform(Xtest)
        # NOTE(review): predict_proba usually returns one column per class;
        # confirm this flat assignment stores the intended column.
        pred_proba.flat[test]=best_estimator_.predict_proba(Xtest)
        pred.flat[test]=best_estimator_.predict(Xtest)
        # Collected per split but not returned.
        kernel.append(best_estimator_.kernel)
        C.append(best_estimator_.C)
    return roc_auc_score(Ydata,pred,average='weighted'),Ydata,pred,pred_proba
Exemple #20
0
def start_to_fit(X, y):
    """
    Compare a fixed set of classifiers by mean accuracy and print a table.

    Ten stratified shuffle splits (70% train / 30% test, seed 0) are drawn
    from ``X`` / ``y``. Every classifier is refitted on each split and its
    test accuracy accumulated; the per-classifier mean is printed as a
    two-column DataFrame. Nothing is returned.
    """
    classifiers = [
        KNeighborsClassifier(3),
        SVC(probability=True),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        LogisticRegression()]

    res_cols = ['Classifier','Accuracy']

    # Single source of truth for the split count, so the averaging divisor
    # below can never drift from it.
    n_splits = 10
    data_set = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.3, train_size=0.7, random_state=0)

    accuracy_dic = {}

    for train_index, test_index in data_set.split(X, y):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        for clf in classifiers:
            name = clf.__class__.__name__
            clf.fit(X_train, y_train)
            accuracy = accuracy_score(y_test, clf.predict(X_test))
            accuracy_dic[name] = accuracy_dic.get(name, 0.0) + accuracy

    # Build the table in one go; row-by-row ``DataFrame.append`` is not
    # available in current pandas.
    rows = [[name, total / float(n_splits)]
            for name, total in accuracy_dic.items()]
    res = pd.DataFrame(rows, columns=res_cols)

    # Call form of print works on both Python 2 and Python 3.
    print(res)
def splitTrainTest(inputDF,random_state):
    """
    Return an 80/20 train/test split of ``inputDF`` stratified on income.

    An ``income_category`` column (``ceil(median_income / 1.5)``, with
    values that are not below 5 set to 5) is added to ``inputDF`` and used
    for stratification with a fixed seed of 19. The relative category sizes
    of the whole frame are printed. The helper column is removed from the
    returned frames but, as before, remains on ``inputDF``.

    ``random_state`` is kept for interface compatibility; the stratified
    split does not use it.

    Returns ``(stratifiedTrainSet, stratifiedTestSet)``.
    """
    # Compute the category on a standalone Series and assign once, instead
    # of an in-place ``where`` on a column selection, which pandas does not
    # guarantee to write back to the frame.
    incomeCategory = np.ceil(inputDF["median_income"]/1.5)
    inputDF["income_category"] = incomeCategory.where(incomeCategory < 5.0, 5.0)

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2,random_state=19)
    trainIndices, testIndices = next(split.split(inputDF,inputDF["income_category"]))

    print('\ninputDF["income_category"].value_counts() / len(inputDF)')
    print(   inputDF["income_category"].value_counts() / len(inputDF) )

    # The splitter yields row positions, so select with ``iloc``; ``loc``
    # is only equivalent for a default 0..n-1 index.
    stratifiedTrainSet = inputDF.iloc[trainIndices].drop(["income_category"],axis=1)
    stratifiedTestSet  = inputDF.iloc[testIndices].drop(["income_category"],axis=1)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( stratifiedTrainSet , stratifiedTestSet )
def train_and_test(raw_data, label="Qw", degree=1, p=0.1):
    """
    Split ``raw_data`` 80/20, stratified on its ``isGood`` column (seed 142),
    transform both parts with ``my_transform(part, label, degree)`` and
    separate the last column of each transformed matrix as the target.

    ``p`` is accepted but not used.

    Returns ``(train_set, train_y, test_set, test_y)``.
    """
    from sklearn.model_selection import StratifiedShuffleSplit

    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=142)
    train_rows, test_rows = next(splitter.split(raw_data, raw_data["isGood"]))

    train_matrix = my_transform(raw_data.iloc[train_rows], label, degree)
    test_matrix = my_transform(raw_data.iloc[test_rows], label, degree)

    # Last column is the target, everything before it the features.
    return (
        train_matrix[:, :-1],
        train_matrix[:, -1],
        test_matrix[:, :-1],
        test_matrix[:, -1],
    )
def splitTrainTest(inputDF,random_state):
    """
    Return an 80/20 train/test split of ``inputDF``.

    If ``importlib.util.find_spec`` reports no ``sklearn.model_selection``
    module, a plain ``train_test_split`` seeded with ``random_state`` is
    used. Otherwise an ``income_category`` column
    (``ceil(median_income / 1.5)``, with values that are not below 5 set to
    5) is added to ``inputDF`` and the split is stratified on it with a
    fixed seed of 19; the relative category sizes of the whole frame are
    printed and the helper column is removed from the returned frames (it
    remains on ``inputDF``, as before).

    Returns ``(trainSet, testSet)``.
    """
    ms_spec = importlib.util.find_spec(name="sklearn.model_selection")
    if ms_spec is None:
        trainSet, testSet = train_test_split(inputDF, test_size=0.2, random_state=random_state)
        return( trainSet , testSet )

    # Compute the category on a standalone Series and assign once, instead
    # of an in-place ``where`` on a column selection, which pandas does not
    # guarantee to write back to the frame.
    incomeCategory = np.ceil(inputDF["median_income"]/1.5)
    inputDF["income_category"] = incomeCategory.where(incomeCategory < 5.0, 5.0)

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2,random_state=19)
    trainIndices, testIndices = next(split.split(inputDF,inputDF["income_category"]))

    print('\nincome category relative sizes (whole data set)')
    print(   inputDF["income_category"].value_counts() / len(inputDF) )

    # The splitter yields row positions, so select with ``iloc``; ``loc``
    # is only equivalent for a default 0..n-1 index.
    trainSet = inputDF.iloc[trainIndices].drop(["income_category"],axis=1)
    testSet  = inputDF.iloc[testIndices].drop(["income_category"],axis=1)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return( trainSet , testSet )
Exemple #24
0
 def suffle_hm(self,x,y,gamma=0.5,n_iter=50):
     """
     Estimate, per sample, how often ``self.clf1`` predicts it correctly
     when it is held out.

     Over ``n_iter`` stratified shuffle splits (25% test, seed 1) the
     classifier is fitted on the training rows and its predictions on the
     test rows compared with ``y``. ``proba`` is the number of correct
     predictions of a sample divided by the number of times it was in a
     test set. Finally ``self.clf1`` is refitted on all of ``x`` / ``y``.

     Returns ``((proba >= gamma).astype(int), proba)``.
     """
     times_tested = np.zeros_like(y).astype(float)
     times_correct = np.zeros_like(y).astype(float)

     splitter = StratifiedShuffleSplit(n_splits=n_iter, test_size=.25, random_state=1)
     for train_rows, test_rows in splitter.split(x, y):
         self.clf1.fit(x[train_rows, :], y[train_rows])
         times_tested[test_rows] += 1.
         correct = self.clf1.predict(x[test_rows, :]) == y[test_rows]
         times_correct[test_rows] += correct.astype(float)

     proba = times_correct / times_tested
     if self.verbose:
         print(times_tested)
         print(proba)

     # Leave the classifier trained on the complete data set.
     self.clf1.fit(x, y)
     return (proba >= gamma).astype(int), proba
Exemple #25
0
    def fit_model(self, X, y):
        """
        Fit a scaler / feature-selection / extra-trees pipeline on 400
        stratified shuffle splits and pickle the collected results.

        X::pd.DataFrame: Input data
        y::np.ndarray: response for input data

        For every split the feature-selection mask, the tree feature
        importances and the ROC AUC of the hard class predictions on the
        held-out rows are recorded in ``self.res`` together with the (last
        fitted) pipeline. Returns whatever
        ``self.save_pickle(self.res, self.out)`` returns.
        """
        outer_splits = StratifiedShuffleSplit(n_splits=400)

        steps = [
            ('scaler', StandardScaler()),
            ('fs', CustFsNoiseWinnow()),
            ('et', ExtraTreesClassifier(n_estimators=2000)),
        ]
        clf = Pipeline(steps)

        mask_log, fimp_log, auc_log = [], [], []
        self.res = {'mask': mask_log, 'fimp': fimp_log, 'auc': auc_log, 'model': 0}

        for idx, (train, test) in enumerate(outer_splits.split(X, y)):
            clf.fit(X[train], y[train])
            predicted = clf.predict(X[test])
            fitted = clf.named_steps
            mask_log.append((idx, fitted['fs'].mask_))
            fimp_log.append((idx, fitted['et'].feature_importances_))
            auc_log.append((idx, roc_auc_score(y[test], predicted)))

        self.res['model'] = clf
        return self.save_pickle(self.res, self.out)
Exemple #26
0
def train_age(kfold, batchsize, lr_age, lr_gender, num_epochs, p_augment,
              device, num_age_classes, num_gender_classes, test_fold,
              train_fold, random_seed):
    """
    Train and evaluate one age classifier per cross-validation fold.

    For each of the ``kfold`` folds, the fold's training frame is split
    (stratified on age) into train and validation parts, an
    ``InceptionResnetV1`` classifier with ``num_age_classes`` outputs is
    trained for ``num_epochs`` epochs, the weights with the best validation
    accuracy are saved to ``models/age_model{fold}.pth`` and then reloaded
    to score the fold's test frame. Progress and results are printed;
    the function has no return statement.

    Parameters
    ----------
    kfold : number of folds to iterate over.
    batchsize : batch size for all three data loaders.
    lr_age : learning rate of the AdamW optimizer.
    lr_gender, num_gender_classes : accepted but not used in this function.
    num_epochs : epochs per fold.
    p_augment : forwarded to the training ``AgeDataset`` as ``p_augment``.
    device : device the model and batches are moved to.
    num_age_classes : number of output classes / confusion-matrix labels.
    test_fold, train_fold : indexable by fold number; each entry is read via
        its ``'image_path'``, ``'age'`` and ``'gender'`` columns.
    random_seed : seed of the train/validation stratified split.
    """
    all_accuracy_age = []
    all_val_loss_age = []
    all_stat_fold = []

    for fold in range(kfold):
        # Per-fold statistics, keyed by metric name.
        all_stat = defaultdict(list)

        # image paths
        train_data = train_fold[fold]['image_path'].copy().reset_index(
            drop=True).to_list()
        test_data = test_fold[fold]['image_path'].copy().reset_index(
            drop=True).to_list()

        # labels
        train_age_label = train_fold[fold]['age'].copy().reset_index(
            drop=True).to_list()
        train_gender_label = train_fold[fold]['gender'].copy().reset_index(
            drop=True).to_list()
        test_age_label = test_fold[fold]['age'].copy().reset_index(
            drop=True).to_list()
        test_gender_label = test_fold[fold]['gender'].copy().reset_index(
            drop=True).to_list()

        # create train-validation stratified split
        sss = StratifiedShuffleSplit(n_splits=10, random_state=random_seed)

        # split based on age, more balanced for both age and gender;
        # ten splits are generated but only the first one is used
        train_idx, val_idx = list(sss.split(train_data, train_age_label))[0]

        train_idx = list(train_idx)
        val_idx = list(val_idx)

        # build the train / validation / test datasets
        train_dataset = AgeDataset(
            '',
            list(np.array(train_data)[train_idx]),
            list(np.array(train_age_label)[train_idx]),
            list(np.array(train_gender_label)[train_idx]),
            p_augment=p_augment)
        val_dataset = AgeDataset('',
                                 list(np.array(train_data)[val_idx]),
                                 list(np.array(train_age_label)[val_idx]),
                                 list(np.array(train_gender_label)[val_idx]),
                                 validation=True)
        test_dataset = AgeDataset('',
                                  test_data,
                                  test_age_label,
                                  test_gender_label,
                                  validation=True)

        # only the training loader is shuffled, so validation / test
        # predictions stay in the same order as their label lists
        train_loader = DataLoader(train_dataset,
                                  batch_size=batchsize,
                                  shuffle=True)
        val_loader = DataLoader(val_dataset,
                                batch_size=batchsize,
                                shuffle=False)
        test_loader = DataLoader(test_dataset,
                                 batch_size=batchsize,
                                 shuffle=False)

        # val_gender_label is computed but not used further below
        val_gender_label = list(np.array(train_gender_label)[val_idx])
        val_age_label = list(np.array(train_age_label)[val_idx])

        # a fresh pretrained model per fold
        model = InceptionResnetV1(classify=True,
                                  pretrained='vggface2',
                                  num_classes=num_age_classes)
        model = model.to(device)

        # optimizer; the scheduler is given milestones at epochs 5 and 10
        optimizer = optim.AdamW(model.parameters(), lr=lr_age)
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer, [5, 10])

        # loss
        criterion = nn.CrossEntropyLoss()

        # best validation accuracy so far and the validation loss at that epoch
        best_acc_age = 0
        best_val_loss_age = 999

        print(f'Fold {fold+1}\n')
        for epoch in range(num_epochs):
            print(f'epoch: {epoch}\n')
            train_loss_age = 0
            val_loss_age = 0

            # Training
            model.train()
            iterat = 0
            vsego = len(train_loader)  # total number of training batches
            for batch in train_loader:
                print(f'batch_num: {100*(iterat/vsego)}%\n')

                # Load image batch
                batch_data, batch_age_label = batch
                batch_data = batch_data.to(device)
                batch_age_label = batch_age_label.to(device)

                # Clear gradients
                optimizer.zero_grad()

                with torch.set_grad_enabled(True):
                    pred_age = model(batch_data)
                    loss_age = criterion(pred_age, batch_age_label)

                    train_loss_age += loss_age.detach().item()
                    loss_age.backward()
                    optimizer.step()

                iterat = iterat + 1

            # Validation
            model.eval()
            all_pred_age = torch.empty(0).to(device)
            for batch in val_loader:

                # Load image batch
                batch_data, batch_age_label = batch
                batch_data = batch_data.to(device)
                batch_age_label = batch_age_label.to(device)

                with torch.set_grad_enabled(False):
                    pred_age = model(batch_data)
                    loss_age = criterion(pred_age, batch_age_label)
                    val_loss_age += loss_age.detach().item()
                    # accumulate softmax outputs for the whole validation set
                    all_pred_age = torch.cat(
                        (all_pred_age,
                         nn.functional.softmax(pred_age.detach(), dim=1)), 0)

            # average the summed batch losses over the number of batches
            train_loss_age /= len(train_loader)
            val_loss_age /= len(val_loader)

            all_pred_age = all_pred_age.cpu().numpy()
            pred_label_age = list(np.argmax(all_pred_age, axis=1))

            # checkpoint whenever validation accuracy improves
            acc_age = accuracy_score(val_age_label, pred_label_age)
            if acc_age > best_acc_age:
                best_acc_age = acc_age
                best_val_loss_age = val_loss_age
                torch.save(model.state_dict(), f'models/age_model{fold}.pth')

            all_stat['train_loss'].append(train_loss_age)
            all_stat['val_loss'].append(val_loss_age)
            all_stat['val_acc'].append(acc_age)

            print(
                f'Epoch {epoch} | train loss: {train_loss_age} | val loss: {val_loss_age} | accuracy: {round(acc_age*100, 2)}%'
            )
            scheduler.step()

        # INFERENCE: reload the best checkpoint of this fold and score the test set
        with torch.no_grad():
            model.load_state_dict(torch.load(f'models/age_model{fold}.pth'))
            model.eval()
            test_pred_age = torch.empty(0).to(device)
            for batch in test_loader:

                # Load image batch
                batch_data, batch_age_label = batch
                batch_data = batch_data.to(device)
                batch_age_label = batch_age_label.to(device)

                with torch.set_grad_enabled(False):
                    pred_age = model(batch_data)
                    test_pred_age = torch.cat(
                        (test_pred_age,
                         nn.functional.softmax(pred_age.detach(), dim=1)), 0)

            test_pred_age = test_pred_age.cpu().numpy()
            pred_label_age = list(np.argmax(test_pred_age, axis=1))

            # from here on acc_age holds the TEST accuracy of this fold
            acc_age = accuracy_score(test_age_label, pred_label_age)
            all_stat['test_acc'].append(acc_age)
            all_stat['conf'].append(
                confusion_matrix(test_age_label,
                                 pred_label_age,
                                 labels=list(range(num_age_classes))))
            all_stat['conf_norm'].append(
                confusion_matrix(test_age_label,
                                 pred_label_age,
                                 normalize='true',
                                 labels=list(range(num_age_classes))))
            all_stat['test_pred'].append(pred_label_age)
            all_stat['test_target'].append(test_age_label)

        all_accuracy_age.append(acc_age)
        all_val_loss_age.append(best_val_loss_age)
        print(
            f'TEST ACCURACY: {round(acc_age*100,2)}% | Val. Accuracy: {round(best_acc_age*100,2)}% | Val. Loss.: {best_val_loss_age}\n'
        )

        all_stat_fold.append(all_stat)

    all_accuracy_age = np.array(all_accuracy_age)
    all_val_loss_age = np.array(all_val_loss_age)

    # mean test accuracy over all folds, in percent
    mean_accuracy_age = round(all_accuracy_age.mean() * 100, 2)

    print(f'\nOverall Accuracy: {mean_accuracy_age} p/m')
Exemple #27
0
def rodar_experimento(dir_experimento, documentos_validos, freq_min,
                      op_stopwords, op_ica, op_tesauro, op_tam_vec, lista_k,
                      rnd, exp, w2v_geral, ftt_geral, glv_geral):
    """Run one experiment end to end: split, train, embed, cluster, compare.

    A single stratified 80/20 split (seeded with ``rnd``) is drawn over the
    document ids, stratified by subject.  For that split the function builds
    the training corpus and vocabulary, trains the domain embedding models,
    writes the summed-vector representations, trains a Doc2Vec model, infers
    Doc2Vec vectors for the test documents and finally, for every
    representation column of ``vetores_teste.csv``, stores clustering scores
    and runs the similarity analysis.

    Parameters
    ----------
    dir_experimento : str
        Sub-directory name used under both ``dados/`` and ``resultados/``.
    documentos_validos : object
        Must expose ``.id`` (the documents) and ``.Assunto`` (the labels used
        for stratification); presumably a pandas DataFrame -- TODO confirm.
    freq_min, op_stopwords, op_ica, op_tesauro
        Forwarded unchanged to ``extrair_vocabulario``.
    op_tam_vec
        Forwarded to the embedding training and representation helpers.
    lista_k
        Forwarded to ``computar_scores_agrupamento``.
    rnd
        ``random_state`` of the stratified split.
    exp
        Experiment identifier; used in log messages and in the
        ``dados/experimento_<exp>`` output path.
    w2v_geral, ftt_geral, glv_geral
        Forwarded to ``criar_representacoes_soma_ger``.

    Returns
    -------
    None
        Every result is written to disk.
    """
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=rnd)
    X = documentos_validos.id
    y = documentos_validos.Assunto
    stopwords = nltk.corpus.stopwords.words('portuguese')
    diretorio = "dados/corpus_tratado/"
    le = LabelEncoder()

    # NOTE(review): the split yields positional indices while ``X[...]`` on a
    # pandas Series is label based -- confirm the index is the default one.
    for idx_treino, idx_teste in sss.split(X, y):
        X_treino, X_teste = X[idx_treino], X[idx_teste]
        y_treino, y_teste = y[idx_treino], y[idx_teste]

        # Build the corpus of the training set.
        base_treino = criar_base_treino(exp, X_treino, y_treino, diretorio,
                                        stopwords)
        # Build the vocabulary.
        vocab = extrair_vocabulario(base_treino, freq_min, stopwords,
                                    op_stopwords, op_ica, op_tesauro)
        # Train the domain ("jur") embedding models.
        w2v_jur, ftt_jur, glv_jur = treinar_modelos_jur(
            X_treino, X_teste, y_treino, y_teste, vocab, diretorio, exp,
            op_tam_vec)
        # Create document representations by summing word vectors.
        bs = criar_representacoes_soma_jur(X_teste, y_teste, vocab, diretorio,
                                           w2v_jur, ftt_jur, glv_jur, exp,
                                           op_tam_vec)
        criar_representacoes_soma_ger(vocab, diretorio, w2v_geral, ftt_geral,
                                      glv_geral, exp, op_tam_vec, bs)

        # ----- Doc2Vec -----
        print('--------- Treinando doc2vec do experimento ' + str(exp) +
              ' ---------')
        # Create the results directory only when needed, so a re-run does not
        # crash here after the expensive training steps above.
        dir_resultados = 'resultados/' + dir_experimento
        if not os.path.isdir(dir_resultados):
            os.makedirs(dir_resultados)
        corpus = "dados/" + dir_experimento + "/base_treino_glv.txt"
        model = Doc2Vec(corpus_file=corpus,
                        vector_size=100,
                        window=5,
                        min_count=1,
                        workers=8)
        model.save("dados/" + dir_experimento + "/doc2vec_jur.model")
        print(
            '--------- Inferindo vetores para docs de teste do experimento ' +
            str(exp) + ' ---------')
        base_teste = pd.read_csv("dados/" + dir_experimento +
                                 "/vetores_teste.csv")
        # NOTE(review): ``x[0]`` takes the first element of each ``teores``
        # entry before splitting on spaces; if the entries are plain strings
        # this is only their first character -- confirm the column format.
        base_teste['doc2vec_jur'] = [
            normalize(model.infer_vector(x[0].split(' ')).reshape(1, -1))
            for x in base_teste.teores
        ]
        # NOTE(review): written under 'dados/experimento_<exp>' but read back
        # below from 'dados/<dir_experimento>'; these only match when
        # dir_experimento == 'experimento_' + str(exp) -- confirm.
        base_teste.to_csv('dados/experimento_' + str(exp) +
                          '/vetores_teste.csv',
                          index=False)

        df = pd.read_csv('dados/' + dir_experimento + '/vetores_teste.csv')
        print('++++++ modelos ++++++ ' + df.iloc[:, 3:].columns)

        # Every column from the fourth onward holds one model's vectors.
        for modelo in df.iloc[:, 3:].columns:
            # ----- Clustering -----
            print('--------- Agrupando dados para o modelo ' + modelo +
                  ' no experimento' + str(exp) + ' ---------')
            df[modelo] = df[modelo].apply(converter_string_array)
            X_kmeans = np.stack(df[modelo])
            # Drop the middle axis of the stacked (n, 1, dim) vectors.
            X_kmeans = X_kmeans.reshape(X_kmeans.shape[0], X_kmeans.shape[2])
            y_kmeans = df['assunto']
            le.fit(y_kmeans)
            y_kmeans = le.transform(y_kmeans)
            lista_scores_k = computar_scores_agrupamento(
                X_kmeans, y_kmeans, dir_experimento, modelo, lista_k)
            #gerar_graficos_kmeans(lista_scores_k, dir_experimento, modelo)
            np.save(
                'resultados/' + dir_experimento + '/' + modelo +
                '_lista_scores_k.npy', lista_scores_k)
            print('******   dados de agrupamento do modelo ' + modelo +
                  'salvos.')

            # ----- Similarity matrices -----
            print('--------- executando analyzer para experimento ' +
                  str(exp) + ' ---------')
            sim_m = calc_matriz_sim(df[modelo], dir_experimento)
            calcular_sim_assuntos(df['assunto'], sim_m, df[modelo].name,
                                  dir_experimento)
            plt.close()
Exemple #28
0
    df = test_data.fillna(np.mean(train_data['Age']))
    scaled_data = scaler.transform(df[['Age', 'Fare']])
    df[['Age', 'Fare']] = scaled_data
    for var in categorical:
        df = pd.concat([df, pd.get_dummies(df[var], prefix=var)], axis=1)
        del df[var]
    testdf = df
    test_data = df.to_numpy()
    train_labels = train_data_dropped[:, 0]
    train_data_dropped = train_data_dropped[:, 1:]

    ### Running classification
    acc, val_acc, loss, val_loss = [], [], [], []
    ## Running k-folds classification to improve generalization and reduce overfitting
    K = StratifiedShuffleSplit(10, train_size=0.6)
    for train_index, test_index in K.split(train_data_dropped, train_labels):
        x_train, y_train = train_data_dropped[train_index], train_labels[
            train_index]
        x_valid, y_valid = train_data_dropped[test_index], train_labels[
            test_index]
        # ##Only need to balance the training data, not the validation data.
        # x_train, x_valid, y_train, y_valid = train_test_split(train_data_dropped,
        #                                                       train_labels, test_size=0.2,
        #                                                       shuffle= True)

        y_train = pd.get_dummies(y_train).to_numpy()
        y_valid = pd.get_dummies(y_valid).to_numpy()
        history = model.fit(
            x_train,
            y_train,
#########################Creating a Training + Test Set#########################
###Using Scikit-Learn (BEST -RECOMMENDED!) (Method 4)

# One-liner.
# Benefits: accepts several data sets at once, and a fixed random_state keeps
# the training set identical from run to run.

train_set_04, test_set_04 = train_test_split(housing,
                                             test_size=0.2,
                                             random_state=42)

#########################Creating a Training + Test Set#########################
###Using Scikit-Learn (Categorization/Strata) (Method 5)

# Method 04 is the best in general, but with a small data set it can suffer
# from sampling bias.
# Method 05 helps when the sample is small and the split should follow the
# categories of an important feature.

# Categorize samples based on an important feature: bucket the median income
# and cap the bucket number at 5.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Assign the result back instead of calling where(..., inplace=True) on the
# column selection: the chained in-place form does not update the frame when
# pandas Copy-on-Write is active.
housing["income_cat"] = housing["income_cat"].where(
    housing["income_cat"] < 5, 5.0)

# Split the data using the strata.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(housing, housing["income_cat"]):
    # The splitter yields positional indices, so select rows by position.
    train_set_05 = housing.iloc[train_index]
    test_set_05 = housing.iloc[test_index]

# Check the distribution of the categorized/stratified samples.
housing["income_cat"].value_counts() / len(housing)
Exemple #30
0
print()
# Feature matrix: the first num_data_col entries of every survey row.
x_data = np.array([row[:num_data_col] for row in survey_data])

# Target matrix: the num_choice_col entries that follow the feature columns.
choice_stop = num_data_col + num_choice_col
y_data = np.array([row[num_data_col:choice_stop] for row in survey_data])
x_headers = list(survey_headers[1:6])

for shape_tag, shaped in (('x-shape: ', x_data), ('y-shape: ', y_data)):
    print(shape_tag + str(shaped.shape))

# ---------------------------------------------
#%%
# Stratified randomized partitioning into training and dev sets with sklearn;
# needed because the vehicle choice data set is very unbalanced.
# Deep learning setups use a much higher share of the data for training.
trainPerc = 0.95
devePerc = 0.05
sss = StratifiedShuffleSplit(n_splits=1,
                             train_size=trainPerc,
                             test_size=devePerc)
train_indices, deve_indices = next(sss.split(x_data, y_data))
num_train_rows = len(train_indices)  # needed later on

# Create the partitions.
x_vals_train, y_vals_train = x_data[train_indices, :], y_data[train_indices, :]
x_vals_deve, y_vals_deve = x_data[deve_indices, :], y_data[deve_indices, :]

print("num_train_rows: %u, num_deve_rows: %u" %
      (num_train_rows, len(deve_indices)))

# ---------------------------------------------
#%%
# Training setup.
a_stdv = 0.1  # standard deviation used to initialize the node weights
# Masker configuration: restricts the images to `gm_mask`, resamples to
# `shape`/`affine`, smooths with a 6 mm FWHM kernel, standardizes and detrends
# the signals, and caches intermediate results through `mem`.
masker = MultiNiftiMasker(mask_img=gm_mask, target_shape=shape,
                          target_affine=affine, smoothing_fwhm=6.,
                          standardize=True, detrend=True, mask_strategy='epi',
                          memory=mem, memory_level=2, n_jobs=2,
                          verbose=10)

##############################################################################
# Cross Validator
# ---------------

from sklearn.model_selection import StratifiedShuffleSplit

n_iter = 100
classes = phenotypic
# `labels` holds, for every entry of `classes`, the index of its value among
# the sorted unique values (integer-encoded classes).
_, labels = np.unique(classes, return_inverse=True)
# 100 stratified random splits, each holding out 25% of the samples; the
# fixed random_state makes the sequence of splits reproducible.
cv = StratifiedShuffleSplit(n_splits=n_iter,
                            test_size=0.25, random_state=0)
##############################################################################
# Functional Connectivity Analysis model
# ---------------------------------------
from model import LearnBrainRegions

# Connectivity measures listed for the analysis.
connectomes = ['correlation', 'partial correlation', 'tangent']

############################################################################
# Gather results - Data structure

# Column names of the results table.
columns = ['atlas', 'measure', 'classifier', 'scores', 'iter_shuffle_split',
           'n_regions', 'smoothing_fwhm', 'dataset', 'compcor_10',
           'motion_regress', 'dimensionality', 'connectome_regress', 'scoring',
           'region_extraction', 'multi_pca_reduction', 'reduction_n_components',
           'covariance_estimator', 'min_region_size_in_mm3']
                          50)  # 返回在对数刻度上均匀间隔的数字
# Fit one RBF-kernel SVC per gamma value and record its test accuracy.
# `score` and `gamma_range` are created before this point.
for i in gamma_range:
    clf = SVC(kernel="rbf", gamma=i, cache_size=5000).fit(Xtrain, Ytrain)
    score.append(clf.score(Xtest, Ytest))
# Best accuracy and the gamma that produced it (first one on ties).
print(max(score), gamma_range[score.index(max(score))])
plt.plot(gamma_range, score)
plt.show()

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

# Grid search over gamma and coef0 for a degree-1 polynomial kernel, scored
# on 5 stratified random splits that each hold out 30% of the data.
time0 = time()
gamma_range = np.logspace(-10, 1, 20)
coef0_range = np.linspace(0, 5, 10)
param_grid = dict(gamma=gamma_range, coef0=coef0_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=420)
grid = GridSearchCV(SVC(kernel="poly", degree=1, cache_size=5000), param_grid=param_grid, cv=cv)
grid.fit(X, y)
print("The best parameters are %s, score = %0.5f" % (grid.best_params_, grid.best_score_))
# Elapsed time: the number of elapsed seconds is interpreted as a timestamp
# and only its minute, second and microsecond fields are printed.
print(datetime.datetime.fromtimestamp(time() - time0).strftime("%M:%S:%f"))

# Tune the C parameter of a linear-kernel SVC.
score = []
C_range = np.linspace(0.01, 30, 50)
for i in C_range:
    clf = SVC(kernel="linear", C=i, cache_size=5000).fit(Xtrain, Ytrain)
    score.append(clf.score(Xtest, Ytest))
# Best accuracy and the C value that produced it (first one on ties).
print(max(score), C_range[score.index(max(score))])
plt.plot(C_range, score)
plt.show()
Exemple #33
0
                                test_predictions,
                                average='weighted')
    recall = recall_score(test_labels, test_predictions, average='weighted')
    f1 = 2.0 * (precision * recall) / (precision + recall)

    print("Test Precision: %.4f" % (precision))
    print("Test Recall: %.4f" % (recall))
    print("Test f1_score: %.4f" % (f1))

    return accuracy, precision, recall, f1


# The CSV file to evaluate is given as the first command-line argument.
filename = sys.argv[1]
X_data, Y_data = load_csv(filename)

# Five stratified random splits, each holding out 12.5% of the samples.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.125)
metrics = []
fold = 1
for train_indices, test_indices in sss.split(X_data, Y_data):
    train_data, test_data = X_data[train_indices], X_data[test_indices]
    train_labels, test_labels = Y_data[train_indices], Y_data[test_indices]
    # One result entry per split, as returned by SVM().
    metrics.append(SVM(train_data, train_labels, test_data, test_labels))
    fold += 1

# Accumulators for summing the per-split metrics.
accuracy = 0.00
precision = 0.00
recall = 0.00
# NOTE(review): named `fi`, not `f1` -- possibly a typo; the code that uses
# it is outside this view, confirm before renaming.
fi = 0.00
for i in metrics:
    # Entries 0 and 1 of each result are added to accuracy and precision.
    accuracy += i[0]
    precision += i[1]
    avgFlakyTest /= successFold
    avgNonFlakyTest /= successFold

    return (avgFlakyTrain, avgNonFlakyTrain, avgFlakyTest, avgNonFlakyTest,
            avgP, avgR, storage, avgTPrep, avgTPred)


if __name__ == "__main__":
    projectBasePath = "dataset"
    projectName = "pinto-ds"
    outDir = "results/"
    os.makedirs(outDir, exist_ok=True)

    numSplit = 30
    testSetSize = 0.2
    kf = StratifiedShuffleSplit(n_splits=numSplit, test_size=testSetSize)

    # DISTANCE
    outFile = "params-distance.csv"
    with open(os.path.join(outDir, outFile), "w") as fo:
        fo.write(
            "distance,k,sigma,eps,precision,recall,storage,preparationTime,predictionTime\n"
        )

    k = 7
    sigma = 0.5
    dim = 0  # number of dimensions (0: JL with error eps)
    eps = 0.3  # JL eps
    params = {"algorithm": "brute", "metric": "cosine", "weights": "uniform"}
    for metric in ["cosine", "euclidean"]:
        for k in [3, 7]:
def stratify(housing):
    """Split ``housing`` 80/20, stratified on its ``income_cat`` column.

    Parameters
    ----------
    housing : pandas.DataFrame
        Must contain an ``income_cat`` column used as the stratification key.

    Returns
    -------
    tuple
        ``(strat_train_set, strat_test_set)`` -- the training and test rows.
    """
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_index, test_index = next(split.split(housing, housing["income_cat"]))
    # The splitter yields positional indices; ``.iloc`` selects by position
    # and therefore also works when the frame does not carry the default
    # 0..n-1 index (``.loc`` would look the numbers up as labels).
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
    return strat_train_set, strat_test_set
Exemple #36
0
# Tag every source data set with its class label.
df1['label'] = 'BENIGN'
df2 = pd.read_csv('../results/dataset_dos.csv')
df2['label'] = 'dos'
df3 = pd.read_csv('../results/dataset_hb.csv')
df3['label'] = 'heartbleed'
frames = [df1, df2, df3]

print('join datasets')
df = pd.concat(frames)

print('separate y')
X = df.drop(columns=['label'])
y = df['label'].values

print('StratifiedShuffleSplit')
# An integer test_size asks for exactly that many rows in the test part.
sss = StratifiedShuffleSplit(n_splits=1, test_size=110000, random_state=1)
print('split')
print(sss.get_n_splits(X, y))

# Keep the rows of the stratified test part as the sampled data set.  The
# rows are taken by position in one step instead of one ``iloc`` call per row.
train_index, test_index = next(sss.split(X, y))
dts = df.iloc[test_index].reset_index(drop=True)
# Drop the address columns from the SAMPLE; dropping them from ``df`` here
# would throw the sample away and save the complete data set instead.
dts = dts.drop(columns=['ipsrc', 'ipdst'])

print('saving')
dts.to_csv("../results/dataset_110000.csv",
           sep=',',
           encoding='utf-8',
Exemple #37
0
def prepareData():
    """Build padded index sequences and one-hot labels, split and pickle them.

    Loads the word-vector model named ``wiki_model_name`` from
    ``wiki_model_path``, reads the training CSV, maps every token of each
    ``text`` entry to its vocabulary index (0 for unknown tokens), pads the
    sequences, one-hot encodes ``Rating_m`` and performs a stratified
    80/10/10 train/validation/test split.  The resulting arrays, the index
    arrays relative to the full data set and a few sizes are pickled to
    ``saved_data_filename``.

    Raises
    ------
    FileNotFoundError
        If the word-vector model file is not present in ``wiki_model_path``.
    """
    model_file = os.path.join(wiki_model_path, wiki_model_name)
    if wiki_model_name not in os.listdir(wiki_model_path):
        print("Word2vec model not found in {}".format(wiki_model_path))
        # Nothing below can work without the model, so stop with a clear
        # error instead of failing later on an undefined variable.
        raise FileNotFoundError(model_file)
    model = gensim.models.KeyedVectors.load(model_file)

    vec_len = len(model['a'])
    print("Word2vec Vector length {}".format(vec_len))

    SheetsToParse = [
        'AAPL', 'MSFT', 'GE', 'IBM', 'DIS', 'PG', 'AXP', 'BA', 'DD', 'JNJ',
        'KO', 'MCD', 'MMM'
    ]
    #df= parseExcelFileWithMultipleSheetsAndCombine("/datadrive/Sahil/code/GL/fewTrails/twitter/Tweet-Scale.xlsx",SheetsToParse)
    df = pd.read_csv(
        "/datadrive/Sahil/code/GL/fewTrails/twitter/twitter_training.csv")

    #df = pd.read_csv(training_data_csv, encoding='iso-8859-1')
    sentences_len = [len(str(s).split()) for s in df['text']]
    max_len = max(sentences_len) + 20  # 20 margin

    print("Max Sentence length {}".format(max_len))

    V_index_dict = getIndexedDict(model)
    vocab_size = len(V_index_dict)
    embedding_weights = getEmbeddings(vocab_size, vec_len)

    # One list of vocabulary indices per sentence; unknown tokens map to 0.
    data_X = [
        [V_index_dict.get(word, 0)
         for word in word_tokenize(str(sen))[:max_len]]
        for sen in df.text
    ]

    # The ragged list goes straight into pad_sequences: wrapping it in
    # np.array first fails on NumPy versions that reject ragged input.
    data_X = sequence.pad_sequences(data_X, maxlen=max_len)

    y = df.Rating_m
    y = to_categorical(y, num_classes=None)
    print(y)
    print("Shape of Y{}".format(y.shape))

    # First split: 80% training, 20% held out.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
    train_index, test_index1 = next(sss.split(data_X, y))
    print("TRAIN:", train_index, "TEST:", test_index1)
    print("TRAIN:", len(train_index), "TEST:", len(test_index1))
    X_train, X_test = data_X[train_index], data_X[test_index1]
    y_train, y_test = y[train_index], y[test_index1]

    # Second split: the held-out part is halved into validation and test.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=seed)
    val_index, test_index2 = next(sss.split(X_test, y_test))
    print("TRAIN:", val_index, "TEST:", test_index2)
    print("TRAIN:", len(val_index), "TEST:", len(test_index2))
    X_val, X_test = X_test[val_index], X_test[test_index2]
    y_val, y_test = y_test[val_index], y_test[test_index2]

    data = {}

    data["X_train"] = X_train
    data["X_test"] = X_test
    data["X_val"] = X_val
    data["y_train"] = y_train
    data["y_test"] = y_test
    data["y_val"] = y_val

    # val_index / test_index2 are relative to the held-out part, so map them
    # back to positions in the full data set.
    data["train_index"] = train_index
    data["test_index"] = test_index1[test_index2]
    data["val_index"] = test_index1[val_index]

    data["max_len"] = max_len
    data["vec_len"] = vec_len
    data["vocab_size"] = vocab_size
    # The context manager closes (and flushes) the file even on errors.
    with open(saved_data_filename, 'wb') as out_file:
        pickle.dump(data, out_file)
# Balance the classes: keep only as many negative examples as there are
# positive ones.
neg_data = neg_data[:len(pos_data)]
neg_label = neg_label[:len(pos_data)]
#trace_data, trace_label = load_data("data/relevant_documents/english", 1)
#trace_data = np.array(trace_data)
#trace_label = np.array(trace_label)
#
print('split')
# Positives first, then the truncated negatives; labels in the same order.
all_data = []
all_data.extend(pos_data + neg_data)
all_labels = []
all_labels.extend(pos_label + neg_label)
# NOTE(review): Python 2 print statement, while the neighbouring lines use
# the call form -- this line is a syntax error under Python 3.
print len(all_labels), len(all_data)
all_data = np.array(all_data)
all_labels = np.array(all_labels)
print('split')
# Five stratified random splits with 20% held out; reproducible via the seed.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

print('split')
idx = 0
# Training settings.
batch_size = 64
num_classes = 2
epochs = 5

# Checkpoint file name.
filepath = "uk_best.hdf5"

#filepath="weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"

for train_index, test_index in sss.split(all_data, all_labels):
    X_train, X_test = all_data[train_index], all_data[test_index]
    y_train, y_test = all_labels[train_index], all_labels[test_index]
    # Separate reference to the test labels under its own name.
    y_f1_test = y_test
model_results = []

iterations = 100

# oob_score must be a real boolean: the string 'TRUE' only worked by being
# truthy and is rejected by scikit-learn's parameter validation.
# max_features='sqrt' is what 'auto' meant for classifiers; the 'auto' alias
# is no longer accepted by current scikit-learn releases.
model = RandomForestClassifier(n_jobs=-1,
                               random_state=55,
                               min_samples_split=20,
                               n_estimators=500,
                               max_features='sqrt',
                               min_samples_leaf=20,
                               oob_score=True)
modelname = 'RF'

#  Make 'iterations' index vectors for the train-test split
sss = StratifiedShuffleSplit(n_splits=iterations,
                             test_size=0.33,
                             random_state=None)

# Per-split metric collections: "_is" = in-sample, "_oos" = out-of-sample.
accuracy_scores_is = []
accuracy_scores_oos = []
precision_scores_is = []
precision_scores_oos = []
recall_scores_is = []
recall_scores_oos = []
f1_scores_is = []
f1_scores_oos = []

#  Initialize the (2 x 2) confusion-matrix accumulators
cm_sum_is = np.zeros((2, 2))
cm_sum_oos = np.zeros((2, 2))
def classify(
    X,
    y,
    verbose=False,
    nfolds=2,
    dim_red=None,
    n_components=[5, 10, 20],
    scale=True,
    fs=None,
    njobs=1,
    LR_C=[0.01, 0.1, 1, 10, 100],
    LR_class_weight=[None, "balanced"],
    SVC_C=[0.01, 0.1, 1, 10, 100],
    SVC_class_weight=[None, "balanced"],
    SVC_kernels=["rbf", "linear", "poly"],
    n_estimators=[10, 20, 30],
    max_features=["auto", "log2", None],
    **kwargs
):
    """Compare several classifiers with nested, stratified cross-validation.

    Every model is wrapped in a pipeline (optional robust scaling, optional
    dimensionality reduction, feature-selection step, classifier).  For each
    outer split the pipeline's hyper-parameters are tuned with a grid search
    on the inner splits and the tuned model is scored on the outer test part.

    Parameters
    ----------
    X, y : array-like
        Samples and labels; both are indexed with integer index arrays.
    verbose : bool
        Print the call parameters, the grids, the best parameters per split
        and a final summary.
    nfolds : int
        Number of splits for both the inner and the outer splitter.
    dim_red : {None, "pca", "ica"}
        Dimensionality reduction placed before the feature-selection step.
    n_components : list
        Grid for the number of components (PCA/ICA) or features (RFE).
    scale : bool
        Prepend a ``RobustScaler`` to the pipeline.
    fs : {None, "l1", "rfe"} or object
        Feature-selection step; any other value is used as the step itself.
    njobs : int
        ``n_jobs`` of the grid search.
    LR_C, LR_class_weight, SVC_C, SVC_class_weight, SVC_kernels,
    n_estimators, max_features : list
        Hyper-parameter grids of the individual classifiers.  The list
        defaults are only read, never modified.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    tuple
        ``(results, models)``: ``results`` holds one list of outer-split
        accuracies per model, ``models`` the ``(name, estimator, grid)``
        triples in the same order.

    Raises
    ------
    ValueError
        If ``dim_red`` is neither ``None``, ``"pca"`` nor ``"ica"``.
    """
    # spit out to the screen the function parameters, for logging
    if verbose:
        import inspect

        frame = inspect.currentframe()
        args, _, _, values = inspect.getargvalues(frame)
        print('function name "%s"' % inspect.getframeinfo(frame)[2])
        for i in args[2:]:
            print("    %s = %s" % (i, values[i]))

    # prepare configuration for cross validation test harness
    seed = 8

    # prepare models
    models = []
    # all these support multiclass:
    # http://scikit-learn.org/stable/modules/multiclass.html
    models.append(
        (
            "LR",
            LogisticRegression(multi_class="multinomial", solver="newton-cg"),
            {"C": LR_C, "class_weight": LR_class_weight},
        )
    )
    models.append(("LDA", LinearDiscriminantAnalysis(), {}))
    models.append(("RndFor", RandomForestClassifier(), {"n_estimators": n_estimators, "max_features": max_features}))
    models.append(("NB", GaussianNB(), {}))
    models.append(("SVC", SVC(), {"C": SVC_C, "class_weight": SVC_class_weight, "kernel": SVC_kernels}))
    models.append(("Most frequent", DummyClassifier(strategy="most_frequent"), {}))
    models.append(("Stratified", DummyClassifier(strategy="stratified"), {}))

    # spit out to the screen the parameters to be tried in each classifier
    if verbose:
        print("Trying these parameters:")
        for m in models:
            print("%s : %s" % (m[0], m[2]))

    # evaluate each model in turn
    results = []
    names = []
    for name, model, params in models:
        # need to create the CV objects inside the loop because they get used
        # and not get reset!
        inner_cv = StratifiedShuffleSplit(n_splits=nfolds, test_size=0.1, random_state=seed)
        outer_cv = StratifiedShuffleSplit(n_splits=nfolds, test_size=0.1, random_state=seed)
        #     # do this if no shuffling is wanted
        #     inner_cv = StratifiedKFold(n_splits=num_folds, random_state=seed)
        #     outer_cv = StratifiedKFold(n_splits=num_folds, random_state=seed)
        steps = [("clf", model)]
        # Grid keys must carry the pipeline step prefix.
        pipe_params = {"clf__%s" % key: val for key, val in params.items()}

        # Build the selector into a local name.  Overwriting the ``fs``
        # argument would make every model after the first one reuse the
        # selector built for the first model (and drop the RFE grid).
        if fs == "l1":
            lsvc = LinearSVC(C=0.1, penalty="l1", dual=False)
            selector = feature_selection.SelectFromModel(lsvc)
        elif fs == "rfe":
            selector = feature_selection.RFE(estimator=model)
            pipe_params["feat_sel__n_features_to_select"] = n_components
        else:
            selector = fs
        steps = [("feat_sel", selector)] + steps

        if dim_red is not None:
            if dim_red == "pca":
                dr = decomposition.PCA()
            elif dim_red == "ica":
                dr = decomposition.FastICA()
            else:
                raise ValueError("Unknown dim_red option: %r" % (dim_red,))
            pipe_params["dim_red__n_components"] = n_components
            steps = [("dim_red", dr)] + steps
        if scale:
            steps = [("scale", preprocessing.RobustScaler())] + steps

        pipe = Pipeline(steps)
        cv_results = []
        for cnt, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            opt_model = GridSearchCV(estimator=pipe, param_grid=pipe_params, verbose=0, n_jobs=njobs, cv=inner_cv)
            opt_model.fit(X_train, y_train)
            if verbose and params:
                print("Best paramaters for %s  (%d/%d):" % (name, cnt + 1, outer_cv.n_splits))
                print(opt_model.best_params_)
            predictions = opt_model.predict(X_test)
            cv_results.append(metrics.accuracy_score(y_test, predictions))
        results.append(cv_results)
        names.append(name)
    if verbose:
        print("\n======")
        for model, res in zip(models, results):
            msg = "%s: %f (%f)" % (model[0], np.mean(res), np.std(res))
            print(msg)
        # Accuracy expected from uniform random guessing over the classes.
        print("Chance: %f" % (1 / float(len(np.unique(y)))))
        print("======\n")
    return results, models
Exemple #41
0
# Local directory of the housing data set and the URL of its archive.
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"


def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a pandas DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

housing = load_housing_data()


from sklearn.model_selection import StratifiedShuffleSplit

# Bucket the median income into categories and cap the category at 5.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Assign the result back instead of calling where(..., inplace=True) on the
# column selection: the chained in-place form does not update the frame when
# pandas Copy-on-Write is active.
housing["income_cat"] = housing["income_cat"].where(
    housing["income_cat"] < 5, 5.0)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # The splitter yields positional indices, so select rows by position.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

# From here on `housing` is the training part without the target column.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# Numerical attributes only (the text column is removed).
housing_num = housing.drop("ocean_proximity", axis=1)

# Column positions of the attributes named on the left.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Evaluate ``clf`` over stratified shuffle splits and print the metrics.

    The binary confusion counts are accumulated over all splits; accuracy,
    precision, recall, F1 and F2 are then computed from the totals and
    printed together with the classifier.

    Parameters
    ----------
    clf : estimator
        Object with ``fit(features, labels)`` and ``predict(features)``.
    dataset, feature_list
        Forwarded to ``featureFormat`` to build the data matrix.
    folds : int
        Number of stratified shuffle splits.

    Returns
    -------
    None
        Results are printed.  When a metric is undefined (division by zero)
        an explanatory message is printed instead.
    """
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Stops counting the remaining predictions of this split only.
                break

    # Only the arithmetic can divide by zero, so only that is guarded; any
    # other error (for example in the format strings) is no longer reported
    # as a misleading "divide by zero".
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:", clf)
        print(
            "Precision or recall may be undefined due to a lack of true positive predicitons."
        )
        return

    print(clf)
    print(
        PERF_FORMAT_STRING.format(accuracy,
                                  precision,
                                  recall,
                                  f1,
                                  f2,
                                  display_precision=5))
    print(
        RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                     false_positives, false_negatives,
                                     true_negatives))
    print("")
Exemple #43
0
#for word in dictionary:
#    def countTokens(tokens):
#        return tokens.count(word)
#    data[word] = data['tokens'].apply(countTokens)


#data.drop("tokens", axis = 1, inplace = True)
print('counting tokens by file')
# One token array per row, built from that row's token list.
data['tok_array'] = data['tokens'].apply(createTokenArray)

print('saving data')
data.to_csv('data.csv', sep=',', encoding='utf-8')

# The raw tokens are dropped once the arrays exist and the CSV is written.
data.drop("tokens", axis = 1, inplace = True)

# Single 80/20 split, stratified on the "type" column.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data, data["type"]):
    # The splitter yields positional indices; ``.iloc`` selects by position
    # and therefore also works when ``data`` does not carry the default
    # 0..n-1 index (``.loc`` would look the numbers up as labels).
    strat_train_set = data.iloc[train_index]
    strat_test_set = data.iloc[test_index]

def type_proportions(data):
    """Return the share of rows per distinct value of the ``type`` column."""
    type_counts = data["type"].value_counts()
    row_total = len(data)
    return type_counts / row_total

# Class shares of the full data set next to those of each stratified subset.
compare_props = pd.DataFrame({
    "Overall": type_proportions(data),
    "Stratified": type_proportions(strat_train_set),
    "Stratified-test": type_proportions(strat_test_set),
})
compare_props = compare_props.sort_index()

# Relative deviation (in percent) of each subset's share from the overall one.
overall_share = compare_props["Overall"]
for error_column, subset_column in (("Strat. %error", "Stratified"),
                                    ("Strat. test %error", "Stratified-test")):
    compare_props[error_column] = (
        100 * compare_props[subset_column] / overall_share - 100)
Exemple #44
0
def processtarget(inp):
	"""Assemble the data set of one target and collect model predictions.

	Activities are averaged per SMILES and turned into fingerprints by
	``processfile``.  Compounds are labelled active when their activity is at
	least the global ``thresh``; for every emulated experimental error in
	``sdict`` an additional label vector is derived with ``convertPvalue``.
	Inactives are topped up first from the per-gene inactive archives under
	``path_to_pidgin_inactives`` and then from ``getfp``.  Three models (a
	class-weighted random forest, ``prf`` and a random forest regressor) are
	trained on stratified 25%/75% train/test splits and their test
	predictions are collected.

	Parameters
	----------
	inp : tuple
		``(uniprot, infile)``; ``infile`` must support
		``groupby('smiles').mean()`` and provide a ``pchembl_value`` column.

	Returns
	-------
	tuple or None
		``([uniprot, nact, ninact, nse, nscaf], [y_binary, y_lab_raw, y_lab,
		base_predicted1, base_predicted2, base_predicted3], per_fold)``.
		``None`` when ``processfile`` raises ``TypeError``, when there are
		fewer than 100 compounds or fewer than 100 actives, or when the
		evaluation raises ``ValueError``.
	"""
	global thresh
	activity_threshold = thresh
	# index -> emulated standard deviation (0.0, 0.1, ..., 0.8)
	sdict = {idx:i for idx, i in enumerate([round(float(i),2) for i in np.arange(0,.9,0.1)])}
	uniprot,infile = inp
	try:
		matrix,active_scaf,pactivity = processfile(infile.groupby('smiles').mean().reset_index()[['smiles','pchembl_value']].values,file=True)
	except TypeError:
		return
	if len(matrix) < 100: return
	vector = [1 if x >= activity_threshold else 0 for x in pactivity]
	sfvector = []
	#set up cdf for bioactivity scale
	for standard_deviation_threshold in sorted(sdict.values()):
		if standard_deviation_threshold == 0.0:
			sfvector.append(vector)
		else:
			reweighted = convertPvalue(pactivity,activity_threshold,standard_deviation_threshold)
			sfvector.append(reweighted)
	#process the inactive set
	if sum(vector) < 100: return
	print(uniprot)
	nact = sum(vector)
	ninact = len(vector)-sum(vector)
	conf_smiles = []
	egids = uniprot_egid.get(uniprot)
	if egids is not None:
		for egid in egids:
			try:
				with zipfile.ZipFile(path_to_pidgin_inactives + egid + '.smi.zip') as z:
					conf_smiles += [i.split(' ')[0] for i in z.open(egid + '.smi').read().decode('UTF-8').splitlines()]
			except Exception:
				# best effort: an archive that is missing or unreadable is skipped
				pass
	# number of inactives still required: twice the actives, clamped to
	# [1000, 2000], minus the inactives already present (never negative)
	req = nact * 2
	if req < 1000: req = 1000
	if req > 2000: req = 2000
	req -= ninact
	if req < 0: req = 0
	conf_inactives, inactive_scaf = [], []
	#sample inactives if necessary
	if len(conf_smiles) > 0:
		random.seed(2)
		random.shuffle(conf_smiles)
		try:
			random.seed(2)
			conf_inactives,inactive_scaf = calcFingerprints_array(random.sample(conf_smiles,req))
		except ValueError:
			# fewer candidates than required: use all of them
			conf_inactives,inactive_scaf = calcFingerprints_array(conf_smiles)
	conf_smiles = []
	vector2 = []
	for i in conf_inactives:
		if req > 0:
			matrix.append(i)
			vector2.append(0)
			req-=1
	conf_inactives = None
	ninact += len(vector2)
	nse = 0
	if req > 0:
		#fill the remaining gap with random background compounds
		vector2 += [0] * req
		random_bg, random_scaf = getfp(req)
		nse = len(random_bg)
		matrix += random_bg
		inactive_scaf += random_scaf
		# release the temporaries here, where they are known to exist; an
		# unconditional del fails when no background compounds were needed
		del random_bg, random_scaf
	all_scafs = active_scaf+inactive_scaf
	del active_scaf, inactive_scaf
	#replace every scaffold by an integer group id
	scaf_dict = {scaf: scaf_id for scaf_id, scaf in enumerate(set(all_scafs))}
	all_scafs = [scaf_dict[sca] for sca in all_scafs]
	nscaf = len(scaf_dict.keys())
	vector += vector2
	pactivity = np.array(pactivity + [0] * len(vector2), dtype=np.float32)
	sfvector = [s+vector2 for s in sfvector]
	vector2 = None
	matrix = np.array(matrix, dtype=np.uint8)
	vector = np.array(vector, dtype=np.uint8)
	sfvector = [np.array(s) for s in sfvector]
	skf = StratifiedShuffleSplit(n_splits=3, random_state=2, test_size=0.75, train_size=0.25)
	lso = GroupShuffleSplit(n_splits=3, random_state=2, test_size=0.75, train_size=0.25)
	base_predicted1, base_predicted2, base_predicted3 = [], [], []
	y_lab, y_lab_raw, y_binary = [], [], []
	per_fold=[]
	try:
		#remove '[:1]' to enable scaffold splitting
		for split_method, split_name in [(skf,0),(lso,1)][:1]:
			#for each splitting method, perform the evaluation
			for train, test in split_method.split(matrix,vector,groups=all_scafs):
				x, y, X_test,Y_binary, Y_raw = matrix[train], vector[train], matrix[test], vector[test], pactivity[test]
				# classes and y are passed by keyword: current scikit-learn
				# releases reject them as positional arguments
				class_weights = class_weight.compute_class_weight('balanced',classes=np.unique(y),y=y)
				sw = np.array([class_weights[1] if i == 1 else class_weights[0] for i in y])
				rfc = RandomForestClassifier(n_jobs = 1, n_estimators=200, class_weight='balanced', random_state=2)
				###### ###### ###### ###### ###### ###### ###### ###### ######
				brfc=sklearn.base.clone(rfc)
				brfc.fit(x,y,sample_weight=sw)
				#for each emulated experimental error, generate predictions
				for sidx,ystrain in enumerate(sfvector):
					sw2 = ystrain[train]
					#two-column label matrix: column 1 = sw2, column 0 = 1 - sw2
					py=np.zeros([len(sw2),2])
					py[:,1] = sw2
					py[:,0] = 1-py[:,1]
					prfc = prf(n_estimators=200, bootstrap=True, keep_proba=0.05)
					prfc.fit(X=x.astype(float), py=py.astype(float))
					rfr = RandomForestRegressor(n_jobs = 1, n_estimators=200, random_state=2)
					rfr.fit(x,sw2)
					p_prfc = [round(pr,3) for pr in list(np.array(prfc.predict_proba(X=X_test.astype(float)))[:,1])]
					p_brfc = [round(pr,3) for pr in list(brfc.predict_proba(X_test)[:,1])]
					p_rfr = [round(pr,3) for pr in list(np.array(rfr.predict(X_test)))]
					#pair the predictions with the test labels of every emulated error
					for sidx2, ystest in enumerate(sfvector):
						y_test=list(ystest[test])
						#add base rf method output
						base_predicted1 += p_brfc
						#add prf method output
						base_predicted2 += p_prfc
						#add rf regressor output
						base_predicted3 += p_rfr
						y_lab_raw += list(Y_raw)
						y_lab += list(y_test)
						y_binary += list(Y_binary)
						per_fold.append([len(y_test),[split_name,sdict[sidx],sdict[sidx2]]])
	except ValueError: return
	return [uniprot,nact,ninact,nse,nscaf], [y_binary,y_lab_raw,y_lab,base_predicted1,base_predicted2,base_predicted3], per_fold
    return img_data, np.array(_2d_images)


train, labels, test, classes = encode(train, test)
train = train.values

img_data, _2d_images = load_image_data()

# Split the training data into a training and a validation part.
# NOTE(review): n_splits is left at the library default, so several splits
# are generated and only the last one is kept -- confirm whether a single
# split (n_splits=1) was intended.
sss = StratifiedShuffleSplit(test_size=0.2, random_state=23)
for train_index, valid_index in sss.split(train, labels):
    pass  # run through all splits; the indices of the last one remain bound

X_train = train[train_index]
X_valid = train[valid_index]
y_train = labels[train_index]
y_valid = labels[valid_index]
X_train_img = img_data[train_index]
X_valid_img = img_data[valid_index]
X_train_2dimg = _2d_images[train_index]
X_valid_2dimg = _2d_images[valid_index]

X_test = test.values

print("Done")

import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
# %%

import seaborn as sns
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.patches as patches

from utils import get_data, plot_prediction_samples

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics
# %%
imgs, labels = get_data(as_gray=False)

# Indices of the held-out 20%: the splitter is configured for two shuffles,
# but only the first one is ever consumed.
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=42)
_, super_idx = next(sss.split(imgs, labels))

# Stratified 80/20 split of the data itself, using the same test size and
# seed as the splitter above.
# NOTE(review): presumably this is meant to select the same rows as
# `super_idx` -- confirm.
X, _, y, y_super_test = train_test_split(
    imgs,
    labels,
    stratify=labels,
    test_size=0.2,
    random_state=42,
)

# Previously saved prediction arrays, loaded in this fixed order.
(canny_svm_preds,
 hog_svm_preds,
 cnn_preds,
 transfer_preds,
 bagged_cnn_preds) = (
    np.load(path, allow_pickle=True)
    for path in (
        'data/canny_svm_train_preds.npy',
        'data/hog_svm_train_preds.npy',
        'data/cv_cnn_train_preds.npy',
        'data/transfer_cnn_train_preds.npy',
        'data/cv_cnn_super_preds.npy',
    )
)
Exemple #47
0
array([ 0.938...,  0.963...,  0.944...])
"""
# Debug switch: any truthy value echoes the loaded feature matrix.
testing = 1

# The CSV is located relative to the current working directory.
fileDir = os.path.join(
    os.getcwd(), 'MicroMaster', 'AI', 'week7ML', 'input3.csv')
input_data = np.genfromtxt(fileDir, skip_header=1, delimiter=',')

# First two columns are the features, the third column is the target.
X, y = input_data[:, :2], input_data[:, 2]
if testing:
    print(X)

# Cross-validation scheme shared by the grid search below.
test_size, random_state, n_splits = 0.4, 0, 5
cv = StratifiedShuffleSplit(
    n_splits=n_splits,
    test_size=test_size,
    random_state=random_state,
)

# SVM with Linear Kernel
# https://stats.stackexchange.com/questions/31066/what-is-the-influence-of-c-in-svms-with-linear-kernel
# https://stats.stackexchange.com/questions/73032/linear-kernel-and-non-linear-kernel-for-support-vector-machine
"""kernel : string, optional (default=’rbf’)

Specifies the kernel type to be used in the algorithm. It must be one of ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable. If none is given, ‘rbf’ will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape (n_samples, n_samples)."""

kernel = 'linear'
C = [0.1, 0.5, 1, 5, 10, 50, 100]
param_grid = {'C': C}
grid = GridSearchCV(SVC(kernel=kernel), param_grid=param_grid, cv=cv)

# Fit one model per candidate C on every shuffle of the CV scheme.
grid.fit(X, y)
# All the numerical features in the dataset
# print(df.describe())
# Histogram of every feature pandas can plot.
df.hist()
plt.tight_layout()
plt.show()

# Plain random 75/25 train-test split.
train_set, test_set = train_test_split(df, test_size=0.25, random_state=42)

# Stratified split.
# Bucket the median income (ceil of income / 1.5) to get a category to
# stratify on, then merge every bucket from 5 upwards into category 5.0.
df['income_category'] = np.ceil(df['median_income'] / 1.5)
# The capped result is assigned back instead of calling
# where(..., inplace=True) on df['income_category']: an in-place method on a
# selected column is chained assignment, which pandas warns about and which
# leaves df unchanged when Copy-on-Write is active.
df['income_category'] = df['income_category'].where(
    df['income_category'] < 5, 5.0)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
# n_splits=1, so the first (and only) split is taken directly. The splitter
# yields positional indices, hence iloc rather than the label-based loc,
# which is only equivalent when df has a default 0..n-1 index.
train_index, test_index = next(split.split(df, df['income_category']))

# The helper column is removed from the two stratified sets only; df itself
# keeps 'income_category'.
strat_train_set = df.iloc[train_index].drop(['income_category'], axis=1)
strat_test_set = df.iloc[test_index].drop(['income_category'], axis=1)

# Scatter plot of longitude vs latitude; the low alpha makes the more
# densely populated places appear darker.
df.plot(kind='scatter', x='longitude', y='latitude', alpha=0.1)
plt.show()

# Correlation matrix of df (still including 'income_category' at this point).
corr_mat = df.corr()
# print(corr_mat)
Exemple #49
0
    def train(self, datasets):
        ''' Initialize, train and predict a classifier.
        This includes: Feature engineering (i.e. PCA) and
        selection, training clf, (hyper)parameter optimization,
        and a prediction on the test set. Make sure to save
        all variables you want to keep track of in the instance.

        Side effects on the instance: appends to self.learn_size and
        self.coefs, sets self.pipeline, appends to
        self.trained_classifiers (grid-search branch only) and to
        self.n_best_features (only when the pipeline has a
        'feature_selection' step).

        Input:
            datasets:: dict
                Contains train and test x, y under the keys
                'train_x', 'test_x', 'train_y' and 'test_y'

        Output:
            clf:: instance
                Fitted pipeline: the best estimator of the randomized
                search when model_args['grid_search'] is set,
                otherwise the pipeline from self.get_pipeline()
            datasets:: dict
                Dictionary containing the UPDATED train and test
                sets, i.e. the output of every pipeline step except
                the last one. Any new features should be present in
                this dict
            test_y_hat:: array
                Output of clf.predict_proba on the imputed test set
            shap_values::
                Output of shap.TreeExplainer(clf['XGB']).shap_values
                on the transformed test set
            test_x:: DataFrame
                The transformed test set (same object as
                datasets['test_x'])
        '''

        train_x = datasets['train_x']
        test_x = datasets['test_x']
        train_y = datasets['train_y']
        test_y = datasets['test_y']

        # Keep track of the data shapes seen on every call.
        self.learn_size += [{
            'tr_x': train_x.shape,
            'tr_y': train_y.shape,
            'te_x': test_x.shape,
            'te_y': test_y.shape
        }]

        train_x = self.impute_missing_values(train_x)
        test_x = self.impute_missing_values(test_x)

        # Define pipeline
        self.pipeline = self.get_pipeline()

        # Model and feature selection
        # TODO ideally also the feature selection would take place within a CV pipeline

        if self.model_args['grid_search']:
            # Randomized hyperparameter search scored on ROC AUC.
            cv = StratifiedShuffleSplit(n_splits=5,
                                        test_size=0.2,
                                        random_state=self.random_state)
            grid = RandomizedSearchCV(self.pipeline,
                                      param_distributions=self.grid,
                                      cv=cv,
                                      scoring='roc_auc',
                                      n_jobs=-2,
                                      n_iter=50)

            grid.fit(train_x, train_y)
            clf = grid.best_estimator_
            self.trained_classifiers += [clf]
        else:
            # Train classifier without optimization.
            clf = self.pipeline
            clf.fit(train_x, train_y)

        self.coefs.append(clf['XGB'].feature_importances_)

        test_y_hat = clf.predict_proba(test_x)  # Predict

        if 'feature_selection' in clf.named_steps:
            selector = clf['feature_selection']
            f_values = selector.scores_
            p_values = selector.pvalues_
            # Names of the n_features columns with the smallest p-values.
            idx_sorted = np.argsort(p_values)
            columns = train_x.columns[
                idx_sorted[0:self.model_args['n_features']]].to_list()
            self.n_best_features += [[columns, f_values, p_values]]
            print(columns)
        else:
            columns = train_x.columns

        # The transformed frames are rebuilt from bare arrays below, so the
        # original row labels have to be remembered here.
        idx_train = train_x.index
        idx_test = test_x.index

        if self.model_args['add_missing_indicator']:
            # `columns` is a plain list after feature selection and a pandas
            # Index otherwise; list() handles both, whereas .to_list() raised
            # AttributeError on the list.
            # Must be computed before train_x is replaced by its transformed
            # version, because it inspects the NaNs of the current frame.
            new_columns = list(columns)\
                             + ['{}_nan'.format(c)
                                for c in train_x.loc[:, train_x.isna().any()]]
        else:
            new_columns = columns

        # Run the data through every step except the final estimator.
        train_x = pd.DataFrame(clf[:-1].transform(train_x))
        test_x = pd.DataFrame(clf[:-1].transform(test_x))

        train_x.columns = new_columns
        test_x.columns = new_columns

        train_x.index = idx_train
        test_x.index = idx_test

        datasets = {
            "train_x": train_x,
            "test_x": test_x,
            "train_y": train_y,
            "test_y": test_y
        }

        explainer = shap.TreeExplainer(clf['XGB'])
        shap_values = explainer.shap_values(test_x)

        return clf, datasets, test_y_hat, shap_values, test_x
Exemple #50
0
        features_list_selected.append(features_list[index + 1])
    features_list = features_list_selected

    data = featureFormat(my_dataset, features_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    #### Using GridSearchCV with a Stratified Shuffle Split to find best parameters
    from sklearn.model_selection import GridSearchCV
    parameters = {
        'criterion': ('gini', 'entropy'),
        'max_depth': [1, 2, 3, 4, 5],
        'min_samples_leaf': [1, 2, 3, 4, 5],
        'min_samples_split': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
    }
    tree = DecisionTreeClassifier()
    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)
    clf = GridSearchCV(tree, parameters, cv=sss)
    clf.fit(features, labels)
    print clf.best_params_
    best_params = clf.best_params_

    precision_list = []
    recall_list = []
    for count_fit in range(1, 100):
        features_train, features_test, labels_train, labels_test = \
        train_test_split(features, labels, test_size=0.3, random_state=42, stratify = labels)

        clf = DecisionTreeClassifier(
            min_samples_split=best_params['min_samples_split'],
            criterion=best_params['criterion'],
            max_depth=best_params['max_depth'],