def test_stratifiedshufflesplit_list_input():
    """Splits must be the same whether ``y`` is a list of strings, a list of
    numbers or a numpy array."""
    splitter = StratifiedShuffleSplit(test_size=2, random_state=42)
    X = np.ones(7)
    labels_array = np.hstack((np.ones(4), np.zeros(3)))
    labels_str = ['1'] * 4 + ['0'] * 3
    labels_list = labels_array.tolist()

    # The numpy-array labels serve as the reference; every other label
    # container is compared against a fresh split of that reference.
    for candidate in (labels_str, labels_list):
        np.testing.assert_equal(list(splitter.split(X, candidate)),
                                list(splitter.split(X, labels_array)))
Exemple #2
0
    def validate(self):
        '''
        Evaluate a freshly created model on ten stratified shuffle splits.

        For each split a new model comes from ``self.create_model()``, is
        trained on the training rows of ``self.data`` / ``self.labels`` and is
        scored on the held-out rows. The mean accuracy, precision, recall and
        F1 over all splits are printed; nothing is returned.
        '''
        print('Validating new model: {}()'.format(self.__class__.__name__))

        accuracy_scores, precision_scores = [], []
        recall_scores, f1_scores = [], []

        splitter = StratifiedShuffleSplit(n_splits=10)
        for train_index, test_index in splitter.split(self.data, self.labels):
            x_train = self.data[train_index]
            x_test = self.data[test_index]
            y_train = self.labels[train_index]
            y_test = self.labels[test_index]

            # A new model per split, so nothing learned on one split leaks
            # into the next.
            model = self.create_model()
            model.fit(x_train, y_train, epochs=100, batch_size=128,
                      class_weight=self.class_weight)
            y_pred = model.predict_classes(x_test, batch_size=128)

            accuracy_scores.append(accuracy_score(y_test, y_pred))
            precision_scores.append(precision_score(y_test, y_pred))
            recall_scores.append(recall_score(y_test, y_pred))
            f1_scores.append(f1_score(y_test, y_pred))

        print('')
        report = (
            ('Accuracy: {}', accuracy_scores),
            ('Precision: {}', precision_scores),
            ('Recall: {}', recall_scores),
            ('F1-measure: {}', f1_scores),
        )
        for template, values in report:
            print(template.format(np.mean(values)))
Exemple #3
0
    def fit_model(self, X, y):
        """
        Fit a scaled L1 logistic regression on 400 stratified shuffle splits.

        X::pd.DataFrame: Input data
        y::np.ndarray: response for input data

        Per split, the coefficients and the ROC AUC of the hard predictions
        are stored in ``self.res``; the pipeline as fitted on the last split
        is stored under ``'model'``. Returns the result of
        ``self.save_pickle(self.res, self.out)``.
        """
        # Attach the response as a last column so that one in-place shuffle
        # permutes features and response together.
        stacked = np.hstack((X.values, y[:, None]))
        np.random.shuffle(stacked)
        X, y = stacked[:, :-1], stacked[:, -1]

        outer_cv = StratifiedShuffleSplit(n_splits=400)
        inner_cv = StratifiedKFold(n_splits=5)
        steps = [
            ('scaler', StandardScaler()),
            ('lg', linear_model.LogisticRegressionCV(penalty='l1',
                                                     solver='liblinear',
                                                     cv=inner_cv)),
        ]
        clf = Pipeline(steps)

        coefs, aucs = [], []
        self.res = {'coef': coefs, 'auc': aucs, 'model': 0}

        for idx, (train, test) in enumerate(outer_cv.split(X, y)):
            clf.fit(X[train], y[train])
            prediction = clf.predict(X[test])
            coefs.append((idx, clf.named_steps['lg'].coef_[0]))
            aucs.append((idx, roc_auc_score(y[test], prediction)))

        self.res['model'] = clf

        return self.save_pickle(self.res, self.out)
def outer_cv_loop(Xdata,Ydata,clf,parameters=None,
                    n_splits=10,test_size=0.25):
    """
    Score ``clf`` on repeated stratified shuffle splits.

    For each split the train and test rows are imputed separately with
    ``fancyimpute.SoftImpute``; the training rows are rebalanced with
    ``SMOTETomek`` when the mean training label is more than 0.2 away from
    0.5 (presumably 0/1 labels -- confirm with callers); the classifier is
    refit and the ROC AUC of its hard predictions on the test rows is stored.

    Parameters
    ----------
    Xdata : array indexed as ``Xdata[rows, :]``
    Ydata : array indexed as ``Ydata[rows]``
    clf : estimator exposing ``fit``, ``predict`` and ``feature_importances_``
    parameters : unused; kept so existing callers keep working
    n_splits, test_size : forwarded to ``StratifiedShuffleSplit``

    Returns
    -------
    rocscores : list
        One entry per split; ``numpy.nan`` when the test labels or the
        predictions have zero variance.
    importances : list
        ``clf.feature_importances_`` for every split that was fitted.
        Splits skipped for zero-variance test labels add nothing here, so
        the two lists can differ in length.
    """
    importances=[]
    rocscores=[]
    kf=StratifiedShuffleSplit(n_splits=n_splits,test_size=test_size)
    for train,test in kf.split(Xdata,Ydata):
        if numpy.var(Ydata[test])==0:
            # ``varname`` is not defined in this function; the original
            # relied on a module-level global and raised NameError without
            # one. Look it up defensively so a bad fold is only skipped.
            print('zero variance',globals().get('varname'))
            rocscores.append(numpy.nan)
            continue
        Ytrain=Ydata[train]
        Xtrain=fancyimpute.SoftImpute(verbose=False).complete(Xdata[train,:])
        Xtest=fancyimpute.SoftImpute(verbose=False).complete(Xdata[test,:])
        if numpy.abs(numpy.mean(Ytrain)-0.5)>0.2:
            smt = SMOTETomek()
            Xtrain,Ytrain=smt.fit_sample(Xtrain.copy(),Ydata[train])
        clf.fit(Xtrain,Ytrain)
        pred=clf.predict(Xtest)
        # filter out bad folds: AUC is undefined for constant predictions
        if numpy.var(pred)>0:
            rocscores.append(roc_auc_score(Ydata[test],pred))
        else:
            rocscores.append(numpy.nan)
        importances.append(clf.feature_importances_)
    return rocscores,importances
Exemple #5
0
def simple_classification(n_samples=100, n_features=10, random_state=33):
    """
    Build a small synthetic classification problem, split into train/test.

    Parameters
    ----------
    n_samples : int
        Number of samples to generate.
    n_features : int
        Number of features per sample.
    random_state : int
        Seed used for both data generation and the split.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``, where the training part is
        a stratified 60% of the generated samples.
    """
    X, y = datasets.make_classification(
        n_samples=n_samples,
        n_features=n_features,
        random_state=random_state,
    )
    splitter = StratifiedShuffleSplit(
        n_splits=1,
        train_size=0.6,
        random_state=random_state,
    )

    train_rows, test_rows = next(splitter.split(X, y))
    return X[train_rows], X[test_rows], y[train_rows], y[test_rows]
def split(data, test_size):
    """
    Make one stratified train/test split of a dataset object.

    Parameters
    ----------
    data : object with ``data`` and ``target`` attributes
    test_size : forwarded to ``StratifiedShuffleSplit``

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``.
    """
    X, y = np.array(data.data), np.array(data.target)

    # The splitter that offers ``.split(X, y)`` takes ``n_splits``; the old
    # ``n_iter`` keyword is not accepted and raised TypeError here.
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size)
    train, test = next(splitter.split(X, y))

    return X[train], y[train], X[test], y[test]
Exemple #7
0
    def fit(self, X, y, X_test=None, y_test=None):
        """
        Train ``self.model`` on ``X`` / ``y``.

        When both ``X_test`` and ``y_test`` are given, a ``TestLossHistory``
        callback is attached and kept as ``self.test_loss``. When
        ``self.early_stop`` is set, a stratified 10% of the data is held out
        as validation data for an ``EarlyStopping`` callback on ``val_loss``.
        The training history is stored in ``self.history``; returns ``self``.
        """
        super(MLP, self).fit(X, y)

        callbacks = []
        if X_test is not None and y_test is not None:
            self.test_loss = TestLossHistory(X_test, y_test)
            callbacks.append(self.test_loss)

        if self.n_class == 1 and self.n_label > 2:
            # NOTE(review): the unrolled target is never used below; training
            # always runs on ``y``. Confirm whether ``yr`` was meant to
            # replace it.
            yr = unroll(y)

        if not self.early_stop:
            history = self.model.fit(X, y, nb_epoch=self.max_epoch, verbose=self.verbose, callbacks=callbacks)
        else:
            splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
            train_index, val_index = next(iter(splitter.split(X, y)))
            x_train, y_train = X[train_index], y[train_index]
            x_val, y_val = X[val_index], y[val_index]

            callbacks.append(
                EarlyStopping(monitor="val_loss", patience=self.patience, verbose=self.verbose)
            )

            history = self.model.fit(
                x_train,
                y_train,
                nb_epoch=self.max_epoch,
                verbose=self.verbose,
                callbacks=callbacks,
                validation_data=(x_val, y_val),
            )

        self.history = history.history
        return self
    def _get_validation_split(self):
        """Build train/validation row indices for a multi-label CSV.

        Reads ``self.train_csv_file``, turns the space-separated ``tags``
        column into a multi-hot matrix and then, label by label, runs a
        stratified shuffle split on that label's column. Rows not yet
        assigned to either side are added to the running train / validation
        index lists.

        Returns a pair of numpy arrays ``(train_indices, val_indices)``.
        """
        train = pd.read_csv(self.train_csv_file)
        # mapping labels to integer classes
        flatten = lambda l: [item for sublist in l for item in sublist]
        labels = list(set(flatten([l.split(' ') for l in train['tags'].values])))
        label_map = {l: i for i, l in enumerate(labels)}

        # One multi-hot row per CSV row. The unpacking below expects every
        # row of the CSV to have exactly two fields.
        y_train = []
        for f,tags in (train.values):
            targets = np.zeros(len(label_map))
            for t in tags.split(' '):
                targets[label_map[t]] = 1
            y_train.append(targets)

        y_train = np.array(y_train, np.uint8)
        trn_index = []
        val_index = []
        index = np.arange(len(train))
        for i in (range(len(label_map))):
            sss = StratifiedShuffleSplit(n_splits=2, test_size=self.validation_split, random_state=i)
            # Two splits are generated, but only the last one survives this
            # loop and is used below.
            for train_index, test_index in sss.split(index,y_train[:,i]):
                X_train, X_test = index[train_index], index[test_index]
            # Add only rows that are not yet in either list, so that there is
            # no repetition within each side and no overlap between the sides.
            trn_index = trn_index + list(set(X_train) - set(trn_index) - set(val_index))
            val_index = val_index + list(set(X_test) - set(val_index) - set(trn_index))
        return np.array(trn_index), np.array(val_index)
def main():
    """
    Split class-labelled UUIDs into stratified train and test sets.

    Reads one CSV file per class (the UUID is taken from the second column
    of each row), makes a single stratified shuffle split over all UUIDs,
    prints the per-class counts of both sides and writes three CSV files of
    ``uuid,label`` rows: ``<base>.all_uuids.csv``, ``<base>.train_uuids.csv``
    and ``<base>.test_uuids.csv``.
    """
    args = cli_parser().parse_args()

    TEST_PERCENT = args.test_percent
    RAND_STATE = args.rand_state
    OUTPUT_BASE = args.output_base
    CLS_TO_FILEPATH = args.cls_to_cmdProcessedCsv

    # Parse CSV files associated to classes
    cls_uuids = {}
    for cls, filepath in six.iteritems(CLS_TO_FILEPATH):
        # Close the handle deterministically instead of leaking it.
        with open(filepath) as csv_file:
            cls_uuids[cls] = sorted({r[1] for r in csv.reader(csv_file)})

    cls_list = sorted(cls_uuids)
    all_label, all_uuids = \
        zip(*[(cls_name, uuid)
              for cls_name in cls_list
              for uuid in cls_uuids[cls_name]])
    # Transform into numpy array for multi-index access later
    all_label = numpy.array(all_label)
    all_uuids = numpy.array(all_uuids)

    # ``n_splits=1``  -- Only make one train/test split
    sss = StratifiedShuffleSplit(n_splits=1, test_size=TEST_PERCENT,
                                 random_state=RAND_STATE)

    # Get array of index position values of ``all_uuids`` of uuids to use for
    # train and test sets, respectively. The builtin ``next()`` works on
    # Python 2 and 3, unlike the generator's ``.next()`` method.
    train_index, test_index = \
        next(sss.split(numpy.zeros(len(all_label)), all_label))
    uuids_train, uuids_test = all_uuids[train_index], all_uuids[test_index]
    label_train, label_test = all_label[train_index], all_label[test_index]

    for title, labels in (("Train:", label_train), ("Test:", label_test)):
        print(title)
        label_values = labels.tolist()
        for cls_label in cls_list:
            cnt = label_values.count(cls_label)
            print("- %s:\t%d\t(~%.2f %% of total class examples)"
                  % (cls_label, cnt,
                     float(cnt) / len(cls_uuids[cls_label]) * 100))

    # Save out files for use with ``classifier_model_validation``.
    # ``zip`` replaces ``itertools.izip``, which does not exist on Python 3.
    outputs = (
        ('%s.all_uuids.csv', all_uuids, all_label),
        ('%s.train_uuids.csv', uuids_train, label_train),
        ('%s.test_uuids.csv', uuids_test, label_test),
    )
    for name_template, uuids, labels in outputs:
        with open(name_template % OUTPUT_BASE, 'w') as f:
            w = csv.writer(f)
            for uuid, label in zip(uuids, labels):
                w.writerow([uuid, label])
Exemple #10
0
 def robust_coef(self,xwl2,hm_y,n_iter=100):
     """Replace ``self.clf2``'s coefficients by an average over resamples.

     ``self.clf2`` is refit on the training part of ``n_iter`` stratified
     shuffle splits (20% held out, fixed seed). Its ``coef_`` and
     ``intercept_`` are then overwritten with the mean over all fits.
     """
     splitter = StratifiedShuffleSplit(n_splits=n_iter, test_size=.2,random_state=1)
     fitted = []
     for train_rows, _ in splitter.split(xwl2,hm_y):
         self.clf2.fit(xwl2[train_rows,:],hm_y[train_rows])
         fitted.append((self.clf2.coef_, self.clf2.intercept_))
     self.clf2.coef_ = np.stack([coef for coef, _ in fitted]).mean(0)
     self.clf2.intercept_ = np.stack([icpt for _, icpt in fitted]).mean(0)
Exemple #11
0
def load_titanic(test_size=.25, feature_skip_tuple=(), random_state=1999):
    """
    Load the Titanic CSV and return a stratified train/test split.

    Parameters
    ----------
    test_size : forwarded to ``StratifiedShuffleSplit``
    feature_skip_tuple : names of string or numeric features to leave out
    random_state : unused -- the split is seeded with the fixed value 12.
        The parameter is kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(keys, train_data, test_data, train_labels, test_labels)``. The
        data arrays hold the numeric features followed by the one-hot
        encoded string features; ``keys`` is the numeric keys followed by
        the string keys.
    """
    with open(os.path.join('datasets', 'titanic', 'titanic3.csv')) as f:
        f.readline()  # skip the header row; field names are fixed below
        lines = f.readlines()

    string_keys = ['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat',
                   'homedest']
    string_keys = [s for s in string_keys if s not in feature_skip_tuple]
    numeric_keys = ['pclass', 'age', 'sibsp', 'parch', 'fare']
    numeric_keys = [n for n in numeric_keys if n not in feature_skip_tuple]

    n_samples = len(lines)
    numeric_data = np.zeros((n_samples, len(numeric_keys)))
    numeric_labels = np.zeros((n_samples,), dtype=int)
    string_records = []

    # Parse every line exactly once.
    for n, l in enumerate(lines):
        line_dict = process_titanic_line(l)
        string_records.append({k: line_dict[k] for k in string_keys})
        numeric_data[n] = np.asarray([line_dict[k] for k in numeric_keys])
        numeric_labels[n] = line_dict["survived"]

    # ``n_splits`` is the keyword of the splitter that offers ``.split()``;
    # the former ``n_iter`` keyword raised TypeError.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                 random_state=12)
    train_idx, test_idx = next(sss.split(numeric_data, numeric_labels))

    # Pick the string records in the same (shuffled) order as the numeric
    # rows and labels. Collecting them in file order, as before, put
    # categorical features next to the numeric features and labels of a
    # different passenger.
    train_vectorizer_list = [string_records[i] for i in train_idx]
    test_vectorizer_list = [string_records[i] for i in test_idx]

    train_numeric = numeric_data[train_idx]
    test_numeric = numeric_data[test_idx]
    train_labels = numeric_labels[train_idx]
    test_labels = numeric_labels[test_idx]

    vec = DictVectorizer()
    # .toarray() due to returning a scipy sparse array
    train_categorical = vec.fit_transform(train_vectorizer_list).toarray()
    test_categorical = vec.transform(test_vectorizer_list).toarray()
    train_data = np.concatenate([train_numeric, train_categorical], axis=1)
    test_data = np.concatenate([test_numeric, test_categorical], axis=1)
    keys = numeric_keys + string_keys
    return keys, train_data, test_data, train_labels, test_labels
Exemple #12
0
def shuffled_split(housing):
  """
  Split ``housing`` 80/20, stratified on its ``income_cat`` column.

  ``add_income_category`` is called on ``housing`` first; the
  ``income_cat`` column is removed again from both returned sets.

  Returns ``(strat_train_set, strat_test_set)``.
  """
  add_income_category(housing)
  split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
  train_index, test_index = next(split.split(housing, housing["income_cat"]))

  # The splitter yields row positions, so select with ``iloc``. ``loc``
  # only gives the same rows while the index is the default 0..n-1 range.
  # A non-inplace ``drop`` avoids modifying a slice of ``housing``.
  strat_train_set = housing.iloc[train_index].drop("income_cat", axis=1)
  strat_test_set = housing.iloc[test_index].drop("income_cat", axis=1)

  return strat_train_set, strat_test_set
def test_stratified_shuffle_split_overlap_train_test_bug():
    """Train and test indices of a split must never overlap.

    Regression test for
    https://github.com/scikit-learn/scikit-learn/issues/6121
    """
    y = [0, 1, 2, 3] * 3 + [4, 5] * 5
    X = np.ones_like(y)

    splitter = StratifiedShuffleSplit(n_splits=1,
                                      test_size=0.5, random_state=0)
    first_split = next(iter(splitter.split(X=X, y=y)))
    train_rows, test_rows = first_split

    shared = np.intersect1d(train_rows, test_rows)
    assert_array_equal(shared, [])
def _split_data(X, y, p_train=0.5, seed=None):
    """
    Make one stratified split of ``X`` (data) and ``y`` (labels).

    ``p_train`` is passed to the splitter as ``train_size`` and ``seed`` as
    ``random_state``. Returns ``((X_train, y_train), (X_test, y_test))``.
    """
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=None,
                                      train_size=p_train, random_state=seed)

    train_rows, test_rows = next(iter(splitter.split(X, y)))
    train_part = (X[train_rows], y[train_rows])
    test_part = (X[test_rows], y[test_rows])

    return train_part, test_part
def test_stratified_shuffle_split_even():
    """Every index must be drawn with equal chance by StratifiedShuffleSplit.

    For two sample sizes, many splits are drawn and the number of times each
    index lands in the train / test side is compared with a binomial
    distribution.
    """
    n_folds = 5
    n_iter = 1000

    def assert_counts_are_ok(idx_counts, p):
        # Here we test that the distribution of the counts
        # per index is close enough to a binomial.
        # ``n_splits`` comes from the enclosing scope and holds the number
        # of splits that were actually generated.
        threshold = 0.05 / n_splits
        bf = stats.binom(n_splits, p)
        for count in idx_counts:
            p = bf.pmf(count)
            assert_true(p > threshold,
                        "An index is not drawn with chance corresponding "
                        "to even draws")

    for n_samples in (6, 22):
        labels = np.array((n_samples // 2) * [0, 1])
        # The splitter that offers ``.split()`` takes ``n_splits``; passing
        # the old ``n_iter`` keyword raised TypeError.
        splits = StratifiedShuffleSplit(n_splits=n_iter,
                                        test_size=1. / n_folds,
                                        random_state=0)

        train_counts = [0] * n_samples
        test_counts = [0] * n_samples
        n_splits = 0
        for train, test in splits.split(X=np.ones(n_samples), y=labels):
            n_splits += 1
            for counter, ids in [(train_counts, train), (test_counts, test)]:
                for idx in ids:
                    counter[idx] += 1
        assert_equal(n_splits, n_iter)

        n_train, n_test = _validate_shuffle_split(n_samples,
                                                  test_size=1./n_folds,
                                                  train_size=1.-(1./n_folds))

        # ``train`` / ``test`` hold the last generated split here.
        assert_equal(len(train), n_train)
        assert_equal(len(test), n_test)
        assert_equal(len(set(train).intersection(test)), 0)

        label_counts = np.unique(labels)
        assert_equal(splits.test_size, 1.0 / n_folds)
        assert_equal(n_train + n_test, len(labels))
        assert_equal(len(label_counts), 2)
        ex_test_p = float(n_test) / n_samples
        ex_train_p = float(n_train) / n_samples

        assert_counts_are_ok(train_counts, ex_train_p)
        assert_counts_are_ok(test_counts, ex_test_p)
Exemple #16
0
def test_classifier(clf, dataset, feature_list, folds = 1000):
    """
    Evaluate ``clf`` on ``folds`` stratified shuffle splits and print scores.

    ``dataset`` and ``feature_list`` are converted with ``featureFormat`` and
    ``targetFeatureSplit``. The confusion counts are accumulated over all
    splits and accuracy, precision, recall, F1 and F2 are printed. When a
    metric is undefined because a denominator is zero, a notice is printed
    instead. Nothing is returned.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Stops scoring the current split only; later splits are
                # still evaluated.
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
    except ZeroDivisionError:
        # Only the undefined-metric case is handled here. A bare ``except``
        # would also hide unrelated errors behind this message.
        print("Got a divide by zero when trying out:", clf)
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
    else:
        print(clf)
        print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
        print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
        print("")
Exemple #17
0
    def gen_sample_array(self):
        """
        Return the sample indices of one stratified 50/50 shuffle split.

        The labels come from ``self.class_vector``; the feature matrix given
        to the splitter is random filler. The result is the train indices
        followed by the test indices of the first generated split.

        Raises
        ------
        ImportError
            When scikit-learn is not installed.
        """
        try:
            from sklearn.model_selection import StratifiedShuffleSplit
        except ImportError:
            # Keep the hint, but fail here with the real cause instead of
            # falling through to a NameError on the next statement.
            print('Need scikit-learn for this functionality')
            raise
        import numpy as np

        s = StratifiedShuffleSplit(n_splits=self.n_splits, test_size=0.5)
        X = th.randn(self.class_vector.size(0),2).numpy()
        y = self.class_vector.numpy()

        train_index, test_index = next(s.split(X, y))
        return np.hstack([train_index, test_index])
def main_cv_loop(Xdata,Ydata,clf,parameters,
                n_folds=4,oversample_thresh=0.1,verbose=False):
    """
    Outer evaluation loop around ``inner_cv_loop``.

    ``n_folds`` stratified shuffle splits (20% test rows each) are drawn.
    Per split the training rows are rebalanced with ``SMOTETomek`` when the
    mean training label is more than 0.2 away from 0.5, ``inner_cv_loop``
    selects an estimator, and its predictions for the test rows are written
    into full-length prediction arrays.

    Parameters
    ----------
    Xdata : array indexed as ``Xdata[rows, :]``
    Ydata : array indexed as ``Ydata[rows]``
    clf, parameters : forwarded to ``inner_cv_loop``
    n_folds : number of shuffle splits (default 4)
    oversample_thresh : only feeds the unused local ``oversample`` flag
    verbose : print progress messages

    Returns
    -------
    tuple
        ``(weighted ROC AUC of pred against Ydata, Ydata, pred, pred_proba)``.
        Rows that never land in a test split keep their initial value 0.
    """
    # use stratified K-fold CV to get roughly equal folds
    #kf=StratifiedKFold(n_splits=nfolds)
    # Honour ``n_folds``; the split count used to be hard-coded to 4, so the
    # argument was silently ignored.
    kf=StratifiedShuffleSplit(n_splits=n_folds,test_size=0.2)
    # use oversampling if the difference in prevalence is greater than 20%
    # NOTE(review): ``oversample`` is never read; the per-split check in the
    # loop below decides on its own with a fixed 0.2 threshold.
    if numpy.abs(numpy.mean(Ydata)-0.5)>oversample_thresh:
        oversample='smote'
    else:
        oversample='none'

    # variables to store outputs
    pred=numpy.zeros(len(Ydata))  # predicted values
    pred_proba=numpy.zeros(len(Ydata))  # predicted probabilities
    # Collected per split but not returned.
    kernel=[]
    C=[]
    fa_ctr=0

    for train,test in kf.split(Xdata,Ydata):
        Xtrain=Xdata[train,:]
        Xtest=Xdata[test,:]
        Ytrain=Ydata[train]
        if numpy.abs(numpy.mean(Ytrain)-0.5)>0.2:
            if verbose:
                print('oversampling using SMOTETomek')
            sm = SMOTETomek()
            Xtrain, Ytrain = sm.fit_sample(Xtrain, Ytrain)

        best_estimator_,bestroc,fa=inner_cv_loop(Xtrain,Ytrain,clf,
                    parameters,verbose=True)
        if not fa is None:
            if verbose:
                print('transforming using fa')
                print(fa)
            tmp=fa.transform(Xtest)
            Xtest=tmp
            fa_ctr+=1
        # NOTE(review): ``predict_proba`` usually returns one column per
        # class, while only ``len(test)`` slots are assigned here -- confirm
        # that the intended column ends up in ``pred_proba``.
        pred_proba.flat[test]=best_estimator_.predict_proba(Xtest)
        pred.flat[test]=best_estimator_.predict(Xtest)
        kernel.append(best_estimator_.kernel)
        C.append(best_estimator_.C)
    return roc_auc_score(Ydata,pred,average='weighted'),Ydata,pred,pred_proba
Exemple #19
0
def start_to_fit(X, y):
    """
    Compare a fixed set of classifiers by mean accuracy and print the result.

    Every classifier is fitted and scored on the same 10 stratified shuffle
    splits (70% train / 30% test, fixed seed). A table with one row per
    classifier class name and its mean accuracy is printed. Nothing is
    returned.
    """
    classifiers = [
        KNeighborsClassifier(3),
        SVC(probability=True),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        LogisticRegression()]

    res_cols = ['Classifier','Accuracy']

    # Single source for the split count, so the averaging below cannot
    # drift away from the number of splits actually run.
    n_splits = 10
    data_set = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.3, train_size=0.7, random_state=0)

    # Sum of accuracies per classifier class name.
    accuracy_dic = {}

    for train_index, test_index in data_set.split(X, y):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        for clf in classifiers:
            name = clf.__class__.__name__
            clf.fit(X_train, y_train)
            accuracy = accuracy_score(y_test, clf.predict(X_test))
            accuracy_dic[name] = accuracy_dic.get(name, 0.0) + accuracy

    # ``DataFrame.append`` no longer exists in current pandas; build the
    # one-row frames and concatenate them once.
    entries = [pd.DataFrame([[name, total / float(n_splits)]], columns=res_cols)
               for name, total in accuracy_dic.items()]
    res = pd.concat(entries)

    # Function-call form: valid on Python 3, same output on Python 2.
    print(res)
def train_and_test(raw_data, label="Qw", degree=1, p=0.1):
    """
    Split ``raw_data`` 80/20, stratified on ``isGood``, and transform both parts.

    Each part is passed through ``my_transform(part, label, degree)``; the
    last column of the transformed array is taken as the target and the
    remaining columns as features. ``p`` is not used.

    Returns ``(train_set, train_y, test_set, test_y)``.
    """
    from sklearn.model_selection import StratifiedShuffleSplit

    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=142)
    train_rows, test_rows = next(splitter.split(raw_data, raw_data["isGood"]))

    def features_and_target(rows):
        # Transform one part and peel the target off the last column.
        transformed = my_transform(raw_data.iloc[rows], label, degree)
        return transformed[:, :-1], transformed[:, -1]

    train_set, train_y = features_and_target(train_rows)
    test_set, test_y = features_and_target(test_rows)
    return (train_set, train_y, test_set, test_y)
def splitTrainTest(inputDF,random_state):
    """
    Split ``inputDF`` 80/20, stratified on a derived income category.

    An ``income_category`` column (``ceil(median_income / 1.5)``, capped at
    5.0) is added to ``inputDF`` itself and stays there; it is removed from
    the two returned sets. The relative category sizes of the whole frame
    are printed.

    Parameters
    ----------
    inputDF : DataFrame with a ``median_income`` column
    random_state : unused -- the stratified split is seeded with the fixed
        value 19. Kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(stratifiedTrainSet, stratifiedTestSet)``.
    """
    inputDF["income_category"] = np.ceil(inputDF["median_income"]/1.5)
    # Assign the capped column back explicitly; an in-place ``where`` on a
    # column selection is not guaranteed to write through to the frame.
    inputDF["income_category"] = inputDF["income_category"].where(
        inputDF["income_category"] < 5.0, 5.0)

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2,random_state=19)
    trainIndices, testIndices = next(
        split.split(inputDF, inputDF["income_category"]))

    print('\ninputDF["income_category"].value_counts() / len(inputDF)')
    print(   inputDF["income_category"].value_counts() / len(inputDF) )

    # The splitter yields row positions, hence ``iloc``; ``loc`` is only
    # equivalent for the default 0..n-1 index. A non-inplace ``drop``
    # avoids modifying a slice and avoids shadowing the builtin ``set``.
    stratifiedTrainSet = inputDF.iloc[trainIndices].drop(["income_category"], axis=1)
    stratifiedTestSet = inputDF.iloc[testIndices].drop(["income_category"], axis=1)

    return( stratifiedTrainSet , stratifiedTestSet )
Exemple #22
0
 def suffle_hm(self,x,y,gamma=0.5,n_iter=50):
     """Estimate per sample how often ``self.clf1`` predicts it correctly.

     ``self.clf1`` is fitted on the training part of ``n_iter`` stratified
     shuffle splits (25% held out, fixed seed). For every sample, the
     fraction of held-out appearances with a correct prediction is computed.
     Finally ``self.clf1`` is refitted on all data.

     Returns ``(flags, proba)``: ``proba`` is that fraction per sample and
     ``flags`` is 1 where ``proba >= gamma``, else 0.
     """
     times_tested = np.zeros_like(y).astype(float)
     times_correct = np.zeros_like(y).astype(float)
     splitter = StratifiedShuffleSplit(n_splits=n_iter, test_size=.25,random_state=1)
     for train_rows, test_rows in splitter.split(x,y):
         self.clf1.fit(x[train_rows,:],y[train_rows])
         times_tested[test_rows] += 1.
         is_correct = self.clf1.predict(x[test_rows,:])==y[test_rows]
         times_correct[test_rows] += is_correct.astype(float)
     proba = times_correct/times_tested
     if self.verbose:
         print(times_tested)
         print(proba)
     self.clf1.fit(x,y)
     flags = (proba>=gamma).astype(int)
     return flags,proba
def splitTrainTest(inputDF,random_state):
    """
    Split ``inputDF`` 80/20 into a train and a test set.

    When ``sklearn.model_selection`` can be located, the split is stratified
    on a derived ``income_category`` column (``ceil(median_income / 1.5)``,
    capped at 5.0). That column is added to ``inputDF`` itself and stays
    there; it is removed from the returned sets. Otherwise a plain
    ``train_test_split`` seeded with ``random_state`` is used.

    Parameters
    ----------
    inputDF : DataFrame; needs a ``median_income`` column for the stratified
        path
    random_state : seed of the plain split only; the stratified split uses
        the fixed seed 19

    Returns
    -------
    tuple
        ``(trainSet, testSet)``.
    """
    ms_spec = importlib.util.find_spec(name="sklearn.model_selection")
    if ms_spec is None:
        trainSet, testSet = train_test_split(inputDF, test_size=0.2, random_state=random_state)
    else:
        inputDF["income_category"] = np.ceil(inputDF["median_income"]/1.5)
        # Assign the capped column back explicitly; an in-place ``where`` on
        # a column selection is not guaranteed to write through.
        inputDF["income_category"] = inputDF["income_category"].where(
            inputDF["income_category"] < 5.0, 5.0)

        split = StratifiedShuffleSplit(n_splits=1, test_size=0.2,random_state=19)
        trainIndices, testIndices = next(
            split.split(inputDF, inputDF["income_category"]))

        print('\nincome category relative sizes (whole data set)')
        print(   inputDF["income_category"].value_counts() / len(inputDF) )

        # The splitter yields row positions, hence ``iloc``; ``loc`` is only
        # equivalent for the default 0..n-1 index. A non-inplace ``drop``
        # avoids modifying a slice and avoids shadowing the builtin ``set``.
        trainSet = inputDF.iloc[trainIndices].drop(["income_category"], axis=1)
        testSet = inputDF.iloc[testIndices].drop(["income_category"], axis=1)

    return( trainSet , testSet )
Exemple #24
0
    def fit_model(self, X, y):
        """
        Fit a scaler / feature-selection / extra-trees pipeline on 400
        stratified shuffle splits.

        X::pd.DataFrame: Input data
        y::np.ndarray: response for input data

        Per split, the feature-selection mask, the feature importances and
        the ROC AUC of the hard predictions are stored in ``self.res``; the
        pipeline as fitted on the last split is stored under ``'model'``.
        Returns the result of ``self.save_pickle(self.res, self.out)``.
        """
        outer_cv = StratifiedShuffleSplit(n_splits=400)

        steps = [
            ('scaler', StandardScaler()),
            ('fs', CustFsNoiseWinnow()),
            ('et', ExtraTreesClassifier(n_estimators=2000)),
        ]
        clf = Pipeline(steps)

        masks, importances, aucs = [], [], []
        self.res = {'mask': masks, 'fimp': importances, 'auc': aucs, 'model': 0}

        for idx, (train, test) in enumerate(outer_cv.split(X, y)):
            clf.fit(X[train], y[train])
            prediction = clf.predict(X[test])
            fitted = clf.named_steps
            masks.append((idx, fitted['fs'].mask_))
            importances.append((idx, fitted['et'].feature_importances_))
            aucs.append((idx, roc_auc_score(y[test], prediction)))

        self.res['model'] = clf
        return self.save_pickle(self.res, self.out)
           'covariance_estimator', 'min_region_size_in_mm3']
# One empty list per result column; presumably filled while the analysis
# runs (the code doing so is not part of this excerpt).
results = dict()
for column_name in columns:
    results.setdefault(column_name, [])
print(results)

##############################################################################
# Run the analysis now
# --------------------
import pandas as pd

# Dimensionalities to evaluate for every model.
dimensions = [40, 60, 80, 100, 120, 150, 200, 300]
folder_name = name + str(n_iter) + '_kmeans_list_dim_graphlasso'
for model in ['kmeans']:
    for dim in dimensions:
        # A new split iterator is created for every dimensionality.
        iter_for_prediction = cv.split(func_imgs, classes)
        for index, (train_index, test_index) in enumerate(iter_for_prediction):
            all_results = draw_predictions(
                imgs=func_imgs,
                labels=labels, groups=classes, index=index,
                dimensionality=dim,
                train_index=train_index, test_index=test_index,
                scoring='roc_auc', models=model, atlases=None,
                masker=masker, connectomes=connectomes,
                confounds=motion_confounds,
                confounds_mask_img=gm_mask,
                connectome_regress_confounds=connectome_regress_confounds)
            print(index)
            # Dump the results
            for model_ in all_results.models_:
                # Output location: <folder_name>/<model>/<dim>/<split index>
                save_path = os.path.join(folder_name, model_, str(dim), str(index))
# **Exercise for part 3**: Use the code below to test what value of
# `n_neighbors` works best for the given data. *Note: do NOT change the metric
# to be anything other than `'euclidean'`. Other distance functions are not
# optimized for the amount of data we are working with.*
#
# **Question for part 3**: What is the accuracy of the best classifier you can
# create for this data (by changing only the `n_neighbors` parameter)?

#%%
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from IPython.html import widgets

# A single stratified 80/20 split of the data set.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, train_size=0.8)
cv = sss.split(X=ds.data, y=ds.target)

# fill in the training and testing data and save as separate variables
# (with n_splits=1 the loop body runs exactly once)
for trainidx, testidx in cv:
    # note that these are sparse matrices
    X_train = ds.data[trainidx]
    X_test = ds.data[testidx]
    y_train = ds.target[trainidx]
    y_test = ds.target[testidx]

# fill in your code  here to train and test
# calculate the accuracy and print it for various values of K
clf = KNeighborsClassifier(weights='uniform', metric='euclidean')
accuracies = []
for k in range(1, 10):
    # Reuse the same classifier object; only the neighbour count changes.
    clf.n_neighbors = k
Exemple #27
0
# Address of the housing archive, built from DOWNLOAD_ROOT and HOUSING_PATH.
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

housing = load_housing_data()


from sklearn.model_selection import StratifiedShuffleSplit

# Derive an income category from median_income: ceil(income / 1.5), with
# every value of 5 or more replaced by 5.0.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"].where(housing["income_cat"] < 5, 5.0, inplace=True)

# One 80/20 split, stratified on the income category.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # NOTE(review): the split yields row positions; ``.loc`` selects the
    # same rows only while ``housing`` has the default 0..n-1 index.
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

# From here on ``housing`` is the training set without the target column.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

housing_num = housing.drop("ocean_proximity", axis=1)

# Column positions; presumably indices into the numeric feature array --
# confirm against the code that uses them.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    # Transformer skeleton: the constructor stores one flag and ``fit`` is a
    # no-op. No ``transform`` method is visible in this excerpt.
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        # Stored as given; not read anywhere in the visible part of the class.
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
Exemple #28
0
# Expand the per-row 'target_v' entries into one column per key of
# label_mapping.
train_labels[list(label_mapping.keys())] = pd.DataFrame(
    train_labels['target_v'].values.tolist(), index=train_labels.index)
# Turn every Id into a file name by appending the '_green.png' suffix.
train_labels['Id'] = train_labels['Id'] + '_green.png'
# Sum of 'target_v' over all rows; presumably a per-class count vector --
# confirm the element type of 'target_v'.
class_count = train_labels['target_v'].sum()

#  Split
from sklearn.model_selection import StratifiedShuffleSplit


def coalesce(arr):
    """Return the element of *arr* whose ``class_count`` entry is smallest.

    *arr* holds indices into the module-level ``class_count``; on ties the
    first such element wins (``np.argmin`` semantics).
    """
    counts_for_arr = np.array(class_count)[arr]
    rarest_position = np.argmin(counts_for_arr)
    return arr[rarest_position]


# Collapse each row's 'Target' to a single label so it can drive stratification.
train_labels['y_coal'] = train_labels['Target'].apply(coalesce)

# One stratified 70/30 shuffle split; the placeholder X only supplies the
# number of samples.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_index, val_index = next(
    sss.split(np.zeros(len(train_labels)), train_labels['y_coal']))

# Take the rows by position and give both frames a fresh 0..n-1 index.
df_train = train_labels.iloc[train_index].reset_index(drop=True)
df_val = train_labels.iloc[val_index].reset_index(drop=True)

#  some gc collection
del train_labels
gc.collect()

##################
###     Model
##################

# constants and utils
Exemple #29
0
def read_data(filename):
    """Load the CSV at *filename* and split it into features and target.

    Returns a ``(X_df, y_array)`` pair: the frame without the 'Survived' and
    'PassengerId' columns, and the 'Survived' column as a NumPy array.
    """
    frame = pd.read_csv(filename)
    target = frame['Survived'].values
    features = frame.drop(columns=['Survived', 'PassengerId'])
    return features, target


# Script entry point: evaluate the feature extractor + classifier over five
# stratified shuffle splits of the training file.
if __name__ == '__main__':

    print("Reading file ...")
    X_df, y_array = read_data(train_filename)
    # NOTE: despite the name `skf`, this is a StratifiedShuffleSplit
    # (5 splits, 20% of the rows held out in each).
    skf = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=57)

    print("Training file ...")
    scores = []
    for train_is, test_is in skf.split(X_df, y_array):
        print('--------------------------')
        # Positional selection: .iloc for the DataFrame, plain indexing for
        # the target array.
        X_train_df = X_df.iloc[train_is]
        y_train_array = y_array[train_is]
        X_test_df = X_df.iloc[test_is]
        y_test_array = y_array[test_is]

        # Fit the feature extractor on the training rows only, then apply the
        # same transformation to both subsets.
        fe = feature_extractor.FeatureExtractor()
        fe.fit(X_train_df, y_train_array)
        X_train_array = fe.transform(X_train_df)
        X_test_array = fe.transform(X_test_df)

        # Train on the transformed training rows and predict class
        # probabilities for the held-out rows.
        clf = classifier.Classifier()
        clf.fit(X_train_array, y_train_array)
        y_proba = clf.predict_proba(X_test_array)
Exemple #30
0
# Plain random 80/20 split, kept for comparison with the stratified one below.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
#print(f"rows in train set: {len(train_set)}\n rows in test set: {len(test_set)} ")

# In[9]:

print(len(train_set))

# In[10]:

print(len(test_set))

# In[11]:

from sklearn.model_selection import StratifiedShuffleSplit
# Stratify on 'CHAS' so both subsets keep its class proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing['CHAS']):
    # split() yields positional indices; .iloc selects the right rows even
    # when the frame's index is not the default 0..n-1 range.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

# In[12]:

strat_test_set['CHAS'].value_counts()

# In[13]:

# Continue the analysis on a copy of the training subset only.
housing = strat_train_set.copy()

# ## Next heading

# ## Correlation matrix
#
Exemple #31
0
## Exercises

### 1. Clustering

from sklearn.datasets import fetch_olivetti_faces

olivetti = fetch_olivetti_faces()

print(olivetti.DESCR)

olivetti.target

from sklearn.model_selection import StratifiedShuffleSplit

# First split: hold out 40 samples as the test set, stratified on the target.
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=40, random_state=42)
for train_valid_idx, test_idx in strat_split.split(olivetti.data,
                                                   olivetti.target):
    X_train_valid, y_train_valid = (olivetti.data[train_valid_idx],
                                    olivetti.target[train_valid_idx])
    X_test, y_test = olivetti.data[test_idx], olivetti.target[test_idx]

# Second split: carve 80 validation samples out of the remaining data.
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=80, random_state=43)
for train_idx, valid_idx in strat_split.split(X_train_valid, y_train_valid):
    X_train, y_train = X_train_valid[train_idx], y_train_valid[train_idx]
    X_valid, y_valid = X_train_valid[valid_idx], y_train_valid[valid_idx]

# Report the shapes of the three subsets.
print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)
print(X_test.shape, y_test.shape)
Exemple #32
0
        'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
        'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
        'stalk-surface-below-ring', 'stalk-color-above-ring',
        'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
        'ring-type', 'spore-print-color', 'population', 'habitat'
    ])

# Target column, kept as a one-column DataFrame.
mushroom_targets = mushroom_data[['E/P']]

#Complete a Stratified Shuffle Split to dataset in 80/20 ratio
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=43)

# Only the last of the five splits is used downstream, so unpack just that
# one instead of building train/test frames for every split.
*_, (train_index, test_index) = sss.split(mushroom_data, mushroom_targets)
# split() yields positional indices, so rows are selected with .iloc rather
# than label-based .loc.
mushroom_train_set = mushroom_data.iloc[train_index]
mushroom_test_set = mushroom_data.iloc[test_index]

#Split the training and test datasets into inputs and targets dataframes
mushroom_train_inputs = mushroom_train_set.drop(['E/P'], axis=1)
mushroom_train_targets = mushroom_train_set[['E/P']]

mushroom_test_inputs = mushroom_test_set.drop(['E/P'], axis=1)
mushroom_test_targets = mushroom_test_set[['E/P']]

#Encode the categorical input columns and print the total number of columns from each encoding
train_inputs = mushroom_train_set.drop('E/P', axis=1)

#One-Hot Encoding (scikit func)
from sklearn.preprocessing import OneHotEncoder
Exemple #33
0
def main():
    """Run the housing-price regression walkthrough end to end.

    Fetches and loads the housing data, builds a train/test split stratified
    on binned median income, draws exploratory plots, prepares the features
    with an impute / add-attributes / scale / one-hot pipeline, fits linear,
    decision-tree and random-forest regressors, grid-searches the forest and
    prints the final test-set RMSE with a 95% confidence interval.

    Takes no arguments and returns ``None``; all results are printed.
    """
    fetch_housing_data()  # Pull Data
    housing_df = load_housing_data()

    # --- EXPLORE DATA ---
    housing_with_id = housing_df.reset_index()  # adds an `index` column
    # Create id using lat and lon
    housing_with_id["id"] = (housing_df["longitude"] * 1000 +
                             housing_df["latitude"])

    # Income bins used as the stratification key for the train/test split.
    income_bins = [0., 1.5, 3.0, 4.5, 6., np.inf]
    income_labels = [1, 2, 3, 4, 5]
    housing_with_id["income_cat"] = pd.cut(housing_with_id["median_income"],
                                           bins=income_bins,
                                           labels=income_labels)
    housing_df["income_cat"] = pd.cut(housing_df["median_income"],
                                      bins=income_bins,
                                      labels=income_labels)

    housing_df["income_cat"].hist()

    # Train and test set that is NOT stratified (kept for comparison only).
    train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "id")

    # Train and test set that IS stratified on the income category.
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_index, test_index = next(
        split.split(housing_df, housing_df["income_cat"]))
    # split() yields positional indices, hence .iloc. The helper column is
    # dropped by rebinding instead of mutating the selections in place.
    strat_train_set = housing_df.iloc[train_index].drop("income_cat", axis=1)
    strat_test_set = housing_df.iloc[test_index].drop("income_cat", axis=1)

    # --- ADDITIONAL VISUALIZATION ---
    housing_copy = strat_train_set.copy()
    housing_copy.plot(
        kind="scatter",
        x="longitude",
        y="latitude",
        alpha=0.4,
        s=housing_copy['population'] / 100,
        label='population',
        figsize=(10, 7),
        c='median_house_value',
        cmap=plt.get_cmap('jet'),
        colorbar=True,
    )

    from pandas.plotting import scatter_matrix

    # plot matrix scatter plots for several attributes
    attributes = [
        'median_house_value', 'median_income', 'total_rooms',
        'housing_median_age'
    ]
    scatter_matrix(housing_copy[attributes], figsize=(12, 8))

    housing_copy.plot(kind="scatter",
                      x="median_income",
                      y="median_house_value",
                      alpha=0.1)

    # --- CREATE ADDITIONAL ATTRIBUTES ---
    housing_copy['rooms_per_household'] = (housing_copy['total_rooms'] /
                                           housing_copy['households'])
    housing_copy['bedrooms_per_room'] = (housing_copy['total_bedrooms'] /
                                         housing_copy['total_rooms'])
    housing_copy['population_per_households'] = (housing_copy['population'] /
                                                 housing_copy['households'])

    # Correlate numeric columns only: the frame still holds the text column
    # 'ocean_proximity', which corr() cannot convert to float.
    corr_matrix = housing_copy.select_dtypes(include=[np.number]).corr()
    # Result is only meant for interactive inspection.
    corr_matrix['median_house_value'].sort_values(ascending=False)

    # --- DATA CLEANING ---
    housing_copy = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set['median_house_value'].copy()

    # The imputer learns one median per numerical column.
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy="median")

    # remove non-numerical field
    housing_num = housing_copy.drop("ocean_proximity", axis=1)
    imputer.fit(housing_num)  # estimate using fit()

    # transform housing_num using imputer median, filling in NAs
    X = imputer.transform(housing_num)
    housing_tr = pd.DataFrame(X,
                              columns=housing_num.columns,
                              index=housing_num.index)

    housing_cat = housing_copy[['ocean_proximity']]
    housing_cat.head(10)

    # Create a custom transformer to add extra attributes
    from sklearn.base import BaseEstimator, TransformerMixin
    # Column positions inside the array handed to the transformer.
    rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6

    class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
        """Append ratio features (per household / per room) to an array."""

        def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
            self.add_bedrooms_per_room = add_bedrooms_per_room

        def fit(self, X, y=None):
            return self  # do nothing

        def transform(self, X):
            rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
            population_per_household = (X[:, population_ix] /
                                        X[:, households_ix])
            if self.add_bedrooms_per_room:
                # Bedrooms per *room*: the divisor is the rooms column.
                bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
                return np.c_[X, rooms_per_household, population_per_household,
                             bedrooms_per_room]
            return np.c_[X, rooms_per_household, population_per_household]

    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing_copy.values)

    housing_extra_attribs = pd.DataFrame(
        housing_extra_attribs,
        columns=list(housing_copy.columns) +
        ["rooms_per_household", "population_per_household"],
        index=housing_copy.index)
    print(housing_extra_attribs.head())

    # --- CREATE A PIPELINE FOR NUMERICAL AND CATEGORICAL ATTRIBUTES ---
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

    housing_num_tr = num_pipeline.fit_transform(housing_num)

    from sklearn.preprocessing import OneHotEncoder
    from sklearn.compose import ColumnTransformer

    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]

    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

    housing_prepared = full_pipeline.fit_transform(housing_copy)
    print(housing_prepared.shape)

    # --- LINEAR REGRESSION ---
    from sklearn.linear_model import LinearRegression
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)

    some_data = housing_copy.iloc[:5]
    some_labels = housing_labels.iloc[:5]
    some_data_prepared = full_pipeline.transform(some_data)
    print("predictions", lin_reg.predict(some_data_prepared))
    print("Labels: ", list(some_labels))

    from sklearn.metrics import mean_squared_error
    housing_predictions = lin_reg.predict(housing_prepared)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    print(lin_rmse)

    # --- DECISION TREE ---
    from sklearn.tree import DecisionTreeRegressor

    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(housing_prepared, housing_labels)  # train model
    housing_predictions = tree_reg.predict(housing_prepared)
    tree_mse = mean_squared_error(housing_labels, housing_predictions)
    tree_rmse = np.sqrt(tree_mse)
    print(tree_rmse)

    # --- CROSS VALIDATION ---
    from sklearn.model_selection import cross_val_score
    scores = cross_val_score(tree_reg,
                             housing_prepared,
                             housing_labels,
                             scoring="neg_mean_squared_error",
                             cv=10)
    # The scorer returns negative MSE, so flip the sign before the root.
    tree_rmse_scores = np.sqrt(-scores)

    def display_scores(scores):
        """Print the scores together with their mean and standard deviation."""
        print("Scores:", scores)
        print("Mean: ", scores.mean())
        print("Standard Deviation: ", scores.std())

    # display_scores(tree_rmse_scores)  # enable to print the per-fold scores

    lin_scores = cross_val_score(lin_reg,
                                 housing_prepared,
                                 housing_labels,
                                 scoring="neg_mean_squared_error",
                                 cv=10)
    lin_rmse_scores = np.sqrt(-lin_scores)
    # display_scores(lin_rmse_scores)  # enable to print the per-fold scores

    # --- RANDOM FOREST ---
    from sklearn.ensemble import RandomForestRegressor

    forest_reg = RandomForestRegressor()
    forest_reg.fit(housing_prepared, housing_labels)
    housing_predictions = forest_reg.predict(housing_prepared)
    forest_mse = mean_squared_error(housing_labels, housing_predictions)
    forest_rmse = np.sqrt(forest_mse)
    print("Training set: ")
    # forest_rmse is already a positive RMSE (it is not a negated scorer
    # output), so it is reported without a sign flip.
    display_scores(forest_rmse)

    forest_scores = cross_val_score(forest_reg,
                                    housing_prepared,
                                    housing_labels,
                                    scoring="neg_mean_squared_error",
                                    cv=10)
    forest_rmse_scores = np.sqrt(-forest_scores)
    print("Validation set: ")
    display_scores(forest_rmse_scores)

    # --- GRID SEARCH ---
    # Try every hyperparameter combination in the grids below.
    from sklearn.model_selection import GridSearchCV

    param_grid = [
        {
            'n_estimators': [3, 10, 30],
            'max_features': [2, 4, 6, 8]
        },
        {
            'bootstrap': [False],
            'n_estimators': [3, 10],
            'max_features': [2, 3, 4]
        },
    ]
    forest_reg = RandomForestRegressor()
    grid_search = GridSearchCV(forest_reg,
                               param_grid,
                               cv=5,
                               scoring='neg_mean_squared_error',
                               return_train_score=True)
    grid_search.fit(housing_prepared, housing_labels)

    print(grid_search.best_params_)  # Print best combination of parameters
    print(grid_search.best_estimator_)  # Print best estimator

    # Print evaluation scores
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)

    # Print feature importances; the extra names follow the column order
    # produced by CombinedAttributesAdder.transform.
    feature_importances = grid_search.best_estimator_.feature_importances_
    extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
    cat_encoder = full_pipeline.named_transformers_["cat"]
    cat_one_hot_attribs = list(cat_encoder.categories_[0])
    attributes = num_attribs + extra_attribs + cat_one_hot_attribs
    print(sorted(zip(feature_importances, attributes), reverse=True))

    # --- EVALUATE ON TEST SET ---
    final_model = grid_search.best_estimator_

    X_test = strat_test_set.drop("median_house_value", axis=1)
    y_test = strat_test_set["median_house_value"].copy()

    # transform only: the pipeline must not be re-fitted on test data
    X_test_prepared = full_pipeline.transform(X_test)

    final_predictions = final_model.predict(X_test_prepared)

    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)
    print(final_rmse)

    # Confidence interval for the test RMSE (t-interval on squared errors).
    from scipy import stats
    confidence = .95
    squared_errors = (final_predictions - y_test)**2
    print(
        np.sqrt(
            stats.t.interval(confidence,
                             len(squared_errors) - 1,
                             loc=squared_errors.mean(),
                             scale=stats.sem(squared_errors))))

    print("complete")
def run_classifier(set_size):
    """Train/evaluate a fastText classifier over five stratified splits.

    For each split the train and validation rows are written as TSV files to
    the module-level ``data_train`` / ``data_val`` paths, a model is trained
    (or loaded when ``TRAIN_MODEL`` is false), and precision/recall, a
    confusion table, Cohen's kappa, a classification report and macro F1
    scores are printed.

    Args:
        set_size: when truthy, passed to ``u.generate_small_set`` together
            with ``fname_data``; otherwise ``generate_small_set(None, None)``
            is called.

    Returns:
        ``(scoresTrain, scoresVal)``: NumPy arrays holding the macro F1 of
        each split on the training and validation rows respectively.
    """
    import util as u

    #load data balanced by class labels limited to SET_SIZE
    if set_size:
        df = u.generate_small_set(set_size, fname_data)
    else:  #load whole data limited by balanced undersampling
        df = u.generate_small_set(None, None)

    # split the data into a training set and a validation set
    from sklearn.model_selection import StratifiedShuffleSplit
    sss = StratifiedShuffleSplit(n_splits=5,
                                 test_size=VALIDATION_SPLIT,
                                 random_state=0)
    X = df.eligibility
    y = df.eligible

    scoresTrain = []
    scoresVal = []
    for train_index, test_index in sss.split(X, y):
        df_val, df_train = df.iloc[test_index, :], df.iloc[train_index, :]
        print("training sample after stratified sampling: ")
        print(df_train.describe())
        print("validation sample after after stratified sampling: ")
        print(df_val.describe())
        # fastText reads its data from files, so both subsets are written out
        # (and overwritten on every split).
        df_train.to_csv(sep='\t', path_or_buf=data_train)
        df_val.to_csv(sep='\t', path_or_buf=data_val)

        classifier = None
        if TRAIN_MODEL == False:
            print("starting to load model")
            classifier = fasttext.load_model(classifier_fname + '.bin')
        else:
            print("start to train classifier model")
            # Earlier experiments also passed pretrained_vectors (fastText
            # .vec word-embedding files); the current run trains from scratch.
            classifier = fasttext.supervised(data_train,
                                             classifier_fname,
                                             epoch=100,
                                             silent=0,
                                             thread=4,
                                             lr=0.1)

            print("end")

        # Precision/recall at 1 on this split's validation file.
        result = classifier.test(data_val)
        print('P@1:', result.precision)
        print('R@1:', result.recall)
        print('Number of examples:', result.nexamples)

        # Fixed sample queries used as a qualitative sanity check.
        texts = [
            'neuropsychiatric history or altered mental status',
            'pembrolizumab and corticosteroids',
            'trastuzumab and breast cancer and heart insufficiency and dyspnea',
            'trastuzumab and breast cancer',
            'trastuzumab and breast cancer and invasive cancer',
            'nivolumab and hiv',
            'CAR and lymphoma',
            'TCR and breast cancer',
            'in situ breast cancer and pemetrexed',
            'bevacizumab and patients who has had any event of thrombosis',
            'capecitabine and breast cancer and brain metastasis',
            'capecitabine and colon cancer',
            'lapatinib and breast cancer and brain metastasis',
            'pertuzumab and breast cancer and brain metastasis',
        ]

        # predict with the probability
        labels = classifier.predict_proba(texts)
        print(labels)
        # Same metrics on the module-level `data_test` file.
        result = classifier.test(data_test)
        print(result.precision)  # Precision at one
        print(result.recall)  # Recall at one
        print(result.nexamples)  # Number of test examples

        # Debug aid (disabled): the classifier object also exposes its
        # hyperparameters -- labels, label_prefix, dim, ws, epoch, min_count,
        # neg, word_ngrams, loss_name, bucket, minn, maxn, lr_update_rate, t,
        # encoding -- and test/predict/predict_proba accept a top-k argument.

        #Confusion matrix
        # Reload the model from disk, then read the validation TSV back with
        # explicit column names.
        classifier = fasttext.load_model(classifier_fname + '.bin')
        df_val = pd.read_csv(data_val,
                             sep='\t',
                             header=0,
                             names=["index", "y", "x"])

        predicted = pd.Series(np.array(classifier.predict(df_val.x)).flatten())
        predictedTrain = pd.Series(
            np.array(classifier.predict(df_train.eligibility)).flatten())

        d = {"y_true": df_val.y, "y_pred": predicted}
        df_confVal = pd.DataFrame(d)

        # "__label__0" is treated as the positive class, "__label__1" as the
        # negative class. The lambda argument `df` is the frame being indexed
        # and shadows the outer `df`.
        truePos = df_confVal.loc[lambda df: (df.y_true == "__label__0") &
                                 (df.y_true == df.y_pred), :]
        FalseNeg = df_confVal.loc[lambda df: (df.y_true == "__label__0") &
                                  (df.y_true != df.y_pred), :]
        trueNeg = df_confVal.loc[lambda df: (df.y_true == "__label__1") &
                                 (df.y_true == df.y_pred), :]
        FalsePos = df_confVal.loc[lambda df: (df.y_true == "__label__1") &
                                  (df.y_true != df.y_pred), :]

        # Columns group rows by their actual label, index rows by prediction.
        confusion_table = pd.DataFrame(
            {
                "True Positives": [truePos.y_true.size, FalseNeg.y_true.size],
                "True Negatives": [FalsePos.y_true.size, trueNeg.y_true.size]
            },
            index=["Predicted Positives", "Predicted Negatives"])
        print(confusion_table)

        #cohen's Kappa agreement
        from sklearn.metrics import cohen_kappa_score
        kappa = cohen_kappa_score(df_confVal.y_true, df_confVal.y_pred)
        print("kappa =" + str(kappa))

        #classification report
        from sklearn.metrics import classification_report, f1_score
        # presumably names are matched to the sorted labels, i.e.
        # "__label__0" -> 'Eligible' -- TODO confirm
        target_names = ['Eligible', 'Not elegible']
        report = classification_report(df_confVal.y_true,
                                       df_confVal.y_pred,
                                       target_names=target_names)
        print(report)
        # NOTE(review): scikit-learn documents pos_label as only used with
        # average='binary', so it likely has no effect here -- confirm.
        f1Val = f1_score(df_confVal.y_true,
                         df_confVal.y_pred,
                         pos_label='__label__0',
                         average='macro')
        scoresVal.append(f1Val)
        f1Train = f1_score(df_train.eligible,
                           predictedTrain,
                           pos_label='__label__0',
                           average='macro')
        scoresTrain.append(f1Train)

    scoresTrain = np.array(scoresTrain)
    scoresVal = np.array(scoresVal)
    # NOTE(review): the message says "Accuracy", but the value printed is the
    # mean (+/- 2 std) of the per-split validation macro F1 scores.
    print("Accuracy " + str(y.size) + ": %0.2f (+/- %0.2f)" %
          (scoresVal.mean(), scoresVal.std() * 2))
    return scoresTrain, scoresVal
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Cross-validate *clf* on stratified shuffle splits and print its scores.

    Predictions from every split are pooled into one confusion matrix, from
    which accuracy, precision, recall, F1 and F2 are computed and printed.

    Args:
        clf: estimator exposing ``fit`` and ``predict``; it is refitted on
            each split.
        dataset: data passed to ``featureFormat``.
        feature_list: features to extract; the label must be the first entry
            because ``targetFeatureSplit`` treats the first column as target.
        folds: number of stratified shuffle splits to evaluate.

    Returns:
        *clf* when the metrics could be computed, ``None`` when a metric is
        undefined because its denominator is zero.
    """
    # extract the features specified in features_list
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    # Honour the caller's `folds`; without n_splits the splitter silently
    # falls back to its own default number of splits.
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv.split(features, labels):
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Stops scoring this split only; later splits still run.
                break
    try:
        total_predictions = (true_negatives + false_negatives +
                             false_positives + true_positives)
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        # Only the undefined-metric case is handled here; any other error
        # propagates instead of being reported as a division by zero.
        print("Got a divide by zero when trying out:", clf)
        print(
            "Precision or recall may be undefined due to a lack of true positive predicitons."
        )
    else:
        print(
            PERF_FORMAT_STRING.format(accuracy,
                                      precision,
                                      recall,
                                      f1,
                                      f2,
                                      display_precision=5))
        print(
            RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                         false_positives, false_negatives,
                                         true_negatives))
        print("")
        return clf
Exemple #36
0
# Plain random 80/20 split, kept for comparison with the stratified one below.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

housing["median_income"].hist()
import numpy as np
# Create a new column that buckets median income into categories.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Values below 5 are kept; everything else is capped at 5. The result is
# assigned back to the frame: .where(..., inplace=True) on the selected column
# works on a temporary and does not update `housing` under Copy-on-Write.
housing["income_cat"] = housing["income_cat"].where(
    housing["income_cat"] < 5, 5.0)
housing["income_cat"].hist()
plt.show()

# A stratified split keeps the category proportions of the full data set.
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields positional indices, so rows are selected with .iloc.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
strat_train_set.describe()
strat_test_set.describe()
strat_train_set.hist(bins=50, figsize=(12, 7))
strat_test_set.hist(bins=50, figsize=(12, 7))
plt.show()

# Remove the helper column by rebinding rather than mutating the selections
# in place.
strat_train_set = strat_train_set.drop("income_cat", axis=1)
strat_test_set = strat_test_set.drop("income_cat", axis=1)

housing = strat_train_set.copy()

#scatter plot
def main():
    """Run the credit-card default experiment from download to ensemble.

    Downloads and loads the data, builds a random and an age-stratified
    train/test split, prepares the training features with ``prep_pipeline``,
    then fits and scores logistic regression, a decision tree, k-nearest
    neighbours, several random forests and a soft-voting ensemble. Progress
    and scores are printed; nothing is returned.
    """
    print("hello world")

    #Downloading data
    print("Downloading data")
    download_credit_card_data()

    #Reading in data to pandas dataframe
    print("Reading in data to pandas dataframe")
    credit_card_df = load_credit_card_data()
    print(f"Columns: \n{credit_card_df.columns}\n")
    print(f"Summary stats:\n{credit_card_df.describe()}\n")
    print(f"Head:\n{credit_card_df.head()}\n")

    print("Choosing interesting features to look at")

    #Create test and train sets with random number generator seed set
    print("Create test and train sets with random number generator seed set")
    np.random.seed(42)
    train_set, test_set = split_train_test(credit_card_df, 0.2)
    print(f"Train set length = {len(train_set)}")
    print(f"Test set length = {len(test_set)}")
    #The random test set is set aside; only the random train set is used
    #below, to compare its age proportions with the stratified split.

    #Observations from looking at the data:
    #Seems to be some strangely popular ages in the data
    #University seems to be the most popular level of education
    #BILL_AMT features are very tail-heavy

    #Shall sample from age strata to ensure groups are representative of age groups
    print(
        "Shall sample from age strata to ensure groups are representative of age groups"
    )
    print("Dividing by 10 gives 6 age categories, rounding up to five groups")
    credit_card_df = add_age_category(credit_card_df)

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(credit_card_df,
                                               credit_card_df["AGE_cat"]):
        #split() yields positional indices, so rows are selected with iloc;
        #loc would only be right while the index happens to be 0..n-1.
        #The copies make the in-place column drop further down safe.
        strat_train_set = credit_card_df.iloc[train_index].copy()
        strat_test_set = credit_card_df.iloc[test_index].copy()

    print(f"Strat train set length = {len(strat_train_set)}")
    print(f"Strat test set length = {len(strat_test_set)}")

    #Checking how age proportionalities match up with random or stratified sampling
    print(
        "Checking how age proportionalities match up with random or stratified sampling"
    )
    train_set = add_age_category(train_set)
    strat_train_set = add_age_category(strat_train_set)

    print("Overall:")
    print(credit_card_df["AGE_cat"].value_counts() / len(credit_card_df))
    print("Random:")
    print(train_set["AGE_cat"].value_counts() / len(train_set))
    print("Stratified:")
    print(strat_train_set["AGE_cat"].value_counts() / len(strat_train_set))
    #Stratified sampling does give a better representation of the overall data

    #Removing AGE_cat variable from data frames
    print("Removing AGE_cat variable from data frames")
    for set_ in (strat_test_set, strat_train_set):
        set_.drop("AGE_cat", axis=1, inplace=True)

    #Splitting test set in to target and feature variables
    strat_test_set_X = strat_test_set.drop(
        columns=["default payment next month"])
    strat_test_set_y = strat_test_set["default payment next month"]

    #Adding ratio variable to test set; transform() returns a 2D numpy array
    #whose last column is appended to the frame as "Ratio".
    #NOTE(review): the test features only get the extra "Ratio" column and
    #are never passed through prep_pipeline, while the training features
    #below are - confirm the test scores are computed on comparable inputs.
    attr_adder = RatioAttributesAdder(add_payment_ratios=True)
    extra_attribs = attr_adder.transform(strat_test_set_X.values)
    strat_test_set_X = strat_test_set_X.assign(Ratio=extra_attribs[:, -1])

    #Making a sample of the training set to experiment with
    print("Making a sample of the training set to experiment with")
    strat_train_set_sample = strat_train_set.sample(frac=0.9, random_state=42)

    print(strat_train_set_sample.head())

    #Using preparation pipeline
    print("Using preparation pipeline")
    print(strat_train_set_sample.columns)
    strat_train_set_sample_X = strat_train_set_sample.drop(
        columns=["default payment next month"])
    strat_train_set_sample_y = strat_train_set_sample[
        "default payment next month"]

    print(strat_train_set_sample_X.head())
    print(strat_train_set_sample_y.head())

    strat_train_set_sample_X_array = prep_pipeline.fit_transform(
        strat_train_set_sample_X)
    print(strat_train_set_sample_X_array)
    print(strat_train_set_sample_X_array.shape)
    #Putting it back in to a pandas df: the original feature columns plus
    #one extra "Ratio" column.
    new_columns = list(strat_train_set_sample_X.columns)
    new_columns.append("Ratio")
    print(new_columns)
    strat_train_set_sample_X = pd.DataFrame(
        columns=new_columns, data=strat_train_set_sample_X_array)

    print(strat_train_set_sample_X)
    print(strat_train_set_sample_X.describe())

    #Selecting correlated features (on the untransformed sample; the result
    #is only printed, the models below use strat_train_set_sample_X)
    print("Selecting correlated features using earlier data ")
    strat_train_set_sample = select_correlated_features(
        strat_train_set_sample,
        threshold=0.08,
        plot_boolean=False,
        target="default payment next month")

    print(strat_train_set_sample)

    #Now trying some models for the data
    print("Now trying some models for the data")

    print(strat_train_set_sample["PAY_0"])

    #Logistic regression
    print("\n\n\nLogistic regression")
    log_reg = LogisticRegression()

    #Using GridSearchCV to find optimum parameters
    param_grid = [{
        'C': np.logspace(-5, 1, 4),
        'penalty': ['l2'],
        'solver': ['sag']
    }]

    grid_clf = GridSearchCV(log_reg,
                            param_grid=param_grid,
                            cv=3,
                            scoring='neg_mean_squared_error')

    grid_clf.fit(strat_train_set_sample_X, strat_train_set_sample_y)

    print(grid_clf.best_params_)

    cvres = grid_clf.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(mean_score, params)

    log_reg = LogisticRegression(penalty='l2', C=0.001, random_state=0)
    log_reg.fit(strat_train_set_sample_X, strat_train_set_sample_y)
    print(f"Score: {log_reg.score(strat_test_set_X, strat_test_set_y)}")

    #Decision trees
    print("Decision tree")

    dt_clf = DecisionTreeClassifier()
    dt_clf.fit(strat_train_set_sample_X, strat_train_set_sample_y)

    print(f"Score: {dt_clf.score(strat_test_set_X, strat_test_set_y)}")

    #A linear-kernel support vector machine was tried here as well but was
    #taking a very very long time, so it is left out.

    #K-nearest neighbours model
    print("K-nearest neighbours")

    knn_model = KNeighborsClassifier(n_neighbors=4)
    knn_model.fit(strat_train_set_sample_X, strat_train_set_sample_y)
    print(f"Score: {knn_model.score(strat_test_set_X, strat_test_set_y)}")

    #Random forest (a classifier, despite the wording of the message below)
    print("Random forest regressor")
    param_grid = [
        {
            'bootstrap': [True],
            'n_estimators': [3, 10, 30],
            'max_features': [2, 4, 6, 8]
        },
        {
            'bootstrap': [False],
            'n_estimators': [3, 10],
            'max_features': [2, 3, 4]
        },
    ]

    forest_clf = RandomForestClassifier()
    grid_search = GridSearchCV(forest_clf,
                               param_grid,
                               cv=5,
                               scoring='neg_mean_squared_error')

    grid_search.fit(strat_train_set_sample_X, strat_train_set_sample_y)

    #The scores are negated MSE values, so negate before taking the root.
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)

    print("Checking that the lower the gridsearchcv score, the better")

    forest_clf_1 = RandomForestClassifier(max_features=8, n_estimators=30)
    forest_clf_1.fit(strat_train_set_sample_X, strat_train_set_sample_y)
    print(f"Score: {forest_clf_1.score(strat_test_set_X, strat_test_set_y)}")

    forest_clf_2 = RandomForestClassifier(max_features=8, n_estimators=3)
    forest_clf_2.fit(strat_train_set_sample_X, strat_train_set_sample_y)
    print(f"Score: {forest_clf_2.score(strat_test_set_X, strat_test_set_y)}")

    forest_clf_3 = RandomForestClassifier(max_features=2, n_estimators=3)
    forest_clf_3.fit(strat_train_set_sample_X, strat_train_set_sample_y)
    print(f"Score: {forest_clf_3.score(strat_test_set_X, strat_test_set_y)}")

    #Making an ensemble model
    print("Making an ensemble model")
    voting_clf = VotingClassifier(
        estimators=[('lr', log_reg), ('knn', knn_model),
                    ('rndm_for', forest_clf_3)],
        voting='soft'  #Soft currently doing better than hard
    )
    voting_clf.fit(strat_train_set_sample_X, strat_train_set_sample_y)

    print(f"Score: {voting_clf.score(strat_test_set_X, strat_test_set_y)}")
Exemple #38
0
def main():
    """Train and evaluate the model on the cognitive-score labels.

    Reads the isoform expression matrix and the sample labels, min-max scales
    the expression columns, then runs five stratified 75/25 shuffle splits.
    For every split a fresh model from ``create_model()`` is fitted and its
    predictions on the held-out samples are collected. The elapsed time and a
    table of sample name, true label, per-class predictions and the arg-max
    prediction are written to CSV files.
    """
    # Read the input data.
    os.chdir('/Users/xuejiang/PycharmProjects/isofom_/data/')
    # ========= Step 1. Read the data ===========
    isoform_expression_df = pd.read_csv('select_isoform_express.csv')
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent.
    # The first two columns are skipped (presumably isoform identifiers -
    # TODO confirm); only the remaining columns are scaled.
    isoform_expression = isoform_expression_df.values[:, 2:]
    # Min-max normalise every column.
    scaler = MinMaxScaler()
    isoform_express_scaled = scaler.fit_transform(isoform_expression)

    sample_label = pd.read_csv('sample_label.csv').values
    sample_name = sample_label[:, 0]
    sample_label_state = sample_label[:, 1]
    sample_label_cognitive = sample_label[:, 2]

    # Data for the binary AD / non-AD experiment. That experiment is
    # currently disabled; the arrays are still prepared so it can be run the
    # same way as the cognitive-score experiment below.
    isoform_express_scaled_state, sample_label_state = trasform_data_format(isoform_express_scaled, sample_label_state)
    # One-hot encode the labels.
    sample_label_state_onehot = tf.keras.utils.to_categorical(sample_label_state)

    isoform_express_scaled_cognitive, sample_label_cognitive = trasform_data_format(isoform_express_scaled,
                                                                                    sample_label_cognitive)
    # One-hot encode the labels.
    sample_label_cognitive_onehot = tf.keras.utils.to_categorical(sample_label_cognitive)

    ### Experiment with the cognitive assessment score as the label
    true_sample_name_c = []
    true_label_c = []
    predict_label_c = []

    ss = StratifiedShuffleSplit(n_splits=5, test_size=0.25, train_size=0.75, random_state=0)

    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    start_time_all = time.perf_counter()
    for train_index, test_index in ss.split(isoform_express_scaled_cognitive, sample_label_cognitive_onehot):
        # Train and test features must both come from the cognitive data
        # set; the training rows used to be taken from the *_state array.
        X_train = isoform_express_scaled_cognitive[train_index]
        X_test = isoform_express_scaled_cognitive[test_index]
        y_train = sample_label_cognitive_onehot[train_index]
        model = create_model()
        model.fit(X_train, y_train, validation_split=0.1, epochs=20, batch_size=20, verbose=1)
        true_sample_name_c.append(sample_name[test_index])
        true_label_c.append(sample_label_cognitive[test_index])
        predict_label_c.append(model.predict(X_test))

    stop_time_all = time.perf_counter()
    cost_all_c = stop_time_all - start_time_all

    # Stack the per-split results: names and labels as single columns,
    # predictions as one row per test sample with one column per class.
    true_sample_name_c = np.concatenate(true_sample_name_c).reshape(-1, 1)
    print(true_sample_name_c.shape)
    true_label_c = np.concatenate(true_label_c).reshape(-1, 1)
    print(true_label_c.shape)
    predict_label_c = np.concatenate(predict_label_c, axis=0)
    n_classes = predict_label_c.shape[1]
    print(predict_label_c.shape)

    final_pre_c = predict_label_c.argmax(axis=1).reshape(-1, 1)

    label_all_c = np.hstack((true_sample_name_c, true_label_c,
                             predict_label_c, final_pre_c))
    # Header row; the number of 'predict k' columns follows the model output
    # instead of being fixed at six.
    col_names = np.array(['sample name', 'true_label']
                         + ['predict %d' % k for k in range(n_classes)]
                         + ['predict'])
    label_all_c = np.vstack((col_names, label_all_c))

    # Save the results.
    cost_df = pd.DataFrame(data=np.array([cost_all_c]))
    cost_df.to_csv('/Users/xuejiang/PycharmProjects/isofom_/result/cnn/5/time_cost_c.csv')

    label_all_c_df = pd.DataFrame(data=label_all_c)
    label_all_c_df.to_csv('/Users/xuejiang/PycharmProjects/isofom_/result/cnn/5/label_all_c.csv')
Exemple #39
0
        Y[i] -= 2
    if Y[i] == 6:
        Y[i] -= 3
    if Y[i] == 8 or Y[i] == 9:
        Y[i] -= 4

#calculate class weights for trainer
# `classes` and `y` are passed by keyword: recent scikit-learn releases no
# longer accept them positionally.
class_weights = class_weight.compute_class_weight('balanced',
                                                  classes=np.unique(Y),
                                                  y=Y)
# map class position (0, 1, ...) -> weight
class_weights = dict(enumerate(class_weights))

#convert to one hot encoding
Y = keras.utils.to_categorical(Y, dtype='float32')

#split out the test set
# Exactly one (train+val, test) split is needed, so a single split is drawn
# instead of generating two and discarding the second.
sss = StratifiedShuffleSplit(n_splits=1, test_size=500)
tt_val_index = next(sss.split(X, Y))
X_train_val = X[tt_val_index[0]]
Y_train_val = Y[tt_val_index[0]]
X_test = X[tt_val_index[1]]
Y_test = Y[tt_val_index[1]]

#split out the validation set
sss = StratifiedShuffleSplit(n_splits=1, test_size=100)
tt_index = next(sss.split(X_train_val, Y_train_val))
X_train = X_train_val[tt_index[0]]
Y_train = Y_train_val[tt_index[0]]
X_val = X_train_val[tt_index[1]]
Y_val = Y_train_val[tt_index[1]]

#define model
# NOTE(review): `input` shadows the builtin of the same name; the name is
# kept because the rest of the model definition presumably refers to it.
input = layers.Input(shape=(183, 183, 6))

# In[21]:


datSet


# In[22]:


from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split that keeps the income-category proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(datSet, datSet["income_cat"]):
    # split() yields positional indices, so rows are selected with iloc;
    # loc would only be right while the index happens to be 0..n-1.
    strat_train_set = datSet.iloc[train_index]
    strat_test_set = datSet.iloc[test_index]


# In[23]:


strat_train_set


# In[24]:


test_index
# In[10]:

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# Report each class as a percentage of all rows.
class_counts = df['Class'].value_counts()
print('No Frauds', round(class_counts[0] / len(df) * 100, 2),
      '% of the dataset')
print('Frauds', round(class_counts[1] / len(df) * 100, 2),
      '% of the dataset')

# Target column and the remaining feature columns.
y = df['Class']
X = df.drop('Class', axis=1)

sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Every split is printed; the variables keep the rows of the last split.
for train_index, test_index in sss.split(X, y):
    print("Train:", train_index, "Test:", test_index)
    original_Xtrain = X.iloc[train_index]
    original_Xtest = X.iloc[test_index]
    original_ytrain = y.iloc[train_index]
    original_ytest = y.iloc[test_index]

# The "original" prefix keeps these apart from the X_train / y_train of the
# undersampled data, so those variables are not overwritten.

# Convert the frames and series into plain arrays.
original_Xtrain, original_Xtest = original_Xtrain.values, original_Xtest.values
original_ytrain, original_ytest = original_ytrain.values, original_ytest.values
Exemple #42
0
train_dataset = Caltech(DATA_DIR, split='train', transform=train_transform)
test_dataset = Caltech(DATA_DIR, split='test', transform=eval_transform)

classes = train_dataset.classes
class_to_idx = train_dataset.class_to_idx

# Gather every training image with its label so the split can stratify on
# the labels.
X, y = [], []
for image, label in train_dataset:
    X.append(image)
    y.append(label)

# A single stratified split; half of the samples go to training.
sss = StratifiedShuffleSplit(n_splits=1, train_size=0.5, random_state=0)
train_index, val_index = next(sss.split(X, y))
train_indexes = train_index  # indices of the train part
val_indexes = val_index  # indices of the validation part

# Build the validation subset first: train_dataset is rebound right after.
val_dataset = Subset(train_dataset, val_indexes)
train_dataset = Subset(train_dataset, train_indexes)

# Check dataset sizes
n_train, n_val, n_test = len(train_dataset), len(val_dataset), len(test_dataset)
print('Train Dataset: {}'.format(n_train))
print('Valid Dataset: {}'.format(n_val))
print('Test Dataset: {}'.format(n_test))
print('Dataset size: {}'.format(n_train + n_val + n_test))

"""**Images distribution among classes**"""

count_train_items = {}
# Bin the living area so the training part and the hold-out part can be
# sampled with matching proportions of every bin.
# Only useful for the linear model; the boosted/bagged tree based models
# should do fine with whatever they are given.
living_area_edges = [0, 500, 1000, 1500, 2000, 2500, np.inf]
living_area_labels = [1, 2, 3, 4, 5, 6]
train_data['living_area_cat'] = pd.cut(train_data['GrLivArea'],
                                       bins=living_area_edges,
                                       labels=living_area_labels)

# One stratified split on the living-area bin. Rows are taken by position
# (iloc) and copied, so the columns can be dropped from them below.
split = StratifiedShuffleSplit(n_splits=1,
                               test_size=my_test_size,
                               random_state=9261774)
for train_index, test_index in split.split(train_data,
                                           train_data['living_area_cat']):
    # training part
    X_train = train_data.iloc[train_index].copy()
    # hold-out part: the portion of the training data used for testing
    X_test = train_data.iloc[test_index].copy()

# Move the label column out of the feature frames into y.
y_train = X_train.pop('SalePrice')
y_test = X_test.pop('SalePrice')

# Start of the submission data frame: sub_data is already loaded, so its Id
# column is stored now and the y predictions are added later.
submission_id = sub_data['Id']
Exemple #44
0
# import model
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score

# Ten stratified 90/10 shuffle splits with a fixed seed.
# (A stray `sss.split(train_X, train_y)` call whose result was thrown away
# has been removed; the splits are produced by the loop below.)
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=10)
classifiers = [
    SVC(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    LogisticRegression(),
    GaussianNB(),
    KNeighborsClassifier(),
    LinearDiscriminantAnalysis(),
    MLPClassifier(),
    DecisionTreeClassifier(),
]

acc_table = {}  # a dictionary to store the prediction results
for train_index, test_index in sss.split(train_X, train_y):
    # Rows of this split; train_X / train_y are indexed by position here.
    train_X_cv, test_X_cv = train_X[train_index], train_X[test_index]
    train_y_cv, test_y_cv = train_y[train_index], train_y[test_index]
Exemple #45
0
#Loading Data and splitting data into train, validation and test set
import idx2numpy
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# Test images/labels, then the combined train+validation images/labels.
x_test = idx2numpy.convert_from_file("t10k-images-idx3-ubyte")
y_test = idx2numpy.convert_from_file("t10k-labels.idx1-ubyte")
x_train_val = idx2numpy.convert_from_file("train-images-idx3-ubyte")
y_train_val = idx2numpy.convert_from_file("train-labels-idx1-ubyte")

# test_fold marks every train+validation row: -1 for rows used for
# training, 0 for rows held out for validation.
test_fold = np.zeros((60000, 1))
sss = StratifiedShuffleSplit(n_splits=1, test_size=10000)
train_index, test_index = next(sss.split(x_train_val, y_train_val))
x_train, x_val = x_train_val[train_index], x_train_val[test_index]
y_train, y_val = y_train_val[train_index], y_train_val[test_index]
test_fold[train_index] = -1
test_fold[test_index] = 0

for caption, features, targets in (("Training Set   ", x_train, y_train),
                                   ("Validation Set ", x_val, y_val),
                                   ("Test Set       ", x_test, y_test)):
    print(caption, features.shape, targets.shape)

# Flatten every image into a row of 784 values.
x_train = x_train.reshape(50000, 784)
x_val = x_val.reshape(10000, 784)
x_test = x_test.reshape(10000, 784)
x_train_val = x_train_val.reshape(60000, 784)

#Renormalizing the features of the data
scal = StandardScaler()
def classify(
    X,
    y,
    verbose=False,
    nfolds=2,
    dim_red=None,
    n_components=[5, 10, 20],
    scale=True,
    fs=None,
    njobs=1,
    LR_C=[0.01, 0.1, 1, 10, 100],
    LR_class_weight=[None, "balanced"],
    SVC_C=[0.01, 0.1, 1, 10, 100],
    SVC_class_weight=[None, "balanced"],
    SVC_kernels=["rbf", "linear", "poly"],
    n_estimators=[10, 20, 30],
    max_features=["auto", "log2", None],
    **kwargs
):
    """Compare several classifiers with nested cross-validation.

    For every model an outer stratified shuffle split holds out 10% of the
    data; on the remaining rows a grid search (with its own inner stratified
    shuffle split) tunes the model, and the tuned model is scored by
    accuracy on the held-out rows.

    Parameters
    ----------
    X, y : arrays indexable with integer index arrays (features, labels).
    verbose : print the call parameters, the grids, the best parameters of
        every outer split and a final summary.
    nfolds : number of splits for both the inner and the outer CV.
    dim_red : None, "pca" or "ica"; adds a dimensionality-reduction step
        whose n_components is searched over ``n_components``.
    n_components : values tried for the dimensionality reduction and for
        the number of features kept by RFE.
    scale : when true, a RobustScaler is put first in the pipeline.
    fs : None, "l1" (SelectFromModel on an L1 LinearSVC), "rfe" (RFE around
        the model being evaluated) or a ready-made selector object.
    njobs : n_jobs passed to GridSearchCV.
    LR_*, SVC_*, n_estimators, max_features : parameter grids of the
        individual models.
    **kwargs : accepted and ignored.

    Returns
    -------
    (results, models) : ``results`` holds one list of outer-split accuracies
        per model, in the order of ``models``; ``models`` is the list of
        (name, estimator, parameter grid) tuples.

    Raises
    ------
    ValueError : if ``dim_red`` is neither None, "pca" nor "ica".
    """
    # The list defaults above are only read, never modified.

    # spit out to the screen the function parameters, for logging
    if verbose:
        import inspect

        frame = inspect.currentframe()
        args, _, _, values = inspect.getargvalues(frame)
        print('function name "%s"' % inspect.getframeinfo(frame)[2])
        for i in args[2:]:
            print("    %s = %s" % (i, values[i]))

    # prepare configuration for cross validation test harness
    seed = 8

    # prepare models
    # all these support multiclass:
    # http://scikit-learn.org/stable/modules/multiclass.html
    models = [
        (
            "LR",
            LogisticRegression(multi_class="multinomial", solver="newton-cg"),
            {"C": LR_C, "class_weight": LR_class_weight},
        ),
        ("LDA", LinearDiscriminantAnalysis(), {}),
        ("RndFor", RandomForestClassifier(), {"n_estimators": n_estimators, "max_features": max_features}),
        ("NB", GaussianNB(), {}),
        ("SVC", SVC(), {"C": SVC_C, "class_weight": SVC_class_weight, "kernel": SVC_kernels}),
        ("Most frequent", DummyClassifier(strategy="most_frequent"), {}),
        ("Stratified", DummyClassifier(strategy="stratified"), {}),
    ]

    # spit out to the screen the parameters to be tried in each classifier
    if verbose:
        print("Trying these parameters:")
        for m in models:
            print("%s : %s" % (m[0], m[2]))

    # evaluate each model in turn
    results = []
    names = []
    for name, model, params in models:
        # need to create the CV objects inside the loop because they get used
        # and not get reset!
        inner_cv = StratifiedShuffleSplit(n_splits=nfolds, test_size=0.1, random_state=seed)
        outer_cv = StratifiedShuffleSplit(n_splits=nfolds, test_size=0.1, random_state=seed)

        steps = [("clf", model)]
        pipe_params = {"clf__%s" % key: val for key, val in params.items()}

        # Build the selector for THIS model. `fs` itself is left untouched:
        # overwriting it would make every later model skip the branches
        # below, reuse the first model's selector and lose its grid.
        if fs == "l1":
            lsvc = LinearSVC(C=0.1, penalty="l1", dual=False)
            selector = feature_selection.SelectFromModel(lsvc)
        elif fs == "rfe":
            # NOTE(review): RFE relies on the wrapped model exposing feature
            # weights; confirm that holds for every model in the list.
            selector = feature_selection.RFE(estimator=model)
            pipe_params["feat_sel__n_features_to_select"] = n_components
        else:
            # None, or a selector object supplied by the caller.
            selector = fs
        if selector is not None:
            steps = [("feat_sel", selector)] + steps

        if dim_red is not None:
            if dim_red == "pca":
                dr = decomposition.PCA()
            elif dim_red == "ica":
                dr = decomposition.FastICA()
            else:
                raise ValueError('dim_red must be None, "pca" or "ica", got %r' % (dim_red,))
            pipe_params["dim_red__n_components"] = n_components
            steps = [("dim_red", dr)] + steps
        if scale:
            steps = [("scale", preprocessing.RobustScaler())] + steps

        pipe = Pipeline(steps)
        cv_results = []
        for cnt, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            opt_model = GridSearchCV(estimator=pipe, param_grid=pipe_params, verbose=0, n_jobs=njobs, cv=inner_cv)
            opt_model.fit(X_train, y_train)
            if verbose and params:
                print("Best paramaters for %s  (%d/%d):" % (name, cnt + 1, outer_cv.n_splits))
                print(opt_model.best_params_)
            predictions = opt_model.predict(X_test)
            cv_results.append(metrics.accuracy_score(y_test, predictions))
        results.append(cv_results)
        names.append(name)
    if verbose:
        print("\n======")
        for model, res in zip(models, results):
            print("%s: %f (%f)" % (model[0], np.mean(res), np.std(res)))
        # accuracy expected from guessing uniformly among the classes
        print("Chance: %f" % (1 / float(len(np.unique(y)))))
        print("======\n")
    return results, models
Exemple #47
0
if __name__ == "__main__":
    # Command line: <data file> <test size> <number of splits>
    n_splits = int(sys.argv[3])
    test_size = float(sys.argv[2])
    filename = sys.argv[1]
    # Class labels of each supported data file, used to binarize predictions.
    classes = {
        "iris.data.txt": ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'],
        "scale1.data.txt": [0, 1, 2],
    }
    data = pd.read_csv(filename, header=None)

    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size)
    # The last column is the label, everything before it the features.
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1:]
    total_pre = 0.0
    total_acc = 0.0
    for train_indices, test_indices in sss.split(X, y):
        train_f, train_l = X.loc[train_indices], y.loc[train_indices]
        test_f, test_l = X.loc[test_indices], y.loc[test_indices]
        # The tree is given whole rows, renumbered from 0 for training.
        train_set = data.loc[train_indices].reset_index(drop=True)
        test_set = data.loc[test_indices]

        decision_tree = dt.Decision_tree(filename)
        decision_tree.reload_data(train_set)
        decision_tree.run()
        pre_l = decision_tree.predict(test_set.values.tolist())

        class_labels = classes[filename]
        pre_l_binarized = label_binarize(pre_l, classes=class_labels)
        test_l_binarized = label_binarize(test_l, classes=class_labels)
Exemple #48
0
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from multiprocessing import Pool, cpu_count
import pickle

# Load the colour samples; the RGB channels are the features and the colour
# name, encoded as an integer, is the target.
dataset = pd.read_csv('../data/data.csv')
le = LabelEncoder()
labels = le.fit(dataset['color']).transform(dataset['color'])
train = dataset[['r', 'g', 'b']]

# Ten stratified splits are generated, but only the last one is kept as the
# train/test partition.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=23)
train_index, test_index = list(sss.split(train, labels))[-1]
X_train, X_test = train.values[train_index], train.values[test_index]
y_train, y_test = labels[train_index], labels[test_index]

# Candidate models to compare.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
]
Exemple #49
0
def classify(X, y, verbose=False, nfolds=5, dim_red=None,
             n_components=[5, 10, 20], scale=True, fs=None,
             njobs=1,
             LR_C=[.01, .1, 1, 10, 100], LR_class_weight=[None, 'balanced'],
             SVC_C=[.01, .1, 1, 10, 100], SVC_class_weight=[None, 'balanced'],
             SVC_kernels=['rbf', 'linear', 'poly'],
             n_estimators=[10, 20, 30], max_features=['auto', 'log2', None],
             shuffle=False,
             **kwargs):
    """Compare a fixed set of classifiers with nested cross-validation.

    For every model a pipeline (optional scaling, dimensionality reduction
    and feature selection, then the classifier) is tuned by ``GridSearchCV``
    on the inner folds and scored by accuracy on the outer folds.

    Parameters
    ----------
    X, y : arrays indexed with integer index arrays (``X[train_idx]``).
    verbose : print the call parameters, the grids and the results.
    nfolds : number of inner and outer splits.
    dim_red : None, 'pca' or 'ica'; ``n_components`` is the grid for it.
    n_components : grid for ``dim_red`` and for RFE's feature count.
    scale : prepend a ``RobustScaler`` step.
    fs : None, 'l1' (``SelectFromModel`` on an L1 ``LinearSVC``), 'rfe',
        or any other object, which is used as the selection step as is.
    njobs : ``n_jobs`` handed to ``GridSearchCV``.
    LR_*, SVC_*, n_estimators, max_features : per-model parameter grids.
    shuffle : use ``StratifiedShuffleSplit`` instead of ``StratifiedKFold``.
    **kwargs : accepted and ignored.

    Returns
    -------
    (results, models) : ``results[i]`` is the list of outer-fold accuracies
        of ``models[i]``, a ``(name, estimator, grid)`` tuple.

    Raises
    ------
    ValueError : if ``dim_red`` is not None, 'pca' or 'ica'.
    """
    # The list defaults in the signature are only read, never mutated.

    # spit out to the screen the function parameters, for logging
    if verbose:
        import inspect
        frame = inspect.currentframe()
        args, _, _, values = inspect.getargvalues(frame)
        print('function name "%s"' % inspect.getframeinfo(frame)[2])
        for i in args[2:]:
            print("    %s = %s" % (i, values[i]))

    if dim_red is not None and dim_red not in ('pca', 'ica'):
        raise ValueError("dim_red must be None, 'pca' or 'ica', got %r"
                         % (dim_red,))

    seed = 8

    # prepare models
    models = []
    # all these support multiclass:
    # http://scikit-learn.org/stable/modules/multiclass.html
    models.append(('LR', LogisticRegression(multi_class='multinomial',
                                            solver='newton-cg'),
                   {"C": LR_C,
                    "class_weight": LR_class_weight}))
    models.append(('LDA', LinearDiscriminantAnalysis(), {}))
    models.append(('RndFor', RandomForestClassifier(),
                   {'n_estimators': n_estimators,
                    'max_features': max_features}))
    models.append(('NB', GaussianNB(), {}))
    models.append(('SVC', SVC(),
                   {"C": SVC_C,
                    "class_weight": SVC_class_weight,
                    'kernel': SVC_kernels}))
    models.append(('Most frequent', DummyClassifier(strategy='most_frequent'),
                   {}))
    models.append(('Stratified', DummyClassifier(strategy='stratified'), {}))

    # spit out to the screen the parameters to be tried in each classifier
    if verbose:
        print('Trying these parameters:')
        for m in models:
            print('%s : %s' % (m[0], m[2]))

    # Resolve the feature-selection step once, in a separate name.  Rebinding
    # ``fs`` inside the model loop made the 'rfe' branch run for the first
    # model only, so its grid entry was lost for every later model.
    selector = fs
    selector_grid = {}
    if fs == 'l1':
        selector = feature_selection.SelectFromModel(
            LinearSVC(C=0.1, penalty="l1", dual=False))
    elif fs == 'rfe':
        # RFE needs an estimator exposing coef_ or feature_importances_,
        # which several of the models above lack, so the ranking keeps using
        # the first model (logistic regression) for every pipeline.
        selector = feature_selection.RFE(estimator=models[0][1])
        selector_grid = {'feat_sel__n_features_to_select': n_components}

    # evaluate each model in turn
    results = []
    for name, model, params in models:
        # need to create the CV objects inside the loop because they get used
        # and not get reset!
        if shuffle:
            inner_cv = StratifiedShuffleSplit(n_splits=nfolds, test_size=.1,
                                              random_state=seed)
            outer_cv = StratifiedShuffleSplit(n_splits=nfolds, test_size=.1,
                                              random_state=seed)
        else:
            # Without shuffling the folds are deterministic; passing a
            # random_state here is rejected by recent scikit-learn.
            inner_cv = StratifiedKFold(n_splits=nfolds)
            outer_cv = StratifiedKFold(n_splits=nfolds)

        # Classifier grid, prefixed with the pipeline step name.
        pipe_params = {'clf__%s' % key: val for key, val in params.items()}
        pipe_params.update(selector_grid)

        # Steps are prepended so the final order is
        # scale -> dim_red -> feat_sel -> clf.
        steps = [('clf', model)]
        if selector is not None:
            steps.insert(0, ('feat_sel', selector))
        if dim_red == 'pca':
            steps.insert(0, ('dim_red', decomposition.PCA()))
            pipe_params['dim_red__n_components'] = n_components
        elif dim_red == 'ica':
            steps.insert(0, ('dim_red', decomposition.FastICA()))
            pipe_params['dim_red__n_components'] = n_components
        if scale:
            steps.insert(0, ('scale', preprocessing.RobustScaler()))

        pipe = Pipeline(steps)
        cv_results = []
        for cnt, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            opt_model = GridSearchCV(estimator=pipe, param_grid=pipe_params,
                                     verbose=0, n_jobs=njobs, cv=inner_cv)
            opt_model.fit(X_train, y_train)
            if verbose and params:
                print('Best paramaters for %s  (%d/%d):'
                      % (name, cnt + 1, outer_cv.n_splits))
                print(opt_model.best_params_)
            predictions = opt_model.predict(X_test)
            cv_results.append(metrics.accuracy_score(y_test, predictions))
        results.append(cv_results)

    if verbose:
        print('\n======')
        for model, res in zip(models, results):
            msg = "%s: %f (%f)" % (model[0], np.mean(res), np.std(res))
            print(msg)
        print('Chance: %f' % (1 / float(len(np.unique(y)))))
        print('======\n')
    return results, models
Exemple #50
0
    # Output file for the test subset: <results_path>/<filename>_<subset>.tsv
    test_path = path.join(results_path,
                          filename + '_' + args.subset_name + '.tsv')

    # Stays True until a split passes both balance checks below.
    flag_selection = True

    sex = list(merged_df.gender.values)
    site = list(merged_df.site.values)
    age = list(merged_df.age.values)

    train_index, test_index = None, None

    # Draw a new random split on every pass until one is accepted.
    while flag_selection:

        # No random_state is given, so each pass produces a different split.
        splits = StratifiedShuffleSplit(n_splits=1, test_size=args.test_size)

        # Stratify on site only; the zeros array is a placeholder for X.
        for train_index, test_index in splits.split(np.zeros(len(site)), site):

            age_test = [float(age[idx]) for idx in test_index]
            age_train = [float(age[idx]) for idx in train_index]

            # sex_dict maps the raw gender values to the codes given to chi2.
            sex_test = [sex_dict[sex[idx]] for idx in test_index]
            sex_train = [sex_dict[sex[idx]] for idx in train_index]

            # Compare the age and sex distributions of the two subsets.
            # NOTE(review): chi2 is defined elsewhere; it is presumably a
            # chi-square statistic helper - confirm.
            t_age, p_age = ttest_ind(age_test, age_train)
            T_sex = chi2(sex_test, sex_train)

            print(p_age, T_sex)
            # Accept the split when the age p-value is above its threshold
            # and the sex statistic is below its threshold.
            if p_age > args.p_val_threshold and T_sex < args.t_val_threshold:
                flag_selection = False

            # Rows of the current candidate test subset (positional lookup).
            test_df = merged_df.iloc[test_index]
Exemple #51
0
print()
# Features are the first num_data_col entries of every survey row ...
x_data = np.array([x[:num_data_col] for x in survey_data])

# ... and the targets are the num_choice_col entries right after them.
y_data = np.array(
    [y[num_data_col:num_data_col + num_choice_col] for y in survey_data]
)
x_headers = list(survey_headers[1:6])

print('x-shape: ' + str(x_data.shape))
print('y-shape: ' + str(y_data.shape))

# ---------------------------------------------
#%%
# Stratified random partition into training and dev sets with sklearn; the
# vehicle choice dataset is very unbalanced, so plain shuffling is not enough.
# Deep learning uses a much higher share of the data for training.
trainPerc = 0.95
devePerc = 0.05
sss = StratifiedShuffleSplit(
    n_splits=1,
    train_size=trainPerc,
    test_size=devePerc,
)
train_indices, deve_indices = next(sss.split(x_data, y_data))
num_train_rows = len(train_indices)  # needed later on

# Materialise the two partitions.
x_vals_train, y_vals_train = x_data[train_indices, :], y_data[train_indices, :]
x_vals_deve, y_vals_deve = x_data[deve_indices, :], y_data[deve_indices, :]

print("num_train_rows: %u, num_deve_rows: %u" % (num_train_rows,
                                                  len(deve_indices)))

# ---------------------------------------------
#%%
# Training set-up.
a_stdv = 0.1          # standard deviation used to initialise node weights
learn_rate = 1.0      # gradient descent learning rate
Exemple #52
0
def main():
    """Train, or re-evaluate, the random-forest credit-scoring model.

    Reads ``./data/cs-training.csv`` and ``./data/cs-test.csv``, holds out a
    stratified third of the training data for validation and imputes missing
    values with column means fitted on the remaining training rows.

    If ``rfc_model.m`` does not exist, a random forest is grid-searched on
    the training rows, dumped to that file and its training AUC and feature
    importances are reported.  Otherwise the stored model is loaded, scored
    on the held-out rows, refitted on them and dumped again.
    """
    # 1. Load the data (training and test) and preprocess it:
    #    - the marker values 96 and 98 in the 30-59 / 60-89 / 90 days-late
    #      counters are read as NaN
    #    - an age of 0 is read as NaN
    colnames = [
        'ID', 'label', 'RUUnsecuredL', 'age', 'NOTime30-59', 'DebtRatio',
        'Income', 'NOCredit', 'NOTimes90', 'NORealEstate', 'NOTime60-89',
        'NODependents'
    ]
    col_nas = [
        '', 'NA', 'NA', 0, [98, 96], 'NA', 'NA', 'NA', [98, 96], 'NA',
        [98, 96], 'NA'
    ]
    col_na_values = creatDictKV(colnames, col_nas)
    dftrain = pd.read_csv("./data/cs-training.csv",
                          names=colnames,
                          na_values=col_na_values,
                          skiprows=[0])
    # Each pop() also removes the column from the frame, so only the feature
    # columns are left when the matrix is taken.
    dftrain.pop("NOCredit")
    train_id = [int(x) for x in dftrain.pop("ID")]
    y_train = np.asarray([int(x) for x in dftrain.pop("label")])
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent.
    x_train = dftrain.values
    dftest = pd.read_csv("./data/cs-test.csv",
                         names=colnames,
                         na_values=col_na_values,
                         skiprows=[0])
    dftest.pop("NOCredit")
    test_id = [int(x) for x in dftest.pop("ID")]
    y_test = np.asarray(dftest.pop("label"))
    x_test = dftest.values

    # 2. Use StratifiedShuffleSplit to split the training data into
    #    training_new and test_new (the latter validates the model).
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33333, random_state=0)
    train_index, test_index = next(sss.split(x_train, y_train))
    print("TRAIN:", train_index, "TEST:", test_index)
    x_train_new, x_test_new = x_train[train_index], x_train[test_index]
    y_train_new, y_test_new = y_train[train_index], y_train[test_index]

    y_train = y_train_new
    x_train = x_train_new

    # 3. Use Imputer to replace NaN with the column mean; the means come
    #    from the training rows only and are applied to all three sets.
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    imp.fit(x_train)
    x_train = imp.transform(x_train)
    x_test_new = imp.transform(x_test_new)
    x_test = imp.transform(x_test)

    if not os.path.isfile("rfc_model.m"):
        clf = RandomForestClassifier(n_estimators=100,
                                     oob_score=True,
                                     min_samples_split=2,
                                     min_samples_leaf=50,
                                     n_jobs=-1,
                                     class_weight='balanced_subsample',
                                     bootstrap=True)

        # Fitted on its own so feature importances can be printed below.
        clf.fit(x_train, y_train)
        param_grid = {"max_features": [2, 3, 4], "min_samples_leaf": [50]}
        grid_search = GridSearchCV(clf,
                                   cv=10,
                                   scoring='roc_auc',
                                   param_grid=param_grid,
                                   iid=False,
                                   n_jobs=-1)
        # Search for, store and report the best model.
        grid_search.fit(x_train, y_train)
        joblib.dump(grid_search, "rfc_model.m")
        print("the best parameter:", grid_search.best_params_)
        print("the best score:", grid_search.best_score_)
        predicted_probs_train = grid_search.predict_proba(x_train)
        # Keep the probability of the second class only.
        predicted_probs_train = [p[1] for p in predicted_probs_train]
        computeAUC(y_train, predicted_probs_train)
        print(
            sorted(zip(map(lambda x: round(x, 4), clf.feature_importances_),
                       dftrain.columns),
                   reverse=True))
    else:
        clf = joblib.load("rfc_model.m")
        predicted_probs_test_new = clf.predict_proba(x_test_new)
        predicted_probs_test_new = [p[1] for p in predicted_probs_test_new]
        computeAUC(y_test_new, predicted_probs_test_new)
        # NOTE(review): the stored model is refitted on the held-out rows
        # and overwritten on every run - confirm this is intended.
        clf.fit(x_test_new, y_test_new)
        joblib.dump(clf, "rfc_model.m")
Exemple #53
0
#    def countTokens(tokens):
#        return tokens.count(word)
#    data[word] = data['tokens'].apply(countTokens)


#data.drop("tokens", axis = 1, inplace = True)
print('counting tokens by file')
data['tok_array'] = data['tokens'].apply(createTokenArray)

print('saving data')
data.to_csv('data.csv', sep=',', encoding='utf-8')

# The raw tokens are saved above but not needed from here on.
data.drop("tokens", axis = 1, inplace = True)

# One stratified 80/20 split on the "type" column.  split() yields row
# positions, so the rows must be taken with .iloc; .loc only gives the same
# rows when the index happens to be 0..n-1.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(data, data["type"]))
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]

def type_proportions(data):
    """Return the share of rows of *data* that fall in each "type" value."""
    return data["type"].value_counts() / len(data)

# Class proportions overall and in both subsets, with the relative error (in
# percent) of each subset against the overall proportions.
compare_props = pd.DataFrame({
    "Overall": type_proportions(data),
    "Stratified": type_proportions(strat_train_set),
    "Stratified-test": type_proportions(strat_test_set),
}).sort_index()
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100
compare_props["Strat. test %error"] = 100 * compare_props["Stratified-test"] / compare_props["Overall"] - 100

compare_props
Exemple #54
0
def lab():
    """Show the prediction form and, on a valid submit, the model's answer.

    The submitted values are preprocessed with a scaler / one-hot pipeline
    fitted on the stratified training part of ``datasets/sanbul-5.csv`` and
    sent to the hosted model through the Google ML API.

    Returns the rendered ``result.html`` with the first prediction, or the
    rendered ``prediction.html`` form when nothing valid was submitted.

    Raises
    ------
    RuntimeError : if the prediction response contains an "error" entry.
    """
    form = LabForm()

    if form.validate_on_submit():
        # Build the query row as a DataFrame directly.  Going through one
        # mixed float/str numpy array would turn every value into a string.
        columns = [
            'latitude', 'longitude', 'month', 'day',
            'avg_temp', 'max_temp', 'max_wind_speed',
            'avg_wind'
        ]
        X_test = pd.DataFrame(
            {
                'latitude': [float(form.latitude.data)],
                'longitude': [float(form.longitude.data)],
                'month': [str(form.month.data)],
                'day': [str(form.day.data)],
                'avg_temp': [float(form.avg.data)],
                'max_temp': [float(form.max.data)],
                'max_wind_speed': [float(form.wind_s.data)],
                'avg_wind': [float(form.wind_avg.data)],
            },
            columns=columns)
        print(X_test.shape)
        fires = pd.read_csv('datasets/sanbul-5.csv', sep=',')
        print(X_test)

        # Stratified 80/20 split on month; only the training part is used to
        # fit the preprocessing.  split() yields row positions, hence .iloc.
        from sklearn.model_selection import StratifiedShuffleSplit
        split = StratifiedShuffleSplit(n_splits=1,
                                       test_size=0.2,
                                       random_state=42)
        train_index, _ = next(split.split(fires, fires["month"]))
        strat_train_set = fires.iloc[train_index]

        fires = strat_train_set.drop(["burned_area"],
                                     axis=1)  # drop labels for training set
        fires_num = fires.drop(["month", "day"], axis=1)

        from sklearn.compose import ColumnTransformer
        from sklearn.pipeline import Pipeline
        from sklearn.preprocessing import OneHotEncoder
        from sklearn.preprocessing import StandardScaler

        # Numeric columns are standardised, month and day are one-hot encoded.
        num_pipeline = Pipeline([
            ('std_scaler', StandardScaler()),
        ])
        num_attribs = list(fires_num)
        cat_attribs = ["month", "day"]
        full_pipeline = ColumnTransformer([
            ("num", num_pipeline, num_attribs),
            ("cat", OneHotEncoder(), cat_attribs),
        ])
        full_pipeline.fit(fires)
        X_test = full_pipeline.transform(X_test)

        MODEL_NAME = "my_sanbul_model"
        os.environ[
            "GOOGLE_APPLICATION_CREDENTIALS"] = "term-224506-9bc8286b5d7b.json"
        project_id = 'term-224506'
        model_id = MODEL_NAME
        model_path = "projects/{}/models/{}".format(project_id, model_id)
        model_path += "/versions/v0001/"
        ml_resource = googleapiclient.discovery.build("ml", "v1").projects()

        input_data_json = {
            "signature_name": "serving_default",
            "instances": X_test.tolist()
        }
        request = ml_resource.predict(name=model_path, body=input_data_json)
        response = request.execute()
        print("\nresponse:\n", response)

        if "error" in response:
            raise RuntimeError(response["error"])

        # Each prediction carries its output under the 'dense_1' key.
        predD = np.array([pred['dense_1'] for pred in response["predictions"]])
        print(predD[0][0])
        res = predD[0][0]
        return render_template('result.html', res=res)

    return render_template('prediction.html', form=form)
def run(argv=None):
    """Run the feature selection using ANOVA on the chosen task.

    Parses ``argv`` with the module-level ``parser``, keeps the rows picked
    by split number ``T`` of a shuffle split (``test_size=2/3``, seeded with
    ``RS``) of the task target, computes one p-value per feature, or per
    one-hot column of a categorical feature, with ``f_classif`` or
    ``f_regression``, and writes the kept row labels and the p-values as CSV
    files under ``pvals/<task tag>/``.
    """
    args = parser.parse_args(argv)

    print('Retrieving task')
    RS = int(args.RS)
    T = int(args.T)
    TMAX = int(args.TMAX)
    print(f'RS {RS} T {T} TMAX {TMAX}')
    task_name = args.task_name
    task = tasks.get(task_name, n_top_pvals=None)

    temp_dir = f'selected/{task.meta.tag}/temp/'

    print('Retreiving db')
    db = dbs[task.meta.db]

    print('Retrieving y')
    y = task.y
    print(f'y loaded with shape {y.shape}')

    if task.is_classif():
        logger.info('Classification, using f_classif')
        f_callable = f_classif
        ss = StratifiedShuffleSplit(n_splits=TMAX,
                                    test_size=2 / 3,
                                    random_state=RS)
    else:
        logger.info('Regression, using f_regression')
        f_callable = f_regression
        ss = ShuffleSplit(n_splits=TMAX, test_size=2 / 3, random_state=RS)

    index = y.index

    assert T >= 0

    # Alter the task to select only 1/3 for selection.
    # Advance to split number T; the earlier splits are discarded.
    split_iter = ss.split(y, y)
    for _ in range(T + 1):
        keep_idx, drop_idx = next(split_iter)

    # Convert the positions given by the splitter to index labels
    keep_index = [index[i] for i in keep_idx]
    drop_index = [index[i] for i in drop_idx]

    def select_idx(df):
        """Define the idx to keep from the database."""
        return df.drop(drop_index, axis=0)

    task.meta.idx_selection = Transform(
        input_features=[],
        transform=select_idx,
    )

    series = pd.Series(keep_index)
    dump_path = f'pvals/{task.meta.tag}/RS{RS}-T{T}-used_idx.csv'
    os.makedirs(os.path.dirname(dump_path), exist_ok=True)
    series.to_csv(dump_path, header=None, index=False)
    print(f'Idx used of shape {series.size}')

    # Ignore existing pvals selection
    task.meta.select = None
    task.meta.encode_select = 'ordinal'

    # Force reload y to take into account previous change
    task._load_y()
    y = task.y
    print(f'y reloaded with shape {y.shape}')

    index = y.index

    temp_df_transposed_path = temp_dir + f'RS{RS}-T{T}-X_transposed.csv'

    print('Retrieving X')
    X = task.X
    print(f'X loaded with shape {X.shape}')

    os.makedirs(temp_dir, exist_ok=True)

    # Little trick here, to iterate efficiently over the features, data is
    # transposed so that features are now in the place of rows.
    # This is useful because it is less memory expensive to iterate over
    # rows than features (rows are loaded on the fly from the file).
    # Particularly useful with big datasets that don't fit in memory.
    X_t = X.transpose()
    X_t.to_csv(temp_df_transposed_path, quoting=csv.QUOTE_ALL)

    # Here we create an iterator over the rows (features, since its transposed)
    # Data is loaded row by row (since chunksize=1) when the iterator is called
    X_t = pd.read_csv(temp_df_transposed_path,
                      iterator=True,
                      chunksize=1,
                      index_col=0)

    # Load types
    print('Loading types')
    db._load_feature_types(task.meta)
    types = db.feature_types[task.meta.tag]

    def pval_one_feature(x, y):
        """Return the p-value of one feature column against y, or None.

        ``x`` must be positional (one value per row of ``y``, same order).
        None is returned when fewer than 1% of the rows have a value.
        """
        # Drop the rows where the feature is missing, from both x and y
        x = pd.Series(x, index=index)
        x.replace(to_replace='', value=np.nan, inplace=True)
        x = x.astype(float)
        idx_to_drop = set(x.index[x.isna()])
        x = x.drop(idx_to_drop, axis=0)
        y_dropped = y.drop(idx_to_drop, axis=0)

        x = x.to_numpy().reshape(-1, 1)
        y_dropped = y_dropped.to_numpy().reshape(-1)

        assert x.shape[0] == y_dropped.shape[0]

        if x.shape[0] < 0.01 * index.size:  # Not enough sample, skipping
            return None

        _, pval = f_callable(x, y_dropped)

        # Keep only 6 significant digits (not the same as keeping 6 digits)
        # eg 1.23456789e-10 -> 1.234567e-10
        return float(f'{pval[0]:.6g}')

    def handler(row, y):
        """Return the (column name, p-value) pairs of one feature row.

        Returns None for the empty feature name and for ignored types.
        """
        name = row.index[0]
        x = np.squeeze(np.transpose(row.to_numpy()))
        print(name)

        if name == '':
            return

        t = types[name]

        if t == CATEGORICAL or t == BINARY:
            # categorical encode
            df = pd.DataFrame({name: x})
            # Cast to str to prevent: "argument must be a string or number"
            # error which occurs when mixed types floats and str
            df = df.astype(str)
            df.replace(to_replace='', value=np.nan, inplace=True)

            enc = OneHotEncoder(sparse=False)

            # Fill missing values with a placeholder
            df.fillna('MISSING_VALUE', inplace=True)

            # Fit transform the encoder
            data_encoded = enc.fit_transform(df)

            feature_names = list(enc.get_feature_names(list(df.columns)))

            df_encoded = pd.DataFrame(data_encoded,
                                      index=df.index,
                                      columns=feature_names)
            L = []
            for f in df_encoded:
                print(f'\t{f}')
                # Hand over plain arrays: a Series carrying df's 0..n-1
                # index would be re-aligned by label against y's index
                # inside pval_one_feature instead of being used by position.
                L.append((f, pval_one_feature(df_encoded[f].to_numpy(), y)))
            return L

        elif t == CONTINUE_R or t == CONTINUE_I or t == ORDINAL:
            return [(name, pval_one_feature(x, y))]

        print(f'"{name}" ignored ')

    res = Parallel(n_jobs=-1, require='sharedmem')(delayed(handler)(row, y)
                                                   for row in X_t)

    # Flatten the per-feature lists, skipping features that returned None.
    res = [pair for r in res if r is not None for pair in r]
    print(res)

    names, pvals = zip(*res)

    pvals = pd.Series(pvals, index=names)
    print(pvals)
    dump_path = f'pvals/{task.meta.tag}/RS{RS}-T{T}-pvals.csv'
    os.makedirs(os.path.dirname(dump_path), exist_ok=True)
    pvals.to_csv(dump_path, header=False)
Exemple #56
0
        # Every 10th step, report accuracy on the training data with
        # keep_prob set to 1.0.
        if step % 10 == 0:
            train_accuracy = accuracy.eval(feed_dict={
                x: X_train,
                y_: y_train,
                keep_prob: 1.0
            })
            print("step %d, training accuracy %g" % (step, train_accuracy))
        # One training step on the whole training set with keep_prob 0.5.
        train_step.run(feed_dict={x: X_train, y_: y_train, keep_prob: 0.5})
    # Final score; the message says "test" but the data fed in is the
    # validation split (X_valid / y_valid).
    print("test accuracy:%g" % accuracy.eval(feed_dict={
        x: X_valid,
        y_: y_valid,
        keep_prob: 1.0
    }))


if __name__ == '__main__':
    # Silence TensorFlow's native log output, e.g. the
    # "Allocation of ... exceeds 10% of system memory" messages.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

    train, labels, test, classes = get_data()
    # Standardise every feature to zero mean and unit variance.
    train_scaled = StandardScaler().fit_transform(train.values)
    # n_splits is left at its default; the loop overwrites the variables on
    # every pass, so the last generated split is the one that is used.
    sss = StratifiedShuffleSplit(test_size=0.1, random_state=23)
    for train_index, valid_index in sss.split(train_scaled, labels):
        X_train, X_valid = train_scaled[train_index], train_scaled[valid_index]
        y_train, y_valid = labels[train_index], labels[valid_index]
    # Encode both label sets with the encoder fitted on the training labels.
    # A second encoder fitted on the validation labels alone could produce a
    # different column layout if a class were missing from that subset.
    OneHot = OneHotEncoder().fit(y_train.reshape(-1, 1))
    y_train = OneHot.transform(y_train.reshape(-1, 1)).toarray()
    y_valid = OneHot.transform(y_valid.reshape(-1, 1)).toarray()
    main()
Exemple #57
0
    print("Number of samples: " + str(nb_samples))

    # Features are the first element of every (features, ...) pair in
    # ds.data; the targets are passed through to_categorical.
    x = np.array([x for x, _ in ds.data])
    y = np.array(ds.targets)
    y = to_categorical(y)

    if speaker_independence:
        # Use the train/test partitions that come with the dataset, one fold
        # per entry of ds.test_sets.
        k_folds = len(ds.test_sets)
        splits = zip(ds.train_sets, ds.test_sets)
        print("Using speaker independence %s-fold cross validation" % k_folds)
    else:
        # Five stratified random 80/20 splits with a fixed seed.
        # NOTE(review): y has already been through to_categorical when it
        # is given to split() - confirm the stratification is as intended.
        k_folds = 5
        sss = StratifiedShuffleSplit(n_splits=k_folds,
                                     test_size=0.2,
                                     random_state=1)
        splits = sss.split(x, y)
        print("Using %s-fold cross validation by StratifiedShuffleSplit" %
              k_folds)

    # One entry per fold is presumably collected here - the code that fills
    # it is not visible in this excerpt.
    cvscores = []
    for (train, test) in splits:
        # create network
        model = networks.create_softmax_la_network(
            input_shape=(globalvars.max_len, globalvars.nb_features),
            nb_classes=nb_classes)

        # compile the model
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
import numpy as np

# # StratifiedShuffleSplit
# * Arguments:
#     1. n_splits: number of splits (how many times the data is split)
#     2. test_size: share of the whole data set that becomes test data
#     3. random_state: value used to initialise the random numbers
# ## StratifiedShuffleSplit.split
# * Arguments
#     1. the data to split
#     2. the group of each datum (group proportions are kept when splitting)
# * Return value
#     * an object similar to what Java calls an Iterator
#     * the values have to be read with a for loop (other ways may exist?)

# In[8]:

from sklearn.model_selection import StratifiedShuffleSplit

# In[23]:

# Four samples in two groups of two.
data = np.array(['A', 'B', 'a', 'b'])
group = np.array([0, 0, 1, 1])
sss = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.5,
    random_state=0,
)

# In[24]:

# Print, for every split: train indices, train data, test indices, test data.
for train_index, test_index in sss.split(data, group):
    print('%s %s %s %s' % (
        train_index,
        data[train_index],
        test_index,
        data[test_index],
    ))
'''
# import matplotlib.pyplot as plt
# housing_df.hist(bins=50, figsize=(20,20))
# plt.show()
'''
    split train & test
'''
import numpy as np
# Bucket the income into coarse categories used only for stratification.
housing_df["income_cat"] = np.ceil(housing_df["median_income_value"] / 1.5)
# Assign the result back to the column.  An in-place where() on the column
# selection is chained assignment and may not reach the frame at all.
# NOTE(review): values that are not below 0.5 are replaced by 0.5, which
# collapses almost every category produced by ceil() - confirm the threshold.
housing_df["income_cat"] = housing_df["income_cat"].where(
    cond=housing_df["income_cat"] < 0.5,
    other=0.5)

from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() yields row positions, so rows are taken with .iloc; .loc would
# only match when the index is 0..n-1.
for train_indices, test_indices in split.split(X=housing_df,
                                               y=housing_df["income_cat"]):
    train_set = housing_df.iloc[train_indices]
    test_set = housing_df.iloc[test_indices]

# Features only: drop the house-value column from both subsets.
housing_train = train_set.drop("median_house_value", axis=1)
housing_test = test_set.drop("median_house_value", axis=1)
'''
    selector for number and text attributes
'''
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin


class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
Exemple #60
0
def get_cv(X, y):
    """Return the iterator of (train, test) index arrays for X and y.

    Eight stratified shuffle splits, each with 20% of the samples in the
    test part, generated with a fixed seed of 57.
    """
    splitter = StratifiedShuffleSplit(
        n_splits=8,
        test_size=0.2,
        random_state=57,
    )
    return splitter.split(X, y)