Example #1
0
def split_patients(patients, valid_percent, test_percent, rng=(2014, 10, 22)):
    """Split a patient mapping into stratified train/validation/test dicts.

    The mapping's values are used as the stratification labels. The test
    set is carved out first; the validation set is then taken from what
    remains.

    Parameters
    ----------
    patients : dict
        Mapping whose values serve as the class labels for stratification.
    valid_percent : float
        Share of *all* patients to put in the validation set. When it is
        not positive, the validation dict is empty.
    test_percent : float
        Passed as ``test_size`` to the first stratified split.
    rng : list, tuple or random state
        A list/tuple is converted with ``make_np_rng``; anything else is
        handed to the splitter unchanged as ``random_state``.

    Returns
    -------
    tuple of dict
        ``(train_patients, valid_patients, test_patients)``.
    """
    if isinstance(rng, (list, tuple)):
        rng = make_np_rng(None, rng, which_method='uniform')

    # Materialise the dict views first: on Python 3 ``np.asarray`` of a
    # dict view is a 0-d object array that cannot be fancy-indexed.
    keys = np.asarray(list(patients.keys()))
    vals = np.asarray(list(patients.values()))

    sss = StratifiedShuffleSplit(
        vals, n_iter=1, test_size=test_percent, random_state=rng)
    # next(iter(...)) works on Python 2 and 3, unlike ``.next()``.
    remaining_idx, test_idx = next(iter(sss))

    if valid_percent > 0:
        # Rate of samples required to build validation set, expressed
        # relative to what is left once the test set has been removed.
        valid_rate = valid_percent / (1.0 - test_percent)

        sss = StratifiedShuffleSplit(
            vals[remaining_idx], n_iter=1, test_size=valid_rate,
            random_state=rng)
        tr_idx, val_idx = next(iter(sss))
        # Map positions inside the remainder back to positions in ``vals``.
        valid_idx = remaining_idx[val_idx]
        train_idx = remaining_idx[tr_idx]
    else:
        train_idx = remaining_idx
        valid_idx = np.array([], dtype=int)

    train_patients = dict(zip(keys[train_idx], vals[train_idx]))
    valid_patients = dict(zip(keys[valid_idx], vals[valid_idx]))
    test_patients = dict(zip(keys[test_idx], vals[test_idx]))
    return train_patients, valid_patients, test_patients
Example #2
0
def simple_classification(n_samples=100, n_features=10, random_state=33):
    """
    Build a synthetic classification dataset and split it once.

    The data comes from ``datasets.make_classification`` and is divided
    with a single stratified shuffle split using ``train_size=0.6``; the
    test-set size is left at the splitter's default.

    Parameters
    ----------
    n_samples : int
        Number of samples to generate.
    n_features : int
        Number of features per sample.
    random_state : int
        Seed used both for data generation and for the split.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``: train inputs, test inputs,
        train targets and test targets, respectively.
    """
    inputs, targets = datasets.make_classification(
        n_samples=n_samples,
        n_features=n_features,
        random_state=random_state
    )
    splitter = StratifiedShuffleSplit(
        targets, 1, train_size=0.6, random_state=random_state)

    # Only one iteration was requested, so take the first (and only) split.
    train_rows, test_rows = next(iter(splitter))

    return (inputs[train_rows], inputs[test_rows],
            targets[train_rows], targets[test_rows])
Example #3
0
    def setUp(self):
        """Create a synthetic dataset and one stratified train/test split.

        Stores the full data on ``self.X`` / ``self.y`` and the split as
        ``self.data = (x_train, x_test, y_train, y_test)``.
        """
        super(QuasiNewtonTestCase, self).setUp()

        features, labels = datasets.make_classification(
            n_samples=100, n_features=10, random_state=33)
        splitter = StratifiedShuffleSplit(
            labels, 1, train_size=0.6, random_state=33)

        # A single iteration was requested; take that one split.
        train_rows, test_rows = next(iter(splitter))

        self.X = features
        self.y = labels
        self.data = (
            features[train_rows], features[test_rows],
            labels[train_rows], labels[test_rows],
        )
Example #4
0
def get_rows_msr(data):
    """Build shuffled train, dev and test row lists from ``data``.

    ``data[1]`` is indexed by the ids in ``data[3]`` (training rows) and
    ``data[2]`` (test rows). The training rows are shuffled and then split
    80/20 into train and dev with a stratified shuffle split on the first
    element of each row.

    Returns
    -------
    tuple of list
        ``(train_rows, dev_rows, test_rows)``.
    """
    # The returned config is not used here; the call is kept in case
    # get_config() has side effects -- TODO confirm and drop if it has none.
    get_config()
    rng = get_rng()
    train = [data[1][idx] for idx in data[3]]
    test = [data[1][idx] for idx in data[2]]
    # test = [data[1][idx] for idx in data[4]]
    shuffle(train, rng.rand)
    # Each training row unpacks to three fields; the first is the label.
    train_y = [y for y, _, _ in train]
    # build dev set
    sss = StratifiedShuffleSplit(
        train_y, 1, train_size=0.8, test_size=0.2, random_state=rng)
    # next(iter(...)) works on Python 2 and 3, unlike ``.next()``.
    train_index, dev_index = next(iter(sss))
    return [train[i] for i in train_index], [train[i] for i in dev_index], test
def test_classifier(clf, dataset, feature_list, folds=1000):
    """Cross-validate ``clf`` and print aggregate binary-classification metrics.

    The dataset is converted with ``featureFormat`` / ``targetFeatureSplit``
    and evaluated over ``folds`` stratified shuffle splits (seeded with
    ``random_state=42``). Predictions from every fold are pooled into one
    confusion matrix, from which accuracy, precision, recall, F1 and F2 are
    computed and printed.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(features, labels)`` and ``predict(features)``.
    dataset : object
        Passed to ``featureFormat`` together with ``feature_list``.
    feature_list : list
        Feature names handed to ``featureFormat``.
    folds : int
        Number of shuffle-split iterations.

    Returns
    -------
    None
        Results are only printed. If a metric's denominator is zero, a
        diagnostic message is printed instead of the metrics.
    """
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state=42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Only stops scoring the current fold; later folds still run.
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (2 * true_positives + false_positives +
                                     false_negatives)
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        # Catch only the error the message describes; a bare ``except``
        # would also hide unrelated failures (e.g. a NameError) behind it.
        print("Got a divide by zero when trying out:", clf)
        print(
            "Precision or recall may be undefined due to a lack of true positive predicitons."
        )
    else:
        print(clf)
        print(
            PERF_FORMAT_STRING.format(accuracy,
                                      precision,
                                      recall,
                                      f1,
                                      f2,
                                      display_precision=5))
        print(
            RESULTS_FORMAT_STRING.format(total_predictions, true_positives,
                                         false_positives, false_negatives,
                                         true_negatives))
        print("")
Example #6
0
                       'criterion': ['gini', 'entropy'],
                       'splitter': ['best', 'random'],
                       'max_features': ['sqrt', 'auto', 'log2'],
                       'presort': ['True', 'False'],
                       'random_state': [None, 1, 2, 3, 4, 5, 6, 7, 8, 9]
                   },
                   cv=cv,
                   scoring='f1')
# Run the grid search held in ``dtc`` (its construction ends just above)
# and report the best estimator, parameters and cross-validated score.
dtc.fit(features, labels)
print 'DecisionTreeClassifier best estimator: ', dtc.best_estimator_
print 'DecisionTreeClassifier best parameters: ', dtc.best_params_
print 'DecisionTreeClassifier best score: ', dtc.best_score_
print '\n'

# Tune K Nearest Neighbors
# Exhaustive grid search over neighbour count, distance metric and weighting,
# scored by F1 on 10 stratified shuffle splits (seeded for repeatability).
cv = StratifiedShuffleSplit(labels, 10, random_state=42)
knn = GridSearchCV(KNeighborsClassifier(),
                   param_grid={
                       'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                       'metric': ['manhattan', 'minkowski', 'euclidean'],
                       'weights': ['distance', 'uniform']
                   },
                   cv=cv,
                   scoring='f1')
knn.fit(features, labels)
# Report the winning configuration and its cross-validated score.
print 'K Nearest Neighbors best estimator: ', knn.best_estimator_
print 'K Nearest Neighbors best parameters: ', knn.best_params_
print 'K Nearest Neighbors best score: ', knn.best_score_
# tester.test_classifier(knn.best_estimator_, my_dataset, best_features)

# Pipeline
Example #7
0
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

# Provided to give you a starting point. Try a variety of classifiers.
# from sklearn.cross_validation import train_test_split
# features_train, features_test, labels_train, labels_test = \
#     train_test_split(features, labels, test_size=0.3, random_state=42)

# NOTE(review): RandomForestClassifier is not used in the lines visible
# here -- confirm whether it is needed further down.
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedShuffleSplit

# 100 stratified shuffle splits, each holding out 40% of the samples.
cv = StratifiedShuffleSplit(labels, n_iter=100, test_size=0.4, random_state=45)
# Scale features before the k-nearest-neighbours step, inside one pipeline
# so the scaler is fitted as part of each cross-validation fit.
steps = [('scale', MinMaxScaler()), ('knearest', KNeighborsClassifier())]
clf = Pipeline(steps)

# The '<step>__<param>' key addresses n_neighbors of the 'knearest' step.
param_grid = {'knearest__n_neighbors': [1, 3, 5, 10]}
grid = GridSearchCV(clf, param_grid, verbose=True, cv=cv, scoring='f1')
grid.fit(features, labels)
print "best estimator:", grid.best_estimator_
print "best score:", grid.best_score_
# From here on ``clf`` is the best pipeline found by the search.
clf = grid.best_estimator_

### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!

#clf.fit(features_train,labels_train)
            ]).mean(axis=0))
    data_pln = []
    for j in range(len(pln_all)):
        tmp = pln_all[j][band]
        data_pln.append(
            np.asarray([
                bct.centrality.pagerank_centrality(g, d=0.85) for g in tmp
            ]).mean(axis=0))

    data_cls = np.asarray(data_cls)
    data_pln = np.asarray(data_pln)

    X = np.vstack([data_cls, data_pln])
    y = np.concatenate([np.zeros(len(data_cls)), np.ones(len(data_pln))])

    cv = StratifiedShuffleSplit(y, test_size=0.1)

    cv_params = {
        "learning_rate": np.arange(0.1, 1.1, 0.1),
        "max_depth": [1, 2, 3, 4, 5, 6, 7]
    }

    grid = GridSearchCV(xgb.XGBClassifier(n_estimators=500),
                        cv_params,
                        scoring='accuracy',
                        cv=cv,
                        n_jobs=1,
                        verbose=1)
    grid.fit(X, y)
    xgb_cv = grid.best_estimator_
Example #9
0
def sp_one_level(base_dir, data_path='data.csv', seed=123456789):
    """Run one SP-region + linear-SVM experiment on a categorical CSV file.

    The last CSV column is used as the target and all other columns as
    features. Each feature column gets its own ``CategoryEncoder``; the
    encoded bits are fed to an ``SPRegion`` whose classifier is a
    ``LinearSVC``. Scores over ``nsplits`` stratified shuffle splits are
    printed, and their median is appended to ``details.csv``.

    Parameters
    ----------
    base_dir : str
        Directory in which a new timestamped results directory is created.
    data_path : str
        Path of the CSV file to load.
    seed : int
        Seed used for the SP, the SVM and the data splits.

    Returns
    -------
    None
        Results are printed and written under the new directory
        (``input.pkl`` and ``details.csv``).
    """
    # Make a new directory
    new_dir = os.path.join(base_dir, time.strftime(
                            '%Y%m%d-%H%M%S', time.localtime()))
    os.makedirs(new_dir)

    # Params
    nsplits = 8
    pct_train = 0.8

    # Get data
    data = pd.read_csv(data_path)
    # Positional selection via .iloc/.values: the former .ix/.as_matrix()
    # spelling was removed from pandas. read_csv yields string column
    # labels here, so .ix was already acting positionally.
    x = data.iloc[:, :-1].values
    y = data.iloc[:, -1].values
    x, y = convert_data_to_int(x, y)

    # Create the encoder: one category encoder per feature column
    num_bits_per_encoder = 50
    category_encoders = [
        CategoryEncoder(
            num_categories=len(set(xi)),
            num_bits=num_bits_per_encoder
            ) for xi in x.T]
    total_bits = num_bits_per_encoder*len(category_encoders)
    encoder = MultiEncoder(*category_encoders)

    # Build the config for the SP
    ncolumns = 4096
    nactive = int(ncolumns * 0.20)
    nsynapses = 25
    seg_th = 0
    sp_config = {
        'ninputs': total_bits,
        'ncolumns': ncolumns,
        'nactive': nactive,
        'global_inhibition': True,
        'trim': 1e-4,
        'disable_boost': True,
        'seed': seed,

        'nsynapses': nsynapses,
        'seg_th': seg_th,

        'syn_th': 0.5,
        'pinc': 0.001,
        'pdec': 0.001,
        'pwindow': 0.5,
        'random_permanence': True,

        'nepochs': 1,
        'log_dir': os.path.join(new_dir, '1-1'),
        'clf': LinearSVC(random_state=seed)}

    # Encode all of the data, one row at a time
    new_x = np.zeros((len(x), total_bits), dtype='bool')
    for i, row in enumerate(x):
        # Each value is bound together with the index of its column.
        encoder.bind_data([(value, j) for j, value in enumerate(row)])
        new_x[i] = np.array(list(encoder.encode()), dtype='bool')

    # Dump the data and the details
    with open(os.path.join(new_dir, 'input.pkl'), 'wb') as f:
        pickle.dump((new_x, y), f, pickle.HIGHEST_PROTOCOL)
    with open(os.path.join(new_dir, 'details.csv'), 'w') as f:
        writer = csv.writer(f)
        category_encoder_details = [[
            'Category {0}: Num bits: {1}'.format(i, c.num_bits),
            'Category {0}: Active bits: {1}'.format(i, c.active_bits),
            'Category {0}: Num categories: {1}'.format(i, c.num_categories)]
            for i, c in enumerate(category_encoders)]
        writer.writerows(category_encoder_details)
        writer.writerow(['Num splits', nsplits])
        writer.writerow(['% train', pct_train])
        writer.writerow(['Seed', seed])

    # Run the experiment: one job per split, each with a fresh SPRegion
    sss = StratifiedShuffleSplit(y, n_iter=nsplits, train_size=pct_train,
                                 random_state=seed)
    results = Parallel(n_jobs=-1)(delayed(train_score_clf)(
        SPRegion(**sp_config), new_x[tr], new_x[te], y[tr], y[te])
        for tr, te in sss)
    pct_accuracy = np.median(results)
    print(['{0:.3f}'.format(r) for r in results])
    print('SP + Linear SVM: {0:.3f} %'.format(pct_accuracy))
    # Append the summary so the details written above are preserved.
    with open(os.path.join(new_dir, 'details.csv'), 'a') as f:
        writer = csv.writer(f)
        writer.writerow(['% Accuracy', pct_accuracy])
Example #10
0
import tensorflow as tf
import numpy as np
from sklearn.cross_validation import StratifiedShuffleSplit
from tensorflow.examples.tutorials.mnist import input_data
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt


# Load MNIST with one-hot encoded labels.
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

# Convert each one-hot label row back to its class index so the labels can
# be used for stratification.
label_numbers = [np.where(x==1)[0][0] for x in mnist.train.labels]
# One stratified split; test_size=5000 requests the held-out side by size.
sss = StratifiedShuffleSplit(y=label_numbers, n_iter=1, test_size=5000, random_state=0)
   
# n_iter=1, so this loop body runs once; only the "test" side of the split
# is kept, as a small stratified training subset.
for large_split_indices, small_split_indices in sss:
    small_split_data = mnist.train.images[small_split_indices]
    small_split_labels = mnist.train.labels[small_split_indices]

train_dataset = small_split_data
train_labels = small_split_labels
valid_dataset = mnist.validation.images
valid_labels = mnist.validation.labels
test_dataset = mnist.test.images
test_labels = mnist.test.labels

# We'll be using all of the small training subset at each step, to get smoother gradients.
train_subset = len(train_dataset)

# This is our lambda parameter for regularization.
y = .01
Example #11
0
# Feature list with the 'poi' label first, followed by new_features_list.
features_list_without_create_feature = ['poi'] + new_features_list

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

from time import time
from tester import test_classifier
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
# 100 stratified shuffle splits, each holding out 30% of the samples.
sss = StratifiedShuffleSplit(labels, 100, test_size=0.3, random_state=42)
# Decision-tree hyper-parameters explored by the grid search.
parameters = {
    'max_features': [2, 4, 6, 7, 8],
    'max_depth': [1, 3, 5],
    'min_samples_split': [2, 4, 6]
}
# NOTE(review): the only visible definition of ``dt`` is the commented-out
# line below; unless ``dt`` is defined earlier in the file, the
# GridSearchCV(...) call raises NameError -- confirm.
#dt = DecisionTreeClassifier()
t0 = time()
grid_obj = GridSearchCV(dt, parameters, scoring='f1', cv=sss)
print "======== Decision Tree (Optimized) ========"
# This timing only covers constructing the GridSearchCV object above.
print("DecisionTree tuning: %r" % round(time() - t0, 3))
# TODO: Fit the grid search object to the training data and find the optimal parameters
t0 = time()
grid_obj = grid_obj.fit(features, labels)
print("DecisionTree fitting: %r" % round(time() - t0, 3))
# Get the estimator
Example #12
0
# The last column of ``data`` holds the target values.
y = data[:, -1]

X = np.delete(X, 7, 1)  # we are not using the hamming-distance-path feature

if USE_MEMMAP:
    # Copy X and y into disk-backed memory maps, then drop the in-memory
    # originals and rebind the names to the memmaps.
    Xmm = np.memmap('X.mmap', dtype=X.dtype, mode='w+', shape=X.shape)
    ymm = np.memmap('y.mmap', dtype=y.dtype, mode='w+', shape=y.shape)
    np.copyto(Xmm, X)
    np.copyto(ymm, y)
    del (data)
    del (X)
    del (y)
    X = Xmm
    y = ymm

# A single stratified split that holds out 20% of the samples.
cv = StratifiedShuffleSplit(y, 1, test_size=0.2, random_state=42)

# Parameter dict for xgboost (presumably passed to a training call that is
# outside the visible lines).
param = {
    'objective': 'binary:logistic',
    'nthread': 1,
    'eval_metric': 'error',
    'silent': 1
}

# Only one split was requested, so this loop body runs once.
for train_index, test_index in cv:

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Wrap both partitions in xgboost's DMatrix container.
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
Example #13
0
# Pull the experiment settings off the parsed command line.
args = parser.parse_args()

inputFile = args.infile
exptID = args.eid
testSize = args.testSize
innerIter = 10
outerIter = args.iter
R = args.rank
gamma = args.gamma
alpha = args.alpha
seed = args.seed

X, axisDict, classDict = tensorIO.loadSingleTensor(inputFile)
# Materialise the dict view before handing it to NumPy: on Python 3
# ``np.array(dict.values(), dtype='int')`` fails because the view is not a
# sequence. ``list(...)`` is a no-op change on Python 2.
Y = np.array(list(classDict.values()), dtype='int')
# A single stratified train/test split driven by the CLI test size and seed.
ttss = StratifiedShuffleSplit(Y,
                              n_iter=1,
                              test_size=testSize,
                              random_state=seed)
predModel = LogisticRegression(C=1.0, penalty='l1', tol=1e-6)

# Experiment settings collected into one dict.
output = {
    "expt": exptID,
    "iters": outerIter,
    "inner": innerIter,
    "R": R,
    "gamma": gamma,
    "alpha": alpha,
    "seed": seed
}

for train, test in ttss:
    trainShape = list(X.shape)
Example #14
0
	X, ids = X[:, 1:].astype(np.float32), X[:, 0].astype(str)
	return X, ids
	
def make_submission(clf, X_test, ids, encoder, name='ert_calibrated3.csv'):
	"""Write class-probability predictions for ``X_test`` to a CSV file.

	Rows are labelled by ``ids`` (index column named ``id``), columns by
	``encoder.classes_``, and values are formatted with four decimals.
	"""
	submission = pd.DataFrame(
		clf.predict_proba(X_test),
		index=ids,
		columns=encoder.classes_)
	submission.to_csv(name, index_label='id', float_format='%.4f')
	print("Wrote submission to file {}.".format(name))

# Load the training and test data plus the label encoder.
X, y, encoder = load_train_data('train.csv')
X_test, ids = load_test_data('test.csv')
num_classes = len(encoder.classes_)
num_features = X.shape[1]

# NOTE(review): ``rng`` is not used in the lines visible here -- confirm.
rng = np.random.RandomState(2)
print("Otto: multiclass classification")
# Three stratified shuffle splits, each holding out 33% of the samples.
kf = StratifiedShuffleSplit(y,n_iter=3, test_size=0.33, random_state=12)
# Train a fresh XGBoost model on every split and print its log loss on the
# held-out part.
for train_index, test_index in kf:
	xgb_model = xgb.XGBClassifier(max_depth=12, 
									learning_rate=0.0057, 
									n_estimators=4000,
									objective="multi:softprob",
									nthread=-1,
									min_child_weight=5, 
									subsample=0.865, 
									colsample_bytree=0.55)
	xgb_model.fit(X[train_index],y[train_index])
	pred = xgb_model.predict_proba(X[test_index])
	score = log_loss(y[test_index], pred)
	print(score)
Example #15
0
    def fit(self, data, targets, sample_weight=None):
        """Train the network on ``data`` / ``targets`` and return ``self``.

        Sets ``classes_`` and ``n_classes_`` from the unique targets. When
        ``self.valid_ratio > 0`` a single stratified shuffle split holds out
        that share of the samples for validation; otherwise the validation
        arrays are empty. The arrays are wrapped in Theano shared variables,
        the model is built, and epochs from ``self.train`` are consumed
        until ``self.max_epochs`` is reached or the user interrupts.

        Parameters
        ----------
        data : array
            Input samples; indexed by row and used for ``shape[1]``.
        targets : array
            Class labels, one per sample.
        sample_weight : None
            Accepted but not used by this method.

        Returns
        -------
        self
        """
        self.classes_ = np.unique(targets)
        self.n_classes_ = self.classes_.shape[0]

        random_state = check_random_state(self.random_state)

        # Shuffle data and eventually split on train and validation sets
        if self.valid_ratio > 0:
            strat_shuffled_split = StratifiedShuffleSplit(
                targets,
                test_size=self.valid_ratio,
                n_iter=1,
                random_state=random_state)
            # Only one iteration was requested: take that split directly.
            train_index, valid_index = next(iter(strat_shuffled_split))
            X_train, y_train = data[train_index], targets[train_index]
            X_valid, y_valid = data[valid_index], targets[valid_index]
        else:
            X_train, y_train = data, targets
            X_valid, y_valid = np.array([]), np.array([])

        if self.verbose > 5:
            print('X_train: %s, y_train: %s' % (X_train.shape, y_train.shape))
            # NOTE(review): the split above is governed by ``valid_ratio``
            # while this check reads ``use_valid`` -- confirm both exist
            # and agree.
            if self.use_valid:
                print('X_valid: %s, y_valid: %s' % (X_valid.shape,
                                                    y_valid.shape))

        # Prepare theano variables
        dataset = dict(
            X_train=theano.shared(lasagne.utils.floatX(X_train)),
            y_train=T.cast(theano.shared(y_train), 'int32'),
            X_valid=theano.shared(lasagne.utils.floatX(X_valid)),
            y_valid=T.cast(theano.shared(y_valid), 'int32'),
            num_examples_train=X_train.shape[0],
            num_examples_valid=X_valid.shape[0],
            input_dim=X_train.shape[1],
            output_dim=self.n_classes_,
        )

        if self.verbose > 0:
            print("Building model and compiling functions...")
        output_layer = self.build_model(dataset['input_dim'])
        iter_funcs = self.create_iter_functions(dataset, output_layer)

        if self.verbose > 0:
            print("Starting training...")
        now = time.time()
        results = []
        # Stays None if self.train() yields nothing, so the summary below
        # cannot hit an unbound name.
        epoch = None
        try:
            for epoch in self.train(iter_funcs, dataset, output_layer):
                if self.verbose > 1:
                    print("Epoch {} of {} took {:.3f}s".format(
                        epoch['number'], self.max_epochs,
                        time.time() - now))
                now = time.time()
                results.append([
                    epoch['number'], epoch['train_loss'], epoch['valid_loss']
                ])
                if self.verbose > 1:
                    print("  training loss:\t\t{:.6f}".format(
                        epoch['train_loss']))
                    print("  validation loss:\t\t{:.6f}".format(
                        epoch['valid_loss']))
                    # str.format does not treat '%' specially, so a single
                    # '%' is correct here ('%%' printed two characters).
                    print("  validation accuracy:\t\t{:.2f} %".format(
                        epoch['valid_accuracy'] * 100))

                if epoch['number'] >= self.max_epochs:
                    break

            if self.verbose > 0 and epoch is not None:
                print('Minimum validation error: %f (epoch %d)' %
                      (epoch['best_val_error'], epoch['best_val_iter']))

        except KeyboardInterrupt:
            # Deliberate: Ctrl-C stops training early and keeps the model.
            pass

        return self
Example #16
0
                             max_depth=5,
                             min_samples_split=1,
                             min_samples_leaf=1,
                             max_features='auto',
                             bootstrap=False,
                             oob_score=False,
                             n_jobs=1,
                             random_state=seed,
                             verbose=0)

### Use grid search to find the best parameters
# The parameter grid is empty, so the search evaluates only the pipeline's
# current settings across the cross-validation splits.
param_grid = dict()
## Create the classification pipeline
pipeline = Pipeline([('clf', clf)])
grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=3,scoring='accuracy', \
                           cv=StratifiedShuffleSplit(Y_train, n_iter=10, test_size=0.2, train_size=None, indices=None, \
                                                     random_state=seed, n_iterations=None)).fit(X_train, Y_train)
# Score the results
print("Best score: %0.3f" % grid_search.best_score_)
print(grid_search.best_estimator_)
report(grid_search.grid_scores_)

print('-----grid search end------------')
print('on all train set')
# NOTE(review): the grid search above uses ``X_train`` / ``Y_train`` while
# this call uses ``x_train`` / ``y_train`` -- confirm these are meant to be
# different objects.
scores = cross_val_score(grid_search.best_estimator_,
                         x_train,
                         y_train,
                         cv=3,
                         scoring='accuracy')
print scores.mean(), scores
print('on test set')
scores = cross_val_score(grid_search.best_estimator_,
Example #17
0
def main(source, output, n_folds, n_folds_max, type_max_features, type_min_df, type_ngram, pos_max_features, pos_min_df,
         pos_ngram, pos_vec_type, sparse, feature_name_header):
    """
    Generates a good vs bad training dataset from Fuman user posts. (Binary Classification)

    Concatenates simple features from the database, hand crafted features based on various character and word counts,
    and bag-of-words features based on the token types as well as the part-of-speech tags of Fuman user posts.

    :param source: directory holding the input files (a regular file is rejected)
    :param output: existing output directory; results go under a timestamped ``gvsb-<timestamp>`` path inside it
    :param n_folds: each saved fold holds ``1 / n_folds`` of the samples, drawn with StratifiedShuffleSplit
    :param n_folds_max: stop after this many folds have been written; otherwise every split the splitter
        yields by default is written
    :param type_max_features: ``max_features`` of the token-type CountVectorizer
    :param type_min_df: ``min_df`` of the token-type CountVectorizer
    :param type_ngram: token-type vocabulary uses ngrams in range (1, type_ngram)
    :param pos_max_features: ``max_features`` of the POS vectorizer; POS features are skipped when falsy
    :param pos_min_df: ``min_df`` of the POS vectorizer
    :param pos_ngram: Learn POS vocabulary with ngrams in range (1, pos_ngram)
    :param pos_vec_type: key into ``VECTORIZERS`` selecting the POS term weighting
    :param sparse: forwarded to ``dump_csv`` and ``save_dataset_metadata``
    :param feature_name_header: forwarded to ``make_header``
    :raises ValueError: if ``output`` is not a directory or ``source`` is a regular file
    """
    if not os.path.isdir(output):
        raise ValueError("Output must be a directory")

    if os.path.isfile(source):
        raise ValueError("Source must be a directory")
    logging.info("Source dump: %s", source)
    timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H_%M_%S')
    logging.info("Timestamp: %s", timestamp)
    output_path = os.path.join(output, "gvsb-{}".format(timestamp))

    rant_stats_vectorizer = DictVectorizer()
    stats_pipeline = Pipeline([('stats', RantStats()),  # returns a list of dicts
                               ('vect', rant_stats_vectorizer)])  # list of dicts -> feature matrix
    type_vec = CountVectorizer(tokenizer=tokenize_token_type, ngram_range=(1, type_ngram), strip_accents='unicode',
                               min_df=type_min_df, max_features=type_max_features)
    transformer_list = [
        ('rant_stats', stats_pipeline),
        ("type_vec", type_vec),
    ]
    pos_vec = None
    pos_dict_filename = os.path.join(output_path, "pos-vocabulary-" + timestamp + ".json")
    if pos_max_features:
        # The third value logged is pos_min_df, so it is labelled min_df
        # (the message previously called it max_df).
        logging.info("Adding POS vectorization with max_features: %s ngram: %s min_df: %s",
                     pos_max_features, pos_ngram, pos_min_df)
        pos_vec = VECTORIZERS[pos_vec_type](tokenizer=tokenize_pos, ngram_range=(1, pos_ngram), strip_accents='unicode',
                                            min_df=pos_min_df, max_features=pos_max_features)
        transformer_list.append(("pos_vec", pos_vec))

    pipeline = Pipeline([
        ('union', FeatureUnion(transformer_list))
    ])

    fuman_data = load_fuman_gvb(source, good_filename=GOOD_FILENAME, bad_filename=BAD_FILENAME)
    logging.info("Processing pipeline...")
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message="deprecated", module="sklearn")
        instances = pipeline.fit_transform(fuman_data.data)
        n_samples = instances.shape[0]
        y = np.asarray(fuman_data.target, dtype=np.int8).reshape((n_samples,))

        pos_features = list()
        if pos_max_features:
            pos_features = pos_vec.get_feature_names()
            save_features_json(pos_dict_filename, pos_features)

        header = make_header(rant_stats_vectorizer.get_feature_names(),
                             token_type_features=type_vec.get_feature_names(),
                             pos_features=pos_features,
                             feature_name_header=feature_name_header)

        logging.info("Saving %s folds to disk...", n_folds)
        splits = StratifiedShuffleSplit(y, test_size=1.0 / n_folds)
        # Only the held-out side of each split is written; folds are
        # numbered from 1.
        for i, (_, test_index) in enumerate(splits, 1):
            dump_csv(output_path, instances[test_index], y[test_index], "gvsb", i, header, timestamp, sparse)
            if i == n_folds_max:
                break
        save_dataset_metadata(sparse, output_path, "goodvsbad", source_filepath=source, timestamp=timestamp,
                              pos_vectorizer=pos_vec, tokenize_pos=tokenize_pos)
    logging.info("Work complete!")
def main():
    """Grid-search three classifier pipelines on the project dataset.

    Loads ``./data/final_project_dataset.pkl``, preprocesses it with
    ``prepare_data`` and, for every entry in ``classifiers``, runs a
    precision-scored ``GridSearchCV`` over a pipeline of imputation, log
    transform, min-max scaling, ``SelectKBest``, ``PCA`` and the classifier.
    Each fitted search is printed and pickled to ``./data/clf_<name>.pkl``.
    """
    # load data (pickles must be read in binary mode; text mode fails on
    # Python 3 and can corrupt the stream on Windows)
    with open("./data/final_project_dataset.pkl", "rb") as data_file:
        data_dict = pickle.load(data_file)

    # preprocess
    df = prepare_data(data_dict)

    # split into training and test set
    cv = StratifiedShuffleSplit(df['poi'], 100)

    # For each classifier two pipelines were created. That was necessary
    # because without knowing the number of features after applying
    # SelectKBest the maximum of n_components for pca cannot be set. Thus in
    # a first grid search the n_components parameter of the PCA is set to
    # None to include all components, while the other parameters are
    # optimized. In the second following grid search the parameters found in
    # the first run were set and the n_components parameter optimized to
    # this set of parameters.

    # create classifiers and parameter grids
    # (``sgd``, ``ada`` and ``logit`` are the first-stage grids described
    # above; only the ``*_pca`` second-stage grids are run below.)
    sgd = {'classifier': SGDClassifier(n_jobs=4),
           'parameters': {'kbest__k': np.arange(1, 20, 1).tolist(),
                          'pca__n_components': [None],
                          'classify__class_weight': ['balanced', None],
                          'classify__loss': ['log', 'hinge'],
                          'classify__penalty': ['l2', 'l1', 'elasticnet', 'none'],
                          'classify__alpha': [0.0001, 0.001, 0.01, 0.1]}}

    sgd_pca = {'classifier': SGDClassifier(n_jobs=4),
               'parameters': {'kbest__k': [16],
                              'pca__n_components': [None] + np.arange(1, 16, 1).tolist(),
                              'classify__class_weight': ['balanced'],
                              'classify__loss': ['log'],
                              'classify__penalty': ['l1'],
                              'classify__alpha': [0.001, 0.01, 0.1]}}

    ada = {'classifier': AdaBoostClassifier(),
           'parameters': {'kbest__k': np.arange(1, 20, 1).tolist(),
                          'pca__n_components': [None],
                          'classify__n_estimators': np.arange(20, 200, 20).tolist(),
                          'classify__base_estimator': [DecisionTreeClassifier(),
                                                       RandomForestClassifier()]}}

    ada_pca = {'classifier': AdaBoostClassifier(),
               'parameters': {'kbest__k': [3],
                              'pca__n_components': [None] + np.arange(1, 3, 1).tolist(),
                              'classify__n_estimators': [160, 180, 200],
                              'classify__base_estimator': [DecisionTreeClassifier()]}}

    logit = {'classifier': LogisticRegression(),
             'parameters': {'kbest__k': np.arange(1, 20, 1).tolist(),
                            'pca__n_components': [None],
                            'classify__class_weight': ['balanced', None],
                            'classify__penalty': ['l2', 'l1'],
                            'classify__C': [0.001, 0.01, 0.1, 1, 10]}}

    logit_pca = {'classifier': LogisticRegression(),
                 'parameters': {'kbest__k': [15],
                                'pca__n_components': [None] + np.arange(1, 15, 1).tolist(),
                                'classify__class_weight': ['balanced'],
                                'classify__penalty': ['l1'],
                                'classify__C': [0.1, 1, 10]}}

    # list of classifiers to optimize
    classifiers = {'sgd_finalpca': sgd_pca,
                   'ada_finalpca': ada_pca,
                   'logit_finalpca': logit_pca}

    # split features and labels
    features = df.drop(['poi'], axis=1)
    labels = df['poi']

    # apply a gridsearch for each classifier
    for name, spec in classifiers.items():
        print(name)
        steps = [('impute nans', ImputeToValue()),
                 ('log transforming', LogTransform(True)),
                 ('minmax scaling', MinMaxNA()),
                 ('kbest', SelectKBest()),
                 ('pca', PCA()),
                 ('classify', spec['classifier'])]

        pipeline = Pipeline(steps)

        clf = GridSearchCV(pipeline,
                           spec['parameters'],
                           scoring='precision',
                           cv=cv,
                           n_jobs=4)

        clf.fit(features, labels)
        print('best')
        print(clf.best_estimator_)
        print(clf.best_score_)

        # Persist the whole fitted search object, not just the best model.
        clf_outfile = './data/clf_' + name + '.pkl'
        with open(clf_outfile, 'wb') as f:
            pickle.dump(clf, f)
Example #19
0
def split_test_train_stratified(df, test_ration, random_state=None):
    """Create a stratified shuffle splitter (visible part of the function).

    NOTE(review): only this first statement is visible, so the rest of the
    function may be cut off. As shown, ``df``, ``test_ration`` and
    ``random_state`` are not used: the splitter is built with the fixed
    values ``n_splits=5``, ``test_size=0.5`` and ``random_state=0``, and
    nothing is returned. Confirm against the full source.
    """
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0)
Example #20
0
            for line in fp:
                p.append(line)
                p_target.append('1')

        with open(vposfiles, 'r') as fvp:
            for line in fvp:
                vp.append(line)
                vp_target.append('2')

        data = np.array(vn + n + neutral + p + vp)
        data_target = np.array(vn_target + n_target + neutral_target +
                               p_target + vp_target)

        #skf = StratifiedKFold(data_target, n_folds=2, shuffle=True)
        sss = StratifiedShuffleSplit(data_target,
                                     10,
                                     test_size=0.4,
                                     random_state=0)

        data_train = []
        data_test = []

        for train_index, test_index in sss:
            data_train, data_test = data[train_index], data[test_index]
            y_train, y_test = data_target[train_index], data_target[test_index]

        #print("Tamanho do train: {}".format(len(data_train)))
        #print("Tamanho do teste: {}".format(len(data_test)))

        t0 = time()
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     max_df=0.5,
Example #21
0
import numpy as np
import pandas as pd

from sklearn.cross_validation import StratifiedShuffleSplit
from itertools import combinations
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# NOTE: Make sure that the class is labeled 'class' in the data file
# The path and separator strings below look like placeholders that must be
# replaced with real values before this script can run -- TODO confirm.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR')
# Take the first (and only) stratified 75% / 25% split on the 'class' column.
training_indeces, testing_indeces = next(
    iter(
        StratifiedShuffleSplit(tpot_data['class'].values,
                               n_iter=1,
                               train_size=0.75,
                               test_size=0.25)))

# Work on a copy so the loaded frame itself is left untouched.
result1 = tpot_data.copy()

# Perform classification with a logistic regression classifier
# NOTE(review): `.loc` selects by index label, while the splitter was given a
# plain array of values; this presumably relies on the frame having the
# default 0..n-1 index -- confirm.
lrc1 = LogisticRegression(C=0.1)
lrc1.fit(result1.loc[training_indeces].drop('class', axis=1).values,
         result1.loc[training_indeces, 'class'].values)
# Predictions are made for every row (training and testing alike) and stored
# as a new column.
result1['lrc1-classification'] = lrc1.predict(
    result1.drop('class', axis=1).values)

# Decision-tree based feature selection
# Only the training rows are used; note that the features now include the
# 'lrc1-classification' column added above.
training_features = result1.loc[training_indeces].drop('class', axis=1)
training_class_vals = result1.loc[training_indeces, 'class'].values
Example #22
0
#logit = LogisticRegression(penalty='l1', random_state=42)


def train_and_test(X, y, train, test):
    """Fit the module-level ``logit`` on one split and return its test score.

    ``train`` and ``test`` are used to index both ``X`` and ``y``; the value
    returned is whatever ``logit.score`` reports for the ``test`` selection.
    """
    fit_inputs, fit_targets = X[train], y[train]
    logit.fit(fit_inputs, fit_targets)

    eval_inputs, eval_targets = X[test], y[test]
    return logit.score(eval_inputs, eval_targets)


# Pairwise comparison: for every unordered pair of keys in `data`, collect the
# per-split scores returned by train_and_test together with a "gr1/gr2" label.
pairs_all = []
accuracy_all = []
for i, gr1 in enumerate(sorted(data.keys())[:-1]):
    # Slicing from i + 1 pairs gr1 only with keys that sort after it.
    for j, gr2 in enumerate(sorted(data.keys())[i + 1:]):
        print gr1, gr2
        # Stack both groups' rows and standardize them together.
        X = np.concatenate((data[gr1], data[gr2]))
        X = StandardScaler().fit_transform(X)
        # Rows coming from gr1 are labelled 1, rows from gr2 are labelled 0.
        y = np.asarray([1] * data[gr1].shape[0] + [0] * data[gr2].shape[0])
        # 50 stratified shuffle splits, 25% held out each time, fixed seed.
        sss = StratifiedShuffleSplit(y,
                                     n_iter=50,
                                     test_size=.25,
                                     random_state=42)
        # One train_and_test call per split, dispatched through Parallel
        # (presumably joblib's) with 10 jobs.
        acc = Parallel(n_jobs=10,
                       verbose=5)(delayed(train_and_test)(X, y, train, test)
                                  for train, test in sss)
        accuracy_all.append(acc)
        pairs_all.append('/'.join([gr1, gr2]))

# One box per pair, labelled with the pair name on the x axis.
import matplotlib.pyplot as plt
plt.figure()
plt.boxplot(accuracy_all)
plt.xticks(range(1, len(pairs_all) + 1), pairs_all)
Example #23
0
    def compute_kernel(self):
        """Choose the kernel function if required and fill ``self.kernel_matrix``.

        When ``self.gamma == 'cv'`` and ``self.kernel_fun`` is still ``None``,
        ``gamma`` and ``C`` of an ``svm.SVC`` are grid-searched with a
        stratified shuffle split over the training samples.  The best values
        are written to ``self.config.learning_C``,
        ``self.config.kernel_gamma`` and ``self.gamma``, and
        ``self.kernel_fun`` becomes ``RBFEntryWrapper(self.gamma)``.  A
        ``ValueError`` raised during the search is reported and the search is
        retried with a new seed drawn from ``os.urandom``.

        Afterwards the kernel function is evaluated on every unordered pair of
        segments (each segment is also paired with itself), either serially or
        through ``self.pool.imap``, and each value is stored at both ``[i][j]``
        and ``[j][i]`` of ``self.kernel_matrix`` (a list of lists).
        """
        if self.gamma == 'cv':
            seed = 0xdeadbeef
            while self.kernel_fun is None:
                try:
                    # search for the correct gamma and C value using cross validation
                    import struct
                    from sklearn import svm
                    from sklearn.grid_search import GridSearchCV
                    from sklearn.cross_validation import StratifiedShuffleSplit
                    from sklearn.cross_validation import train_test_split

                    indices = xrange(0, len(self.max_labels))
                    if self.learning[0] is not None:
                        # An explicit assignment is available: keep the
                        # samples marked 'train'.
                        train_indices = [
                            i for i in indices if self.learning[i] == 'train'
                        ]
                    else:
                        # No assignment: draw a random training subset.
                        train_indices, _ = train_test_split(
                            indices,
                            train_size=self.config.learning_split,
                            random_state=seed)
                    train_labels = [self.max_labels[i] for i in train_indices]
                    train_segments = [self.segments[i] for i in train_indices]
                    C_range = numpy.logspace(-10, 10, 20)
                    gamma_range = numpy.logspace(-10, 10, 20)
                    param_grid = {"gamma": gamma_range, "C": C_range}
                    cv = StratifiedShuffleSplit(train_labels,
                                                n_iter=5,
                                                test_size=0.2,
                                                random_state=seed)
                    grid = GridSearchCV(svm.SVC(),
                                        param_grid=param_grid,
                                        cv=cv)
                    grid.fit(train_segments, train_labels)
                    print(
                        "RBFKernel : The best parameters are %s with a score of %0.2f"
                        % (grid.best_params_, grid.best_score_))
                    self.config.learning_C = grid.best_params_['C']
                    self.config.kernel_gamma = grid.best_params_['gamma']
                    self.gamma = grid.best_params_['gamma']
                    self.kernel_fun = RBFEntryWrapper(self.gamma)
                except ValueError as e:
                    # Single-argument print(...) behaves the same with the
                    # print statement and the print function.
                    print("ValueError (%s) in cv search, trying again (%x)" % (
                        e, seed))
                    # New seed: four random bytes read as a little-endian
                    # unsigned 32-bit integer (same value as the former
                    # manual ord() arithmetic).
                    seed = struct.unpack('<I', os.urandom(4))[0]
                    self.kernel_fun = None

        n_segments = len(self.segments)
        segment_iter = itertools.combinations_with_replacement(
            self.segments, 2)
        if self.pool is None:
            value_iter = itertools.imap(self.kernel_fun, segment_iter)
        else:
            # Chunk size for the pool: number of segment pairs divided by five
            # times the number of active child processes, at least 1.  Floor
            # division keeps it an integer.
            chunk_size = max(
                1,
                n_segments * (n_segments + 1) //
                (5 * 2 * len(multiprocessing.active_children())))
            value_iter = self.pool.imap(self.kernel_fun, segment_iter,
                                        chunk_size)

        self.kernel_matrix = [[0.0] * n_segments for _ in range(n_segments)]
        # The index pairs are generated in the same order as the segment pairs
        # above, so each value lines up with its (i, j) position.
        index_pairs = itertools.combinations_with_replacement(
            range(n_segments), 2)
        for (i, j), val in itertools.izip(index_pairs, value_iter):
            self.kernel_matrix[i][j] = val
            self.kernel_matrix[j][i] = val
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html


### Split the data into a training set and a testing set (30% held out for
### testing, fixed seed so the split is repeatable).
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)


### Cross-validation object handed to the grid search.  It is built from the
### training labels only; `folds` is passed as the second positional argument
### (presumably the number of shuffle iterations -- confirm against the
### sklearn.cross_validation API).
folds = 100
cv = StratifiedShuffleSplit(labels_train, folds, random_state = 17)


"""
### INITIAL TUNING OF PARAMETERS, including selection of features. optimum k found to be 8. 

### set params for gridsearch
params = dict(feat_select__k = range(4, len(features_list)-1), svm__gamma=[0.1, 0.5, 1], 
svm__C=[1,10,100], svm__kernel=['rbf', 'poly', 'sigmoid'])

estimator = grid_search.GridSearchCV(pipe,  param_grid=params, scoring = 'f1', cv=cv)
estimator.fit(features_train, labels_train)
print estimator.best_params_
#pprint.pprint(estimator.grid_scores_)

"""
Example #25
0
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.neighbors import KDTree

class dynamicClassifierSelection:
	"""Holds a train/test split and builds stratified validation folds from it.

	NOTE(review): only the beginning of this class is visible here; the
	remainder of ``splitData`` (and any further methods) is not shown.
	"""

	def __init__(self, train_X, test_X, train_y, test_y):
		"""Store the four parts of the split unchanged on the instance."""
		self.train_X = train_X
		self.test_X = test_X
		self.train_y = train_y
		self.test_y = test_y

	def splitData(self, train_X, train_y, cvn=5, test_size_value=0.2):
		"""Create ``cvn`` stratified shuffle folds over ``train_y``.

		``train_X`` is indexed as ``train_X[0]`` .. ``train_X[3]``: entries
		0-2 are split into per-fold (train, validation) triples and entry 3
		into a per-fold (train, validation) pair.  ``test_size_value`` is
		the validation share of each fold.

		NOTE(review): the visible part only fills ``fold_tr``, ``fold_val``
		and ``fold_total``; ``fold_y`` stays empty and nothing is returned,
		presumably because the method is cut off here.
		"""
		sss = StratifiedShuffleSplit(train_y, n_iter=cvn, test_size=test_size_value)
		fold_tr = []  # (train_X1_tr, train_X2_tr, train_X3_tr)
		fold_val = []  # (train_X1_val, train_X2_val, train_X3_val)
		fold_total = []  # (train_Xtot_tr, train_Xtot_val)
		fold_y = []  # (train_y_tr, train_y_val)
		# fold : train_X1_tr, train_X1_val, train_X2_tr, train_X2_val, train_X3_tr, train_X3_val
		for train_index, test_index in sss:
			x1_tr, x1_val = train_X[0][train_index], train_X[0][test_index]
			x2_tr, x2_val = train_X[1][train_index], train_X[1][test_index]
			x3_tr, x3_val = train_X[2][train_index], train_X[2][test_index]
			fold_tr.append((x1_tr, x2_tr, x3_tr))
			fold_val.append((x1_val, x2_val, x3_val))

			xTot_tr, xTot_val = train_X[3][train_index], train_X[3][test_index]
			fold_total.append((xTot_tr, xTot_val))
Example #26
0
    [0.01, 0.1, 0.5, 1.0, 10.0, 50.0, 100.0],
    [0.01, 0.1, 0.5, 1.0, 10.0, 50.0, 100.0, 'auto'],
    ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'], [2, 3, 4]):
    if kernel != 'poly' and degree > 2:
        continue

    if kernel not in ['rbf', 'poly', 'sigmoid'] and gamma != 'auto':
        continue

    # 30 repeats; the repeat number (1..30) is also the seed of the split.
    for dataset_repeat in range(1, 31):
        # Divide the data set into a training and testing sets, each time with a different RNG seed
        # Only the first split of the single-iteration splitter is taken (75% / 25%).
        training_indices, testing_indices = next(
            iter(
                StratifiedShuffleSplit(input_data['class'].values,
                                       n_iter=1,
                                       train_size=0.75,
                                       test_size=0.25,
                                       random_state=dataset_repeat)))

        # Features are every column except 'class'; the classes are the 'class' column.
        # NOTE(review): `.loc` selects by index label -- presumably input_data
        # has the default 0..n-1 index so labels match the split positions; confirm.
        training_features = input_data.loc[training_indices].drop(
            'class', axis=1).values
        training_classes = input_data.loc[training_indices, 'class'].values

        testing_features = input_data.loc[testing_indices].drop('class',
                                                                axis=1).values
        testing_classes = input_data.loc[testing_indices, 'class'].values

        # The scaler is fitted on the training features only and then reused
        # unchanged for the testing features.
        ss = StandardScaler()
        training_features = ss.fit_transform(training_features.astype(float))
        testing_features = ss.transform(testing_features.astype(float))
Example #27
0
# Call new_features once per key; its return value replaces data_dict each time.
# NOTE(review): what new_features adds is not visible here -- presumably a
# feature derived from 'shared_receipt_with_poi' and 'to_messages'; confirm.
for key in data_dict.keys():
    data_dict = new_features('shared_receipt_with_poi', 'to_messages', key,
                             data_dict)

### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
from sklearn.cross_validation import StratifiedShuffleSplit

# 1000 stratified shuffle iterations over the labels, fixed seed.
cv_sss = StratifiedShuffleSplit(labels, n_iter=1000, random_state=42)

# The four lists are re-created on every iteration, so in the code visible
# here only the last split's train/test samples remain after the loop.
for train_index, test_index in cv_sss:
    X_train = []
    X_test = []
    y_train = []
    y_test = []
    for i in train_index:
        X_train.append(features[i])
        y_train.append(labels[i])
    for j in test_index:
        X_test.append(features[j])
        y_test.append(labels[j])

### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
Example #28
0
def CAL_v(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    """
    Prints the test accuracy of an RBF-kernel SVM predictor
    for a varying amount of "points near the boundary"
    [boundary of the oracle].

    :param name:
    :param label_p:
    :param label_n:
    :param oracle:
    :param n_features:
    :param ftype:
    :param test_x:
    :param test_y:
    :return:
    """
    online = OnlineBase(name,
                        label_p,
                        label_n,
                        oracle,
                        n_features,
                        ftype,
                        error=.5)
    x, y = online.collect_pts(100, -1)
    i = 0
    q = online.get_n_query()

    C_range = np.logspace(-2, 5, 10, base=10)
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range, C=C_range)

    while q < 3500:
        i += 1
        # h_ = ex.fit(x, y)

        cv = StratifiedShuffleSplit(y,
                                    n_iter=5,
                                    test_size=0.2,
                                    random_state=42)
        grid = GridSearchCV(svm.SVC(),
                            param_grid=param_grid,
                            cv=cv,
                            verbose=0,
                            n_jobs=-1)
        grid.fit(x, y)
        h_ = grid.best_estimator_

        online_ = OnlineBase('',
                             label_p,
                             label_n,
                             h_.predict,
                             n_features,
                             ftype,
                             error=.1)
        x_, _ = online_.collect_pts(10, 200)
        if x_ is not None and len(x_) > 0:
            x.extend(x_)
            y.extend(oracle(x_))
        q += online_.get_n_query()
        pred_y = h_.predict(test_x)

        print "total amount of ", len(x), q, sm.accuracy_score(test_y, pred_y)
Example #29
0
### Set the parameters by cross-validation
## SVM
# recall 1.0, C:1e-5, gamma:0, weight:true:6
# precision:0.66, C:100, gamma:0.2, weight:2
# Keys use the "<step name>__<parameter>" form, so each entry targets the
# 'clf' step of the pipeline defined below.
tuned_parameters = {
    'clf__C': [0.5, 0.75, 1.5],
    'clf__gamma': [0.0, 0.1, 0.2],
    'clf__kernel': ['rbf'],
    'clf__tol': [1e-1, 1e-2, 1e-4, 1e-5],
    'clf__class_weight': ['auto']
}

# Features are rescaled to [0.0, 1.0] before they reach the SVC.
pipe = Pipeline([('min/max scaler', MinMaxScaler(feature_range=(0.0, 1.0))),
                 ('clf', SVC())])
# scoring_parameters = 'precision,recall'
# 20 stratified shuffle splits with 20% held out each time, fixed seed.
cv = StratifiedShuffleSplit(labels, n_iter=20, test_size=0.2, random_state=42)
# Candidates are ranked by precision; the search runs with 8 parallel jobs.
a_grid_search = GridSearchCV(pipe,
                             param_grid=tuned_parameters,
                             scoring='precision',
                             cv=cv,
                             n_jobs=8)
a_grid_search.fit(features, labels)
# Keep the best pipeline found by the search as the final classifier.
clf = a_grid_search.best_estimator_

## Decision Tree
# tuned_parameters = {'clf__max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
#                      'clf__min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
#                      'clf__min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9, 10]
#                      }
# pipe = Pipeline([('clf', tree.DecisionTreeClassifier())])
# cv = StratifiedShuffleSplit(labels, n_iter = 50, test_size=0.2, random_state = 42)
Example #30
0
def CAL(name, label_p, label_n, oracle, n_features, ftype, test_x, test_y):
    """
    Learn with adaptive learning the oracle, using an SVM
     with RBF kernel,
     prints the accuracy as function of amount of queries to
     the LOCAL MODEL (weird function).
    :param name:
    :param label_p:
    :param label_n:
    :param oracle:
    :param n_features:
    :param ftype:
    :param test_x:
    :param test_y:
    :return:
    """
    online = OnlineBase(name,
                        label_p,
                        label_n,
                        oracle,
                        n_features,
                        ftype,
                        error=.5)
    # This is weird - the count should be zero here.
    q = online.get_n_query()
    C_range = np.logspace(-2, 5, 10, base=10)
    gamma_range = np.logspace(-5, 1, 10, base=10)
    param_grid = dict(gamma=gamma_range, C=C_range)

    x, y = online.collect_pts(100, -1)

    i = 0

    cv = StratifiedShuffleSplit(y, n_iter=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(svm.SVC(),
                        param_grid=param_grid,
                        cv=cv,
                        verbose=0,
                        n_jobs=-1)
    grid.fit(x, y)
    h_ = grid.best_estimator_
    while q < 3500:
        i += 1
        # h_ = ex.fit(x, y)
        # This is not really an online model - we set oracle=h_.predict.
        local_model = OnlineBase('',
                                 label_p,
                                 label_n,
                                 h_.predict,
                                 n_features,
                                 ftype,
                                 error=.1)
        x_ = local_model.collect_one_pair()
        if x_ is not None and len(x_) > 0:
            for _x in x_:
                #
                x.append(_x)
                y.append(1)
                cv = StratifiedShuffleSplit(y,
                                            n_iter=5,
                                            test_size=0.2,
                                            random_state=42)
                grid = GridSearchCV(svm.SVC(),
                                    param_grid=param_grid,
                                    cv=cv,
                                    verbose=0,
                                    n_jobs=-1)
                grid.fit(x, y)
                h1 = grid.best_estimator_
                s1 = sm.accuracy_score(y, h1.predict(x))

                y[-1] = -1
                cv = StratifiedShuffleSplit(y,
                                            n_iter=5,
                                            test_size=0.2,
                                            random_state=42)
                grid = GridSearchCV(svm.SVC(),
                                    param_grid=param_grid,
                                    cv=cv,
                                    verbose=0,
                                    n_jobs=-1)
                grid.fit(x, y)
                h2 = grid.best_estimator_
                s2 = sm.accuracy_score(y, h2.predict(x))
                # Assume implicitly that the local model can reach
                # over 99% accuracy over the training set.
                # Check whether there is a reason the query the oracle about x_:
                #   * If for a specific prediction, the performance of
                #   of the model over the so-far found points will
                #   degrade under 99%, it would be useless to query the
                #   oracle because we can already guess this prediction
                #   is wrong.
                #   * Otherwise, we are not certain about oracle(x_) - so we
                #   query the oracle.
                # Very weird - add the point as training point anyway,
                # also when we guess oracle(x_).
                # Notice: I expect that most of the times, only
                # the first "if" will take effect and actually run,
                # Because the points are really close to each other.
                if s1 >= .99 and s2 >= .99:
                    print 'branch 1'
                    y[-1] = oracle(x_)[0]
                elif s1 >= .99 and s2 < .99:
                    print 'branch 2'
                    y[-1] = 1
                elif s1 < .99 and s2 >= .99:
                    print 'branch 3'
                    y[-1] = -1
                else:
                    print 'branch 4: ', s1, s2
                    del x[-1]
                    del y[-1]
                    continue

            if y[-1] == 1:
                h_ = h1
            else:
                h_ = h2
        # This is weird - why do we count the queries of the local_model ?
        # I think we should count the queries to the oracle !
        q += local_model.get_n_query()
        pred_y = h_.predict(test_x)
        print q, sm.accuracy_score(test_y, pred_y)
Example #31
0
# Binary precision of the predictions `pred_d`, with label 1 as the positive
# class.
pre_d = precision_score(labels_test,
                        pred_d,
                        labels=None,
                        pos_label=1,
                        average='binary',
                        sample_weight=None)
print "pre", pre_d
# Binary recall for the same predictions and positive class.
rec_d = recall_score(labels_test,
                     pred_d,
                     labels=None,
                     pos_label=1,
                     average='binary',
                     sample_weight=None)
print "rec", rec_d

# Grid search over `pa` for `g_clf`, scored by F1.  The splitter is built from
# the full label list; the positional 100 is presumably the number of shuffle
# iterations -- confirm against the sklearn.cross_validation API.
sk_fold = StratifiedShuffleSplit(labels, 100, random_state=42)
gs = GridSearchCV(g_clf, param_grid=pa, cv=sk_fold, scoring='f1')
gs.fit(features, labels)
# The best estimator found becomes the classifier that is tested and dumped.
clf = gs.best_estimator_

print 'best algorithm using strat_s_split'
print clf

test_classifier(clf, my_dataset, features_list)
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)
	try:
		dataset = task.get_dataset()
		# Impute the values - While values would be imputed when calculating some meta-features anyway, this gives more control.
		X, y, categorical = dataset.get_data(target = task.target_feature, return_categorical_indicator = True)

		#X, categorical = remove_zero_columns(impute_values(X, categorical), categorical)

		# Subsample landmarkers need folds; the train+test set of subsample landmarkers should be 500 instances,
		# since that is the size of our smallest dataset.
		# We first create a fold for 500 stratified samples, and then again divide that selection into 10 folds.
		max_size = 500
		number_of_classes = len(np.unique(y))
		# NOTE(review): in the small-dataset branch the indices are always 0..499,
		# independent of y.shape[0]; if y had fewer than 500 rows, y[subset_indices]
		# below would index out of range -- confirm every dataset has >= 500 rows.
		if y.shape[0] < (max_size + number_of_classes):
			subset_indices = np.arange(max_size)
		else:
			# Single stratified split whose test part holds exactly 500 samples;
			# only that test part is kept as the subset.
			subset_split = StratifiedShuffleSplit(y, n_iter=1, test_size=500, random_state = 0)
			_, subset_indices = next(subset_split.__iter__())
		# 10 stratified folds (20% test each) over the selected subset.
		mapped_folds = StratifiedShuffleSplit(y[subset_indices], n_iter=10, test_size=0.2, random_state = 0)

		# The folds index into the subset; map them back to indices of the full dataset.
		subsample_folds = [(subset_indices[train],subset_indices[test]) for train, test in mapped_folds]

		# Because the subsamples are of constant size, always 500, we just calculate them once per dataset,
		# not once for every subsample of every dataset (those are stratified anyway)
		log("subsample-mf")
		subsample_features = subsample_metafeatures(X, y, categorical, subsample_folds)

		# We also take subsets of the original dataset, because it creates a bigger metadataset to learn from
		for i in np.arange(0.1, 1.01, 0.1):

			# We want a minimum size of 500, otherwise predicting runtime is not that useful anyway,
			# and it avoids some issues with train/test splits being too small and timing not being accurately measured
from lasagne.updates import nesterov_momentum
from nolearn.lasagne import NeuralNet

train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

X = train_df.values.copy()
test_X = test_df.values.copy()
#np.random.shuffle(X)
# Features are every column except the first and the last; the last column
# holds the labels.  (The first column is presumably an id -- confirm.)
X, labels = X[:, 1:-1].astype(np.float32), X[:, -1]

# Encode the labels as int32 class indices.
encoder = LabelEncoder()
y = encoder.fit_transform(labels).astype(np.int32)

### we need a test set that we didn't train on to find the best weights for combining the classifiers
sss = StratifiedShuffleSplit(y, 1, test_size=0.1, random_state=1234)

# The loop exists only to bind train_index / test_index from the splitter;
# both names are used after the loop.
for train_index, test_index in sss:
    print 'split the training data'

# NOTE(review): the scaler is fitted on all of X, i.e. before the held-out
# check rows are separated from the training rows.
scaler = StandardScaler()
X = scaler.fit_transform(X)
train_X, train_y = X[train_index], y[train_index]
check_X, check_y = X[test_index], y[test_index]

# For the test file, column 0 is kept as string ids and the remaining columns
# are the features, scaled with the scaler fitted above.
test_X, test_ids = test_X[:, 1:].astype(np.float32), test_X[:, 0].astype(str)
test_X = scaler.transform(test_X)

print 'training data: ', train_X.shape
print
Example #34
0
	for k, v in train_distribution.items():
		print('Class label = %d, percentage records = %.2f)'%(k, v))
	
	# Print the class-label distribution of the test set
	y_test = test[:,-1]
	test_distribution = get_class_distribution(y_test)
	print('\n Test data set class label distribution')
	for k, v in test_distribution.items():
		print('Class label = %d, percentage records = %.2f)'%(k, v))
	
if __name__ == '__main__':
	# Demo: compare the class-label distribution of the split returned by
	# print_data with that of a stratified split of the same test size.
	from sklearn.cross_validation import StratifiedShuffleSplit  # import the library

	input_dataset = get_sample_data()
	train, test, test_size = print_data(input_dataset)
	print_class_label_split(train, test)
	# Call sklearn's StratifiedShuffleSplit.  The first argument is the label
	# column the split is stratified on (the last column of the dataset);
	# n_iter=1 requests a single split; test_size sets the size of the test set.
	# n_iter used to be missing, so the default number of splits was generated
	# and every one but the last was thrown away by the loop below.
	stratified_split = StratifiedShuffleSplit(input_dataset[:,-1], n_iter=1, test_size=test_size)

	for train_indx,test_indx in stratified_split:
		train = input_dataset[train_indx]
		test = input_dataset[test_indx]
	print_class_label_split(train, test)





Example #35
0
 def _valid_split(self):
     """Draw one stratified train/validation split and store its indices.

     The split is stratified on ``self._train['ts']``, its validation share
     is ``self._val_size`` and its seed is ``self._seed``.  The results are
     written to ``self._idcs_train`` and ``self._idcs_valid``.
     """
     splitter = StratifiedShuffleSplit(self._train['ts'],
                                       n_iter=1,
                                       test_size=self._val_size,
                                       random_state=self._seed)
     first_split = next(iter(splitter))
     self._idcs_train, self._idcs_valid = first_split
	   os.path.isfile('{}/{}.{}'.format(trg_dir, trainingset_feature_names_file, trainingset_suffix)) or
	   os.path.isfile('{}/{}.{}'.format(trg_dir, trainingset_sample_indices_file, trainingset_suffix)):
		if override:
			print 'WARNING: The target directory {}/ already contains at least on of the files to create. Replacing.'.format(trg_dir)
		else:
			print 'WARNING: The target directory {}/ already contains at least on of the files to create. Skipping.'.format(trg_dir)
			sys.exit(1)

	# initializing collections
	training_set_selections = dict.fromkeys(training_set_cases)

	# iterate over cases, load their respective samples and perform a sampling for each
		

	# draw random stratified sample and extract training set indices
	sss = StratifiedShuffleSplit(classes, n_iter=1, train_size=n_samples)
	# The splitter is an iterable, not an iterator, so it has no .next();
	# obtain an iterator first and take the single split from it.
	sample_indices, _ = next(iter(sss))

	# save

def load_feature_struct(f):
	"""Load the feature struct from a feature config file.

	The file at path ``f`` is imported as a module (named after the file
	without its extension, searched for in the file's directory) and the
	module's ``features_to_extract`` attribute is returned.

	Raises whatever ``imp.find_module`` / ``imp.load_module`` raise (e.g.
	``ImportError`` when the module cannot be found) and ``AttributeError``
	when the module does not define ``features_to_extract``.
	"""
	d, m = os.path.split(os.path.splitext(f)[0])
	handle, filename, desc = imp.find_module(m, [d])
	try:
		return imp.load_module(m, handle, filename, desc).features_to_extract
	finally:
		# find_module hands back an open file that the caller must close;
		# it is None when the module is a package.
		if handle is not None:
			handle.close()

def load_feature_names(f):
	"""Load the feature names from a feature config file.

	Every entry of the feature struct loaded from ``f`` is converted with
	``feature_struct_entry_to_name``; the names are returned as a list in
	the order of the struct's entries.
	"""
	names = []
	for entry in load_feature_struct(f):
		names.append(feature_struct_entry_to_name(entry))
	return names