Code example #1
File: bootstrap_selector.py  Project: milapour/palm
 def select_data(self, target_data, size):
     # Draw one bootstrap sample of n-1 indices and keep the first `size`.
     n = len(target_data)
     assert size <= n - 1
     bs = Bootstrap(n, 1, train_size=n - 1)
     train_index, test_index = bs.__iter__().next()  # py2 iterator protocol
     train_index = list(train_index)
     inds = train_index[:size]
     new_target_data = target_data.make_copy_from_selection(inds)
     return new_target_data
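For orientation: every snippet on this page relies on the `Bootstrap` cross-validation iterator, which only exists in old scikit-learn releases (`sklearn.cross_validation.Bootstrap` was deprecated and later removed). A minimal sketch of its iteration protocol, assuming such a release is installed:

from sklearn.cross_validation import Bootstrap  # old scikit-learn only

# 9 samples, 2 bootstrap iterations, 6 training indices per draw
bs = Bootstrap(9, n_iter=2, train_size=6, random_state=0)
for train_index, test_index in bs:
    # Each iteration yields integer index arrays: train_index is drawn
    # with replacement from a random train split, test_index from the rest.
    print train_index, test_index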
Code example #2
def run(sc):
    def zero_matrix(n, m):
        return np.zeros(n * m, dtype=int).reshape(n, m)

    def vote_increment(y_est):
        increment = zero_matrix(y_est.size, n_ys)
        increment[np.arange(y_est.size), y_est] = 1
        return increment  # test point x class matrix with 1s marking the estimator prediction

    X, y = make_classification()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    n_test = X_test.shape[0]
    n_ys = np.unique(y_train).size

    model = DecisionTreeClassifier()
    # Draw random sub-samples of the data with replacement.
    # (Note: the bootstrap is taken over all of X/y, so test rows can
    # leak into training; sampling X_train/y_train would avoid this.)
    samples = sc.parallelize(Bootstrap(y.size))
    # Train a model on each sub-sample, predict the test set, and sum the
    # one-hot votes across samples (the learner majority vote).
    vote_tally = (samples
                  .map(lambda (index, _): model.fit(X[index], y[index]).predict(X_test))
                  .map(vote_increment)
                  .fold(zero_matrix(n_test, n_ys), np.add))
    y_estimate_vote = np.argmax(vote_tally, axis=1)
    return accuracy_score(y_test, y_estimate_vote)
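The Spark pipeline in `run` is simply a distributed majority vote. Stripped of Spark, the same logic reads as follows (a sketch reusing `zero_matrix`, `vote_increment`, `model`, and the deprecated `Bootstrap` from the example above):

votes = zero_matrix(n_test, n_ys)
for index, _ in Bootstrap(y.size):
    y_est = model.fit(X[index], y[index]).predict(X_test)
    votes += vote_increment(y_est)  # accumulate one-hot votes per test point
y_estimate_vote = np.argmax(votes, axis=1)  # majority class per test point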
Code example #3
def meta_extractor(targetfile, metafile, yfolds=5, zfolds=10):

    # setup CV sets
    resultdict = {}

    # load data
    print "--loading data"
    target = np.load(targetfile)
    y = target['ymap']
    Z = target['Z']

    # compute cv
    print "--computing cv sets"
    TRAIN, TEST = [], []
    skf = StratifiedKFold(y, yfolds, False)  # indices=False: boolean-mask folds
    for train, test in skf:
        TRAIN.append(np.array(train, dtype=bool))
        TEST.append(np.array(test, dtype=bool))
    train = np.vstack(TRAIN)
    test = np.vstack(TEST)

    resultdict['ycv'] = yfolds
    resultdict["train"] = train
    resultdict["test"] = test

    # compute Z cv
    # subtrain[ix, jx] = indexes corresponding to train set ix, subtrain set jx
    # subtest[ix, jx] = indexes corresponding to train set ix, subtest set jx
    print "--computing Z cv sets"
    Nt = int(len(y) * ((yfolds - 1) / float(yfolds)))  # outer training-set size
    Kt = int(Nt * ((zfolds - 1) / float(zfolds)))  # inner subtrain size
    Ke = Nt - Kt  # inner subtest size
    SUBTRAIN = np.zeros((yfolds, zfolds, Kt), dtype=int)  # Bootstrap yields int indices
    SUBTEST = np.zeros((yfolds, zfolds, Ke), dtype=int)
    for idx, trainset in enumerate(train):  # each training set
        Zt = Z[trainset]
        Nt = len(Zt)
        bs = Bootstrap(Nt, 10000, Kt, Ke)  # up to 10000 candidate draws
        for fold in range(zfolds):
            for ztrain, ztest in bs:
                ntrain = Zt[ztrain].sum(axis=0)
                ntest = Zt[ztest].sum(axis=0)
                # accept the first draw where every class occurs in both subsets
                if np.alltrue(ntrain > 0) and np.alltrue(ntest > 0):
                    SUBTRAIN[idx, fold] = ztrain
                    SUBTEST[idx, fold] = ztest
                    break

    resultdict['zcv'] = zfolds
    resultdict["subtrain"] = SUBTRAIN
    resultdict["subtest"] = SUBTEST

    # save results
    print "--save final result"
    np.savez(metafile, **resultdict)
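A hypothetical consumer of the saved metafile (key names follow what `meta_extractor` writes above; the path is illustrative) would index the masks and draws like this:

meta = np.load("meta.npz")       # hypothetical metafile path
fold0_train = meta["train"][0]   # boolean mask over samples: outer fold 0
ztrain = meta["subtrain"][0, 3]  # bootstrap indices within that fold, draw 3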
Code example #4
def bootstrap_seqs(seq, n_iter, subsize, random_state=0):
    """
    seq = the sequence to be selected from
    n_iter = number of sub sequences
    subsize = length of sub sequences
    return iterable of subseqs
    """
    bs = Bootstrap(len(seq),
                   n_iter=n_iter,
                   train_size=subsize,
                   random_state=random_state)
    sub_indices = [index for (index, _) in bs]
    seq_array = np.asarray(seq)
    return [seq_array[i] for i in sub_indices]
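A quick hypothetical usage, drawing three 5-element subsequences from a toy sequence (a list rather than a string, so numpy indexing works):

subseqs = bootstrap_seqs(list("ABCDEFGHIJ"), n_iter=3, subsize=5)
for sub in subseqs:
    print sub  # each is a length-5 numpy array sampled with replacement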
Code example #5
def run_bootstrap_model(df_cv, model, best_features, X_fut, ref_column):
    bs = Bootstrap(len(df_cv),
                   n_iter=1000,
                   train_size=int(len(df_cv) * 3 / 4),
                   test_size=int(len(df_cv) * 1 / 4))
    count = 1
    first = True
    for train_index, test_index in bs:
        X_tr = df_cv.ix[train_index][best_features]
        y_tr = df_cv.ix[train_index][ref_column]
        mdl = model.fit(X_tr, y_tr)
        pred = mdl.predict(X_fut)
        if first:
            df_pred = pd.DataFrame(pred)  # first draw becomes column 0
            first = False
        else:
            df_pred[count] = pred  # later draws become columns 1, 2, ...
            count += 1
    return df_pred
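`run_bootstrap_model` returns one prediction column per bootstrap draw. A natural follow-up (hypothetical, not part of the original code) is to collapse those 1000 columns into a point estimate and an empirical interval per future row:

df_pred = run_bootstrap_model(df_cv, model, best_features, X_fut, ref_column)
point = df_pred.mean(axis=1)         # bootstrap mean per row of X_fut
lo = df_pred.quantile(0.05, axis=1)  # empirical 90% interval bounds
hi = df_pred.quantile(0.95, axis=1)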
Code example #6
                             sparse=True)
validation_features = dio.join_features("%s_" + type_v + "_" + short_id +
                                        "_matrix",
                                        tfidf_columns,
                                        extra_valid_features,
                                        sparse=True)

print features.shape
print validation_features.shape

salaries = dio.get_salaries(type_n, log=True)
if not submission:
    valid_salaries = dio.get_salaries(type_v, log=True)

print salaries.shape
bs = Bootstrap(len(salaries), random_state=45, train_size=0.6)
train_index, test_index = next(iter(bs))
param = """Normal count vector with max 200. New submission which is repeatable.
 and nicer

Bag of Words: %s\n

Encoded cols: %s\n

Logged


vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.5,
    stop_words='english'
Code example #7
import numpy as np

from pyspark import SparkConf, SparkContext
from sklearn.cross_validation import train_test_split, Bootstrap
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets, svm, pipeline
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier

if __name__ == '__main__':
    conf = SparkConf()
    conf.setMaster("spark://172.18.109.87:7077")
    # conf.setMaster("local")
    conf.setAppName("spark_svm")
    conf.set("spark.executor.memory", "12g")
    sc = SparkContext(conf=conf)
    X, y = make_classification(n_samples=10000, n_features=30, n_classes=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    samples = sc.parallelize(Bootstrap(y.size))
    feature_map_fourier = RBFSampler(gamma=.2, random_state=1)
    fourier_approx_svm = pipeline.Pipeline([("feature_map",
                                             feature_map_fourier),
                                            ("svm", SGDClassifier())])
    fourier_approx_svm.set_params(feature_map__n_components=700)
    results = (samples
               .map(lambda (index, _): fourier_approx_svm
                    .fit(X[index], y[index])
                    .score(X_test, y_test))
               .reduce(lambda x, y: x + y))
    # Average per-sample accuracy; len(Bootstrap(n)) equals n_iter (3 by default).
    final_results = results / len(Bootstrap(y.size))
    print(final_results)
Code example #8
File: Scikit+Spark.py  Project: qihangz/spark-rakuten
precisions = []
recalls = []

model_start = time.time()
for targetColumn in topFeatures:
    parsedData = getXY(complete_matrix, int(targetColumn))
    X_train, X_test, y_train, y_test = train_test_split(
        parsedData[0], parsedData[1], train_size=0.9)
    n_test = X_test.shape[0]
    model = BernoulliNB()
    #model = MultinomialNB()
    #model = KNeighborsClassifier()
    #model = linear_model.SGDClassifier()
    #model = linear_model.LogisticRegressionCV()
    #model = svm.LinearSVC()
    # Majority vote over 19 bootstrap sub-samples of the training set.
    samples = sc.parallelize(
        Bootstrap(X_train.shape[0], n_iter=19, train_size=0.5), 8)
    vote_result = (samples
                   .map(lambda (index, _): model.fit(X_train[index], y_train[index]).predict(X_test))
                   .map(vote_increment)
                   .fold(zero_matrix(n_test, n_ys), numpy.add))
    y_estimate_vote = numpy.argmax(vote_result, axis=1)
    precisions.append(precision_score(y_test, y_estimate_vote))
    recalls.append(recall_score(y_test, y_estimate_vote))
    # Baseline: a single model trained on the full training set.
    samples = sc.parallelize([numpy.arange(X_train.shape[0])])
    vote_result = (samples
                   .map(lambda index: model.fit(X_train[index], y_train[index]).predict(X_test))
                   .map(vote_increment)
                   .fold(zero_matrix(n_test, n_ys), numpy.add))
    y_estimate_vote = numpy.argmax(vote_result, axis=1)
    precisions.append(precision_score(y_test, y_estimate_vote))
    recalls.append(recall_score(y_test, y_estimate_vote))

end = time.time()

print (end - model_start) / 60  # elapsed minutes
print numpy.mean(precisions)
print numpy.mean(recalls)
Code example #9
import numpy as np

from sklearn.cross_validation import train_test_split, Bootstrap
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier


def zero_matrix(n, m):
    return np.zeros(n * m, dtype=int).reshape(n, m)


def vote_increment(y_est):
    increment = zero_matrix(y_est.size, n_ys)
    increment[np.arange(y_est.size), y_est] = 1
    return increment  # test point x class matrix with 1s marking the estimator prediction


X, y = make_classification()
X_train, X_test, y_train, y_test = train_test_split(X, y)

n_test = X_test.shape[0]
n_ys = np.unique(y_train).size

model = BernoulliNB()
# Draw random sub-samples of the data with replacement.
# (`sc` is assumed to be an already-created SparkContext.)
samples = sc.parallelize(Bootstrap(y.size))
# Train a model for each sub-sample and apply it to the test data.
vote_tally = (samples
              .map(lambda (index, _): model.fit(X[index], y[index]).predict(X_test))
              .map(vote_increment)
              .fold(zero_matrix(n_test, n_ys), np.add))  # Take the learner majority vote.
y_estimate_vote = np.argmax(vote_tally, axis=1)
print accuracy_score(y_test, y_estimate_vote)
Code example #10
    makePred = False  #set to true if you want to make predictions
    getConM = True  #set to true to get the confusion matrix using our optimal parameters

    with open("./DataSetWithDictionarys/trainingSetX.txt",
              "rb") as trainingFileX:
        trainingX = pickle.load(trainingFileX)

    with open("./DataSetWithDictionarys/trainingSetY.txt",
              "rb") as trainingFileY:
        trainingY = pickle.load(trainingFileY)

    with open("./DataSetWithDictionarys/validationSet.txt",
              "rb") as validationFile:
        validationSet = pickle.load(validationFile)

    bs = Bootstrap(trainingX.shape[0], n_iter=NUMIT, random_state=0)
    kBest = SelectKBest(chi2, k=NUMFT)
    if makePred:
        for nb in NUMNB:
            for s in sigma:
                acc = np.array([])
                acpcl = np.array([0.0, 0.0, 0.0, 0.0])
                pre = np.array([0.0, 0.0, 0.0, 0.0])
                rec = np.array([0.0, 0.0, 0.0, 0.0])
                f1score = np.array([0.0, 0.0, 0.0, 0.0])

                for train_index, test_index in bs:
                    [accuracy, accpcl, precision, recall,
                     f1] = (bootstrap(train_index, test_index, nb, trainingX,
                                      trainingY, kBest, s))
                    acc = np.append(acc, accuracy)