Example #1
def test_cross_val_generator_with_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    # explicitly passing indices value is deprecated
    loo = assert_warns(DeprecationWarning, cval.LeaveOneOut,
                       4, indices=True)
    lpo = assert_warns(DeprecationWarning, cval.LeavePOut,
                       4, 2, indices=True)
    kf = assert_warns(DeprecationWarning, cval.KFold,
                      4, 2, indices=True)
    skf = assert_warns(DeprecationWarning, cval.StratifiedKFold,
                       y, 2, indices=True)
    lolo = assert_warns(DeprecationWarning, cval.LeaveOneLabelOut,
                        labels, indices=True)
    lopo = assert_warns(DeprecationWarning, cval.LeavePLabelOut,
                        labels, 2, indices=True)
    b = cval.Bootstrap(2)  # only in index mode
    ss = assert_warns(DeprecationWarning, cval.ShuffleSplit,
                      2, indices=True)
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]
Example #2
def bootstrap(X_train, y_train, d, nIter=100, random_state=0):
    """
    svm bootstrap 0 and 632.

    X_train -> nxD
    y_train -> n
    d -> selected feature size
    output -> err_bs0, err_bs632
    """
    bs = cross_validation.Bootstrap(len(y_train),
                                    n_iter=nIter,
                                    random_state=random_state)
    errs0 = []
    errs632 = []
    for train_index, test_index in bs:
        Xbs_train = X_train[train_index]
        Xbs_test = X_train[test_index]
        ybs_train = y_train[train_index]
        ybs_test = y_train[test_index]
        FeaInd_bs = FeaSelTtest(Xbs_train, ybs_train, d)
        Xdbs_train = Xbs_train[:, FeaInd_bs]
        Xdbs_test = Xbs_test[:, FeaInd_bs]
        clf = svm.SVC(kernel='linear')
        clf.fit(Xdbs_train, ybs_train)
        err0 = 1 - clf.score(Xdbs_test, ybs_test)
        err_bs_resub = 1 - clf.score(Xdbs_train, ybs_train)
        err632 = (1 - 0.632) * err_bs_resub + 0.632 * err0
        errs0.append(err0)
        errs632.append(err632)
    err_bs0 = np.array(errs0).mean()
    err_bs632 = np.array(errs632).mean()
    return (err_bs0, err_bs632)
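A minimal usage sketch for the estimator above; it assumes a pre-0.17 scikit-learn (which still ships cross_validation.Bootstrap) and substitutes a hypothetical stand-in for the FeaSelTtest feature selector, defined in the same module as the function:

import numpy as np
from sklearn import svm, cross_validation

def FeaSelTtest(X, y, d):
    # Hypothetical stand-in: keep the d features whose class means differ
    # the most (the original t-test based selector is not shown here).
    diff = np.abs(X[y == 1].mean(axis=0) - X[y == 2].mean(axis=0))
    return np.argsort(diff)[::-1][:d]

rng = np.random.RandomState(0)
X = rng.randn(40, 10)
y = np.repeat([1, 2], 20)
print(bootstrap(X, y, d=3, nIter=10))  # -> (err_bs0, err_bs632)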
Example #3
def cv_select(y, random_state, n_cv, cv, test_size=0.1):
    if isinstance(cv, basestring):
        if cv == 'shuffle':
            return cross_validation.StratifiedShuffleSplit(
                y, n_cv, test_size=test_size, random_state=random_state)
        elif cv == 'loo':
            return cross_validation.LeaveOneOut(n_cv)
        elif cv == 'kfold':
            return cross_validation.StratifiedKFold(y, n_folds=n_cv)
        elif cv == 'boot':
            return cross_validation.Bootstrap(len(y),
                                              n_iter=n_cv,
                                              train_size=(1 - test_size),
                                              random_state=random_state)
        elif cv == 'boot632':
            return bootstrap_632(len(y),
                                 n_iter=n_cv,
                                 random_state=random_state)
        # for regression
        elif cv == '_shuffle':
            return cross_validation.ShuffleSplit(len(y),
                                                 n_iter=n_cv,
                                                 test_size=test_size,
                                                 random_state=random_state)
        elif cv == '_kfold':
            return cross_validation.KFold(len(y), n_folds=n_cv)
        else:
            raise ValueError("bad cv:%s" % cv)
    else:
        return cv
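A brief usage sketch, assuming Python 2 (basestring) and a pre-0.17 scikit-learn:

import numpy as np
from sklearn import cross_validation

y = np.array([0, 1] * 10)  # 20 samples, two balanced classes
cv = cv_select(y, random_state=0, n_cv=5, cv='shuffle', test_size=0.3)
for train_idx, test_idx in cv:
    print((len(train_idx), len(test_idx)))  # (14, 6) on each iteration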
Example #4
    def _bootstrapped_fit_transform(self,
                                    data,
                                    n_iter=100,
                                    thresh=0.6,
                                    min_samples=10):
        """Resample each splicing event n_iter times to robustly estimate
        modalities.
        """
        bs = cross_validation.Bootstrap(data.shape[0], n_iter=n_iter)

        assignments = pd.DataFrame(columns=data.columns, index=range(n_iter))

        for i, (train_index, test_index) in enumerate(bs):
            # Concatenate the bootstrap train and out-of-bag indices;
            # "+" on two equal-length index arrays would add them elementwise.
            index = np.concatenate([train_index, test_index])
            psi = data.ix[data.index[index], :]
            psi = psi.dropna(axis=1, thresh=min_samples)
            assignments.ix[i] = self._single_fit_transform(psi,
                                                           do_not_memoize=True)

        counts = assignments.apply(
            lambda x: pd.Series(collections.Counter(x.dropna())))
        fractions = counts / counts.sum().astype(float)
        thresh_assignments = fractions[fractions >= thresh].apply(
            self._max_assignment, axis=0)
        thresh_assignments = thresh_assignments.fillna('unassigned')
        return thresh_assignments
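The thresholded majority vote at the end can be shown in isolation. A small sketch with made-up modality assignments, approximating the external _max_assignment helper with an idxmax over the surviving fractions:

import collections
import pandas as pd

# Made-up assignments from 4 bootstrap iterations over 3 splicing events.
assignments = pd.DataFrame(
    [['bimodal', 'included', 'excluded'],
     ['bimodal', 'included', 'included'],
     ['bimodal', 'excluded', 'included'],
     ['middle', 'included', 'excluded']],
    columns=['event1', 'event2', 'event3'])

counts = assignments.apply(
    lambda x: pd.Series(collections.Counter(x.dropna())))
fractions = counts / counts.sum().astype(float)

def max_assignment(col):
    # Most frequent surviving assignment, or 'unassigned' if none passed.
    col = col.dropna()
    return col.idxmax() if len(col) else 'unassigned'

# Keep an assignment only if it wins at least 60% of the iterations.
print(fractions[fractions >= 0.6].apply(max_assignment, axis=0))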
Example #5
def sample_random_n(table,
                    n,
                    stratified=False,
                    replace=False,
                    random_state=None):
    assert n > 0
    n = int(n)
    if replace:
        ind = cross_validation.Bootstrap(len(table),
                                         train_size=n,
                                         random_state=random_state)
    elif stratified and is_discrete(table.domain.class_var):
        train_size = max(len(table.domain.class_var.values), n)
        test_size = max(len(table) - train_size, 0)
        ind = cross_validation.StratifiedShuffleSplit(
            table.Y.ravel(),
            n_iter=1,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state)
    else:
        train_size = max(len(table.domain.class_var.values), n)
        test_size = max(len(table) - train_size, 0)
        ind = cross_validation.ShuffleSplit(len(table),
                                            n_iter=1,
                                            test_size=test_size,
                                            train_size=train_size,
                                            random_state=random_state)
    return next(iter(ind))
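For the replace=True branch above, the old Bootstrap generator draws the train indices with replacement; a minimal sketch of just that draw, assuming a pre-0.17 scikit-learn:

from sklearn import cross_validation

# One bootstrap sample of 5 row indices (with replacement) out of 8.
ind = cross_validation.Bootstrap(8, n_iter=1, train_size=5, random_state=0)
train, test = next(iter(ind))
print(train)  # 5 row indices, possibly with repeats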
Example #6
    def __call__(self, data, fitters):
        indices = cross_validation.Bootstrap(len(data),
                                             n_iter=self.n_resamples,
                                             train_size=self.p,
                                             random_state=self.random_state)

        results = Results(data, len(fitters), store_data=self.store_data)

        results.folds = []
        if self.store_models:
            results.models = []

        row_indices = []
        actual = []
        predicted = [[] for _ in fitters]
        probabilities = [[] for _ in fitters]
        fold_start = 0
        class_var = data.domain.class_var
        for train, test in indices:
            train_data, test_data = data[train], data[test]
            results.folds.append(slice(fold_start, fold_start + len(test)))
            row_indices.append(test)
            actual.append(test_data.Y.flatten())
            if self.store_models:
                fold_models = []
                results.models.append(fold_models)

            for i, fitter in enumerate(fitters):
                model = fitter(train_data)
                if self.store_models:
                    fold_models.append(model)

                if is_discrete(class_var):
                    values, probs = model(test_data, model.ValueProbs)
                    predicted[i].append(values)
                    probabilities[i].append(probs)
                elif is_continuous(class_var):
                    values = model(test_data, model.Value)
                    predicted[i].append(values)

            fold_start += len(test)

        row_indices = np.hstack(row_indices)
        actual = np.hstack(actual)
        predicted = np.array([np.hstack(pred) for pred in predicted])
        if is_discrete(class_var):
            probabilities = np.array(
                [np.vstack(prob) for prob in probabilities])
        nrows = len(actual)
        nmodels = len(predicted)

        results.nrows = len(actual)
        results.row_indices = row_indices
        results.actual = actual
        results.predicted = predicted.reshape(nmodels, nrows)
        if is_discrete(class_var):
            results.probabilities = probabilities
        return results
Example #7
	def fit(self, X, y):
		total_rows, total_features = X.shape
		## randomly select features
		bt = cross_validation.Bootstrap(total_features, n_iter = self.n_estimators, 
										train_size = self.n_features)
		self.feature_sets = [fset for (fset, _) in bt]
		"""
		self.ensemble = Parallel(n_jobs = -1)(delayed(fit_model)(self.ensemble[i], X, y, self.feature_sets[i]) 
					for i in xrange(self.n_estimators))
		"""
		self.ensemble = [fit_model(self.ensemble[i], X, y, self.feature_sets[i]) 
					for i in xrange(self.n_estimators)]
		return self
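Note that Bootstrap is repurposed here to draw random feature subsets rather than sample rows, i.e. a random-subspace ensemble. A tiny sketch of that draw alone, assuming a pre-0.17 scikit-learn:

from sklearn import cross_validation

# 3 random subsets of 4 feature indices out of 10, drawn with replacement.
bt = cross_validation.Bootstrap(10, n_iter=3, train_size=4, random_state=0)
feature_sets = [fset for (fset, _) in bt]
print(feature_sets)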
Example #8
def cross_phenotype_jsd(data, groupby, bins, n_iter=100):
    """Jensen-Shannon divergence of features across phenotypes

    Parameters
    ----------
    data : pandas.DataFrame
        A (n_samples, n_features) Dataframe
    groupby : mapping
        A mapping from samples to phenotypes
    bins : int
        Number of bins to binify the singles data on
    n_iter : int
        Number of bootstrap resampling iterations to perform for the
        within-group comparisons

    Returns
    -------
    jsd_df : pandas.DataFrame
        A (n_features, n_phenotypes^2) dataframe of the JSD between each
        feature between and within phenotypes
    """
    grouped = data.groupby(groupby)
    jsds = []

    seen = set([])

    for phenotype1, df1 in grouped:
        for phenotype2, df2 in grouped:
            pair = tuple(sorted([phenotype1, phenotype2]))
            if pair in seen:
                continue
            seen.add(pair)

            if phenotype1 == phenotype2:
                seriess = []
                bs = cross_validation.Bootstrap(df1.shape[0],
                                                n_iter=n_iter,
                                                train_size=0.5)
                for i, (ind1, ind2) in enumerate(bs):
                    df1_subset = df1.iloc[ind1, :]
                    df2_subset = df2.iloc[ind2, :]
                    seriess.append(
                        binify_and_jsd(df1_subset, df2_subset, None, bins))
                series = pd.concat(seriess, axis=1, names=None).mean(axis=1)
                series.name = pair
                jsds.append(series)
            else:
                series = binify_and_jsd(df1, df2, pair, bins)
                jsds.append(series)
    return pd.concat(jsds, axis=1)
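binify_and_jsd is defined elsewhere; for intuition, the Jensen-Shannon divergence between two binned distributions can be sketched directly with scipy (jensenshannon returns the JS distance, i.e. the square root of the divergence):

import numpy as np
from scipy.spatial.distance import jensenshannon

p = np.array([0.7, 0.2, 0.1])  # binned feature distribution, phenotype 1
q = np.array([0.1, 0.3, 0.6])  # binned feature distribution, phenotype 2
jsd = jensenshannon(p, q, base=2) ** 2  # square the distance to get the JSD
print(jsd)  # 0 = identical distributions, 1 = maximally different (base 2)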
Example #9
def test_cross_val_generator_with_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4, indices=True)
    lpo = cval.LeavePOut(4, 2, indices=True)
    kf = cval.KFold(4, 2, indices=True)
    skf = cval.StratifiedKFold(y, 2, indices=True)
    lolo = cval.LeaveOneLabelOut(labels, indices=True)
    lopo = cval.LeavePLabelOut(labels, 2, indices=True)
    b = cval.Bootstrap(2)  # only in index mode
    ss = cval.ShuffleSplit(2, indices=True)
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss]:
        for train, test in cv:
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]
Example #10
def test_cross_val_generator_with_default_indices():
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 1, 2, 2])
    labels = np.array([1, 2, 3, 4])
    loo = cval.LeaveOneOut(4)
    lpo = cval.LeavePOut(4, 2)
    kf = cval.KFold(4, 2)
    skf = cval.StratifiedKFold(y, 2)
    lolo = cval.LeaveOneLabelOut(labels)
    lopo = cval.LeavePLabelOut(labels, 2)
    b = cval.Bootstrap(2)  # only in index mode
    ss = cval.ShuffleSplit(2)
    for cv in [loo, lpo, kf, skf, lolo, lopo, b, ss]:
        for train, test in cv:
            assert_not_equal(np.asarray(train).dtype.kind, 'b')
            assert_not_equal(np.asarray(test).dtype.kind, 'b')
            X_train, X_test = X[train], X[test]
            y_train, y_test = y[train], y[test]
Example #11
def trails(data, target_vals, bs, alpha):
    # NOTE: the opening lines of this function were missing from the
    # snippet; the signature and loop header are reconstructed from the
    # call site below (trails(data, targets, bs, alpha)).
    list_accuracy = []
    for train_index, test_index in bs:
        train = data[train_index]
        test = data[test_index]
        train_target_vals = target_vals[train_index]
        test_target_vals = target_vals[test_index]
        models = all_model(alpha)
        for model, model_type in models:
            model.fit(train, train_target_vals)
            predict = model.predict(test)
            accuracy = metrics.accuracy_score(test_target_vals, predict)
            list_accuracy.append(accuracy)
            # print model_type, " = ", accuracy
    return max(list_accuracy)


if __name__ == "__main__":
    data, targets = parse()
    data = np.array(data)
    data = data.astype(np.float)
    data = normalize(data, axis=1)
    # print (data[1][0])
    targets = np.array(targets)
    # print (targets[0])
    alpha_vals = np.linspace(750, 1000, 20)
    bs = cv.Bootstrap(targets.size, n_iter=100)
    max_vals = []
    for alpha in alpha_vals:
        print '\n', alpha
        max_vals.append(trails(data, targets, bs, alpha))
    print max(max_vals)

Example #12
#svm, X1, Y1, cv=bs)#, score_func=metrics.f1_score)

#print 'score: %f +- %f' % (scores.mean(), scores.std())

#pred_Y = svm.predict(test_X)

#print metrics.precision_score(test_Y, pred_Y)
#print metrics.recall_score(test_Y, pred_Y)
#print metrics.f1_score(test_Y, pred_Y)

#pred_Y = svm.predict(X1)

#print metrics.precision_score(Y1, pred_Y)
#print metrics.recall_score(Y1, pred_Y)
#print metrics.f1_score(Y1, pred_Y)

alpha_arr = []
for label in np.unique(labels):
    n = np.sum(labels == label)
    alpha_arr.append(n / float(labels.shape[0]))
alpha_arr = np.array(alpha_arr)
alpha = np.max(alpha_arr)
print alpha

bs = cross_validation.Bootstrap(data.shape[0], 3)
for train_indices, test_indices in bs:
    svm.fit(data[train_indices], labels[train_indices])
    score = svm.score(data[test_indices], labels[test_indices])
    # Raw accuracy, then accuracy normalized against the majority-class
    # baseline alpha (0 = no better than always predicting the majority).
    print score, (score - alpha) / (1 - alpha)
    #pred = svm.predict(data[test_indices])
Example #13
def main():

    #random.seed(5)
    #random.random()
	
    startCol = 0
    endCol = 1775  # max = 1775

    trainBase = csv_io.read_data("../Data/train.csv")
	
    result = 100
    avg = 0
    bootstraps = 9 # should be odd for median
	
    rnd_start = 456
	

    predicted_list = []
	
    spanDistance = 12
    bootstrapLists = []
	

    if ( True):
        
        predicted_list = []
        bs = cross_validation.Bootstrap(len(trainBase) - 1,
                                        n_bootstraps=bootstraps,
                                        train_size=0.7,
                                        random_state=0)
        for train_index, test_index in bs:

            trainBaseTemp = [trainBase[i+1] for i in train_index]
            #trainBaseTemp = trainBase
            target = [x[0] for x in trainBaseTemp]#[1001:3700]
            train = [x[1:] for x in trainBaseTemp]#[1001:3700]
	
            testBaseTemp = [trainBase[i+1] for i in test_index]
            #testBaseTemp = trainBase
            targetTest = [x[0] for x in testBaseTemp]#[1:1000]
            trainTest = [x[1:] for x in testBaseTemp]#[1:1000]
	
	
            test = csv_io.read_data("../Data/test.csv")
            test = [x[0:] for x in test]
	
	
            fo = open("rf_stats.txt", "a+")
    
	
            rf = ExtraTreesClassifier(n_estimators=200,
                                      criterion='gini',
                                      max_depth=None,
                                      min_samples_split=1,
                                      min_samples_leaf=1,
                                      min_density=0.10000000000000001,
                                      max_features='auto',
                                      bootstrap=False,
                                      compute_importances=False,
                                      oob_score=False,
                                      n_jobs=1,
                                      random_state=None,
                                      verbose=0)

            rf.fit(train, target)
            prob = rf.predict_proba(trainTest)  # was test
	
            probSum = 0
            totalOffByHalf = 0
	
            for i in range(0, len(prob)):
                probX = prob[i][1]  # [1]
                if (probX > 0.999999999999):
                    probX = 0.999999999999
                if (probX < 0.000000000001):
                    probX = 0.000000000001
                #print i, probSum, probX, target[i]
                #print target[i]*log(probX), (1-target[i])*log(1-probX)
                probSum += targetTest[i]*log(probX)+(1-targetTest[i])*log(1-probX)
                if (math.fabs(probX - targetTest[i]) > 0.5):
                    totalOffByHalf = totalOffByHalf + 1
			
            print "Total Off By > 0.5 ", totalOffByHalf
            print -probSum/len(prob)
	
            #fo.write(str(C) + "," + str(g) + "," + str(-probSum/len(prob)));
	
            avg += 	(-probSum/len(prob))/bootstraps

            predicted_probs = rf.predict_proba(test)  # was test
            predicted_list.append([x[1] for x in predicted_probs])
	
	
            fo.close()


        avg_list = []
        med_list = []

        for p in range(0, len(test)):
            temp_list = []
            for q in range(0, len(predicted_list)):
                temp_list.append(predicted_list[q][p])

            avg_list.append(mean(temp_list))
            med_list.append(getMedian(temp_list))

            print p, q, temp_list, mean(temp_list), getMedian(temp_list)

        bootstrapLists.append(avg_list)
		
    if (len(bootstrapLists) > 1):
        finalList = []
        for p in range(0, len(test)):
            temp_list = []
            for q in range(0, len(bootstrapLists)):
                temp_list.append(bootstrapLists[q][p])

            finalList.append(meanSpan(temp_list, spanDistance))

            print p, q, temp_list, meanSpan(temp_list, spanDistance)
    else:
        finalList = bootstrapLists[0]

    avg_values = ["%f" % x for x in finalList]
    csv_io.write_delimited_file("../Submissions/et_stack_avg_benchmark.csv", avg_values)	
	
	
    print "Average: ", avg
		
    var = raw_input("Enter to terminate.")								
Example #14
def main():

    #random.seed(5)
    #random.random()

    # this method does not seem to benefit from using less than all columns of data.
    startCol = 0
    endCol = 1775  # max = 1775

    trainBase = csv_io.read_data("../Data/train.csv")

    result = 100
    avg = 0
    avg_sum = 0
    avg_counter = 0
    bootstraps = 9

    predicted_list = []

    bs = cross_validation.Bootstrap(len(trainBase) - 1,
                                    n_bootstraps=bootstraps,
                                    train_size=0.7,
                                    random_state=0)
    for train_index, test_index in bs:

        trainBaseTemp = [trainBase[i + 1] for i in train_index]
        trainBaseTemp = trainBase
        target = [x[0] for x in trainBaseTemp][1001:3700]
        train = [x[startCol + 1:endCol + 1] for x in trainBaseTemp][1001:3700]

        testBaseTemp = [trainBase[i + 1] for i in test_index]
        testBaseTemp = trainBase
        targetTest = [x[0] for x in testBaseTemp][1:1000]
        trainTest = [x[startCol + 1:endCol + 1] for x in testBaseTemp][1:1000]

        test = csv_io.read_data("../Data/test.csv")
        test = [x[startCol:endCol] for x in test]

        fo = open("svm_stats.txt", "a+")

        # good for rbf method
        CC = [0.0]
        gg = [-5.5]

        # sigmoid use C=-8
        #C=-8

        #poly use
        #CC = [12, 10, 8, 6,4 ,2]
        #gg = [-19, -17, -15, -13, -11, -9]

        for g in gg:
            for C in CC:
                #for g in range(-19, -10, 2):
                #for C in range(14, -10, -2):
                #if (True):
                #for y in range(0, 6):
                svc = svm.SVC(probability=True,
                              C=10**-8,
                              cache_size=800,
                              coef0=0.0,
                              degree=3,
                              kernel='rbf',
                              shrinking=True,
                              tol=0.001)
                #svc = svm.SVC(probability=True, C=10**C[y], gamma=2**g[y],cache_size=800, coef0=0.0, degree=3, kernel='poly', shrinking=True, tol=0.01)
                svc.fit(train, target)
                prob = svc.predict_proba(trainTest)  # was test

                probSum = 0
                totalOffByHalf = 0

                for i in range(0, len(prob)):
                    #print i, probSum, prob[i][1], target[i]
                    #print target[i]*log(prob[i][1]), (1-target[i])*log(1-prob[i][1])
                    probSum += targetTest[i] * log(
                        prob[i][1]) + (1 - targetTest[i]) * log(1 - prob[i][1])
                    if (math.fabs(prob[i][1] - targetTest[i]) > 0.5):
                        totalOffByHalf = totalOffByHalf + 1

                #print probSum
                #print len(prob)
                print "Total Off By > 0.5 ", totalOffByHalf
                print "C: ", 10**C, " gamma: ", 2**g
                #print "C: ", 10**C[y], " gamma: " ,2**g[y]
                print -probSum / len(prob)

                #fo.write(str(C[y]) + "," + str(g[y]) + "," + str(-probSum/len(prob)));
                fo.write(
                    str(C) + "," + str(g) + "," + str(-probSum / len(prob)))

                avg_sum += (-probSum / len(prob))
                avg_counter = avg_counter + 1

                #if ( -probSum/len(prob) < result ):
                #    result = -probSum/len(prob)
                #    predicted_probs = svc.predict_proba(test)  # was test
                #    predicted_probs = ["%f" % x[1] for x in predicted_probs]
                #    csv_io.write_delimited_file("../Submissions/svm_benchmark.csv", predicted_probs)
                #    print "Generated Data!!"

                predicted_probs = svc.predict_proba(test)  # was test
                predicted_list.append([x[1] for x in predicted_probs])

        fo.close()

    avg_list = []
    med_list = []

    for p in range(0, len(test)):
        temp_list = []
        for q in range(0, len(predicted_list)):
            temp_list.append(predicted_list[q][p])

        avg_list.append(mean(temp_list))
        med_list.append(getMedian(temp_list))

        print p, q, temp_list, mean(temp_list), getMedian(temp_list)

    med_values = ["%f" % x for x in med_list]
    csv_io.write_delimited_file("../Submissions/svm_med_benchmark.csv",
                                med_values)

    avg_values = ["%f" % x for x in avg_list]
    csv_io.write_delimited_file("../Submissions/svm_avg_benchmark.csv",
                                avg_values)

    print "Average: ", (avg_sum / avg_counter)

    var = raw_input("Enter to terminate.")
Example #15
    def __init__(self, data, learners, n_resamples=10, p=0.75, random_state=0,
                 store_data=False, store_models=False):
        super().__init__(data, len(learners), store_data=store_data,
                         store_models=store_models)
        self.store_models = store_models
        self.n_resamples = n_resamples
        self.p = p
        self.random_state = random_state

        indices = skl_cross_validation.Bootstrap(
            len(data), n_iter=self.n_resamples, train_size=self.p,
            random_state=self.random_state
        )

        self.folds = []
        if self.store_models:
            self.models = []

        row_indices = []
        actual = []
        predicted = [[] for _ in learners]
        probabilities = [[] for _ in learners]
        fold_start = 0
        class_var = data.domain.class_var
        for train, test in indices:
            train_data, test_data = data[train], data[test]
            self.folds.append(slice(fold_start, fold_start + len(test)))
            row_indices.append(test)
            actual.append(test_data.Y.flatten())
            if self.store_models:
                fold_models = []
                self.models.append(fold_models)

            for i, learner in enumerate(learners):
                model = learner(train_data)
                if self.store_models:
                    fold_models.append(model)

                if is_discrete(class_var):
                    values, probs = model(test_data, model.ValueProbs)
                    predicted[i].append(values)
                    probabilities[i].append(probs)
                elif is_continuous(class_var):
                    values = model(test_data, model.Value)
                    predicted[i].append(values)

            fold_start += len(test)

        row_indices = np.hstack(row_indices)
        actual = np.hstack(actual)
        predicted = np.array([np.hstack(pred) for pred in predicted])
        if is_discrete(class_var):
            probabilities = np.array([np.vstack(prob) for prob in probabilities])
        nrows = len(actual)
        nmodels = len(predicted)

        self.nrows = len(actual)
        self.row_indices = row_indices
        self.actual = actual
        self.predicted = predicted.reshape(nmodels, nrows)
        if is_discrete(class_var):
            self.probabilities = probabilities
Example #16
def main(kernel, CC, gg, bootstraps, spanDistance):

    #meanSpan([0.197, 0.384,0.382,0.268,0.248,0.280,0.248,0.417], 4)
    #exit()

    #random.seed(5)
    #random.random()

    # this method does not seem to benefit from using less than all columns of data.
    startCol = 0
    endCol = 1775  # max = 1775

    trainBase = csv_io.read_data("../Data/train.csv")

    result = 100
    avg = 0
    avg_sum = 0
    avg_counter = 0
    #bootstraps = 5

    predicted_list = []

    #spanDistance = 15

    #poly use
    #Cc = [12, 10, 8, 6,4 ,2]
    #gg = [-19, -17, -15, -13, -11, -9]

    #CC = [10, 9.5, 9, 8.5, 8, 7.5,  7, 6.5, 6]
    #gg = [-17, -16.5, -16, -15.5, -15, -14.5, -14, -13.5, 13.0]

    # poly optimized set.
    #CC = [10, 8.5, 8.5, 8, 8, 7, 6.5, 6]
    #gg = [-17, -16.5, -16, -15.5, -15, -14.5, -14, -13.5]
    #CC = [10]
    #gg = [-17]

    bootstrapLists = []

    #for g in gg:
    #for C in CC:
    if (True):
        for a in range(0, len(CC)):

            C = CC[a]
            g = gg[a]

            predicted_list = []
            bs = cross_validation.Bootstrap(len(trainBase) - 1,
                                            n_bootstraps=bootstraps,
                                            train_size=0.7,
                                            random_state=0)
            for train_index, test_index in bs:

                trainBaseTemp = [trainBase[i + 1] for i in train_index]
                trainBaseTemp = trainBase
                target = [x[0] for x in trainBaseTemp][1001:3700]
                train = [x[startCol + 1:endCol + 1]
                         for x in trainBaseTemp][1001:3700]

                testBaseTemp = [trainBase[i + 1] for i in test_index]
                testBaseTemp = trainBase
                targetTest = [x[0] for x in testBaseTemp][1:1000]
                trainTest = [x[startCol + 1:endCol + 1]
                             for x in testBaseTemp][1:1000]

                test = csv_io.read_data("../Data/test.csv")
                test = [x[startCol:endCol] for x in test]

                fo = open("svm_stats.txt", "a+")

                svc = None

                if (kernel == 'poly'):
                    svc = svm.SVC(probability=True,
                                  C=10**C,
                                  gamma=2**g,
                                  cache_size=800,
                                  coef0=0.0,
                                  degree=3,
                                  kernel='poly',
                                  shrinking=True,
                                  tol=0.01)
                if (kernel == 'sigmoid'):
                    svc = svm.SVC(probability=True,
                                  C=1 * C,
                                  gamma=2**g,
                                  cache_size=800,
                                  coef0=0.0,
                                  degree=3,
                                  kernel='sigmoid',
                                  shrinking=True,
                                  tol=0.01)
                if (kernel == 'rbf'):
                    svc = svm.SVC(probability=True,
                                  C=10**C,
                                  gamma=2**g,
                                  cache_size=800,
                                  coef0=0.0,
                                  degree=3,
                                  kernel='rbf',
                                  shrinking=True,
                                  tol=0.01)
                if (kernel == 'linear'):
                    svc = svm.SVC(probability=True,
                                  C=10**C,
                                  gamma=2**g,
                                  cache_size=800,
                                  coef0=0.0,
                                  degree=3,
                                  kernel='linear',
                                  shrinking=True,
                                  tol=0.01)

                svc.fit(train, target)
                prob = svc.predict_proba(trainTest)  # was test

                probSum = 0
                totalOffByHalf = 0

                for i in range(0, len(prob)):
                    #print i, probSum, prob[i][1], target[i]
                    #print target[i]*log(prob[i][1]), (1-target[i])*log(1-prob[i][1])
                    probSum += targetTest[i] * log(
                        prob[i][1]) + (1 - targetTest[i]) * log(1 - prob[i][1])
                    if (math.fabs(prob[i][1] - targetTest[i]) > 0.5):
                        totalOffByHalf = totalOffByHalf + 1

                #print probSum
                #print len(prob)
                print "Total Off By > 0.5 ", totalOffByHalf
                print "C: ", 10**C, " gamma: ", 2**g
                #print "C: ", 10**C[y], " gamma: " ,2**g[y]
                print -probSum / len(prob)

                #fo.write(str(C[y]) + "," + str(g[y]) + "," + str(-probSum/len(prob)));
                fo.write(
                    str(C) + "," + str(g) + "," + str(-probSum / len(prob)))

                avg_sum += (-probSum / len(prob))
                avg_counter = avg_counter + 1

                #if ( -probSum/len(prob) < result ):
                #    result = -probSum/len(prob)
                #    predicted_probs = svc.predict_proba(test)  # was test
                #    predicted_probs = ["%f" % x[1] for x in predicted_probs]
                #    csv_io.write_delimited_file("../Submissions/svm_benchmark.csv", predicted_probs)
                #    print "Generated Data!!"

                predicted_probs = svc.predict_proba(test)  # was test
                predicted_list.append([x[1] for x in predicted_probs])

                fo.close()

            avg_list = []
            med_list = []

            for p in range(0, len(test)):
                temp_list = []
                for q in range(0, len(predicted_list)):
                    temp_list.append(predicted_list[q][p])

                avg_list.append(mean(temp_list))
                med_list.append(getMedian(temp_list))

                #print p, q, temp_list, mean(temp_list), getMedian(temp_list)

            bootstrapLists.append(med_list)

    if (len(bootstrapLists) > 1):
        finalList = []
        for p in range(0, len(test)):
            temp_list = []
            for q in range(0, len(bootstrapLists)):
                temp_list.append(bootstrapLists[q][p])

            finalList.append(meanSpan(temp_list, spanDistance))

            print p, q, temp_list, meanSpan(temp_list, spanDistance)
    else:
        finalList = bootstrapLists[0]

    final_values = ["%f" % x for x in finalList]
    csv_io.write_delimited_file(
        "../Submissions/svm-" + kernel +
        "-bootstrap-stack_meanSpan_benchmark.csv", final_values)

    print "Average: ", (avg_sum / avg_counter)

    var = raw_input("Enter to terminate.")
Example #17
def test_bootstrap_test_sizes():
    assert_equal(cval.Bootstrap(10, test_size=0.2).test_size, 2)
    assert_equal(cval.Bootstrap(10, test_size=2).test_size, 2)
    assert_equal(cval.Bootstrap(10, test_size=None).test_size, 5)
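The three assertions exercise test_size handling: a float is read as a fraction of n, an int is taken as-is, and None defaults to the complement of train_size (here 10 - 5). A quick check, assuming the same pre-0.17 cval alias for sklearn.cross_validation:

bs = cval.Bootstrap(10, n_iter=3, random_state=0)
for train, test in bs:
    print((len(train), len(test)))  # (5, 5): train and test complement each other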
Example #18
def main():

    #random.seed(5)
    #random.random()

    startCol = 0
    endCol = 1775  # max = 1775

    trainBase = csv_io.read_data("../Data/train.csv")

    result = 100
    avg = 0
    bootstraps = 1  # should be odd for median

    rnd_start = 456

    predicted_list = []

    for n_est in [160, 320, 640, 1280, 4000, 8000, 16000]:
        for learn_r in [0.5, 0.2, 0.1, 0.05, 0.01, 0.005, 0.001]:
            bs = cross_validation.Bootstrap(len(trainBase) - 1,
                                            n_bootstraps=bootstraps,
                                            train_size=0.6,
                                            random_state=0)
            for train_index, test_index in bs:

                print n_est, learn_r
                #trainBaseTemp = [trainBase[i+1] for i in train_index]
                trainBaseTemp = trainBase
                target = [x[0] for x in trainBaseTemp][1001:3700]
                train = [x[1:] for x in trainBaseTemp][1001:3700]

                #testBaseTemp = [trainBase[i+1] for i in test_index]
                testBaseTemp = trainBase
                targetTest = [x[0] for x in testBaseTemp][1:1000]
                trainTest = [x[1:] for x in testBaseTemp][1:1000]

                test = csv_io.read_data("../Data/test.csv")
                test = [x[0:] for x in test]

                fo = open("gb_stats.txt", "a+")

                #learn_rate=0.1, n_estimators=200
                rf = GradientBoostingClassifier(
                    loss='deviance',
                    learn_rate=learn_r,
                    n_estimators=n_est,
                    subsample=1.0,
                    min_samples_split=1,
                    min_samples_leaf=1,
                    max_depth=3,
                    init=None,
                    random_state=rnd_start)  # , max_features=None

                rf.fit(train, target)
                prob = rf.predict_proba(trainTest)  # was test

                probSum = 0
                totalOffByHalf = 0

                for i in range(0, len(prob)):
                    probX = prob[i][1]  # [1]
                    if (probX > 0.999999999999):
                        probX = 0.999999999999
                    if (probX < 0.000000000001):
                        probX = 0.000000000001
                    #print i, probSum, probX, target[i]
                    #print target[i]*log(probX), (1-target[i])*log(1-probX)
                    probSum += targetTest[i] * log(probX) + (
                        1 - targetTest[i]) * log(1 - probX)
                    if (math.fabs(probX - targetTest[i]) > 0.5):
                        totalOffByHalf = totalOffByHalf + 1

                print "Total Off By > 0.5 ", totalOffByHalf
                print -probSum / len(prob)

                #fo.write(str(C) + "," + str(g) + "," + str(-probSum/len(prob)));

                avg += (-probSum / len(prob)) / bootstraps

                #if ( -probSum/len(prob) < result ):
                #    result = -probSum/len(prob)
                #    predicted_probs = rf.predict_proba(test)  # was test
                #    predicted_probs = ["%f" % x[1] for x in predicted_probs]
                #    csv_io.write_delimited_file("../Submissions/svm_benchmark.csv", predicted_probs)
                #    print "Generated Data!!"

                predicted_probs = rf.predict_proba(test)  # was test
                predicted_list.append([x[1] for x in predicted_probs])

                fo.close()

    avg_list = []
    med_list = []

    for p in range(0, len(predicted_list[0])):
        temp_list = []
        for q in range(0, len(predicted_list)):
            temp_list.append(predicted_list[q][p])

        avg_list.append(mean(temp_list))
        med_list.append(getMedian(temp_list))

        #print p, q, temp_list, mean(temp_list), getMedian(temp_list)

    med_values = ["%f" % x for x in med_list]
    csv_io.write_delimited_file("../Submissions/gb_med_benchmark.csv",
                                med_values)

    avg_values = ["%f" % x for x in avg_list]
    csv_io.write_delimited_file("../Submissions/gb_avg_benchmark.csv",
                                avg_values)

    print "Average: ", avg

    var = raw_input("Enter to terminate.")