Example No. 1
def evaluate_learner(X_train, X_test, y_train, y_test):
    '''
    Run multiple times with different algorithms to get an idea of the
    relative performance of each configuration.
    Returns a sequence of tuples containing:
        (title, expected values, actual values)
    for each learner.
    '''

    # Use a support vector machine for regression
    from sklearn.svm import SVR

    # Train using a radial basis function
    svr = SVR(kernel='rbf', gamma=0.1)
    svr.fit(X_train, y_train)
    y_pred = svr.predict(X_test)
    r_2 = svr.score(X_test, y_test)
    yield 'RBF Model ($R^2={:.3f}$)'.format(r_2), y_test, y_pred

    # Train using a linear kernel
    svr = SVR(kernel='linear')
    svr.fit(X_train, y_train)
    y_pred = svr.predict(X_test)
    r_2 = svr.score(X_test, y_test)
    yield 'Linear Model ($R^2={:.3f}$)'.format(r_2), y_test, y_pred

    # Train using a polynomial kernel
    svr = SVR(kernel='poly', degree=2)
    svr.fit(X_train, y_train)
    y_pred = svr.predict(X_test)
    r_2 = svr.score(X_test, y_test)
    yield 'Polynomial Model ($R^2={:.3f}$)'.format(r_2), y_test, y_pred
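
# Hypothetical usage sketch (not part of the original): consume the generator
# above and plot expected vs. actual values for each learner.
import matplotlib.pyplot as plt

for title, expected, actual in evaluate_learner(X_train, X_test, y_train, y_test):
    plt.scatter(expected, actual, s=10)
    plt.title(title)
    plt.xlabel('expected')
    plt.ylabel('actual')
    plt.show()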
Example No. 2
def train_SVM(X, Y, kernel='rbf', shrinking=True,  tol=0.001, cache_size=1500, verbose=True, max_iter=-1):
	"""Assumes all irrelevant features have been removed from X and Y"""
	"""Learns several hundred SVMs"""

	clf = SVR(kernel=kernel, tol=tol, cache_size=cache_size, verbose=verbose, max_iter=max_iter)
	pipeline = Pipeline(zip([ "imputate", "vart", "scale", "svm" ], [ Imputer(), VarianceThreshold(), StandardScaler(), clf ]))
	
	param_grid = dict(svm__C=[0.1, 1, 10, 100, 1000], svm__gamma=[0.001, 0.01, 1, 10])

	
	grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=3)
	
	results = []

	for i in range(Y[0].shape[1]):
		Y_new = np.fromiter((x[:, i][0, 0] for x in Y), np.double)
		X_new = np.array([np.matrix(x.data).flatten().tolist() for x in X], np.double)
		#X_new = np.fromiter((np.matrix(x.data) for x in X), np.double)

		X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X_new, Y_new, test_size = 0.2)
		X_train = flatten(X_train)
		X_test = flatten(X_test)

		grid_search.fit(X_train, Y_train)
		score = grid_search.score(X_test, Y_test)  # score the fitted best estimator, not the unfitted template clf
		results.append((grid_search.best_estimator_, score))
		print("Best estimator: {0}, Score: {1}".format(grid_search.best_estimator_, score))
	return results
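
# A modernized sketch of the same Pipeline + GridSearchCV pattern (my
# assumption, not the original code): in current scikit-learn, Imputer is
# replaced by SimpleImputer and train_test_split lives in sklearn.model_selection.
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

pipeline = Pipeline([("imputate", SimpleImputer()),
                     ("vart", VarianceThreshold()),
                     ("scale", StandardScaler()),
                     ("svm", SVR(kernel="rbf"))])
param_grid = dict(svm__C=[0.1, 1, 10, 100, 1000],
                  svm__gamma=[0.001, 0.01, 1, 10])
grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=3)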
Example No. 3
def test():
    n_samples, n_features = 10,5
    np.random.seed(0)
    y = np.random.randn(n_samples)
    X = np.random.randn(n_samples, n_features)
    print y
    print X
    clf = SVR(C=1.0,epsilon=0.2)
    clf.fit(X,y)
    print clf.score(X,y)
Example No. 4
def svmRegressorStudy(X, Y, setSize, comment):
    #runs svm regressor on the data
    X_train=X[:setSize]
    X_test=X[setSize:]
    Y_train=Y[:setSize]
    Y_test=Y[setSize:]

    svm=SVR()
    svm.fit(X_train, Y_train)
    print 'svm regressor '+comment
    s1 = svm.score(X_train, Y_train)
    s2 = svm.score(X_test, Y_test)
    print 'svm score for ', s1, s2
class SVRegression:
    def __init__(self, kernel_value, c_value, iter_value):
        self.kernel = kernel_value
        self.c = c_value
        self.iter = iter_value
        self.svr_lin = None
    
    def fit_predict(self, x_train, y_train, x_test):
        self.svr_lin = SVR(kernel=self.kernel, C=self.c, max_iter=self.iter)
        y_lin = self.svr_lin.fit(x_train, y_train).predict(x_test)
        return y_lin
    
    def computeC(self, x_train):
        print "ARRAY ", type(x_train)
        print x_train
        array = x_train.todense()
        print "ARRAY ", type(array)
        print array
        result = array.sum(axis=1, dtype='float')
        result = pow(result, 2)
        total = result.sum(axis=0, dtype='float')
        rows, columns = x_train.shape
        total = float(total)/float(rows)
        total = pow(total,-1)
        print "C", total
        self.c = total

    def computeAccuracy(self, x, y):
        return self.svr_lin.score(x, y)
Example No. 6
def trainmodel(train_data, dev_data, train_label=train_labels, dev_label=test_labels, params={ "kernel":"rbf"}):
    rf=SVR().set_params(**params)
    rf.fit(train_data, train_label)
    rf_pred=rf.predict(dev_data)
    #r2=mean_squared_error(dev_label, rf_pred)
    r2 = rf.score(dev_data, dev_label)  
    
    return r2, rf
Example No. 7
def correlation_search_map(line,variable,lag_variable,kernel_variable):
    
    predictor_data = line.split(';')
    predictor_key = predictor_data[0]
    predictor_name = predictor_data[1]
    
    predictor = cjson.decode(predictor_data[2])
    predictor = transform_serie(predictor)
    
        
    key = str(predictor_key)
            
    original_predictor = predictor.values()
    original_X = np.array(original_predictor,ndmin=2)
    original_X = original_X.reshape((-1,1))
    #original_X = scale(original_X)
    
    results = {}        
    for i in range(lag_variable.value+1):
        
        lagged_predictor = lag_serie(predictor, i)
        
        (list_key,list_variable, list_predictor) = serie_join(variable.value, lagged_predictor)
        
        
        if len(list_predictor) < 6:    
            results[str(i)] = {'r2' : 0}
            continue
        
        
        y = np.array(serie_std(list_variable),ndmin=1)
        
        X = np.array(serie_std(list_predictor),ndmin=2)
        X = X.reshape((-1,1))
        
        
        clf = SVR(kernel=kernel_variable.value)
        clf.fit(X, y)
        r_squared = clf.score(X, y)
        
        
        
        if r_squared < 0.5:
            results[str(i)] = {'r2' : 0}
            continue
        
        
        result = {}
        
        result["r2"] = r_squared
        
        
        results[str(i)] = result

    return { 'id':key, 'results': results, 'name':predictor_name}
Example No. 8
class SvrBlockade(BlockadeModel):
    def __init__(self):
        super(SvrBlockade, self).__init__()
        self.name = "SVR"
        self.svr_cache = {}

    def _svr_predict(self, feature_vec):
        """
        Predicts signal for a feature vector
        """
        if feature_vec not in self.svr_cache:
            np_feature = np.array(feature_vec).reshape(1, -1)
            self.svr_cache[feature_vec] = self.predictor.predict(np_feature)[0]
        return self.svr_cache[feature_vec]

    def train(self, peptides, signals, C=1000, gamma=0.001, epsilon=0.01):
        """
        Trains SVR model
        """
        self.predictor = SVR(kernel="rbf", C=C, gamma=gamma, epsilon=epsilon)
        features = map(lambda p: self._peptide_to_features(p), peptides)
        train_features = np.array(sum(features, []))
        train_signals = np.array(sum(signals, []))
        assert len(train_features) == len(train_signals)

        self.predictor.fit(train_features, train_signals)
        print(self.predictor.score(train_features, train_signals))

    def peptide_signal(self, peptide):
        """
        Generates theoretical signal for a given peptide
        """
        assert self.predictor is not None

        features = self._peptide_to_features(peptide)
        signal = np.array(map(lambda x: self._svr_predict(x), features))
        #normalize the signal's amplitude
        signal = signal / np.std(signal)
        return signal

    def _peptide_to_features(self, peptide):
        """
        Converts peptide into a list of feature vectors
        """
        aa_weights = _aa_to_weights(peptide)
        num_peaks = len(aa_weights) + self.window - 1
        flanked_peptide = ("-" * (self.window - 1) + aa_weights +
                           "-" * (self.window - 1))
        features = []
        for i in xrange(0, num_peaks):
            kmer = flanked_peptide[i : i + self.window]
            feature = _kmer_to_features(kmer)
            features.append(feature)

        return features
 def obj(self, cfg):
     # create the regressor with given params
     clsf = SVR(C = 10.0 ** cfg['C'], 
                epsilon= 10.0 ** cfg['epsilon'], 
                gamma= 10.0 ** cfg['gamma'], 
                kernel=cfg['kernel'])
     
     # fit the regressor
     clsf.fit(self.X, self.Y)
     
     # get the validation score
     score = clsf.score(self.Xv, self.Yv)
     
     return score
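
# A self-contained sketch (synthetic data, my assumption) of the log-scaled
# configuration idea used by obj() above: hyperparameters are passed as
# exponents, expanded with 10.0 ** value before fitting, and ranked by
# validation R^2.
import numpy as np
from sklearn.svm import SVR

rng = np.random.RandomState(0)
X, Xv = rng.randn(80, 4), rng.randn(20, 4)
Y, Yv = rng.randn(80), rng.randn(20)

best_cfg = max(
    ({'C': c, 'epsilon': e, 'gamma': g, 'kernel': 'rbf'}
     for c in (-1, 0, 1, 2) for e in (-3, -2) for g in (-2, -1)),
    key=lambda cfg: SVR(C=10.0 ** cfg['C'],
                        epsilon=10.0 ** cfg['epsilon'],
                        gamma=10.0 ** cfg['gamma'],
                        kernel=cfg['kernel']).fit(X, Y).score(Xv, Yv))
print(best_cfg)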
Example No. 10
def main():
    log.info('main start')

    dfs = []
    for year in range(YEAR_START+1, YEAR_END+1):
        dfs.append(loadHistory(year))
    # print dfs
    df = pd.concat(dfs, ignore_index=True)
    # print df.head(1)
    # print df['score']
    log.info('{} total rows'.format(len(df)))

    # clean player data
    df_cleaned = cleanPlayerData(df)
    df_cleaned.to_csv('{}/data.csv'.format(FOLDER))

    # get labels and features
    log.info('getting labels and features...')
    labels = df_cleaned['score']
    print 'labels\n', labels
    # print df_merged.columns[-10:]
    df_cleaned = df_cleaned.drop('score', axis=1).astype(float)
    features = scale(df_cleaned)
    # print df_merged.columns[-10:]
    print 'features\n', features[0]

    X_train, X_test, y_train, y_test = train_test_split(features, labels)

    # CV
    # clf = GradientBoostingRegressor()
    clf = SVR(kernel='linear')
    # clf = ExtraTreesRegressor(n_estimators=1000)
    cv = cross_val_score(clf, X_train, y_train, cv=5, scoring='r2')
    cv = [abs(n) for n in cv]
    log.info('CV mean {} std {}'.format(np.mean(cv), np.std(cv)))
    # print 'cv', cv

    # train
    clf.fit(X_train, y_train)
    print 'score', clf.score(X_test, y_test)

    # predict
    prediction = predictCurrent(clf, YEAR_END+1, df_cleaned.columns)

    # calculate winnings
    # calculateWinnings(prediction)

    log.info('main end')
Example No. 11
def CalculateSVR(data=None):
    """
    Function is used to classify review text based on Support Vector Regression Classifier
    :param data: Review text with the rating from the data set
    :return: print the accuracy Score
    """
    vectorizer = TfidfVectorizer(tokenizer=pre_process)
    classifier = SVR(kernel='linear')
    train, test = train_test_split([(i['text'], i['stars']) for i in data],
                                   test_size=.2,
                                   random_state=10)
    x_train = vectorizer.fit_transform(i[0] for i in train)
    x_test = vectorizer.transform(i[0] for i in test)
    classifier.fit(x_train, [i[1] for i in train])
    score = classifier.score(x_test, [i[1] for i in test])
    print score
Example No. 12
def train_SVM(trainingdataX,trainingdataY,testdataX,testdataY):
	clf = SVR(C=C, epsilon=epsilon,kernel='rbf')
	clf.fit(trainingdataX,(trainingdataY))

	samples = []
	labels = []
	pred = []
	for sample in range(len(testdataX)):
		samples.append(testdataX[sample])
		labels.append(float(testdataY[sample][0]))
	pred = clf.predict(samples)
	errs = pred - labels

	r2 = clf.score(samples,labels)
	print("SVM: R^2: {} C: {} eps: {}".format(r2,C,epsilon))
	return clf, r2
def make_encoding_model (X,y1,y2,movie_idx):
    ####
    #Encoding model pipeline
    ####
    #1. Fit linear regression at each voxel of training data
    #2. Get model fit at each voxel of testing data
    #3. Find max loading PE for each voxel (assuming these are sorted) and project these to a new brain volume

    #Experimenter knobs:
    #1. Crossval is being done on run_1/run_2. Could change this to 90%/10% having averaged together 1st/2nd half.
    #2. Should be trained on 90% of the movie and tested on the other 10%

    #Fit the voxel timecourse Y with X
    #clf = linear_model.RidgeCV(alphas=[0.001, 0.1, 1, 10],fit_intercept=True)
    #clf = linear_model.Ridge(alpha=0.1,fit_intercept=True)
    clf = SVR(kernel='linear', C=1e3, gamma=0.1)
    #clf = linear_model.Ridge(alpha=1,fit_intercept=True).fit(X,rh_1_array)
    #clf.fit(X,y1) #Fit to first half
    #coeffs = clf.coef_
    #y2_hat = clf.predict(X)
    #MSE = metrics.mean_squared_error(y2, y2_hat) #compare [predicted y2] to y2
    #r2 = clf.score(object_model, y2) #compare [predicted y2] to y2
    #return coeffs, r2, MSE, y2_hat
    ntimes = movie_idx.shape[0]
    coeffs = np.zeros((ntimes,X.shape[1]))
    r2_array = np.zeros(ntimes)
    r_array = np.zeros(ntimes)
    mse_array = np.zeros(ntimes)
    y2_hat_array = []
    for idx in range(0,ntimes - 1):
        #separate out this run from the rest
        start_idx = movie_idx[idx]
        end_idx = movie_idx[idx + 1] - 1
        tx = X[start_idx:end_idx,:]
        ty1 = y1[start_idx:end_idx]
        ty2 = y2[start_idx:end_idx]
        clf = SVR(kernel='linear', C=10, gamma=0.1, verbose = False, max_iter = 1000)
        clf.fit(tx,ty1) #Fit to first half
        coeffs[idx,:] = clf.coef_
        y2_hat = clf.predict(tx)
        y2_hat_array.append(y2_hat)
        r2_array[idx] = clf.score(tx, ty2)
        r_array[idx] = np.corrcoef(ty2, y2_hat)[0,1]
        mse_array[idx] = metrics.mean_squared_error(ty2,y2_hat)

    #y2_hat_array = np.vstack(y2_hat_array)
    return coeffs, r2_array, mse_array, y2_hat_array, r_array
Example No. 14
def main():
    """Load images, train classifier, score classifier."""
    parser = argparse.ArgumentParser(description="Train an SVM model to locate cat faces in images.")
    parser.add_argument("--dataset", required=True, help="Path to your 10k cats dataset directory")
    args = parser.parse_args()

    # initialize dataset
    subdir_names = ["CAT_00", "CAT_01", "CAT_02", "CAT_03", "CAT_04", "CAT_05", "CAT_06"]
    subdirs = [os.path.join(args.dataset, subdir) for subdir in subdir_names]
    dataset = Dataset(subdirs)

    # load images and labels
    print("Loading images...")
    X, y = load_xy(dataset, NB_CROPS, NB_AUGMENTATIONS)
    assert X.dtype == np.float32
    assert np.max(X) <= 1.0
    assert np.min(X) >= 0.0

    # split train and val
    """
    nb_images = X.shape[0]
    nb_train = int(nb_images * (1 - SPLIT))
    X_train = X[0:nb_train, ...]
    y_train = y[0:nb_train, ...]
    X_val = X[nb_train:, ...]
    y_val = y[nb_train:, ...]
    """
    X_val, X_train = X[0:NB_VALIDATION, ...], X[NB_VALIDATION:, ...]
    y_val, y_train = y[0:NB_VALIDATION, ...], y[NB_VALIDATION:, ...]
    print("%d of %d values in y_train are 1, %d of %d values in y_val" % (np.count_nonzero(y_train), y_train.shape[0], np.count_nonzero(y_val), y_val.shape[0]))

    print("Training...")
    #svc = SVC(C=0.1, class_weight="auto", kernel="poly")
    svc = SVR(C=0.1, verbose=True)
    svc.fit(X_train, y_train)

    print("Predictions...")
    preds = svc.predict(X_val)
    for i in range(preds.shape[0]):
        print("%d: pred=%.2f, label=%.2f" % (i, preds[i], y_val[i]))

    print("Scoring...")
    acc = svc.score(X_val, y_val)
    print("accuracy = %.4f" % (acc))
Example No. 15
def main():

    if debug:
        print "\n\n\tdrugBind.py"
    # obtain training data
    try:
        train_x, train_y, newData = getFeatures(featuresFilename)
    except IOError:
        makeFeatures(featuresFilename)
        train_x, train_y, newData = getFeatures(featuresFilename)

    # machine learning steps
    # fit a SVM model to the data
    model = SVR()
    model.fit(train_x, train_y)
    if debug:
        print model
        print "\nUsing training data to test model accuracy:"

    # make predictions
    expected = train_y
    predicted = model.predict(train_x)

    # summarize the fit of the model
    mse = numpy.mean((predicted-expected)**2)
    # mean of squared errors
    if debug:
        print("\n\tMean of squared errors: {}".format(mse))


    '''
    Returns the coefficient of determination R^2 of the prediction.
    The coefficient R^2 is defined as (1 - u/v), where u is the residual sum of squares
    ((y_true - y_pred) ** 2).sum() and v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum().
    Best possible score is 1.0, lower values are worse.
    '''
    if debug:
        print("\tModel score: {}".format(model.score(train_x, train_y)))
    def _random_search(self, random_iter, x, y, kernel_cache_size):
        # Default Values
        c = 1.0
        gamma = 0.0
        best_score = -sys.maxint

        if random_iter > 0:
            sys.stdout.write("Do a random search %d times" % random_iter)
            param_dist = {"C": numpy.power(2.0, range(-5, 16)),
                          "gamma": numpy.power(2.0, range(-15, 4))}
            param_list = [{"C": c, "gamma": gamma}, ]
            param_list.extend(list(ParameterSampler(param_dist,
                                                    n_iter=random_iter-1,
                                                    random_state=self._rng)))
            train_x, test_x, train_y, test_y = \
                train_test_split(x, y, test_size=0.5, random_state=self._rng)

            for idx, d in enumerate(param_list):
                svr = SVR(kernel='rbf',
                          gamma=d['gamma'],
                          C=d['C'],
                          random_state=self._rng,
                          cache_size=kernel_cache_size)
                svr.fit(train_x, train_y)
                sc = svr.score(test_x, test_y)
                # Tiny output
                m = "."
                if idx % 10 == 0:
                    m = "#"
                if sc > best_score:
                    m = "<"
                    best_score = sc
                    c = d['C']
                    gamma = d['gamma']
                sys.stdout.write(m)
                sys.stdout.flush()
            sys.stdout.write("Using C: %f and Gamma: %f\n" % (c, gamma))
        return c, gamma
def linear_regression_multivariant(X_train, X_test, Y_train, Y_test, cost_fun='ordinary_least_squares'):
    if cost_fun == 'ordinary_least_squares':
        regr = linear_model.LinearRegression()
    elif cost_fun == 'Ridge_Regression':
        regr = linear_model.Ridge(alpha=1)
    elif cost_fun == 'Bayesian_Regression':
        regr = linear_model.BayesianRidge()
    elif cost_fun == 'SVR':
        regr = SVR(C=1.0, epsilon=0.2, kernel='linear')
    elif cost_fun == 'KNN_Reg':
        regr = neighbors.KNeighborsRegressor(5, weights='distance')
    else:
        raise Exception('Unrecognized cost function: {}'.format(cost_fun))

    # Train the model using the training sets
    regr.fit(X_train, Y_train)
    predict = regr.predict(X_test)
    # record the experiment performance, Explained variance score: 1 is perfect prediction
    np.seterr(invalid='ignore')
    print(list(predict)[:100])
    print(Y_test[:100])
    evaluate(list(predict), np.array(Y_test),
             'linear regression ' + 'Explained variance score: %.2f' % regr.score(X_test, Y_test))
Example No. 18
print(x_test.shape)
print(y_test.shape)

# Define the algorithm to be used
alg = SVR(kernel='linear', C=1.0, epsilon=0.2)

# Fit the model
alg.fit(x_train, y_train)

# Verify the prediction model using the test data 
y_pred = alg.predict(x_test)
print(y_pred.shape)

# Graph the test data with the regression line
plt.scatter(x_test, y_test)
plt.plot(x_test, y_pred, color='red', linewidth=3)
plt.title('Support Vector Regression')
plt.xlabel('Rooms Number')
plt.ylabel('Mean Value')
plt.show()

# Obtain the parameters ai for this model
a0 = alg.intercept_
print('a0:', a0)
a = alg.coef_
print('ai:', a)
print('y =', a0, '+', a[0], '* x')

# Verify the model error based on R²
print('certainty:', alg.score(x_train, y_train) * 100, '%')
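
# A small consistency check (sketch; assumes x_test holds a single feature
# column, as in the plot above): the explicit line a0 + a1*x should reproduce
# alg.predict.
import numpy as np

x_arr = np.asarray(x_test).reshape(-1, 1)
manual = a0[0] + a[0][0] * x_arr[:, 0]
print('matches predict:', np.allclose(manual, alg.predict(x_arr)))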
Example No. 19
def support_vector_regression(data):
    """
    Main method for support vector regression. Trains an RBF-Model, Polynomial-Model and Linear-Model.

    Args:
        data (DataML): The dataset to perform the regression on.
    """
    global dataset
    dataset = data
    global feature_mapping
    feature_mapping = create_dict()

    # Configure regression model
    svr_rbf = SVR(kernel='rbf', C=1, gamma=0.5, epsilon=0.165, cache_size=1000)
    svr_lin = SVR(kernel='linear', C=1, epsilon=0.165, cache_size=1000)
    svr_poly = SVR(kernel='poly', C=100000, degree=2, epsilon=0.165, cache_size=1000)

    output_dir = create_output_dir()

    results = ''
    results += calculate_crossvalidation(svr_lin, svr_poly, svr_rbf, Config.CV_FOLDS_REGRESSION)

    rbf_count_exact = []
    rbf_count_close = []
    poly_count_exact = []
    poly_count_close = []
    lin_count_exact = []
    lin_count_close = []
    test_set_start = Config.TRAIN_SET_START
    while test_set_start <= Config.TEST_SET_STOP:
        if test_set_start >= Config.TEST_SET_STOP:
            test_set_end = Config.TRAIN_SET_END
        else:
            test_set_end = test_set_start + Config.TEST_SET_SIZE

        X = np.concatenate(
            [data.data[Config.TRAIN_SET_START:test_set_start], data.data[test_set_end:Config.TRAIN_SET_END]])
        y = np.concatenate(
            [data.target[Config.TRAIN_SET_START:test_set_start], data.target[test_set_end:Config.TRAIN_SET_END]])
        X_test = data.data[test_set_start:test_set_end]
        y_test = data.target[test_set_start:test_set_end]

        # Do the mapping for target values
        do_mapping(y_test)
        do_mapping(y)

        rbf = svr_rbf.fit(X, y)
        lin = svr_lin.fit(X, y)
        poly = svr_poly.fit(X, y)

        score_rbf = svr_rbf.score(X_test, y_test)
        score_poly = svr_poly.score(X_test, y_test)
        score_lin = svr_lin.score(X_test, y_test)

        y_rbf_predicted = rbf.predict(X_test)
        y_lin_predicted = lin.predict(X_test)
        y_poly_predicted = poly.predict(X_test)

        results += add_to_results(test_set_end, test_set_start)
        count_rbf_exact, count_rbf_close, results = calculate_metrics(y_test, y_rbf_predicted, "RBF-Kernel", score_rbf,
                                                                      results)
        count_poly_exact, count_poly_close, results = calculate_metrics(y_test, y_poly_predicted, "Poly-Kernel",
                                                                        score_poly, results)
        count_lin_exact, count_lin_close, results = calculate_metrics(y_test, y_lin_predicted, "Linear-Kernel",
                                                                      score_lin, results)
        rbf_count_exact.append(count_rbf_exact)
        rbf_count_close.append(count_rbf_close)
        poly_count_exact.append(count_poly_exact)
        poly_count_close.append(count_poly_close)
        lin_count_exact.append(count_lin_exact)
        lin_count_close.append(count_lin_close)

        graph_dir = '%s/%s_%s_predicted_graph.png' % (output_dir, test_set_start, test_set_end)
        x_axis = np.arange(test_set_start, test_set_end)
        draw_results(y_test, y_lin_predicted, y_poly_predicted, y_rbf_predicted, x_axis, graph_dir)

        test_set_start = test_set_start + Config.TEST_SET_SIZE

    results = add_counts_to_results(lin_count_close, lin_count_exact, poly_count_close, poly_count_exact,
                                    rbf_count_close, rbf_count_exact, results)

    # save file
    with open(output_dir + "/scoring_results.txt", 'w') as file:
        file.write(results)
Example No. 20
######################## Data Scaling ########################
scaler_x = StandardScaler()
x_scaled = scaler_x.fit_transform(x)

scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y)

# Split
x_sc_treinamento, x_sc_teste, y_sc_treinamento, y_sc_teste = train_test_split(
    x_scaled, y_scaled, test_size=0.5, random_state=101)

######################## SVR ########################
regressor_SVM = SVR(kernel='rbf', C=100)
regressor_SVM.fit(x_treinamento, y_treinamento)

regressor_SVM.score(x_treinamento, y_treinamento)
regressor_SVM.score(x_teste, y_teste)

previsao = regressor_SVM.predict(x_teste)

n_toPlot = -365

plt.scatter(Pivot_Data.index[n_toPlot:],
            Pivot_Data['USDBRL Curncy'].values[n_toPlot:],
            s=5,
            c="b",
            label="USDBRL Curncy")
plt.plot(Pivot_Data.index[n_toPlot:],
         regressor_SVM.predict(x)[n_toPlot:],
         c="r",
         label="Estimado")
Example No. 21
 y_train = ss_y.fit_transform(y_train)
 y_test = ss_y.transform(y_test)
 # Linear kernel
 linear_svr = SVR(kernel='linear')
 linear_svr.fit(X_train, y_train)
 linear_svr_y_predict = linear_svr.predict(X_test)
 # Polynomial kernel
 poly_svr = SVR(kernel='poly')
 poly_svr.fit(X_train, y_train)
 poly_svr_y_predict = poly_svr.predict(X_test)
 # RBF (radial basis function) kernel
 rbf_svr = SVR(kernel='rbf')
 rbf_svr.fit(X_train, y_train)
 rbf_svr_y_predict = rbf_svr.predict(X_test)
 print('-------------The result of linear SVR-------------')
 print('R-squared', linear_svr.score(X_test, y_test))
 print(
     'MSE:',
     mean_squared_error(ss_y.inverse_transform(y_test),
                        ss_y.inverse_transform(linear_svr_y_predict)))
 print(
     'MAE:',
     mean_absolute_error(ss_y.inverse_transform(y_test),
                         ss_y.inverse_transform(linear_svr_y_predict)))
 print('-------------The result of poly SVR-------------')
 print('R-squared', poly_svr.score(X_test, y_test))
 print(
     'MSE:',
     mean_squared_error(ss_y.inverse_transform(y_test),
                        ss_y.inverse_transform(poly_svr_y_predict)))
 print(
Example No. 22
trainData.drop(2744604, inplace = True)
print("Total Train Data: ", len(trainData))
#print(trainData.isnull().any())

trainData['matchType'] = trainData['matchType'].astype('category')
trainData['groupId'] = trainData['groupId'].astype('category')
trainData['matchId'] = trainData['matchId'].astype('category')

trainData['groupId_cat'] = trainData['groupId'].cat.codes
trainData['matchId_cat'] = trainData['matchId'].cat.codes
trainData['matchType_cat'] = trainData['matchType'].cat.codes

trainData.drop(columns = ['Id','groupId', 'matchId', 'matchType'], inplace = True)
#print(trainData.head())


x = trainData.drop(['winPlacePerc'],axis=1)
y = trainData['winPlacePerc']


xtrain,xtest,ytrain,ytest = train_test_split(x,y, test_size = 0.2, random_state = 4)
#print(xtrain.values)



print("Train: ", len(ytrain)," Test: ", len(ytest))
print("Model Training...")
svmModel = SVR(gamma=0.001, C=1.0, epsilon=0.2)
svmModel.fit(xtrain.values[:50000],ytrain.values[:50000])
print("Score: " ,svmModel.score(xtest.values[:200],ytest.values[:200]))
import pandas as pd

base = pd.read_csv('plano-saude2.csv')

X = base.iloc[:, 0:1].values
y = base.iloc[:, 1:2].values

# kernel linear
from sklearn.svm import SVR
regressor_linear = SVR(kernel='linear')
regressor_linear.fit(X, y)

import matplotlib.pyplot as plt
plt.scatter(X, y)
plt.plot(X, regressor_linear.predict(X), color='red')
regressor_linear.score(X, y)

# kernel poly
regressor_poly = SVR(kernel='poly', degree=3)
regressor_poly.fit(X, y)

plt.scatter(X, y)
plt.plot(X, regressor_poly.predict(X), color='red')
regressor_poly.score(X, y)

# kernel rbf
from sklearn.preprocessing import StandardScaler
scaler_x = StandardScaler()
X = scaler_x.fit_transform(X)
scaler_y = StandardScaler()
y = scaler_y.fit_transform(y)
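
# A plausible continuation (sketch, not the original code): fit the RBF kernel
# on the scaled data and map predictions back to the original scale for plotting.
regressor_rbf = SVR(kernel='rbf')
regressor_rbf.fit(X, y.ravel())

plt.scatter(scaler_x.inverse_transform(X), scaler_y.inverse_transform(y))
plt.plot(scaler_x.inverse_transform(X),
         scaler_y.inverse_transform(regressor_rbf.predict(X).reshape(-1, 1)),
         color='red')
regressor_rbf.score(X, y.ravel())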
Example No. 24
X = np.array(dataset.connectivity)
y = np.array(dataset.scores['age'])
yr = np.ceil(y / 10).astype(int)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
r2s, maes = [], []
ytrue, ypred = [], []
svr = SVR(kernel='linear')

for iteration, (train, test) in enumerate(skf.split(X, yr)):
    svr.fit(X[train], y[train])
    yp = svr.predict(X[test])
    ytrue.extend(y[test])
    ypred.extend(yp)
    r2 = svr.score(X[test], y[test])
    mae = mean_absolute_error(y[test], yp)
    print('%u: R^2 %.2f - MAE %.2f' % (iteration, r2, mae))
    maes.append(mae)
    r2s.append(r2)

f = plt.figure(figsize=(6, 6))
ax = f.gca()
ax.scatter(ytrue, ypred)
ax.plot([0, 100], [0, 100], 'r', linewidth=2)
ax.grid(linestyle='--')
xlabels = ['%u' % x for x in ax.get_xticks()]
ax.set_xticklabels(xlabels, fontsize=26)
labels = ['%u' % x for x in ax.get_yticks()]
ax.set_yticklabels(labels, fontsize=26)
ax.set_ylabel('Predicted Age', fontsize=24)
Example No. 25
### Create the dependent data set ###
# Convert the dataframe to a numpy array
y = np.array(df['Prediction'])
# Get all of the y value except the last 'n' rows
y = y[:-forecast_out]


# Split data into 80% training and 20% testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

# Create and train SVM model
svr_rbf = SVR(kernel = 'rbf', C=1e3, gamma = 0.1)
svr_rbf.fit(x_train, y_train)

# Testing Model: Score returns the coefficient of determination R^2 of the prediction
svm_confidence = svr_rbf.score(x_test, y_test)
print("svm confidence: ", svm_confidence)

# Create and train the Linear Regression Model
lr = LinearRegression()
# Train the model
lr.fit(x_train, y_train)

# Test LR model
lr_confidence = lr.score(x_test, y_test)
print("lr confidence: ", lr_confidence)

#Set x_forecast equal to the last 30 rows of the original data set from Adj. Close column
x_forecast = np.array(df.drop(['Prediction'], 1))[-forecast_out:]

# Print the LR model predictions for the next 'n' days
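
# A likely continuation (sketch, my assumption): print both models' forecasts
# for the next 'n' days.
lr_prediction = lr.predict(x_forecast)
print(lr_prediction)

svm_prediction = svr_rbf.predict(x_forecast)
print(svm_prediction)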
Example No. 26
y_pred_gb = clf_gb.predict(x_test)

f, (ax1, ax2) = plt.subplots(1, 2, figsize=(30, 10))

# Linear Regression
ax1.scatter(range(len(y_test)), y_test, label='data')
ax1.plot(range(len(y_test)), y_pred_lr, color='green', label='LR model')
ax1.legend()

# Support Vector Machine
ax2.scatter(range(len(y_test)), y_test, label='data')
ax2.plot(range(len(y_test)), y_pred_svr, color='orange', label='SVM-RBF model')
ax2.legend()

f1, (ax3, ax4) = plt.subplots(1, 2, figsize=(30, 10))

# Random Forest Regressor
ax3.scatter(range(len(y_test)), y_test, label='data')
ax3.plot(range(len(y_test)), y_pred_rf, color='red', label='RF model')
ax3.legend()

# Gradient Boosting Regressor
ax4.scatter(range(len(y_test)), y_test, label='data')
ax4.plot(range(len(y_test)), y_pred_gb, color='black', label='GB model')
ax4.legend()

print("Accuracy of Linear Regerssion Model:", clf_lr.score(x_test, y_test))
print("Accuracy of SVM-RBF Model:", clf_svr.score(x_test, y_test))
print("Accuracy of Random Forest Model:", clf_rf.score(x_test, y_test))
print("Accuracy of Gradient Boosting Model:", clf_gb.score(x_test, y_test))
Example No. 27
linear_svr = SVR(kernel='linear')
linear_svr.fit(x_train, y_train)
linear_svr_y_predict = linear_svr.predict(x_test)

# Polynomial kernel configuration
poly_svr = SVR(kernel='poly')
poly_svr.fit(x_train, y_train)
poly_svr_y_predict = poly_svr.predict(x_test)

# RBF kernel configuration
rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(x_train, y_train)
rbf_svr_y_predict = rbf_svr.predict(x_test)

# Model evaluation
print 'the value of default measurement of Linear SVR is ', linear_svr.score(
    x_test, y_test)
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
print 'the value of R-squared of Linear SVR is ', r2_score(
    y_test, linear_svr_y_predict)
print 'the value of mean squared error of Linear SVR is ', mean_squared_error(
    ss_y.inverse_transform(y_test),
    ss_y.inverse_transform(linear_svr_y_predict))
print 'the value of mean absolute error of Linear SVR is ', mean_absolute_error(
    ss_y.inverse_transform(y_test),
    ss_y.inverse_transform(linear_svr_y_predict))

print ''
print 'the value of default measurement of Poly SVR is ', poly_svr.score(
    x_test, y_test)
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
print 'the value of R-squared of Poly SVR is ', r2_score(
Example No. 28
X_test = test[features].dropna()
y_test = test[target].dropna()

svr = SVR(kernel='rbf', C=1e3, gamma=0.1)
# svr = SVR(kernel='linear', C=1e3)
# svr = SVR(kernel='poly', C=1e3, degree=2)

# train the model on the training set
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)
plt.scatter(y_test, y_pred, color='blue')
plt.xlabel("Real revenue")
plt.ylabel("Predicted revenue")
plt.show()

svr_score_train = svr.score(X_train, y_train)
svr_score_test = svr.score(X_test, y_test)
print("Training score: ", svr_score_train)
print("Testing score: ", svr_score_test)

# y = movies.revenue.values

# length = 4083
# y = y.reshape(-1, 1)

# x = preprocessing.scale(x)
# y = preprocessing.scale(y)

# regr = linear_model.LinearRegression()
# regr.fit(x,y)
Example No. 29
# step 1. model
mod = SVR()

# step 2. learning
mod.fit(X_train, y_train)
# y_pred = mod.predict(X_train)
# print_prediction_score(y_train, y_pred) # = training score

# step 3. predict
y_pred = mod.predict(X_test)

# step 4. score
print_prediction_score(y_test, y_pred)
print('R^2 train : %.3f, test : %.3f' %
      (mod.score(X_train, y_train), mod.score(X_test, y_test)))
# }}}
#
# 2. parameter optimization (Grid Search)
#{{{
print('')
print('')
print('# 2. parameter optimization (Grid Search)')

# step 1. model
mod = SVR()

# step 2. learning with optimized parameters
# search range
range_c = [i * 10**j for j in range(-2, 2) for i in range(1, 10)]
range_g = [i * 10**j for j in range(-2, 2) for i in range(1, 10)]
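
# A minimal sketch (my assumption, not shown in the original) of running the
# grid search over the C/gamma ranges defined above.
from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(mod, {'C': range_c, 'gamma': range_g}, cv=5)
gs.fit(X_train, y_train)
print('best params :', gs.best_params_)
print('R^2 test : %.3f' % gs.score(X_test, y_test))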
Example No. 30
pred1 = clss.predict(X_tst)
pred1 = pd.DataFrame(pred1)
###################################################################################
#################### Decision Tree Regressor ######################################
clss = DecisionTreeRegressor()
clss.fit(X_trn, y_trn)
scoreOfModel2 = clss.score(X_trn, y_trn)
print("Model Score DTR: ", scoreOfModel2)

pred2 = clss.predict(X_tst)
pred2 = pd.DataFrame(pred2)
####################################################################################
###################### Support vector regressor ####################################
sv = SVR(kernel='rbf', C=1.0)  # radial basis function(rbf)
sv.fit(X_trn, y_trn)
scoreOfModel3 = sv.score(X_trn, y_trn)
print("Model Score SVR: ", scoreOfModel3)

pred3 = sv.predict(X_tst)
pred3 = pd.DataFrame(pred3)
####################################################################################
################### Multiple Linear Regression #####################################
reg = LinearRegression()
reg.fit(X_trn, y_trn)
scoreOfModel4 = reg.score(X_trn, y_trn)

pred4 = reg.predict(X_tst)
pred4 = pd.DataFrame(pred4)
print('r2 score MLR:', {r2_score(y_tst, pred4)})  #model Evaluation
###################################################################################
######################### Ridge Regression (L2) ###################################
# Best parameters for cuxhaven.de (~5yr. data)
# score: 0.831, Best parameters:  {'C': 10, 'gamma': 0.001
#svr_rbf = SVR(kernel='rbf', C=10, gamma=0.001)

#Best parameters for 0.2 split
# score: 0.834, Best parameters: {'C': 20, 'gamma': 0.001, 'kernel': 'rbf'}
#svr_rbf = SVR(kernel='rbf', C=20, gamma=0.001)

# Unshuffled Best parameters for 0.2 split
# score: 0.812, Best parameters: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
svr_rbf = SVR(kernel='rbf', C=10, gamma=0.001)
eps = 0.1
# RBF
svr_rbf.fit(lx_norm_train,ly_train['surge'])
pred_svr_rbf = svr_rbf.predict(lx_norm_test) # surge predictions by the svr_rbf model
print(svr_rbf.score(lx_norm_train, ly_train['surge'])) # Model Score R^2 of 0.777
# Compare the surge values from the test dataset to the predicted surge values
SR_rmse = np.sqrt(metrics.mean_squared_error(ly_test['surge'], pred_svr_rbf))

# Plot results
plt.figure(figsize=(14, 7))
plt.plot(surge_w1['date'],surge_w1['surge'], 'black') # un-split surge dataset
plt.plot(ly_test['date'], ly_test['surge'], 'blue') # test data (target: surge)

plt.plot(ly_test['date'], pred_svr_rbf, 'red')
#plt.scatter(horizontal[svr_rbf.support_], retry[svr_rbf.support_], \
          # facecolor='none', edgecolor='red', )  # support vectors
plt.plot(ly_test['date'], pred_svr_rbf+eps, color='g', linestyle='--')
plt.plot(ly_test['date'], pred_svr_rbf-eps, color='g', linestyle='--')

plt.xlabel('Time')
Example No. 32
clf_A.fit(features, arousal)

clf_V = SVR(C=1.0,
            cache_size=200,
            coef0=0.0,
            degree=2,
            epsilon=0.5,
            gamma='auto',
            kernel='linear',
            max_iter=-1,
            shrinking=True,
            tol=0.001,
            verbose=False)
clf_V.fit(features, valence)

print('Arousal Training R^2: %0.3f' % clf_A.score(features, arousal))
print('Valence Training R^2: %0.3f' % clf_V.score(features, valence))

# Cross Validation
scores_A = cross_val_score(clf_A,
                           features,
                           arousal,
                           cv=5,
                           scoring='neg_mean_squared_error')
print("Arousal CV MSE: %0.2f (+/- %0.2f)" %
      (-scores_A.mean(), scores_A.std() * 2))

scores_V = cross_val_score(clf_V,
                           features,
                           valence,
                           cv=5,
Example No. 33
linear_svr.fit(X_train, y_train.ravel())
linear_svr_y_predict = linear_svr.predict(X_test)
# Train an SVR with a polynomial kernel and predict on the test samples.
poly_svr = SVR(kernel='poly')
poly_svr.fit(X_train, y_train.ravel())
poly_svr_y_predict = poly_svr.predict(X_test)
# Train an SVR with an RBF kernel and predict on the test samples.
rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(X_train, y_train.ravel())
rbf_svr_y_predict = rbf_svr.predict(X_test)

print ''
print '************************************************************************************************************'
# Evaluate the three SVR (regression) configurations on the same test set using R-squared, MSE and MAE.
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
print 'R-squared value of linear SVR is', linear_svr.score(X_test, y_test)
print 'The mean squared error of linear SVR is', mean_squared_error(
    ss_y.inverse_transform(y_test),
    ss_y.inverse_transform(linear_svr_y_predict))
print 'The mean absolute error of linear SVR is', mean_absolute_error(
    ss_y.inverse_transform(y_test),
    ss_y.inverse_transform(linear_svr_y_predict))

print ''
print '************************************************************************************************************'
print 'R-squared value of Poly SVR is', poly_svr.score(X_test, y_test)
print 'The mean squared error of Poly SVR is', mean_squared_error(
    ss_y.inverse_transform(y_test), ss_y.inverse_transform(poly_svr_y_predict))
print 'The mean absolute error of Poly SVR is', mean_absolute_error(
    ss_y.inverse_transform(y_test), ss_y.inverse_transform(poly_svr_y_predict))
def main():
    horses98 = HorseParserNoHandicaps('./../Data/born98.csv').horses
    horses05 = HorseParserNoHandicaps('./../Data/born05.csv').horses

    races98 = RaceParserNoHandicaps('./../Data/born98.csv').races
    races05 = RaceParserNoHandicaps('./../Data/born05.csv').races

    print 'HorsesBorn98 Dataset'
    horses_train_98, horses_test_98 = split_dataset(horses98)

    horses_98_X_train = []
    horses_98_y_train = []
    for h in horses_train_98:
        v,s = compute_vector(h)
        horses_98_X_train.append(v)
        horses_98_y_train.append(s)

    print 'No. of instances in training set:'
    print len(horses_98_X_train)
    print len(horses_98_y_train)
    print ''

    horses_98_X_test = []
    horses_98_y_test = []
    for h in horses_test_98:
        v,s = compute_vector(h)
        horses_98_X_test.append(v)
        horses_98_y_test.append(s)

    print 'No. of instances in testing set:'
    print len(horses_98_X_test)
    print len(horses_98_y_test)
    print ''
    
    print 'Create SVR object'
    # Create svr object
    svr98 = SVR(kernel='linear', C=1e3)#, gamma=0.1)

    print 'Training SVR'
    # Train the model using the training sets
    svr98.fit(horses_98_X_train, horses_98_y_train)

    print 'Predicting'
    horses_98_y_pred = svr98.predict(horses_98_X_test)

    # Explained variance score: 1 is perfect prediction
    print 'Variance score:'
    print svr98.score(horses_98_X_test, horses_98_y_test)
    print ''

    print 'Mean absolute error:'
    print mean_absolute_error(horses_98_y_test, horses_98_y_pred)
    print ''

    print 'Explained variance:'
    print explained_variance_score(horses_98_y_test, horses_98_y_pred)
    print ''

    print 'Mean squared error:'
    print mean_squared_error(horses_98_y_test, horses_98_y_pred)
    print ''

    print 'R2 score:'
    print r2_score(horses_98_y_test, horses_98_y_pred)
    print ''
Example No. 35
from sklearn.svm import SVR

erb = 4

# SVR
svr_rbf = SVR(kernel='rbf',
              C=44.424063740,
              epsilon=0.0056846371,
              gamma=1 / 2000)

svr_rbf.fit(bts_base[erb][:1500], pathloss[erb]['plreal'][:1500])

y = svr_rbf.predict(bts_base[erb][1500:])
plreal = pathloss[erb]['plreal'][1500:]

# The mean squared error
print("Mean squared error: %.2f" % np.mean((y - plreal)**2))
# R^2 score: 1 is perfect prediction
print('SVR score: %.2f' % svr_rbf.score(bts_base[erb][1500:], plreal))

# Plot
lw = 2
plt.plot([plreal.min(), plreal.max()],
         [plreal.min(), plreal.max()],
         lw=2,
         color="black")
plt.scatter(plreal, y, color='darkorange', label='data', edgecolors="black")
plt.xlabel("Predict Path Loss")
plt.ylabel("Real Path Loss")
plt.show()
Example No. 36
    def svmRegressor(self):

        trainingData, desiredLabel = self.normalizeColumnwiseData()
        #trainingData,desiredLabel = self.loadExperimentData()
        test_size = 0.3
        coordinates_train, coordinates_test, windspeed_train, windspeed_test = cross_validation.train_test_split(trainingData, desiredLabel, test_size=test_size)
        _, coordinates_predict, _, windspeed_predict = cross_validation.train_test_split(coordinates_test, windspeed_test, test_size=0.04)
        kernel = 'rbf'
        c = 21.0
        epsilon = 0.2
        gamma = 1.6
        curveFit = SVR(kernel=kernel, C=c, epsilon=epsilon, gamma=gamma)
        print curveFit
        print "kernel : ", kernel, "C : ", c, "epsilon : ", epsilon, "gamma : ", gamma, "test % : ", test_size, "no of train data : ", len(coordinates_train)
        curveFit = curveFit.fit(coordinates_train, windspeed_train)
        print "Number of support vectors used:",len(curveFit.support_vectors_)
        print "Prediction Score :", curveFit.score(coordinates_test, windspeed_test)
        predicted_speed = curveFit.predict(coordinates_predict)
        predicted_speed_random_number_generator = []
        for i in coordinates_predict:
        	predicted_speed_random_number_generator.append(random.uniform(10,43))
        
        predicted_speed_random_number_generator2 = []
        for i in coordinates_predict:
            predicted_speed_random_number_generator2.append(random.uniform(10,43))
        
        mse = mean_squared_error(windspeed_test, curveFit.predict(coordinates_test))
        rms = sqrt(mse)
        print "mse : ",mse

        errorbarValues = []
        #errorbins = [-4,-3,-2,-1,0,1,2,3,4,5]
        errorbins = np.arange(-30,30,1)
        for threshold in errorbins:
            correct_estimation = 0
            for i in range(len(predicted_speed)):
                if (windspeed_predict[i] - predicted_speed[i] < threshold) and (windspeed_predict[i] - predicted_speed[i] > threshold-1):
                    correct_estimation += 1
            print "for threshold between: ", threshold, " and ", threshold-1, " estimation: ", correct_estimation, " out of : ", len(windspeed_predict)
            errorbarValues.append(correct_estimation)

        """for threshold in [1,2,3,4,5]:
            correct_estimation = 0
            for i in range(len(predicted_speed_random_number_generator)):
                if np.abs(windspeed_predict[i] - predicted_speed_random_number_generator[i]) < threshold:
                    correct_estimation += 1
            print "for threshold : ", threshold,"Fake Correct estimation: ", correct_estimation, " out of : ", len(windspeed_predict)       
        
        for threshold in [1,2,3,4,5]:
            correct_estimation = 0
            for i in range(len(predicted_speed_random_number_generator)):
                if np.abs(predicted_speed_random_number_generator[i] - predicted_speed_random_number_generator2[i]) < threshold:
                    correct_estimation += 1
            print "for threshold : ", threshold,"Total Fake Correct estimation: ", correct_estimation, " out of : ", len(predicted_speed_random_number_generator)"""       
        
	    ###############################################################################
        #Plot the error bar
        fig = plt.figure()
        ax = fig.add_subplot(111)
        width = 0.4
        ax.bar([i - width for i in errorbins],errorbarValues,width,color="y",alpha=0.7)
        #ax.bar(errorbins,errorbarValues,width,color="y",alpha=0.7)
        plt.xlabel("Estimation error(kmph)")
        plt.ylabel("Number of observation")
        plt.title("Error histogram SVR")
        ax.set_xlim(-25,25)
        plt.grid()
        # look at the results
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.scatter(np.arange(0,len(predicted_speed),1),predicted_speed, c='g',marker='+', label='predicted speed')
        ax.scatter(np.arange(0,len(windspeed_predict),1),windspeed_predict, c='r',marker='x', label='Actual data')
        ax.set_xlim(-2,len(windspeed_predict))
        ax.set_ylim(8,45)   

        plt.xlabel('Number of test cases')
        plt.ylabel('wind speed')
        plt.title('Support Vector Regression')
        ax.legend()
        
        for i in range(len(predicted_speed)):
            ax.annotate('', xy=(i, windspeed_predict[i]), xytext=(i, predicted_speed[i]),
                    arrowprops=dict(facecolor='b',alpha=0.5, shrink=0.03,headwidth=4.5,width=1.5,frac=0.4),
                    )
        plt.show()
Example No. 37
w = w[:-forecast]
print(w)

q=y['Prediction']
q=q[:-forecast]
print(q)

from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(w,q,test_size =0.20)

svr_rbf = SVR(kernel='rbf',C=1e3)
svr_rbf.fit(x_train,y_train)

svr_out = svr_rbf.score(x_test,y_test)
print (svr_out)

x_forecast = y.drop(['Prediction'],1)[-forecast:]
print(x_forecast)

svr_prediction = svr_rbf.predict(x_forecast)
print(svr_prediction)

import matplotlib.pyplot as plt

z = y['2018-11-30':]

plt.figure(figsize=(16,8))
plt.title('Bajaj Auto')
plt.xlabel('Days')
Example No. 38
dim = 128

f1 = f1(dim)

nbTrain, nbTest = 10000, 100

xTrain = np.random.rand(nbTrain, dim)
xTest = np.random.rand(nbTest, dim)

#%%
yTrain = f1.compute(xTrain)
yTest = f1.compute(xTest)

svr = SVR()
print 'SVR'
print 'Learning...'
svr.fit(xTrain, yTrain)
print 'Scoring...'
print svr.score(xTest, yTest)

#%%
yTrain = f1.computeC(xTrain)
yTest = f1.computeC(xTest)

svc = SVC()
print 'SVC'
print 'Learning...'
svc.fit(xTrain, yTrain)
print 'Scoring...'
print svc.score(xTest, yTest)
Example No. 39
X = data.iloc[:, 1:2].values
y = data.iloc[:, 2].values
length_old = len(data.columns)

sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(X.reshape(-1, 1))
y = sc_y.fit_transform(y.reshape(-1, 1))

reg = SVR(kernel='rbf')
reg.fit(X, y)

y_pred = reg.predict(np.array([5]).reshape(-1, 1))
print(y_pred)

r2 = reg.score(X, y)
print(r2)

plt.scatter(X, y, color='r')
plt.plot(X, reg.predict(X), color='b')
plt.show()

# Importing dataset
dataSet = pd.read_csv(
    '/home/admin1/Desktop/Gayatri/Week2/ML_Problems/DecisionTreeRegression/Position_Salaries.csv'
)
length_old = len(dataSet.columns)

# Handling categorical data
positions = pd.get_dummies(dataSet['Position'])
dataSet = dataSet.drop('Position', axis=1)
Example No. 40
import pandas as pd

base = pd.read_csv('house-prices.csv')

X = base.iloc[:, 3:19].values
y = base.iloc[:, 2:3].values

from sklearn.preprocessing import StandardScaler
scaler_x = StandardScaler()
X = scaler_x.fit_transform(X)
scaler_y = StandardScaler()
y = scaler_y.fit_transform(y)

from sklearn.model_selection import train_test_split
X_treinamento, X_teste, y_treinamento, y_teste = train_test_split(
    X, y, test_size=0.3, random_state=0)
from sklearn.svm import SVR
regressor = SVR(kernel='rbf')
regressor.fit(X_treinamento, y_treinamento)
score = regressor.score(X_treinamento, y_treinamento)

regressor.score(X_teste, y_teste)

previsoes = regressor.predict(X_teste)
y_teste = scaler_y.inverse_transform(y_teste)
previsoes = scaler_y.inverse_transform(previsoes)

from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_teste, previsoes)
Example No. 41
          .format(tree.score(training_encoded_data, training_targets)))
    print('    VGG16 Tree MaxLevel=3 R2 testing  score = {}'
          .format(tree.score(testing_encoded_data, testing_targets)))

    tree = DecisionTreeRegressor(max_depth=2)
    tree.fit(training_encoded_data, training_targets)
    print('    VGG16 Tree MaxLevel=2 R2 training score = {}'
          .format(tree.score(training_encoded_data, training_targets)))
    print('    VGG16 Tree MaxLevel=2 R2 testing  score = {}'
          .format(tree.score(testing_encoded_data, testing_targets)))

    svm = SVR()
    scaler = preprocessing.MinMaxScaler().fit(training_encoded_data)
    svm.fit(scaler.transform(training_encoded_data), training_targets)
    print('    VGG16 MinMaxScale Svm R2 training score = {}'
          .format(svm.score(scaler.transform(training_encoded_data), training_targets)))
    print('    VGG16 MinMaxScale Svm R2 testing  score = {}'
          .format(svm.score(scaler.transform(testing_encoded_data), testing_targets)))

    svm = SVR()
    svm.fit(training_encoded_data, training_targets)
    print('    VGG16 Svm R2 training score = {}'
          .format(svm.score(training_encoded_data, training_targets)))
    print('    VGG16 Svm R2 testing  score = {}'
          .format(svm.score(testing_encoded_data, testing_targets)))

print()
for index in range(11):
    print('ResNet50 index={}'
          .format(index))
    extractor = ResNet50ImageFeature(index)
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.20, random_state=42)

# applying different classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR
cfl1 = RandomForestClassifier()
cfl2 = SVR()

cfl1.fit(features_train, labels_train)

acc_test = cfl1.score(features_train, labels_train)

acc_test

cfl2.fit(features_train, labels_train)

cfl2.score(features_train, labels_train)

cfl1.score(features_test, labels_test)

cfl2.score(features_test, labels_test)

#predicting a type
flower = [[5.7, 0.5]]
class_code = cfl1.predict(flower)
decoded_class = le.inverse_transform(class_code)
print(decoded_class)

class_code
    # dimension should be n_bins
    target = geo_data[:, column_feature_mapping[pair[0]]]
    print target

    # create data (a 2D numpy array based on the classifiers the user provides)
    # dimensions should be n_bins x n_classifiers
    data = geo_data[:, [column_feature_mapping[pair[1]]]]
    print data

    clf = SVR()
    training_data, testing_data, training_target, testing_target = cross_validation.train_test_split(
        data, target, test_size=0.4, random_state=0
    )

    clf.fit(training_data, training_target)
    print clf.score(testing_data, testing_target)
    data = geo_data[:, column_feature_mapping[pair[1]]]
    if pair[1] == "farMarket":
        print "farmarket data:"
        print data

    corr_coefficent = stat.pearsonr(data, target)
    print corr_coefficent[0]
    plt.figure(i)
    plt.scatter(data, target)
    plt.xlabel(pair[1])
    plt.ylabel(pair[0])
    plt.title(
        "Pearson Correlation: "
        + str(corr_coefficent[0])
        + "  SVR Score: "
Example No. 44
# Train the models
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_y_predict = lr.predict(X_test)
print("解析方法的评估:", lr.score(X_test, y_test))

sgdr = SGDRegressor()
sgdr.fit(X_train, y_train)
sgdr_y_predict = sgdr.predict(X_test)
print("随机梯度法的评估:", sgdr.score(X_test, y_test))

linear_svr = SVR(kernel='linear')  # SVR with a linear kernel
linear_svr.fit(X_train, y_train)
linear_y_predict = linear_svr.predict(X_test)
print("线性核函数性能评估:", linear_svr.score(X_test, y_test))

poly_svr = SVR(kernel='poly')
poly_svr.fit(X_train, y_train)
poly_y_predict = poly_svr.predict(X_test)
print("多项式核函数性能评估:", poly_svr.score(X_test, y_test))

rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(X_train, y_train)
rbf_y_predict = rbf_svr.predict(X_test)
print("径向基核函数性能评估:", rbf_svr.score(X_test, y_test))

knr = KNeighborsRegressor()
knr.fit(X_train, y_train)
knr_y_predict = knr.predict(X_test)
print("K近邻性能评估:", knr.score(X_test, y_test))
Example No. 45
######################
# cross validating
#####################
X_train, X_test, y_train, y_test = cross_validation.train_test_split(train, train_count, test_size=0.4, random_state=0)

#############################################################################################
# 										Training Section									#
#############################################################################################

print "Training"

# Support Vector Regression
rbf = SVR(kernel="rbf", C=1e3, gamma=0.1)
# rbf.fit(train, train_count)
rbf.fit(X_train, y_train)
print "svr with rbf ", rbf.score(X_test, y_test)

# Bayesian Ridge Regression
clf = linear_model.BayesianRidge(compute_score=True)
# clf.fit(train,train_count)
clf.fit(X_train, y_train)
print "Bayesian Ridge Regression ", clf.score(X_test, y_test)

# Linear Regression
ols = linear_model.LinearRegression()
# ols.fit(train,train_count)
ols.fit(X_train, y_train)
print "Linear Regressor  ", ols.score(X_test, y_test)


# Gradient Boosting Regression
Example No. 46
sc_X = StandardScaler()
sc_y = StandardScaler()
x_tr = sc_X.fit_transform(x_tr)
x_ts = sc_X.transform(x_ts)
y_tr = sc_y.fit_transform(y_tr)
y_ts = sc_y.transform(y_ts)  # use transform (not fit_transform) on the test targets

#SVR algorithm for training purpose
from sklearn.svm import SVR

regressor = SVR(kernel='rbf')
regressor.fit(x_tr, y_tr)

#SVR algorithm for testing purpose
y_pred = regressor.predict(x_ts)
regressor.score(x_tr, y_tr)
regressor.score(x_ts, y_ts)

#Applying K-Fold cross validation
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator=regressor,
                             X=x_tr,
                             y=y_tr,
                             cv=5,
                             scoring='neg_mean_squared_error')
accuracies = accuracies * (-1)

accuracies.mean()
accuracies.std()
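
# Sketch (my addition): the negated-MSE fold scores above can be summarized as
# an RMSE once the sign has been flipped back to positive.
import numpy as np

rmse_folds = np.sqrt(accuracies)
print(rmse_folds.mean(), rmse_folds.std())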
Example No. 47
def svr_trading(context, data):

    # Historical data, lets get the past days close prices for
    pastPrice = history(bar_count=context.history_len, frequency='1d', field='price')

    # Make predictions on universe
    for stock in data:
        # Make sure this stock has no existing orders or positions to simplify our portfolio handling.
        if check_if_no_conflicting_orders(stock) and context.portfolio.positions[stock].amount == 0:
            
            #This is a scoring system for our model, we only trade when confident our model is wicked awesome 
            full_series = np.array(pastPrice[stock].values)
            l           = context.out_of_sameple_bin_size
            power = 1 #N where X^n for weight function
            
            # Create bins of X len to hold as out of sample data, average score(error) of these is a decent measure of fit.
            prediction_history = []
            for i in np.arange(context.history_len/context.out_of_sameple_bin_size):
                #Index of current in same, and out of sample data.
                # 3 cases of this slicing
                if   i == 0:
                    #First run, only two bins to work with(First OOSD bin, and the rest of the data)
                    ISD = full_series[l:]
                    OOSD = full_series[:l]
                    X = np.arange(l,len(full_series))

                    # use a variable weight (~0 - 1.0)
                    weight_training = np.power(np.arange(l,len(full_series),dtype=float), power)[::-1]/np.power(np.arange(l,len(full_series),dtype=float), power)[::-1].max()
                    # use a variable weight, focus on next day prediction (~0 - 1.0 - ~0)
                    weight_score = np.concatenate((np.power(np.arange(1,l+1,dtype=float), power)/np.power(np.arange(1,l+1,dtype=float), power).max(),
                                                   np.power(np.arange(l+1,len(full_series)+1,dtype=float), power)[::-1]/np.power(np.arange(l+1,len(full_series)+2,dtype=float), power)[::-1].max()))
                    """print len (weight_training)
                    print weight_training
                    print len (weight_score)
                    print weight_score
                    print exit()"""
                elif i == context.history_len/context.out_of_sameple_bin_size - 1:
                    #Last run, only two bins to work with(Last OOSD bin, and the rest of the data)
                    ISD = full_series[:-l]
                    OOSD = full_series[-l:]
                    X = np.arange(0,len(full_series)-l)

                    # use a variable weight (~0 - 1.0)
                    weight_training = np.power(np.arange(l,len(full_series),dtype=float)+1, power)/np.power(np.arange(l,len(full_series),dtype=float)+1, power).max()
                    # use a variable weight, focus on next day prediction (~0 - 1.0 - ~0)
                    weight_score = np.concatenate((np.power(np.arange(1,len(full_series)-l+1,dtype=float), power)/np.power(np.arange(1,len(full_series)-l+2,dtype=float), power).max(),
                                                   np.power(np.arange(1,l+1,dtype=float), power)[::-1]/np.power(np.arange(1,l+1,dtype=float), power)[::-1].max()))
                    """print len (weight_training)
                    print weight_training
                    print len (weight_score)
                    print weight_score
                    print exit()"""
                else:
                    #Any other run: we have a sandwich of OOSD in the middle of two ISD sets, so we need to aggregate.
                    ISD = np.concatenate((full_series[:(l*i)], full_series[l*(i+1):]))
                    OOSD = full_series[l*i:l*(i+1)]
                    X = np.concatenate(( np.arange(0,(l*i)), np.arange(l*(i+1),len(full_series)) ))

                    # use a variable weight (~0 - 1.0)
                    weight_training = np.concatenate(( np.power(np.arange(1, l*i+1, dtype=float), power)/np.power(np.arange(1, l*i+1, dtype=float), power).max(),
                                                       np.power(np.arange(l*(i+1), len(full_series), dtype=float), power)[::-1]/np.power(np.arange(l*(i+1), len(full_series),dtype=float), power)[::-1].max() ))
                    # use a variable weight, focus on next day prediction (~0 - 1.0 - ~0)
                    weight_score = np.concatenate(( np.power(np.arange(1, l*(i+1)+1, dtype=float), power)/np.power(np.arange(1, l*(i+1)+1, dtype=float), power).max(),
                                                    np.power(np.arange(l*(i+1), len(full_series), dtype=float), power)[::-1]/np.power(np.arange(l*(i+1), len(full_series)+1, dtype=float), power)[::-1].max() ))
                    """print len (weight_training)
                    print weight_training
                    print len (weight_score)
                    print weight_score
                    exit()"""
                
                # Domain and range of training data
                #X = np.arange(len(ISD))
                X = np.atleast_2d(X).T
                y = ISD

                # Domain of prediction set
                #x = np.atleast_2d(np.linspace(0, len(ISD)+len(OOSD)-1, len(ISD)+len(OOSD))).T
                #x = np.atleast_2d(np.linspace(len(ISD) ,len(ISD)+len(OOSD)-1, len(OOSD))).T
                x = np.atleast_2d(np.linspace(0, len(full_series)-1, len(full_series))).T
                
                # epsilon-Support Vector Regression using scikit-learn
                # Read more here: http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html
                SVR_model = SVR(kernel='rbf', C=100, gamma=.01)
                SVR_model.fit(X,y, weight_training)
                y_predSVR = SVR_model.predict(x)
                
                if np.isnan(full_series).any() or np.isinf(full_series).any():
                    print("{0} failed due to data INF or NAN".format(stock))
                    y_score = 0
                    break
                else:
                    y_score = SVR_model.score(x, full_series)#, sample_weight=weight_score) #y_predSVR[-len(OOSD):] np.atleast_2d(y_predSVR).T

                    #log.debug(y_score)
                
                prediction_history.append(y_score)
                
            # Average the out-of-sample scores collected above rather than just the last one
            score = np.mean(prediction_history)

            # If we are studying one stock, lets plot its correlation regression results
            if len(data) == 1:
                record(Ideal=1.0, Score=score) #Slope=slope, R_value=r

            
            # Store the prediction for comparison with the rest of the universe
            #   Measure accuracy as the mean of the distance to the ideal value of 
            #   the r2 and slope from past vs predicted price correlation regression
            if score >= context.score_filter:
                
                #The model was accepted, make a forecast
                
                #Form domain and range of the full data set (no out-of-sample data is held out since the model has already been scored)
                X = np.arange(context.history_len)
                X = np.atleast_2d(X).T
                y = np.array(pastPrice[stock].values)

                # Domain of the prediction set. We only need to predict the next close price.
                x = np.atleast_2d(np.linspace(len(y), len(y), 1)).T
                """log.debug(X)
                log.debug(len(X))
                log.debug(x)
                log.debug(len(x))
                exit()"""
                
                # use a linearly peaking weight, focus on next day prediction (~0 - 1.0 - ~0)
                #weight_training = np.power(np.arange(1,context.history_len+1, dtype=float), power)/np.power(np.arange(1,context.history_len+1, dtype=float), power).max()
                #weight_training = np.exp(np.arange(1,context.history_len+1, dtype=float))/np.exp(np.arange(1,context.history_len+1, dtype=float)).max()
                
                # epsilon-Support Vector Regression using scikit-learn
                # Read more here: http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html
                SVR_model = SVR(kernel='rbf', C=100, gamma=.01)
                SVR_model.fit(X, y)#, weight_training)
                y_predSVR = SVR_model.predict(x)
                
                context.next_pred_price[stock] = y_predSVR[-1]
            else:
                #Case where stock is left in dict and we dont want to use it, so remove it.
                if stock in context.next_pred_price:
                    del context.next_pred_price[stock]
            

    # Count the number of trades so we can split the available cash properly
    number_of_trades_today = 0
    for stock in data:
        # Make sure this stock has no existing orders or positions to simplify our portfolio handling
        # Also check that we have a prediction stored in the dict
        if check_if_no_conflicting_orders(stock) and \
           context.portfolio.positions[stock].amount == 0 and \
           stock in context.next_pred_price:
            # If we plan to move on this stock, count it (explained in the buy/short statements below; keep this condition in sync with both — a sketch of the percent_change helper appears after this snippet).
            if (percent_change(context.next_pred_price[stock], pastPrice[stock][-1])  >= context.action_to_move_percent and \
               percent_change(context.next_pred_price[stock], data[stock]['price'])   >= context.action_to_move_percent) or \
               (percent_change(context.next_pred_price[stock], pastPrice[stock][-1])  <= -context.action_to_move_percent and \
                 percent_change(context.next_pred_price[stock], data[stock]['price']) <= -context.action_to_move_percent):
                number_of_trades_today += 1
    #

    #Let's use record to plot how many securities are traded on each day.
    if len(data) >= 2:
        record(number_of_stocks_traded=number_of_trades_today)

    #Make buys and shorts if the predicted close change is bigger than our tolerance; apply the same check to the current price to avoid opening gaps.
    for stock in data:
        # Make sure this stock has no existing orders or positions to simplify our portfolio handling
        # Also check that we have a prediction stored in the dict
        if check_if_no_conflicting_orders(stock) and context.portfolio.positions[stock].amount == 0 and stock in context.next_pred_price:

            #Go long if we predict the close price will change more (upward) than our tolerance,
            # applying the same filter against current price vs predicted close in case of a gap up/down.
            if percent_change(context.next_pred_price[stock], pastPrice[stock][-1]) >= context.action_to_move_percent and \
               percent_change(context.next_pred_price[stock], data[stock]['price']) >= context.action_to_move_percent:

                # Place an order, and store the ID to fetch order info
                orderId    = order_target_percent(stock, 1.0/number_of_trades_today)
                # How many shares did we just order, since we used a target percent of available cash rather than a share count.
                shareCount = get_order(orderId).amount

                # We can add a timeout time on the order.
                #context.duration[orderId] = exchange_time + timedelta(minutes=5)

                # We need to calculate our own inter cycle portfolio snapshot as its not updated till next cycle.
                value_of_open_orders(context, data)
                availibleCash = context.portfolio.cash-context.cashCommitedToBuy-context.cashCommitedToSell

                print("+ BUY {0:,d} of {1:s} at ${2:,.2f} for ${3:,.2f} / ${4:,.2f} @ {5:s}"\
                         .format(shareCount,
                                 stock,data[stock]['price'],
                                 data[stock]['price']*shareCount, 
                                 availibleCash,
                                 context.exchange_time))

            #Go short if we predict the close price will change more (downward) than our tolerance,
            # applying the same filter against current price vs predicted close in case of a gap up/down.
            elif percent_change(context.next_pred_price[stock], pastPrice[stock][-1]) <= -context.action_to_move_percent and \
                 percent_change(context.next_pred_price[stock], data[stock]['price']) <= -context.action_to_move_percent:

                #orderId    = order_target_percent(stock, -1.0/len(data))
                orderId    = order_target_percent(stock, -1.0/number_of_trades_today)
                # How many shares did we just order, since we used a target percent of available cash rather than a share count.
                shareCount = get_order(orderId).amount

                # We can add a timeout time on the order.
                #context.duration[orderId] = exchange_time + timedelta(minutes=5)

                # We need to calculate our own inter cycle portfolio snapshot as its not updated till next cycle.
                value_of_open_orders(context, data)
                availibleCash = context.portfolio.cash-context.cashCommitedToBuy+context.cashCommitedToSell

                print("- SHORT {0:,d} of {1:s} at ${2:,.2f} for ${3:,.2f} / ${4:,.2f} @ {5:s}"\
                         .format(shareCount,
                                 stock,data[stock]['price'],
                                 data[stock]['price']*shareCount, 
                                 availibleCash,
                                 context.exchange_time))
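# The logic above relies on a percent_change(new, old) helper that is defined elsewhere in the
# algorithm. A minimal sketch of what such a helper presumably looks like - the signed percent
# move of the predicted value relative to the reference price - is shown below for reference;
# the original implementation may differ.
def percent_change(new_value, old_value):
    """Signed percent change of new_value relative to old_value."""
    return (new_value - old_value) / old_value * 100.0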
Ejemplo n.º 48
0
test_feature_list = []
test_y_list = []
for x,y in testExamples:
    test_feature_list.append(tweetFeatureExtractor(x))
    test_y_list.append(y)

test = False
if test:
    n_samples, n_features = 10,5
    np.random.seed(0)
    y = np.random.randn(n_samples)
    X = np.random.randn(n_samples, n_features)
    print y
    print X
    clf = SVR(C=1.0,epsilon=0.2)
    clf.fit(X,y)
    print clf.score(X,y)

## Straight SVR
#y = np.array(y_list)
#X = np.matrix(feature_list)
#test_y = np.array(test_y_list)
#test_X = np.matrix(test_feature_list)
#print y
#print X
#clf = SVR(C=1.0,epsilon=0.4)
#clf.fit(X,y)
#print clf.score(test_X,test_y)
#print test_y_list

#pred_output = [int(round(y)) for y in clf.predict(test_X).tolist()]
#print pred_output
Ejemplo n.º 49
0
class predictRealData:
    def __init__(self,learningProblem,seriesLengthInSeconds,readPath,writePath,test_size,slidingWindow=False,flag=True):
        self.frameRatePerSecond = 120
        self.seriesLengthInSeconds = seriesLengthInSeconds #0.5 #0.0625
        self.featuresPerSeries = np.round(self.seriesLengthInSeconds * self.frameRatePerSecond)
        self.framesInTheSlidingWindow = int(self.featuresPerSeries/3)
        self.learningProblem = learningProblem
        #This is the path from where the experiment results will be read
        self.readPath = readPath
        self.writePath = writePath
        self.slidingWindow = slidingWindow
        #print "features per series",2*self.featuresPerSeries
        self.numberOfFeatures = 6
        self.test_size = test_size
        if learningProblem != "regression":
            self.clf = SVC(C=1.6,gamma=0.002)
        else:
            self.clf = SVR(kernel='rbf',C=1.2, epsilon=1.38)
        self.flagPredict = flag
        if flag:
            self.writePath = './tuft_real_data/17June/extractedFeatures/'

    def outlierDetection(self,listItem):
        #Reference http://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm
        listItem = copy.deepcopy(listItem)
        #Step1: Sort the list
        sortedListItem = sorted(listItem)
        #Step2: Find the median value
        medianValue = self.calculateMedian(sortedListItem)
        #Step3: Find the lower quartile
        lowerQuartile = sortedListItem[:len(listItem)//2]
        lowerQuartileValue = self.calculateMedian(lowerQuartile)
        #Step4: Find the upper quartile
        upperQuartile = sortedListItem[len(listItem)//2:]
        upperQuartileValue = self.calculateMedian(upperQuartile)
        #Step5 find interquartile range
        interquartileRange = (upperQuartileValue - lowerQuartileValue)*1.5
        innerFences = ((lowerQuartileValue - interquartileRange),(upperQuartileValue + interquartileRange)) 
        #Step6 find outerquartile range
        interquartileRange2 = (upperQuartileValue - lowerQuartileValue)*3.0
        outerFences = ((lowerQuartileValue - interquartileRange2),(upperQuartileValue + interquartileRange2)) 
        
        #Step7 Calculate the major and minor outliers
        majorOutliers = []
        minorOutliers = []
        #print "outerFences",outerFences
        for i in listItem:
            if i < outerFences[0] or i > outerFences[1]:
                majorOutliers.append(i)
            if (i < innerFences[0] or i > innerFences[1]) and i not in majorOutliers:
                minorOutliers.append(i) 
        return majorOutliers, minorOutliers
    
    def calculateMedian(self,listItem2):
        listItem2 = copy.deepcopy(listItem2)
        if len(listItem2) == 0:
            print "Empty array"
            return None
        if len(listItem2)%2 == 0:
            middleValue = len(listItem2)//2
            medianValue = (listItem2[middleValue-1] + listItem2[middleValue])/2.0
        else:
            medianValue = (listItem2[len(listItem2)//2])
        #print medianValue
        return medianValue

    def rangeFinder(self,listItem3,outliers=None):
        listItem3 = copy.deepcopy(listItem3)
        outliers = copy.deepcopy(outliers)
        withoutOutliersList = []
        if outliers != None and len(outliers) > 0:
            for v in listItem3:
                if v not in outliers:
                    withoutOutliersList.append(v)
            valueRange = (min(withoutOutliersList),max(withoutOutliersList))
        else:
            valueRange = (min(listItem3),max(listItem3))
            withoutOutliersList = listItem3

        #print "range",valueRange[0],valueRange[1]
        medianValue = self.calculateMedian(withoutOutliersList)
        std = np.std(withoutOutliersList)
        return valueRange,std
    
    '''This function reads files and creates a feature list which is sent to the
    writeFeatures() function, which creates .csv files containing features and
    separate files containing the related labels'''

    def processFiles(self):
        list_of_files = glob.glob(self.readPath+'*.txt')
        #list_of_files = glob.glob('./tuft_real_data/22April/*.txt')
        colorCode = -1
        colors = ["ro","bo","mo","go","yo"]
        numberOfFiles = 1
        #For median Plot
        mediansX = []
        mediansY = []
        speeds = []
        averagesX = []
        averagesY = []
        '''For the speed vs outlier analysis'''
        speedsStack = []
        outlierStackX = []
        outlierStackY = []
        for fileName in list_of_files:
            print os.path.splitext(fileName)[0].split("/")[-1]
            #data_list = open( fileName, "r" ).readlines()
        
            featureSet = []
            normalisedX = []
            normalisedY = []
            typeOfFlow = None
            speed = 0.0
            #Saving x and y coordinates separately for range analysis
            xValues = []
            yValues = []
            colorCode += 1 

            #For one file should be one origin x and y
            originX = None
            originY = None
            flagOrigin = True
            with open(fileName,'r') as f:
                for line in f:
                    if "TYPE" in line and line != "":
                        typeOfFlow = line.split()
                        typeOfFlow = typeOfFlow[1]
                        #print typeOfFlow 
                    elif "SPEED" in line and line != "":
                        speed = line.split()
                        speed = speed[1]   
                        #print speed
                    elif "VECTOR" in line and line != "":
                        coordinates = line.split()
                        #print coordinates[0]
                        #TO DO: Only fixed start coordinates can be used.
                        if flagOrigin:
                            originX = float(coordinates[1])
                            originY = float(coordinates[2])
                            flagOrigin = False
                        #normalisedX = float(coordinates[3]) - float(coordinates[1])
                        #normalisedY = float(coordinates[4]) - float(coordinates[2])
                        #xValues.append(coordinates[1])
                        xValues.append(float(coordinates[3]) - float(coordinates[1]))
                        #yValues.append(coordinates[2])
                        yValues.append(float(coordinates[4]) - float(coordinates[2]))
                        #featureSet.append(normalisedX)
                        #featureSet.append(normalisedY)
                    elif "FRAME" in line and line != "":
                        frames = line.split()
                        vectors = len(xValues) + 1
                        '''if vectors in [3660,3690,3900,4230,4470,4500,5730,9900]:
                            print frames[1]'''
                featureFileName = os.path.splitext(fileName)[0].split("/")[-1]
            
                        
            numberOfFiles += 1
            
            #Calculating the outliers
            majorOutliersX,minorOutliersX = self.outlierDetection(xValues)
            majorOutliersY,minorOutliersY = self.outlierDetection(yValues)
            outlierStackX.append(len(minorOutliersX))
            outlierStackY.append(len(minorOutliersY))
            speedsStack.append(speed)

            #Calculating Range of the vectors
            valueRangeX,medianValueX = self.rangeFinder(xValues,majorOutliersX)
            valueRangeY,medianValueY = self.rangeFinder(yValues,majorOutliersY)

            #Plotting the median and range values
            
            mediansX.append(medianValueX)
            mediansY.append(medianValueY)
            speeds.append(speed)
                        
            '''Note: the way this logic is implemented, the outliers in y
            do not come into play at all'''

            withoutOutliersListX = []
            withoutOutliersIndex = []
            for i in range(len(xValues)):
                if xValues[i] not in majorOutliersX:
                    withoutOutliersListX.append(xValues[i])
                    withoutOutliersIndex.append(i)
            withoutOutliersListY = []
            for i in range(len(yValues)):
                if i in withoutOutliersIndex:
                    withoutOutliersListY.append(yValues[i])
                    #featureSet.append(xValues[i])
                    #featureSet.append(yValues[i])   
                    normalisedX.append(xValues[i])
                    normalisedY.append(yValues[i])         
            
            #Function call for the supplying the polar coordinates
            featureSet2 = []
            featureSet2 = self.supplyStats(normalisedX,normalisedY)
            if self.flagPredict:
                self.predictResult(featureSet2)
                typeOfFlow = ['NA']
                speed = 'NA'
            self.writeFeatures(featureSet2,featureFileName,typeOfFlow,speed)

        return featureSet
    
    '''This function takes input of the xdiff, ydiff values and returns the blocks of
    Stats in place of the raw data'''
    def supplyStats(self,xDiff,yDiff):
        xDiff = copy.deepcopy(xDiff)
        yDiff = copy.deepcopy(yDiff)
        xDiffBlocks = self.cutThelengthOfdata(xDiff,self.featuresPerSeries)
        yDiffBlocks = self.cutThelengthOfdata(yDiff,self.featuresPerSeries)
        #print xDiffBlocks
        statsAsFeatures = []
        for i in range(len(xDiffBlocks)):
            #At this index the block of x values are stored, this will be processed for stats collection
            distances,angles,featureSet2 = self.supplyPolarCoordinates(xDiffBlocks[i],yDiffBlocks[i])
            statsAsFeatures.append(np.mean(distances))
            statsAsFeatures.append(np.mean(angles))
            valueRange,std = self.rangeFinder(xDiffBlocks[i])
            statsAsFeatures.append(std)
            statsAsFeatures.append(valueRange[1]-valueRange[0])
            valueRange,std = self.rangeFinder(yDiffBlocks[i])
            statsAsFeatures.append(std)
            statsAsFeatures.append(valueRange[1]-valueRange[0])

        #print "Mean Length of vector,Mean angle,median distance,range diff distance,median angles,range diff angles"
        return statsAsFeatures

    '''This function returns the blocks of the feature to be exchanged by the stats data'''
    def cutThelengthOfdata(self,dataInput,lengthOfVector):
        counter = 0
        eachLine = []
        dataOutput = []
        for i in range(len(dataInput)): #1,2,3,4,5,6,4,3,2,1
            if counter < lengthOfVector:
                eachLine.append(dataInput[i])
                counter += 1
            else:
                counter = 1
                dataOutput.append(eachLine)
                eachLine = []
                eachLine.append(dataInput[i])
        # keep the trailing block when it reaches the requested length
        if len(eachLine) == lengthOfVector:
            dataOutput.append(eachLine)
        return dataOutput

    '''Distance calculator'''
    def distanceCalculator(self,diff1,diff2):
        distance = np.sqrt(pow(diff1,2)+pow(diff2,2))
        return distance


    '''This function calculates the polar coordinates for supplied list of cartesian coordinates'''
    def supplyPolarCoordinates(self,xDiff,yDiff):
        xDiff = copy.deepcopy(xDiff)
        yDiff = copy.deepcopy(yDiff)
        polarDistances = [] 
        angles = []
        features = []
        for i in range(len(xDiff)):
            dist = self.distanceCalculator(xDiff[i],yDiff[i])
            polarDistances.append(dist)
            #theta = math.degrees(math.atan2(yDiff[i],xDiff[i]))
            theta = math.atan2(yDiff[i],xDiff[i])
            angles.append(theta)
            features.append(dist)
            features.append(theta)
        return polarDistances,angles,features

    '''This function writes the vector series data and labels in .csv format.
    The length of the series depends on the parameters in init'''
    def writeFeatures(self,fileContent,fileName,typeOfFlow,speed):
        features = copy.deepcopy(fileContent)
        #print features
        #speed = float(speed)
        if speed == 'NA' and self.learningProblem != "classification":
            print "speed not available"
            return None
        else:
            print typeOfFlow,speed


        #path = './tuft_real_data/3May/extractedFeatures/'
        path = self.writePath

        with open(path+'data/'+fileName+'.csv', 'w') as csvfile:
            writer = csv.writer(csvfile, delimiter=',')
            eachLine = []
            counter = 0
            counter2 = 0
            for i in features: 
                eachLine.append(i)
                counter += 1
                if counter%(self.numberOfFeatures) == 0:
                    writer.writerow(eachLine)
                    eachLine = []
                    counter2 += 1
            

        with open(path+'labels/'+fileName+'label.csv', 'w') as csvfile2:
            writer2 = csv.writer(csvfile2,delimiter=',')
            counter = 0
            for i in features: 
                counter += 1
                if counter%(self.numberOfFeatures) == 0:
                    if self.learningProblem == "classification":
                        writer2.writerow(typeOfFlow)
                    else:
                        #print "regression"
                        writer2.writerow([speed])

    def normalizeColumnwiseData(self):
        trainingData,desiredLabel = self.loadExperimentData()
        #Step 1: Check how many columns are there
        noOfColumns = len(trainingData[0])
        trainDataArray = np.asarray(trainingData)
        #print trainingData.shape , noOfColumns
        normalizedData = np.zeros(trainingData.shape)
        for col in range(noOfColumns):
            columnVal = np.asarray(trainingData[:,col])
            #print len(columnVal) , len(trainingData)
            #Step 2: For all the rows and specific column do the normalization 
            meanSubstracted = columnVal - np.mean(columnVal)
            normalizedColumn = meanSubstracted/np.std(columnVal)
            #print "alles gut"
            #Step 3: Stack them vertically one by one
            normalizedData[:,col] =normalizedColumn
            #print normalizedData
        #print normalizedData.shape
        return normalizedData,desiredLabel

    def loadExperimentData(self):
        path = "./tuft_real_data/13June/extractedFeatures/"
        list_of_data_files = glob.glob(path+'data/*.csv')
        list_of_data_files = sorted(list_of_data_files)
        flagInitial = True
        
        for file_name in list_of_data_files:
            featureFileName = os.path.splitext(file_name)[0].split("/")[-1]
            #print featureFileName
            data = np.loadtxt(fname=file_name,delimiter=',')
            if flagInitial:
                flagInitial = False
                trainData = data
            else:
                trainData = np.vstack((trainData,data))

        #For reading the labels
        list_of_label_files = glob.glob(path+'labels/*.csv')
        list_of_label_files = sorted(list_of_label_files)
        flagInitial = True        
        for file_name in list_of_label_files:
            featureFileName = os.path.splitext(file_name)[0].split("/")[-1]
            #print featureFileName
            labels = np.loadtxt(fname=file_name,delimiter=',')
            if flagInitial:
                flagInitial = False
                trainLabel = labels
            else:
                trainLabel = np.concatenate((trainLabel,labels),axis=0)

        return trainData,trainLabel
        
    def svmClassifier(self):
        trainData,trainLabel = self.normalizeColumnwiseData()

        print "total available data",len(trainData)
        data_train,data_test,label_train,label_test = cross_validation.train_test_split(trainData,trainLabel,test_size=self.test_size)
        
        #self.clf = SVC(C=1.6,gamma=0.002)
        self.clf = self.clf.fit(data_train,label_train)
        print "prediction Accuracy",self.clf.score(data_test,label_test)
        print "Number of support vectors used:",len(self.clf.support_vectors_)

        '''#Use the cross_validation score
        clf2 = SVC(C=1.6,gamma=0.002)
        cv = cross_validation.ShuffleSplit(len(trainData), n_iterations=3,test_size=self.test_size, random_state=0)
        scores = cross_validation.cross_val_score(clf2, trainData, trainLabel, cv=cv)
        print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean()*100, scores.std()*100 / 2)'''

    def svmRegressor(self):
        trainData,trainLabel = self.loadExperimentData()

        print "total available data",len(trainData)
        data_train,data_test,label_train,label_test = cross_validation.train_test_split(trainData,trainLabel,test_size=self.test_size)
        
        #self.clf = SVC(C=1.6,gamma=0.002)
        self.clf = self.clf.fit(data_train,label_train)
        print "prediction Accuracy",self.clf.score(data_test,label_test)
        print "Number of support vectors used:",len(self.clf.support_vectors_)

        '''#Use the cross_validation score
        clf2 = SVC(C=1.6,gamma=0.002)
        cv = cross_validation.ShuffleSplit(len(trainData), n_iterations=3,test_size=self.test_size, random_state=0)
        scores = cross_validation.cross_val_score(clf2, trainData, trainLabel, cv=cv)
        print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean()*100, scores.std()*100 / 2)'''

    def predictResult(self,features):
        testData = copy.deepcopy(features)
        results = []
        eachLine = []
        counter = 0
        confidence = []
        for i in testData: 
            eachLine.append(i)
            counter += 1
            if counter%(self.numberOfFeatures) == 0:
                results.append(self.clf.predict(eachLine)[0])
                #confidence.append(self.clf.predict_proba(eachLine))
                eachLine = []
        #print results
        #print confidence
        self.returnFrames(results)

    def returnFrames(self,results):
        results = copy.deepcopy(results)
        lastResult = 1
        frameCounter = 0
        for r in results:
            if r != lastResult:
                print frameCounter*self.featuresPerSeries,lastResult
                lastResult = r
            frameCounter += 1

        print frameCounter*self.featuresPerSeries,lastResult
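# A small standalone illustration (not part of the class above) of the Tukey-fence rule that
# outlierDetection() implements: values beyond the quartiles +/- 1.5*IQR are "minor" outliers and
# values beyond +/- 3*IQR are "major" outliers. numpy's percentile is used here only for brevity;
# the class computes the quartiles by hand.
import numpy as np

values = np.array([10, 12, 11, 13, 12, 11, 10, 14, 12, 13, 18, 40], dtype=float)
q1, q3 = np.percentile(values, [25, 75])
iqr = q3 - q1
major_mask = (values < q1 - 3.0 * iqr) | (values > q3 + 3.0 * iqr)
minor_mask = ((values < q1 - 1.5 * iqr) | (values > q3 + 1.5 * iqr)) & ~major_mask
print("major outliers: %s" % values[major_mask])   # the extreme value 40
print("minor outliers: %s" % values[minor_mask])   # the milder value 18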
Ejemplo n.º 50
0
class Trainer():

	def __init__(self):
		
		with open('credentials.json') as credentials_file:
		    credentials = json.load(credentials_file)

		passwd = credentials['mysql']['password']
		self.con = mdb.connect(host='127.0.0.1', port=3306, user='******', passwd=passwd, db='insight', autocommit=True)
		self.cur = self.con.cursor()
		print "Connected to database"
		
		self.load_data()

	def load_data(self):
		f = open('./pickles/mysql_dump.pickle', 'rb')
		self.loanData = pickle.load(f)
		self.loanData = pd.DataFrame(self.loanData)
		f.close()

	def drop_na(self):
		self.loanData = self.loanData.dropna()
		self.loanData.index = range(len(self.loanData))

	def drop_columns(self):
		#drop the columns with malformed data in mysql db
		self.loanData = self.loanData.drop(['none',
											'educational',
											'IA',
											'IDAHO',
											'ME',
											'NE',
											'other_housing',
											'issue_year'], 1)

	def drop_prepaid_loans(self):
		indices_to_drop = []
		for i in range(len(self.loanData)):
			if self.loanData['loan_status'][i]==1 and self.loanData['days_to_zero_dollars'][i] < 1000:
				indices_to_drop.append(i)
		self.loanData = self.loanData.drop(indices_to_drop, 0)
		print "Number of prepaid loans: ", len(indices_to_drop)
		print "Number of loans after dropping prepaids: ", len(self.loanData)


	def define_features_targets(self, kind="regression"):
		
		#take out 1000 random loans with 36 month terms for testing
		#ids are already populated in test_loans for consistency
		test_ids = []
		sql_query = "select id from test_loans;"
		self.cur.execute(sql_query)
		sql_resp = self.cur.fetchall()
		print "length of sql response: ", len(sql_resp)
		for val in sql_resp:
			test_ids.append(val[0])
		print "length of test_ids: ", len(test_ids)
		#make the test and train data frames
		self.testLoanData = self.loanData[self.loanData['id'].isin(test_ids)]
		self.trainLoanData = self.loanData[~self.loanData['id'].isin(test_ids)]
		self.testLoanData.index = range(len(self.testLoanData))
		self.trainLoanData.index = range(len(self.trainLoanData))
		print "Train Loan Data: ", len(self.trainLoanData)
		print "Test Loan Data: ", len(self.testLoanData)
		
		self.features = self.trainLoanData.drop(['loan_status', 
											'days_to_zero_dollars',
											'id'], 1)
		self.features = self.features.values
		#choose different target variables for regression vs classification
		if kind == "regression":
			self.targets = self.trainLoanData['days_to_zero_dollars'].values
			self.y_test = self.testLoanData['days_to_zero_dollars'].values
		elif kind == "classification":
			self.targets = self.trainLoanData['loan_status'].values
			self.y_test = self.testLoanData['loan_status'].values

	def preprocess(self):
		(self.X_train, 
		 self.X_cv, 
		 self.y_train, 
		 self.y_cv) = dm.split_train_test(features=self.features, 
		 									targets=self.targets, 
		 									test_size=0.1)
		self.X_test = self.testLoanData.drop(['loan_status', 
											  'days_to_zero_dollars',
											  'id'], 1).values
		(self.X_train, self.X_cv) = dm.standardize_samples(self.X_train, 
														  self.X_cv)
		(self.X_train, self.X_cv) = dm.scale_samples_to_range(self.X_train, 
																self.X_cv)
		(self.X_test, _) = dm.standardize_samples(self.X_test, 
														  self.X_test)
		(self.X_test, _) = dm.scale_samples_to_range(self.X_test, 
																self.X_test)

	def define_dummy_classifier(self):
		self.clf = DummyClassifier()

	def define_rfr(self, n_estimators=10):
		self.regr = RandomForestRegressor(n_estimators=n_estimators, oob_score=True)
		print self.regr.get_params()

	def define_linear_regressor(self):
		self.regr = LinearRegression()
		print self.regr.get_params()

	def define_SVR(self, C=1, gamma=0.1):
		self.regr = SVR(C=C, gamma=gamma, verbose=3)
		print self.regr.get_params()

	def define_logistic_regressor(self, penalty="l2", C=1.0, class_weight=None):
		self.clf = LogisticRegression(penalty=penalty, 
									  C=C, 
									  class_weight=class_weight)
		print self.clf.get_params()

	def define_rfc(self, n_estimators=10):
		self.clf = RandomForestClassifier(n_estimators=n_estimators, oob_score=True)
		print self.clf.get_params()

	def train(self, kind="regression"):
		print "Fitting training data"
		if kind == "regression":
			self.regr.fit(self.X_train, self.y_train)
		elif kind == "classification":
			self.clf.fit(self.X_train, self.y_train)

	def predict(self, X, kind="regression"):
		if kind == "regression":
			self.prediction = self.regr.predict(X)
		elif kind == "classification":
			self.prediction = self.clf.predict(X)

	def score(self, X, y, kind="regression"):
		if kind == "regression":
			score_val = self.regr.score(X, y)
			print "R2 Score: ", score_val
		elif kind == "classification":
			score_val = self.clf.score(X, y)
			print "Accuracy: ", score_val
			print classification_report(y, self.prediction)
			self.precision = precision_score(y, self.prediction, labels=[0,1,2], average=None)
			print "\n\nPrecision Score: ", self.precision, "\n\n"
			self.accuracy = accuracy_score(y, self.prediction)

	def test(self, kind="regression"):
		#run clf and regr on the test data to determine the top 100 loans
		#the top loans are the ones least likely to default
		if kind == "regression":
			pred = self.regr.predict(self.X_test)
			print "length of regression pred: ", len(pred)
			for i, loan in enumerate(self.testLoanData['id']):
				sql_query = "UPDATE test_loans SET pred_days_to_zero_dollars=%s where id='%s';" %(
						pred[i], self.testLoanData['id'][i])
				self.cur.execute(sql_query)
			print i
		elif kind == "classification":
			pred_proba = self.clf.predict_proba(self.X_test)
			for i, loan in enumerate(self.testLoanData['id']):
				sql_query = "UPDATE test_loans SET pred_default=%s, pred_paid=%s, pred_prepaid=%s where id='%s';" %(
						pred_proba[i][0], pred_proba[i][1],pred_proba[i][2], self.testLoanData['id'][i])
				self.cur.execute(sql_query)
		self.con.close()

	def run_pca(self, n_components=20):
		self.pca = PCA(n_components=n_components)
		self.X_train = self.pca.fit_transform(self.X_train)
		print "Reduced data down to ", self.pca.n_components_, " dimensions: "
		print "Transforming cv data ..."
		self.X_cv = self.pca.transform(self.X_cv)
		print "Transforming test data ..."
		self.X_test = self.pca.transform(self.X_test)

	def plot_prediction(self):
		plt.scatter(self.prediction, self.y_cv)
		plt.xlabel('prediction')
		plt.ylabel('y_test')
		plt.show()

	def runSVRGridSearch(self):
		C_vals = [0.01, 0.1, 1, 10, 100]
		gamma_vals = [1E-2, 1E-1, 1, 1E1, 1E2, 1E3, 1E4]

		for C in C_vals:
			for gamma in gamma_vals:
				print "\n\n C: ", C, "  gamma: ", gamma
				self.define_SVR(C=C, gamma=gamma)
				self.train()
				print "Training Scores:"
				self.predict(self.X_train)
				self.score(self.X_train, self.y_train)
				print "Testing Scores:"
				self.predict(self.X_cv)
				self.score(self.X_cv, self.y_cv)

	def roc(self):
		'''Compute ROC curve using one-vs-all technique'''
		pred_proba = self.clf.predict_proba(self.X_cv)
		fpr = []
		tpr = []
		thresholds = []
		for i in [0, 1, 2]:
			fpr_i, tpr_i, thresholds_i = roc_curve(self.y_cv, pred_proba[:,i], pos_label=i)
			fpr.append(fpr_i)
			tpr.append(tpr_i)
			thresholds.append(thresholds_i)
			print "AUC: ", auc(fpr_i, tpr_i)
		plt.plot([0,1], [0,1], '--', color=(0.6, 0.6, 0.6))
		plt.plot(fpr[0], tpr[0], label="Default", linewidth=3)
		plt.xlim([-0.05, 1.05])
		plt.ylim([-0.05, 1.05])
		plt.show()


	def pickle_algo(self, X, fileName):
		print "pickling algorithm"
		f = open(fileName, 'wb')
		pickle.dump(X, f)
		f.close()
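# The nested loops in runSVRGridSearch() above can equivalently be expressed with scikit-learn's
# GridSearchCV. The sketch below only illustrates that equivalence on synthetic data; it is not
# part of the original Trainer class, and the parameter grid simply mirrors the C and gamma
# values swept above.
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

rng = np.random.RandomState(0)
X_demo = rng.randn(60, 3)
y_demo = 2.0 * X_demo[:, 0] + 0.1 * rng.randn(60)

param_grid = {'C': [0.01, 0.1, 1, 10, 100],
              'gamma': [1e-2, 1e-1, 1, 1e1, 1e2]}
search = GridSearchCV(SVR(kernel='rbf'), param_grid, cv=3)
search.fit(X_demo, y_demo)
print("best params: %s, best CV R^2: %.3f" % (search.best_params_, search.best_score_))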
Ejemplo n.º 51
0
regr_rbf_3 = SVR(kernel="rbf", C=1.0, gamma=0.0002, epsilon=0.1)

# Train the model using the training sets
regr_linear.fit(annual_index_feature, annual_temp)
regr_rbf_1.fit(annual_index_feature, annual_temp)
regr_rbf_2.fit(annual_index_feature, annual_temp)
regr_rbf_3.fit(annual_index_feature, annual_temp)


# The coefficients
#print 'Coefficients:', regr.coef_
# The mean square error
print("Residual sum of squares: %.2f"
      % np.mean((regr_rbf_1.predict(annual_index_feature) - annual_temp) ** 2))
# Explained variance score: 1 is perfect prediction
print('score1: %.2f' % regr_rbf_1.score(annual_index_feature, annual_temp))
print('score2: %.2f' % regr_rbf_2.score(annual_index_feature, annual_temp))
print('score3: %.2f' % regr_rbf_3.score(annual_index_feature, annual_temp))

# Plot outputs
plt.figure(figsize=(20,5))
plt.bar(annual_index, annual_temp,  width=0.7, edgecolor="none", color=(annual_temp>0).map({True: 'r', False: 'b'}),
        label="Annual Average Global Anomaly", alpha=0.3)

plt.plot(prediction_annual_index[:], regr_linear.predict(prediction_annual_index[:]), color='green',
        linewidth=3, alpha=0.5, label="Linear Prediction")
plt.plot(prediction_annual_index[:], regr_rbf_1.predict(prediction_annual_index[:]), color='blue',
        linewidth=3, alpha=0.5, label="RBF1 Prediction")
plt.plot(prediction_annual_index[:], regr_rbf_2.predict(prediction_annual_index[:]), color='orange',
        linewidth=3, alpha=0.5, label="RBF2 Prediction")
plt.plot(prediction_annual_index[:], regr_rbf_3.predict(prediction_annual_index[:]), color='red',
        linewidth=3, alpha=0.5, label="RBF3 Prediction")
# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=44,
                                                    shuffle=True)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)
print("=" * 25)
# ----------------------------------------------------
# Applying SVR Model
SVRModel = SVR(kernel='rbf', C=20.0)
SVRModel.fit(X_train, y_train)
# ----------------------------------------------------
# Calculating Details
print('SVRModel Train Score is : ', SVRModel.score(X_train, y_train))
print('SVRModel Test Score is : ', SVRModel.score(X_test, y_test))
print("=" * 25)
# ----------------------------------------------------
# Calculating Prediction
y_pred = SVRModel.predict(X_test)
print('Predicted Value for SVRModel is : ', y_pred[:5])
print('True Value for SVRModel is : ', y_test[:5])
print("=" * 25)
# ----------------------------------------------------
# Calculating Mean Absolute Error
MAEValue = mean_absolute_error(y_test, y_pred, multioutput='uniform_average')
print('Mean Absolute Error Value is : ', MAEValue)
# ----------------------------------------------------
# Calculating Mean Squared Error
MSEValue = mean_squared_error(y_test, y_pred, multioutput='uniform_average')
print('Mean Squared Error Value is : ', MSEValue)
Ejemplo n.º 53
0
class LinearModels(object):
    def __init__(self, train_df, test_df):
        self.train_df = train_df
        self.test_df = test_df

    def fittingModels(self, predictors, out_fv, kernel="linear"):
        # store the predictor columns, target column, and SVR kernel for this fit
        self.predictors = predictors
        self.out_fv = out_fv
        self.kernel = kernel

        # linear regression
        self.lr_model = lm.LinearRegression().fit(
            self.train_df.loc[:, self.predictors].values, self.train_df.loc[:, self.out_fv].values
        )

        # simple lasso model
        self.lasso_model = lm.Lasso(alpha=0.1).fit(
            self.train_df.loc[:, self.predictors].values, self.train_df.loc[:, self.out_fv].values
        )

        # Naive Bayes model (GaussianNB comes from sklearn.naive_bayes, not sklearn.linear_model)
        self.nb_model = GaussianNB().fit(
            self.train_df.loc[:, self.predictors].values, self.train_df.loc[:, self.out_fv].values
        )

        # Bayesian Ridge model - the regularization parameters are estimated from the data at hand
        self.br_model = lm.BayesianRidge().fit(
            self.train_df.loc[:, self.predictors].values, self.train_df.loc[:, self.out_fv].values
        )

        # ARD Regression Model
        self.ard_model = lm.ARDRegression().fit(
            self.train_df.loc[:, self.predictors].values, self.train_df.loc[:, self.out_fv].values
        )

        # SVR with linear kernel
        self.svm = SVR(C=1.0, epsilon=0.2, kernel=self.kernel).fit(
            self.train_df.loc[:, self.predictors].values, self.train_df.loc[:, self.out_fv].values
        )

        # If the number of dimensions is significantly larger than the number of samples,
        # LARS Lasso can be used.

        self.lars_model = lm.LassoLars(alpha=0.1).fit(
            self.train_df.loc[:, self.predictors].values, self.train_df.loc[:, self.out_fv].values
        )

    def predictions(self):

        print "Simple Linear Regression Prediction"
        print self.lr_model.score(self.test_df.loc[:, self.predictors].values, self.test_df.loc[:, self.out_fv].values)
        self.lr_model_predict = self.lr_model.predict(self.test_df.loc[:, self.predictors].values)
        print "####################################"
        print "Simple LASSO Regression Prediction"
        print self.lasso_model.score(
            self.test_df.loc[:, self.predictors].values, self.test_df.loc[:, self.out_fv].values
        )
        self.lasso_model_predict = self.lasso_model.predict(self.test_df.loc[:, self.predictors].values)
        print "####################################"
        print "Bayesian Ridge Regression Prediction"
        print self.br_model.score(self.test_df.loc[:, self.predictors].values, self.test_df.loc[:, self.out_fv].values)
        self.br_model_predict = self.br_model.predict(self.test_df.loc[:, self.predictors].values)
        print "####################################"
        print "ARD Regression Prediction"
        print self.ard_model.score(self.test_df.loc[:, self.predictors].values, self.test_df.loc[:, self.out_fv].values)
        self.ard_model_predict = self.ard_model.predict(self.test_df.loc[:, self.predictors].values)
        print "####################################"
        print "Support Vector Regression Prediction"
        print self.svm.score(self.test_df.loc[:, self.predictors].values, self.test_df.loc[:, self.out_fv].values)
        self.svm_predict = self.svm.predict(self.test_df.loc[:, self.predictors].values)
        print "####################################"
        print "LARS LASSO Regression Prediction"
        print self.lars_model.score(
            self.test_df.loc[:, self.predictors].values, self.test_df.loc[:, self.out_fv].values
        )
        self.lars_model_predict = self.lars_model.predict(self.test_df.loc[:, self.predictors].values)
        print "####################################"
Ejemplo n.º 54
0
Y = Y[:-predict_Price]
#print(Y)
# Now split the data into training and testing sets; test_size is the fraction held out for testing
# (note: test_size=0.8 keeps only 20% of the data for training).
# More training data generally gives a better model; more testing data gives a more reliable estimate of test performance.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.8)
# Now using SVM (Regressor)
s_Vector_Reg = SVR(
    kernel='rbf', C=1e3, gamma=0.1
)  #C is regularization parameter and gamma is a parameter that defines influence of single training
s_Vector_Reg.fit(
    x_train, y_train
)  # gamma is influence , smaller gamma ~ high influence ~ highly constrained model

# Testing the model: 'score' returns the coefficient of determination R^2 of the prediction.
# Best score = 1; the coefficient of determination measures how much of the variance in y is explained by the predictions.
svm_Confidence_Value = s_Vector_Reg.score(x_test, y_test)
print("svm confidence value : ", svm_Confidence_Value)
# we create the L.regression model
linear_Regression = LinearRegression()
# now we train the model
linear_Regression.fit(x_train, y_train)
#confidence score is how much a predicted base can be trusted
linear_Regression_Confidence_Value = linear_Regression.score(x_test, y_test)
print("linear regression confidence value : ",
      linear_Regression_Confidence_Value)
# HENCE THE LINEAR REGRESSION MODEL IS BETTER THAN THE SVM MODEL (for this split)

# Set x_Predict_ equal to the last 30 rows of the original data set from 'Open Price' column
x_Predict_ = np.array(data_Stocks.drop(['Prediction_Price'],
                                       1))[-predict_Price:]
#print(x_Predict_) # this is the data that we are going to do prediction on
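# A small standalone sketch (not from the script above) of what the "confidence value" is: for
# regressors, score() returns the coefficient of determination R^2 = 1 - SS_res / SS_tot, which
# sklearn.metrics.r2_score computes directly. The toy numbers below are purely illustrative.
import numpy as np
from sklearn.metrics import r2_score

y_true = np.array([3.0, -0.5, 2.0, 7.0])
y_hat = np.array([2.5, 0.0, 2.0, 8.0])

ss_res = np.sum((y_true - y_hat) ** 2)
ss_tot = np.sum((y_true - y_true.mean()) ** 2)
print(1.0 - ss_res / ss_tot)        # manual R^2
print(r2_score(y_true, y_hat))      # same value from scikit-learn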
Ejemplo n.º 55
0
    print('X : ', len(X), '*', len(X[0]), ' Y : ', len(y), '*', '1')

    # Train on %s of the data
    train_idx = int(len(df) * train_factor)

    # create train and test data
    X_train, y_train, X_test, y_test = X[:train_idx], y[:train_idx], X[
        train_idx:], y[train_idx:]
    ts_train, ts_test = ts[:train_idx], ts[train_idx:][seq_lag:]

    # fit and predict
    clf = SVR(kernel='rbf', C=1, epsilon=0.1)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    score = clf.score(X_test, y_test)   # score against the true targets, not the model's own predictions
    print(score)
    print('length of output data are : ')
    print(len(X_test), len(y_pred), len(y_test), len(ts_test), len(ts_train),
          len(X_train), len(y_train))
    print('-----')

    from sklearn.metrics import mean_squared_error
    from math import sqrt
    ms = mean_squared_error(y_test, y_pred)
    rms = sqrt(ms)
    print('MSE is ', ms)
    print(' ------------------------------------')

    def mean_absolute_percentage_error(y_true, y_pred):
        # mean of |(actual - predicted) / actual|, expressed as a percentage
        ape = []
        for actual, predicted in zip(y_true, y_pred):
            ape.append(abs((actual - predicted) / actual))
        return np.mean(ape) * 100
Ejemplo n.º 56
0
    #parse_data=parse_csv(Train_File_name)
    #X,Y=create_dataset(parse_data)
    
    #input Feature & output label
    failure=[12,15,28,39,53,53,60,60,60,63,68,68,82,91,97,97,102,103,103,104,105,109,109,113,125,126,131,158,165,166,166,173,183,189,193,194,202,204,214,229,230,235,235,237,238,238,239,243,251,253,257,260,263,266,268,271,271,272,274,279,284,288,288,291,293,299,305,308,323,323,327,328,333,336,347,349,369,389,392,393,405,410,411,411,417,435,435,435,435,441,441,453,467,468,488,509,512,517,558,559,573,587,644,644,655,728,734,769,783,994,1064]
    
    Y=[]
    X =[]
    for i in range(1,len(failure)+1):
        X.append([i])
    for fail in failure:
        Y.append(fail)

    #**************    
    #Now create & train our regressor
    clsf=SVR(kernel='rbf', C=1e2, gamma=0.1)
    print 'Training Started ..'
    clsf.fit(X,Y)
   
    
    #Now Load Testing dataset

    
    # Print Accuracy Test
    print 'Time           Real failure           Predicted failure'
    prediction=clsf.predict(X)
    for i in range(len(X)):
    	print X[i],'           ',Y[i],'          ',prediction[i]

    print 'R-squared score (in-sample)', clsf.score(X, Y)
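# The R-squared printed above is computed on the same data the model was trained on, so it is an
# optimistic, in-sample figure. The sketch below (not in the original script) shows the same kind
# of model evaluated on a held-out split instead, using a stand-in subset of the failure times.
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

Y_demo = [12, 15, 28, 39, 53, 53, 60, 60, 60, 63, 68, 68, 82, 91, 97, 97, 102, 103, 103, 104,
          105, 109, 109, 113, 125, 126, 131, 158, 165, 166]
X_demo = [[i] for i in range(1, len(Y_demo) + 1)]

X_tr, X_te, Y_tr, Y_te = train_test_split(X_demo, Y_demo, test_size=0.2, random_state=0)
held_out = SVR(kernel='rbf', C=1e2, gamma=0.1).fit(X_tr, Y_tr)
print("held-out R-squared: %.3f" % held_out.score(X_te, Y_te))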
Ejemplo n.º 57
0
lm.fit(x_train, y_train)
print lm.score(x_test, y_test)
print zip(features, lm.coef_)
##### score = 0.691

# random forest
rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1, max_features='sqrt')
rf.fit(x_train, y_train)
print rf.score(x_test, y_test)
print zip(features, rf.feature_importances_)   # random forests expose feature_importances_, not coef_
##### score = 0.667

# svm - linear kernel
svr_linear = SVR(kernel='linear', C=.5)
svr_linear.fit(x_train, y_train)
print svr_linear.score(x_train, y_train)
print zip(features, svr_linear.coef_)
##### score = 0.686

# svm - rbf kernel
svr_rbf = SVR(kernel='rbf', C=.5)
svr_rbf.fit(x_train, y_train)
print svr_rbf.score(x_train, y_train)
# note: coef_ is only available for a linear kernel; an RBF SVR has no per-feature coefficients
##### score = 0.700

# let's transform our dependent variable
y_train_log = np.log(y_train)
y_test_log = np.log(y_test)
# fill in nan and inf
y_test_log = np.nan_to_num(y_test_log)
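# A brief sketch (not from the original script) of how a log-transformed target is typically used:
# fit on log(y), then exponentiate the predictions before comparing them with the raw target.
# The data below is synthetic and the names are illustrative only.
import numpy as np
from sklearn.svm import SVR

rng = np.random.RandomState(0)
x_demo = rng.rand(50, 2)
y_demo = np.exp(2.0 * x_demo[:, 0] + 0.1 * rng.randn(50))   # strictly positive target

svr_log = SVR(kernel='rbf', C=0.5)
svr_log.fit(x_demo, np.log(y_demo))                 # train on the log scale
y_back = np.exp(svr_log.predict(x_demo))            # back-transform the predictions
print("mean absolute error on the original scale: %.3f" % np.mean(np.abs(y_back - y_demo)))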
Ejemplo n.º 58
0
    X2.append(list(get_composition_descriptors(c).values()))

X = np.array(X)
X2 = np.array(X2)
y = np.array(y)

#pca = PCA(n_components=10, whiten=True)
#X = pca.fit_transform(X)

train_x, test_x, train_y, test_y = cross_validation.train_test_split(
    X, y, train_size=0.5)

clf = linear_model.LinearRegression()

clf.fit(train_x, train_y)
train_y -= clf.predict(train_x)

parameters = {
    'n_estimators': [10, 100, 500],
    'max_depth': [2, 3, 4],
    'min_samples_split': [1, 2, 3, 4],
    'learning_rate': [0.001, 0.01, 0.1]
}
#gbr = GradientBoostingRegressor()
#clf = grid_search.GridSearchCV(gbr, parameters)
clf = SVR()

clf.fit(train_x, train_y)

print(clf.score(test_x, test_y))
def SupportVectorRegression(
    symbol, seriesSet, c=100, Gamma=0.01, Epsilon=0.1, oosd_bin_size=10, oosd_lookback=100
):  # returnList, indexId,
    """
    regression on the data and optionally display
  
    Parameters
    ----------
    data : array like
        2D array of float data.

    Returns
    -------
    maskArray : numpy array
        2D numpy array containing GaussianProcess fit 
    """

    fileName = "Images/SVR/%s.png" % (symbol)

    labels = ["High", "Low", "Open", "Close"]
    colors = ["r", "g", "b", "c"]

    # import matplotlib.pyplot as plt

    predictionSets = []
    scoreSets = []
    count = 0

    for series in seriesSet:

        # Perform an analysis of the model w/ ISD and OOSD
        full_series = np.array(series)
        # H = len(series) - oosd_lookback #Len of history data
        # L = oosd_lookback-oosd_bin_size               #Len of analysis lookback
        l = oosd_bin_size  # Len of out of sample AND prediction domain(how many days forecasted)
        power = 1  # N where X^n for weight function
        prediction_history = []
        for i in np.arange(oosd_lookback / oosd_bin_size):
            # Index of current in same, and out of sample data.
            # 3 cases of this slicing
            if i == 0:
                # First run, only two bins to work with(First OOSD bin, and the rest of the data)
                ISD = full_series[l:]
                OOSD = full_series[:l]
                X = np.arange(l, len(full_series))

                # use a variable weight (~0 - 1.0)
                weight_training = (
                    np.power(np.arange(l, len(full_series), dtype=float), power)[::-1]
                    / np.power(np.arange(l, len(full_series), dtype=float), power)[::-1].max()
                )
                # use a variable weight, focus on next day prediction (~0 - 1.0 - ~0)
                weight_score = np.concatenate(
                    (
                        np.power(np.arange(1, l + 1, dtype=float), power)
                        / np.power(np.arange(1, l + 1, dtype=float), power).max(),
                        np.power(np.arange(l + 1, len(full_series) + 1, dtype=float), power)[::-1]
                        / np.power(np.arange(l + 1, len(full_series) + 2, dtype=float), power)[::-1].max(),
                    )
                )
                """print len (weight_training)
                print weight_training
                print len (weight_score)
                print weight_score
                print exit()"""
            elif i == oosd_lookback / oosd_bin_size - 1:
                # Last run, only two bins to work with(Last OOSD bin, and the rest of the data)
                ISD = full_series[:-l]
                OOSD = full_series[-l:]
                X = np.arange(0, len(full_series) - l)

                # use a variable weight (~0 - 1.0)
                weight_training = (
                    np.power(np.arange(l, len(full_series), dtype=float) + 1, power)
                    / np.power(np.arange(l, len(full_series), dtype=float) + 1, power).max()
                )
                # use a variable weight, focus on next day prediction (~0 - 1.0 - ~0)
                weight_score = np.concatenate(
                    (
                        np.power(np.arange(1, len(full_series) - l + 1, dtype=float), power)
                        / np.power(np.arange(1, len(full_series) - l + 2, dtype=float), power).max(),
                        np.power(np.arange(1, l + 1, dtype=float), power)[::-1]
                        / np.power(np.arange(1, l + 1, dtype=float), power)[::-1].max(),
                    )
                )
                """print len (weight_training)
                print weight_training
                print len (weight_score)
                print weight_score
                print exit()"""
            else:
                # Any other run: we have a sandwich of OOSD in the middle of two ISD sets, so we need to aggregate.
                ISD = np.concatenate((full_series[: (l * i)], full_series[l * (i + 1) :]))
                OOSD = full_series[l * i : l * (i + 1)]
                X = np.concatenate((np.arange(0, (l * i)), np.arange(l * (i + 1), len(full_series))))

                # use a variable weight (~0 - 1.0)
                weight_training = np.concatenate(
                    (
                        np.power(np.arange(1, l * i + 1, dtype=float), power)
                        / np.power(np.arange(1, l * i + 1, dtype=float), power).max(),
                        np.power(np.arange(l * (i + 1), len(full_series), dtype=float), power)[::-1]
                        / np.power(np.arange(l * (i + 1), len(full_series), dtype=float), power)[::-1].max(),
                    )
                )
                # use a variable weight, focus on next day prediction (~0 - 1.0 - ~0)
                weight_score = np.concatenate(
                    (
                        np.power(np.arange(1, l * (i + 1) + 1, dtype=float), power)
                        / np.power(np.arange(1, l * (i + 1) + 1, dtype=float), power).max(),
                        np.power(np.arange(l * (i + 1), len(full_series), dtype=float), power)[::-1]
                        / np.power(np.arange(l * (i + 1), len(full_series) + 1, dtype=float), power)[::-1].max(),
                    )
                )
                """print len (weight_training)
                print weight_training
                print len (weight_score)
                print weight_score
                exit()"""

            # Domain and range of training data
            # X = np.arange(len(ISD))
            X = np.atleast_2d(X).T
            y = ISD

            # Domain of prediction set
            # x = np.atleast_2d(np.linspace(0, len(ISD)+len(OOSD)-1, len(ISD)+len(OOSD))).T
            # x = np.atleast_2d(np.linspace(len(ISD) ,len(ISD)+len(OOSD)-1, len(OOSD))).T
            x = np.atleast_2d(np.linspace(0, len(full_series) - 1, len(full_series))).T

            # epsilon-Support Vector Regression using scikit-learn
            # Read more here: http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html
            SVR_model = SVR(kernel="rbf", C=c, gamma=Gamma, epsilon=Epsilon)
            SVR_model.fit(X, y, weight_training)
            y_predSVR = SVR_model.predict(x)

            if np.isnan(full_series).any() or np.isinf(full_series).any():
                log.debug(symbol + " failed due to data INF or NAN")
                y_score = 0
                break
            else:
                y_score = SVR_model.score(
                    x, full_series, weight_score
                )  # y_predSVR[-len(OOSD):]   np.atleast_2d(y_predSVR[::-1]).T

            # log.debug(y_score)
            # print y_score
            prediction_history.append(y_score)

        # Average the out-of-sample scores collected above rather than just the last one
        score = np.mean(prediction_history)
        # print ""
        # print score
        # exit()

        # Make the next day's prediction
        X = np.arange(series.shape[0])
        X = np.atleast_2d(X).T
        y = series

        startTime = time.time()
        # Single-point domain: we only need the next value just beyond the end of the series
        x = np.atleast_2d(np.linspace(len(series), len(series), 1)).T

        # kernel='rbf', ‘linear’, ‘poly’, ‘sigmoid’, ‘precomputed’
        SVR_model = SVR(kernel="rbf", C=c, gamma=Gamma, epsilon=Epsilon)

        # use a linearly increasing weight (~0 - 1.0) so recent samples count more
        weight_training = (
            np.power(np.arange(1, len(X) + 1, dtype=float), power) / np.power(np.arange(1, len(X) + 1), power).max()
        )

        # Fit the SVR to the weighted training data
        SVR_model.fit(X, y, weight_training)

        # Predict the next value
        y_pred = SVR_model.predict(x)

        """print len(X)
        print X
        print len(x)
        print x
        exit()"""

        # print SVR_model.score(x,y_pred)

        # score = gp.score(y, y_pred)
        # print score
        predictionSets.append(y_pred)
        scoreSets.append(score)
        # print "{0:0.1f} minutes to compute Gaussian Process & Fit.".format((time.time() - startTime)/60.0)

        count += 1

    lookBack = -1

    return (
        predictionSets[0],
        predictionSets[1],
        predictionSets[2],
        predictionSets[3],
        predictionSets[4],
        scoreSets[0],
        scoreSets[1],
        scoreSets[2],
        scoreSets[3],
        scoreSets[4],
    )
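# A short standalone sketch (not part of the function above) of the sample-weight idea it uses:
# SVR.fit accepts a per-sample weight vector, and the code builds a normalised ramp so recent
# observations influence the fit more than old ones. The series length below is arbitrary.
import numpy as np
from sklearn.svm import SVR

n = 8
power = 1
weights = np.power(np.arange(1, n + 1, dtype=float), power)
weights /= weights.max()                           # linearly increasing weights in (0, 1]
print(weights)

X_demo = np.arange(n, dtype=float).reshape(-1, 1)
y_demo = np.sin(X_demo).ravel()
model = SVR(kernel='rbf', C=100, gamma=0.01)
model.fit(X_demo, y_demo, sample_weight=weights)   # later samples are weighted more heavily
print(model.predict([[n]]))                        # next-step prediction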
Ejemplo n.º 60
0
from sklearn.metrics import mean_squared_error

# In[54]:


def rmse(Y_Test, Y_Pred):
    return np.sqrt(mean_squared_error(Y_Test, Y_Pred))


# In[55]:

from sklearn.svm import SVR

svr = SVR()
svr.fit(X_Train, Y_Train)
svr_score = svr.score(X_Test, Y_Test)
svr_rmse = rmse(Y_Test, svr.predict(X_Test))
svr_score, svr_rmse

# In[62]:

from sklearn.linear_model import LinearRegression

# In[63]:

lr = LinearRegression()

# In[65]:

lr.fit(X_Train, Y_Train)
lr_score = lr.score(X_Test, Y_Test)