Code Example #1
File: cgen_include.py Project: bwelsh/projectW4761
def addPolyFeatures(data, deg):
    '''
    Given a dictionary of dataframes and a degree, add polynomial features up to that degree to each feature dataframe. Returns the dictionary with the enhanced feature set.
    '''
    train_features = data['train']['features']
    valid_features = data['valid']['features']
    test_features = data['test']['features']

    train_fit = preprocessing.PolynomialFeatures(
        degree=deg, include_bias=False).fit(train_features)
    # Build readable column names from the exponent matrix, e.g. F_0^1_1^2
    new_columns = []
    for powers in train_fit.powers_:
        out_name = 'F'
        for j, power in enumerate(powers):
            if power > 0:
                out_name += '_' + str(j) + '^' + str(power)
        new_columns.append(out_name)

    # Reuse the transformer fitted on train for all three splits; refitting
    # PolynomialFeatures on valid/test yields the same expansion but obscures
    # that the three feature sets must line up
    for split, features in (('train', train_features),
                            ('valid', valid_features),
                            ('test', test_features)):
        data[split]['features'] = pd.DataFrame(train_fit.transform(features),
                                               columns=new_columns)
    return data
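
A minimal usage sketch for addPolyFeatures, assuming only the imports the snippet already relies on (pandas as pd, sklearn.preprocessing); the toy frames and the nested train/valid/test dictionary layout are hypothetical but match what the function expects:

import pandas as pd
from sklearn import preprocessing

# Hypothetical demo data: the same two-column frame for each split
toy = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [4.0, 5.0, 6.0]})
data = {split: {'features': toy.copy()} for split in ('train', 'valid', 'test')}

data = addPolyFeatures(data, deg=2)
print(data['train']['features'].columns.tolist())
# ['F_0^1', 'F_1^1', 'F_0^2', 'F_0^1_1^1', 'F_1^2']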
Code Example #2
def findModelName(finalSelectMethod):
    X = df.iloc[:, 0:2]
    Y = df.iloc[:, 2:3].values.ravel()
    if list(finalSelectMethod.keys()) == ['LR']:
        return LogisticRegression(), X, Y
    elif list(finalSelectMethod.keys()) == ['KNN']:
        return KNeighborsClassifier(), X, Y
    elif list(finalSelectMethod.keys()) == ['Dec. Tree Classifier']:
        return DecisionTreeClassifier(), X, Y
    elif list(finalSelectMethod.keys()) == ['NB']:
        return GaussianNB(), X, Y
    elif list(finalSelectMethod.keys()) == ['SVM']:
        return SVC(kernel='linear', C=2, gamma='auto'), X, Y
    elif list(finalSelectMethod.keys()) == ['Lin. Reg.']:
        return LinearRegression(), X, Y
    elif list(finalSelectMethod.keys()) == ['Poly2']:
        regression2 = pp.PolynomialFeatures(degree=2)
        X_pol2 = regression2.fit_transform(X)
        return LinearRegression(), X_pol2, Y
    elif list(finalSelectMethod.keys()) == ['Poly3']:
        regression3 = pp.PolynomialFeatures(degree=3)
        X_pol3 = regression3.fit_transform(X)
        return LinearRegression(), X_pol3, Y
    elif list(finalSelectMethod.keys()) == ['Poly4']:
        regression4 = pp.PolynomialFeatures(degree=4)
        X_pol4 = regression4.fit_transform(X)
        return LinearRegression(), X_pol4, Y
    elif list(finalSelectMethod.keys()) == ['Dec. Tree regressor']:
        return DecisionTreeRegressor(), X, Y
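
The chain of keys() comparisons above could be collapsed into a lookup table; a sketch of that refactor under the snippet's assumed global df, with findModelNameDispatch and MODELS being hypothetical names:

from sklearn import preprocessing as pp
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# One constructor per selectable method, keyed exactly as above
MODELS = {
    'LR': lambda: LogisticRegression(),
    'KNN': lambda: KNeighborsClassifier(),
    'Dec. Tree Classifier': lambda: DecisionTreeClassifier(),
    'NB': lambda: GaussianNB(),
    'SVM': lambda: SVC(kernel='linear', C=2, gamma='auto'),
    'Lin. Reg.': lambda: LinearRegression(),
    'Dec. Tree regressor': lambda: DecisionTreeRegressor(),
}

def findModelNameDispatch(finalSelectMethod):
    X = df.iloc[:, 0:2]
    Y = df.iloc[:, 2:3].values.ravel()
    (key,) = finalSelectMethod.keys()  # assumes exactly one selected method
    if key.startswith('Poly'):  # 'Poly2' / 'Poly3' / 'Poly4'
        X = pp.PolynomialFeatures(degree=int(key[-1])).fit_transform(X)
        return LinearRegression(), X, Y
    return MODELS[key](), X, Y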
Code Example #3
    def train_classifiers(self, polynomial=False):
        if polynomial:
            self.X1 = preprocessing.PolynomialFeatures().fit_transform(self.X1)
            self.X2 = preprocessing.PolynomialFeatures().fit_transform(self.X2)

        # iterate over classifiers
        for i, (name, clf) in enumerate(zip(self.names, self.classifiers)):
            scoreL = []
            importances = None
            print("Training {}...".format(name))

            clf.fit(np.array(self.X1), np.array(self.Y1))
            start_time = time.time()
            predictedL = list(clf.predict(np.array(self.X2)))
            self.timers[i] += (time.time() - start_time)
            if hasattr(clf, "feature_importances_"):
                importances = clf.feature_importances_
            elif hasattr(clf, "coef_"):
                importances = clf.coef_.ravel()
            scoreL.append(clf.score(np.array(self.X2), np.array(self.Y2)))

            clf.fit(np.array(self.X2), np.array(self.Y2))
            start_time = time.time()
            predictedL += list(clf.predict(np.array(self.X1)))
            self.timers[i] += (time.time() - start_time)
            if hasattr(clf, "feature_importances_"):
                importances += clf.feature_importances_
            elif hasattr(clf, "coef_"):
                # TODO when coef_ is added to importances that already contains another one, it throws a
                # ValueError: output array is read-only
                importances = clf.coef_.ravel()
            scoreL.append(clf.score(np.array(self.X1), np.array(self.Y1)))

            self.timers[i] += self.timer
            self.importances.append(importances)
            self.scores.append(scoreL)

            print("Matching records...")
            real = self.Y2 + self.Y1
            for pos in range(len(real)):
                if real[pos] == 1.0:
                    if predictedL[pos] == 1.0:
                        self.num_true_predicted_true[i] += 1.0
                        if self.accuracyresults:
                            self.file.write("TRUE\tTRUE\n")
                    else:
                        self.num_true_predicted_false[i] += 1.0
                        if self.accuracyresults:
                            self.file.write("TRUE\tFALSE\n")
                else:
                    if predictedL[pos] == 1.0:
                        self.num_false_predicted_true[i] += 1.0
                        if self.accuracyresults:
                            self.file.write("FALSE\tTRUE\n")
                    else:
                        self.num_false_predicted_false[i] += 1.0
                        if self.accuracyresults:
                            self.file.write("FALSE\tFALSE\n")
Code Example #4
def fitting(train, model_sel, steps_ahead):
    '''
    Returns prediction
    '''

    # Select models
    if model_sel[0] == 'ARIMA':
        model = ARIMA(train['Close'], order=model_sel[1])
        model_fit = model.fit()
        res = model_fit.forecast(steps=steps_ahead)  # forecast steps_ahead points
        output = res[0][len(res[0]) - 1]  # keep only the final forecasted step
    else:
        # Prepare data
        X = np.c_[train.index]
        y = np.c_[train['Close']]
        poly = preprocessing.PolynomialFeatures(degree=3, include_bias=False)
        scaler = preprocessing.StandardScaler()

        if model_sel[0] == 'poly':
            model = linear_model.Ridge(alpha=model_sel[1])
        elif model_sel[0] == 'SVM':
            model = LinearSVR(C=model_sel[1][0], epsilon=model_sel[1][1])

        modelPip = pipeline.Pipeline([('poly', poly), ('scal', scaler),
                                      ('modl', model)])
        model_fit = modelPip.fit(X, y)
        output = model_fit.predict([[X[len(X) - 1][0] + steps_ahead]])
    return output.flatten()[0]
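
For reference, model_sel is a two-element selector whose second element depends on the branch taken above; hypothetical calls on a toy frame, showing the parameter shapes each branch expects (under the snippet's statsmodels/scikit-learn versions):

import numpy as np
import pandas as pd

# Toy price series; fitting() only reads train['Close'] and train.index
train = pd.DataFrame({'Close': np.linspace(100.0, 110.0, 30)})

fitting(train, ('ARIMA', (1, 1, 0)), steps_ahead=5)  # ARIMA order tuple
fitting(train, ('poly', 0.5), steps_ahead=5)         # Ridge alpha
fitting(train, ('SVM', (1.0, 0.1)), steps_ahead=5)   # (C, epsilon) for LinearSVR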
Code Example #5
def mapFeature(X1, X2):
    # MAPFEATURE Feature mapping function to polynomial features
    #
    #   MAPFEATURE(X1, X2) maps the two input features
    #   to quadratic features used in the regularization exercise.
    #
    #   Returns a new feature array with more features, comprising
    #   X1, X2, X1.^2, X2.^2, X1*X2, X1*X2.^2, etc.
    #
    #   Inputs X1, X2 must be the same size
    #

    # n = X1.shape[0]
    # degree = 6
    # out = np.ones((n, 1)).reshape((n, 1))
    # for i in range(1, degree + 1):
    #     for j in range(i + 1):
    #         term1 = X1 ** (i - j)
    #         term2 = X2 ** j
    #         out = np.hstack((out, (term1 * term2).reshape((n, 1))))

    data = np.c_[X1, X2]
    poly = preprocessing.PolynomialFeatures(6)
    out = poly.fit_transform(data)

    return out
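
A quick numeric check that PolynomialFeatures(6) reproduces the commented-out manual expansion, assuming numpy and sklearn.preprocessing are imported; in current scikit-learn the two even agree on column order (terms are generated degree by degree):

import numpy as np
from sklearn import preprocessing

X1 = np.array([0.5, -1.0, 2.0])
X2 = np.array([1.5, 0.25, -0.5])

# Manual degree-6 expansion, as in the commented-out block above
n, degree = X1.shape[0], 6
manual = np.ones((n, 1))
for i in range(1, degree + 1):
    for j in range(i + 1):
        manual = np.hstack((manual, (X1 ** (i - j) * X2 ** j).reshape((n, 1))))

auto = preprocessing.PolynomialFeatures(6).fit_transform(np.c_[X1, X2])
print(np.allclose(manual, auto))  # True: same terms in the same order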
Code Example #6
def run_random_forest_probabilistic_classification(train, train_labels,
                                                   validate, validate_labels):
    # transform counts to TFIDF features
    tfidf = feature_extraction.text.TfidfTransformer(smooth_idf=False)
    train = tfidf.fit_transform(train).toarray()
    validate = tfidf.transform(validate).toarray()

    # encode labels
    label_encode = preprocessing.LabelEncoder()
    train_labels = label_encode.fit_transform(train_labels)

    poly_feat = preprocessing.PolynomialFeatures(degree=2,
                                                 interaction_only=False,
                                                 include_bias=True)
    train = poly_feat.fit_transform(train, train_labels)  # y is accepted but ignored by PolynomialFeatures
    validate = poly_feat.transform(validate)

    randomForest = RandomForestClassifier(n_jobs=4,
                                          n_estimators=1000,
                                          max_features=20,
                                          min_samples_split=3,
                                          bootstrap=False,
                                          verbose=3,
                                          random_state=23,
                                          max_depth=100)
    randomForest.fit(train, train_labels)
    predicted_labels = randomForest.predict_proba(validate)
    return metrics.log_loss(validate_labels, predicted_labels)
Code Example #7
File: 3LinearRegression.py Project: eatamath/-
def poly_predict_boston():
    print("Polynomial regression: predicting house prices")
    # Build the preprocessing object that expands features into an n-degree polynomial
    pf = pp.PolynomialFeatures(2)
    # Transform the (module-level) train/test feature matrices; assigning back
    # to X/Xtest would shadow the globals, so use new names and reuse the fit
    Xp = pf.fit_transform(X)
    Xtestp = pf.transform(Xtest)
    # Ridge regression
    lr = limd.Ridge()
    # Train the model
    model = lr.fit(Xp, Y)
    print("Model\n", model)
    print("Training fit score\n %.3f" % lr.score(Xp, Y))
    Ypred = model.predict(Xtestp)
    print("Prediction mean squared error\n %.3f" % metrics.mean_squared_error(Ytest, Ypred))
    print("Coefficients\n %s " % lr.coef_)
    print("Intercept\n", lr.intercept_)
    # Plot
    fig = plt.figure()
    # Draw the line y=x in red with a thick stroke
    plt.plot([min(Ytest), max(Ytest)], [min(Ytest), max(Ytest)], 'r', lw=5)
    # Color points by relative error
    color = abs(Ypred - Ytest) / Ytest
    # Scatter plot of predicted vs. real prices
    p = plt.scatter(Ypred, Ytest, c=color, marker='.')
    # Color bar
    plt.colorbar()
    plt.xlabel("Predicted Price")
    plt.ylabel("Real Price")
    # Display the figure
    plt.show()
    return
Code Example #8
def preprocess(X_tr, X_ts, poly_degree=1):
    """
    If current directory contains RBFSampler.txt then 
    use RBFSampler, otherwise,
    Do polynomial transform
    also return the combined transform, incase needed

    features are normalized already in the source
    so, only polynomial transformation is done
    default is 1, since 561 fetures is already too many
    """
    rbf_path = os.path.join(os.getcwd(), 'RBFSampler.txt')
    if False and os.path.exists(rbf_path):  # RBFSampler features disabled again; they didn't help
        with open(rbf_path, 'rt') as f:
            kwargs = ast.literal_eval(f.read())
            transformer = RBFSampler(**kwargs)
    else:
        transformer = preprocessing.PolynomialFeatures(degree=poly_degree,
                                                       interaction_only=False)

    X_comb_tr = transformer.fit_transform(np.concatenate(X_tr, axis=0))
    X_comb_ts = transformer.transform(np.concatenate(X_ts, axis=0))

    X_tr = [transformer.transform(x) for x in X_tr]
    X_ts = [transformer.transform(x) for x in X_ts]

    return X_tr, X_ts, X_comb_tr, X_comb_ts, transformer
Code Example #9
def polynomial_feature_ord(X, n):
    poly = preprocessing.PolynomialFeatures(n)
    out = poly.fit_transform(X)
    feature_names = poly.get_feature_names(X.columns)

    X_new = pd.DataFrame(out, columns=feature_names)
    return X_new
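
A small usage sketch with hypothetical column names; note that get_feature_names was renamed get_feature_names_out in scikit-learn 1.0, so newer versions need that spelling:

import pandas as pd
from sklearn import preprocessing

X = pd.DataFrame({'x1': [1.0, 2.0], 'x2': [3.0, 4.0]})
X_new = polynomial_feature_ord(X, 2)
print(X_new.columns.tolist())
# ['1', 'x1', 'x2', 'x1^2', 'x1 x2', 'x2^2']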
Code Example #10
    def train(self):
        self.output().makedirs()
        preproc = pipeline.Pipeline([
            ('norm', preprocessing.Normalizer()),
            ('poly', preprocessing.PolynomialFeatures(self.npoly.get()))
        ])

        X = abhishek_feats.AbhishekFeatures().load('train',
                                                   self.fold,
                                                   as_df=True)
        X = preproc.fit_transform(X)
        y = xval_dataset.BaseDataset().load('train', self.fold).squeeze()
        cls = linear_model.LogisticRegression(C=self.C.get(),
                                              solver='sag',
                                              class_weight=core.dictweights)
        cls.fit(X, y)

        print('Validating')
        validX = abhishek_feats.AbhishekFeatures().load('valid', self.fold)
        validX = preproc.transform(validX)
        y = xval_dataset.BaseDataset().load('valid', self.fold).squeeze()
        y_pred = cls.predict_proba(validX)[:, 1]

        score = core.score_data(y, y_pred)
        np.save(
            'cache/abhishek/logit/{:f}/{:d}/valid.npy'.format(
                self.C.get(), self.fold), y_pred)

        return score, cls, preproc
Code Example #11
        def polynomial_regression(X, Y):
            X_train, X_test, y_train, y_test = train_test_split(X,
                                                                Y,
                                                                test_size=0.20,
                                                                shuffle=True)
            poly_features = preprocessing.PolynomialFeatures(degree=2)

            # transforms the existing features to higher degree features.
            X_train_poly = poly_features.fit_transform(X_train)
            X_test_poly = poly_features.transform(X_test)  # transform only; already fitted on train
            # fit the transformed features to Linear Regression
            poly_model = linear_model.LinearRegression()
            poly_model.fit(X_train_poly, y_train)

            # predicting on training data-set
            y_train_predicted = poly_model.predict(X_train_poly)

            # predicting on test data-set
            prediction = poly_model.predict(X_test_poly)
            # print('Co-efficient of linear regression', poly_model.coef_)
            # print('Intercept of linear regression model', poly_model.intercept_)
            print('///////////////training/////////////////////////')
            print(
                'Mean Square Error',
                metrics.mean_squared_error(np.asarray(y_train),
                                           y_train_predicted))
            print('R2 Score :', poly_model.score(X_train_poly, y_train))
            print('///////////////testing/////////////////////////')
            print('Mean Square Error',
                  metrics.mean_squared_error(np.asarray(y_test), prediction))
            print('R2 Score :', poly_model.score(X_test_poly, y_test))
            filename = 'model/poly.pkl'
            pickle.dump(poly_model, open(filename, 'wb'))
Code Example #12
def determine_optimal_q(c):
    kf = KFold(n_splits=optimal_k)
    mean_error = []
    std_error = []
    q_range = [1, 2, 3, 4, 5, 6]
    for i, q in enumerate(q_range):
        print("--- Trying Q Value: ", q)
        polynomial_features = prep.PolynomialFeatures(degree=q)
        new_features = polynomial_features.fit_transform(X)
        logRegressionModel = LogisticRegression(C=1,
                                                penalty='l2',
                                                solver="saga",
                                                max_iter=7000,
                                                random_state=0)
        temp = []

        for train, test in kf.split(new_features):
            logRegressionModel.fit(new_features[train], y[train])
            predictions = logRegressionModel.predict(new_features[test])
            temp.append(mean_squared_error(y[test], predictions))

        mean_error.append(np.array(temp).mean())
        std_error.append(np.array(temp).std())
    plt.figure(2)
    plt.errorbar(q_range, mean_error, yerr=std_error, linewidth=3)
    plt.xlabel("Polynomial Degree 'q'")
    plt.ylabel("Mean square error")
    plt.title("Mean Square Error vs. Polynomial Degree Q")
    # plt.show()
    curr_min = min(mean_error)
    indexOfMinimum = [i for i, j in enumerate(mean_error) if j == curr_min]
    return indexOfMinimum[0] + 1  #  take first element for simplest model
Code Example #13
def determine_optimal_C():
    kf = KFold(n_splits=optimal_k)
    mean_error = []
    std_error = []
    for i, C in enumerate(c_values):
        print("--- Trying C Value: ", C)
        polynomial_features = prep.PolynomialFeatures(degree=5)
        new_features = polynomial_features.fit_transform(X)
        logRegressionModel = LogisticRegression(C=C,
                                                penalty='l1',
                                                solver="saga",
                                                max_iter=7000,
                                                random_state=0)

        temp = []
        for train, test in kf.split(new_features):
            logRegressionModel.fit(new_features[train], y[train])
            predictions = logRegressionModel.predict(new_features[test])
            temp.append(mean_squared_error(y[test], predictions))

        mean_error.append(np.array(temp).mean())
        std_error.append(np.array(temp).std())
    plt.figure(3)
    plt.errorbar(c_values, mean_error, yerr=std_error, linewidth=3)
    plt.xlabel("C Value")
    plt.ylabel("Mean square error")
    plt.title("Mean Square Error vs. C Values")
    plt.xlim([0, 20])
    # plt.show()
    curr_min = min(mean_error)
    indexOfMinimum = [i for i, j in enumerate(mean_error) if j == curr_min]
    print("OPTIMAL C must be: ", c_values[indexOfMinimum[0]])
    #  take the first (simplest) optimal C value
    return c_values[indexOfMinimum[0]]
Code Example #14
 def __init__(
     self,
     data,
     target,
     standard,
     feature_interaction,
     cv,
     test_size,
 ):
     self.data = data
     self.target = target
     self.standard = standard
     self.feature_interaction = feature_interaction
     self.cv = cv
     self.test_size = test_size
     self.validateInit()
     self.__features = list(self.data.columns)
     self.__features.remove(self.target)
     self.__y = np.array(self.data[self.target])
     self.__X = np.array(self.data[self.__features])
     if self.standard:
         self.__X = preproc.StandardScaler().fit_transform(self.__X)
     if self.feature_interaction:
         if not self.standard:
             print(
                 'Consider standardizing the feature matrix (standard=True).'
             )
         self.__X = preproc.PolynomialFeatures(
             include_bias=False).fit_transform(self.__X)
     self.__XTrain, self.__XTest, self.__yTrain, self.__yTest = train_test_split(
         self.__X, self.__y, test_size=self.test_size)
Code Example #15
	def eval_confusion_matrix(self, d = 1):#, thresh = 0.6):
		from sklearn.metrics import confusion_matrix
		# you should already have run _load_train_data and load_model
		if not hasattr(self,'clf'):
			print("You must run load_model before this.")
			return
		if not hasattr(self,'X_train'):
			print("You must run _load_train_data before running this.")
			return
		#expand dataset, if appropriate
		X_data = self.X_train
		if d==2:
			print("Expanding feature set to include quadratic, cross terms.")
			poly=preprocessing.PolynomialFeatures(degree = d, interaction_only = True)
			X_data_exp = poly.fit_transform(X_data)

			#FIRST, SCALE THE DATA USING THE SCALER
			X_data_scaled = self.scaler.transform(X_data_exp)
		else:
			X_data_scaled = self.scaler.transform(X_data)
		#scale the data

		Probs = self.clf.predict_proba(X_data_scaled)
		#max_probs = np.max(Probs,axis=1)
		Pred = np.argmax(Probs,axis=1)
		#IM_num = int(np.max(self.Y_train)+1)
		#for pred_i in range(len(Pred)):
		#	if max_probs[pred_i]<thresh:
		#		Pred[pred_i]=IM_num

		CM = confusion_matrix(self.Y_train,Pred) #tn, fp, fn, tp
		s = np.sum(CM,axis=1)
		CM = CM / s[:,None] #normalize matrix by rows
		#where max of Probs<0.6
		return CM
Code Example #16
def PolynomialFeatures(train_df, test_df, HP):
    params = HP['PolynomialFeatures']
    degree = params['degree']
    interaction_only = params['interaction_only']
    include_bias = params['include_bias']
    order = params['order']
    train_x = train_df.iloc[:, :-1]
    train_y = train_df.iloc[:, -1:]
    test_x = test_df.iloc[:, :-1]
    test_y = test_df.iloc[:, -1:]

    transformer = preprocessing.PolynomialFeatures(
        degree=degree,
        interaction_only=interaction_only,
        include_bias=include_bias,
        order=order)
    train_x_copy = train_x.copy()
    train_x_transformed = transformer.fit_transform(train_x_copy)
    test_x_copy = test_x.copy()
    test_x_transformed = transformer.transform(test_x_copy)  # transform only; fitted on train

    # The expansion changes the number of columns, so take the names from the
    # transformer rather than reusing the original headers (get_feature_names
    # was renamed get_feature_names_out in scikit-learn 1.0)
    feature_names = transformer.get_feature_names(list(train_x_copy.columns))

    train_x_transformed_df = pd.DataFrame(train_x_transformed,
                                          columns=feature_names)
    train_df_transformed = train_x_transformed_df.assign(label=train_y.values)

    test_x_transformed_df = pd.DataFrame(test_x_transformed,
                                         columns=feature_names)
    test_df_transformed = test_x_transformed_df.assign(label=test_y.values)

    return train_df_transformed, test_df_transformed
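
For context, a hypothetical HP dictionary carrying the four keys the wrapper reads, plus toy frames whose last column is the label. Note the wrapper shadows scikit-learn's PolynomialFeatures class name, so the sklearn class should stay behind its module prefix as in the snippet:

import pandas as pd

HP = {
    'PolynomialFeatures': {
        'degree': 2,
        'interaction_only': False,
        'include_bias': False,
        'order': 'C',  # output array order; parameter exists since scikit-learn 0.21
    }
}

train_df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [4.0, 5.0, 6.0], 'y': [0, 1, 0]})
test_df = pd.DataFrame({'a': [1.5, 2.5], 'b': [4.5, 5.5], 'y': [1, 0]})

train_out, test_out = PolynomialFeatures(train_df, test_df, HP)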
Code Example #17
    def fit(self, data, is_replace=True):
        print('----- Factor transform: {} -----'.format(self.method))
        self.is_replace = is_replace
        if self.method == 'log':
            self.data_col = data.columns
            for col in self.data_col:
                new_col = pd.DataFrame(np.log(data[[col]]),
                                       columns=['ln_' + col],
                                       index=data.index)
                # concatenate inside the loop so every log column is kept,
                # not just the last one
                data = pd.concat([data, new_col], axis=1)
            if self.is_replace:
                data = data.drop(self.data_col, axis=1)
        elif self.method == 'avgstd':
            scaler = preprocessing.StandardScaler()
            data = pd.DataFrame(scaler.fit_transform(data),
                                index=data.index,
                                columns=data.columns)
            self.scaler = scaler

        elif self.method == 'minmax':
            scaler = preprocessing.MinMaxScaler()
            data = pd.DataFrame(scaler.fit_transform(data),
                                index=data.index,
                                columns=data.columns)
            self.scaler = scaler

        elif self.method == 'poly':
            scaler = preprocessing.PolynomialFeatures()
            scaler.fit(data)
            self.scaler = scaler
Code Example #18
def mapFeatures(x, d):
    if d == 1:
        Z = x
    else:
        poly = pp.PolynomialFeatures(d)
        Z = poly.fit_transform(x)
    return Z
Code Example #19
    def polynomial_regression(self,  paramDic, X_train, y_train, X_test, y_test):
        '''
        Polynomial regression model being used for the second learning phase.

        :param paramDic:   Dictionary containing all the necessary hyperparameters.
                X_train:    Training attributes.
                y_train:    Training labels.
                X_test:     Test attributes.
                y_test:     Test labels.
        :return: (model name, trained model)
        '''
        modelName = "polynomial_regression"

        poly = preprocessing.PolynomialFeatures(2)
        x_poly = poly.fit_transform(X_train)

        pr = linear_model.LinearRegression(
            fit_intercept=paramDic['fit_intercept'],
            normalize=paramDic['normalize'],
            copy_X=paramDic['copy_X'],
            n_jobs=paramDic['n_jobs']
        )
        print(x_poly.shape, y_train.shape)
        pr.fit(x_poly, y_train)

        return (modelName, pr)
Code Example #20
def showModelOverfitting(full_data):
    """
    模型过拟合情况,详情看
    :param full_data: 训练数据overfitting_model_plot.png
    :return: 
    """
    full_data.plot(kind='scatter',
                   x="GDP per capita",
                   y='Life satisfaction',
                   figsize=(8, 3))
    plt.axis([0, 110000, 0, 10])

    # Not entirely clear yet; revisit later
    poly = preprocessing.PolynomialFeatures(degree=60, include_bias=False)
    scaler = preprocessing.StandardScaler()
    module = sklearn.linear_model.LinearRegression()

    # Overly aggressive preprocessing causes the model to overfit
    Xfull = np.c_[full_data["GDP per capita"]]
    yfull = np.c_[full_data["Life satisfaction"]]
    X = np.linspace(0, 110000, 1000)

    pipeline_reg = pipeline.Pipeline([('poly', poly), ('scal', scaler),
                                      ('lin', module)])
    pipeline_reg.fit(Xfull, yfull)
    curve = pipeline_reg.predict(X[:, np.newaxis])
    plt.plot(X, curve)
    save_fig('overfitting_model_plot')
Code Example #21
def create_polinomial_model(X_train,y_train,X_test,y_test,n):
    polynomial_transformer = sk_preprocessing.PolynomialFeatures(degree=n)
    X_transformed_train = polynomial_transformer.fit_transform(X_train)
    X_transformed_test = polynomial_transformer.transform(X_test)  # transform only; fitted on train
    pol_model = train_linear_model(X_transformed_train,y_train)
    res = get_MSE(pol_model,X_transformed_test,y_test)
    return res
Code Example #22
File: Main.py Project: RowanMeara/KaggleCompetitions
    def encode_features(self, xtr, xte, header):
        # Encode Categorical Features
        print('Encoding Features...')
        (cat, cont, other) = getdata.get_feature_indices(self.CATEGORICAL, self.CONTINUOUS, header)
        xtr_cat = xtr[:, cat]
        xtr_cont = xtr[:, cont]
        xte_cat = xte[:, cat]
        xte_cont = xte[:, cont]
        enc = preprocessing.OneHotEncoder(sparse=False)
        enc.fit(np.concatenate((xtr_cat, xte_cat)))
        xtr_cat = enc.transform(xtr_cat)
        xte_cat = enc.transform(xte_cat)

        # Remove features with low counts included during transform
        t = np.sum(xtr_cat, axis=0)
        features_to_remove = []
        for i in range(0, t.size):
            if t[i] < 10:
                features_to_remove.append(i)
        remove = np.array(features_to_remove)
        xtr_cat = np.delete(xtr_cat, remove, axis=1)
        xte_cat = np.delete(xte_cat, remove, axis=1)

        # Encode Continuous Features
        quad_enc = preprocessing.PolynomialFeatures(2)
        xtr_cont = quad_enc.fit_transform(xtr_cont)
        xte_cont = quad_enc.transform(xte_cont)

        # Recombine categorical, continuous, and other features
        xtr = np.concatenate((xtr[:, other], xtr_cont, xtr_cat), axis=1)
        xte = np.concatenate((xte[:, other], xte_cont, xte_cat), axis=1)
        return xtr, xte
Code Example #23
def question_23():
    # .as_matrix() was removed in pandas 1.0; .values is the modern equivalent
    data = question_18().as_matrix()[:, :7]
    _X = np.array(data[:, 1:7])
    poly = preprocessing.PolynomialFeatures(interaction_only=False)
    _X_new = poly.fit_transform(_X)
    _X_new = preprocessing.scale(preprocessing.normalize(_X_new))
    print(_X_new.shape)
Code Example #24
def interactionfeaturesinPredictions():

    df = pd.read_csv('data/OnlineNewsPopularity.csv',
                     delimiter=', ',
                     engine='python')
    print(df.columns)

    #### the `features` column list is missing from the data source
    X = df[features]
    y = df[['shares']]

    X2 = preproc.PolynomialFeatures(include_bias=False).fit_transform(X)
    print(X2.shape)  # inspect the expanded feature count

    ### Create train/test sets for both feature sets
    X1_train, X1_test, X2_train, X2_test, y_train, y_test = \
                train_test_split(X, X2, y, test_size=0.3, random_state=123)

    def evaluate_feature(X_train, X_test, y_train, y_test):
        ###Fit a linear regression model on the training set and score on the test set
        model = linear_model.LinearRegression().fit(X_train, y_train)
        r_score = model.score(X_test, y_test)
        return (model, r_score)


    ### Train models and compare score on the two feature sets

    (m1, r1) = evaluate_feature(X1_train, X1_test, y_train, y_test)
    (m2, r2) = evaluate_feature(X2_train, X2_test, y_train, y_test)
    print("R-squared score with singleton features: %0.5f" % r1)
    print("R-squared score with pairwise features: %0.10f" % r2)
Code Example #25
File: test_data.py Project: stjordanis/dask-ml
    def test_basic(self):
        a = dpp.PolynomialFeatures()
        b = spp.PolynomialFeatures()

        a.fit(X)
        b.fit(X.compute())
        assert_estimator_equal(a._transformer, b)
Code Example #26
def gen_features(train, y, test):
    ntrain = len(train)
    df_all = pd.concat([train, test])

    poly = preprocessing.PolynomialFeatures(degree=3)
    dpoly = poly.fit_transform(df_all)  # note: computed but never used below

    df_all['ap_diff'] = df_all.ap_hi - df_all.ap_lo

    h = df_all['height'] / 100
    df_all['BWI'] = df_all['weight'] / (h * h)
    df_all['bad_bwi'] = (df_all.BWI > 60).values * 1 + (df_all.BWI <
                                                        10).values * 1

    df_all['bad_height'] = (df_all.height < 130).values * 1

    df_all['bad_weight'] = (df_all.weight + 120 < df_all.height).values * 1

    df_all['bad_ap_hi'] = 0
    # .ix was removed from pandas; .loc with a boolean mask is equivalent
    df_all.loc[(df_all.ap_hi < 80) | (df_all.ap_hi > 220), 'bad_ap_hi'] = 1

    df_all['bad_ap_lo'] = 0
    df_all.loc[(df_all.ap_lo < 40) | (df_all.ap_lo > 200), 'bad_ap_lo'] = 1

    df_all['has_bad_data'] = (df_all.bad_bwi + df_all.bad_height +
                              df_all.bad_weight + df_all.bad_ap_hi +
                              df_all.bad_ap_lo) > 0

    return df_all[:ntrain].reindex(), y, df_all[ntrain:].reindex()
Code Example #27
    def send_data(self):
        if self.data is not None:
            attributes = self.x_var_model[self.x_var_index]
            class_var = self.y_var_model[self.y_var_index]

            data_table = Table(Domain([attributes], class_vars=[class_var]),
                               self.data)
            polyfeatures = skl_preprocessing.PolynomialFeatures(
                int(self.polynomialexpansion))

            x = data_table.X[~np.isnan(data_table.X).any(axis=1)]
            x = polyfeatures.fit_transform(x)
            x_label = data_table.domain.attributes[0].name

            out_array = np.concatenate((x, data_table.Y[np.newaxis].T), axis=1)

            out_domain = Domain(
                [ContinuousVariable("1")] +
                ([data_table.domain.attributes[0]]
                 if self.polynomialexpansion > 0 else []) + [
                     ContinuousVariable("{}^{}".format(x_label, i))
                     for i in range(2,
                                    int(self.polynomialexpansion) + 1)
                 ],
                class_vars=[class_var])

            self.send("Data", Table(out_domain, out_array))
            return

        self.send("Data", None)
Code Example #28
def plot_Polinomial_Score(XTrain,
                          YTrain,
                          model,
                          alpha=[0.0001],
                          bias=False,
                          score='explained_variance'):
    parameters = {'Pol__degree': [1, 2, 3, 4, 5, 6, 7], 'Model__alpha': alpha}

    pipe = Pipeline([
        ('Pol', preprocessing.PolynomialFeatures(include_bias=bias)),
        ('Scale', preprocessing.StandardScaler()), ('Model', model)
    ])

    grid = GridSearchCV(pipe, param_grid=parameters, cv=5, scoring=score)

    with warnings.catch_warnings():  #Catch conversion warnings
        warnings.simplefilter("ignore")
        grid.fit(XTrain, YTrain)

    resultados = grid.cv_results_['mean_test_score']
    resultados[resultados < 0.0] = 0.0
    print(grid.cv_results_['mean_test_score'])

    plt.plot([1, 2, 3, 4, 5, 6, 7], resultados)
    plt.ylim([0.0, 1.0])
    plt.xlabel('Polynomial')
    plt.ylabel('Explained variance')
    plt.title('Polynomial degree')
    plt.axis('tight')
Code Example #29
 def test_normal_mixture_hard(self):
     np.random.seed(0)
     size_batch = 1000
     competition = AdversarialCompetition(
         size_batch=size_batch,
         true_model=GenerativeNormalMixtureModel(
             np.arange(-3, 4),
             np.random.uniform(1, 2, 7).round(2)),
         discriminative=pipeline.make_pipeline(
             preprocessing.PolynomialFeatures(4),
             linear_model.LogisticRegression()),
         generative=GenerativeNormalMixtureModel(np.arange(-3, 4) * 0.1,
                                                 np.ones(7),
                                                 updates=["mu", "sigma"]),
         gradient_descent=GradientDescent(np.array([0.3, 0.1, 0.3]).reshape(
             (-1, 1)),
                                          inertia=0.9,
                                          annealing=2000,
                                          last_learning_rate=0.001),
     )
     for i in range(5000):
         competition.iteration()
     params = competition.generatives[-1]._params
     print(params.shape)
     true_params = competition.true_model._params
     np.testing.assert_allclose(params, true_params, 0, 0.2)
Code Example #30
def run_reg():
    print("### Regularized logistic regression ###")
    data = pd.read_csv(
        'data/ex2data2.txt',
        header=None,
        names=['Microchip Test 1', 'Microchip Test 2', 'Accepted'])

    X = data.values[:, :-1]
    y = data.values[:, -1:]

    poly = skp.PolynomialFeatures(6)
    X = poly.fit_transform(X)

    l = 1
    initial_theta = np.zeros([len(X.T), 1])

    res = sop.minimize(cost_function_reg,
                       initial_theta,
                       args=(X, y, l),
                       method='TNC',
                       jac=True)
    theta = res.x.reshape(-1, 1)

    p = np.round(sigmoid(X.dot(theta)))
    acc = np.mean(p == y) * 100

    print("prediction accuracy: {:.2f}%".format(acc))