Example #1
import math

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import (AffinityPropagation, AgglomerativeClustering,
                             Birch, KMeans, MeanShift, MiniBatchKMeans)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures


def combination_algorithm(AMDs_train, energy_train, AMDs_test, energy_test,
                          type):
    NUMBER_OF_CLUSTER = 5
    if type == "kmeans_com":
        model = KMeans(n_clusters=NUMBER_OF_CLUSTER).fit(AMDs_train)
        y_clusters = model.predict(AMDs_test)
    elif type == "affinity_com":
        model = AffinityPropagation(damping=0.9,
                                    random_state=5).fit(AMDs_train)
        y_clusters = model.predict(AMDs_test)
    elif type == "agglomerative_com":
        model = AgglomerativeClustering(n_clusters=NUMBER_OF_CLUSTER)
        y_clusters = model.fit_predict(AMDs_test)
    elif type == "birch_com":
        model = Birch(threshold=0.1,
                      n_clusters=NUMBER_OF_CLUSTER).fit(AMDs_train)
        y_clusters = model.predict(AMDs_test)
    elif type == "minibatch_com":
        model = MiniBatchKMeans(n_clusters=NUMBER_OF_CLUSTER).fit(AMDs_train)
        y_clusters = model.predict(AMDs_test)
    elif type == "meanshift_com":
        model = MeanShift().fit(AMDs_train)
        y_clusters = model.predict(AMDs_test)
    else:
        return

    new_energy = []
    new_energy_test = []
    for i in range(NUMBER_OF_CLUSTER):
        if i not in y_clusters:
            print("ERROR: ", i, " is not here")
            continue
        index = 0
        temp_AMDs = []
        temp_energy = []
        for j in model.labels_:
            if i == j:
                temp_AMDs.append(AMDs_train[index])
                temp_energy.append(energy_train[index])
            index += 1

        index = 0
        temp_AMDs_test = []
        temp_energy_test = []
        for j in y_clusters:
            if i == j:
                temp_AMDs_test.append(AMDs_test[index])
                temp_energy_test.append(energy_test[index])
            index += 1

        # degree=1 with interaction_only=True only prepends a bias column, so
        # each cluster effectively gets a plain linear fit.
        quadratic_featurizer = PolynomialFeatures(degree=1,
                                                  interaction_only=True)
        X_train_quadratic = quadratic_featurizer.fit_transform(temp_AMDs)
        # reuse the fitted featurizer on the test split (transform, not fit_transform)
        X_test_quadratic = quadratic_featurizer.transform(temp_AMDs_test)
        model2 = LinearRegression()
        model2.fit(X_train_quadratic, temp_energy)

        temp_energy_pred = model2.predict(X_test_quadratic)

        new_energy.extend(temp_energy_pred)
        new_energy_test.extend(temp_energy_test)

        fig, ax = plt.subplots()
        ax.scatter(temp_energy_test, temp_energy_pred)
        ax.plot([np.min(temp_energy_test),
                 np.max(temp_energy_test)],
                [np.min(temp_energy_test),
                 np.max(temp_energy_test)],
                'k--',
                lw=4)
        ax.set_xlabel('Given')
        ax.set_ylabel('Predicted')
        plt.savefig('./image/combination_algorithm' + str(i) + '.jpg')

    fig, ax = plt.subplots()
    print("R^2 score of the combination algorithm is: ",
          r2_score(new_energy_test, new_energy))
    print("RMSE of the combination algorithm is: ",
          math.sqrt(mean_squared_error(new_energy_test, new_energy)))
    ax.scatter(new_energy_test, new_energy)
    ax.plot([np.min(new_energy_test),
             np.max(new_energy_test)],
            [np.min(new_energy_test),
             np.max(new_energy_test)],
            'k--',
            lw=4)
    ax.set_xlabel('Given')
    ax.set_ylabel('Predicted')
    plt.savefig('./image/combination_algorithm.jpg')
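
A minimal, hypothetical driver for the function above (purely synthetic AMD vectors and energies; the real descriptors are not part of this example, and the ./image/ directory must exist for the saved plots):

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    AMDs = rng.random((200, 10))                      # stand-in AMD descriptors
    energy = AMDs @ rng.random(10) + 0.1 * rng.standard_normal(200)
    combination_algorithm(AMDs[:150], energy[:150],   # train split
                          AMDs[150:], energy[150:],   # test split
                          "kmeans_com")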
Example #2
st.sidebar.title("Upload Your Sales History")
# The uploader widget itself is missing from this fragment; something like the
# following is assumed so that `uploaded_file` below is defined.
uploaded_file = st.sidebar.file_uploader("Upload a CSV file", type="csv")


def load_data(file):
    df = pd.read_csv(file, decimal=".")
    df2 = df.drop(["date"], axis=1)
    df2 = df2.replace(0, 0.01)
    df2['total'] = df2.sum(axis=1)
    return df, df2


if uploaded_file is not None:
    df, df2 = load_data(uploaded_file)
    # prepare models
    models = []
    models.append(('LR', LinearRegression()))
    models.append(('KNN', KNeighborsRegressor()))
    models.append(('RF', RandomForestRegressor()))
    models.append(('GB', GradientBoostingRegressor()))
    models.append(('XGBoost', XGBRegressor(verbosity = 0)))
    models.append(('SVM', LinearSVR()))
    models.append(('Extra Trees', ExtraTreesRegressor()))
    models.append(('Naive', NaiveForecaster(strategy="last", sp=12)))
    models.append(('Theta', ThetaForecaster(sp=12)))
    models.append(('Exp_Smoothing', ExponentialSmoothing(trend="add", seasonal="additive", sp=12)))
    models.append(('TBATS', TBATS(sp=12, use_trend=True, use_box_cox=False)))
    
    forecast_horizon = st.sidebar.slider(label='Forecast Length (months)', min_value=3, max_value=36, value=12)
    window_length = st.sidebar.slider(label='Sliding Window Length', min_value=1, value=12)
    # evaluate each model in turn
    results1 = []
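    # The evaluation loop is cut off in this fragment. A hypothetical
    # continuation showing the mechanics for the scikit-learn style regressors
    # in `models` (the real app presumably builds lagged features with the
    # sliding window; here df2's columns are used only to illustrate the loop):
    from sklearn.model_selection import TimeSeriesSplit, cross_val_score
    sk_names = ("LR", "KNN", "RF", "GB", "XGBoost", "SVM", "Extra Trees")
    for name, model in models:
        if name in sk_names:
            scores = cross_val_score(model,
                                     df2.drop(columns="total"), df2["total"],
                                     cv=TimeSeriesSplit(n_splits=5),
                                     scoring="neg_mean_absolute_error")
            results1.append((name, -scores.mean()))
            st.write(name, "MAE:", round(-scores.mean(), 2))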
# In[10]:

feature_cols = [
    "Monthly Income", "Transaction Time", "Gender_Female", "Gender_Male",
    "City_Tier 1", "City_Tier 2", "City_Tier 3", "Record"
]

# In[11]:

X = df_new[feature_cols]
Y = df_new["Total Spend"]

# In[12]:

lm = LinearRegression()
lm.fit(X, Y)

# In[13]:

print(lm.intercept_)
print(lm.coef_)

# In[14]:

list(zip(feature_cols, lm.coef_))

# In[15]:

lm.score(X, Y)
print (x)

"""**Encoding Categorical Data**"""

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct= ColumnTransformer( transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
x = np.array(ct.fit_transform(x))

print (x)

"""**Seperate Test Set and Training Set**"""

from sklearn.model_selection import train_test_split
x_train, x_test, y_train , y_test = train_test_split(x, y, test_size=0.2 , random_state=0)

"""**Training the Multiple Linear Regression Model**"""

from sklearn.linear_model import LinearRegression
regressor= LinearRegression()
regressor.fit(x_train, y_train)

"""**Predicting the Test Set Result**"""

y_pred= regressor.predict(x_test)
np.set_printoptions(precision=2)

print (y_pred)

print(y_test)
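
To quantify the comparison that the two print calls invite, the usual regression metrics could be added (a small, assumed follow-up, not part of the original notebook):

from sklearn.metrics import mean_squared_error, r2_score

print("R^2:", r2_score(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred) ** 0.5)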
Example #5
#No missing values, no need for imputer this time

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
#We will split 10 to test, 20 to train
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=1 / 3,
                                                    random_state=0)

#No need for feature scaling

#Fitting Simple Lin Regression model to Training Set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()  #We are fine with default parameters
regressor.fit(
    X_train,
    y_train)  #machine is the regressor, made it learn on the training set

#Machine can now based on its learning experience predict the new salary
#Regressor learned the correlations between experience and salary

#Predicting the test results - create a vector  of predictions
y_pred = regressor.predict(
    X_test)  #vector of predictions of dependant variable

#The predictions are pretty damn close

#Visualizing the results with matplotlib
plt.scatter(X_train, y_train, color='red')  #plots the real values
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.svm import LinearSVC
import mglearn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import numpy as np

# Linear regression and L2 regularization (ridge)
X, y = mglearn.datasets.load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
lr = LinearRegression().fit(X_train, y_train)

print(lr.score(X_train, y_train), lr.score(X_test, y_test))

ridge = Ridge(alpha=0.1).fit(X_train,
                             y_train)  # L2 regularization; alpha is the regularization strength: the larger it is, the more the coefficients shrink toward 0 and the stronger the generalization
print(ridge.score(X_train, y_train), ridge.score(X_test, y_test))

mglearn.plots.plot_ridge_n_samples()
plt.show()

# L1 regularization (lasso)
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
print(lasso.score(X_train, y_train), lasso.score(X_test, y_test))

# Linear models for classification
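# The classification part is cut off here. A brief sketch of what a linear
# classifier on the already-imported breast-cancer data could look like
# (illustrative only; the original file's continuation is not shown):
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)
logreg = LogisticRegression(max_iter=5000).fit(X_train, y_train)
svc = LinearSVC(max_iter=5000).fit(X_train, y_train)
print(logreg.score(X_train, y_train), logreg.score(X_test, y_test))
print(svc.score(X_train, y_train), svc.score(X_test, y_test))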
Example #7
X = preprocessing.scale(
    X
)  # standardize the features (built from the Adj Close values)
# the first rows are used to fit the model; the last `forcast_out` rows are held out below
X_lately = X[-forcast_out:]
X = X[:-forcast_out]
#used for labels
y = np.array(dfreg['Label'])
y = y[:-forcast_out]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
len(X)

#Linear regression
clfreg = LinearRegression(n_jobs=-1)  # -1 means uses all processors
clfreg.fit(X_train, y_train)

#Quadratic regression (degree-2 polynomial features + ridge)
clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
clfpoly2.fit(X_train, y_train)

#Cubic regression (degree-3 polynomial features + ridge)
clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
clfpoly3.fit(X_train, y_train)

#KNN Regression

clfknn = KNeighborsRegressor(n_neighbors=2)
clfknn.fit(X_train, y_train)
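
The example stops after fitting. A short, assumed follow-up to compare the four models on the held-out split:

for name, model in [("linear", clfreg), ("quadratic", clfpoly2),
                    ("cubic", clfpoly3), ("knn", clfknn)]:
    print(name, "test R^2:", model.score(X_test, y_test))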
Example #8
def GetForecast():
	loc = request.args.get('loc')
	
	def PolynomialRegressionPrecip(degree,val):
		regressor = LinearRegression()
		regressor.fit(MainList, Preci)
		xx = np.linspace(0, 26, 100)
		yy = regressor.predict(xx.reshape(xx.shape[0], 1))
		
		quadratic_featurizer = PolynomialFeatures(degree)
		X_quadratic = quadratic_featurizer.fit_transform(MainList)
		
		regressor_quadratic = LinearRegression()
		regressor_quadratic.fit(X_quadratic, Preci)
		
		
		xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1))
		
		#print ('Residual sum of squares: %.2f' % np.mean(( regressor_quadratic.predict(X_quadratic)- Preci) ** 2))  10.74
		X_quadratic = quadratic_featurizer.transform([[30 + val]])  # one sample, one feature (2-D input)
		output = regressor_quadratic.predict(X_quadratic)
		return (output)

	def PolynomialRegressionHumidity(degree,val):
		regressor = LinearRegression()
		regressor.fit(MainList, Humidity)
		xx = np.linspace(0, 26, 100)
		yy = regressor.predict(xx.reshape(xx.shape[0], 1))
		
		quadratic_featurizer = PolynomialFeatures(degree)
		X_quadratic = quadratic_featurizer.fit_transform(MainList)
		
		regressor_quadratic = LinearRegression()
		regressor_quadratic.fit(X_quadratic, Humidity)
		
		
		xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1))
		
		   # print ('Residual sum of squares: %.2f' % np.mean(( regressor_quadratic.predict(X_quadratic)- Humidity) ** 2)) #error 7.08
		X_quadratic = quadratic_featurizer.transform([[30 + val]])  # one sample, one feature (2-D input)
		output = regressor_quadratic.predict(X_quadratic)
		return (output)


	df = pd.read_csv('static/WeatherData.csv')
	url = 'https://mean-wizards-v2.mybluemix.net/api/getdata?loc='+loc+'&limit=30'
	r = requests.get(url)
	ServerData =r.json()

	Temp = df['Temp'].values
	Temp = Temp[90:120]
	Humidity = df['Humidity'].values
	Humidity = Humidity[90:120]
	Preci = df['preci'].values
	Preci = Preci[90:120]
	WeatherData = df[['Temp','Humidity']].values

	X = [each for each in range(1,31)]
	MainList = []


	for each in X:
		MainList.append([each])
	slr = LinearRegression()
	WeatherData = WeatherData[:30]


	coData = [float(i['co']) for i in ServerData]
	co2Data = [float(i['co2']) for i in ServerData]
	no2Data = [float(i['no2']) for i in ServerData]
	pm25Data = [float(i['pm25']) for i in ServerData]
	so2Data = [float(i['so2']) for i in ServerData]
	print (len(coData))
	print (len(WeatherData))
	# In[284]:

	def Prediction(val):
		PredictedWeatherData = {}  # not defined elsewhere in this fragment; assumed to be a plain dict
		slr.fit(MainList, Temp)
		Temp_predict = [[30 + val]]  # 2-D input expected by scikit-learn
		Temp_output = slr.predict(Temp_predict)
		PredictedWeatherData['temp'] = Temp_output[0]

		Humid =  PolynomialRegressionHumidity(9,val)
		PredictedWeatherData['Humidity'] = Humid[0]

		preci =  PolynomialRegressionPrecip(22,val)
		PredictedWeatherData['preci'] = preci[0]

		AQIParameters = {}
		model = LinearRegression()
		
		model.fit(WeatherData, coData)
		
		X_predict = [[PredictedWeatherData['temp'], PredictedWeatherData['Humidity']]]  # one sample, two features
		y_predict = model.predict(X_predict)
		AQIParameters['co'] = y_predict[0]

		model.fit(WeatherData, co2Data)
		y_predict = model.predict(X_predict)
		AQIParameters['co2'] = y_predict[0]

		model.fit(WeatherData, no2Data)
		y_predict = model.predict(X_predict)
		AQIParameters['no2'] = y_predict[0]

		model.fit(WeatherData, pm25Data)
		y_predict = model.predict(X_predict)
		AQIParameters['pm25'] = y_predict[0]

		model.fit(WeatherData, so2Data)
		y_predict = model.predict(X_predict)
		AQIParameters['so2'] = y_predict[0]

		FinalDataRecord = {}
		FinalDataRecord = AQIParameters.copy()
		FinalDataRecord.update(PredictedWeatherData)
		
		CObr=[0, 1.0, 2.0, 10, 17, 34, 49];
		SO2br=[0, 40, 80, 380, 800, 1600, 2400];
		O3br=[0, 50, 100, 168, 208, 748, 1300]
		PM25br=[0, 30, 60, 90, 120, 250, 350.4];
		PM10br=[0, 50, 100, 250, 350, 430, 504];
		NO2br=[0, 40, 80, 180, 280, 400, 540];
		AQI=[0, 50, 100, 200, 300, 400, 500];

		dummy = []
		so2= FinalDataRecord['so2']
		so2AQI=0;
		i=0;
		while(i<6):
			if( so2 > SO2br[i] and so2 <= SO2br[i+1]):
				so2AQI= ( ( AQI[i+1]-AQI[i] ) * ( so2 - SO2br[i] ) / ( SO2br[i+1] - SO2br[i] ) ) + AQI[i]
				break; 
			else:
				i = i+1
		dummy.append(so2AQI)

		no2= FinalDataRecord['no2']
		no2AQI=0;
		i=0;
		while(i<6):
			if( no2 > NO2br[i] and no2 <= NO2br[i+1]):
				no2AQI= ( ( AQI[i+1]-AQI[i] ) * ( no2 - NO2br[i] ) / ( NO2br[i+1] - NO2br[i] ) ) + AQI[i]
				break; 
			else:
				i = i+1
		dummy.append(no2AQI)

		co= FinalDataRecord['co']
		coAQI=0;
		i=0;
		while(i<6):
			if( co > CObr[i] and co <= CObr[i+1]):
				coAQI= ( ( AQI[i+1]-AQI[i] ) * ( co - CObr[i] ) / ( CObr[i+1] - CObr[i] ) ) + AQI[i]
				break; 
			else:
				i = i+1
		dummy.append(coAQI)

		pm25= FinalDataRecord['pm25']
		pmAQI=0;
		i=0;
		while(i<6):
			if( pm25 > PM25br[i] and pm25 <= PM25br[i+1]):
				pmAQI= ( ( AQI[i+1]-AQI[i] ) * ( pm25 - PM25br[i] ) / ( PM25br[i+1] - PM25br[i] ) ) + AQI[i]
				break; 
			else:
				i = i+1
		dummy.append(pmAQI)
		finalAQI = max(dummy)
		FinalDataRecord['AQI'] = finalAQI
		return FinalDataRecord


	# In[285]:

	Data = []
	Data.append(Prediction(1))
	Data.append(Prediction(2))
	Data.append(Prediction(3))
	Data.append(Prediction(4))
	Data.append(Prediction(5))
	Data.append(Prediction(6))
	Data.append(Prediction(7))
	print(Data)

	return jsonify(results=Data)
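
The four breakpoint loops above repeat the same linear interpolation. A compact helper capturing that pattern (a sketch only; `sub_index` is a hypothetical name, not part of the original code):

def sub_index(value, breakpoints, aqi_scale):
	# interpolate an AQI sub-index between the two breakpoints that bracket `value`
	for i in range(len(breakpoints) - 1):
		lo, hi = breakpoints[i], breakpoints[i + 1]
		if lo < value <= hi:
			return (aqi_scale[i + 1] - aqi_scale[i]) * (value - lo) / (hi - lo) + aqi_scale[i]
	return 0

With it, e.g. sub_index(FinalDataRecord['so2'], SO2br, AQI) replaces the so2 loop.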
Example #9
from sklearn.linear_model import LinearRegression


def fit_model(x_train, y_train):
    # Fits a linear regression to find the actual b and w that minimize the loss;
    # x_train and y_train are expected as (n, 1) column vectors.
    regression = LinearRegression()
    regression.fit(x_train, y_train)
    b_minimum, w_minimum = regression.intercept_[0], regression.coef_[0][0]
    return b_minimum, w_minimum
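
A minimal usage sketch with synthetic column-vector data (the (n, 1) shapes are implied by the use of intercept_[0] and coef_[0][0]):

import numpy as np

rng = np.random.default_rng(42)
x = rng.random((100, 1))
y = 1.0 + 2.0 * x + 0.1 * rng.standard_normal((100, 1))
b, w = fit_model(x, y)
print(f"intercept ~ {b:.2f}, slope ~ {w:.2f}")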
Example #10
from sklearn.linear_model import LinearRegression


def linear_regression(X, y):
    return LinearRegression().fit(X, y)
Example #11
	def Prediction(val):
		PredictedWeatherData = {}  # not defined elsewhere in this fragment; assumed to be a plain dict
		slr.fit(MainList, Temp)
		Temp_predict = [[30 + val]]  # 2-D input expected by scikit-learn
		Temp_output = slr.predict(Temp_predict)
		PredictedWeatherData['temp'] = Temp_output[0]

		Humid =  PolynomialRegressionHumidity(9,val)
		PredictedWeatherData['Humidity'] = Humid[0]

		preci =  PolynomialRegressionPrecip(22,val)
		PredictedWeatherData['preci'] = preci[0]

		AQIParameters = {}
		model = LinearRegression()
		
		model.fit(WeatherData, coData)
		
		X_predict = [[PredictedWeatherData['temp'], PredictedWeatherData['Humidity']]]  # one sample, two features
		y_predict = model.predict(X_predict)
		AQIParameters['co'] = y_predict[0]

		model.fit(WeatherData, co2Data)
		y_predict = model.predict(X_predict)
		AQIParameters['co2'] = y_predict[0]

		model.fit(WeatherData, no2Data)
		y_predict = model.predict(X_predict)
		AQIParameters['no2'] = y_predict[0]

		model.fit(WeatherData, pm25Data)
		y_predict = model.predict(X_predict)
		AQIParameters['pm25'] = y_predict[0]

		model.fit(WeatherData, so2Data)
		y_predict = model.predict(X_predict)
		AQIParameters['so2'] = y_predict[0]

		FinalDataRecord = {}
		FinalDataRecord = AQIParameters.copy()
		FinalDataRecord.update(PredictedWeatherData)
		
		CObr=[0, 1.0, 2.0, 10, 17, 34, 49];
		SO2br=[0, 40, 80, 380, 800, 1600, 2400];
		O3br=[0, 50, 100, 168, 208, 748, 1300]
		PM25br=[0, 30, 60, 90, 120, 250, 350.4];
		PM10br=[0, 50, 100, 250, 350, 430, 504];
		NO2br=[0, 40, 80, 180, 280, 400, 540];
		AQI=[0, 50, 100, 200, 300, 400, 500];

		dummy = []
		so2= FinalDataRecord['so2']
		so2AQI=0;
		i=0;
		while(i<6):
			if( so2 > SO2br[i] and so2 <= SO2br[i+1]):
				so2AQI= ( ( AQI[i+1]-AQI[i] ) * ( so2 - SO2br[i] ) / ( SO2br[i+1] - SO2br[i] ) ) + AQI[i]
				break; 
			else:
				i = i+1
		dummy.append(so2AQI)

		no2= FinalDataRecord['no2']
		no2AQI=0;
		i=0;
		while(i<6):
			if( no2 > NO2br[i] and no2 <= NO2br[i+1]):
				no2AQI= ( ( AQI[i+1]-AQI[i] ) * ( no2 - NO2br[i] ) / ( NO2br[i+1] - NO2br[i] ) ) + AQI[i]
				break; 
			else:
				i = i+1
		dummy.append(no2AQI)

		co= FinalDataRecord['co']
		coAQI=0;
		i=0;
		while(i<6):
			if( co > CObr[i] and co <= CObr[i+1]):
				coAQI= ( ( AQI[i+1]-AQI[i] ) * ( co - CObr[i] ) / ( CObr[i+1] - CObr[i] ) ) + AQI[i]
				break; 
			else:
				i = i+1
		dummy.append(coAQI)

		pm25= FinalDataRecord['pm25']
		pmAQI=0;
		i=0;
		while(i<6):
			if( pm25 > PM25br[i] and pm25 <= PM25br[i+1]):
				pmAQI= ( ( AQI[i+1]-AQI[i] ) * ( pm25 - PM25br[i] ) / ( PM25br[i+1] - PM25br[i] ) ) + AQI[i]
				break; 
			else:
				i = i+1
		dummy.append(pmAQI)
		finalAQI = max(dummy)
		FinalDataRecord['AQI'] = finalAQI
		return FinalDataRecord
Example #12
    def __init__(self):
        self.linear_regression = LinearRegression()
pipelines.append(
    (
        "SVM",
        make_pipeline(
            preprocessing.StandardScaler(), LinearSVR(C=4, random_state=seed)
        ),
    )
)
pipelines.append(
    ("RF", make_pipeline(RandomForestRegressor(n_estimators=100, random_state=seed)))
)
pipelines.append(
    ("KNN", make_pipeline(preprocessing.StandardScaler(), KNeighborsRegressor()))
)
pipelines.append(("LM", make_pipeline(LinearRegression())))

#%%

plot_cv_scores(
    pipelines=pipelines,
    X=X,
    y=y,
    crossvalidation=crossvalidation,
    scoring=scoring,
    file_suffix="unoptimized_simple",
)

plot_cv_predictions(
    pipelines=pipelines,
    X=X,
    # icept = np.mean(icepts)
    x = np.array(x).reshape(-1, 1)
    # x *= 0.510127
    # x = x[~np.isnan(x) and x > float(-inf)].reshape(-1,1)
    y = np.array(y).reshape(-1, 1)
    # print x.shape, y.shape
    # y = y[~np.isnan(y) and x > float(-inf)].reshape(-1,1)
    # print x.shape, y.shape
    data = np.concatenate((y, x), axis=1)
    data = data[np.all(data != float('+inf'), axis=1)]
    # print data.shape
    print data[:10]
    # np.save('data4regression.npy',data)

    # data = np.load('data4regression.npy')
    lreg = LinearRegression(normalize=True, n_jobs=-1)
    lreg.fit(data[:, [1]], data[:, [0]])
    print "coefficient: %f\t\tintercept: %f" % (lreg.coef_, lreg.intercept_)

    for p in preds:
        if args['nn_score_fmt'] == "sphinx":
            scores = readSen(p)
        elif args['nn_score_fmt'] == "text":
            scores = np.loadtxt(p)
        print p + '.sen'
        writeSenScores(p + '.sen', scores, lreg.coef_, 0)

    os.system("""pocketsphinx_batch \
        -hmm {} \
        -lm {} \
        -cepdir {} \
Example #15
import numpy as np
from sklearn.linear_model import LinearRegression


def regression_ceof(pts):
    x = np.array([pt[0] for pt in pts]).reshape(-1, 1)
    y = np.array([pt[1] for pt in pts])
    model = LinearRegression()
    model.fit(x, y)
    return model.coef_[0], model.intercept_
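
A quick sanity check on a handful of (x, y) points (illustrative data only):

slope, intercept = regression_ceof([(0, 1.0), (1, 3.1), (2, 4.9), (3, 7.2)])
print(slope, intercept)  # roughly 2.0 and 1.0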
Example #16
    def train_stage2(self, force=False, print_fnc=print):
        """
        trains stage2 models, store it in self.stage2_model
        Args:
            force: force training even if we've already trained
            print_fnc: some function for printing/logging

        """
        try:
            self.stage2_model
            if not force:
                raise ValueError(
                    "stage2 model already trained, set force=True to force retraining"
                )
        except AttributeError:
            pass
        # generate the stage2 training data if not already done
        try:
            self.stage2_data
        except AttributeError:
            self._generate_stage2_data()
        x_cols = self.exog_x_cols + [self.endog_x_col]
        if self.stage2_model_type == 'lgb':
            # lgb datasets for training
            df_train = self.stage2_data.loc[self.stage2_data['_purpose_'] ==
                                            'train2', :]
            df_val = self.stage2_data.loc[self.stage2_data['_purpose_'] ==
                                          'val2', :]
            dat_train = lgb.Dataset(df_train[x_cols],
                                    label=df_train[self.y_col])
            dat_train.grouper = df_train[self.id_col]
            dat_val = lgb.Dataset(df_val[x_cols], label=df_val[self.y_col])
            dat_val.grouper = df_val[self.id_col]
            # ok, now start training
            params = self.stage2_params
            print_every = 0 if print_fnc is None else params[
                'num_iterations'] // 10
            eval_results = {
            }  # store evaluation results as well with the trained model
            if self.stage2_objective == 'true':
                # copy the params because lgb modifies it during run...?
                gbm = lgb.train(
                    params.copy(),
                    train_set=dat_train,
                    valid_sets=[dat_train, dat_val],
                    valid_names=['train', 'val'],
                    verbose_eval=print_every,
                    fobj=lambda preds, dataset: co.grouped_sse_loss_grad_hess(
                        preds, dataset.label, dataset.grouper),
                    feval=lambda preds, dataset:
                    ('grouped sse',
                     co.grouped_sse_loss(preds, dataset.label, dataset.grouper
                                         ), False),
                    callbacks=[lgb.record_evaluation(eval_results)])
            elif self.stage2_objective == 'upper':
                gbm = lgb.train(
                    params.copy(),
                    train_set=dat_train,
                    valid_sets=[dat_train, dat_val],
                    valid_names=['train', 'val'],
                    verbose_eval=print_every,
                    callbacks=[lgb.record_evaluation(eval_results)])
            else:
                raise ValueError("self.stage2_objective not recognized")
            gbm.eval_results = eval_results
            # save the model
            self.stage2_model = ModelWrapper(gbm)
        elif self.stage2_model_type == 'linear':
            df_train = self.stage2_data
            if self.stage2_objective == 'true':
                min_output = minimize(fun=co.grouped_sse_loss_linear,
                                      x0=np.zeros(shape=len(x_cols) + 1),
                                      args=(df_train, x_cols, self.y_col,
                                            self.id_col))
                coefs = min_output.x[1:]
                intercept = min_output.x[0]
                model = LinearModel(coefs, intercept)
            elif self.stage2_objective == 'upper':
                model = LinearRegression()
                model.fit(df_train[x_cols], df_train[self.y_col])
            else:
                raise ValueError("self.stage2_objective not recognized")
            # add a feature_name functionality to this object, then wrap it up and return
            model.feature_name = lambda: x_cols
            self.stage2_model = ModelWrapper(model)
        else:
            raise ValueError("self.stage2_model_type not recognized")
Example #17
print(seenMovie)
print(metadata)
print("Data loaded")
print(seenMovie.shape, '\t', metadata.shape)
seenMovie = seenMovie.astype('int')
# split train and test set
X_train, X_test, y_train, y_test = train_test_split(metadata,
                                                    seenMovie,
                                                    test_size=0.3,
                                                    random_state=1,
                                                    shuffle=True,
                                                    stratify=seenMovie)

# build model 2 nnls regression model
reg_nnls = LinearRegression(positive=True)
y_pred_nnls = reg_nnls.fit(X_train, y_train).predict(X_test)
r2_score_nnls = r2_score(y_test, y_pred_nnls)
print("NNLS R2 score", r2_score_nnls)
logLossVal_nnls = log_loss(y_test,
                           y_pred_nnls,
                           eps=1e-15,
                           normalize=True,
                           sample_weight=None,
                           labels=None)

scaled_test = minmax_scale(y_test, feature_range=(0, 1))
scaled_pred = minmax_scale(y_pred_nnls, feature_range=(0, 1))
mse_2 = calculateMeanSquareError(scaled_test, scaled_pred)
# m2_recall = recall_score(y_test, y_pred_nnls, average='binary')
# m2_precision = precision_score(y_test, y_pred_nnls, average='binary')
   
#------------------------------- Machine Learning Models --------------------------------------#
#Splitting data into train and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cleanData.iloc[:, cleanData.columns != 'Absenteeism time in hours'], cleanData.iloc[:, 20], test_size = 0.30, random_state = 1)



#------------------------------------ Linear Regression Model ---------------------------------#
# Root Mean Squared Error: 2.898405340060082
# R^2 Score(coefficient of determination) = 0.2772050386036977

from sklearn.linear_model import LinearRegression

#Build Linear regression model
lrModel = LinearRegression().fit(X_train , y_train)

#Predict for test records
lrModelPred = lrModel.predict(X_test)

#Storing results in a data frame for Actual and Predicted values
lrResult = pd.DataFrame({'Actual': y_test, 'Predicted': lrModelPred})
print(lrResult.head())

#Calculate RMSE and R-squared value
def RMSE(y_actual,y_predicted):
    rmse = np.sqrt(mean_squared_error(y_actual,y_predicted))
    return rmse

print("Root Mean Squared Error: "+str(RMSE(y_test, lrModelPred)))
print("R^2 Score(coefficient of determination) = "+str(r2_score(y_test, lrModelPred)))
Example #19
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("../../../data/Position_Salaries.csv")
X = df.iloc[:, 1:2].values
Y = df.iloc[:, 2:].values

from sklearn.preprocessing import PolynomialFeatures
poly_feature = PolynomialFeatures(degree=2)
X_poly = poly_feature.fit_transform(X)

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_poly, Y)

fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.scatter(X, Y, color='r')
X_grid = np.arange(min(X), max(X), 0.1)
X_grid = X_grid.reshape((len(X_grid), 1))
ax.plot(X_grid, lin_reg.predict(poly_feature.transform(X_grid)))  # transform with the already-fitted featurizer
ax.set_title('level-salary curve')
ax.set_xlabel('level')
ax.set_ylabel('salary')
plt.show()
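
As a small follow-up, the fitted degree-2 model can also be queried for a single level (the value 6.5 is just an illustrative input):

print(lin_reg.predict(poly_feature.transform([[6.5]])))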
def evaluate_lenet5(learning_rate=0.008, n_epochs=2000, nkerns=[400], batch_size=1, window_width=3,
                    maxSentLength=30, emb_size=300, hidden_size=[300,10],
                    margin=0.5, L2_weight=0.0001, Div_reg=0.0001, norm_threshold=5.0, use_svm=False):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/';
    rng = numpy.random.RandomState(23455)
    datasets, word2id=load_msr_corpus_20161229(rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength)
    vocab_size=len(word2id)+1
    mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/'
    mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt')
    wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_number_matching_scores.txt', rootPath+'test_number_matching_scores.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2]
    indices_train_r=indices_train[1::2]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]
    indices_test_l=indices_test[::2]
    indices_test_r=indices_test[1::2]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    train_size = len(indices_train_l)
    test_size = len(indices_test_l)
    
    train_batch_start=range(train_size)
    test_batch_start=range(test_size)

    
#     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
#     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
#     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
#     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
#     indices_train_l=T.cast(indices_train_l, 'int32')
#     indices_train_r=T.cast(indices_train_r, 'int32')
#     indices_test_l=T.cast(indices_test_l, 'int32')
#     indices_test_r=T.cast(indices_test_r, 'int32')
    


    rand_values=random_value_normal((vocab_size, emb_size), theano.config.floatX, rng)
#     rand_values[0]=numpy.array(numpy.zeros(emb_size))
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init_new(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=numpy.array(rand_values,dtype=theano.config.floatX), borrow=True)#theano.shared(value=rand_values, borrow=True)      
    

    
    # allocate symbolic variables for the data
#     index = T.iscalar()
    x_index_l = T.imatrix()   # now, x is the index matrix, must be integer
    x_index_r = T.imatrix()
    y = T.ivector()  
    left_l=T.iscalar()
    right_l=T.iscalar()
    left_r=T.iscalar()
    right_r=T.iscalar()
    length_l=T.iscalar()
    length_r=T.iscalar()
    norm_length_l=T.fscalar()
    norm_length_r=T.fscalar()
    mts=T.fmatrix()
    wmf=T.fmatrix()
#     cost_tmp=T.fscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # shape of each sentence "image": (embedding dim, max sentence length)
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1)
    
    
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]))
    conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3]))
    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_l = Conv_with_input_para(rng, input=layer0_l_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_r = Conv_with_input_para(rng, input=layer0_r_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output')
    layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output')
    layer0_l_output_maxpool = T.max(layer0_l.output_narrow_conv_out[:,:,:,left_l:], axis=3).reshape((1, nkerns[0]))
    layer0_r_output_maxpool = T.max(layer0_r.output_narrow_conv_out[:,:,:,left_r:], axis=3).reshape((1, nkerns[0]))
    
    layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0],
                                       left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, 
                                       length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1,
                                       dim=maxSentLength+filter_size[1]-1)
    

    
    
    
    
    
    sum_uni_l=T.sum(layer0_l_input[:,:,:,left_l:], axis=3).reshape((1, emb_size))
    norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    sum_uni_r=T.sum(layer0_r_input[:,:,:,left_r:], axis=3).reshape((1, emb_size))
    norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    
    uni_cosine=cosine(sum_uni_l, sum_uni_r)
    '''
    linear=Linear(sum_uni_l, sum_uni_r)
    poly=Poly(sum_uni_l, sum_uni_r)
    sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    rbf=RBF(sum_uni_l, sum_uni_r)
    gesd=GESD(sum_uni_l, sum_uni_r)
    '''
    eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2%
    #eucli_1=EUCLID(sum_uni_l, sum_uni_r)
    
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))  
    
    '''
    len_l=length_l.reshape((1,1))
    len_r=length_r.reshape((1,1))  
    '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    HL_layer_1_input=T.concatenate([
#                                 mts, 
                                eucli_1, #uni_cosine,norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, #
                                uni_cosine,
#                                 sum_uni_l,
#                                 sum_uni_r,
#                                 sum_uni_l+sum_uni_r,
                                1.0/(1.0+EUCLID(layer0_l_output_maxpool, layer0_r_output_maxpool)),
                                cosine(layer0_l_output_maxpool, layer0_r_output_maxpool),
                                layer0_l_output_maxpool,
                                layer0_r_output_maxpool,
                                T.sqrt((layer0_l_output_maxpool-layer0_r_output_maxpool)**2+1e-10),
                                
                                layer1.output_eucli_to_simi, #layer1.output_cosine,layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, #
                                layer1.output_cosine,
                                layer1.output_vector_l,
                                layer1.output_vector_r,
                                T.sqrt((layer1.output_vector_l-layer1.output_vector_r)**2+1e-10),
#                                 len_l, len_r
                                layer1.output_attentions
#                                 wmf,
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)

    HL_layer_1_input_with_extra=T.concatenate([#HL_layer_1_input,
                                mts, len_l, len_r
#                                 wmf
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)

    HL_layer_1_input_size=1+1+   1+1+3* nkerns[0]   +1+1+3*nkerns[0]+10*10
    
    HL_layer_1_input_with_extra_size = HL_layer_1_input_size+15+2
    
    HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[0], activation=T.tanh)
    HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=hidden_size[1], activation=T.tanh)
    
    LR_layer_input=T.concatenate([HL_layer_2.output, HL_layer_1.output, HL_layer_1_input],axis=1)
    LR_layer_input_with_extra=T.concatenate([HL_layer_2.output,  HL_layer_1_input_with_extra],axis=1)#HL_layer_1.output,
    
    LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=HL_layer_1_input_size+hidden_size[0]+hidden_size[1], n_out=2)
#     LR_layer_input=HL_layer_2.output
#     LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=hidden_size, n_out=2)

#     layer3=LogisticRegression(rng, input=layer3_input, n_in=15+1+1+2+3, n_out=2)
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((LR_layer.W** 2).sum()+(HL_layer_2.W** 2).sum()+(HL_layer_1.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()
#     diversify_reg= Diversify_Reg(LR_layer.W.T)+Diversify_Reg(HL_layer_2.W.T)+Diversify_Reg(HL_layer_1.W.T)+Diversify_Reg(conv_W_into_matrix)
    cost_this =debug_print(LR_layer.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=cost_this+L2_weight*L2_reg#+Div_reg*diversify_reg
    

    test_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [LR_layer.errors(y), LR_layer.y_pred, LR_layer_input_with_extra, y], on_unused_input='ignore',allow_input_downcast=True)



    params = LR_layer.params+ HL_layer_2.params+HL_layer_1.params+[conv_W, conv_b]+[embeddings]#+[embeddings]# + layer1.params 
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        clipped_grad = T.clip(grad_i, -0.5, 0.5)
        acc = acc_i + T.sqr(clipped_grad)
        updates.append((param_i, param_i - learning_rate * clipped_grad / T.sqrt(acc+1e-10)))   #AdaGrad
        updates.append((acc_i, acc))    
  
    train_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [cost,LR_layer.errors(y)], updates=updates, on_unused_input='ignore',allow_input_downcast=True)

    train_model_predict = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [cost_this,LR_layer.errors(y), LR_layer_input_with_extra, y],on_unused_input='ignore',allow_input_downcast=True)



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is


    best_params = None
    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    nn_max_acc=0.0
    best_iter=0
    cost_tmp=0.0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        shuffle(train_batch_start)#shuffle training data

        for index in train_batch_start: 
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * train_size + minibatch_index +1

            minibatch_index=minibatch_index+1

#             if iter%update_freq != 0:
#                 cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
#                 #print 'cost_ij: ', cost_ij
#                 cost_tmp+=cost_ij
#                 error_sum+=error_ij
#             else:

            cost_i, error_i= train_model(indices_train_l[index: index + batch_size],
                                                              indices_train_r[index: index + batch_size],
                                                              trainY[index: index + batch_size],
                                                              trainLeftPad_l[index],
                                                              trainRightPad_l[index],
                                                              trainLeftPad_r[index],
                                                              trainRightPad_r[index],
                                                              trainLengths_l[index],
                                                              trainLengths_r[index],
                                                              normalized_train_length_l[index],
                                                              normalized_train_length_r[index],
                                                              mt_train[index: index + batch_size],
                                                              wm_train[index: index + batch_size])
            cost_tmp+=cost_i
            if iter < 6000 and iter %100 ==0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter)
            if iter >= 6000 and iter % 100 == 0:
#             if iter%100 ==0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter)
                test_losses=[]
                test_y=[]
                test_features=[]
                for index in test_batch_start:
                    test_loss, pred_y, layer3_input, y=test_model(indices_test_l[index: index + batch_size],
                                                                  indices_test_r[index: index + batch_size],
                                                                  testY[index: index + batch_size],
                                                                  testLeftPad_l[index],
                                                                  testRightPad_l[index],
                                                                  testLeftPad_r[index],
                                                                  testRightPad_r[index],
                                                                  testLengths_l[index],
                                                                  testLengths_r[index],
                                                                  normalized_test_length_l[index],
                                                                  normalized_test_length_r[index],
                                                                  mt_test[index: index + batch_size],
                                                                  wm_test[index: index + batch_size])
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                test_score = numpy.mean(test_losses)
                test_acc = (1-test_score) * 100.
                if test_acc > nn_max_acc:
                    nn_max_acc = test_acc
                print '\t\t\tepoch:', epoch, 'iter:', iter, 'current acc:', test_acc, 'nn_max_acc:', nn_max_acc

                #now, see the results of svm
                if use_svm:
                    train_y=[]
                    train_features=[]
                    for index in train_batch_start: 
                        cost_ij, error_ij, layer3_input, y=train_model_predict(indices_train_l[index: index + batch_size],
                                                                  indices_train_r[index: index + batch_size],
                                                                  trainY[index: index + batch_size],
                                                                  trainLeftPad_l[index],
                                                                  trainRightPad_l[index],
                                                                  trainLeftPad_r[index],
                                                                  trainRightPad_r[index],
                                                                  trainLengths_l[index],
                                                                  trainLengths_r[index],
                                                                  normalized_train_length_l[index],
                                                                  normalized_train_length_r[index],
                                                                  mt_train[index: index + batch_size],
                                                                  wm_train[index: index + batch_size])
                        train_y.append(y[0])
                        train_features.append(layer3_input[0])
                        #write_feature.write(' '.join(map(str,layer3_input[0]))+'\n')
                    #write_feature.close()
     
                    clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33
                    clf.fit(train_features, train_y)
                    results=clf.predict(test_features)
                    lr=LinearRegression().fit(train_features, train_y)
                    results_lr=lr.predict(test_features)
                    corr_count=0
                    corr_lr=0
                    test_size=len(test_y)
                    for i in range(test_size):
                        if results[i]==test_y[i]:
                            corr_count+=1
                        if numpy.absolute(results_lr[i]-test_y[i])<0.5:
                            corr_lr+=1
                    acc=corr_count*1.0/test_size
                    acc_lr=corr_lr*1.0/test_size
                    if acc > max_acc:
                        max_acc=acc
                        best_iter=iter
                    if acc_lr> max_acc:
                        max_acc=acc_lr
                        best_iter=iter
                    print '\t\t\t\tsvm acc: ', acc, 'LR acc: ', acc_lr, ' max acc: ',    max_acc , ' at iter: ', best_iter

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #21
def auto_arima(y,
               exogenous=None,
               start_p=2,
               d=None,
               start_q=2,
               max_p=5,
               max_d=2,
               max_q=5,
               start_P=1,
               D=None,
               start_Q=1,
               max_P=2,
               max_D=1,
               max_Q=2,
               max_order=5,
               m=1,
               seasonal=True,
               stationary=False,
               information_criterion='aic',
               alpha=0.05,
               test='kpss',
               seasonal_test='ocsb',
               stepwise=True,
               n_jobs=1,
               start_params=None,
               trend=None,
               method='lbfgs',
               maxiter=50,
               offset_test_args=None,
               seasonal_test_args=None,
               suppress_warnings=False,
               error_action='warn',
               trace=False,
               random=False,
               random_state=None,
               n_fits=10,
               return_valid_fits=False,
               out_of_sample_size=0,
               scoring='mse',
               scoring_args=None,
               with_intercept=True,
               sarimax_kwargs=None,
               **fit_args):

    # NOTE: Doc is assigned BELOW this function

    # pop out the deprecated kwargs
    fit_args = _warn_for_deprecations(**fit_args)

    start = time.time()

    # validate start/max points
    if any(_ < 0 for _ in (max_p, max_q, max_P, max_Q, start_p, start_q,
                           start_P, start_Q)):
        raise ValueError('starting and max p, q, P & Q values must '
                         'be positive integers (>= 0)')
    if max_p < start_p or max_q < start_q \
            or max_P < start_P or max_Q < start_Q:
        raise ValueError('max p, q, P & Q must be >= than '
                         'their starting values')

    # validate d & D
    for _d, _max_d in ((d, max_d), (D, max_D)):
        if _max_d < 0:
            raise ValueError('max_d & max_D must be positive integers (>= 0)')
        if _d is not None:
            if _d < 0:
                raise ValueError('d & D must be None or a positive '
                                 'integer (>= 0)')
            # v0.9.0+ - ignore this if it's explicitly set...
            # if _d > _max_d:
            #     raise ValueError('if explicitly defined, d & D must be <= '
            #                      'max_d & <= max_D, respectively')

    # is stepwise AND parallel enabled?
    if stepwise and n_jobs != 1:
        n_jobs = 1
        warnings.warn('stepwise model cannot be fit in parallel (n_jobs=%i). '
                      'Falling back to stepwise parameter search.' % n_jobs)

    # check on m
    if m < 1:
        raise ValueError('m must be a positive integer (> 0)')

    # check on n_iter
    if random and n_fits < 0:
        raise ValueError('n_iter must be a positive integer '
                         'for a random search')

    # validate error action
    actions = {'warn', 'raise', 'ignore', None}
    if error_action not in actions:
        raise ValueError('error_action must be one of %r, but got %r' %
                         (actions, error_action))

    # copy array
    y = check_endog(y, dtype=DTYPE)
    n_samples = y.shape[0]

    sarimax_kwargs = {} if not sarimax_kwargs else sarimax_kwargs

    # check for constant data
    if is_constant(y):
        warnings.warn('Input time-series is completely constant; '
                      'returning a (0, 0, 0) ARMA.')
        return _return_wrapper(
            _post_ppc_arima(
                solvers._fit_arima(y,
                                   xreg=exogenous,
                                   order=(0, 0, 0),
                                   seasonal_order=(0, 0, 0, 0),
                                   start_params=start_params,
                                   trend=trend,
                                   method=method,
                                   maxiter=maxiter,
                                   fit_params=fit_args,
                                   suppress_warnings=suppress_warnings,
                                   trace=trace,
                                   error_action=error_action,
                                   scoring=scoring,
                                   out_of_sample_size=out_of_sample_size,
                                   scoring_args=scoring_args,
                                   with_intercept=with_intercept,
                                   **sarimax_kwargs)), return_valid_fits,
            start, trace)

    # test ic, and use AIC if n <= 3
    if information_criterion not in VALID_CRITERIA:
        raise ValueError('auto_arima not defined for information_criteria=%s. '
                         'Valid information criteria include: %r' %
                         (information_criterion, VALID_CRITERIA))

    # the R code handles this, but I don't think statsmodels
    # will even fit a model this small...
    # if n_samples <= 3:
    #     if information_criterion != 'aic':
    #         warnings.warn('n_samples (%i) <= 3 '
    #                       'necessitates using AIC' % n_samples)
    #     information_criterion = 'aic'

    # adjust max p, q -- R code:
    # max.p <- min(max.p, floor(serieslength/3))
    # max.q <- min(max.q, floor(serieslength/3))
    max_p = int(min(max_p, np.floor(n_samples / 3)))
    max_q = int(min(max_q, np.floor(n_samples / 3)))

    # this is not in the R code and poses a risk that R did not consider...
    # if max_p|q has now dropped below start_p|q, correct it.
    start_p = min(start_p, max_p)
    start_q = min(start_q, max_q)

    # if it's not seasonal, we can avoid multiple 'if not is None' comparisons
    # later by just using this shortcut (hack):
    if not seasonal:
        D = m = -1

    # choose the order of differencing
    xx = y.copy()
    if exogenous is not None:
        lm = LinearRegression().fit(exogenous, y)
        xx = y - lm.predict(exogenous)
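        # Note: the exogenous effect is removed with a plain least-squares fit so
        # that the differencing order below is estimated on the residual series,
        # following the R auto.arima logic this port mirrors.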

    # is the TS stationary?
    if stationary:
        d = D = 0

    # now for seasonality
    if m == 1:
        D = max_P = max_Q = 0

    # m must be > 1 for nsdiffs
    elif D is None:  # we don't have a D yet and we need one (seasonal)
        seasonal_test_args = seasonal_test_args \
            if seasonal_test_args is not None else dict()
        D = nsdiffs(xx,
                    m=m,
                    test=seasonal_test,
                    max_D=max_D,
                    **seasonal_test_args)

        if D > 0 and exogenous is not None:
            diffxreg = diff(exogenous, differences=D, lag=m)
            # check whether any column became constant
            if np.apply_along_axis(is_constant, arr=diffxreg, axis=0).any():
                D -= 1

    # D might still be None if not seasonal. Py 3 will throw an error for that
    # unless we explicitly check for ``seasonal``
    if D > 0:
        dx = diff(xx, differences=D, lag=m)
    else:
        dx = xx

    # If D was too big, we might have gotten rid of x altogether!
    if dx.shape[0] == 0:
        raise ValueError(
            "The seasonal differencing order, D=%i, was too "
            "large for your time series, and after differencing, "
            "there are no samples remaining in your data. "
            "Try a smaller value for D, or if you didn't set D "
            "to begin with, try setting it explicitly. This can "
            "also occur in seasonal settings when m is too large." % D)

    # difference the exogenous matrix
    if exogenous is not None:
        if D > 0:
            diffxreg = diff(exogenous, differences=D, lag=m)
        else:
            diffxreg = exogenous
    else:
        # here's the thing... we're only going to use diffxreg if exogenous
        # was not None in the first place. However, PyCharm doesn't know that
        # and it thinks we might use it before assigning it. Therefore, assign
        # it to None as a default value and it won't raise the warning anymore.
        diffxreg = None

    # determine/set the order of differencing by estimating the number of
    # orders it would take in order to make the TS stationary.
    if d is None:
        offset_test_args = offset_test_args \
            if offset_test_args is not None else dict()
        d = ndiffs(dx, test=test, alpha=alpha, max_d=max_d, **offset_test_args)

        if d > 0 and exogenous is not None:
            diffxreg = diff(diffxreg, differences=d, lag=1)

            # if any columns are constant, subtract one order of differencing
            if np.apply_along_axis(is_constant, arr=diffxreg, axis=0).any():
                d -= 1

    # check differences (do we want to warn?...)
    if error_action == 'warn' and not suppress_warnings:
        if D >= 2:
            warnings.warn(
                "Having more than one seasonal differences is "
                "not recommended. Please consider using only one "
                "seasonal difference.", ModelFitWarning)
        # if D is -1, this will be off, so we include the OR
        elif D + d > 2 or d > 2:
            warnings.warn(
                "Having 3 or more differencing operations is not "
                "recommended. Please consider reducing the total "
                "number of differences.", ModelFitWarning)

    if d > 0:
        dx = diff(dx, differences=d, lag=1)

    # check for a constant differenced series
    if is_constant(dx):
        if exogenous is None and not (D > 0 or d < 2):
            raise ValueError('data follow a simple polynomial and are not '
                             'suitable for ARIMA modeling')

        # perfect regression
        ssn = (0, 0, 0, 0) if not seasonal else (0, D, 0, m)
        return _return_wrapper(
            _post_ppc_arima(
                solvers._fit_arima(y,
                                   xreg=exogenous,
                                   order=(0, d, 0),
                                   seasonal_order=ssn,
                                   start_params=start_params,
                                   trend=trend,
                                   method=method,
                                   maxiter=maxiter,
                                   fit_params=fit_args,
                                   suppress_warnings=suppress_warnings,
                                   trace=trace,
                                   error_action=error_action,
                                   scoring=scoring,
                                   out_of_sample_size=out_of_sample_size,
                                   scoring_args=scoring_args,
                                   with_intercept=with_intercept,
                                   **sarimax_kwargs)), return_valid_fits,
            start, trace)

    # seasonality issues
    if m > 1:
        if max_P > 0:
            max_p = min(max_p, m - 1)
        if max_Q > 0:
            max_q = min(max_q, m - 1)

    if not stepwise:

        # validate max_order
        if max_order is None:
            max_order = np.inf
        elif max_order < 0:
            raise ValueError('max_order must be None or a non-negative '
                             'integer (>= 0)')

        # NOTE: pre-1.5.2, we started at start_p, start_q, etc. However, when
        # using stepwise=FALSE in R, Hyndman starts at 0. He only uses start_*
        # when stepwise=TRUE.

        # generate the set of (p, q, P, Q) FIRST, since it is contingent
        # on whether or not the user is interested in a seasonal ARIMA result.
        # This will reduce the search space for non-seasonal ARIMA models.
        # loop p, q. Make sure to loop at +1 interval,
        # since max_{p|q} is inclusive.
        if seasonal:
            gen = (((p, d, q), (P, D, Q, m)) for p in range(0, max_p + 1)
                   for q in range(0, max_q + 1) for P in range(0, max_P + 1)
                   for Q in range(0, max_Q + 1) if p + q + P + Q <= max_order)
        else:
            # otherwise it's not seasonal and we don't need the seasonal pieces
            gen = (((p, d, q), (0, 0, 0, 0)) for p in range(0, max_p + 1)
                   for q in range(0, max_q + 1) if p + q <= max_order)

        # if we are fitting a random search rather than an exhaustive one, we
        # will scramble up the generator (as a list) and only fit n_iter ARIMAs
        if random:
            random_state = check_random_state(random_state)

            # make a list to scramble...
            gen = random_state.permutation(list(gen))[:n_fits]

        # get results in parallel
        all_res = Parallel(n_jobs=n_jobs)(
            delayed(solvers._fit_arima)(y,
                                        xreg=exogenous,
                                        order=order,
                                        seasonal_order=seasonal_order,
                                        start_params=start_params,
                                        trend=trend,
                                        method=method,
                                        maxiter=maxiter,
                                        fit_params=fit_args,
                                        suppress_warnings=suppress_warnings,
                                        trace=trace,
                                        error_action=error_action,
                                        out_of_sample_size=out_of_sample_size,
                                        scoring=scoring,
                                        scoring_args=scoring_args,
                                        with_intercept=with_intercept,
                                        **sarimax_kwargs)
            for order, seasonal_order in gen)

    # otherwise, we're fitting the stepwise algorithm...
    else:
        if n_samples < 10:
            start_p = min(start_p, 1)
            start_q = min(start_q, 1)
            start_P = start_Q = 0

        # adjust to p, q, P, Q vals
        p = start_p = min(start_p, max_p)
        q = start_q = min(start_q, max_q)
        P = start_P = min(start_P, max_P)
        Q = start_Q = min(start_Q, max_Q)

        # init the stepwise model wrapper
        stepwise_wrapper = solvers._StepwiseFitWrapper(
            y,
            xreg=exogenous,
            start_params=start_params,
            trend=trend,
            method=method,
            maxiter=maxiter,
            fit_params=fit_args,
            suppress_warnings=suppress_warnings,
            trace=trace,
            error_action=error_action,
            out_of_sample_size=out_of_sample_size,
            scoring=scoring,
            scoring_args=scoring_args,
            p=p,
            d=d,
            q=q,
            P=P,
            D=D,
            Q=Q,
            m=m,
            start_p=start_p,
            start_q=start_q,
            start_P=start_P,
            start_Q=start_Q,
            max_p=max_p,
            max_q=max_q,
            max_P=max_P,
            max_Q=max_Q,
            seasonal=seasonal,
            information_criterion=information_criterion,
            with_intercept=with_intercept,
            **sarimax_kwargs)

        # do the step-through...
        all_res = stepwise_wrapper.solve_stepwise()

    # filter the non-successful ones
    filtered = _post_ppc_arima(all_res)

    # sort by the criteria - lower is better for both AIC and BIC
    # (https://stats.stackexchange.com/questions/81427/aic-guidelines-in-model-selection)
    sorted_res = sorted(filtered,
                        key=(lambda mod: getattr(mod, information_criterion)
                             ()))

    # remove all the cached .pmdpkl files... someday write this as an exit hook
    # in case of a KeyboardInterrupt or anything
    for model in sorted_res:
        model._clear_cached_state()

    return _return_wrapper(sorted_res, return_valid_fits, start, trace)
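# The search routine above appears to be the internals of pmdarima's
# auto_arima; a minimal usage sketch of the public entry point (illustrative
# parameter values, assuming the pmdarima package is installed):
import pmdarima as pm

y = pm.datasets.load_wineind()              # monthly series bundled with pmdarima
fit = pm.auto_arima(y,
                    start_p=1, start_q=1, max_p=3, max_q=3,
                    m=12, seasonal=True,    # monthly seasonality
                    d=None, D=None,         # let ndiffs/nsdiffs choose d and D
                    stepwise=True,          # Hyndman-Khandakar stepwise search
                    suppress_warnings=True, trace=False)
print(fit.summary())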
Ejemplo n.º 22
0
data.head()

# TO VISUALISE DATA
fig, axs = plt.subplots(1, 3, sharey=True)
data.plot(kind='scatter', x='TV', y='Sales', ax=axs[0], figsize=(14, 7))
data.plot(kind='scatter', x='Radio', y='Sales', ax=axs[1])
data.plot(kind='scatter', x='Newspaper', y='Sales', ax=axs[2])

# CREATING X&Y FOR LINEAR REGRESSION
feature_cols = ['TV']
X = data[feature_cols]
Y = data.Sales

#IMPORTING LINEAR REGRESSION ALGO FOR SIMPLE LINEAR REGRESSION
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X, Y)

print(lr.intercept_)
print(lr.coef_)

# manual prediction at TV = 50 using the hard-coded intercept and slope printed above
result = 6.97 + 0.0554 * 50
print(result)

#CREATE A DATAFRAME WITH THE MIN AND MAX VALUES OF THE TV COLUMN
X_new = pd.DataFrame({'TV': [data.TV.min(), data.TV.max()]})
X_new.head()

preds = lr.predict(X_new)
preds
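# A likely next step (not in the original excerpt): draw the fitted
# least-squares line over the TV/Sales scatter using the two endpoint
# predictions computed above.
data.plot(kind='scatter', x='TV', y='Sales')
plt.plot(X_new, preds, c='red', linewidth=2)
plt.show()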
Ejemplo n.º 23
0
def main(regressor="random_forest"):
    """
    The main method
    """
    # Fetch data from internet
    data = fetch_and_load_data()

    # Process median_income into categories
    data["income_cat"] = np.ceil(data["median_income"] / 1.5)
    data["income_cat"].where(data["income_cat"] < 5, 5.0, inplace=True)

    # Split data into training and testing sets
    train_data, test_data = split_train_test_stratified(data, "income_cat")

    # Extract labels and housing data
    housing_labels = train_data["median_house_value"].copy()
    housing = train_data.drop("median_house_value", axis=1)

    # split housing into categorical and numerical data
    # cat_attributes = ["ocean_proximity", "income_cat"]
    cat_attributes = ["ocean_proximity"]
    num_attributes = ['longitude', 'latitude', 'housing_median_age',
                      'total_rooms', 'total_bedrooms', 'population',
                      'households', 'median_income']
 
    # Set up pipeline to prepare data with.
    full_pipeline = setup_pipeline(num_attributes, cat_attributes)

    # Prepare the data
    housing_prepared = full_pipeline.fit_transform(housing)

    print()

    # Select the appropriate regressor
    if regressor == "linear":
        reg_model = LinearRegression()
        reg_name = "Linear Regressor"
    elif regressor == "random_forest":
        reg_model = RandomForestRegressor()
        reg_name = "Random Forest Regressor"
    elif regressor == "decision_tree":
        reg_model = DecisionTreeRegressor()
        reg_name = "Decision Tree Regressor"
    elif regressor == "svr":
        reg_model = SVR(kernel="linear", gamma='auto')
        reg_name = "Support Vector Machine"
    else:
        error_mes = "Regressor '{regressor}' not recognised."
        raise ValueError(error_mes.format(regressor=regressor))

    # Train regression model
    reg_model.fit(housing_prepared, housing_labels)
    display_model_performance(reg_model,
                              housing_prepared,
                              housing_labels,
                              reg_name)

    if regressor == "random_forest":
        # Fine tune Random Forest
        param_grid = [
                {'n_estimators': [50, 100, 1000], 'max_features': [2, 4, 6, 8]},
                {'bootstrap': [False], 'n_estimators': [50, 100, 1000], 'max_features': [2, 4, 6]}]
        final_model = fine_tune_model(RandomForestRegressor(),
                                      param_grid,
                                      housing_prepared,
                                      housing_labels)
        # Get the best model weights
        print()
        print("Attribute weights:")
        feature_importances = final_model.feature_importances_
        print_attribute_importances(feature_importances, num_attributes, full_pipeline)
    elif regressor == "linear":
        final_model = reg_model
        print("Coefficients used by linear model:")
        coeffs = final_model.coef_
        print_attribute_importances(coeffs, num_attributes, full_pipeline)
    elif regressor == "decision_tree":
        # Fine tune Decision Tree
        # note: newer scikit-learn (>= 1.0) names these criteria "squared_error",
        # "friedman_mse" and "absolute_error"
        param_grid = [{'criterion': ["mse", "friedman_mse", "mae"]}]
        final_model = fine_tune_model(DecisionTreeRegressor(),
                                      param_grid,
                                      housing_prepared,
                                      housing_labels)
    elif regressor == "svr":
        param_grid = [
                {'kernel': ["linear"], "C": [10000, 100000]},
                {'kernel': ["rbf"], "C": [10000, 100000],
                    "gamma": [0.045, 0.05, 0.055]}]
        final_model = fine_tune_model(SVR(),
                                      param_grid,
                                      housing_prepared,
                                      housing_labels)
    else:
        final_model = reg_model

    print()

    # Evaluate on test set
    X_test = test_data.drop("median_house_value", axis=1)
    y_test = test_data["median_house_value"].copy()

    X_test_prepared = full_pipeline.transform(X_test)
    final_predictions = final_model.predict(X_test_prepared)

    final_mse = mean_squared_error(y_test, final_predictions)
    final_rmse = np.sqrt(final_mse)
    print("Final Standard Error:", final_rmse)
def validate():
    """
    Run 10-fold cross-validation for the linear regression surge model.
    """
    #defining directories    
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/merraLRValidation"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    
    #cd to the lagged predictors directory
    os.chdir(dir_in)
    
    
    x = 824
    y = 825
    
    #empty dataframe for model validation
    df = pd.DataFrame(columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse'])
    
    #looping through 
    for tg in range(x,y):
        
        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)
        
        ##########################################
        #check if this tg is already taken care of
        ##########################################
        os.chdir(dir_out)
        if os.path.isfile(tg_name):
            return "file already analyzed!"
        
        
        os.chdir(dir_in)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis = 1)

        #standardize predictor data
        dat = pred.iloc[:,1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis = 1)
        
    
        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis = 1, inplace = True)
        
        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index, axis = 0, inplace = True)
        surge.reset_index(inplace = True)
        surge.drop('index', axis = 1, inplace = True)
        
        
        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])), columns = ['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]], axis = 1)
    
        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized, surge_new.iloc[:,:2], on='date', how='right')
        pred_surge.sort_values(by = 'date', inplace = True)
        
        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis =1)]
        pred_surge.drop(row_nan.index, axis = 0, inplace = True)
        pred_surge.reset_index(inplace = True)
        pred_surge.drop('index', axis = 1, inplace = True)
        
        
        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-'*80)
            print("Predictors and Surge don't overlap")
            print('-'*80)
            continue
        
     
        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])
        
        #prepare data for training/testing
        X = pred_surge.iloc[:,1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis = 1, inplace = True)
        
        #apply PCA
        pca = PCA(.95)
        pca.fit(X)
        X_pca = pca.transform(X)
        
        #apply 10 fold cross validation
        kf = KFold(n_splits=10)  # random_state only takes effect with shuffle=True (newer scikit-learn raises otherwise)
        
        metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs'])
        for train_index, test_index in kf.split(X):
            X_train, X_test = X_pca[train_index], X_pca[test_index]
            y_train, y_test = y['surge'][train_index], y['surge'][test_index]
            
            #train regression model
            lm = LinearRegression()
            lm.fit(X_train, y_train)
            
            #predictions
            predictions = lm.predict(X_test)
            # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #                       pd.DataFrame(np.array(y_test))], \
            #                      axis = 1)
            # pred_obs.columns = ['pred', 'obs']
            # combo = pd.concat([combo, pred_obs], axis = 0)    
            
            #evaluation matrix - check p value
            if stats.pearsonr(y_test, predictions)[1] >= 0.05:
                print("insignificant correlation!")
                continue
            else:
                print(stats.pearsonr(y_test, predictions))
                metric_corr.append(stats.pearsonr(y_test, predictions)[0])
                print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
                metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
            
        
        #number of years used to train/test model
        num_years = (pred_surge['date'][pred_surge.shape[0]-1] -\
                             pred_surge['date'][0]).days/365
        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1] #number of principal components
        corr = np.mean(metric_corr)
        rmse = np.mean(metric_rmse)
        
        print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',np.mean(metric_corr), ' -  avg_rmse (m) = ', \
              np.mean(metric_rmse), '\n')
        
        #original size and pca size of matrix added
        new_df = pd.DataFrame([tg_name, longitude, latitude, num_years, num_pc, corr, rmse]).T
        new_df.columns = ['tg', 'lon', 'lat', 'num_year', \
                                 'num_95pcs','corrn', 'rmse']
        df = pd.concat([df, new_df], axis = 0)
        
        
        #save df as cs - in case of interruption
        os.chdir(dir_out)
        df.to_csv(tg_name)
        
        #cd to dir_in
        os.chdir(dir_in)
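# A self-contained sketch of the core pattern used in validate() above
# (standardize -> PCA keeping 95% of the variance -> 10-fold linear regression),
# shown on synthetic data since the original file paths are site-specific.
import numpy as np
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = rng.normal(size=(500, 40))
y = X[:, :3].sum(axis=1) + rng.normal(scale=0.1, size=500)

X_std = StandardScaler().fit_transform(X)
X_pca = PCA(0.95).fit_transform(X_std)       # keep 95% of the variance

corrs, rmses = [], []
for train_idx, test_idx in KFold(n_splits=10, shuffle=True, random_state=29).split(X_pca):
    lm = LinearRegression().fit(X_pca[train_idx], y[train_idx])
    pred = lm.predict(X_pca[test_idx])
    corrs.append(stats.pearsonr(y[test_idx], pred)[0])
    rmses.append(np.sqrt(mean_squared_error(y[test_idx], pred)))

print(np.mean(corrs), np.mean(rmses))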
#train_x = np.reshape(train_x,(-1,2))
#print(train_x)
#train_x2 = data["Y"][:-2].values
#train_x2 = np.reshape(train_x2,(-1,1))
# train_x is not defined in this excerpt; a likely definition, mirroring test_x below:
train_x = data[["X","Y"]][:-2].values.reshape(-1,2)
train_y = data["Expected_output"][:-2].values.reshape(-1,1)
#train_y = np.reshape(train_y,(-1,1))

#test_x = pd.DataFrame(data,columns = data[["X","Y"]][-2:].values)
test_x = data[["X","Y"]][-2:].values.reshape(-1,2)
#test_x = np.reshape(test_x,(-1,2))
#test_x2 = data["Y"][-2:].values
#test_x2 = np.reshape(test_x2,(-1,1))
test_y = data["Expected_output"][-2:].values.reshape(-1,1)
#test_y = np.reshape(test_y,(-1,1))

#print(test_x["X"])
model = LinearRegression()
model.fit(train_x,train_y)

coeff = model.coef_
intercept = model.intercept_
# fitted values on the training rows (both features contribute to the fit)
points = [intercept + coeff[0][0]*i[0] + coeff[0][1]*i[1] for i in train_x]
plt.plot(points,"ro")
predict_y = model.predict(test_x)
# compare the two held-out targets with their predictions
plt.plot(test_y,predict_y,"b*")

print(predict_y)
plt.show()
#intercept =
#points =
Ejemplo n.º 26
0
def imputation(data):
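    """Impute missing values column by column.

    For each column with missing values (most-missing first), fit both a
    random-forest model and a linear/logistic regression on the rows where
    that column is present, keep whichever scores higher on the training
    data, and use it to predict the missing entries.
    """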

    data = basic_eda(data)

    count_of_null = data.isnull().sum()
    percent_of_missing = data.isnull().sum() * 100 / len(data)
    missing_value_data = pd.DataFrame(
        {'percent_missing': percent_of_missing, 'Count_of_Missing_Values ': count_of_null})

    global numerical_column_names
    global categorical_column_names

    numerical_column_names, categorical_column_names = num_cat_separation(data)

    global data_null_treated
    data_null_treated = data.copy()
    label_encoder = LabelEncoder()

    cols_to_be_imputed = missing_value_data[missing_value_data['percent_missing'] > 0].sort_values(
        'percent_missing', ascending=False).index
    cols_to_be_imputed = list(cols_to_be_imputed)
    # if target in cols_to_be_imputed:
    #     cols_to_be_imputed.remove(target)

    Imputed_column_array = []
    for i in cols_to_be_imputed:

        data_dup = data_null_treated.copy()

        # Replace columns having below 2 percent missing values with the median (numeric) or mode (categorical)

        below_2_percent_columns = missing_value_data[missing_value_data['percent_missing'] < 2].index
        below_2_percent_columns = list(below_2_percent_columns)
        if i in below_2_percent_columns:
            below_2_percent_columns.remove(i)

        for j in below_2_percent_columns:

            if j in numerical_column_names:
                data_dup[j] = data_dup[[j]].apply(
                    lambda x: x.fillna(x.median()), axis=0)
            else:
                data_dup[j] = data_dup[[j]].apply(
                    lambda x: x.fillna(data_dup[j].value_counts().index.max()))

        # Separating rows without null for train
        data_dup_train = data_dup[data_dup[i].isna() == False]

        data_dup_train_copy = data_dup_train.copy()

        # Dropping null values in other columns
        data_dup_train = data_dup_train.dropna()

        # Separating rows with null for test
        data_dup_test = data_dup[data_dup[i].isna()]

        # Removing columns having above 15 percent missing values

        above_15_percent_columns = missing_value_data[missing_value_data['percent_missing'] > 15].index
        above_15_percent_columns = list(above_15_percent_columns)
        if i in above_15_percent_columns:
            above_15_percent_columns.remove(i)
            
        data_dup_train = data_dup_train.drop(above_15_percent_columns, axis=1)
        data_dup_test = data_dup_test.drop(above_15_percent_columns, axis=1)

        # Train test split

        x_test = data_dup_test.drop(i, axis=1)
        x_test = pd.get_dummies(x_test, drop_first=True)
        x_test_columns = x_test.columns
        for k in x_test_columns:
            if x_test[k].dtype == 'float64':
                x_test[k] = x_test[[k]].apply(
                    lambda x: x.fillna(x.median()), axis=0)
            else:
                x_test[k] = x_test[[k]].apply(lambda x: x.fillna(
                    x_test[k].value_counts().index.max()))

        x_train = data_dup_train.drop(i, axis=1)
        x_train = pd.get_dummies(x_train, drop_first=True)
        x_train = x_train[x_test.columns]

        y_train = data_dup_train[[i]]
        target_was_encoded = False
        if y_train[i].dtype == 'O':
            y_train[i] = label_encoder.fit_transform(y_train[i])
            y_train[[i]] = y_train[[i]].astype('int')
            target_was_encoded = True

        # Building model
        if i in numerical_column_names:
            model_rf = RandomForestRegressor(n_estimators=100, max_depth=6)
        else:
            model_rf = RandomForestClassifier(n_estimators=100, max_depth=6)

        model_rf.fit(x_train, y_train)
        rf_score = model_rf.score(x_train, y_train)
        print('RandomForest Score :', rf_score)

        if i in numerical_column_names:
            model_lr = LinearRegression()
        else:
            model_lr = LogisticRegression()

        model_lr.fit(x_train, y_train)
        lr_score = model_lr.score(x_train, y_train)
        print('\nLinear/Logistic Regression Score :', lr_score)

        # Checking which model is better
        if rf_score > lr_score:
            print('\nFor', i, ' RandomForest performs better. So we will go with this.\n')
            model = model_rf
            Imputed_column_array.append({i: 'Random Forest'})
        else:
            print(
                '\n\nFor', i, ' Linear/Logistic Regression performs better. So we will go with this.')
            model = model_lr
            Imputed_column_array.append({i: 'Logistic Regression'})

        prediction = model.predict(x_test)
        print(prediction.dtype, '\n\n')
        # the original dtype check ('int32') is platform dependent; rely on the
        # flag set when the target was label-encoded instead
        if target_was_encoded:
            prediction = label_encoder.inverse_transform(prediction)

        prediction_df = pd.DataFrame(prediction)
        #print('\n\n Predicted count of ', i , '  :' , prediction_df[0].value_counts())

        data_dup_test = data_dup_test.drop(i, axis=1)

        data_dup_test[i] = prediction

        data_dup_complete = pd.concat([data_dup_train_copy, data_dup_test])

        data_dup_complete = data_dup_complete.sort_index()

        predicted = data_dup_complete[[i]]

        data_null_treated = data_null_treated.drop(i, axis=1)

        data_null_treated[i] = predicted

        #feature_selection(data_null_treated, target)

    return (Imputed_column_array, data_null_treated)
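# Hypothetical usage (added): assuming basic_eda and num_cat_separation are the
# project's own helpers defined elsewhere, the call would look like
#   imputed_with, clean_df = imputation(raw_df)
# where imputed_with records, per imputed column, which of the two models
# produced the fill values.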
# In[ ]:


X_train = train.drop("Survived", axis=1)
Y_train = train["Survived"]
X_test  = test.drop("PassengerId", axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape


# In[ ]:


# The set of models I am going to compare
models = [
            LinearRegression(),
            LogisticRegressionCV(),
            Perceptron(),
            GaussianNB(),
            KNeighborsClassifier(),
            SVC(probability=True),
            DecisionTreeClassifier(),
            AdaBoostClassifier(),
            RandomForestClassifier(),
            XGBClassifier()    
        ]

# Create a table of comparison for models
models_columns = ['Name', 'Parameters','Train Accuracy', 'Validation Accuracy', 'Execution Time']
models_df = pd.DataFrame(columns = models_columns)
predictions = pd.DataFrame(columns = ['Survived'])
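# A hedged sketch (not in the original notebook) of the loop the comparison
# table above is set up for; it assumes X_train/Y_train are fully numeric as
# prepared earlier and that each model exposes the standard scikit-learn API.
# Note that cross_val_score reports R^2 for LinearRegression and accuracy for
# the classifiers, so the 'Validation Accuracy' column mixes metrics.
import time
from sklearn.model_selection import cross_val_score

for i, model in enumerate(models):
    start_time = time.time()
    cv_scores = cross_val_score(model, X_train, Y_train, cv=5)
    models_df.loc[i] = [model.__class__.__name__,
                        str(model.get_params()),
                        model.fit(X_train, Y_train).score(X_train, Y_train),
                        cv_scores.mean(),
                        time.time() - start_time]

print(models_df.sort_values('Validation Accuracy', ascending=False))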
Ejemplo n.º 28
0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('Position_Salaries.csv')
df.head()

X = df.iloc[:, 1:2].values
y = df.iloc[:, -1].values

#fitting linear regression model
from sklearn.linear_model import LinearRegression

linear_reg = LinearRegression()
linear_reg.fit(X, y)

#fitting polynomial regression model
from sklearn.preprocessing import PolynomialFeatures

poly_reg = PolynomialFeatures(degree=5)
X_poly = poly_reg.fit_transform(X)
X_poly

lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly, y)

#visualising the linear model
plt.scatter(X, y, color='red')
plt.plot(X, linear_reg.predict(X), color='blue')
plt.title('LinearModel')
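# The snippet is cut off after the linear plot's title; a likely continuation
# (hedged reconstruction of the usual Position_Salaries walkthrough) finishes
# that figure and then visualises the degree-5 polynomial fit.
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()

# visualising the polynomial model
plt.scatter(X, y, color='red')
plt.plot(X, lin_reg_2.predict(poly_reg.fit_transform(X)), color='blue')
plt.title('Polynomial Model')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()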
gsFeatureSelector = RFECV(gs_b.best_estimator_, cv = 5).fit(X_train,Y_train)
gsX = gsFeatureSelector.transform(X_train)
gsFeatureSupport = gsFeatureSelector.support_

gs_a = GridSearchCV(
    param_grid = {'min_samples_leaf':np.linspace(5,55,10).astype(int),
                  'min_samples_split':np.linspace(5,55,10).astype(int)},
    estimator = RandomForestClassifier(n_estimators=1000),
    scoring = 'accuracy')
gs_a.support = gsFeatureSupport
gs_a.selector = gsFeatureSelector
gs_a.fit(gsX,Y_train)#Train the model

#%%
# Linear Regression model and feature selection
linearRegression = LinearRegression()
linearFeatureSelector = RFECV(linearRegression, cv = 5).fit(X_train,Y_train)
LinearX = linearFeatureSelector.transform(X_train)
linearFeatureSupport = linearFeatureSelector.support_
# store selector in Linear Regression Model
linearRegression.support = linearFeatureSupport
linearRegression.selector = linearFeatureSelector

# train Linear Regression Model
linearRegression.fit(LinearX,Y_train)
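# Note (added): the selector stored on the model can be reused at prediction
# time, e.g. linearRegression.predict(linearRegression.selector.transform(X_test))
# for some held-out X_test with the same columns as X_train.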

# %%

'''
We can choose whether or not to load a previously saved model here.
'''
Ejemplo n.º 30
0
def nn_linear_regression(AMDs_train, AMDs_test, energy_train, energy_test):
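    """Randomly partition the ATTRIBUTE_NUM feature columns into seven groups,
    scale each group by a fixed weight from weight_list, and fit an ordinary
    linear regression on the reweighted AMDs. The loop below currently runs a
    single iteration before breaking.
    """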
    mse_min = 100
    r2_max = 0
    total_number = ATTRIBUTE_NUM
    weight_list = [0.008, 0.04, 0.2, 1, 5, 25, 125]
    import random
    index = 0  # attempt counter; 'index' was not defined in the original but is used in the savefig name below
    while True:
        remains = total_number
        center_number = random.randint(0, remains // 2) * 2
        remains -= center_number
        number_1 = random.randint(0, remains // 2) * 2
        remains -= number_1
        number_2 = random.randint(0, remains // 2) * 2
        remains -= number_2
        number_3 = remains

        center_list = []
        list_1 = []
        list_2 = []
        list_3 = []
        for i in range(center_number):
            temp = random.randint(0, total_number - 1)
            while temp in center_list:
                temp = random.randint(0, total_number - 1)
            center_list.append(temp)

        list_1_1 = []
        list_1_2 = []
        for i in range(number_1):
            temp = random.randint(0, total_number - 1)
            while (temp in center_list) \
                    or (temp in list_1):
                temp = random.randint(0, total_number - 1)
            list_1.append(temp)
            if i % 2 == 0:
                list_1_1.append(temp)
            else:
                list_1_2.append(temp)

        list_2_1 = []
        list_2_2 = []
        for i in range(number_2):
            temp = random.randint(0, total_number - 1)
            while (temp in center_list) \
                    or (temp in list_1) \
                    or (temp in list_2):
                temp = random.randint(0, total_number - 1)
            list_2.append(temp)
            if i % 2 == 0:
                list_2_1.append(temp)
            else:
                list_2_2.append(temp)

        for i in range(total_number):
            if i not in center_list \
                    and i not in list_1 \
                    and i not in list_2:
                list_3.append(i)
        list_3_1 = list_3[0:number_3 // 2]
        list_3_2 = list_3[number_3 // 2:number_3]

        new_AMDs_train = []
        for row in AMDs_train:
            temp_row = []
            for i in range(total_number):
                if i in center_list:
                    temp_row.append(weight_list[3] * row[i])
                elif i in list_1_2:
                    temp_row.append(weight_list[4] * row[i])
                elif i in list_1_1:
                    temp_row.append(weight_list[2] * row[i])
                elif i in list_2_2:
                    temp_row.append(weight_list[5] * row[i])
                elif i in list_2_1:
                    temp_row.append(weight_list[1] * row[i])
                elif i in list_3_2:
                    temp_row.append(weight_list[6] * row[i])
                elif i in list_3_1:
                    temp_row.append(weight_list[0] * row[i])
                else:
                    print("index not found: ", i)
            new_AMDs_train.append(temp_row)

        new_AMDs_test = []
        for row in AMDs_test:
            temp_row = []
            for i in range(total_number):
                if i in center_list:
                    temp_row.append(weight_list[3] * row[i])
                elif i in list_1_2:
                    temp_row.append(weight_list[4] * row[i])
                elif i in list_1_1:
                    temp_row.append(weight_list[2] * row[i])
                elif i in list_2_2:
                    temp_row.append(weight_list[5] * row[i])
                elif i in list_2_1:
                    temp_row.append(weight_list[1] * row[i])
                elif i in list_3_2:
                    temp_row.append(weight_list[6] * row[i])
                elif i in list_3_1:
                    temp_row.append(weight_list[0] * row[i])
                else:
                    print("index not found: ", i)
            new_AMDs_test.append(temp_row)

        # Linear regression
        # note: 'normalize' was removed in scikit-learn 1.2; scale the inputs
        # beforehand (e.g. with StandardScaler) if normalization is needed
        linreg = LinearRegression(n_jobs=-1)
        linreg.fit(new_AMDs_train, energy_train)
        energy_pred = linreg.predict(new_AMDs_test)
        mse = round(mean_squared_error(energy_test, energy_pred), 4)
        r2 = round(r2_score(energy_test, energy_pred), 4)

        print("MSR of weighted linear regression is: ", mse)
        print("r2 is ", r2)

        fig, ax = plt.subplots()
        ax.scatter(energy_test, energy_pred)
        ax.plot([np.min(energy_test), np.max(energy_test)],
                [np.min(energy_test), np.max(energy_test)],
                'k--',
                lw=4)
        ax.set_xlabel('Given')
        ax.set_ylabel('Predicted')
        plt.savefig('./image/wlin_' + str(index) + '.jpg')
        break
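        # Note (added): to turn this into a real random search, compare mse/r2
        # with mse_min/r2_max defined above, keep the best weighting found so
        # far, and replace the unconditional break with a stopping criterion
        # (e.g. a maximum number of trials).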