def test_pickle():
    tmpdir = tempfile.mkdtemp(prefix="pickle")
    a = lrange(10)

    # test with str
    path_str = tmpdir + "/res.pkl"
    save_pickle(a, path_str)
    b = load_pickle(path_str)
    assert_equal(a, b)

    # test with pathlib
    path_pathlib = pathlib.Path(tmpdir) / "res2.pkl"
    save_pickle(a, path_pathlib)
    c = load_pickle(path_pathlib)
    assert_equal(a, c)

    # cleanup, tested on Windows
    try:
        import os
        os.remove(path_str)
        os.remove(path_pathlib)
        os.rmdir(tmpdir)
    except (OSError, IOError):
        pass
    assert not os.path.exists(tmpdir)

    # test with file handle
    fh = BytesIO()
    save_pickle(a, fh)
    fh.seek(0, 0)
    d = load_pickle(fh)
    fh.close()
    assert_equal(a, d)
def test_pickle():
    import tempfile
    from numpy.testing import assert_equal
    tmpdir = tempfile.mkdtemp(prefix='pickle')
    a = range(10)
    save_pickle(a, tmpdir + '/res.pkl')
    b = load_pickle(tmpdir + '/res.pkl')
    assert_equal(a, b)

    # cleanup, tested on Windows
    try:
        import os
        os.remove(tmpdir + '/res.pkl')
        os.rmdir(tmpdir)
    except (OSError, IOError):
        pass
    assert not os.path.exists(tmpdir)

    # test with file handle
    from statsmodels.compatnp.py3k import BytesIO
    fh = BytesIO()
    save_pickle(a, fh)
    fh.seek(0, 0)
    c = load_pickle(fh)
    fh.close()
    assert_equal(a, c)  # compare against the handle round-trip, not b
def test_pickle():
    import tempfile
    from numpy.testing import assert_equal
    tmpdir = tempfile.mkdtemp(prefix='pickle')
    a = lrange(10)
    save_pickle(a, tmpdir + '/res.pkl')
    b = load_pickle(tmpdir + '/res.pkl')
    assert_equal(a, b)

    # cleanup, tested on Windows
    try:
        import os
        os.remove(tmpdir + '/res.pkl')
        os.rmdir(tmpdir)
    except (OSError, IOError):
        pass
    assert not os.path.exists(tmpdir)

    # test with file handle
    fh = BytesIO()
    save_pickle(a, fh)
    fh.seek(0, 0)
    c = load_pickle(fh)
    fh.close()
    assert_equal(a, c)  # compare against the handle round-trip, not b
def main():
    """Entry point for building the feature scorecard."""
    odds = config.get('SCORECARD', 'ODDS')
    score = config.get('SCORECARD', 'SCORE')
    pdo = config.get('SCORECARD', 'PDO')
    feature_engineering = load_pickle("result/feature_engineering.pickle")
    woe_result = feature_engineering["woe_result"]
    model = load_pickle("result/lr.pickle")
    coefficient = list(
        zip(feature_engineering["feature_selected"], list(model.coef_[0])))
    coefficient.append(("intercept_", model.intercept_[0]))
    coefs = dict(coefficient)
    make_card(coefs, woe_result, odds, score, pdo)
def run_forcast(ts, config):
    """Run the forecast and return a pandas DataFrame with all the
    historical values plus the predicted ones.

    There are two different modes depending on usage. If you need to refit
    the model, add a `clean` entry set to True to the config dict.

    Args:
        ts (pandas data frame): ts means time series. There is no strong
            hypothesis on the data frame except the existence of a minimum
            of 2 columns: ds (datetime) and y (numeric).
        config (dictionary): a dictionary with a minimum of 3 entries. It
            may also carry model-dependent entries or optional technical
            entries.
            model (string): the model name
            date (datetime): the date of the training as a timestamp with millis
            id_model (string): model identifier to make the model unique
            prediction_conf (dictionary):
                future_period (int): number of predictions to perform;
                    interpreted as daily, monthly or yearly periods
                    according to freq
                freq (string): 'D' for daily, 'M' for monthly, 'Y' for yearly
            tech_conf (dictionary) [optional]:
                clean (bool): clean all existing models and refit. This
                    option removes every existing model; needs improvement
                    to perform an ad hoc clean.

    Returns:
        forecasted result (pandas data frame): the data frame with new
            columns: yhat (predicted values), yhat_lower and yhat_upper
            (uncertainty), trend, seasonality... to complete.

    Raises:
        XXXXXX

    See usage.py for a demo."""
    # Try to get the model
    model_ref_prefix = tb.compute_model_id_hash(config)
    model_ref_name = tb.get_model_ref_name(model_ref_prefix,
                                           tempfile.gettempdir())
    clean = ('tech_conf' in config and 'clean' in config['tech_conf']
             and config['tech_conf']['clean'])
    if model_ref_name is None or clean:
        logger.info(
            "Fitted model doesn't exist or is not up to date. We will create one")
        ref_name, m_ref = train_for_forcasting(ts, config)
    else:
        logger.info("Get the stored model")
        m_ref = pick.load_pickle(
            os.path.join(tempfile.gettempdir(), model_ref_name))
    period = config['prediction_conf']['future_period']
    freq = config['prediction_conf']['freq']
    future = tb.make_future(ts, period, freq)
    forcasted_result = dtt.back_to_origin(m_ref.predict(future))
    return forcasted_result
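# A minimal usage sketch for run_forcast, reconstructed from the docstring
# above; the column values, model name, and config entries shown here are
# assumptions, not taken from usage.py.
import datetime
import pandas as pd

ts = pd.DataFrame({
    "ds": pd.date_range("2020-01-01", periods=100, freq="D"),
    "y": range(100),
})
config = {
    "model": "prophet",                    # assumed model name
    "date": datetime.datetime.now(),       # training timestamp
    "id_model": "demo-model",              # assumed unique identifier
    "prediction_conf": {"future_period": 30, "freq": "D"},
    "tech_conf": {"clean": True},          # force a refit
}
forecasted = run_forcast(ts, config)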
def predicted_sneaker_resale(form_dict):
    repo_path = Path(os.getcwd())
    lm = load_pickle(repo_path / "prod-models" / "resale_predictor.pickle")
    try:
        df = pd.DataFrame(form_dict, index=[0])
        df['retail'] = df['retail'].astype(int)
        df['log_retail'] = np.log(df['retail'])
        df['date'] = pd.to_datetime(df['date'])
        df['release_month'] = df['date'].dt.month
        df['release_dow'] = df['date'].dt.weekday
        month = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
                 7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
        dow = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday',
               4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
        df['release_month'] = df['release_month'].map(month)
        df['release_dow'] = df['release_dow'].map(dow)
        wmns = {'Male': 0, 'Female': 1}
        df['wmns'] = df['wmns'].map(wmns)
        bools = {'No': 0, 'Yes': 1}
        df['collab'] = df['collab'].map(bools)
        df['retro'] = df['retro'].map(bools)
        df['kids'] = df['kids'].map(bools)
        sname = form_dict['name']
        pred_resale_price = round(np.exp(lm.predict(df)[0]))
        retail = int(form_dict['retail'])
        diff = round(pred_resale_price - retail)
        pp = round(((pred_resale_price - retail) / pred_resale_price) * 100, 2)
        return ("The {0} is expected to resell for ${1}! This is a projected "
                "{2} dollar difference from the original ${4} retail price "
                "and a {3}% price premium.").format(
                    sname, pred_resale_price, diff, pp, retail)
    except Exception:
        return ('Sorry, there was a problem processing the data entered... '
                'Please go back and double check your entries, thanks!')
def test_pickle_supports_open():
    tmpdir = tempfile.mkdtemp(prefix="pickle")
    a = lrange(10)

    class SubPath:
        def __init__(self, path):
            self._path = pathlib.Path(path)

        def open(
            self,
            mode="r",
            buffering=-1,
            encoding=None,
            errors=None,
            newline=None,
        ):
            return self._path.open(
                mode=mode,
                buffering=buffering,
                encoding=encoding,
                errors=errors,
                newline=newline,
            )

    # test with an object exposing an open() method
    # (os.path.join, not os.pathsep, is the correct way to build the path)
    path_pathlib = SubPath(os.path.join(tmpdir, "res2.pkl"))
    save_pickle(a, path_pathlib)
    c = load_pickle(path_pathlib)
    assert_equal(a, c)
def load_model():
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                        "MLR_model_v2.pickle")
    from statsmodels.iolib.smpickle import load_pickle
    model = load_pickle(path)
    return model
def processed_feature(mode, read_local=True, do_transform=True, woe=True):
    """Entry point for model-training feature processing."""
    assert os.path.exists(
        'result/feature_engineering.pickle') if do_transform else True
    data = load_data(mode=mode, read_local=read_local)
    numeric_var, category_var, datetime_var, y_var, identifier_var, text_var = \
        get_variable_type()
    feature = data[numeric_var + category_var + datetime_var + text_var]
    y = data[y_var] if mode == "train" else pd.Series(
        [np.nan] * len(feature), name=y_var)
    # custom data cleaning
    cs = Custom()
    feature, y = cs.clean_data(feature, y)
    # dtype conversion
    feature[numeric_var] = feature[numeric_var].astype('float')
    feature[category_var] = feature[category_var].astype('category')
    feature[datetime_var] = feature[datetime_var].astype('datetime64[ns]')
    # custom feature combination; all resulting features are numeric
    feature = cs.feature_combination(feature, y)
    # data transformation
    if do_transform:
        feature_engineering = load_pickle('result/feature_engineering.pickle')
        feature = feature[feature_engineering['feature_selected']]
        feature = transform(feature, feature_engineering, woe=woe)
    return feature, y
def forecast(dataDir):
    # Load the model
    results_ARIMA = load_pickle(dataDir + 'model/results_ARIMA.pickle')

    # Forecast the next 15 days' prices
    dt = datetime.datetime.now()
    base = datetime.date(dt.year, dt.month, dt.day)
    numdays = 15
    dates = [pd.Timestamp(base + datetime.timedelta(days=x))
             for x in range(1, numdays + 1)]
    forecast = pd.Series(results_ARIMA.forecast(steps=numdays)[0], dates)
    forecast = np.exp(forecast)  # undo the log transform applied at fit time
    print(forecast)
    forecast.to_csv(dataDir + 'forecast/forecast.csv',
                    index_label='time', header=['price'])
def main(mode, model_path, do_transform, read_local, woe, score=False,
         save_remote=False):
    # load the data and the model
    model, threshold = load_pickle(model_path)
    data = load_data(mode=mode, read_local=read_local)
    feature, y = processed_feature(do_transform=do_transform, mode=mode,
                                   read_local=read_local, woe=woe)
    cs = Custom()
    # run the prediction
    print(">>> predicting")
    res_label = pd.DataFrame(model.predict(feature),
                             columns=['label_predict'])
    res_prob = pd.DataFrame(model.predict_proba(feature),
                            columns=['probability_0', "probability_1"])
    res_prob['res_odds'] = (res_prob['probability_0']
                            / res_prob["probability_1"])
    res_prob['label_threshold'] = res_prob['probability_1'].apply(
        lambda x: 0 if x < threshold else 1)
    res = pd.concat([data, res_label, res_prob], axis=1)
    if score:
        print(">>> converting probabilities to scores")
        odds = config.get('SCORECARD', 'odds')
        score = config.get('SCORECARD', 'score')
        pdo = config.get('SCORECARD', 'pdo')
        a, b = make_score(odds, score, pdo)
        res['score'] = res_prob['res_odds'].apply(
            lambda x: a + b * log(float(x)))
        bins = tree_binning(res[y.name], res['score'].to_frame(
        ))[0]["result"]["score"] if mode == "train" else cs.adjust_bins
        if bins:
            print(">>> binning the dataset")
            res['level'] = pd.cut(res['score'], bins)
            temp = res.groupby("level", as_index=False).count()
            temp['rate'] = temp['label_threshold'] / feature.shape[0]
            temp = temp[['level', 'rate']]
            print(temp)
    print(res.head())
    # save the results
    print(f">>> saving results, remote mode: {save_remote}")
    res['load_date'] = str(date.today())
    save_result(res, filename=f"{mode}_result.csv", remote=save_remote)
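# Hedged sketch of the calibration make_score could return, assuming the
# standard PDO (points to double the odds) convention behind the
# score = a + b * log(odds) line above; the project's real make_score is
# defined elsewhere, so this reconstruction is an assumption.
from math import log

def make_score_sketch(odds, score, pdo):
    # at the reference odds the score equals `score`; every doubling of the
    # good/bad odds adds `pdo` points
    odds, score, pdo = float(odds), float(score), float(pdo)
    b = pdo / log(2)
    a = score - b * log(odds)
    return a, b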
def test_pickle():
    tmpdir = tempfile.mkdtemp(prefix='pickle')
    a = lrange(10)
    save_pickle(a, tmpdir + '/res.pkl')
    b = load_pickle(tmpdir + '/res.pkl')
    assert_equal(a, b)

    # cleanup, tested on Windows
    try:
        import os
        os.remove(tmpdir + '/res.pkl')
        os.rmdir(tmpdir)
    except (OSError, IOError):
        pass
    assert not os.path.exists(tmpdir)

    # test with file handle
    fh = BytesIO()
    save_pickle(a, fh)
    fh.seek(0, 0)
    c = load_pickle(fh)
    fh.close()
    assert_equal(a, c)
def load(cls, fname):
    '''
    Load a pickle (class method).

    Parameters
    ----------
    fname : str or file handle
        fname can be a string to a file path or filename, or a file handle.

    Returns
    -------
    unpickled instance
    '''
    from statsmodels.iolib.smpickle import load_pickle
    return load_pickle(fname)
def get_predictions(station_number, number_of_predictions):
    base_dir = os.getcwd() + '/data/'
    try:
        model = load_pickle(base_dir + 'models/' + str(station_number) + '.pkl')
    except FileNotFoundError:
        print("Model not found. Please run this script at least once.")
        return None
    yhat = model.forecast(model.y, steps=number_of_predictions)
    max_vals = []
    columns = []
    try:
        max_val_file_content = list(
            csv.reader(
                open(base_dir + "csv/max_val_dump(do not delete).csv", 'r')))
    except FileNotFoundError:
        print("CSV file for maximum values dumped not found. "
              "Please run the script to create it.")
        return None
    try:
        columns_file_content = list(
            csv.reader(open(base_dir + "csv/cols_dump(do not delete).csv", 'r')))
    except FileNotFoundError:
        print("CSV file for columns dumped not found. "
              "Please run the script to create it.")
        return None
    for line in max_val_file_content:
        if str(line[0]) == str(station_number)[:-1]:
            max_vals = list(map(float, line[1:]))
            break
    # scale predictions back up by the stored per-column maxima
    yhat = yhat * [max_vals]
    for line in columns_file_content:
        if str(line[0]) == str(station_number)[:-1]:
            columns = line[1:]
            break
    cols_to_return = {}
    for key in COLUMNS:
        if key in columns:
            cols_to_return[key] = []
            index = columns.index(key)
            for row in yhat:
                cols_to_return[key].append('{:.2f}'.format(row[index]))
        else:
            cols_to_return[key] = None
    return cols_to_return
def load(cls, fname): """ Load a pickled results instance. Parameters ---------- fname : {str, handle} A string filename or a file handle. Returns ------- Results The unpickled results instance. """ from statsmodels.iolib.smpickle import load_pickle return load_pickle(fname)
def get_prediction(num_arete, hour, minute=0):
    m, s = False, False
    if 7 <= hour <= 9:
        m = True
        hour = hour - 7
    elif 17 <= hour <= 19:
        s = True
        hour = hour - 17
    creneau = 0 if m else 1
    res = load_pickle("../data/Regression/vehicules_aretes_" + str(creneau)
                      + "_n" + str(num_arete) + ".pickle")
    # print(res.summary())
    # print(res.params)
    pred = res.predict(exog=[hour, minute])  # unused; manual computation below
    y_pred = res.params[0] + res.params[1] * (hour * 60 + minute)
    return y_pred
def predict_margins(nfl):
    margin_res = load_pickle("models/margin_res.pickle")
    # ARI is the reference level of the team/opponent factors, so its
    # coefficient is zero
    margin_ari_score = 0
    margin_ari_opp = 0
    margins = []
    for key, row in nfl.iterrows():
        if row.team == "ARI":
            team_coeff = margin_ari_score
        else:
            res_team = "team[T." + row.team + "]"
            team_coeff = margin_res.params[res_team]
        if row.opp == "ARI":
            opp_coeff = margin_ari_opp
        else:
            res_opp = "team[T." + row.opp + "]"
            opp_coeff = margin_res.params[res_opp]
        if row.ha == "away":
            ha_coeff = margin_res.params["ha[T.home]"] * -1
        else:
            ha_coeff = margin_res.params["ha[T.home]"] * 1
        margin_predict = (
            margin_res.params.Intercept
            + margin_res.params.third_per * row['third_per']
            + margin_res.params.third_per_allowed * row['third_per_allowed']
            + margin_res.params.TOP * row['TOP']
            + margin_res.params.first_downs * row['first_downs']
            + margin_res.params.first_downs_allowed * row['first_downs_allowed']
            + margin_res.params.pass_yards * row['pass_yards']
            + margin_res.params.pass_yards_allowed * row['pass_yards_allowed']
            + margin_res.params.penalty_yards * row['penalty_yards']
            + margin_res.params.plays * row['plays']
            + margin_res.params.rush_yards * row['rush_yards']
            + margin_res.params.rush_yards_allowed * row['rush_yards_allowed']
            + margin_res.params.sacked * row['sacked']
            + margin_res.params.sacks * row['sacks']
            + margin_res.params.takeaways * row['takeaways']
            + margin_res.params.total_yards * row['total_yards']
            + margin_res.params.total_yards_allowed * row['total_yards_allowed']
            + margin_res.params.turnovers * row['turnovers']
            + ha_coeff + team_coeff + opp_coeff)
        margins.append(margin_predict)
    away_margin = margins[0] + margins[1]
    home_margin = -1 * away_margin
    pred_margins = [home_margin, away_margin]
    return pred_margins
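# Hedged aside: the hand-written sum in predict_margins (and the
# near-identical one in predict_totals below) is equivalent to an intercept
# plus a dot product over the shared regressor names, assuming each row
# carries all of them:
NUM_COLS = ['third_per', 'third_per_allowed', 'TOP', 'first_downs',
            'first_downs_allowed', 'pass_yards', 'pass_yards_allowed',
            'penalty_yards', 'plays', 'rush_yards', 'rush_yards_allowed',
            'sacked', 'sacks', 'takeaways', 'total_yards',
            'total_yards_allowed', 'turnovers']

def linear_part(res, row):
    # intercept plus coefficient * feature for every numeric regressor
    return res.params.Intercept + sum(res.params[c] * row[c] for c in NUM_COLS)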
def predict_totals(nfl):
    total_res = load_pickle("models/total_res.pickle")
    total_ari_score = 0
    total_ari_opp = 0
    totals = []
    for key, row in nfl.iterrows():
        if row.team == "ARI":
            team_coeff = total_ari_score
        else:
            res_team = "team[T." + row.team + "]"
            team_coeff = total_res.params[res_team]
        if row.opp == "ARI":
            opp_coeff = total_ari_opp
        else:
            res_opp = "team[T." + row.opp + "]"
            opp_coeff = total_res.params[res_opp]
        if row.ha == "away":
            ha_coeff = total_res.params["ha[T.home]"] * -1
        else:
            ha_coeff = total_res.params["ha[T.home]"] * 1
        total_predict = (
            total_res.params.Intercept
            + total_res.params.third_per * row['third_per']
            + total_res.params.third_per_allowed * row['third_per_allowed']
            + total_res.params.TOP * row['TOP']
            + total_res.params.first_downs * row['first_downs']
            + total_res.params.first_downs_allowed * row['first_downs_allowed']
            + total_res.params.pass_yards * row['pass_yards']
            + total_res.params.pass_yards_allowed * row['pass_yards_allowed']
            + total_res.params.penalty_yards * row['penalty_yards']
            + total_res.params.plays * row['plays']
            + total_res.params.rush_yards * row['rush_yards']
            + total_res.params.rush_yards_allowed * row['rush_yards_allowed']
            + total_res.params.sacked * row['sacked']
            + total_res.params.sacks * row['sacks']
            + total_res.params.takeaways * row['takeaways']
            + total_res.params.total_yards * row['total_yards']
            + total_res.params.total_yards_allowed * row['total_yards_allowed']
            + total_res.params.turnovers * row['turnovers']
            + ha_coeff + team_coeff + opp_coeff)
        totals.append(total_predict)
    total_predicted = (totals[0] + totals[1]) / 2
    pred_totals = [total_predicted, total_predicted]
    return totals, pred_totals
def load(cls, fname): """ Load a pickled results instance .. warning:: Loading pickled models is not secure against erroneous or maliciously constructed data. Never unpickle data received from an untrusted or unauthenticated source. Parameters ---------- fname : {str, handle} A string filename or a file handle. Returns ------- Results The unpickled results instance. """ from statsmodels.iolib.smpickle import load_pickle return load_pickle(fname)
def load(dataset, mode_type='regression'):
    if mode_type not in ['knn', 'ada']:
        fname = dataset + '.pickle'
        from statsmodels.iolib.smpickle import load_pickle
        if os.path.isfile(fname):
            return load_pickle(fname)
        else:
            print("Pickled file " + fname + " not found!")
            print("Constructing model...")
            model = construct(dataset)
            print("Serializing...")
            model.save(fname)
            return model
    else:
        fname = dataset + '_{0}.pickle'.format(mode_type)
        if os.path.isfile(fname):
            return joblib.load(fname)
        else:
            print("Pickled file " + fname + " not found!")
            print("Constructing model...")
            model = scikit_construct(dataset, mode_type)
            print("Serializing...")
            joblib.dump(model, fname)
            return model
def load(cls, fname):
    from statsmodels.iolib.smpickle import load_pickle
    return load_pickle(fname)
# if the range extends into future periods, a rolling forecast is performed
pred = results.predict(start='2015-02-01', end='2015-02-10')
print(pred)

# 2) rolling forecast: predict the next few values
pred = results.forecast(5)
print(pred)
# because the forecast is rolling, it gets less accurate the further out it goes

# 3) save the model
fname = 'out.pkl'
results.save(fname)

# 4) load the model
from statsmodels.iolib.smpickle import load_pickle
results = load_pickle(fname)

# 5) apply the model
print(results.params)
pred = results.forecast(3)
print(pred)

######################################################################
######## Part 2: Double Exponential Smoothing
######################################################################
# Brown's Linear Exponential Smoothing
def double_exponential_smoothing(ts, alpha=0.8, isPlot=True):
    """Brown's linear trend model (double exponential smoothing)"""
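# A self-contained sketch of Brown's method, added to illustrate the
# technique the function above names; it is not the original implementation
# from this file. Recurrences: s1 = a*y + (1-a)*s1, s2 = a*s1 + (1-a)*s2;
# level = 2*s1 - s2, trend = a/(1-a) * (s1 - s2);
# m-step-ahead forecast yhat = level + trend * m.
def brown_double_smoothing(series, alpha=0.8, m=1):
    s1 = s2 = series[0]
    for y in series:
        s1 = alpha * y + (1 - alpha) * s1   # first smoothing pass
        s2 = alpha * s1 + (1 - alpha) * s2  # second smoothing pass
    level = 2 * s1 - s2
    trend = alpha / (1 - alpha) * (s1 - s2)
    return level + trend * m

print(brown_double_smoothing([3, 5, 9, 12, 17, 22, 23, 51, 41], m=1))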
def __init__(self):
    self.model_filename = os.path.join(MODELS_BASE_DIR, 'final_var_model.pkl')
    self.model = load_pickle(self.model_filename)
    LOGGER.info('VAR Model: ' + os.path.basename(self.model_filename)
                + ' is loaded')
import numpy as np
import pandas as pd
from flask import Flask, request, jsonify, render_template
import pickle

app = Flask(__name__)

from statsmodels.iolib.smpickle import load_pickle

model = load_pickle("slr_wcat.pkl")


@app.route('/')
def home():
    return render_template('startup.html')


@app.route('/predict', methods=['POST'])
def predict():
    '''
    For rendering results on HTML GUI
    '''
    float_features = [float(x) for x in request.form.values()]
    waist = float_features[0]
    waist_sq = waist * waist
    waist_cb = waist * waist * waist
    wcat = pd.DataFrame([[waist, waist_sq, waist_cb]],
                        columns=["Waist", "Waist_sq", "Waist_cb"])
    x = np.exp(model.predict(wcat))
    # pull the scalar out of the returned array/Series before rounding
    print(round(float(np.asarray(x)[0]), 2))
    # flt_features = [float(x) for x in request.form.values()]
    # int_features = [int(x) for x in request.form.values()]
    # final_features = [np.array(int_features)]
                    help='Directory of Pandas DataFrame')
parser.add_argument('--model',
                    type=str,
                    default='./models/baseline_regression_linear.pickle',
                    help='Directory of Model')
FLAGS, unparsed = parser.parse_known_args()

# Loading the data
data = pd.read_csv(FLAGS.data)

# Preprocess the data
data = preprocess_data(data)

# Load model
model = load_pickle(FLAGS.model)

# Predictions
predictions = model.get_prediction(exog=data)
data['final_grade_predicted'] = predictions.predicted_mean
print('\nData:\n')
print(data)
print()
print(f"Average increase: "
      f"{100 * np.mean((data['final_grade_predicted'] / data['final_grade']) - 1):.3f}%")
print()

# Metrics
print('\nEvaluation Metrics:')
def import_models(i):
    model = load_pickle(r"{}\models\model{}.pickle".format(path, i))
    return model
import copy

import pandas as pd

from statsmodels.iolib.smpickle import load_pickle

data = pd.read_csv("../data/lr-binary.csv")
combos = copy.deepcopy(data)
print(combos.head())

# the columns in the data must match the columns used for prediction
predict_cols = combos.columns[1:]

# the prediction set also needs the intercept variable added
combos['intercept'] = 1.0

model = load_pickle("lr_admit.model")
print(model.summary())

# run the prediction and store the scores in the predict column
print(model.predict(combos[predict_cols]))

# once predicted, the predict values are probabilities in [0, 1];
# we can extract results from them as needed
# for example, assume predict > 0.5 means the applicant is admitted;
# here we check the accuracy of that cutoff
total = 0
hit = 0
for value in combos.values:
    # the prediction score, predict, is the last column in the data
    predict = value[-1]
    # the actual admission result, admit, is the first column
    admit = int(value[0])
for path in [cell_paths_sorted[0]]:
    # Only need the first cell, which is the Ellsworth mouth/outlet
    cell_result = pd.read_csv(results_dir / path)
    jday_pad = cell_result['Jday'].apply(lambda x: str(x).zfill(3))
    str_year = cell_result['Year'].apply(lambda x: str(x))
    cell_result['date'] = str_year + jday_pad
    rng = pd.to_datetime(cell_result['date'], format='%Y%j')
    cell_result.index = rng
    cell_results_scenario.append(cell_result)
cell_results.append(cell_results_scenario)

# =======================================================================
# Correct VELMA stream temperature seasonal bias using pre-trained regression model
# *** Not sure if this correction is still valid considering the non-linear
# seasonal changes of the climate projections ***
olsmodel = load_pickle(config.data_path.parents[0] / 'models'
                       / 'stream_temp_correction_ols.pickle')

stream_temps_corrected = []
for i, scenario in enumerate(scenarios):
    stream_temps_scenario = []
    for j, gcm in enumerate(gcms):
        z = cell_results[i][j]['Water_Surface_Temperature(degrees_C)']
        day = 24 * 60 * 60
        year = 365.2425 * day
        timestamp_secs = pd.to_datetime(z.index)
        timestamp_secs = timestamp_secs.map(datetime.datetime.timestamp)
        year_cos = np.cos(timestamp_secs * (2 * np.pi / year))
        year_sin = np.sin(timestamp_secs * (2 * np.pi / year))
        y = pd.DataFrame(data=np.column_stack([z, year_cos, year_sin]),
                         columns=['temp', 'year_cos', 'year_sin'])
def run_models():
    # ----------------------- Creating and storing MLP model -----------------------
    # Importing the dataset and separating dependent/independent variables
    dataset = pd.read_csv("assets/predicts.csv")
    print(dataset.dtypes)
    dataset['Main purpose of visit'].value_counts()
    dataset['Accessibility status'].value_counts()
    dataset['Accomodation status'].value_counts()
    dataset['health services status'].value_counts()
    cleanup_nums = {
        "Accessibility status": {"Poor": 1, "Fair": 2, "Good": 3, "Better": 4},
        "Accomodation status": {"Poor": 1, "Fair": 2, "Good": 3, "Better": 4},
        "health services status": {"Poor": 1, "Fair": 2, "Good": 3, "Better": 4},
    }
    dataset.replace(cleanup_nums, inplace=True)
    print(dataset.head(5))
    X = dataset.iloc[:, 1:8].values
    print(X[:, 3])
    y = dataset.iloc[:, 10].values
    print(y)

    # Encoding categorical data
    labelencoder_X_3 = LabelEncoder()
    X[:, 3] = labelencoder_X_3.fit_transform(X[:, 3])
    list(labelencoder_X_3.inverse_transform([0, 1, 2, 3]))
    print(X)
    # NOTE: categorical_features was removed from scikit-learn in 0.22; this
    # call needs an older scikit-learn (or a ColumnTransformer instead)
    onehotencoder = OneHotEncoder(categorical_features=[3])
    X = onehotencoder.fit_transform(X).toarray()
    X = X[:, 1:]  # drop one dummy column to avoid the dummy-variable trap
    print('\n'.join(
        [''.join(['{:9}'.format(item) for item in row]) for row in X]))

    # Splitting the dataset into the Training set and Test set
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=0)
    a = y_test
    b = y_train

    # Feature Scaling
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Part 2 - making the ANN model
    # Importing the Keras libraries and packages
    # Initialising the ANN for regression
    REG = Sequential()
    # Adding the input layer and the first hidden layer (dropout optional)
    REG.add(Dense(units=20, input_dim=9, kernel_initializer="normal",
                  activation='relu'))
    # REG.add(Dropout(p=0.1))
    # Adding the second hidden layer
    REG.add(Dense(units=20, kernel_initializer="normal", activation='relu'))
    # REG.add(Dropout(p=0.1))
    # Adding the output layer
    REG.add(Dense(units=1, kernel_initializer="normal"))

    # Compiling the ANN
    # def root_mean_squared_error(y_true, y_pred):
    #     return K.sqrt(K.mean(K.square(y_pred - y_true)))
    REG.compile(optimizer='adam', loss='mean_squared_error')

    # Fitting the ANN to the Training set
    REG.fit(X_train, y_train, batch_size=10, epochs=200)

    # Part 3 - Making the predictions and evaluating the model
    y_pred = REG.predict(X_test)
    REG.save('assets/REG_MLP_model.h5')
    K.clear_session()

    # --------------------- Creating and storing SARIMA model ---------------------
    # data collecting... converting dataset to html...
    df = pd.read_csv('assets/Touristarrival_monthly.csv')
    df1 = df.iloc[:5]
    html_table_template = df1.to_html(index=False)
    html_table = df.to_html(index=False)

    # data observation and log transformation
    df.index = pd.to_datetime(df['Month'])
    df['#Tourists'].plot()
    mpl.pyplot.ylabel("No. of Tourist Arrivals")
    mpl.pyplot.xlabel("Year")
    # storing plots
    mpl.pyplot.savefig('PredictionEngine/static/img/sarima_input.png',
                       dpi=600, bbox_inches='tight')
    mpl.pyplot.clf()
    series = df['#Tourists']
    logtransformed = np.log(series)
    logtransformed.plot()
    mpl.pyplot.ylabel("log Scale (No. of Tourist Arrivals)")
    mpl.pyplot.xlabel("Year")
    # storing plots
    mpl.pyplot.savefig('PredictionEngine/static/img/sarima_input_logscaled.png',
                       dpi=600, bbox_inches='tight')
    mpl.pyplot.clf()

    # Train test split
    percent_training = 0.80
    split_point = round(len(series) * percent_training)
    print(split_point)
    training, testing = series[0:split_point], series[split_point:]
    training = np.log(training)

    # differencing to achieve stationarity
    training_diff = training.diff(periods=1).values[1:]

    # plot of residual log differenced series
    mpl.pyplot.plot(training_diff)
    mpl.pyplot.title("Tourist arrivals data log-differenced")
    mpl.pyplot.xlabel("Years")
    mpl.pyplot.ylabel("Tourist arrivals")
    mpl.pyplot.clf()

    # ACF and PACF plots (with log differenced training data)
    lag_acf = acf(training_diff, nlags=40)
    lag_pacf = pacf(training_diff, nlags=40, method='ols')

    # plot ACF
    mpl.pyplot.figure(figsize=(15, 5))
    mpl.pyplot.subplot(121)
    mpl.pyplot.stem(lag_acf)
    mpl.pyplot.axhline(y=0, linestyle='-', color='black')
    mpl.pyplot.axhline(y=-1.96 / np.sqrt(len(training)), linestyle='--',
                       color='gray')
    mpl.pyplot.axhline(y=1.96 / np.sqrt(len(training)), linestyle='--',
                       color='gray')
    mpl.pyplot.xlabel('lag')
    mpl.pyplot.ylabel("ACF")
    # storing plots in bytes
    mpl.pyplot.savefig('PredictionEngine/static/img/sarima_afc.png',
                       dpi=600, bbox_inches='tight')
    mpl.pyplot.clf()

    # plot PACF
    mpl.pyplot.figure(figsize=(15, 5))
    mpl.pyplot.subplot(122)
    mpl.pyplot.stem(lag_pacf)
    mpl.pyplot.axhline(y=0, linestyle='-', color='black')
    mpl.pyplot.axhline(y=-1.96 / np.sqrt(len(training)), linestyle='--',
                       color='gray')
    mpl.pyplot.axhline(y=1.96 / np.sqrt(len(training)), linestyle='--',
                       color='gray')
    mpl.pyplot.xlabel('lag')
    mpl.pyplot.ylabel("PACF")
    # storing plots in bytes
    mpl.pyplot.savefig('PredictionEngine/static/img/sarima_pafc.png',
                       dpi=600, bbox_inches='tight')
    mpl.pyplot.clf()

    # SARIMA model specification
    model = sm.tsa.statespace.SARIMAX(training,
                                      order=(2, 0, 3),
                                      seasonal_order=(2, 1, 0, 12),
                                      trend='c',
                                      enforce_invertibility=False,
                                      enforce_stationarity=False)
    # fit model
    model_fit = model.fit()
    model_fit.save("assets/REG_SARIMA_model.pickle")
    print(model_fit.summary())

    # plot residual errors
    # residuals = pd.DataFrame(model_fit.resid)
    # fig, ax = mpl.pyplot.subplots(1, 2)
    # residuals.plot(title="Residuals", ax=ax[0])
    # residuals.plot(kind='kde', title='Density', ax=ax[1])
    # mpl.pyplot.show()
    # print(residuals.describe())

    # Model evaluation and forecast
    model_fitted = load_pickle("assets/REG_SARIMA_model.pickle")
    forecast = model_fitted.forecast(len(df) - 250)
    print(forecast)
    forecast = np.exp(forecast)
    print(forecast)

    # plot forecast results and display RMSE
    mpl.pyplot.figure(figsize=(10, 5))
    mpl.pyplot.plot(forecast, 'r')
    mpl.pyplot.plot(series, 'b')
    mpl.pyplot.legend(['Predicted test values', 'Actual data values'])
    mpl.pyplot.title('RMSE: %.2f'
                     % np.sqrt(sum((forecast - testing) ** 2) / len(testing)))
    mpl.pyplot.ylabel("No. of Tourist Arrivals Monthly")
    mpl.pyplot.xlabel("Year")
    mpl.pyplot.autoscale(enable=True, axis='x', tight=True)
    mpl.pyplot.axvline(x=series.index[split_point], color='black')
    # storing plots
    mpl.pyplot.savefig('PredictionEngine/static/img/sarima_result.png',
                       dpi=600, bbox_inches='tight')
    mpl.pyplot.clf()

    forecaste = model_fitted.forecast(len(df) - 214)
    forecast_next = forecaste[62:]
    forecast_next = np.exp(forecast_next)
    print(forecast_next)
    mpl.pyplot.figure(figsize=(10, 5))
    mpl.pyplot.plot(forecast_next, 'r')
    mpl.pyplot.plot(series, 'b')
    mpl.pyplot.legend(['Predicted next steps values'])
    mpl.pyplot.title('Monthly tourist arrivals predictions')
    mpl.pyplot.ylabel("No. of Tourist Arrivals")
    mpl.pyplot.xlabel("Year")
    mpl.pyplot.autoscale(enable=True, axis='x', tight=True)
    # storing plots in bytes
    mpl.pyplot.savefig('PredictionEngine/static/img/sarima_forecast.png',
                       dpi=600, bbox_inches='tight')
    mpl.pyplot.clf()
import warnings
import itertools

import numpy as np
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')

import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.api import VAR  # vector autoregression
from statsmodels.iolib.smpickle import load_pickle

import dataProcessing as dp  # for get_capacities, get_stations

y_df = pd.read_csv("Velib/dataframe_streaming_1.csv")  # Velib/
# ?? model_fit = VAR.load('ar_model.pkl')
model_fit = load_pickle('ar_model.pkl')


def predict_nextstate(y_df, model_fit):
    df_s_c = pd.read_csv("stations_capacities.csv")  # Velib/ TODO once in github
    stations = dp.get_stations(df_s_c)
    capacities = dp.get_capacities(df_s_c)
    # remove dates
    y = y_df.drop(columns=['Date']).to_numpy()
    # y[i][0] is the station and y[i][1] is the number of bikes
    # Build a vector .. [Y]
    # number of recorded time steps
    times = len(y) / len(stations)
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

warnings.filterwarnings("ignore")

import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.api import VAR  # vector autoregression
from statsmodels.iolib.smpickle import load_pickle
import time

import dataProcessing as dp  # for get_capacities, get_stations

y_df = pd.read_csv("dataframe_streaming.csv")  # Velib/
# ?? model_fit = VAR.load('ar_model.pkl')
model_fit = load_pickle('../model/ar_model.pkl')


def predict_nextstate(y_df, model_fit):
    df_s_c = pd.read_csv(
        "../training_csv/stations_capacities.csv")  # Velib/ TODO once in github
    stations = dp.get_stations()
    capacities = dp.get_capacities(df_s_c)
    # remove dates
    y = y_df.drop(columns=['Date']).to_numpy()
    # Build a vector .. [Y]
    # number of recorded time steps
    times = len(y) / len(stations)
    print("number of time steps in the test data: ", times)