# Example #1
def pipeline_trans_reg():
    '''
    Application of Transformed Linear Regression.

    Fits a TransformedTargetRegressor (quantile-normal target transform +
    LinearRegression) and a plain LinearRegression baseline on the
    module-level X_train/y_train split, prints both R2 scores on
    X_test/y_test, and returns both fitted models.

    NOTE: n_quantiles must be smaller than the number of samples
    (sklearn default is 1000).

    Observed results:
    PRIMARY_MERCHANT_NAME -> accuracy negative; model totally off
    AMOUNT_MEAN_LAG7      -> q-t R2-score: 0.896 / unprocessed R2-score: 0.926

    :return: tuple (regr, raw_target_regr) of fitted regressors
    '''
    transformer = QuantileTransformer(n_quantiles=750,
                                      output_distribution='normal')
    regressor = LinearRegression()
    regr = TransformedTargetRegressor(regressor=regressor,
                                      transformer=transformer)

    regr.fit(X_train, y_train)

    # (removed a stray no-op `TransformedTargetRegressor(...)` statement that
    # constructed an unused estimator with Ellipsis as its regressor)
    print('q-t R2-score: {0:.3f}'.format(regr.score(X_test, y_test)))

    # baseline fitted on the raw, untransformed target for comparison
    raw_target_regr = LinearRegression().fit(X_train, y_train)
    print('unprocessed R2-score: {0:.3f}'.format(
        raw_target_regr.score(X_test, y_test)))
    return regr, raw_target_regr
    def ols_prediction(self):
        """
        Linear regression on the crab data with a quantile-normal target
        transform.

        Pre-processes the data, one-hot encodes categorical features, fits a
        TransformedTargetRegressor (QuantileTransformer + LinearRegression),
        logs R-squared / MAE / RMSE, and saves the design matrix together
        with y, predicted y and their relative percentage difference to
        "crab_predit_ols.csv".

        :return: None (results are logged and written to CSV)
        """
        logger.info("running Linear Regression model")
        crab_df_woo = self.pre_process_data()
        # the continuous variables looked skewed, so map them to a normal dist
        transformer = QuantileTransformer(output_distribution='normal')
        reg = linear_model.LinearRegression()
        t_reg = TransformedTargetRegressor(regressor=reg,
                                           transformer=transformer)
        ohe = ce.OneHotEncoder(handle_unknown='ignore',
                               use_cat_names=True,
                               drop_invariant=True)
        crab_df_woo_enc = ohe.fit_transform(crab_df_woo)
        X = crab_df_woo_enc.drop("age", axis=1)
        y = crab_df_woo_enc[["age"]]
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=100)
        t_reg.fit(X_train, y_train)
        s = t_reg.score(X_test, y_test)
        logger.info("R-squared from Linear Regression is: {0}".format(s))
        # predictions over the FULL dataset (train + test) for the CSV export
        y_pred = t_reg.predict(X)
        # np.sqrt(MSE) is the RMSE; renamed from the misleading `mse`
        rmse = np.sqrt(mean_squared_error(y, y_pred))
        mae = mean_absolute_error(y, y_pred)
        logger.debug("Linear Regression MAE: {0}".format(mae))
        logger.debug("Linear Regression RMSE: {0}".format(rmse))
        logger.debug("Linear Regression R-squared: {0}".format(s))

        # rebuild a readable dataframe: features + observed and predicted age
        crab_df = X.copy()
        crab_df["age"] = pd.Series(y.values.ravel())
        crab_df["age_ols"] = pd.Series(y_pred.ravel())
        # collapse the one-hot sex columns back into a single label column
        crab_df['sex'] = crab_df.apply(lambda row: self.reverse_ohe(row),
                                       axis=1)
        crab_df.drop(["sex_I", "sex_M", "sex_F"], axis=1, inplace=True)
        crab_df["percentage_difference"] = np.abs(
            np.divide(
                (crab_df["age"] - crab_df["age_ols"]), crab_df["age"]) * 100)
        crab_df.to_csv("crab_predit_ols.csv", index=False)
        logger.info("Crab data with predicted variables saved: {0}".format(
            "crab_predit_ols.csv"))
        logger.info("Linear Regression execution finished")
 def rf_prediction(self):
     """
     Random-Forest regression to predict crab age.

     Splits the raw crab data, scales numeric features (RobustScaler) and
     one-hot encodes categoricals inside a pipeline, fits a
     RandomForestRegressor, logs R-squared / MAE / RMSE, and writes the data
     with predictions and relative percentage difference to
     "crab_predit_forest.csv".

     :return: None (results are logged and written to CSV)
     """
     logger.info("running Random Forest model")
     X = self.crab_data.drop("age", axis=1)
     y = self.crab_data[["age"]]
     X_train, X_test, y_train, y_test = train_test_split(X,
                                                         y,
                                                         test_size=0.2,
                                                         random_state=100)
     # boolean masks selecting float columns vs. everything else
     numerical_features = X_train.dtypes == 'float'
     categorical_features = ~numerical_features
     # I used pipelining so that the predicted values were automatically transformed/scaled back
     # NOTE(review): `sparse=False` was renamed `sparse_output` in sklearn >= 1.2
     preprocess = make_column_transformer(
         (RobustScaler(), numerical_features),
         (OneHotEncoder(sparse=False), categorical_features))
     forest = RandomForestRegressor(n_estimators=5000,
                                    max_depth=20,
                                    min_samples_leaf=2,
                                    min_samples_split=4,
                                    random_state=100)
     f_reg = Pipeline(steps=[('preprocess', preprocess), ('model', forest)])
     f_reg_ttr = TransformedTargetRegressor(regressor=f_reg)
     f_reg_ttr.fit(X_train, y_train)
     s = f_reg_ttr.score(X_test, y_test)
     logger.info("R-squared from Random Forest is: {0}".format(s))
     # predictions over the FULL dataset (train + test) for the CSV export
     y_pred = f_reg_ttr.predict(X)
     # np.sqrt(MSE) is the RMSE; renamed from the misleading `mse`
     rmse = np.sqrt(mean_squared_error(y, y_pred))
     mae = mean_absolute_error(y, y_pred)
     logger.debug("RandomForest MAE: {0}".format(mae))
     logger.debug("RandomForest RMSE: {0}".format(rmse))
     logger.debug("RandomForest R-squared: {0}".format(s))
     # recreate the original dataset with observed and predicted age
     crab_df = X.copy()
     crab_df["age"] = pd.Series(y.values.ravel())
     crab_df["age_forest"] = pd.Series(y_pred.ravel())
     crab_df["percentage_difference"] = np.abs(
         np.divide(
             (crab_df["age"] - crab_df["age_forest"]), crab_df["age"]) *
         100)
     crab_df.to_csv("crab_predit_forest.csv", index=False)
     logger.info("Crab data with predicted variables saved: {0}".format(
         "crab_predit_forest.csv"))
     logger.info("Random Forest execution finished")
# Example #4
def tlr_reg(X_train, X_test, y_train, y_test):
    '''
    Transformed Linear Regression.

    Fits a TransformedTargetRegressor (quantile-normal target transform +
    LinearRegression) and a plain LinearRegression baseline, prints both R2
    scores on the test split, and returns both fitted models.

    NOTE: n_quantiles needs to be smaller than the number of samples
    (sklearn default is 1000).

    :param X_train: training feature matrix
    :param X_test: test feature matrix
    :param y_train: training target
    :param y_test: test target
    :return: tuple (regr, raw_target_regr) of fitted regressors
    '''
    transformer = QuantileTransformer(n_quantiles=750,
                                      output_distribution='normal')
    regressor = LinearRegression(n_jobs=-1)

    # Initialize the transformed target regressor
    regr = TransformedTargetRegressor(regressor=regressor,
                                      transformer=transformer)
    regr.fit(X_train, y_train)

    # raw LinearRegressor for comparison
    raw_target_regr = LinearRegression(n_jobs=-1).fit(X_train, y_train)

    # Print the best value combination
    print('q-t R2-score: {0:.3f}'.format(regr.score(X_test, y_test)))
    print('unprocessed R2-score: {0:.3f}'.format(
        raw_target_regr.score(X_test, y_test)))

    return regr, raw_target_regr
    # (removed a stray, unterminated triple-quote line that followed the
    # return statement: it was unreachable and caused a SyntaxError)
# Example #5
def examples():
    """
    Walk-through of scikit-learn Pipeline / ColumnTransformer features:
    building pipelines, grid-searching their parameters (and whole steps),
    step caching via `memory`, transforming the regression target, and
    per-column preprocessing. All results are printed; nothing is returned.
    """
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC
    from sklearn.decomposition import PCA
    estimators = [('reduce_dim', PCA()), ('clf', SVC())]
    pipe = Pipeline(estimators)
    print(pipe)
    print(pipe.steps[0])
    print(pipe.named_steps['reduce_dim'])

    # parameters of nested steps are addressed as <step>__<param>
    pipe.set_params(clf__C=10)
    print(pipe.named_steps['clf'])

    ###################################################
    # Grid search over parameters inside the pipeline (important)
    from sklearn.model_selection import GridSearchCV
    param_grid = dict(reduce_dim__n_components=[2, 5, 10],
                      clf__C=[0.1, 10, 100])
    grid_search = GridSearchCV(pipe, param_grid=param_grid)
    print(grid_search)

    ###################################################
    # Grid search can also swap entire steps, not just their parameters (important)
    from sklearn.linear_model import LogisticRegression

    param_grid = dict(reduce_dim=[None, PCA(5), PCA(10)],
                      clf=[SVC(), LogisticRegression()],
                      clf__C=[0.1, 10, 100])  # several candidates may be given as a list
    grid_search = GridSearchCV(pipe, param_grid=param_grid)
    print(grid_search)

    ###################################################
    # make_pipeline derives the step names automatically
    from sklearn.pipeline import make_pipeline
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.preprocessing import Binarizer
    pipe = make_pipeline(Binarizer(), MultinomialNB())
    print(pipe)

    ###################################################
    # Use `memory` to cache fitted transformers and avoid recomputation
    from tempfile import mkdtemp
    from shutil import rmtree
    from sklearn.decomposition import PCA
    from sklearn.svm import SVC
    from sklearn.pipeline import Pipeline
    estimators = [('reduce_dim', PCA()), ('clf', SVC())]
    cachedir = mkdtemp()
    pipe = Pipeline(estimators, memory=cachedir)
    print(pipe)

    # Clear the cache directory when you don't need it anymore
    rmtree(cachedir)

    #####################################################
    #  Transforming target in regression
    # NOTE(review): load_boston is deprecated since sklearn 1.0 and removed
    # in 1.2, so this section only runs on older sklearn versions.
    import numpy as np
    from sklearn.datasets import load_boston
    from sklearn.compose import TransformedTargetRegressor
    from sklearn.preprocessing import QuantileTransformer
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    boston = load_boston()
    X = boston.data
    y = boston.target
    transformer = QuantileTransformer(output_distribution='normal')
    regressor = LinearRegression()
    regr = TransformedTargetRegressor(regressor=regressor,
                                      transformer=transformer)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    regr.fit(X_train, y_train)

    print('R2 score: {0:.2f}'.format(regr.score(X_test, y_test)))

    # baseline fitted on the raw (untransformed) target
    raw_target_regr = LinearRegression().fit(X_train, y_train)
    print('R2 score: {0:.2f}'.format(raw_target_regr.score(X_test, y_test)))

    ##########################################################
    # Per-column preprocessing with ColumnTransformer
    import pandas as pd
    X = pd.DataFrame({
        'city': ['London', 'London', 'Paris', 'Sallisaw'],
        'title': [
            "His Last Bow", "How Watson Learned the Trick", "A Moveable Feast",
            "The Grapes of Wrath"
        ],
        'expert_rating': [5, 3, 4, 5],
        'user_rating': [4, 5, 4, 3]
    })

    from sklearn.compose import ColumnTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    # 'city' is treated as a category (one "token" per value); 'title' as
    # bag-of-words; the remaining numeric columns are dropped
    column_trans = ColumnTransformer(
        [('city_category', CountVectorizer(analyzer=lambda x: [x]), 'city'),
         ('title_bow', CountVectorizer(), 'title')],
        remainder='drop')

    print(column_trans.fit(X))
    # NOTE(review): get_feature_names was replaced by get_feature_names_out
    # in sklearn >= 1.0 -- confirm the installed version
    print(column_trans.get_feature_names())
    print(column_trans.transform(X).toarray())
# Example #6
class Effort:
    """
    Effort/cost modelling for a single project and effort type.

    Responsibilities:
      * derive billed hours and cost columns from the raw activity frame
        (calculate_costs),
      * predict effort with a quantile-transformed decision tree and
        cross-validation (predict_effort / calculate_diff),
      * compute accuracy measures (calculate_perf_measurements),
      * forecast future effort with Prophet (forecast_* methods).
    """
    # class-level logger shared by all instances
    logger = utils.get_logger()

    def __init__(self, project, model, value, task, df, hourly_wage):
        """
        :param project: project name used in logging/reporting
        :param model: model type identifier (stored, not the fitted model)
        :param value: effort type -- one of the c.LINE_CC/LINE_EC/
                      MODULE_CC/MODULE_EC constants
        :param task: task label carried through to the result frames
        :param df: input activity dataframe
        :param hourly_wage: wage used to convert billed hours into cost
        """
        self.modelType = model
        self.type = value
        self.task = task
        self.df = df
        self.t_records = len(self.df)
        self.project_name = project
        # fitted TransformedTargetRegressor, set by predict_effort()
        self.model = None
        self.predictions = None
        # accuracy measures, set by calculate_perf_measurements()
        self.r_squared = None
        self.r_squared_adj = None
        self.mae = None
        self.mse = None
        self.rmse = None
        self.pred25 = None
        self.pred50 = None
        self.X = None
        self.Y = None
        self.results = None
        self.module_forecast_results = None
        self.average_effort_release = None
        self.hourly_wage = hourly_wage
        # immediately replace the raw frame with a cost-augmented copy
        self.df = self.calculate_costs(df)
        # self.df = df

    def get_cost_columns(self):
        """
        Resolve the column-name constants for the current effort type.

        CC-suffixed constants are used for the *_CC effort types,
        EC-suffixed ones for everything else.

        :return: tuple (EFFORT, T_CONTRIBS, BILLED, COST, HOURS_DIFF,
                 AVG_EFFORT_CONTRIBS, CONTRIB_DIFF) of column names
        """
        EFFORT = self.type
        T_CONTRIBS = None
        BILLED = None
        COST = c.COST
        HOURS_DIFF = None
        AVG_EFFORT_CONTRIBS = None
        CONTRIB_DIFF = None

        if self.type == c.LINE_CC or self.type == c.MODULE_CC:
            T_CONTRIBS = c.T_CC
            BILLED = c.BILLED_HOURS_CC
            HOURS_DIFF = c.HOURS_DIFF_CC
            AVG_EFFORT_CONTRIBS = c.AVG_MODULE_CONTRIBS_CC
            CONTRIB_DIFF = c.CONTRIB_DIFF_CC
        else:
            T_CONTRIBS = c.T_EC
            BILLED = c.BILLED_HOURS_EC
            HOURS_DIFF = c.HOURS_DIFF_EC
            AVG_EFFORT_CONTRIBS = c.AVG_MODULE_CONTRIBS_EC
            CONTRIB_DIFF = c.CONTRIB_DIFF_EC

        return EFFORT, T_CONTRIBS, BILLED, COST, HOURS_DIFF, AVG_EFFORT_CONTRIBS, CONTRIB_DIFF

    def minContrib(self, row, effort, contribs):
        """Floor contributors at 1 when effort was logged with 0 contributors."""
        if row[contribs] == 0 and row[effort] > 0:
            return 1
        else:
            return row[contribs]

    def calculate_costs(self, df):
        """
        Augment `df` in place with billing columns: hours between releases,
        contributor counts (floored via minContrib), average contributors per
        effort unit, billed hours, and cost at self.hourly_wage.

        :param df: activity dataframe containing at least c.DATE and the
                   effort/contributor columns for the current type
        :return: the augmented dataframe
        """
        EFFORT, T_CONTRIBS, BILLED, COST, HOURS_DIFF, AVG_EFFORT_CONTRIBS, CONTRIB_DIFF = self.get_cost_columns(
        )

        # previous release date per row; first row falls back to the min date
        df[c.DATE_P] = df[c.DATE].shift()
        df[c.DATE_P].fillna(df[c.DATE].min(), inplace=True)

        # Cost section
        df[HOURS_DIFF] = utils.calculate_hours_diff(df)

        if df[[c.DATE, c.DATE_P]].isna().values.any():
            df[[c.DATE, c.DATE_P]].fillna(0, inplace=True)

        df[T_CONTRIBS] = df.apply(self.minContrib,
                                  effort=EFFORT,
                                  contribs=T_CONTRIBS,
                                  axis=1)

        # average effort over the last 30 records, per contributor
        average_effort = df[EFFORT].tail(30).mean()
        average_effort_contribs = df[T_CONTRIBS].mean()
        self.average_effort_release = average_effort / average_effort_contribs

        df[AVG_EFFORT_CONTRIBS] = df.apply(utils.calculate_contribs,
                                           effort=self.average_effort_release,
                                           model=EFFORT,
                                           contribs=T_CONTRIBS,
                                           axis=1)

        df[CONTRIB_DIFF] = round(df[T_CONTRIBS] - df[AVG_EFFORT_CONTRIBS], 2)
        df[BILLED] = round(df[HOURS_DIFF] * df[AVG_EFFORT_CONTRIBS], 2)
        df[COST] = round(df[BILLED] * self.hourly_wage, 2)

        return df

    def forecast_variable(self, variable, predicton_months):
        """
        Fit a Prophet model to one column of self.df and forecast it
        `predicton_months` months ahead (parameter keeps the original
        spelling). The series is Box-Cox transformed (+1 to tolerate zeros)
        before fitting and inverse-transformed afterwards.

        :param variable: column name in self.df to forecast
        :param predicton_months: number of monthly periods to forecast
        :return: dataframe with columns 'ds' and 'yhat' (also stored on
                 self.forecast)
        """
        self.logger.debug(
            "\n{0} - Forcasting {1} for {2} and task {3}: \n {4}".format(
                self.project_name, variable, self.type, self.task,
                self.df[variable]))

        data = {c.DATE: self.df[c.DATE], c.NT: self.df[variable]}
        NT = pd.DataFrame(data)
        NT.columns = ['ds', 'y']

        are_same = utils.is_all_same(NT['y'])

        # a constant series breaks boxcox/Prophet: jitter it by +0/+1
        if are_same:
            size = len(NT)
            # NOTE(review): `min` shadows the builtin and is a 1-element Series
            min = NT['y'].head(1)
            NT['y'] = np.random.randint(min, min + 2, size)

        NT['y_orig'] = NT['y']
        NT['y'], lam = boxcox(NT['y'] + 1)

        m_NT = Prophet(uncertainty_samples=0)
        m_NT.fit(NT)
        future_NT = m_NT.make_future_dataframe(periods=predicton_months,
                                               freq='m')
        forecast_NT = m_NT.predict(future_NT)

        # m_NT.plot(forecast_NT)

        # invert the Box-Cox transform on the point forecast only
        forecast_NT_inv = pd.DataFrame()
        forecast_NT_inv['ds'] = forecast_NT['ds']
        # forecast_NT_inv[['yhat','yhat_upper','yhat_lower']] = forecast_NT[['yhat','yhat_upper','yhat_lower']].apply(lambda x: inv_boxcox(x, lam))
        forecast_NT_inv[['yhat']] = forecast_NT[[
            'yhat'
        ]].apply(lambda x: inv_boxcox(x, lam))

        # restore the untransformed series on the model history and frame
        m_NT.history['y_t'] = m_NT.history['y']
        m_NT.history['y'] = m_NT.history['y_orig']

        NT['y_t'] = NT['y']
        NT['y'] = NT['y_orig']

        # m_NT.plot(forecast_NT_inv)
        # backfill NaNs from the inverse transform with a tail mean
        forecast_NT_inv['yhat'].fillna(
            forecast_NT_inv['yhat'].tail(predicton_months - 12).mean(),
            inplace=True)

        self.forecast = forecast_NT_inv

        return self.forecast

    def forecast_effort(self, data, dateIndex, variable, rf_regressor):
        """
        Predict `variable` for the forecasted feature values in `data` using
        an already-fitted regressor, and return the combined dataframe.

        :param data: dict of forecasted feature series (mutated: the
                     prediction and date columns are added to it)
        :param dateIndex: date series to attach to the results
        :param variable: name of the predicted column
        :param rf_regressor: fitted regressor exposing .predict()
        :return: dataframe of features + predictions + dates
        """
        X_Future = pd.DataFrame(data)

        X_Future = X_Future.astype('float32')

        # sanitize non-finite values before prediction
        X_Future = X_Future.replace([np.inf, -np.inf, np.nan], 0)

        y_pred_rf = rf_regressor.predict(X_Future)
        y_pred_index = dateIndex

        # resultData = {variable: y_pred_rf.round(2), c.DATE: y_pred_index}
        data[variable] = y_pred_rf.round(2)
        data[c.DATE] = y_pred_index
        # results = pd.DataFrame(resultData)
        results = pd.DataFrame(data)
        return results

    def predict_effort(self):
        """
        Fit a quantile-transformed DecisionTreeRegressor on the features for
        the current effort type and produce k-fold cross-validated
        predictions.

        Side effects: sets self.X, self.Y, self.model, self.predictions.

        :return: results dataframe from calculate_diff()
        """
        self.df[c.T_LINE_P] = self.df[c.T_LINE].shift()

        if self.df.isna().values.any():
            self.df.fillna(0, inplace=True)

        # select feature/target columns for the configured effort type
        if self.type == c.LINE_CC:
            self.X = self.df[[c.NT_CC, c.NO_CC, c.T_CC, c.T_LINE_P]]
            self.Y = self.df[c.LINE_CC]
        elif self.type == c.LINE_EC:
            self.X = self.df[[c.NT_EC, c.NO_EC, c.T_EC, c.T_LINE_P]]
            self.Y = self.df[c.LINE_EC]
        elif self.type == c.MODULE_CC:
            self.X = self.df[[c.NT_CC, c.NO_CC, c.T_CC, c.T_LINE_P]]
            self.Y = self.df[c.MODULE_CC]
        elif self.type == c.MODULE_EC:
            self.X = self.df[[c.NT_EC, c.NO_EC, c.T_EC, c.T_LINE_P]]
            self.Y = self.df[c.MODULE_EC]

        # cap the number of CV folds at the number of records
        splits = 10

        if self.t_records <= splits:
            splits = self.t_records

        pipeline = Pipeline(
            steps=[('scaler', QuantileTransformer()),
                   ('predictor',
                    DecisionTreeRegressor(
                        random_state=0, max_depth=10, min_samples_split=10))])
        # also quantile-transform the target, inverted on predict
        self.model = TransformedTargetRegressor(
            regressor=pipeline, transformer=QuantileTransformer())

        self.model.fit(self.X, self.Y)

        kfold = model_selection.KFold(n_splits=splits)
        self.predictions = cross_val_predict(self.model,
                                             self.X,
                                             self.Y,
                                             cv=kfold)

        results = self.calculate_diff()

        return results

    def calculate_diff(self):
        """
        Build the per-row results frame: inputs, observed vs. predicted
        effort, absolute difference and percent error (with NaN/inf percent
        errors replaced by 0).

        :return: the results dataframe (also stored on self.results)
        """
        NT = None
        NO = None
        T_CONTRIBUTORS = None
        TYPE = self.type
        LINE = None
        MODULE = None

        if self.type == c.LINE_CC or self.type == c.MODULE_CC:
            NT = c.NT_CC
            NO = c.NO_CC
            T_CONTRIBUTORS = c.T_CC
            LINE = c.LINE_CC
            MODULE = c.MODULE_CC
        elif self.type == c.LINE_EC or self.type == c.MODULE_EC:
            NT = c.NT_EC
            NO = c.NO_EC
            T_CONTRIBUTORS = c.T_EC
            LINE = c.LINE_EC
            MODULE = c.MODULE_EC

        EFFORT, T_CONTRIBS, BILLED, COST, HOURS_DIFF, AVG_EFFORT_CONTRIBS, CONTRIB_DIFF = self.get_cost_columns(
        )

        data = {
            c.DATE: self.df[c.DATE],
            c.PROJECT: self.project_name,
            c.MODEL: TYPE,
            c.TASK: self.task,
            c.NT: self.X[NT],
            c.NO: self.X[NO],
            c.T_CONTRIBUTORS: self.X[T_CONTRIBUTORS],
            c.T_LINE: self.X[c.T_LINE_P],
            c.LINE: self.df[LINE],
            c.MODULE: self.df[MODULE],
            c.AVG_MODULE_CONTRIBS: self.df[AVG_EFFORT_CONTRIBS],
            c.HOURS_DIFF: self.df[HOURS_DIFF],
            c.CONTRIB_DIFF: self.df[CONTRIB_DIFF],
            c.BILLED_HOURS: self.df[BILLED],
            c.COST: self.df[COST]
        }

        data[c.OBSERVED] = self.Y.round(2)
        data[c.PREDICTED] = self.predictions.round(2)
        data[c.DIFFERENCE] = abs(self.Y - self.predictions).round(2)
        data[c.PERCENT_ERROR] = (abs(self.Y - self.predictions) /
                                 self.Y).round(2)

        self.results = pd.DataFrame(data)
        # percent error is NaN/inf when observed effort is 0 -> treat as 0
        self.results[c.PERCENT_ERROR].fillna(0, inplace=True)
        self.results[c.PERCENT_ERROR].replace(np.inf, 0, inplace=True)

        return self.results

    def calculate_perf_measurements(self):
        """Compute R2, adjusted R2, MAE, MSE, RMSE, PRED(25) and PRED(50)."""
        self.r_squared = round(self.model.score(self.X, self.Y), 2)
        # NOTE(review): self.X is passed twice; the second argument looks like
        # it should differ -- confirm utils.calculated_rsquared_adj's signature
        self.r_squared_adj = round(
            utils.calculated_rsquared_adj(self.X, self.X, self.r_squared), 2)
        self.mae = round(metrics.mean_absolute_error(self.Y, self.predictions),
                         2)
        self.mse = round(metrics.mean_squared_error(self.Y, self.predictions),
                         2)
        self.rmse = round(
            np.sqrt(metrics.mean_squared_error(self.Y, self.predictions)), 2)
        self.pred25 = round(
            utils.calculate_PRED(0.25, self.results, c.PERCENT_ERROR), 2)
        self.pred50 = round(
            utils.calculate_PRED(0.50, self.results, c.PERCENT_ERROR), 2)

    def create_output_df(self):
        """Return a one-row summary dataframe of the accuracy measures."""
        row_df = pd.DataFrame({
            c.PROJECT: [self.project_name],
            c.MODEL: [self.type],
            c.TASK: [self.task],
            c.R_SQUARED: [self.r_squared],
            c.R_SQUARED_ADJ: [self.r_squared_adj],
            c.MAE: [self.mae],
            c.MSE: [self.mse],
            c.RMSE: [self.rmse],
            c.PRED_25: [self.pred25],
            c.PRED_50: [self.pred50],
            c.T_RECORDS: self.t_records
        })
        return row_df

    def forecast_module_effort(self, prediction_months, team_size=None):
        """
        Forecast each input variable with Prophet, then feed the forecasts
        through the fitted effort model (self.model) to forecast effort.

        :param prediction_months: number of monthly periods to forecast
        :param team_size: optional fixed contributor count; only honored for
                          the CC effort types (reset to None for EC)
        :return: forecast results dataframe (also stored on
                 self.module_forecast_results)
        """
        NT = None
        NO = None
        T_CONTRIBUTORS = None

        if self.type == c.LINE_CC or self.type == c.MODULE_CC:
            NT = c.NT_CC
            NO = c.NO_CC
            T_CONTRIBUTORS = c.T_CC
        elif self.type == c.LINE_EC or self.type == c.MODULE_EC:
            NT = c.NT_EC
            NO = c.NO_EC
            T_CONTRIBUTORS = c.T_EC
            # a fixed team size does not apply to EC types
            team_size = None

        # with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        #   forecast_NT = executor.submit(self.forecast_variable, NT, predicton_months).result()
        #   forecast_NO = executor.submit(self.forecast_variable, NO, predicton_months).result()
        #   forecast_T_Contributors = executor.submit(self.forecast_variable, T_CONTRIBUTORS, predicton_months).result()
        #   forecast_T_Line_P = executor.submit(self.forecast_variable, c.T_LINE_P, predicton_months).result()
        forecast_NT = self.forecast_variable(NT, prediction_months)
        forecast_NO = self.forecast_variable(NO, prediction_months)
        forecast_T_Line_P = self.forecast_variable(c.T_LINE_P,
                                                   prediction_months)
        forecast_T_Contributors = None

        # either assume a fixed team size or forecast contributors too
        if team_size != None:
            forecast_T_Contributors = {}
            df_size = len(forecast_NT['yhat'])
            forecast_T_Contributors['yhat'] = utils.make_contrib_forecast(
                df_size, team_size)
        else:
            forecast_T_Contributors = self.forecast_variable(
                T_CONTRIBUTORS, prediction_months)

        data = {
            c.NT: forecast_NT['yhat'],
            c.NO: forecast_NO['yhat'],
            T_CONTRIBUTORS: forecast_T_Contributors['yhat'],
            c.T_LINE_P: forecast_T_Line_P['yhat']
        }

        dateIndex = forecast_NT['ds']
        self.module_forecast_results = self.forecast_effort(
            data, dateIndex, self.type, self.model)

        return self.module_forecast_results

    def calculate_total_effort(self, prediction_years):
        """
        Aggregate the module forecast into yearly effort and cost totals.

        :param prediction_years: number of trailing years to keep
        :return: pivot table indexed by year with summed effort and cost
        """
        results = self.module_forecast_results
        results = self.calculate_costs(results)
        results['Year'] = results[c.DATE].apply(lambda x: x.year)
        results = pd.pivot_table(results,
                                 index=["Year"],
                                 values=[self.type, c.COST],
                                 aggfunc=np.sum).tail(prediction_years + 1)
        return results

    def display_forecast(self, prediction_years):
        """Log the yearly forecasted effort/cost totals."""
        results = self.calculate_total_effort(prediction_years)

        # NOTE(review): uses a module-level `logger`, unlike the rest of the
        # class which uses self.logger -- confirm it exists at module scope
        logger.info("\n{0} - {1} {2} Forecasted Effort: \n".format(
            self.project_name, self.type, self.task))
        logger.info(results.head(prediction_years + 1))
# # print(digits.images[0])
#
# # Learning and predicting
# clf = svm.SVC(gamma=0.001, C=100.)
# clf.fit(digits.data[:-1], digits.target[:-1])
#
# clf.predict(digits.data[-1:])

# Transforming target in regression
from sklearn.compose import TransformedTargetRegressor
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer

# Demo: quantile-normal target transform vs. a plain LinearRegression
# baseline on the Boston housing data.
# NOTE(review): load_boston is deprecated since scikit-learn 1.0 and removed
# in 1.2, so this snippet requires an older sklearn version.
boston = load_boston()
X = boston.data
y = boston.target

regressor = LinearRegression()
transformer = QuantileTransformer(output_distribution='normal')
# the transformer is fit on y before training and inverted on predict
regr = TransformedTargetRegressor(regressor=regressor, transformer=transformer)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

regr.fit(X_train, y_train)
print('R2 score: {0:.2f}'.format(regr.score(X_test, y_test)))

# baseline fitted on the raw (untransformed) target for comparison
raw_target_regr = LinearRegression().fit(X_train, y_train)
print('R2 score: {0:.2f}'.format(raw_target_regr.score(X_test, y_test)))
# Forecast worldwide confirmed COVID cases with a log-transformed Ridge model.
# NOTE(review): no_of_days, confirmedworld_cases, forecast_data, updated_dates,
# np, plt and train_test_split are not defined in this snippet -- presumably
# created in earlier notebook cells; verify before running.
X_train_confirmed, X_test_confirmed, y_train_confirmed, y_test_confirmed = train_test_split(
    no_of_days, confirmedworld_cases, test_size=0.05, shuffle=False)

# using the Transformed target regressor to find the future confirmed cases worldwide
from sklearn.linear_model import RidgeCV
from sklearn.compose import TransformedTargetRegressor

# log1p/expm1 pair: fit RidgeCV on log(1 + y), invert predictions back to counts
regr = TransformedTargetRegressor(regressor=RidgeCV(),
                                  func=np.log1p,
                                  inverse_func=np.expm1)
regr.fit(X_train_confirmed, y_train_confirmed)
regr_pred = regr.predict(forecast_data)

# "Accuracy" here is the regressor's R2 score on the held-out tail
print('Accuracy of TransformedTargetRegressor on test set: {:.2f}'.format(
    regr.score(X_test_confirmed, y_test_confirmed)))

# plot observed cases plus the dotted forecast curve
plt.figure(figsize=(20, 12))
plt.plot(updated_dates[:], confirmedworld_cases[:])
plt.plot(forecast_data, regr_pred, linestyle='dotted', color='red')
plt.title('No of Coronavirus Cases Worldwide Over Time', size=30)
plt.xlabel('Days Since 1/22/2020', size=30)
plt.ylabel('No of Cases', size=30)
plt.legend([
    'Confirmed Cases',
    ' logarithmic  and an exponential transformed Ridge Regression  predictions'
])
plt.xticks(size=15)
plt.show()

print('Transformed Target Regressor future pred')