コード例 #1
0
def model_load(prefix='sl', data_dir=None, training=True):
    """
    Load the persisted models whose file name matches ``prefix`` together
    with the engineered data of every time series.

    Parameters
    ----------
    prefix : str
        Pattern (handed to ``re.search``) a file name inside ./models has
        to match for the model to be loaded.
    data_dir : str or None
        Directory passed to ``fetch_ts``; when falsy it defaults to
        <current working directory>/data/cs-train.
    training : bool
        Forwarded unchanged to ``engineer_features``.

    Returns
    -------
    tuple
        ``(all_data, all_models)``. ``all_data`` maps every key yielded by
        ``fetch_ts`` to a dict with the entries "X", "y" and "dates";
        ``all_models`` maps the second '-'-separated token of each model
        file name to the object returned by ``joblib.load``.

    Raises
    ------
    Exception
        If no file in ./models matches ``prefix``.
    """

    if not data_dir:
        data_dir = os.path.join(os.getcwd(), "data", "cs-train")

    model_dir = os.path.join(".", "models")

    ## honour the requested prefix (it used to be hard-coded to "sl")
    models = [f for f in os.listdir(model_dir) if re.search(prefix, f)]

    if not models:
        raise Exception(
            "Models with prefix '{}' cannot be found did you train?".format(
                prefix))

    ## key = second '-'-separated token of the file name, value = model
    all_models = {
        re.split("-", model)[1]: joblib.load(os.path.join(model_dir, model))
        for model in models
    }

    ## load data
    ts_data = fetch_ts(data_dir)
    all_data = {}
    for country, df in ts_data.items():
        X, y, dates = engineer_features(df, training=training)
        ## dates are stored as an array of their string representation
        dates = np.array([str(d) for d in dates])
        all_data[country] = {"X": X, "y": y, "dates": dates}

    return (all_data, all_models)
コード例 #2
0
def model_train(data_dir, test=False):
    """
    Train one model for every time series returned by ``fetch_ts``.

    data_dir - directory handed to ``fetch_ts``
    test     - when truthy, only the 'all' and 'united_kingdom' series are
               trained; the flag is also forwarded to ``_model_train``
    """

    ## make sure the model directory is there before anything is trained
    model_dir_exists = os.path.isdir(MODEL_DIR)
    if not model_dir_exists:
        os.mkdir(MODEL_DIR)

    if test:
        for message in ("... test flag on",
                        "...... subseting data",
                        "...... subseting countries"):
            print(message)

    ## one model per series; test mode restricts the set of series
    test_countries = ['all', 'united_kingdom']
    for country, df in fetch_ts(data_dir).items():
        if not test or country in test_countries:
            _model_train(df, country, test=test)
コード例 #3
0
def model_predict(country, year, month, day):
    """
    Look up the stored forecast of ``country`` for a single date.

    Parameters
    ----------
    country : str
        Must be one of the keys returned by ``fetch_ts``; it also selects
        the forecast file ./data/forecasts/forecast_<country>.
    year, month, day : str or int
        Joined as "<year>-<month>-<day>" and compared with the 'ds' column
        of the forecast file, so they have to be written exactly as the
        dates in that file are.

    Returns
    -------
    pandas.DataFrame or str
        The forecast row(s) whose 'ds' equals the requested date, or a
        message string when the country is unknown or the date is not in
        the forecast file.
    """
    time_start = time.time()

    ## the training data is only consulted for the set of known countries
    data_dir = os.path.join("data", "cs_train", "data")
    ts_data = fetch_ts(data_dir)
    if country not in ts_data:
        return "Could not find country called {}".format(country)

    forecasts = pd.read_csv("./data/forecasts/forecast_{}".format(country))

    ## format() accepts ints as well as strings for the date parts
    date_str = "{}-{}-{}".format(year, month, day)
    row = forecasts.loc[forecasts['ds'] == date_str]
    if len(row) == 0:
        return "Date not available"

    ## update the log file
    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)
    test = False
    update_predict_log(row.yhat.values[0], runtime, MODEL_VERSION,
                       MODEL_VERSION_NOTE, test)
    return row
コード例 #4
0
ファイル: model.py プロジェクト: mucheinz/capstone-w
def model_load(prefix='sl', data_dir=None, training=True):
    """
    Load the persisted models whose file name matches ``prefix`` together
    with the cleaned, engineered data of every time series.

    Parameters
    ----------
    prefix : str
        Pattern (handed to ``re.search``) a file name inside ./models has
        to match for the model to be loaded.
    data_dir : str or None
        Directory passed to ``fetch_ts``; when falsy it defaults to
        ../capstone-w/cs-train.
    training : bool
        Forwarded unchanged to ``engineer_features``.

    Returns
    -------
    tuple
        ``(all_data, all_models)``. ``all_data`` maps every key yielded by
        ``fetch_ts`` to a dict with the entries "X", "y" and "dates";
        ``all_models`` maps the second '-'-separated token of each model
        file name to the object returned by ``joblib.load``.

    Raises
    ------
    Exception
        If no file in ./models matches ``prefix``.
    """

    if not data_dir:
        data_dir = os.path.join("..", "capstone-w", "cs-train")

    model_dir = os.path.join(".", "models")

    # honour the requested prefix (it used to be hard-coded to "sl")
    models = [f for f in os.listdir(model_dir) if re.search(prefix, f)]

    if not models:
        raise Exception(
            "Models with prefix '{}' cannot be found did you train?".format(
                prefix))

    # key = second '-'-separated token of the file name, value = model
    all_models = {
        re.split("-", model)[1]: joblib.load(os.path.join(model_dir, model))
        for model in models
    }

    # load data
    ts_data = fetch_ts(data_dir)
    all_data = {}
    for country, df in ts_data.items():
        # every frame goes through clean_data before feature engineering
        df = clean_data(df)
        X, y, dates = engineer_features(df, training=training)
        dates = np.array([str(d) for d in dates])
        all_data[country] = {"X": X, "y": y, "dates": dates}

    return (all_data, all_models)
コード例 #5
0
def model_train():
    """
    Fit a Prophet model per time series and store a 120-period forecast.

    The series come from ``fetch_ts`` on data/cs_train/data. For each of
    them the "date" and "revenue" columns are renamed to "ds" and "y", a
    fresh ``Prophet`` instance is fitted, and the prediction for the
    history plus 120 future periods is written to
    data/forecasts/forecast_<country>.

    After the loop the train log is updated with the shape of the LAST
    forecast produced, the runtime and the model version information.

    Returns
    -------
    bool
        Always True.

    Raises
    ------
    ValueError
        If ``fetch_ts`` returned no series, so nothing could be trained.
    """
    ## start timer for runtime
    time_start = time.time()
    data_dir = os.path.join("data", "cs_train", "data")
    ts_data = fetch_ts(data_dir)

    forecast = None
    for country, df in ts_data.items():
        model = Prophet()
        train_df = df[["date", "revenue"]].rename(
            columns={"date": "ds", "revenue": "y"})
        model.fit(train_df)
        future = model.make_future_dataframe(periods=120)
        forecast = model.predict(future)
        forecast.to_csv("data/forecasts/forecast_" + country)

    ## without any series there is no forecast shape to log
    if forecast is None:
        raise ValueError(
            "no time-series data found in '{}', nothing was trained".format(
                data_dir))

    ## update the log file
    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)
    test = False
    update_train_log(forecast.shape, runtime, MODEL_VERSION,
                     MODEL_VERSION_NOTE, test)

    return True
コード例 #6
0
ファイル: model.py プロジェクト: tdody/RevenueModel
def model_train(data_dir, test=False, country=None):
    """
    Train one model for every time series returned by ``fetch_ts``.

    data_dir - directory handed to ``fetch_ts``
    test     - when truthy, only the 'all' and 'united_kingdom' series are
               trained; the flag is also forwarded to ``_model_train``
    country  - forwarded to ``fetch_ts`` as its ``country`` argument
    """

    ## the model folder has to exist before training starts
    folder_missing = not os.path.isdir(MODEL_DIR)
    if folder_missing:
        os.mkdir(MODEL_DIR)

    ## announce test mode
    if test:
        print("... test flag on")
        print("...... subseting data")
        print("...... subseting countries")

    ## time-series formatted data, fetched with clean=False
    series_by_name = fetch_ts(data_dir, country=country, clean=False)

    ## in test mode everything except "all" and "united_kingdom" is skipped
    test_subset = ['all', 'united_kingdom']
    for name, frame in series_by_name.items():
        if not test or name in test_subset:
            _model_train(frame, name, test=test)
コード例 #7
0
def get_latest_train_data(country):
    """
    Return the entry of ``fetch_ts`` (run on data/cs_train/data) whose key
    equals ``country``; None when no key matches.
    """

    ts_data = fetch_ts(os.path.join("data", "cs_train", "data"))

    ## first value whose key compares equal to the requested country
    matches = (df for c, df in ts_data.items() if c == country)
    return next(matches, None)
コード例 #8
0
ファイル: model.py プロジェクト: tdody/RevenueModel
def model_load(prefix='sl', data_dir=None, training=True, country=None):
    """
    Load persisted models (all of them, or only those of one country)
    together with the engineered data returned for that selection.

    Parameters
    ----------
    prefix : str
        Pattern (handed to ``re.search``) a file name inside ./models has
        to match for the model to be loaded.
    data_dir : str or None
        Directory passed to ``fetch_ts``; when falsy it defaults to
        ./data/cs-train.
    training : bool
        Forwarded unchanged to ``engineer_features``.
    country : str or None
        When given, it is lower-cased, whitespace runs are replaced by '_'
        and only file names that also match the result are loaded. The
        original value is forwarded to ``fetch_ts``.

    Returns
    -------
    tuple
        ``(all_data, all_models)``. ``all_data`` maps every key yielded by
        ``fetch_ts`` to a dict with the entries "X", "y" and "dates";
        ``all_models`` maps the second '-'-separated token of each model
        file name to the object returned by ``joblib.load``.

    Raises
    ------
    Exception
        If no model file matches the prefix (and the country, if given).
    """
    ## if data path not specified, use generic
    if not data_dir:
        data_dir = os.path.join(".", "data", "cs-train")

    ## load all models (or filter for country)
    model_dir = os.path.join(".", "models")
    models = [f for f in os.listdir(model_dir) if re.search(prefix, f)]
    if country is not None:
        country_id = re.sub(r"\s+", "_", country.lower())
        models = [f for f in models if re.search(country_id, f)]

    if not models:
        if country is None:
            raise Exception(
                "Models with prefix '{}' cannot be found, did you train?".
                format(prefix))
        ## arguments are passed in the order the message names them
        raise Exception(
            "Model for '{0}' with prefix '{1}' cannot be found, did you train it?"
            .format(country, prefix))

    ## store model in dictionary
    ## key = second '-'-separated token of the file name
    ## value = model
    all_models = {
        re.split("-", model)[1]: joblib.load(os.path.join(model_dir, model))
        for model in models
    }

    ## load data
    ts_data = fetch_ts(data_dir, country=country)
    all_data = {}
    for name, df in ts_data.items():
        X, y, dates = engineer_features(df, training=training)
        dates = np.array([str(d) for d in dates])
        all_data[name] = {"X": X, "y": y, "dates": dates}

    return (all_data, all_models)
コード例 #9
0
def ingest():
    """
    Re-run the time-series ingestion on data/cs-train with ``clean=True``,
    print the shape of every returned item and answer with ``jsonify(True)``.
    """
    source_dir = os.path.join("data", "cs-train")

    print('### API ingest entering ###')
    print("...fetching data")
    fetched = fetch_ts(source_dir, clean=True)

    ## report what came back: key followed by the shape of its item
    for name, frame in fetched.items():
        print(name, frame.shape)

    print("... re-ingesting complete")
    return jsonify(True)
コード例 #10
0
def model_train(data_dir, test=False):
    """
    Train one model for every time series returned by ``fetch_ts``.

    data_dir - directory handed to ``fetch_ts``
    test     - when truthy, only the 'all' and 'united_kingdom' series are
               trained; the flag is also forwarded to ``_model_train``
    """

    ## multiple models were compared

    def _is_selected(name):
        ## outside test mode every series is trained
        return not test or name in ['all', 'united_kingdom']

    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    if test:
        print("... test flag on")
        print("...... subseting data")
        print("...... subseting countries")

    series = fetch_ts(data_dir)
    for name, frame in series.items():
        if _is_selected(name):
            _model_train(frame, name, test=test)
コード例 #11
0
ファイル: model.py プロジェクト: mucheinz/capstone-w
def model_train(data_dir, test=False):
    """
    Train one model for every time series returned by ``fetch_ts``.

    data_dir - directory handed to ``fetch_ts``
    test     - when truthy, only the 'all' and 'united_kingdom' series are
               trained; the flag is also forwarded to ``_model_train``
    """

    # create the model directory on first use
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    if test:
        print("... test flag on",
              "...... subsetting data",
              "...... subsetting countries",
              sep="\n")

    # time-series formatted data, one entry per series
    for country, df in fetch_ts(data_dir).items():
        skipped = test and country not in ('all', 'united_kingdom')
        if not skipped:
            _model_train(df, country, test=test)
コード例 #12
0
def model_load(prefix='sl', data_dir=None, training=True):
    """
    Return the models loaded by ``model_load_only`` together with the
    engineered data of every time series.

    prefix   - forwarded to ``model_load_only``
    data_dir - directory handed to ``fetch_ts``; when falsy it defaults to
               Final_Capstone/cs-train below PARENT_DIR
    training - forwarded unchanged to ``engineer_features``

    The result is the tuple (all_data, all_models); all_data maps every key
    of ``fetch_ts`` to a dict with the entries "X", "y" and "dates".
    """

    def _prepare(frame):
        ## features, target and the dates as an array of strings
        X, y, dates = engineer_features(frame, training=training)
        return {"X": X, "y": y, "dates": np.array([str(d) for d in dates])}

    data_dir = data_dir or os.path.join(PARENT_DIR, "Final_Capstone/cs-train")

    all_models = model_load_only(prefix=prefix)

    ## load data
    all_data = {
        country: _prepare(df)
        for country, df in fetch_ts(data_dir).items()
    }

    return (all_data, all_models)
コード例 #13
0
def model_train(prefix='sl',
                data_dir=DATA_DIR,
                test=False,
                countries=False,
                model=DEFAULT_MODEL,
                model_param_grid=DEFAULT_PARAM_GRID,
                scaler=DEFAULT_SCALER):
    """
    Train one model for every selected time series from ``fetch_ts``.

    test      - when truthy, only 'all' and 'united_kingdom' are trained
    countries - when truthy, only series whose key is contained in it are
                trained
    All remaining arguments are forwarded to ``_model_train``.
    """

    model_dir_exists = os.path.isdir(MODEL_DIR)
    if not model_dir_exists:
        os.mkdir(MODEL_DIR)

    if test:
        for message in ("... test flag on",
                        "...... subseting data",
                        "...... subseting countries"):
            print(message)

    ## a series is skipped when test mode excludes it or when an explicit
    ## country selection does not contain it
    for name, frame in fetch_ts(data_dir).items():
        if (test and name not in ['all', 'united_kingdom']) or (
                countries and name not in countries):
            continue
        _model_train(prefix,
                     frame,
                     name,
                     test=test,
                     model=model,
                     model_param_grid=model_param_grid,
                     scaler=scaler)
コード例 #14
0
def model_load(country, prefix='sl', data_dir=None, training=True):
    """
    Load the models stored for ``country`` together with the engineered
    data of every time series.

    country  - combined with the prefix into the pattern "<prefix>-<country>"
               that model file names in MODEL_DIR are searched for
    prefix   - first part of that pattern
    data_dir - directory handed to ``fetch_ts``; defaults to DATA_DIR
    training - forwarded unchanged to ``engineer_features``

    Returns the tuple (all_data, all_models) and raises an Exception when
    no file name matches the pattern.
    """
    ## NOTE: this silences warnings for the whole process
    warnings.filterwarnings("ignore")

    data_dir = data_dir or os.path.join(DATA_DIR)

    # collect the file names that match "<prefix>-<country>"
    model_name = prefix + '-' + country
    models = []
    for file_name in os.listdir(os.path.join(MODEL_DIR)):
        if re.search(model_name, file_name):
            models.append(file_name)

    if not models:
        raise Exception(
            "Models with prefix '{}' cannot be found did you train?".format(
                prefix))

    # key = second '-'-separated token of the file name, value = model
    all_models = {
        re.split("-", file_name)[1]: joblib.load(
            os.path.join(MODEL_DIR, file_name))
        for file_name in models
    }

    # load data
    all_data = {}
    for name, frame in fetch_ts(data_dir).items():
        X, y, dates = engineer_features(frame, training=training)
        all_data[name] = {
            "X": X,
            "y": y,
            "dates": np.array([str(d) for d in dates]),
        }

    return (all_data, all_models)
コード例 #15
0
        mask = np.arange(X.shape[0]) < np.arange(X.shape[0])[-30]
        X = X[mask]
        y = y[mask]
        dates = dates[mask]
        X.reset_index(drop=True, inplace=True)

    return (X, y, dates)


if __name__ == "__main__":

    # Script entry point: fetch the time-series data once, report how long
    # the fetch took and print the shape of every returned item.
    run_start = time.time()
    data_dir = os.path.join("..", "data", "cs-train")
    print("...fetching data")

    ts_all = fetch_ts(data_dir, clean=False)

    # split the elapsed seconds into hours, minutes and seconds
    m, s = divmod(time.time() - run_start, 60)
    h, m = divmod(m, 60)
    print("load time:", "%d:%02d:%02d" % (h, m, s))

    # one line per entry: its key followed by the shape of its item
    for key, item in ts_all.items():
        print(key, item.shape)

##  the data ingestion exists as a function or script to facilitate automation

# name of the directory that holds the model files
MODEL_DIR = "models"
# version number of the model
MODEL_VERSION = 0.1
# free-text note accompanying the model version
# NOTE(review): "learing" looks like a typo for "learning"; left unchanged
# because the string is a runtime value that may already appear in logs
MODEL_VERSION_NOTE = "supervised learing model for time-series"