Example 1
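Neither example is shown with its module-level context. A minimal sketch of the imports, constants, and helpers this first function appears to rely on (the project helpers paths, qualify, publish, execute and the TIMEZONE/DAYS_* constants are defined elsewhere in the asystem codebase and are only named here, not reconstructed):

import datetime
import os
import shutil
import sys
import tempfile
import time

import dill
import joblib  # may also be imported as sklearn.externals.joblib in older code
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

# Assumed to exist in the surrounding module:
#   TIMEZONE, DAYS_BLACK_LIST, DAYS_PLOT, DAYS_PLOT_DEBUG   (configuration constants)
#   paths(), qualify(), publish(), execute()                 (project helper functions)
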
def pipeline():

    remote_data_path = sys.argv[1] if len(sys.argv) > 1 else \
        "s3a://asystem-astore-staging"
    remote_model_path = sys.argv[2] if len(sys.argv) > 2 else \
        "s3a://asystem-amodel-staging/asystem/amodel/energyforecastintraday"
    local_model_path = sys.argv[3] if len(sys.argv) > 3 else \
        tempfile.mkdtemp()
    print("Pipeline starting on [{}]\n".format(remote_data_path))

    time_start = int(round(time.time()))
    spark = SparkSession.builder \
        .appName("asystem-amodel-energyforecastintraday").getOrCreate()
    print("Session created ...")

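    # Read the raw 'energy' and 'sun' metric partitions from the canonical
    # Parquet store; the project's paths()/qualify() helpers appear to expand
    # the bucket prefix and glob patterns into concrete parquet file listings.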
    ds_energy = spark.read.parquet(*paths(
        qualify(remote_data_path +
                "/[0-9]/asystem/astore/processed/canonical/parquet/dict/snappy"
                ), ["/*/*/*/*/astore_metric=energy"], "/*.snappy.parquet"))
    ds_sun = spark.read.parquet(*paths(
        qualify(remote_data_path +
                "/[0-9]/asystem/astore/processed/canonical/parquet/dict/snappy"
                ), ["/*/*/*/*/astore_metric=sun"], "/*.snappy.parquet"))
    print("Listing finished ...")

    ds_energy.createOrReplaceTempView('energy')
    ds_energy.cache()
    df_energy = spark.sql("""
        SELECT
          bin_timestamp,
          data_value / data_scale AS bin_energy
        FROM energy
        WHERE
          data_metric='energy__production__inverter' AND 
          data_type='integral' AND
          bin_width=1 AND
          bin_unit='day'
        ORDER BY bin_timestamp ASC
    """).toPandas()
    ds_sun.createOrReplaceTempView('sun')
    ds_sun.cache()
    df_sun_rise = spark.sql("""
        SELECT
          bin_timestamp,
          data_value / data_scale AS bin_sunrise
        FROM sun
        WHERE          
          data_metric='sun__outdoor__rise' AND
          data_type='epoch' AND
          bin_width=1 AND
          bin_unit='day'
        ORDER BY bin_timestamp ASC
    """).toPandas()
    df_sun_set = spark.sql("""
        SELECT
          bin_timestamp,
          data_value / data_scale AS bin_sunset
        FROM sun
        WHERE          
          data_metric='sun__outdoor__set' AND
          data_type='epoch' AND
          bin_width=1 AND
          bin_unit='day'
        ORDER BY bin_timestamp ASC
    """).toPandas()
    spark.catalog.clearCache()
    print("Dataframes collected ...")

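    # Re-key each frame by local calendar date (bin_timestamp is epoch seconds,
    # converted to TIMEZONE), then join each day's maximum cumulative energy and
    # its sunrise/sunset epochs onto every energy reading for that day.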
    df = df_energy.set_index(
        pd.to_datetime(df_energy['bin_timestamp'],
                       unit='s').dt.tz_localize('UTC').dt.tz_convert(TIMEZONE))
    df['bin_date'] = df.index.date
    df.set_index('bin_date', inplace=True)
    df_energy_day = df.groupby(df.index)['bin_energy'].max().to_frame() \
        .rename(columns={'bin_energy': 'bin_energy_day'})
    df = df.merge(df_energy_day,
                  how='inner',
                  left_index=True,
                  right_index=True)
    df_sun_rise.set_index(
        pd.to_datetime(df_sun_rise['bin_timestamp'],
                       unit='s').dt.tz_localize('UTC').dt.tz_convert(TIMEZONE),
        inplace=True)
    df_sun_rise['bin_date'] = df_sun_rise.index.date
    df_sun_rise.set_index('bin_date', inplace=True)
    df = df.merge(df_sun_rise.groupby(
        df_sun_rise.index)['bin_sunrise'].max().to_frame(),
                  how='inner',
                  left_index=True,
                  right_index=True)
    df_sun_set.set_index(
        pd.to_datetime(df_sun_set['bin_timestamp'],
                       unit='s').dt.tz_localize('UTC').dt.tz_convert(TIMEZONE),
        inplace=True)
    df_sun_set['bin_date'] = df_sun_set.index.date
    df_sun_set.set_index('bin_date', inplace=True)
    df = df.merge(df_sun_set.groupby(
        df_sun_set.index)['bin_sunset'].max().to_frame(),
                  how='inner',
                  left_index=True,
                  right_index=True)
    df.set_index(
        pd.to_datetime(df['bin_timestamp'],
                       unit='s').dt.tz_localize('UTC').dt.tz_convert(TIMEZONE),
        inplace=True)
    df.sort_index(inplace=True)
    print("Output compiled ...")
    print("\nTraining data:\n{}\n\n".format(df.describe()))

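    # Split the readings into one frame per day and bucket each day as VETTED
    # (usable for model building), PURGED (on the blacklist), or TOVETT
    # (today or later, presumably still awaiting review).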
    dfvs = {'VETTED': {}, 'PURGED': {}, 'TOVETT': {}}
    for dfs in df.groupby(df.index.date):
        day = dfs[0].strftime('%Y/%m/%d')
        dfvs[('PURGED' if day in DAYS_BLACK_LIST else
              ('TOVETT' if day >= datetime.datetime.now().strftime("%Y/%m/%d")
               else 'VETTED'))][day] = dfs[1]

    for vetting in dfvs:
        for day, dfv in sorted(dfvs[vetting].items()):
            dfv.set_index(pd.to_datetime(
                dfv['bin_timestamp'],
                unit='s').dt.tz_localize('UTC').dt.tz_convert(TIMEZONE),
                          inplace=True)
            if DAYS_PLOT and DAYS_PLOT_DEBUG:
                dfv.plot(title="Energy ({}) - {}".format(day, vetting),
                         y=['bin_energy', 'bin_energy_day'])

    for vetting in dfvs:
        print("Processed {} {} days ...".format(len(dfvs[vetting]),
                                                vetting.lower()))

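    # Build the model: for each vetted day, normalise cumulative energy to a
    # 0..1 fraction of that day's total and map timestamps onto a 0..1000 scale
    # between sunrise and sunset, then average the per-day curves into a single
    # mean daylight production profile.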
    dfnss = []
    bins = 1000
    for day, dfv in sorted(dfvs['VETTED'].items()):
        dfv['normalised'] = dfv['bin_energy'] / dfv['bin_energy_day']
        dfv['standardised'] = bins * (
                dfv['bin_timestamp'] - dfv['bin_sunrise']) / \
                              (dfv['bin_sunset'] - dfv['bin_sunrise'])
        dfv['standardised'] = dfv['standardised'].clip(0, bins).astype(int)
        dfns = dfv.drop([
            'bin_timestamp', 'bin_energy', 'bin_energy_day', 'bin_sunrise',
            'bin_sunset'
        ],
                        axis=1).drop_duplicates()
        dfns.set_index('standardised', inplace=True)
        dfns.sort_index(inplace=True)
        dfns = dfns[~dfns.index.duplicated(keep='first')]
        dfns = dfns.reindex(np.arange(0, bins + 1)).ffill()
        dfns.loc[0:10] = 0
        dfns.loc[990:1000] = 1
        dfnss.append(dfns)
        if DAYS_PLOT and DAYS_PLOT_DEBUG:
            dfns.plot(title="Energy ({}) - VETTED".format(day))
    dfnsa = pd.concat(dfnss, axis=1, ignore_index=True)
    if DAYS_PLOT:
        dfnsa.plot(title="Energy Normalised/Standardised (All) - VETTED",
                   legend=False)
    dfnsa = pd.concat(dfnss)
    dfnsa = dfnsa.groupby(dfnsa.index).mean()
    if DAYS_PLOT:
        dfnsa.plot(title="Energy Normalised/Standardised (Mean) - VETTED",
                   legend=False)
    print("Model built ...")

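    # Serialise the mean profile together with a dill-pickled execute() function
    # into a single joblib artifact. The ${...} tokens in the path look like
    # Maven-style placeholders resolved at build time, so they are left as-is.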
    model_file = '/model/pickle/joblib/none/' \
                 'amodel_version=${project.version}/amodel_model=${asystem-model-energyforecast-intraday.build.version}/model.pkl'
    local_model_file = local_model_path + model_file
    remote_model_file = remote_model_path + model_file
    if os.path.exists(os.path.dirname(local_model_file)):
        shutil.rmtree(os.path.dirname(local_model_file))
    os.makedirs(os.path.dirname(local_model_file))
    # dill.dumps() returns the pickled function as raw bytes; a bytes payload
    # (unlike the original StringIO buffer) survives joblib pickling on Python 3.
    pickled_execute = dill.dumps(execute)
    joblib.dump({
        'pipeline': dfnsa,
        'execute': pickled_execute
    },
                local_model_file,
                compress=True)
    print("Model serialised ...")

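    # Round-trip sanity check: reload the artifact and run the embedded
    # execute() function on a handful of sample feature rows.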
    model = joblib.load(local_model_file)
    dfi = pd.DataFrame([{
        "energy__production_Dforecast_Ddaylight__inverter": 0
    }, {
        "energy__production_Dforecast_Ddaylight__inverter": 250
    }, {
        "energy__production_Dforecast_Ddaylight__inverter": 500
    }, {
        "energy__production_Dforecast_Ddaylight__inverter": 750
    }, {
        "energy__production_Dforecast_Ddaylight__inverter":
        1000
    }]).apply(pd.to_numeric, errors='ignore')
    dfo = dill.loads(model['execute']) \
        (model=model, features=dfi, prediction=True)
    print("Model de-serialised ...")
    print("\nEnergy Mean Input:\n{}\n\nEnergy Mean Output:\n{}\n".format(
        dfi, dfo))

    publish(local_model_file, remote_model_file)
    shutil.rmtree(local_model_path)
    print("Model published ...")

    print("\nPipeline finished in [{}] s".format(
        int(round(time.time())) - time_start))
Example 2
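As with Example 1, the surrounding module is not shown. A rough sketch of what this variant appears to need (paths and qualify are the same project helpers as above):

import re
import sys
import tempfile
import time

import pandas as pd
from pyspark.sql import SparkSession
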
def pipeline():

    remote_data_path = sys.argv[1] if len(
        sys.argv) > 1 else "s3a://asystem-astore"
    print("Pipeline starting on [{}]\n".format(remote_data_path))

    time_start = int(round(time.time()))
    spark = SparkSession.builder.appName(
        "asystem-amodel-dataset").getOrCreate()
    print("Session created ...")

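    # Read the 'temperature' metric partitions from the canonical Parquet store,
    # again using the project's paths()/qualify() helpers to build the file globs.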
    dataset = spark.read.parquet(*paths(
        qualify(
            remote_data_path +
            "/[0-9]/asystem/astore/processed/canonical/parquet/dict/snappy"),
        ["/*/*/*/*/astore_metric=temperature"], "/*.snappy.parquet"))
    print("Listing finished ...")

    dataset.createOrReplaceTempView('dataset')
    dataset.cache()
    dataset = spark.sql("""
        SELECT
          bin_timestamp AS timestamp,
          data_metric AS metric,
          data_temporal AS temporal,
          data_value / data_scale AS temperature
        FROM dataset
        WHERE
          data_temporal='current' AND
          data_type='point' AND
          data_version=2 AND
          data_metric NOT LIKE '%forecast%' AND
          data_metric NOT LIKE '%parents' AND
          data_metric NOT LIKE '%shed' AND
          data_metric NOT LIKE '%roof'
        ORDER BY timestamp
    """)
    dataframe = dataset.toPandas()
    spark.catalog.clearCache()
    print("Dataframe collected ...")

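    # Pivot to one column per temperature sensor indexed by local time, fill
    # gaps, resample to 5-minute means, drop rows where any sensor reads outside
    # -10..50 (likely outlier filtering), and keep only the trailing segment of
    # each metric name as the column label.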
    dataframe = dataframe.pivot_table(values='temperature',
                                      index='timestamp',
                                      columns='metric')
    dataframe = dataframe.set_index(
        pd.to_datetime(
            dataframe.index,
            unit='s').tz_localize('UTC').tz_convert('Australia/Perth'))
    dataframe = dataframe.loc[(dataframe.index.strftime('%Y-%m-%d') >=
                               '2018-07-19')]
    dataframe = dataframe.fillna(method='bfill')
    dataframe = dataframe.fillna(method='ffill')
    dataframe = dataframe.resample('300S').mean()
    dataframe = dataframe.fillna(method='bfill')
    dataframe = dataframe.fillna(method='ffill')
    dataframe = dataframe.round(1)
    dataframe = dataframe.loc[(dataframe < 50).all(axis=1), :]
    dataframe = dataframe.loc[(dataframe > -10).all(axis=1), :]
    dataframe.columns = dataframe.columns.map(
        lambda name: re.compile('.*__.*__(.*)').sub('\\1', name))
    print("Output compiled ...")
    print("\nTraining data:\n{}\n\n".format(dataframe.describe()))

    output = tempfile.NamedTemporaryFile(prefix='asystem-temperature-',
                                         suffix='.csv',
                                         delete=False).name
    dataframe.to_csv(output)
    print("Wrote output to [{}]".format(output))

    print("\nPipeline finished in [{}] s".format(
        int(round(time.time())) - time_start))
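Both functions are written as Spark driver entry points: the first takes optional data/model/scratch paths on the command line and publishes a pickled intraday energy-forecast model, the second writes a temperature training CSV to a temporary file. A minimal, hypothetical way to run either one (the script name is illustrative):

if __name__ == '__main__':
    pipeline()

invoked with something like spark-submit pipeline.py s3a://asystem-astore-staging, where any omitted arguments fall back to the defaults at the top of the function.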