Example no. 1
0
def get_data(bq_client, bq_storage_client):
    """Build forecast data for the city- and country-level DAU tables.

    For each input table, fetches the raw data, prepares it over the
    fixed training window, and forecasts. The country results are merged
    into the city results (country entries win on key collisions) and
    the combined mapping is returned.
    """
    combined = None
    for table in ("light_funnel_dau_city", "light_funnel_dau_country"):
        raw = get_raw_data(bq_client, bq_storage_client, table)
        clean, clean_training = prepare_data(raw, s2d('2016-04-08'),
                                             s2d('2020-01-30'))
        forecasted = forecast(clean_training, clean)
        if combined is None:
            combined = forecasted
        else:
            combined.update(forecasted)
    return combined
Example no. 2
0
def dataFilter(data, product):
    """Return a copy of ``data`` restricted to the product's valid window.

    Rows before the product's known launch/start date are dropped, and a
    per-product anomaly window is excluded, when either is defined.
    """
    # First reliable date of data per product.
    startDates = {
        "desktop_global": s2d('2016-04-08'),
        "fxa_global": s2d('2018-03-20'),
        "fxa_tier1": s2d('2018-03-20'),
        "Fennec Android": s2d('2017-03-04'),
        "Focus iOS": s2d('2017-12-06'),
        "Focus Android": s2d('2017-07-17'),
        "Fennec iOS": s2d('2017-03-03'),
        "Fenix": s2d('2019-07-03'),
        "Firefox Lite": s2d('2017-03-04'),
        "FirefoxForFireTV": s2d('2018-02-04'),
        "FirefoxConnect": s2d('2018-10-10'),
        "nondesktop_nofire_global": s2d('2017-01-30'),
        "nondesktop_nofire_tier1": s2d('2017-01-30'),
    }

    # (start, end) of a known-bad stretch of data per product.
    anomalyDates = {
        "desktop_global": [s2d('2019-05-16'),
                           s2d('2019-06-07')],
        "Focus Android": [s2d('2018-09-01'),
                          s2d('2019-03-01')],
        "Fennec iOS": [s2d('2017-11-08'), s2d('2017-12-31')],
    }

    filtered = data.copy()
    # The @-prefixed names in the query strings resolve against these
    # locals, so the variable names must match the strings exactly.
    startDate = startDates.get(product)  # noqa: F841
    if startDate is not None:
        filtered = filtered.query("ds >= @startDate")
    window = anomalyDates.get(product)
    if window is not None:
        anomalyStartDate, anomalyEndDate = window  # noqa: F841
        filtered = filtered.query("(ds < @anomalyStartDate) | (ds > @anomalyEndDate)")
    return filtered
Example no. 3
0
def _getSinglePrediciton(model, data, trainingEndDate, targetDate):
    """Fit ``model`` on data up to ``trainingEndDate`` and predict ``targetDate``.

    Returns a (yhat, yhat_lower, yhat_upper) tuple for the single target day.
    NOTE(review): the name's "Prediciton" typo is kept — renaming would
    break callers.
    """
    training = data.query("ds <= @trainingEndDate")
    model.fit(training)
    # Avoid shadowing the module-level `forecast` function with a local.
    prediction = model.predict(pd.DataFrame({'ds': [s2d(targetDate)]}))
    return (prediction.yhat[0],
            prediction.yhat_lower[0],
            prediction.yhat_upper[0])
Example no. 4
0
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import pandas as pd
from fbprophet import Prophet
from dscontrib.jmccrosky.forecast.utils import s2d

# The only holidays we have identified the need to explicitly model are
# Chinese New Year and Holi.
# Prophet holiday frame: one row per observed Chinese New Year date,
# with a +/-20 day effect window around each.
chinese_new_year = pd.DataFrame({
    'ds': [s2d(day) for day in (
        "2016-02-08",
        "2017-01-28",
        "2018-02-16",
        "2019-02-05",
        "2020-01-25",
    )],
    'holiday': "chinese_new_year",
    'lower_window': -20,
    'upper_window': 20,
})

holi = pd.DataFrame({
    'ds': [
        s2d("2016-03-06"),
        s2d("2017-03-13"),
        s2d("2018-03-02"),
Example no. 5
0
def pipeline(bq_client, bq_storage_client, output_bq_client):
    """Compute forecast deviations for each metric and publish to BigQuery.

    For every input table in ``metrics``, pulls the raw data, fits
    per-geography forecasts over the fixed training window, and collects
    per-day deviation rows. The ``analysis.deviations`` table is dropped
    (if present), recreated with an explicit schema, and the rows are
    streamed in batches of 10,000.

    Returns a tuple ``(output_data, errors)`` where ``output_data`` is
    the full DataFrame of rows and ``errors`` is the combined list of
    insert errors from all batches (empty when every insert succeeded).
    """
    # Map input BigQuery table name -> output metric label.
    metrics = {
        "light_funnel_dau_city":
        "desktop_dau",
        "light_funnel_dau_country":
        "desktop_dau",
        "light_funnel_mean_active_hours_per_profile_city":
        "mean_active_hours_per_client",
        "light_funnel_mean_active_hours_per_profile_country":
        "mean_active_hours_per_client",
    }

    columns = ["date", "metric", "deviation", "ci_deviation", "geography"]
    # Seed with an empty frame so the concat below works (and preserves
    # column order) even if no forecast rows are produced.
    frames = [pd.DataFrame({col: [] for col in columns}, columns=columns)]

    for metric in metrics:
        raw_data = get_raw_data(bq_client, bq_storage_client, metric)
        (clean_data,
         clean_training_data) = prepare_data(raw_data, s2d('2016-04-08'),
                                             s2d('2020-01-30'))
        forecast_data = forecast(clean_training_data, clean_data)

        for geo in forecast_data:
            frames.append(pd.DataFrame(
                {
                    "date":
                    pd.to_datetime(
                        forecast_data[geo].ds).dt.strftime("%Y-%m-%d"),
                    "metric":
                    metrics[metric],
                    "deviation":
                    forecast_data[geo].delta,
                    "ci_deviation":
                    forecast_data[geo].ci_delta,
                    "geography":
                    geo,
                },
                columns=columns))
    # Concatenate once at the end: pd.concat inside the loop is
    # accidentally quadratic in the number of geographies.
    output_data = pd.concat(frames, ignore_index=True)

    # Recreate the destination table from scratch.
    dataset_ref = output_bq_client.dataset("analysis")
    table_ref = dataset_ref.table("deviations")
    try:
        output_bq_client.delete_table(table_ref)
    except NotFound:
        pass  # First run: nothing to delete.
    schema = [
        bigquery.SchemaField('date', 'DATE', mode='REQUIRED'),
        bigquery.SchemaField('metric', 'STRING', mode='REQUIRED'),
        bigquery.SchemaField('deviation', 'FLOAT', mode='REQUIRED'),
        bigquery.SchemaField('ci_deviation', 'FLOAT', mode='REQUIRED'),
        bigquery.SchemaField('geography', 'STRING', mode='REQUIRED'),
    ]
    table = bigquery.Table(table_ref, schema=schema)
    table = output_bq_client.create_table(table)

    # Accumulate errors across batches: the original kept only the last
    # batch's errors, and `errors` was unbound (NameError on return)
    # whenever output_data was empty.
    errors = []
    n = len(output_data)
    for i in range(0, n, 10000):
        batch = output_data[i:i + 10000]
        errors.extend(output_bq_client.insert_rows(
            table, list(batch.itertuples(index=False, name=None))))
    return (output_data, errors)