Example #1
def load_page_data(from_cache=True):
  logger.debug("Loading data for app_tests ...")
  global df_demand, df_active_alternatives, df_baseline_vs_tests

  df_demand = demand_data(spark, from_cache)
  df_active_alternatives = active_tests_results(spark, from_cache)
  df_baseline_vs_tests = baseline_versus_tests(spark, from_cache)
Example #2
def update_selected_experiment(selected_city, selected_series_type, start_date, end_date, selected_experiment):
  logger.debug("Updating selected experiment: %s selected_city: %s start_date: %s end_date: %s selected_series_type: %s",
               selected_experiment, selected_city, start_date, end_date, selected_series_type)

  # Setting filters
  filters = {}
  if selected_city != -1:
    filters = {'city': selected_city}

  df_exp_agg = get_experiment_agg_data(selected_experiment, start_date, end_date, filters, 'date, alternative_id')

  # x_range, y_range_reta, y_range, conversion, price, arpu, elasticity_label = elasticity_for_experiment(selected_experiment, initial_window_date = start_date, base_date = end_date, field = selected_series_type)
  # elasticity_plot = plot_pef_elasticity(x_range, y_range_reta, y_range, conversion, price, arpu)

  fig_arpu, fig_ratio, fig_ticket, fig_conversion = plot_experiment_comparison(df_exp_agg, BASE_FIELDS[selected_series_type]['fields'])
  format_figure(fig_arpu, showlegend=True)

  for fig in [fig_ratio, fig_ticket, fig_conversion]:
    format_figure(fig)

  logger.debug("Updating selected experiment: %s FINISHED", selected_experiment)

  arpu_label = 'ARPU - ' + BASE_FIELDS[selected_series_type]['label']
  ticket_label = 'Ticket Médio - ' + BASE_FIELDS[selected_series_type]['label']
  conversion_label = 'Conversão (pagos/registered) - ' + BASE_FIELDS[selected_series_type]['label']

  # elasticity_label = 'Elasticidade PEF - ' + elasticity_label

  return fig_arpu, fig_ratio, fig_ticket, fig_conversion, arpu_label, ticket_label, conversion_label
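
BASE_FIELDS is not defined anywhere in these examples; from the lookups above (BASE_FIELDS[selected_series_type]['fields'] and ['label']) it appears to map a series type to a list of plot fields plus a display label. A hypothetical sketch of its shape, with made-up keys and values:

# Hypothetical shape of BASE_FIELDS, inferred from usage only; the real
# mapping is not part of these examples.
BASE_FIELDS = {
    'paids': {
        'fields': ['arpu', 'ratio', 'ticket_medio', 'conversao'],  # assumed field names
        'label': 'Paids'
    }
}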
Example #3
def load_daily_share_rpu(spark, from_cache=False):
    logger.debug("Loading daily_share_rpu ... ")

    query = "SELECT * FROM data_science.consolidated_rpu_per_day WHERE level='Graduação'"
    df = load_from_db_cache(spark, query, 'consolidated_rpu_per_day',
                            from_cache)

    # Pricing log: city -> [price change in %, day of year of the change]

    pricing_log = {
        "RECIFE": [-15, 14],
        "CURITIBA": [-10, 14],
        "RIO DE JANEIRO": [-10, 14],
        "JOÃO PESSOA": [-20, 14],
        "BELEM": [-5, 14],
        "SALVADOR": [-10, 14],
        "FORTALEZA": [-10, 14],
        "MANAUS": [10, 14],
        "PORTO VELHO": [20, 14],
        "CARUARU": [-50, 14],
        "FEIRA DE SANTANA": [-30, 14],
        "TERESINA": [-10, 14],
        "CAMPINAS": [-15, 14],
        "OSASCO": [-20, 14],
        "SÃO JOSÉ DOS CAMPOS": [-10, 14],
        "SOROCABA": [-30, 14],
        "SANTO ANDRE": [10, 14],
        "JUAZEIRO DO NORTE": [-10, 37],
        "CUIABA": [-20, 42],
        "BOA VISTA": [-20, 42]
    }

    if from_cache:
        # Simulating pricing logs
        df['pricing_change'] = np.nan
        df['pricing_log'] = np.nan

        for city, (change, day) in pricing_log.items():
            mask = (df['dia_ano'] == day) & (df['city'] == city)
            df.loc[mask, 'pricing_change'] = change
            df.loc[mask, 'pricing_log'] = (
                "Alteração de {}% para elevar ARPU e captação".format(change))

    rpu = df['receita_acumulada'] / df['ordens_acumuladas']
    rpu_18 = df['receita_acumulada_18'] / df['ordens_acumuladas_18']
    df['delta_rpu_20_19'] = rpu / rpu_18 - 1
    df['delta_pagos_20_19'] = df['pagos_acumulados'] / df['pagos_acumulados_18'] - 1

    # Excluding cities where the deltas are not defined
    df = df[(df['delta_rpu_20_19'] != np.inf) & (df['delta_pagos_20_19'] != np.inf)]
    df = df.sort_values(['city', 'dia_ano'])

    return df
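
load_from_db_cache itself is never shown in these examples. A minimal sketch of what such a helper might look like, assuming it executes the query through Spark, converts the result to pandas, and keeps a local parquet copy keyed by name (the path, format, and conversion are all assumptions):

import os

import pandas as pd


def load_from_db_cache(spark, query, name, from_cache=False, cache_dir='cache'):
    """Sketch only: run `query` via Spark, or reload a locally cached copy."""
    path = os.path.join(cache_dir, name + '.parquet')
    if from_cache and os.path.exists(path):
        return pd.read_parquet(path)
    df = spark.sql(query).toPandas()
    os.makedirs(cache_dir, exist_ok=True)
    df.to_parquet(path)
    return df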
Example #4
def load_page_data(from_cache=True):
    logger.debug("Loading data for app_goals ...")
    global df_goals_only_qb_time_series, df_ies_categorization, df_campaigns, df_goals_only_qb_time_series_summer

    df_goals_only_qb_time_series = rpu_goals_only_qb_time_series(
        spark, from_cache)
    df_ies_categorization = ies_categorization(spark, from_cache)
    df_campaigns = campaigns(spark, from_cache)
    df_goals_only_qb_time_series_summer = rpu_goals_only_qb_time_series_summer(
        spark, from_cache)
Example #5
def update_city_dropdown(selected_experiment, start_date, end_date):
  logger.debug("Updating city dropdown: %s", selected_experiment)
  aux = get_experiment_agg_data_per_city(selected_experiment, start_date, end_date)
  aux = aux[aux['order_id'] > MIN_N_POINTS]
  options = [{'label': 'All', 'value': -1}]
  for city, n_orders in zip(aux['city'].tolist(), aux['order_id'].tolist()):
      options.append({
          'label': '{} ({} orders)'.format(city, n_orders),
          'value': city
      })
  logger.debug("Updating city dropdown: %s FINISHED", selected_experiment)
  return options, -1
Example #6
def log_recalculate(spark, from_cache=False):
    logger.debug("Loading log_recalculate ... ")

    query = """
      with mudancas as
      (
        select *
        from parcerias.log_changes_pricing
      ),

      base as
      (
        select datas.dia,
          modalidade.kind,
          campus.university_id,
          campus.city,
          campus.state,
          campus.ies

        from (select distinct university_id, city, state, ies from parcerias.log_changes_pricing) as campus
        cross join (select explode(sequence(date('2019-12-01'), date(now()))) dia) as datas
        cross join (select distinct case when parent_id = 1 then 'Presencial' else 'EaD + Semi' end kind from querobolsa_production.kinds where kinds.parent_id is not null) as modalidade
      )

      select base.*,
        coalesce(mudancas.qtde,0.0) as qtde

      from base
      left join mudancas on base.dia = mudancas.dia
                        and base.kind = mudancas.kind
                        and base.university_id = mudancas.university_id
                        and base.city = mudancas.city
                        and base.state = mudancas.state

      order by base.dia, base.kind, mudancas.qtde
  """
    df = load_from_db_cache(spark, query, 'log_recalculate', from_cache)
    #df['university_id'] = df['university_id'].astype(int)

    # NUMBER OF IES PER RELEVANT CITIES
    #df['revenue_city'] = df.groupby(['dia','city','state','kind'])['qtde'].transform('sum')

    return df
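
The cross joins in this query build a dense day × kind × campus scaffold, so that days without any pricing change still appear with qtde = 0 after the left join. A toy pandas sketch of the same scaffold idea (column names follow the query; the data is made up):

import pandas as pd

# Sparse change log: only days with actual changes are present.
changes = pd.DataFrame({
    'dia': pd.to_datetime(['2019-12-02', '2019-12-04']),
    'kind': ['Presencial', 'EaD + Semi'],
    'qtde': [3.0, 1.0],
})

# Dense scaffold: every day x every kind, even when nothing changed.
days = pd.date_range('2019-12-01', '2019-12-05', freq='D')
kinds = ['Presencial', 'EaD + Semi']
scaffold = pd.MultiIndex.from_product(
    [days, kinds], names=['dia', 'kind']).to_frame(index=False)

# Left join and fill the gaps, mirroring coalesce(mudancas.qtde, 0.0).
dense = scaffold.merge(changes, on=['dia', 'kind'], how='left')
dense['qtde'] = dense['qtde'].fillna(0.0)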
Example #7
def demand_data(spark, from_cache=False):
    logger.debug("Loading pricing updates ... ")

    query = """
  SELECT
  *
  FROM
  data_science.base_ordens_experimentos
  WHERE
  base_ordens_experimentos.registered_at BETWEEN '2020-01-01' AND '2020-10-01'
  AND
  offered_price <> 0
  AND
  origin IN ('Quero Bolsa')
  """

    df = load_from_db_cache(spark, query, 'demand_data', from_cache)
    return df
Example #8
def load_daily_pef(spark, from_cache=False):
    logger.debug("Loading consolidated_pef_per_day ... ")
    query = """SELECT date,
                    city,
                    case when name = 'Presencial' then 'Presencial' else 'EaD + Semi' end kind,
                    avg(value) as value,
                    avg(original_value) as original_value,
                    avg(pef_desconto) as pef_desconto
   FROM data_science.consolidated_pef_per_day
   GROUP BY 1,2,3"""  # WHERE name = 'Presencial'

    df = load_from_db_cache(spark, query, 'consolidated_pef_per_day',
                            from_cache)
    df = df.sort_values(['city', 'date'])

    return df
Example #9
def updates_per_day(spark, from_cache=False):
    logger.debug("Loading pricing updates ... ")

    query = """
  select * from
  (
    select dia,
      origem,
      ies,
      city,
      state,
      kind,
      qtde,
      row_number() over (partition by ies, city, kind order by qtde desc) as ranking

    from parcerias.log_changes_pricing
  ) dd
  """
    df = load_from_db_cache(spark, query, 'pricing_updates', from_cache)
    return df
Example #10
def update_selected_alternative(alternative_selection, selected_city, start_date, end_date):
  logger.debug('update_selected_alternative: %s', alternative_selection)
  number_of_outputs = 6
  if alternative_selection is None:
    return number_of_outputs * (blank_fig(ROW_HEIGHTS[2]),)

  # Setting filters
  filters = {}
  if selected_city != -1:
    filters = {'city': selected_city}

  start = time.process_time()
  # Setting up the BayesianABTest
  df_cum_results = get_alternative_agg_data(alternative_selection, start_date, end_date, filters)


  # hist_data = {
  #     'conversion':{
  #         'mean': .07,
  #         'std': .2
  #     },
  #     'aov':{
  #         'mean': 500,
  #         'std':100
  #     }
  # }
  hist_data = None
  unique_alternatives = df_cum_results['alternative'].unique().tolist()
  abtest = BayesianABTest('arpu', unique_alternatives, hist_data=hist_data)

  # Feeding data to the test. The per-alternative totals are the last row of
  # each cumulative series, so compute them once outside the loop.
  last_data = df_cum_results.groupby('alternative').agg('last').reset_index()
  for alternative in unique_alternatives:
    row = last_data[last_data['alternative'] == alternative]

    abtest.feed_alternative_data(
        alternative,
        n_visits=row['n_visits'].values[0],
        n_paids=row['n_paids'].values[0],
        revenue=row['revenue'].values[0]
    )

  fig_arpu = abtest.plot_results(plotly=True)
  fig_prob2beat, fig_expLoss = abtest.plot_cumulative_results(df_cum_results, plotly=True)

  legend_colors = {}
  for i in range(len(fig_arpu.data)):
    legend_colors[fig_arpu.data[i]['legendgroup']] = fig_arpu.data[i]['marker']['color']
  format_figure(fig_arpu, showlegend=True, height=ROW_HEIGHTS[2])

  # Formatting and setting the same legend colors
  for fig in [fig_prob2beat, fig_expLoss]:
    format_figure(fig, showlegend=False, height=ROW_HEIGHTS[2])

    for i in range(len(fig.data)):
      label = fig.data[i]['legendgroup'].replace('variable=', '')
      fig.data[i]['line']['color'] = legend_colors[label]

  logger.debug("Time to run BayesianABTest: %s", time.process_time() - start)

  start = time.process_time()
  # Comparative plots
  fig_offered_price = plot_offered_price_alternative(df_cum_results, legend_colors)
  fig_price = plot_price_discount_alternative(df_cum_results)
  fig_customer_count = plot_customer_count(df_cum_results, legend_colors)
  logger.debug("Time to BayesianABTest plots: %s", time.process_time() - start)

  return fig_arpu, fig_prob2beat, fig_expLoss, fig_offered_price, fig_price, fig_customer_count
Example #11
def daily_order_pef(spark, from_cache=False):
    logger.debug("Loading daily_order_pef ... ")

    query = """
   WITH city_sales AS (
    SELECT DISTINCT
      sales.campus_city AS city,
      sales.campus_state AS state,
      round(sum(sales.total_revenue)) AS sales
    FROM
      data_warehouse.sales
      LEFT JOIN querobolsa_production.coupons ON coupons.id = sales.coupon_id
      LEFT JOIN querobolsa_production.offers ON offers.id = coupons.offer_id
      LEFT JOIN querobolsa_production.university_offers ON university_offers.id = offers.university_offer_id
      LEFT JOIN (SELECT * FROM  querobolsa_production.kinds WHERE parent_id IS NOT NULL) kinds ON kinds.name = sales.course_kind
      LEFT JOIN (SELECT * FROM querobolsa_production.levels WHERE parent_id IS NOT NULL) levels ON levels.name = sales.course_level
    WHERE
      university_offers.enrollment_semester IN ('2019.1','2019.2','2020.1')
      AND sales.campus_city IS NOT NULL
      AND sales.campus_city <> ''
    GROUP BY 1,2
    ORDER BY 3 DESC
  ),
  top_in_state AS (
  SELECT
    city_sales.*
  FROM
    (SELECT state, max(sales) AS max_sales FROM city_sales GROUP BY 1 ) AS ref
    JOIN city_sales ON city_sales.state = ref.state AND city_sales.sales = ref.max_sales
  ),
  top_cities AS (
    SELECT
      *
    FROM
      city_sales
    ORDER BY sales DESC
    LIMIT 40
  ),
  cidades_alvo AS(
    SELECT
    *
    FROM
      top_cities
    UNION SELECT * FROM top_in_state ORDER BY sales DESC
  )

  SELECT
    DATE(orders.registered_at) AS date,
    campuses.city,
    case when k.parent_id = 1 then 'Presencial' else 'EaD + Semi' end kind,
    AVG(orders.price) AS value
  FROM
    querobolsa_production.orders
  JOIN
    querobolsa_production.line_items ON orders.id = line_items.order_id
  JOIN
    querobolsa_production.pre_enrollment_fees ON pre_enrollment_fees.id = line_items.pre_enrollment_fee_id
  JOIN
    querobolsa_production.offers ON line_items.offer_id = offers.id
  JOIN
    querobolsa_production.courses ON offers.course_id = courses.id
  JOIN
    querobolsa_production.campuses ON campuses.id = courses.campus_id
  JOIN
    querobolsa_production.levels l ON courses.level = l.name AND l.parent_id IS NOT NULL
  JOIN
    querobolsa_production.levels ON l.parent_id = levels.id
  JOIN
    querobolsa_production.kinds k ON courses.kind = k.name AND k.parent_id IS NOT NULL
  JOIN
    querobolsa_production.kinds ON k.parent_id = kinds.id
  JOIN
    cidades_alvo ON campuses.city = cidades_alvo.city 

  WHERE
    orders.checkout_step NOT IN ('initiated')
  AND l.parent_id = 1
  AND orders.registered_at BETWEEN '2019-12-12' AND '2020-04-01'
  GROUP BY 1,2,3
  ORDER BY 1
  """

    df = load_from_db_cache(spark, query, 'daily_order_pef', from_cache)

    return df
Example #12
def baseline_versus_tests(spark, from_cache=False):
    logger.debug("Loading pricing updates ... ")

    query = """
  WITH base AS(
  SELECT
  DATE(base_ordens_experimentos.registered_at) AS date,
  fee_experiment_id,
  -- alternative,
  CASE WHEN alternative = 'baseline' THEN 'baseline' ELSE 'testes' END AS alternative_kind, 
  SUM(CASE WHEN orders.checkout_step = 'paid' THEN 1 ELSE 0 END) AS paids,
  COUNT(DISTINCT customer_id) AS customers,
  SUM(CASE WHEN orders.checkout_step = 'paid' THEN 1 ELSE 0 END)/COUNT(DISTINCT customer_id) AS conversao,
  SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END)/SUM(CASE WHEN orders.checkout_step = 'paid' THEN 1 ELSE 0 END) AS ticket_medio,
  (SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END) + SUM(coalesce(sales.ltv_qp,0)))/SUM(CASE WHEN orders.checkout_step = 'paid' THEN 1 ELSE 0 END) AS ticket_medio_ltv,
  SUM(orders.price)/COUNT(DISTINCT customer_id) AS ticket_customer,
  SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END) AS revenue,
  SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END) + SUM(coalesce(sales.ltv_qp,0)) AS receita_com_ltv,
  (SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END) + SUM(coalesce(sales.ltv_qp,0)))/COUNT(DISTINCT customer_id) AS rpu,
  SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END)/COUNT(DISTINCT customer_id) AS rpu_sem_ltv

  FROM
  data_science.base_ordens_experimentos
  LEFT JOIN
  data_warehouse.sales ON base_ordens_experimentos.order_id = sales.order_id
  JOIN
  querobolsa_production.orders ON base_ordens_experimentos.order_id = orders.id
  JOIN
  querobolsa_production.line_items ON orders.id = line_items.order_id
  JOIN
  querobolsa_production.offers ON offers.id = line_items.offer_id
  JOIN
  querobolsa_production.courses ON courses.id = offers.course_id
  JOIN
  querobolsa_production.kinds k ON k.name = courses.kind
  JOIN
  querobolsa_production.kinds ON k.parent_id = kinds.id
  JOIN
  querobolsa_production.levels l ON l.name = courses.level
  JOIN
  querobolsa_production.levels ON l.parent_id = levels.id

  WHERE
  base_ordens_experimentos.registered_at BETWEEN '2020-04-01' AND current_date - interval 2 days
  AND
  origin IN ('Quero Bolsa')

  GROUP BY
  1,2,3

  ORDER BY
  1,2
  ),
  base_limpa AS (
  SELECT
  date,
  fee_experiment_id,
  alternative_kind,
  paids,
  customers,
  receita_com_ltv

  FROM
  base

  ORDER BY
  1,alternative_kind
  ),
  base_evolutivo AS(
  SELECT
  date,
  fee_experiment_id,
  SUM(CASE WHEN alternative_kind = 'baseline' THEN paids ELSE 0 END) AS baseline_paids,
  SUM(CASE WHEN alternative_kind = 'baseline' THEN customers ELSE 0 END) AS baseline_customers,
  SUM(CASE WHEN alternative_kind = 'baseline' THEN receita_com_ltv ELSE 0 END) AS baseline_revenue,
  SUM(CASE WHEN alternative_kind = 'testes' THEN paids ELSE 0 END) AS testes_paids,
  SUM(CASE WHEN alternative_kind = 'testes' THEN customers ELSE 0 END) AS testes_customers,
  SUM(CASE WHEN alternative_kind = 'testes' THEN receita_com_ltv ELSE 0 END) AS testes_revenue

  FROM
  base_limpa

  GROUP BY
  1,2
  ),
  base_consolidada AS (
  SELECT
  date,
  fee_experiment_id,

  -- Acumulado

  SUM(baseline_customers) OVER (
          PARTITION BY
          fee_experiment_id    
          ORDER BY
          date
          ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS baseline_customers_ac,
  SUM(baseline_paids) OVER (
          PARTITION BY
          fee_experiment_id    
          ORDER BY
          date
          ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS baseline_paids_ac,
  SUM(baseline_revenue) OVER (
          PARTITION BY
          fee_experiment_id    
          ORDER BY
          date
          ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS baseline_revenue_ac,
  SUM(testes_customers) OVER (
          PARTITION BY
          fee_experiment_id    
          ORDER BY
          date
          ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS testes_customers_ac,
  SUM(testes_paids) OVER (
          PARTITION BY
          fee_experiment_id    
          ORDER BY
          date
          ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS testes_paids_ac,
  SUM(testes_revenue) OVER (
          PARTITION BY
          fee_experiment_id    
          ORDER BY
          date
          ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS testes_revenue_ac,
          
  -- Média Móvel        
          
  AVG(baseline_customers) OVER (
          PARTITION BY
          fee_experiment_id    
          ORDER BY
          date
          ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS baseline_customers_mm,
  AVG(baseline_paids) OVER (
          PARTITION BY
          fee_experiment_id    
          ORDER BY
          date
          ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS baseline_paids_mm,
  AVG(baseline_revenue) OVER (
          PARTITION BY
          fee_experiment_id    
          ORDER BY
          date
          ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS baseline_revenue_mm,
  AVG(testes_customers) OVER (
          PARTITION BY
          fee_experiment_id    
          ORDER BY
          date
          ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS testes_customers_mm,
  AVG(testes_paids) OVER (
          PARTITION BY
          fee_experiment_id    
          ORDER BY
          date
          ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS testes_paids_mm,
  AVG(testes_revenue) OVER (
          PARTITION BY
          fee_experiment_id    
          ORDER BY
          date
          ROWS BETWEEN 6 PRECEDING AND CURRENT ROW) AS testes_revenue_mm
          
  FROM
  base_evolutivo
  ),
base_final AS(
  SELECT
  date,
  fee_experiment_id,
  baseline_revenue_ac,
  testes_revenue_ac,
  baseline_customers_ac,
  testes_customers_ac,
  baseline_paids_ac,
  testes_paids_ac,

  baseline_revenue_ac/baseline_customers_ac AS rpu_baseline_ac,
  testes_revenue_ac/testes_customers_ac AS rpu_testes_ac,

  baseline_paids_ac/baseline_customers_ac AS conversao_baseline_ac,
  testes_paids_ac/testes_customers_ac AS conversao_testes_ac,

  baseline_revenue_mm/baseline_customers_mm AS rpu_baseline_mm,
  testes_revenue_mm/testes_customers_mm AS rpu_testes_mm,

  baseline_paids_mm/baseline_customers_mm AS conversao_baseline_mm,
  testes_paids_mm/testes_customers_mm AS conversao_testes_mm

  FROM
  base_consolidada

  WHERE
  fee_experiment_id IN (56,57,58,59,60,61,62,63,65)
)
SELECT
*,
round(rpu_testes_ac/rpu_baseline_ac-1,2) AS rpu_gain,
round(conversao_testes_ac/conversao_baseline_ac-1,2) AS conversion_gain

FROM
base_final
  """

    df = load_from_db_cache(spark, query, 'baseline_versus_tests', from_cache)
    return df
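
The windowed _ac (running total) and _mm (7-day moving average) columns in this query map directly onto pandas groupby operations. A toy sketch of the pattern, partitioned by experiment as in the SQL (made-up numbers):

import pandas as pd

df = pd.DataFrame({
    'fee_experiment_id': [56] * 10,
    'date': pd.date_range('2020-04-01', periods=10, freq='D'),
    'baseline_revenue': range(10),
}).sort_values(['fee_experiment_id', 'date'])

g = df.groupby('fee_experiment_id')['baseline_revenue']
# ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW -> cumulative sum
df['baseline_revenue_ac'] = g.cumsum()
# ROWS BETWEEN 6 PRECEDING AND CURRENT ROW -> 7-day moving average
df['baseline_revenue_mm'] = g.transform(lambda s: s.rolling(7, min_periods=1).mean())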
Example #13
def active_tests_results(spark, from_cache=False):
    logger.debug("Loading pricing updates ... ")

    query = """
WITH
alternatives_per_day AS(
  SELECT
    date,
    experiments_aggregate_base.fee_experiment_id,
    collect_set(alternative_id) AS alternative_ids
  FROM
    data_science.experiments_aggregate_base
  WHERE
    date >= '2020-04-01'
  AND
    alternative_ratio IS NOT NULL
  GROUP BY
  1,2
),

active_alternatives AS(
  SELECT
    fee_experiment_id,
    alternative_ids
  FROM
    alternatives_per_day
  WHERE
    date = date_sub(current_date(),3)
),

test_start AS(
  SELECT
    alternatives_per_day.fee_experiment_id,
    min(alternatives_per_day.date) AS date
  FROM
    alternatives_per_day 
  JOIN 
    active_alternatives ON active_alternatives.alternative_ids = alternatives_per_day.alternative_ids
  GROUP BY
    1
),
base_resultados AS(
SELECT
  test_start.date AS test_start_date,
  base_ordens_experimentos.fee_experiment_id,
  alternative,
  CASE WHEN alternative = 'seasonality minus 25' THEN 'c'
       WHEN alternative = 'seasonality lowest' THEN 'd'
       WHEN alternative = 'seasonality minus 75' THEN 'e'
       ELSE alternative END AS ordem,
  SUM(CASE WHEN orders.checkout_step = 'paid' THEN 1 ELSE 0 END) AS paids,
  COUNT(DISTINCT customer_id) AS customers,
  SUM(CASE WHEN orders.checkout_step = 'paid' THEN 1 ELSE 0 END)/COUNT(DISTINCT customer_id) AS conversao,
  SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END)/SUM(CASE WHEN orders.checkout_step = 'paid' THEN 1 ELSE 0 END) AS ticket_medio,
  (SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END) + SUM(coalesce(sales.ltv_qp,0)))/SUM(CASE WHEN orders.checkout_step = 'paid' THEN 1 ELSE 0 END) AS ticket_medio_ltv,
  SUM(orders.price)/COUNT(DISTINCT customer_id) AS ticket_customer,
  SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END) AS revenue,
  SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END) + SUM(coalesce(sales.ltv_qp,0)) AS receita_com_ltv,
  (SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END) + SUM(coalesce(sales.ltv_qp,0)))/COUNT(DISTINCT customer_id) AS rpu,
  SUM(CASE WHEN orders.checkout_step = 'paid' THEN orders.price ELSE 0 END)/COUNT(DISTINCT customer_id) AS rpu_sem_ltv,
  AVG(base_ordens_experimentos.offered_price) AS offered_price

FROM
  data_science.base_ordens_experimentos
LEFT JOIN
  data_warehouse.sales ON base_ordens_experimentos.order_id = sales.order_id
JOIN
  querobolsa_production.orders ON base_ordens_experimentos.order_id = orders.id
JOIN
  querobolsa_production.line_items ON orders.id = line_items.order_id
JOIN
  querobolsa_production.offers ON offers.id = line_items.offer_id
JOIN
  querobolsa_production.courses ON courses.id = offers.course_id
JOIN
  querobolsa_production.kinds k ON k.name = courses.kind
JOIN
  querobolsa_production.kinds ON k.parent_id = kinds.id
JOIN
  querobolsa_production.levels l ON l.name = courses.level
JOIN
  querobolsa_production.levels ON l.parent_id = levels.id
JOIN
  test_start ON test_start.fee_experiment_id = base_ordens_experimentos.fee_experiment_id AND base_ordens_experimentos.registered_at >= test_start.date

WHERE
  origin IN ('Quero Bolsa')

GROUP BY
  1,2,3

ORDER BY
  1,2
)

SELECT
  *
FROM
  base_resultados
WHERE
  customers > 50
ORDER BY
  fee_experiment_id
  """

    df = load_from_db_cache(spark, query, 'active_tests_results', from_cache)
    return df
Example #14
def city2ies(spark, from_cache=False):
    logger.debug("Loading city2ies ... ")

    query = """
  WITH city_sales as (
    select distinct
      campuses.city_id,
      sales.campus_city as city,
      sales.campus_state as state,
      kinds.parent_id as kind_id,
      round(sum(sales.total_revenue)) as sales
    from
      data_warehouse.sales
      left join querobolsa_production.coupons on coupons.id = sales.coupon_id
      left join querobolsa_production.offers on offers.id = coupons.offer_id
      left join querobolsa_production.university_offers  on university_offers.id = offers.university_offer_id
      left join (select * from querobolsa_production.kinds where parent_id is not null) kinds on kinds.name = sales.course_kind
      left join (select * from querobolsa_production.levels where parent_id is not null) levels on levels.name = sales.course_level
      left join querobolsa_production.campuses on campuses.id = sales.campus_id
    where
      university_offers.enrollment_semester in ('2019.1','2019.2','2020.1')
      and sales.campus_city is not null
      and sales.campus_city <> ''
    group by 1,2,3,4
    order by 5 desc
  ),
  top_in_state as (
  select
    city_sales.*
  from
    (select kind_id, state, max(sales) as max_sales from city_sales group by 1,2 ) as ref
    join city_sales on city_sales.state = ref.state and city_sales.sales = ref.max_sales and ref.kind_id = city_sales.kind_id
  ),
  top_cities as (
    select
      *
    from
      city_sales
    order by sales desc
    limit 40
  ),
  filter_cities as (
  select * from top_cities union select * from top_in_state order by sales desc
  )

  select distinct
    campuses.city_id,
    sales.campus_city as city,
    sales.campus_state as state,
    case when kinds.id in (3,8) then 'EaD + Semi' else 'Presencial' end kind, 
    levels.name AS level,
    offers.university_id,
    universities.name,
    sum(sales.total_revenue) as revenue
  from
    data_warehouse.sales
    left join querobolsa_production.coupons on coupons.id = sales.coupon_id
    left join querobolsa_production.offers on offers.id = coupons.offer_id
    left join querobolsa_production.universities on universities.id = offers.university_id
    left join querobolsa_production.university_offers  on university_offers.id = offers.university_offer_id
    left join querobolsa_production.courses on sales.course_id = courses.id
    left join querobolsa_production.kinds k ON k.name = sales.course_kind AND k.parent_id IS NOT NULL
    left join querobolsa_production.kinds ON k.parent_id = kinds.id
    left join querobolsa_production.levels l ON l.name = sales.course_level AND l.parent_id IS NOT NULL
    left join querobolsa_production.levels ON l.parent_id = levels.id
    join filter_cities on filter_cities.city = sales.campus_city and filter_cities.state = sales.campus_state and filter_cities.kind_id = kinds.id
    left join querobolsa_production.campuses on campuses.id = sales.campus_id
  where
    sales.payment_date BETWEEN '2019-10-01' AND '2020-04-01'
    and offers.university_id is not null
    and levels.id = 1
    and kinds.id IN (1,3,8)
  group by 1,2,3,4,5,6,7
  order by 8 desc
  """
    df = load_from_db_cache(spark, query, 'city2ies', from_cache)
    df['university_id'] = df['university_id'].astype(int)

    # NUMBER OF IES PER RELEVANT CITIES
    # Rows arrive from the query ordered by revenue (desc), so the cumulative
    # relevance below accumulates from each city's largest IES downwards.
    df['revenue_city'] = df.groupby(['city', 'state',
                                     'kind'])['revenue'].transform('sum')
    df['relevance_city'] = df['revenue'] / df['revenue_city']
    df['cumrelevance_city'] = df.groupby(['city', 'state',
                                          'kind'])['relevance_city'].cumsum()
    # df[df['city']=='Brasília'].sort_values('revenue',ascending=False)
    df = df.sort_values(['revenue_city', 'revenue'], ascending=False)

    # Keeping only the IES that make up 80% of each city's relevance
    df = df[df['cumrelevance_city'] < .8]

    return df
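
The closing filter keeps, per city, only the universities whose running share of revenue is still below 80%. A toy sketch of this Pareto-style cutoff with made-up numbers:

import pandas as pd

df = pd.DataFrame({
    'city': ['A'] * 4,
    'university_id': [1, 2, 3, 4],
    'revenue': [50.0, 30.0, 15.0, 5.0],
})

# Sort so the cumulative share accumulates from each city's largest IES down.
df = df.sort_values(['city', 'revenue'], ascending=[True, False])
df['relevance_city'] = df['revenue'] / df.groupby('city')['revenue'].transform('sum')
df['cumrelevance_city'] = df.groupby('city')['relevance_city'].cumsum()

# Rows with a cumulative share >= .8 are dropped, as in city2ies above.
df = df[df['cumrelevance_city'] < .8]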