def get_booker_cc(self):
    """Build the booker-country mapping for an account.

    Joins the campaign id/name correspondence from MySQL
    (mrktctl.account_1Bid_Campaign) with the pos/campaign-id
    correspondence from Spark (spmeta.account_1_cc_campaign).

    Returns:
        pandas.DataFrame with one row per (campaign_id, pos) pair,
        carrying campaign_id, campaign_name and pos (booker country).
    """
    # Campaign id -> campaign name correspondence from MySQL.
    campaign_query = """
        SELECT id AS campaign_id, name AS campaign_name
        FROM mrktctl.account_1Bid_Campaign
    """
    msql = mu.getConnSql()
    try:
        # try/finally so the connection is closed even when the query
        # raises (previously it leaked on failure).
        msql.execute("USE mrktctl")
        campaigns = pd.read_sql_query(campaign_query, con=msql)
    finally:
        msql.close()

    # pos <-> campaign id correspondence: in account_1 a campaign can
    # cover one pos or multiple pos.
    cid = spark.table("spmeta.account_1_cc_campaign").withColumnRenamed(
        "cc", "pos")
    pd_cid = cid.toPandas()
    # NOTE(review): TR is force-mapped to campaign 105 — looks like a
    # deliberate data override; confirm with the table owner.
    pd_cid.loc[pd_cid.pos == "TR", "campaign_id"] = 105

    # Merge explicitly on campaign_id instead of the implicit
    # common-column merge, which silently changes keys if schemas drift.
    booker_cc_df = campaigns.merge(pd_cid, on="campaign_id")
    return booker_cc_df
def get_id_pos(self):
    """Get the campaign id / pos correspondence from MySQL and restrict
    it to the configured point-of-sale list.

    Returns:
        Spark DataFrame with columns (campaign_id, pos); unfiltered when
        self.pos is the ['All'] sentinel, otherwise inner-joined against
        the requested pos values.
    """
    # Campaign id and campaign name (used as pos) from MySQL.
    qs = """
        SELECT id campaign_id, name pos
        FROM account_1Bid_Campaign
    """
    msql = mu.getConnSql()
    try:
        # try/finally so the connection is closed even when the query
        # raises (previously it leaked on failure).
        msql.execute('USE mrktctl')
        account_1_campaign = pd.read_sql_query(qs, con=msql)
    finally:
        msql.close()
    account_1_campaign = spark.createDataFrame(account_1_campaign)

    # ['All'] is the sentinel meaning "no pos filtering".
    if self.pos == ['All']:
        return account_1_campaign
    filtered_pos = spark.createDataFrame(
        pd.DataFrame(data=self.pos, columns=["pos"]))
    return account_1_campaign.join(filtered_pos, on="pos", how="inner")
def get_cancellations(self):
    """Get cancellation data from MySQL, aggregate it to the desired
    dimension, and filter for the selected points of sale.

    Returns:
        Spark DataFrame with cancelled_commission, cancelled_roomnights
        and cancellations, grouped by self.agg_on; pos-filtered unless
        self.pos is the ['All'] sentinel.
    """
    # NOTE(review): table names and dates are interpolated with
    # str.format — they come from instance configuration, not user
    # input, and table names cannot be bound as SQL parameters anyway;
    # confirm the configuration source is trusted.
    cancellation_query = """
        SELECT r.date_cancelled yyyy_mm_dd
             , a.distribution pos
             , CAST(coalesce(r.dest_id, r.hotel_id) AS INT) hotel_id
             , CAST(sum(1) AS INT) cancellations
             , sum(commission_amount_euro) cancelled_commission
             , CAST(sum(roomnights) AS INT) cancelled_roomnights
        FROM {reservation_table} r force index (cancel)
        JOIN {affiliate_table} a on (a.affiliate_id = r.affiliate_id)
        WHERE r.date_cancelled >= '{start_date}'
          AND r.date_cancelled < '{end_date}'
          AND r.status not in ('fraudulent','test','unknown')
          AND r.partner_id = {account_4_partner_id}
        GROUP BY yyyy_mm_dd, pos, coalesce(r.dest_id, r.hotel_id)
    """.format(reservation_table=self.reservation_table,
               affiliate_table=self.affiliate_table,
               start_date=self.start_date,
               end_date=self.end_date,
               account_4_partner_id=self.partner_id)

    msql = mu.getConnSql()
    try:
        # try/finally so the connection is closed even when the query
        # raises (previously it leaked on failure).
        msql.execute('USE ppc_sp')
        cancellations = pd.read_sql_query(cancellation_query, con=msql)
    finally:
        msql.close()

    cancellations = spark.createDataFrame(cancellations)
    # Re-aggregate to the caller-chosen dimension (self.agg_on), since
    # the SQL groups at day/pos/hotel granularity.
    cancellations_agg = (
        cancellations.groupBy(*self.agg_on)
        .agg(f.sum("cancelled_commission").alias("cancelled_commission"),
             f.sum("cancelled_roomnights").alias("cancelled_roomnights"),
             f.sum("cancellations").alias("cancellations")))

    # ['All'] is the sentinel meaning "no pos filtering".
    if self.pos == ['All']:
        return cancellations_agg
    filtered_pos = spark.createDataFrame(
        pd.DataFrame(data=self.pos, columns=["pos"]))
    return cancellations_agg.join(filtered_pos, on="pos", how="inner")