Esempio n. 1
0
    def extract_google_analytics_data(gcp_conn_id: str, start_date: str,
                                      end_date: str, locations: str):
        bq_hook = BigQueryHook(gcp_conn_id=gcp_conn_id)

        cities = ','.join(['"{}"'.format(c["name"]) for c in locations])
        countries = ','.join(['"{}"'.format(c["country"]) for c in locations])
        states = ','.join(['"{}"'.format(c["state"]) for c in locations])

        query = """SELECT 
                        fullVisitorId,
                        date,
                        geoNetwork.city as city,
                        geoNetwork.country as country,
                        geoNetwork.region as region,
                        product.v2ProductCategory as productCategory,
                        product.v2ProductName as productName,
                        hits.eCommerceAction.action_type as action_type,
                        hits.eCommerceAction.step as action_step,
                        product.productQuantity as quantity,
                        product.productPrice as price,
                        product.productRevenue as revenue,
                        product.isImpression as isImpression,
                        hits.transaction.transactionId as transactionId,
                        hits.transaction.transactionRevenue as transactionRevenue,
                        hits.transaction.transactionTax as transactionTax,
                        hits.transaction.transactionShipping as transactionShipping,
                    FROM `bigquery-public-data.google_analytics_sample.ga_sessions_*`,
                        UNNEST(hits) as hits,
                        UNNEST(hits.product) as product
                    WHERE 
                        _TABLE_SUFFIX BETWEEN '{start_date}' AND '{end_date}'
                    AND
                        geoNetwork.country IN ({countries})
                    AND
                        geoNetwork.city IN ({cities})
                    AND 
                        geoNetwork.region IN ({states})
                    ORDER BY 
                        date ASC""".format(start_date=start_date,
                                           end_date=end_date,
                                           countries=countries,
                                           cities=cities,
                                           states=states)

        df = bq_hook.get_pandas_df(sql=query, dialect="standard")

        df["location_name"] = df.apply(lambda row: get_location_string(
            row['country'], row['city'], row['region']),
                                       axis=1)
        df["date"] = pd.to_datetime(df["date"])
        df["price"] = df["price"] / (10**6)
        df["revenue"] = df["revenue"] / (10**6)
        df["transactionRevenue"] = df["transactionRevenue"] / (10**6)
        df["transactionTax"] = df["transactionTax"] / (10**6)
        df["transactionShipping"] = df["transactionShipping"] / (10**6)

        df = df.set_index(["date", "location_name"])
        return df.to_csv(date_format="%Y-%m-%d %H:%M:%S")
Esempio n. 2
0
    def execute(self, context):
        bq = BigQueryHook(gcp_conn_id=self.gcp_conn_id, use_legacy_sql=False)

        total_rows = []

        # self.log.info("{credentials}")
        for table in self.table_list:
            q = self.sql.format(table)
            response = bq.get_pandas_df(q)
            if response.empty:
                response_value = 0
            else:
                response_value = response['f0_'].iloc[0]
            total_rows.append(response_value)

        if sum(total_rows) != self.pass_value:
            logging.info(sum(total_rows))
            logging.info('Tests failed')
            logging.info(total_rows)
            logging.info(self.pass_value)
            raise ValueError('Data quality check failed')