Ejemplo n.º 1
1
 def objects(self):
     where_clause = ''
     if DB_WHITELIST:
         dbs = ",".join(["'" + db + "'" for db in DB_WHITELIST])
         where_clause = "AND b.name IN ({})".format(dbs)
     if DB_BLACKLIST:
         dbs = ",".join(["'" + db + "'" for db in DB_BLACKLIST])
         where_clause = "AND b.name NOT IN ({})".format(dbs)
     sql = """
     SELECT CONCAT(b.NAME, '.', a.TBL_NAME), TBL_TYPE
     FROM TBLS a
     JOIN DBS b ON a.DB_ID = b.DB_ID
     WHERE
         a.TBL_NAME NOT LIKE '%tmp%' AND
         a.TBL_NAME NOT LIKE '%temp%' AND
         b.NAME NOT LIKE '%tmp%' AND
         b.NAME NOT LIKE '%temp%'
     {where_clause}
     LIMIT {LIMIT};
     """.format(where_clause=where_clause, LIMIT=TABLE_SELECTOR_LIMIT)
     h = MySqlHook(METASTORE_MYSQL_CONN_ID)
     d = [
             {'id': row[0], 'text': row[0]}
         for row in h.get_records(sql)]
     return json.dumps(d)
Ejemplo n.º 2
0
def simple_select_mysql(ds, **kwargs):
    mysql_hook = MySqlHook(mysql_conn_id="soletrade_localhost")
    users_trend = """SELECT 
                DISTINCT(email),
                sum(spent),
                count(transactions()
            FROM
            orders AS o
            GROUP BY 1
            ;"""
    with open('temp/output.csv', 'w') as f:
        wr = csv.writer(f, delimiter=',')
        csvRow = ["user", "spent", "transactions"]
        wr.writerow(csvRow)
        for user, spent, transactions in mysql_hook.get_records(users_trend):
            wr.writerow([user, spent, transactions])
Ejemplo n.º 3
0
 def objects(self):
     where_clause = ''
     if DB_WHITELIST:
         dbs = ",".join(["'" + db + "'" for db in DB_WHITELIST])
         where_clause = "AND b.name IN ({})".format(dbs)
     if DB_BLACKLIST:
         dbs = ",".join(["'" + db + "'" for db in DB_BLACKLIST])
         where_clause = "AND b.name NOT IN ({})".format(dbs)
     sql = """
     SELECT CONCAT(b.NAME, '.', a.TBL_NAME), TBL_TYPE
     FROM TBLS a
     JOIN DBS b ON a.DB_ID = b.DB_ID
     WHERE
         a.TBL_NAME NOT LIKE '%tmp%' AND
         a.TBL_NAME NOT LIKE '%temp%' AND
         b.NAME NOT LIKE '%tmp%' AND
         b.NAME NOT LIKE '%temp%'
     {where_clause}
     LIMIT {LIMIT};
     """.format(where_clause=where_clause, LIMIT=TABLE_SELECTOR_LIMIT)
     h = MySqlHook(METASTORE_MYSQL_CONN_ID)
     d = [{'id': row[0], 'text': row[0]} for row in h.get_records(sql)]
     return json.dumps(d)
Ejemplo n.º 4
0
    def execute(self, context=None):
        metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
        table = metastore.get_table(table_name=self.table)
        field_types = {col.name: col.type for col in table.sd.cols}

        exprs = {
            ('', 'count'): 'COUNT(*)'
        }
        for col, col_type in field_types.items():
            d = {}
            if self.assignment_func:
                d = self.assignment_func(col, col_type)
                if d is None:
                    d = self.get_default_exprs(col, col_type)
            else:
                d = self.get_default_exprs(col, col_type)
            exprs.update(d)
        exprs.update(self.extra_exprs)
        exprs = OrderedDict(exprs)
        exprs_str = ",\n        ".join([
            v + " AS " + k[0] + '__' + k[1]
            for k, v in exprs.items()])

        where_clause = [
            "{0} = '{1}'".format(k, v) for k, v in self.partition.items()]
        where_clause = " AND\n        ".join(where_clause)
        sql = """
        SELECT
            {exprs_str}
        FROM {self.table}
        WHERE
            {where_clause};
        """.format(**locals())

        hook = PrestoHook(presto_conn_id=self.presto_conn_id)
        logging.info('Executing SQL check: ' + sql)
        row = hook.get_first(hql=sql)
        logging.info("Record: " + str(row))
        if not row:
            raise Exception("The query returned None")

        part_json = json.dumps(self.partition, sort_keys=True)

        logging.info("Deleting rows from previous runs if they exist")
        mysql = MySqlHook(self.mysql_conn_id)
        sql = """
        SELECT 1 FROM hive_stats
        WHERE
            table_name='{self.table}' AND
            partition_repr='{part_json}' AND
            dttm='{self.dttm}'
        LIMIT 1;
        """.format(**locals())
        if mysql.get_records(sql):
            sql = """
            DELETE FROM hive_stats
            WHERE
                table_name='{self.table}' AND
                partition_repr='{part_json}' AND
                dttm='{self.dttm}';
            """.format(**locals())
            mysql.run(sql)

        logging.info("Pivoting and loading cells into the Airflow db")
        rows = [
            (self.ds, self.dttm, self.table, part_json) +
            (r[0][0], r[0][1], r[1])
            for r in zip(exprs, row)]
        mysql.insert_rows(
            table='hive_stats',
            rows=rows,
            target_fields=[
                'ds',
                'dttm',
                'table_name',
                'partition_repr',
                'col',
                'metric',
                'value',
            ]
        )
Ejemplo n.º 5
0
    def execute(self, context=None):
        metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
        table = metastore.get_table(table_name=self.table)
        field_types = {col.name: col.type for col in table.sd.cols}

        exprs = {('', 'count'): 'COUNT(*)'}
        for col, col_type in field_types.items():
            d = {}
            if self.assignment_func:
                d = self.assignment_func(col, col_type)
                if d is None:
                    d = self.get_default_exprs(col, col_type)
            else:
                d = self.get_default_exprs(col, col_type)
            exprs.update(d)
        exprs.update(self.extra_exprs)
        exprs = OrderedDict(exprs)
        exprs_str = ",\n        ".join(
            [v + " AS " + k[0] + '__' + k[1] for k, v in exprs.items()])

        where_clause = [
            "{0} = '{1}'".format(k, v) for k, v in self.partition.items()
        ]
        where_clause = " AND\n        ".join(where_clause)
        sql = """
        SELECT
            {exprs_str}
        FROM {self.table}
        WHERE
            {where_clause};
        """.format(**locals())

        hook = PrestoHook(presto_conn_id=self.presto_conn_id)
        logging.info('Executing SQL check: ' + sql)
        row = hook.get_first(hql=sql)
        logging.info("Record: " + str(row))
        if not row:
            raise AirflowException("The query returned None")

        part_json = json.dumps(self.partition, sort_keys=True)

        logging.info("Deleting rows from previous runs if they exist")
        mysql = MySqlHook(self.mysql_conn_id)
        sql = """
        SELECT 1 FROM hive_stats
        WHERE
            table_name='{self.table}' AND
            partition_repr='{part_json}' AND
            dttm='{self.dttm}'
        LIMIT 1;
        """.format(**locals())
        if mysql.get_records(sql):
            sql = """
            DELETE FROM hive_stats
            WHERE
                table_name='{self.table}' AND
                partition_repr='{part_json}' AND
                dttm='{self.dttm}';
            """.format(**locals())
            mysql.run(sql)

        logging.info("Pivoting and loading cells into the Airflow db")
        rows = [(self.ds, self.dttm, self.table, part_json) +
                (r[0][0], r[0][1], r[1]) for r in zip(exprs, row)]
        mysql.insert_rows(table='hive_stats',
                          rows=rows,
                          target_fields=[
                              'ds',
                              'dttm',
                              'table_name',
                              'partition_repr',
                              'col',
                              'metric',
                              'value',
                          ])