def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)

    logging.info("Dumping MySQL query results to local file")
    conn = mysql.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    with NamedTemporaryFile("w") as f:
        csv_writer = csv.writer(f, delimiter=self.delimiter)
        field_dict = OrderedDict()
        for field in cursor.description:
            field_dict[field[0]] = self.type_map(field[1])
        csv_writer.writerows(cursor)
        f.flush()
        cursor.close()
        conn.close()
        logging.info("Loading file into Hive")
        hive.load_file(
            f.name,
            self.hive_table,
            field_dict=field_dict,
            create=self.create,
            partition=self.partition,
            delimiter=self.delimiter,
            recreate=self.recreate)
def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    mysql = MySqlHook(mysql_conn_id=self.mysql_conn_id)

    logging.info("Dumping MySQL query results to local file")
    conn = mysql.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    with NamedTemporaryFile("wb") as f:
        # The encoding kwarg implies csv here is the unicodecsv package;
        # the stdlib csv module accepts no encoding parameter.
        csv_writer = csv.writer(f, delimiter=self.delimiter, encoding="utf-8")
        field_dict = OrderedDict()
        for field in cursor.description:
            field_dict[field[0]] = self.type_map(field[1])
        csv_writer.writerows(cursor)
        f.flush()
        cursor.close()
        conn.close()
        logging.info("Loading file into Hive")
        hive.load_file(
            f.name,
            self.hive_table,
            field_dict=field_dict,
            create=self.create,
            partition=self.partition,
            delimiter=self.delimiter,
            recreate=self.recreate)
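# Both variants above call self.type_map(field[1]) to translate the MySQL
# driver's column type codes into Hive types. That method is not shown in
# this section; the sketch below mirrors the classmethod upstream Airflow
# defines on the transfer operator for the MySQL case. The Vertica and
# MSSQL transfers follow the same dict-lookup pattern with their own
# drivers' type codes.
from MySQLdb.constants import FIELD_TYPE


@classmethod
def type_map(cls, mysql_type):
    # Map MySQLdb field-type codes to Hive column types; anything
    # unmapped degrades gracefully to STRING.
    d = {
        FIELD_TYPE.BIT: 'INT',
        FIELD_TYPE.DECIMAL: 'DOUBLE',
        FIELD_TYPE.DOUBLE: 'DOUBLE',
        FIELD_TYPE.FLOAT: 'DOUBLE',
        FIELD_TYPE.INT24: 'INT',
        FIELD_TYPE.LONG: 'INT',
        FIELD_TYPE.LONGLONG: 'BIGINT',
        FIELD_TYPE.SHORT: 'INT',
        FIELD_TYPE.TINY: 'INT',
        FIELD_TYPE.YEAR: 'INT',
    }
    return d[mysql_type] if mysql_type in d else 'STRING'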
def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    logging.info("Extracting data from Hive")
    hive_table = 'druid.' + context['task_instance_key_str']
    sql = self.sql.strip().strip(';')
    hql = """\
set mapred.output.compress=false;
set hive.exec.compress.output=false;
DROP TABLE IF EXISTS {hive_table};
CREATE TABLE {hive_table}
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE AS
{sql};
""".format(**locals())
    hive.run_cli(hql)

    m = HiveMetastoreHook(self.metastore_conn_id)
    t = m.get_table(hive_table)

    columns = [col.name for col in t.sd.cols]

    hdfs_uri = t.sd.location
    pos = hdfs_uri.find('/user')
    static_path = hdfs_uri[pos:]

    druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
    logging.info("Inserting rows into Druid")
    druid.load_from_hdfs(
        datasource=self.druid_datasource,
        intervals=self.intervals,
        static_path=static_path,
        ts_dim=self.ts_dim,
        columns=columns,
        metric_spec=self.metric_spec)
    logging.info("Load seems to have succeeded!")
def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    vertica = VerticaHook(vertica_conn_id=self.vertica_conn_id)

    logging.info("Dumping Vertica query results to local file")
    conn = vertica.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    with NamedTemporaryFile("w") as f:
        csv_writer = csv.writer(f, delimiter=self.delimiter, encoding='utf-8')
        field_dict = OrderedDict()
        col_count = 0
        for field in cursor.description:
            col_count += 1
            col_position = "Column{position}".format(position=col_count)
            # Vertica can return unnamed columns; fall back to a
            # positional name for those.
            field_dict[col_position if field[0] == '' else field[0]] = \
                self.type_map(field[1])
        csv_writer.writerows(cursor.iterate())
        f.flush()
        cursor.close()
        conn.close()
        logging.info("Loading file into Hive")
        hive.load_file(
            f.name,
            self.hive_table,
            field_dict=field_dict,
            create=self.create,
            partition=self.partition,
            delimiter=self.delimiter,
            recreate=self.recreate)
def execute(self, context):
    self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
    logging.info("Downloading S3 file")
    if self.wildcard_match:
        if not self.s3.check_for_wildcard_key(self.s3_key):
            raise AirflowException("No key matches {0}".format(self.s3_key))
        s3_key_object = self.s3.get_wildcard_key(self.s3_key)
    else:
        if not self.s3.check_for_key(self.s3_key):
            raise AirflowException(
                "The key {0} does not exist".format(self.s3_key))
        s3_key_object = self.s3.get_key(self.s3_key)
    with NamedTemporaryFile("w") as f:
        logging.info("Dumping S3 key {0} contents to local"
                     " file {1}".format(s3_key_object.key, f.name))
        s3_key_object.get_contents_to_file(f)
        f.flush()
        self.s3.connection.close()
        if not self.headers:
            logging.info("Loading file into Hive")
            self.hive.load_file(
                f.name,
                self.hive_table,
                field_dict=self.field_dict,
                create=self.create,
                partition=self.partition,
                delimiter=self.delimiter,
                recreate=self.recreate)
        else:
            with open(f.name, 'r') as tmpf:
                if self.check_headers:
                    header_l = tmpf.readline()
                    header_line = header_l.rstrip()
                    header_list = header_line.split(self.delimiter)
                    field_names = list(self.field_dict.keys())
                    test_field_match = [h1.lower() == h2.lower()
                                        for h1, h2
                                        in zip(header_list, field_names)]
                    if not all(test_field_match):
                        logging.warning("Headers do not match field names\n"
                                        "File headers:\n {header_list}\n"
                                        "Field names:\n {field_names}\n"
                                        "".format(**locals()))
                        raise AirflowException("Headers do not match the "
                                               "field_dict keys")
                with NamedTemporaryFile("w") as f_no_headers:
                    tmpf.seek(0)
                    next(tmpf)  # skip the header line
                    for line in tmpf:
                        f_no_headers.write(line)
                    f_no_headers.flush()
                    logging.info("Loading file without headers into Hive")
                    self.hive.load_file(
                        f_no_headers.name,
                        self.hive_table,
                        field_dict=self.field_dict,
                        create=self.create,
                        partition=self.partition,
                        delimiter=self.delimiter,
                        recreate=self.recreate)
def __init__(
        self,
        s3_key,
        field_dict,
        hive_table,
        delimiter=',',
        create=True,
        recreate=False,
        partition=None,
        headers=False,
        check_headers=False,
        s3_conn_id='s3_default',
        hive_cli_conn_id='hive_cli_default',
        *args, **kwargs):
    super(S3ToHiveTransfer, self).__init__(*args, **kwargs)
    self.s3_key = s3_key
    self.field_dict = field_dict
    self.hive_table = hive_table
    self.delimiter = delimiter
    self.create = create
    self.recreate = recreate
    self.partition = partition
    self.headers = headers
    self.check_headers = check_headers
    self.hive = HiveCliHook(hive_cli_conn_id=hive_cli_conn_id)
    self.s3 = S3Hook(s3_conn_id=s3_conn_id)
def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    mssql = MsSqlHook(mssql_conn_id=self.mssql_conn_id)

    logging.info("Dumping Microsoft SQL Server query results to local file")
    conn = mssql.get_conn()
    cursor = conn.cursor()
    cursor.execute(self.sql)
    with NamedTemporaryFile("w") as f:
        csv_writer = csv.writer(f, delimiter=self.delimiter, encoding="utf-8")
        field_dict = OrderedDict()
        col_count = 0
        for field in cursor.description:
            col_count += 1
            col_position = "Column{position}".format(position=col_count)
            # SQL Server can return unnamed (computed) columns; use a
            # positional name for those.
            field_dict[col_position if field[0] == "" else field[0]] = \
                self.type_map(field[1])
        csv_writer.writerows(cursor)
        f.flush()
        cursor.close()
        conn.close()
        logging.info("Loading file into Hive")
        hive.load_file(
            f.name,
            self.hive_table,
            field_dict=field_dict,
            create=self.create,
            partition=self.partition,
            delimiter=self.delimiter,
            recreate=self.recreate,
        )
def execute(self, context):
    hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
    logging.info("Extracting data from Hive")
    hive_table = 'druid.' + context['task_instance_key_str']
    sql = self.sql.strip().strip(';')
    hql = """\
set mapred.output.compress=false;
set hive.exec.compress.output=false;
DROP TABLE IF EXISTS {hive_table};
CREATE TABLE {hive_table}
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
TBLPROPERTIES ('serialization.null.format' = '')
AS
{sql}
""".format(**locals())
    hive.run_cli(hql)

    m = HiveMetastoreHook(self.metastore_conn_id)
    t = m.get_table(hive_table)

    columns = [col.name for col in t.sd.cols]

    hdfs_uri = t.sd.location
    pos = hdfs_uri.find('/user')
    static_path = hdfs_uri[pos:]

    schema, table = hive_table.split('.')

    druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
    logging.info("Inserting rows into Druid")
    logging.info("HDFS path: " + static_path)

    druid.load_from_hdfs(
        datasource=self.druid_datasource,
        intervals=self.intervals,
        static_path=static_path,
        ts_dim=self.ts_dim,
        columns=columns,
        metric_spec=self.metric_spec,
        hadoop_dependency_coordinates=self.hadoop_dependency_coordinates)
    logging.info("Load seems to have succeeded!")

    logging.info("Cleaning up by dropping the temp "
                 "Hive table {}".format(hive_table))
    hql = "DROP TABLE IF EXISTS {}".format(hive_table)
    hive.run_cli(hql)  # without this call the temp table was never dropped
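# A hypothetical instantiation of the transfer above, assuming the enclosing
# class is Airflow's HiveToDruidTransfer, to show how the attributes it reads
# (sql, druid_datasource, ts_dim, intervals, metric_spec,
# hadoop_dependency_coordinates) fit together. The task id, connection ids,
# table names, and metric_spec contents are illustrative, not taken from this
# section; the metric entries follow Druid's metricsSpec JSON shape.
hive_to_druid = HiveToDruidTransfer(
    task_id='hive_to_druid_example',
    sql="SELECT ds, country, revenue FROM fct.revenue",
    druid_datasource='revenue_by_country',
    ts_dim='ds',  # the column Druid treats as the event timestamp
    intervals=['2015-01-01/2015-01-02'],
    metric_spec=[
        {"type": "count", "name": "count"},
        {"type": "doubleSum", "name": "revenue", "fieldName": "revenue"},
    ],
    hadoop_dependency_coordinates=None,
    dag=dag,
)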
def ddl(self):
    # The table name comes straight from the request's query string and is
    # interpolated into the HQL unescaped.
    table = request.args.get("table")
    sql = "SHOW CREATE TABLE {table};".format(table=table)
    h = HiveCliHook(HIVE_CLI_CONN_ID)
    return h.run_cli(sql)
class S3ToHiveTransfer(BaseOperator):
    """
    Moves data from S3 to Hive. The operator downloads a file from S3,
    stores the file locally before loading it into a Hive table. If the
    ``create`` or ``recreate`` arguments are set to ``True``,
    ``CREATE TABLE`` and ``DROP TABLE`` statements are generated.
    Hive data types are inferred from the cursor's metadata.
    Note that the table generated in Hive uses ``STORED AS textfile``
    which isn't the most efficient serialization format. If a
    large amount of data is loaded and/or if the table gets
    queried considerably, you may want to use this operator only to
    stage the data into a temporary table before loading it into its
    final destination using a ``HiveOperator``.

    :param s3_key: The key to be retrieved from S3
    :type s3_key: str
    :param field_dict: A dictionary of the fields name in the file
        as keys and their Hive types as values
    :type field_dict: dict
    :param hive_table: target Hive table, use dot notation to target a
        specific database
    :type hive_table: str
    :param create: whether to create the table if it doesn't exist
    :type create: bool
    :param recreate: whether to drop and recreate the table at every
        execution
    :type recreate: bool
    :param partition: target partition as a dict of partition columns
        and values
    :type partition: dict
    :param headers: whether the file contains column names on the first
        line
    :type headers: bool
    :param check_headers: whether the column names on the first line should be
        checked against the keys of field_dict
    :type check_headers: bool
    :param delimiter: field delimiter in the file
    :type delimiter: str
    :param s3_conn_id: source s3 connection
    :type s3_conn_id: str
    :param hive_cli_conn_id: destination hive connection
    :type hive_cli_conn_id: str
    """

    __mapper_args__ = {'polymorphic_identity': 'S3ToHiveOperator'}
    template_fields = ('s3_key', 'partition', 'hive_table')
    template_ext = ()
    ui_color = '#a0e08c'

    @apply_defaults
    def __init__(
            self,
            s3_key,
            field_dict,
            hive_table,
            delimiter=',',
            create=True,
            recreate=False,
            partition=None,
            headers=False,
            check_headers=False,
            s3_conn_id='s3_default',
            hive_cli_conn_id='hive_cli_default',
            *args, **kwargs):
        super(S3ToHiveTransfer, self).__init__(*args, **kwargs)
        self.s3_key = s3_key
        self.field_dict = field_dict
        self.hive_table = hive_table
        self.delimiter = delimiter
        self.create = create
        self.recreate = recreate
        self.partition = partition
        self.headers = headers
        self.check_headers = check_headers
        self.hive = HiveCliHook(hive_cli_conn_id=hive_cli_conn_id)
        self.s3 = S3Hook(s3_conn_id=s3_conn_id)

    def execute(self, context):
        logging.info("Downloading S3 file")
        if not self.s3.check_for_key(self.s3_key):
            raise Exception("The key {0} does not exist".format(self.s3_key))
        s3_key_object = self.s3.get_key(self.s3_key)
        with NamedTemporaryFile("w") as f:
            logging.info("Dumping S3 file {0} contents to local"
                         " file {1}".format(self.s3_key, f.name))
            s3_key_object.get_contents_to_file(f)
            f.flush()
            self.s3.connection.close()
            if not self.headers:
                logging.info("Loading file into Hive")
                self.hive.load_file(
                    f.name,
                    self.hive_table,
                    field_dict=self.field_dict,
                    create=self.create,
                    partition=self.partition,
                    delimiter=self.delimiter,
                    recreate=self.recreate)
            else:
                with open(f.name, 'r') as tmpf:
                    if self.check_headers:
                        header_l = tmpf.readline()
                        header_line = header_l.rstrip()
                        header_list = header_line.split(self.delimiter)
                        field_names = list(self.field_dict.keys())
                        test_field_match = [
                            h1.lower() == h2.lower()
                            for h1, h2 in zip(header_list, field_names)
                        ]
                        if not all(test_field_match):
                            logging.warning("Headers do not match field names\n"
                                            "File headers:\n {header_list}\n"
                                            "Field names:\n {field_names}\n"
                                            "".format(**locals()))
                            raise Exception("Headers do not match the "
                                            "field_dict keys")
                    with NamedTemporaryFile("w") as f_no_headers:
                        tmpf.seek(0)
                        next(tmpf)  # skip the header line
                        for line in tmpf:
                            f_no_headers.write(line)
                        f_no_headers.flush()
                        logging.info("Loading file without headers into Hive")
                        self.hive.load_file(
                            f_no_headers.name,
                            self.hive_table,
                            field_dict=self.field_dict,
                            create=self.create,
                            partition=self.partition,
                            delimiter=self.delimiter,
                            recreate=self.recreate)
class S3ToHiveTransfer(BaseOperator):
    """
    Moves data from S3 to Hive. The operator downloads a file from S3,
    stores the file locally before loading it into a Hive table. If the
    ``create`` or ``recreate`` arguments are set to ``True``,
    ``CREATE TABLE`` and ``DROP TABLE`` statements are generated.
    Hive data types are inferred from the cursor's metadata.
    Note that the table generated in Hive uses ``STORED AS textfile``
    which isn't the most efficient serialization format. If a
    large amount of data is loaded and/or if the table gets
    queried considerably, you may want to use this operator only to
    stage the data into a temporary table before loading it into its
    final destination using a ``HiveOperator``.

    :param s3_key: The key to be retrieved from S3
    :type s3_key: str
    :param field_dict: A dictionary of the fields name in the file
        as keys and their Hive types as values
    :type field_dict: dict
    :param hive_table: target Hive table, use dot notation to target a
        specific database
    :type hive_table: str
    :param create: whether to create the table if it doesn't exist
    :type create: bool
    :param recreate: whether to drop and recreate the table at every
        execution
    :type recreate: bool
    :param partition: target partition as a dict of partition columns
        and values
    :type partition: dict
    :param headers: whether the file contains column names on the first
        line
    :type headers: bool
    :param check_headers: whether the column names on the first line should be
        checked against the keys of field_dict
    :type check_headers: bool
    :param wildcard_match: whether the s3_key should be interpreted as a Unix
        wildcard pattern
    :type wildcard_match: bool
    :param delimiter: field delimiter in the file
    :type delimiter: str
    :param s3_conn_id: source s3 connection
    :type s3_conn_id: str
    :param hive_cli_conn_id: destination hive connection
    :type hive_cli_conn_id: str
    """

    template_fields = ('s3_key', 'partition', 'hive_table')
    template_ext = ()
    ui_color = '#a0e08c'

    @apply_defaults
    def __init__(
            self,
            s3_key,
            field_dict,
            hive_table,
            delimiter=',',
            create=True,
            recreate=False,
            partition=None,
            headers=False,
            check_headers=False,
            wildcard_match=False,
            s3_conn_id='s3_default',
            hive_cli_conn_id='hive_cli_default',
            *args, **kwargs):
        super(S3ToHiveTransfer, self).__init__(*args, **kwargs)
        self.s3_key = s3_key
        self.field_dict = field_dict
        self.hive_table = hive_table
        self.delimiter = delimiter
        self.create = create
        self.recreate = recreate
        self.partition = partition
        self.headers = headers
        self.check_headers = check_headers
        self.wildcard_match = wildcard_match
        self.hive_cli_conn_id = hive_cli_conn_id
        self.s3_conn_id = s3_conn_id

    def execute(self, context):
        # Hooks are created here rather than in __init__ so that no
        # connections are opened at DAG-parse time.
        self.hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        self.s3 = S3Hook(s3_conn_id=self.s3_conn_id)
        logging.info("Downloading S3 file")
        if self.wildcard_match:
            if not self.s3.check_for_wildcard_key(self.s3_key):
                raise AirflowException(
                    "No key matches {0}".format(self.s3_key))
            s3_key_object = self.s3.get_wildcard_key(self.s3_key)
        else:
            if not self.s3.check_for_key(self.s3_key):
                raise AirflowException(
                    "The key {0} does not exist".format(self.s3_key))
            s3_key_object = self.s3.get_key(self.s3_key)
        with NamedTemporaryFile("w") as f:
            logging.info("Dumping S3 key {0} contents to local"
                         " file {1}".format(s3_key_object.key, f.name))
            s3_key_object.get_contents_to_file(f)
            f.flush()
            self.s3.connection.close()
            if not self.headers:
                logging.info("Loading file into Hive")
                self.hive.load_file(
                    f.name,
                    self.hive_table,
                    field_dict=self.field_dict,
                    create=self.create,
                    partition=self.partition,
                    delimiter=self.delimiter,
                    recreate=self.recreate)
            else:
                with open(f.name, 'r') as tmpf:
                    if self.check_headers:
                        header_l = tmpf.readline()
                        header_line = header_l.rstrip()
                        header_list = header_line.split(self.delimiter)
                        field_names = list(self.field_dict.keys())
                        test_field_match = [h1.lower() == h2.lower()
                                            for h1, h2
                                            in zip(header_list, field_names)]
                        if not all(test_field_match):
                            logging.warning("Headers do not match field names\n"
                                            "File headers:\n {header_list}\n"
                                            "Field names:\n {field_names}\n"
                                            "".format(**locals()))
                            raise AirflowException("Headers do not match the "
                                                   "field_dict keys")
                    with NamedTemporaryFile("w") as f_no_headers:
                        tmpf.seek(0)
                        next(tmpf)  # skip the header line
                        for line in tmpf:
                            f_no_headers.write(line)
                        f_no_headers.flush()
                        logging.info("Loading file without headers into Hive")
                        self.hive.load_file(
                            f_no_headers.name,
                            self.hive_table,
                            field_dict=self.field_dict,
                            create=self.create,
                            partition=self.partition,
                            delimiter=self.delimiter,
                            recreate=self.recreate)
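# A hypothetical DAG snippet instantiating the operator above. The bucket,
# key, field names, partition value, and dag object are illustrative only;
# the parameters themselves come from the class signature.
s3_to_hive = S3ToHiveTransfer(
    task_id='s3_to_hive_example',
    s3_key='s3://example-bucket/exports/events_{{ ds }}.csv',
    field_dict=OrderedDict([('event_id', 'BIGINT'),
                            ('event_ts', 'STRING'),
                            ('payload', 'STRING')]),
    hive_table='staging.events',
    partition={'ds': '{{ ds }}'},
    headers=True,
    check_headers=True,
    delimiter=',',
    s3_conn_id='s3_default',
    hive_cli_conn_id='hive_cli_default',
    dag=dag,
)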
def get_hook(self):
    return HiveCliHook(
        hive_cli_conn_id=self.hive_cli_conn_id,
        run_as=self.run_as)
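# Hypothetical direct use of the hook returned above; run_cli is the same
# entry point the ddl() view and the Hive-to-Druid transfer in this section
# rely on. The connection id and run_as user are illustrative.
hook = HiveCliHook(hive_cli_conn_id='hive_cli_default', run_as='airflow')
print(hook.run_cli("SHOW TABLES;"))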