def make_create_table_sql(self, table):
  sql = super(ImpalaDbConnection, self).make_create_table_sql(table)
  if not self._bulk_load_table:
    return sql

  hdfs_url_base = get_default_hdfs_config().get('fs.defaultFS')
  sql += "\nLOCATION '%s%s'" % (hdfs_url_base, dirname(self.hdfs_file_path))
  if self._bulk_load_table.storage_format.upper() != 'TEXTFILE':
    sql += "\nSTORED AS " + table.storage_format
    if table.storage_format.upper() == 'AVRO':
      # Build an Avro record schema from the table's columns; every field is nullable.
      avro_schema = {'name': 'my_record', 'type': 'record', 'fields': []}
      for col in table.cols:
        if issubclass(col.type, Int):
          avro_type = 'int'
        else:
          avro_type = col.type.__name__.lower()
        avro_schema['fields'].append({'name': col.name, 'type': ['null', avro_type]})
      json_avro_schema = dumps(avro_schema)
      # The Hive metastore limits how much schema it can store inline. Beyond this
      # limit, the schema must be stored in HDFS and Hive is given a URL to it
      # instead.
      if len(json_avro_schema) > 4000:
        avro_schema_path = '%s/%s.avro' % (self.hdfs_db_dir, table.name)
        avro_schema_url = '%s%s' % (hdfs_url_base, avro_schema_path)
        hdfs = create_default_hdfs_client()
        hdfs.create_file(avro_schema_path, json_avro_schema, overwrite=True)
        sql += "\nTBLPROPERTIES ('avro.schema.url' = '%s')" % avro_schema_url
      else:
        sql += "\nTBLPROPERTIES ('avro.schema.literal' = '%s')" % json_avro_schema
  return sql
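
# Illustrative only: for a hypothetical Avro table "t1" with a single INT column "c1",
# an fs.defaultFS of "hdfs://localhost:20500", and an HDFS data directory of
# "/test_db/t1", the method above would append clauses roughly like the following to
# whatever CREATE TABLE text the parent class produced:
#
#   LOCATION 'hdfs://localhost:20500/test_db/t1'
#   STORED AS avro
#   TBLPROPERTIES ('avro.schema.literal' = '{"name": "my_record", "type": "record",
#       "fields": [{"name": "c1", "type": ["null", "int"]}]}')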

def end_bulk_load_table(self):
  super(ImpalaDbConnection, self).end_bulk_load_table()
  hdfs = create_default_hdfs_client()
  pywebhdfs_dirname = dirname(self.hdfs_file_path).lstrip('/')
  hdfs.make_dir(pywebhdfs_dirname)
  pywebhdfs_file_path = pywebhdfs_dirname + '/' + basename(self.hdfs_file_path)
  try:
    # TODO: Only delete the file if it exists
    hdfs.delete_file_dir(pywebhdfs_file_path)
  except Exception as e:
    LOG.debug(e)
  with open(self._bulk_load_data_file.name) as readable_file:
    hdfs.create_file(pywebhdfs_file_path, readable_file)
  self._bulk_load_data_file.close()
  self.execute("INVALIDATE METADATA %s" % self._bulk_load_table.name)
  if self._bulk_load_non_text_table:
    # The data was loaded into a staging text table; use Hive to copy it into a table
    # with the requested storage format, then drop the staging table.
    self.hive_connection.execute(
        'CREATE TABLE %s AS SELECT * FROM %s'
        % (self._bulk_load_non_text_table.name, self._bulk_load_table.name))
    self.drop_table(self._bulk_load_table.name)
    self.execute("INVALIDATE METADATA %s" % self._bulk_load_non_text_table.name)
  self._bulk_load_data_file = None

def end_bulk_load_table(self, create_tables):
  DbConnection.end_bulk_load_table(self, create_tables)
  if self.hdfs_host is None:
    hdfs = create_default_hdfs_client()
  else:
    hdfs = get_hdfs_client(self.hdfs_host, self.hdfs_port, user_name='hdfs')
  pywebhdfs_dirname = dirname(self.hdfs_file_path).lstrip('/')
  hdfs.make_dir(pywebhdfs_dirname)
  pywebhdfs_file_path = pywebhdfs_dirname + '/' + basename(self.hdfs_file_path)
  try:
    # TODO: Only delete the file if it exists
    hdfs.delete_file_dir(pywebhdfs_file_path)
  except Exception as e:
    LOG.debug(e)
  with open(self._bulk_load_data_file.name) as readable_file:
    hdfs.create_file(pywebhdfs_file_path, readable_file)
  self._bulk_load_data_file.close()
  if self._bulk_load_non_text_table:
    if create_tables:
      self.create_table(self._bulk_load_non_text_table)
    self.execute(
        'INSERT INTO TABLE %s SELECT * FROM %s'
        % (self._bulk_load_non_text_table.name, self._bulk_load_table.name))
    self.drop_table(self._bulk_load_table.name)
  self._bulk_load_data_file = None