def make_create_table_sql(self, table):
    sql = super(ImpalaDbConnection, self).make_create_table_sql(table)
    if not self._bulk_load_table:
      return sql

    # Point the table at the HDFS directory the bulk load data file will be uploaded to.
    hdfs_url_base = get_default_hdfs_config().get('fs.defaultFS')
    sql += "\nLOCATION '%s%s'" % (hdfs_url_base, dirname(self.hdfs_file_path))
    # TEXTFILE is the default storage format, so only non-text tables need STORED AS.
    if self._bulk_load_table.storage_format.upper() != 'TEXTFILE':
      sql += "\nSTORED AS " + table.storage_format

    if table.storage_format == 'avro':
      # Build an Avro record schema with one nullable (union with null) field per column.
      avro_schema = {'name': 'my_record', 'type': 'record', 'fields': []}
      for col in table.cols:
        if issubclass(col.type, Int):
          # All integer column types map to Avro's 'int'.
          avro_type = 'int'
        else:
          avro_type = col.type.__name__.lower()
        avro_schema['fields'].append({'name': col.name, 'type': ['null', avro_type]})
      json_avro_schema = dumps(avro_schema)
      # The Hive metastore has a limit to the amount of schema it can store inline.
      # Beyond this limit, the schema needs to be stored in HDFS and Hive is given a
      # URL instead.
      if len(json_avro_schema) > 4000:
        avro_schema_path = '%s/%s.avro' % (self.hdfs_db_dir, table.name)
        # Hive reads the schema from this fully qualified HDFS URL.
        avro_schema_url = hdfs_url_base + avro_schema_path
        hdfs = create_default_hdfs_client()
        hdfs.create_file(avro_schema_path, json_avro_schema, overwrite=True)
        sql += "\nTBLPROPERTIES ('avro.schema.url' = '%s')" % avro_schema_url
      else:
        sql += "\nTBLPROPERTIES ('avro.schema.literal' = '%s')" % json_avro_schema

    return sql
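
# A minimal standalone sketch (not part of the original module) of the Avro schema
# construction performed in make_create_table_sql above: each column becomes a
# nullable union field and the record is serialized with json.dumps. The helper
# name and the example columns are hypothetical.
from json import dumps

def build_avro_schema_literal(cols):
  """cols: iterable of (name, avro_type) pairs, e.g. [('id', 'int'), ('name', 'string')]."""
  fields = [{'name': name, 'type': ['null', avro_type]} for name, avro_type in cols]
  return dumps({'name': 'my_record', 'type': 'record', 'fields': fields})

# Example: the string that would land in TBLPROPERTIES ('avro.schema.literal' = '...').
print(build_avro_schema_literal([('id', 'int'), ('name', 'string')]))
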
def end_bulk_load_table(self):
    super(ImpalaDbConnection, self).end_bulk_load_table()
    hdfs = create_default_hdfs_client()
    # pywebhdfs takes paths relative to the HDFS root, hence the lstrip('/').
    pywebhdfs_dirname = dirname(self.hdfs_file_path).lstrip('/')
    hdfs.make_dir(pywebhdfs_dirname)
    pywebhdfs_file_path = pywebhdfs_dirname + '/' + basename(self.hdfs_file_path)
    try:
      # TODO: Only delete the file if it exists
      hdfs.delete_file_dir(pywebhdfs_file_path)
    except Exception as e:
      LOG.debug(e)
    # Upload the locally staged data file into the table's HDFS directory.
    with open(self._bulk_load_data_file.name) as readable_file:
      hdfs.create_file(pywebhdfs_file_path, readable_file)
    self._bulk_load_data_file.close()
    self.execute("INVALIDATE METADATA %s" % self._bulk_load_table.name)
    if self._bulk_load_non_text_table:
      # Convert the TEXTFILE staging table into the non-text table via Hive, then
      # drop the staging table.
      self.hive_connection.execute('CREATE TABLE %s AS SELECT * FROM %s'
          % (self._bulk_load_non_text_table.name, self._bulk_load_table.name))
      self.drop_table(self._bulk_load_table.name)
      self.execute("INVALIDATE METADATA %s" % self._bulk_load_non_text_table.name)
    self._bulk_load_data_file = None
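
# A minimal sketch of the HDFS upload pattern used by end_bulk_load_table above,
# written against pywebhdfs directly. The host, port, and paths are hypothetical;
# note that pywebhdfs takes paths relative to the HDFS root (no leading slash),
# which is why the methods above call lstrip('/').
from os.path import dirname
from pywebhdfs.webhdfs import PyWebHdfsClient

def upload_to_hdfs(local_path, hdfs_path, host='localhost', port='50070'):
  hdfs = PyWebHdfsClient(host=host, port=port, user_name='hdfs')
  hdfs_path = hdfs_path.lstrip('/')
  hdfs.make_dir(dirname(hdfs_path))
  try:
    # Without overwrite, WebHDFS refuses to create a file that already exists.
    hdfs.delete_file_dir(hdfs_path)
  except Exception:
    pass
  with open(local_path) as readable_file:
    hdfs.create_file(hdfs_path, readable_file)
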
def end_bulk_load_table(self, create_tables):
    DbConnection.end_bulk_load_table(self, create_tables)
    # Use the default HDFS client unless an explicit host/port was configured.
    if self.hdfs_host is None:
      hdfs = create_default_hdfs_client()
    else:
      hdfs = get_hdfs_client(self.hdfs_host, self.hdfs_port, user_name='hdfs')
    pywebhdfs_dirname = dirname(self.hdfs_file_path).lstrip('/')
    hdfs.make_dir(pywebhdfs_dirname)
    pywebhdfs_file_path = pywebhdfs_dirname + '/' + basename(self.hdfs_file_path)
    try:
      # TODO: Only delete the file if it exists
      hdfs.delete_file_dir(pywebhdfs_file_path)
    except Exception as e:
      LOG.debug(e)
    with open(self._bulk_load_data_file.name) as readable_file:
      hdfs.create_file(pywebhdfs_file_path, readable_file)
    self._bulk_load_data_file.close()
    if self._bulk_load_non_text_table:
      # Copy the bulk loaded text data into the non-text table, then drop the
      # TEXTFILE staging table.
      if create_tables:
        self.create_table(self._bulk_load_non_text_table)
      self.execute('INSERT INTO TABLE %s SELECT * FROM %s'
          % (self._bulk_load_non_text_table.name, self._bulk_load_table.name))
      self.drop_table(self._bulk_load_table.name)
    self._bulk_load_data_file = None
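
# A minimal sketch of the staging-table conversion that ends the method above:
# data is bulk loaded into a TEXTFILE staging table, copied into the final
# (e.g. Parquet or Avro) table with INSERT ... SELECT, and the staging table is
# then dropped. The 'cursor' object and table names are hypothetical.
def convert_staging_table(cursor, staging_table_name, final_table_name):
  cursor.execute('INSERT INTO TABLE %s SELECT * FROM %s'
      % (final_table_name, staging_table_name))
  cursor.execute('DROP TABLE IF EXISTS %s' % staging_table_name)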