Example #1
0
def from_hdfs(ic,
              path,
              schema,
              table=None,
              overwrite=False,
              file_format='TEXTFILE',
              partition_schema=None,
              field_terminator='\t',
              line_terminator='\n',
              escape_char='\\'):
    """Create a BDF backed by an external file in HDFS.

    A CREATE statement is built by `_create_table` from `schema`, `path`
    and the text-format options, executed on the context's cursor, and the
    resulting table is wrapped with `from_sql_table`.

    When `table` is None, a randomly named table inside `ic._temp_db` is
    used.  When `overwrite` is true, ``DROP TABLE IF EXISTS`` is issued for
    the target name before the table is created.

    The file must be Impala-compatible.

    Raises NotImplementedError if `partition_schema` is given.
    """
    # Partitioned tables are rejected before any statement is executed.
    if partition_schema is not None:
        raise NotImplementedError(
            "Partitions not yet implemented in .from_hdfs()")

    # No explicit name: fall back to a random table in the temp database.
    if table is None:
        random_name = _random_id('tmp_table_', 8)
        table = "%s.%s" % (ic._temp_db, random_name)
    target = _to_TableName(table)

    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % target.to_sql())

    format_options = dict(
        file_format=file_format,
        field_terminator=field_terminator,
        line_terminator=line_terminator,
        escape_char=escape_char)
    ddl = _create_table(target, schema, path=path, **format_options)
    ic._cursor.execute(ddl)
    return from_sql_table(ic, target.to_sql())
Example #2
0
    def store(self,
              path=None,
              table=None,
              file_format='TEXTFILE',
              field_terminator='\t',
              line_terminator='\n',
              escape_char='\\',
              overwrite=False):
        """
        Materialize the results and store them in HDFS. Functions as an
        EXTERNAL table.

        Implemented through a `CREATE TABLE AS SELECT`; the work itself is
        delegated to `self._store`, whose result is returned.

        If `table` is None, a randomly named table in `self._temp_db` is
        used.  If `path` is None, a directory with the same random name
        under `self._temp_dir` is used.  The remaining arguments are passed
        through to `self._store` unchanged.
        """
        # One random identifier serves as both the default table name and
        # the default directory name.
        random_name = _random_id('tmp_table_', 8)
        if table is None:
            table = "%s.%s" % (self._temp_db, random_name)
        if path is None:
            path = os.path.join(self._temp_dir, random_name)

        store_args = dict(
            path=path,
            table_name=_to_TableName(table),
            file_format=file_format,
            field_terminator=field_terminator,
            line_terminator=line_terminator,
            escape_char=escape_char,
            overwrite=overwrite)
        return self._store(**store_args)
Example #3
0
    def store(self,
              path=None,
              table=None,
              file_format='TEXTFILE',
              field_terminator='\t',
              line_terminator='\n',
              escape_char='\\',
              overwrite=False):
        """Materialize the results and store them in HDFS.

        Implemented through a `CREATE TABLE AS SELECT`: the prefix produced
        by `_create_table_as_select` is concatenated with this object's own
        SQL and executed on `self._cursor`.

        If `table` is None, a randomly named table in `self._temp_db` is
        used; if `path` is None, a directory with the same random name
        under `self._temp_dir` is used.  With `overwrite` set,
        ``DROP TABLE IF EXISTS`` is issued for the target first.

        Returns the result of `from_sql_table` for the stored table.
        """
        # The same random identifier backs both defaults.
        random_name = _random_id('tmp_table_', 8)
        if table is None:
            table = "%s.%s" % (self._temp_db, random_name)
        if path is None:
            path = os.path.join(self._temp_dir, random_name)
        target = _to_TableName(table)

        if overwrite:
            drop_stmt = "DROP TABLE IF EXISTS %s" % target.to_sql()
            self._cursor.execute(drop_stmt)

        storage_options = dict(
            path=path,
            file_format=file_format,
            field_terminator=field_terminator,
            line_terminator=line_terminator,
            escape_char=escape_char)
        ctas_prefix = _create_table_as_select(target, **storage_options)
        statement = ctas_prefix + self.to_sql()
        self._cursor.execute(statement)
        return from_sql_table(self._ic, target.to_sql())
Example #4
0
def from_sql_table(ic, table):
    """Create a BDF from a table name usable in Impala.

    The table's schema is looked up through `_get_table_schema_hack` on
    the context's cursor, and every column of that schema becomes one item
    of the select list of the returned `BigDataFrame`.
    """
    table_ref = BaseTableRef(_to_TableName(table))
    columns = _get_table_schema_hack(ic._cursor, table_ref.to_sql())

    # Only the column name is used; the type half of each pair is ignored.
    select_items = []
    for (col, ty) in columns:
        select_items.append(SelectItem(expr=Literal(col)))

    statement = SelectStmt(tuple(select_items), table_ref)
    return BigDataFrame(ic, statement)
Example #5
0
def from_sql_table(ic, table):
    """Create a BDF from a table name usable in Impala.

    Fetches the table's (column, type) pairs via `_get_table_schema_hack`
    and returns a `BigDataFrame` whose select statement lists each column
    of that table.
    """
    resolved_name = _to_TableName(table)
    source = BaseTableRef(resolved_name)
    column_info = _get_table_schema_hack(ic._cursor, source.to_sql())
    # The type element of each pair is unpacked but deliberately unused.
    projection = tuple(
        SelectItem(expr=Literal(column)) for (column, _type) in column_info)
    return BigDataFrame(ic, SelectStmt(projection, source))
Example #6
0
    def store_managed(self, table, file_format='PARQUET',
                      field_terminator='\t', line_terminator='\n',
                      escape_char='\\', overwrite=False):
        """Materialize the results and store them in HDFS as an Impala managed table.

        Implemented through a `CREATE TABLE AS SELECT`.

        Delegates to `self._store` with ``path=None`` and the parsed form
        of `table`; every other argument is forwarded unchanged and the
        result of `self._store` is returned.
        """
        managed_args = {
            'path': None,
            'table_name': _to_TableName(table),
            'file_format': file_format,
            'field_terminator': field_terminator,
            'line_terminator': line_terminator,
            'escape_char': escape_char,
            'overwrite': overwrite,
        }
        return self._store(**managed_args)
Example #7
0
 def save_view(self, name, overwrite=False):
     """Create a named view representing this BDF for later reference.

     Executes ``CREATE VIEW <name> AS <query>`` on the context's cursor,
     where the query is the SQL of this object's AST.  With `overwrite`
     set, ``DROP VIEW IF EXISTS`` is issued for the name first.

     Returns the result of `from_sql_table` for the new view.
     """
     # TODO: is this fn useful?
     view_name = _to_TableName(name)
     if overwrite:
         drop_stmt = 'DROP VIEW IF EXISTS %s' % view_name.to_sql()
         self._ic._cursor.execute(drop_stmt)
     create_stmt = 'CREATE VIEW %s AS %s' % (
         view_name.to_sql(), self._query_ast.to_sql())
     self._ic._cursor.execute(create_stmt)
     return from_sql_table(self._ic, view_name.to_sql())
Example #8
0
 def save_view(self, name, overwrite=False):
     """Create a named view representing this BDF for later reference.

     The view is defined as the SQL rendered from `self._query_ast`.  When
     `overwrite` is true, any existing view of that name is dropped with
     ``DROP VIEW IF EXISTS`` before the new one is created.

     Returns whatever `from_sql_table` yields for the view name.
     """
     # TODO: is this fn useful?
     target = _to_TableName(name)
     if overwrite:
         self._ic._cursor.execute(
             'DROP VIEW IF EXISTS %s' % target.to_sql())
     view_sql = target.to_sql()
     select_sql = self._query_ast.to_sql()
     definition = 'CREATE VIEW %s AS %s' % (view_sql, select_sql)
     self._ic._cursor.execute(definition)
     return from_sql_table(self._ic, target.to_sql())
Example #9
0
def from_pandas(ic, df, table=None, path=None, method='in_query',
                file_format='TEXTFILE', field_terminator='\t',
                line_terminator='\n', escape_char='\\', overwrite=False):
    """Create a BDF by shipping an in-memory pandas `DataFrame` into Impala.

    A table is created from the frame's column names and dtypes, then
    filled in one of two ways selected by `method`:

    * ``'in_query'`` -- every row is rendered with `_py_to_sql_string` into
      a single ``INSERT INTO ... VALUES`` statement.  An empty frame issues
      no INSERT and leaves the new table empty.
    * ``'webhdfs'`` -- the frame is serialized as delimited text and
      written to ``<path>/data.txt`` through ``ic.hdfs_client()``.

    Parameters
    ----------
    ic : ImpalaContext
    df : pandas.DataFrame
    table : str, optional
        Target table; defaults to a randomly named table in `ic._temp_db`.
    path : str, optional
        The directory (not the filename) for the table data; defaults to a
        randomly named directory under `ic._temp_dir`.
    method : {'in_query', 'webhdfs'}
    file_format, field_terminator, line_terminator, escape_char
        Passed to `_create_table`; the three text options are also used to
        serialize the frame for ``'webhdfs'``.
    overwrite : bool
        Drop an existing table of the same name first; also forwarded to
        the HDFS client's ``write`` for ``'webhdfs'``.

    Returns
    -------
    The result of `from_sql_table` for the created table.

    Raises
    ------
    ValueError
        If `method` is not recognized, or if ``'webhdfs'`` is combined with
        a `file_format` other than ``'TEXTFILE'``.
    """
    # Validate the cheap string arguments before touching Impala, so a bad
    # call cannot leave a dropped or half-populated table behind.
    if method not in ('in_query', 'webhdfs'):
        raise ValueError(
            "method must be 'in_query' or 'webhdfs'; got %s" % method)
    if method == 'webhdfs' and file_format != 'TEXTFILE':
        raise ValueError("only TEXTFILE format supported for webhdfs")

    # TODO: this is not atomic
    assert isinstance(ic, ImpalaContext)
    temp_table = _random_id('tmp_table_', 8)
    if table is None:
        table = "%s.%s" % (ic._temp_db, temp_table)
    if path is None:
        # From here on `path` is never None, so the webhdfs branch needs no
        # separate "must supply a path" check.
        path = os.path.join(ic._temp_dir, temp_table)
    table_name = _to_TableName(table)
    if overwrite:
        ic._cursor.execute("DROP TABLE IF EXISTS %s" % table_name.to_sql())

    columns = list(df.columns)
    types = [_numpy_dtype_to_impala_PrimitiveType(ty) for ty in df.dtypes]
    # Materialize the pairs: on Python 3 a bare zip() can be consumed only
    # once.
    schema = list(zip(columns, types))
    create_stmt = _create_table(table_name, schema, path=path,
                                file_format=file_format,
                                field_terminator=field_terminator,
                                line_terminator=line_terminator,
                                escape_char=escape_char)
    ic._cursor.execute(create_stmt)

    if method == 'in_query':
        rows = ['(%s)' % ', '.join(map(_py_to_sql_string, row))
                for row in df.values]
        # "INSERT ... VALUES" with no tuples is not a valid statement, so
        # an empty frame simply leaves the freshly created table empty.
        if rows:
            query = "INSERT INTO %s VALUES " % table_name.to_sql()
            query += ', '.join(rows)
            ic._cursor.execute(query)
    else:  # method == 'webhdfs', guaranteed by the validation above
        hdfs_client = ic.hdfs_client()
        raw_data = StringIO()
        try:
            # NOTE(review): newer pandas releases renamed `line_terminator`
            # to `lineterminator` -- confirm against the supported version.
            df.to_csv(raw_data, sep=field_terminator,
                      line_terminator=line_terminator, quoting=csv.QUOTE_NONE,
                      escapechar=escape_char, header=False, index=False)
            hdfs_client.write(
                os.path.join(path, 'data.txt'), raw_data.getvalue(),
                overwrite=overwrite)
        finally:
            # Release the buffer even if serialization or the upload fails.
            raw_data.close()
    return from_sql_table(ic, table_name.to_sql())
Example #10
0
File: bdf.py Project: cgc17/impyla
    def store(self, path=None, table=None, file_format='TEXTFILE',
              field_terminator='\t', line_terminator='\n', escape_char='\\',
              overwrite=False):
        """Materialize the results and store them in HDFS. Functions as an EXTERNAL table.

        Implemented through a `CREATE TABLE AS SELECT`.

        Missing arguments are defaulted from one random identifier: `table`
        becomes a table of that name in `self._temp_db`, and `path` becomes
        a directory of that name under `self._temp_dir`.  Everything is
        then handed to `self._store`, whose result is returned.
        """
        default_name = _random_id('tmp_table_', 8)
        table = ("%s.%s" % (self._temp_db, default_name)
                 if table is None else table)
        path = (os.path.join(self._temp_dir, default_name)
                if path is None else path)
        resolved = _to_TableName(table)
        return self._store(
            path=path,
            table_name=resolved,
            file_format=file_format,
            field_terminator=field_terminator,
            line_terminator=line_terminator,
            escape_char=escape_char,
            overwrite=overwrite,
        )
Example #11
0
def from_hdfs(ic, path, schema, table=None, overwrite=False,
              file_format='TEXTFILE', partition_schema=None,
              field_terminator='\t', line_terminator='\n', escape_char='\\'):
    """Create a BDF backed by an external file in HDFS.

    File must be Impala-compatible.

    The table is created from the statement returned by `_create_table`
    and then wrapped with `from_sql_table`.  If `table` is None, a randomly
    named table in `ic._temp_db` is used; if `overwrite` is true, the
    target is dropped (``DROP TABLE IF EXISTS``) before being created.

    Raises NotImplementedError when `partition_schema` is not None.
    """
    if partition_schema is not None:
        message = "Partitions not yet implemented in .from_hdfs()"
        raise NotImplementedError(message)

    if table is None:
        generated = _random_id('tmp_table_', 8)
        table = "%s.%s" % (ic._temp_db, generated)
    hdfs_table = _to_TableName(table)

    if overwrite:
        drop_stmt = "DROP TABLE IF EXISTS %s" % hdfs_table.to_sql()
        ic._cursor.execute(drop_stmt)

    ddl = _create_table(
        hdfs_table,
        schema,
        path=path,
        file_format=file_format,
        field_terminator=field_terminator,
        line_terminator=line_terminator,
        escape_char=escape_char,
    )
    ic._cursor.execute(ddl)
    return from_sql_table(ic, hdfs_table.to_sql())
Example #12
0
    def store(self, path=None, table=None, file_format='TEXTFILE',
              field_terminator='\t', line_terminator='\n', escape_char='\\',
              overwrite=False):
        """Materialize the results and store them in HDFS.

        Implemented through a `CREATE TABLE AS SELECT`.

        `table` defaults to a randomly named table in `self._temp_db` and
        `path` to a directory of the same name under `self._temp_dir`.
        When `overwrite` is true the target table is dropped first.  The
        statement executed is the output of `_create_table_as_select`
        followed by this object's own SQL.

        Returns the result of `from_sql_table` for the stored table.
        """
        fallback_name = _random_id('tmp_table_', 8)
        if table is None:
            table = "%s.%s" % (self._temp_db, fallback_name)
        if path is None:
            path = os.path.join(self._temp_dir, fallback_name)
        destination = _to_TableName(table)

        if overwrite:
            self._cursor.execute(
                "DROP TABLE IF EXISTS %s" % destination.to_sql())

        ctas_prefix = _create_table_as_select(
            destination,
            path=path,
            file_format=file_format,
            field_terminator=field_terminator,
            line_terminator=line_terminator,
            escape_char=escape_char,
        )
        # The CTAS prefix and the SELECT are sent as one statement.
        full_statement = ctas_prefix + self.to_sql()
        self._cursor.execute(full_statement)
        return from_sql_table(self._ic, destination.to_sql())