Example No. 1
def test_is_valid(self):
    self.assertTrue(AthenaCompression.is_valid('snappy'))
    self.assertFalse(AthenaCompression.is_valid(None))
    self.assertFalse(AthenaCompression.is_valid(''))
    self.assertFalse(AthenaCompression.is_valid('foobar'))
Example No. 2
def to_sql(df,
           name,
           conn,
           location,
           schema='default',
           index=False,
           index_label=None,
           chunksize=None,
           if_exists='fail',
           compression=None,
           flavor='spark',
           type_mappings=to_sql_type_mappings):
    # TODO: Support orc, avro, json, csv, or tsv formats
    # TODO: Support partitioning
    if if_exists not in ('fail', 'replace', 'append'):
        raise ValueError('`{0}` is not valid for if_exists'.format(if_exists))
    if compression is not None and not AthenaCompression.is_valid(compression):
        raise ValueError(
            '`{0}` is not valid for compression'.format(compression))

    import pyarrow as pa
    import pyarrow.parquet as pq
    bucket_name, key_prefix = parse_output_location(location)
    bucket = conn.session.resource('s3',
                                   region_name=conn.region_name,
                                   **conn._client_kwargs).Bucket(bucket_name)
    cursor = conn.cursor()
    retry_config = conn.retry_config

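    # Look up the target table in information_schema to see whether it already exists.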
    table = cursor.execute("""
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = '{schema}'
    AND table_name = '{table}'
    """.format(schema=schema, table=name)).fetchall()
    if if_exists == 'fail':
        if table:
            raise OperationalError('Table `{0}.{1}` already exists.'.format(
                schema, name))
    elif if_exists == 'replace':
        if table:
            cursor.execute("""
            DROP TABLE {schema}.{table}
            """.format(schema=schema, table=name))
            objects = bucket.objects.filter(Prefix=key_prefix)
            if list(objects.limit(1)):
                objects.delete()

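    # Optionally materialize the index as a column, then upload the DataFrame
    # to S3 as Parquet, one object per chunk.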
    if index:
        reset_index(df, index_label)
    for chunk in get_chunks(df, chunksize):
        table = pa.Table.from_pandas(chunk)
        buf = pa.BufferOutputStream()
        pq.write_table(table, buf, compression=compression, flavor=flavor)
        retry_api_call(bucket.put_object,
                       config=retry_config,
                       Body=buf.getvalue().to_pybytes(),
                       Key=key_prefix + str(uuid.uuid4()))

    ddl = generate_ddl(df=df,
                       name=name,
                       location=location,
                       schema=schema,
                       compression=compression,
                       type_mappings=type_mappings)
    cursor.execute(ddl)
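A minimal usage sketch for this variant, assuming it is importable as pyathena.util.to_sql (as in older PyAthena releases); the bucket, prefix, and column names below are illustrative:

import pandas as pd
from pyathena import connect
from pyathena.util import to_sql  # assumed import path for this variant

conn = connect(s3_staging_dir="s3://example-bucket/athena-staging/",
               region_name="us-west-2")
df = pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})

# Writes one Parquet object per chunk under the given prefix,
# then issues the generated CREATE TABLE DDL.
to_sql(df,
       "example_table",
       conn,
       "s3://example-bucket/path/to/example_table/",
       schema="default",
       if_exists="replace",
       compression="snappy")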
Example No. 3
def test_is_valid(self):
    self.assertTrue(AthenaCompression.is_valid("snappy"))
    self.assertFalse(AthenaCompression.is_valid(None))
    self.assertFalse(AthenaCompression.is_valid(""))
    self.assertFalse(AthenaCompression.is_valid("foobar"))
Example No. 4
def to_sql(
    df,
    name,
    conn,
    location,
    schema="default",
    index=False,
    index_label=None,
    partitions=None,
    chunksize=None,
    if_exists="fail",
    compression=None,
    flavor="spark",
    type_mappings=to_sql_type_mappings,
    executor_class=ThreadPoolExecutor,
    max_workers=(cpu_count() or 1) * 5,
):
    # TODO: Support orc, avro, json, csv, or tsv formats
    if if_exists not in ("fail", "replace", "append"):
        raise ValueError("`{0}` is not valid for if_exists".format(if_exists))
    if compression is not None and not AthenaCompression.is_valid(compression):
        raise ValueError("`{0}` is not valid for compression".format(compression))
    if partitions is None:
        partitions = []

    bucket_name, key_prefix = parse_output_location(location)
    bucket = conn.session.resource(
        "s3", region_name=conn.region_name, **conn._client_kwargs
    ).Bucket(bucket_name)
    cursor = conn.cursor()

    table = cursor.execute(
        """
    SELECT table_name
    FROM information_schema.tables
    WHERE table_schema = '{schema}'
    AND table_name = '{table}'
    """.format(
            schema=schema, table=name
        )
    ).fetchall()
    if if_exists == "fail":
        if table:
            raise OperationalError(
                "Table `{0}.{1}` already exists.".format(schema, name)
            )
    elif if_exists == "replace":
        if table:
            cursor.execute(
                """
            DROP TABLE {schema}.{table}
            """.format(
                    schema=schema, table=name
                )
            )
            objects = bucket.objects.filter(Prefix=key_prefix)
            if list(objects.limit(1)):
                objects.delete()

    if index:
        reset_index(df, index_label)
    with executor_class(max_workers=max_workers) as e:
        futures = []
        session_kwargs = deepcopy(conn._session_kwargs)
        session_kwargs.update({"profile_name": conn.profile_name})
        client_kwargs = deepcopy(conn._client_kwargs)
        client_kwargs.update({"region_name": conn.region_name})
        if partitions:
            for keys, group in df.groupby(by=partitions, observed=True):
                keys = keys if isinstance(keys, tuple) else (keys,)
                group = group.drop(partitions, axis=1)
                partition_prefix = "/".join(
                    ["{0}={1}".format(key, val) for key, val in zip(partitions, keys)]
                )
                for chunk in get_chunks(group, chunksize):
                    futures.append(
                        e.submit(
                            to_parquet,
                            chunk,
                            bucket_name,
                            "{0}{1}/".format(key_prefix, partition_prefix),
                            conn._retry_config,
                            session_kwargs,
                            client_kwargs,
                            compression,
                            flavor,
                        )
                    )
        else:
            for chunk in get_chunks(df, chunksize):
                futures.append(
                    e.submit(
                        to_parquet,
                        chunk,
                        bucket_name,
                        key_prefix,
                        conn._retry_config,
                        session_kwargs,
                        client_kwargs,
                        compression,
                        flavor,
                    )
                )
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            _logger.info("to_parquet: {0}".format(result))

    ddl = generate_ddl(
        df=df,
        name=name,
        location=location,
        schema=schema,
        partitions=partitions,
        compression=compression,
        type_mappings=type_mappings,
    )
    _logger.info(ddl)
    cursor.execute(ddl)
    if partitions:
        repair = "MSCK REPAIR TABLE {0}.{1}".format(schema, name)
        _logger.info(repair)
        cursor.execute(repair)
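A hedged sketch of calling the partition-aware variant above; the partition column `dt` must exist in the DataFrame, and the import path is an assumption:

import pandas as pd
from pyathena import connect
from pyathena.util import to_sql  # assumed import path

conn = connect(s3_staging_dir="s3://example-bucket/athena-staging/",
               region_name="us-west-2")
df = pd.DataFrame({"id": [1, 2], "dt": ["2023-01-01", "2023-01-02"]})

# Rows are grouped by `dt` and written to Hive-style key=value/ prefixes in parallel,
# then MSCK REPAIR TABLE registers the resulting partitions.
to_sql(df,
       "partitioned_table",
       conn,
       "s3://example-bucket/path/to/partitioned_table/",
       schema="default",
       partitions=["dt"],
       if_exists="append",
       compression="snappy",
       max_workers=8)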
Example No. 5
def test_is_valid(self):
    assert AthenaCompression.is_valid("snappy")
    assert AthenaCompression.is_valid("SNAPPY")
    assert not AthenaCompression.is_valid("")
    assert not AthenaCompression.is_valid("foobar")
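A plausible sketch of AthenaCompression consistent with these tests (None and empty strings rejected, matching case-insensitive); this is not necessarily PyAthena's actual implementation, and any codec other than snappy listed here is an assumption:

import enum

class AthenaCompression(enum.Enum):
    SNAPPY = "snappy"
    GZIP = "gzip"  # assumed additional codec
    ZSTD = "zstd"  # assumed additional codec

    @classmethod
    def is_valid(cls, value):
        # Reject None and empty strings, then compare case-insensitively
        # against the known codec values.
        if not value:
            return False
        return value.lower() in {member.value for member in cls}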
Example No. 6
def to_sql(
    df: "DataFrame",
    name: str,
    conn: "Connection",
    location: str,
    schema: str = "default",
    index: bool = False,
    index_label: Optional[str] = None,
    partitions: Optional[List[str]] = None,
    chunksize: Optional[int] = None,
    if_exists: str = "fail",
    compression: Optional[str] = None,
    flavor: str = "spark",
    type_mappings: Callable[["Series"], str] = to_sql_type_mappings,
    executor_class: Type[Union[ThreadPoolExecutor,
                               ProcessPoolExecutor]] = ThreadPoolExecutor,
    max_workers: int = (cpu_count() or 1) * 5,
    repair_table: bool = True,
) -> None:
    # TODO: Support orc, avro, json, csv, or tsv formats
    if if_exists not in ("fail", "replace", "append"):
        raise ValueError(f"`{if_exists}` is not valid for if_exists")
    if compression is not None and not AthenaCompression.is_valid(compression):
        raise ValueError(f"`{compression}` is not valid for compression")
    if partitions is None:
        partitions = []
    if not location.endswith("/"):
        location += "/"

    bucket_name, key_prefix = parse_output_location(location)
    bucket = conn.session.resource("s3",
                                   region_name=conn.region_name,
                                   **conn._client_kwargs).Bucket(bucket_name)
    cursor = conn.cursor()

    table = cursor.execute(
        textwrap.dedent(f"""
            SELECT table_name
            FROM information_schema.tables
            WHERE table_schema = '{schema}'
            AND table_name = '{name}'
            """)).fetchall()
    if if_exists == "fail":
        if table:
            raise OperationalError(f"Table `{schema}.{name}` already exists.")
    elif if_exists == "replace":
        if table:
            cursor.execute(
                textwrap.dedent(f"""
                    DROP TABLE {schema}.{name}
                    """))
            objects = bucket.objects.filter(Prefix=key_prefix)
            if list(objects.limit(1)):
                objects.delete()

    if index:
        reset_index(df, index_label)
    with executor_class(max_workers=max_workers) as e:
        futures = []
        session_kwargs = deepcopy(conn._session_kwargs)
        session_kwargs.update({"profile_name": conn.profile_name})
        client_kwargs = deepcopy(conn._client_kwargs)
        client_kwargs.update({"region_name": conn.region_name})
        partition_prefixes = []
        if partitions:
            for keys, group in df.groupby(by=partitions, observed=True):
                keys = keys if isinstance(keys, tuple) else (keys, )
                group = group.drop(partitions, axis=1)
                partition_prefix = "/".join(
                    [f"{key}={val}" for key, val in zip(partitions, keys)])
                partition_prefixes.append((
                    ", ".join([
                        f"`{key}` = '{val}'"
                        for key, val in zip(partitions, keys)
                    ]),
                    f"{location}{partition_prefix}/",
                ))
                for chunk in get_chunks(group, chunksize):
                    futures.append(
                        e.submit(
                            to_parquet,
                            chunk,
                            bucket_name,
                            f"{key_prefix}{partition_prefix}/",
                            conn._retry_config,
                            session_kwargs,
                            client_kwargs,
                            compression,
                            flavor,
                        ))
        else:
            for chunk in get_chunks(df, chunksize):
                futures.append(
                    e.submit(
                        to_parquet,
                        chunk,
                        bucket_name,
                        key_prefix,
                        conn._retry_config,
                        session_kwargs,
                        client_kwargs,
                        compression,
                        flavor,
                    ))
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            _logger.info(f"to_parquet: {result}")

    ddl = generate_ddl(
        df=df,
        name=name,
        location=location,
        schema=schema,
        partitions=partitions,
        compression=compression,
        type_mappings=type_mappings,
    )
    _logger.info(ddl)
    cursor.execute(ddl)
    if partitions and repair_table:
        for partition in partition_prefixes:
            add_partition = textwrap.dedent(f"""
                ALTER TABLE `{schema}`.`{name}`
                ADD IF NOT EXISTS PARTITION ({partition[0]}) LOCATION '{partition[1]}'
                """)
            _logger.info(add_partition)
            cursor.execute(add_partition)
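In this variant, repair_table=False skips the generated ALTER TABLE ... ADD IF NOT EXISTS PARTITION statements, so partition registration is left to the caller. A short sketch, reusing the connection and DataFrame from the previous example:

to_sql(df,
       "partitioned_table",
       conn,
       "s3://example-bucket/path/to/partitioned_table",  # a trailing "/" is appended automatically
       partitions=["dt"],
       if_exists="append",
       repair_table=False)

# Register the new partitions separately, e.g. with MSCK REPAIR TABLE.
cursor = conn.cursor()
cursor.execute("MSCK REPAIR TABLE default.partitioned_table")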