Python create_schemaの例

プログラミング言語: Python

名前空間/パッケージ名: s3parq.publish_redshift

メソッド/関数: create_schema

hotexamples.comのコード掲載数: 3

Python create_schema - 3件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのs3parq.publish_redshift.create_schemaの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

コード例 #1

ファイルを表示

    def test_create_schema(self, mock_session_helper, mock_execute):
        """Check that create_schema runs the expected CREATE EXTERNAL SCHEMA SQL.

        Args:
            mock_session_helper: Mocked session helper whose
                ``db_session_scope()`` context manager yields the scope the
                statement is executed on. Presumably injected by a patch
                decorator outside this view.
            mock_execute: Mock whose return value is set to a
                ``MockScopeObj``. Presumably injected by a patch decorator
                outside this view.
        """
        mock_execute.return_value = MockScopeObj()
        mock_session_helper.db_session_scope.return_value.__enter__ = scope_execute_mock

        schema_name, db_name, iam_role = "my_string", "my_database", "my_iam_role"

        # The backslash continuations sit inside the string literal, so the
        # leading whitespace of every continued line is part of the SQL text
        # being compared. Do not re-indent these lines.
        expected_sql = f"CREATE EXTERNAL SCHEMA IF NOT EXISTS {schema_name} \
                FROM DATA CATALOG \
                database '{db_name}' \
                iam_role '{iam_role}';"

        with mock_session_helper.db_session_scope() as mock_scope:
            publish_redshift.create_schema(
                schema_name, db_name, iam_role, mock_session_helper)
            mock_scope.execute.assert_called_once_with(expected_sql)

コード例 #2

ファイルを表示

ファイル: publish_parq.py プロジェクト: IntegriChain1/s3parq

def custom_publish(bucket: str,
                   key: str,
                   partitions: List[str],
                   dataframe: pd.DataFrame,
                   custom_redshift_columns: dict,
                   redshift_params: dict = None) -> List[str]:
    """Publish a dataframe to S3 as parquet using CUSTOM redshift column types.

    The caller supplies the redshift column definitions instead of having
    them derived, which is what enables Redshift's decimal data type. The
    dataframe is written in chunks to the given S3 location, honouring the
    requested partitions. When ``redshift_params`` is given, the Spectrum
    schema and a custom table are created before any data is written. See
    the "Custom Publishes" section of the s3parq readme for how to format
    ``custom_redshift_columns``.

    Args:
        bucket (str): S3 Bucket name
        key (str): S3 key to lead to the desired dataset
        partitions (List[str]): List of columns that should be partitioned on
        dataframe (pd.DataFrame): Dataframe to be published
        custom_redshift_columns (dict):
            Custom column data type definitions for redshift, formatted as:
                - column name (str)
                - data type (str)
        redshift_params (dict, Optional):
            Provide this dictionary to also publish to Spectrum; leave it out
            entirely to skip Spectrum. Expected entries:
                - schema_name (str): Name of the Spectrum schema to publish to
                - table_name (str): Name of the table to write the dataset as
                - iam_role (str): Role to take while writing data to Spectrum
                - region (str): AWS region for Spectrum
                - cluster_id (str): Spectrum cluster id
                - host (str): Redshift Spectrum host name
                - port (str): Redshift Spectrum port to use
                - db_name (str): Redshift Spectrum database name to use
                - ec2_user (str): If on ec2, the user that should be used

    Returns:
        A str list of all the newly published object keys

    Raises:
        ValueError: If ``redshift_params`` is given and the dataframe has a
            column named 'index'.
    """
    logger.debug("Running custom publish...")

    session_helper = None

    if redshift_params:
        if "index" in dataframe.columns:
            raise ValueError(
                "'index' is a reserved keyword in Redshift. Please remove or rename your DataFrame's 'index' column."
            )

        logger.debug(
            "Found redshift parameters. Checking validity of params...")
        redshift_params = validate_redshift_params(redshift_params)
        logger.debug("Redshift parameters valid. Opening Session helper.")

        # Connection settings are forwarded to SessionHelper under the same
        # names they have in redshift_params.
        connection_keys = ('region', 'cluster_id', 'host', 'port', 'db_name',
                           'ec2_user')
        session_helper = SessionHelper(
            **{name: redshift_params[name] for name in connection_keys})
        session_helper.configure_session_helper()

        schema_name = redshift_params['schema_name']
        publish_redshift.create_schema(schema_name,
                                       redshift_params['db_name'],
                                       redshift_params['iam_role'],
                                       session_helper)

        # Looked up only after the schema call so a missing key surfaces at
        # the same point as before.
        table_name = redshift_params['table_name']
        logger.debug(
            f"Schema {schema_name} created. Creating table {table_name}..."
        )

        publish_redshift.create_custom_table(table_name,
                                             schema_name,
                                             partitions, s3_url(bucket, key),
                                             custom_redshift_columns,
                                             session_helper)
        logger.debug(f"Custom table {table_name} created.")

    logger.debug("Checking publish params...")
    for validate in (check_empty_dataframe, check_dataframe_for_timedelta):
        validate(dataframe)
    check_partitions(partitions, dataframe)
    logger.debug("Publish params valid.")
    logger.debug("Begin writing to S3..")

    files = []
    for frame_params in _sized_dataframes(dataframe):
        lower = frame_params['lower']
        upper = frame_params['upper']
        logger.info(
            f"Publishing dataframe chunk : {lower} to {upper}"
        )
        chunk = pd.DataFrame(dataframe[lower:upper])

        # Both per-chunk steps take the same description of what to write.
        chunk_kwargs = dict(bucket=bucket,
                            key=key,
                            dataframe=chunk,
                            partitions=partitions,
                            custom_redshift_columns=custom_redshift_columns)
        _gen_parquet_to_s3(**chunk_kwargs)

        files = files + _assign_partition_meta(
            session_helper=session_helper,
            redshift_params=redshift_params,
            **chunk_kwargs)

    logger.info("Done writing to S3.")

    return files

コード例 #3

ファイルを表示

def publish(bucket: str,
            key: str,
            partitions: List[str],
            dataframe: pd.DataFrame,
            redshift_params: dict = None) -> List[str]:
    """Publish a dataframe to S3 as parquet, optionally registering it in Redshift Spectrum.

    When ``redshift_params`` is given, the Spectrum schema and table are
    created first, using column types derived from the dataframe. The
    dataframe is then validated and written to S3 chunk by chunk.

    Args:
        bucket (str): S3 bucket name.
        key (str): S3 key leading to the dataset.
        partitions (List[str]): Columns the dataset is partitioned on.
        dataframe (pd.DataFrame): Dataframe to be published.
        redshift_params (dict, Optional): Leave out to skip Spectrum
            entirely. When provided it must contain:
                - schema_name (str)
                - table_name (str)
                - iam_role (str)
                - region (str)
                - cluster_id (str)
                - host (str)
                - port (str)
                - db_name (str)

    Returns:
        The concatenation of the values returned by the partition metadata
        step for every chunk written.

    Raises:
        ValueError: If ``redshift_params`` is given and the dataframe has a
            column named 'index'.
    """
    session_helper = None

    if redshift_params:
        if "index" in dataframe.columns:
            raise ValueError(
                "'index' is a reserved keyword in Redshift. Please remove or rename your DataFrame's 'index' column."
            )

        logger.debug(
            "Found redshift parameters. Checking validity of params...")
        check_redshift_params(redshift_params)
        logger.debug("Redshift parameters valid. Opening Session helper.")
        session_helper = SessionHelper(
            region=redshift_params['region'],
            cluster_id=redshift_params['cluster_id'],
            host=redshift_params['host'],
            port=redshift_params['port'],
            db_name=redshift_params['db_name'])
        session_helper.configure_session_helper()
        publish_redshift.create_schema(redshift_params['schema_name'],
                                       redshift_params['db_name'],
                                       redshift_params['iam_role'],
                                       session_helper)
        logger.debug(
            f"Schema {redshift_params['schema_name']} created. Creating table {redshift_params['table_name']}..."
        )

        # Same helper called twice; the trailing True selects the
        # partition-column variant that create_table takes separately.
        df_types = _get_dataframe_datatypes(dataframe, partitions)
        partition_types = _get_dataframe_datatypes(dataframe, partitions, True)
        publish_redshift.create_table(redshift_params['table_name'],
                                      redshift_params['schema_name'],
                                      df_types, partition_types,
                                      s3_url(bucket, key), session_helper)
        logger.debug(f"Table {redshift_params['table_name']} created.")

    logger.info("Checking params...")
    check_empty_dataframe(dataframe)
    check_dataframe_for_timedelta(dataframe)
    check_partitions(partitions, dataframe)
    logger.info("Params valid.")
    logger.debug("Begin writing to S3..")

    files = []
    for frame in _sized_dataframes(dataframe):
        _gen_parquet_to_s3(bucket=bucket,
                           key=key,
                           dataframe=frame,
                           partitions=partitions)

        published_files = _assign_partition_meta(
            bucket=bucket,
            key=key,
            dataframe=frame,
            partitions=partitions,
            session_helper=session_helper,
            redshift_params=redshift_params)
        # Grow the result in place instead of rebuilding the list per chunk.
        files.extend(published_files)

    logger.debug("Done writing to S3.")

    return files