def _create_tables():
    redshift = RedshiftHandler()
    transaction = []
    for table in TableName:
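        # The empty format args build the permanent (non-TEMP) variant of each CREATE
        # TABLE statement; compare the "TEMP "/"_temp" args used in load_tables below.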
        transaction.append(CREATE_QUERY_TEMPLATE[table.value].format("", "", table.value))

    redshift.transaction(transaction)
    def setUp(self):
        self.dss_env = MATRIX_ENV_TO_DSS_ENV[os.environ['DEPLOYMENT_STAGE']]
        self.api_url = f"https://{os.environ['API_HOST']}/v0"
        self.res_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "res")
        self.headers = {'Content-type': 'application/json', 'Accept': 'application/json'}
        self.verbose = True
        self.s3_file_system = s3fs.S3FileSystem(anon=False)
        self.redshift_handler = RedshiftHandler()
    def __init__(self, bundle_uuid, bundle_version, event_type):
        logger.info(
            f"Running NotificationHandler with parameters: {bundle_uuid}, {bundle_version}, {event_type}"
        )
        self.bundle_uuid = bundle_uuid
        self.bundle_version = bundle_version
        self.event_type = event_type

        self.redshift = RedshiftHandler()
    def __init__(self, request_id: str, bundles_per_worker: int = 100):
        Logging.set_correlation_id(logger, value=request_id)

        self.request_id = request_id
        self.bundles_per_worker = bundles_per_worker
        self.request_tracker = RequestTracker(request_id)
        self.dynamo_handler = DynamoHandler()
        self.sqs_handler = SQSHandler()
        self.infra_config = MatrixInfraConfig()
        self.redshift_config = MatrixRedshiftConfig()
        self.query_results_bucket = os.environ['MATRIX_QUERY_RESULTS_BUCKET']
        self.s3_handler = S3Handler(os.environ['MATRIX_QUERY_BUCKET'])
        self.redshift_handler = RedshiftHandler()
def load_tables(job_id: str, is_update: bool = False):
    """
    Creates tables and loads PSVs from S3 into Redshift via SQL COPY.
    :param job_id: UUID of the S3 prefix containing the data to load
    :param is_update: True if existing database rows will be updated, else False
    """
    _create_tables()
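    # Locking the write_lock table presumably serializes concurrent load jobs for the
    # duration of this transaction.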
    lock_query = """LOCK TABLE write_lock"""
    delete_query_template = """DELETE FROM {0} USING {0}_temp WHERE {0}.{1} = {0}_temp.{1};"""
    insert_query_template = """INSERT INTO {0} SELECT * FROM {0}_temp;"""

    redshift = RedshiftHandler()
    transaction = [lock_query]
    for table in TableName:
        if (is_update and table == TableName.FEATURE) or table == TableName.WRITE_LOCK:
            continue
        s3_prefix = f"s3://{os.environ['MATRIX_PRELOAD_BUCKET']}/{job_id}/{table.value}"
        if table == TableName.CELL:
            s3_prefix += '/'
        iam = os.environ['MATRIX_REDSHIFT_IAM_ROLE_ARN']

        table_name = table.value if not is_update else f"{table.value}_temp"
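        # The COPY options below vary by table: cell and expression PSVs are gzipped,
        # COMPUPDATE ON asks Redshift to apply automatic compression analysis during
        # the load, and COMPROWS raises the row sample used for that analysis on the
        # large expression table.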

        if table == TableName.FEATURE:
            copy_stmt = f"COPY {table_name} FROM '{s3_prefix}' iam_role '{iam}' COMPUPDATE ON;"
        elif table == TableName.CELL:
            copy_stmt = f"COPY {table_name} FROM '{s3_prefix}' iam_role '{iam}' GZIP COMPUPDATE ON;"
        elif table == TableName.EXPRESSION:
            copy_stmt = f"COPY {table_name} FROM '{s3_prefix}' iam_role '{iam}' GZIP COMPUPDATE ON COMPROWS 10000000;"
        else:
            copy_stmt = f"COPY {table_name} FROM '{s3_prefix}' iam_role '{iam}';"

        if is_update:
            logger.info(f"ETL: Building queries to update {table.value} table")
            transaction.extend([
                CREATE_QUERY_TEMPLATE[table.value].format("TEMP ", "_temp", table_name),
                copy_stmt,
                delete_query_template.format(table.value, RedshiftHandler.PRIMARY_KEY[table]),
                insert_query_template.format(table.value)
            ])
        else:
            logger.info(f"ETL: Building queries to load {table_name} table")
            transaction.append(copy_stmt)

    logger.info("ETL: Populating Redshift tables. Committing transaction.")
    try:
        redshift.transaction(transaction)
    except psycopg2.Error as e:
        logger.error(f"Failed to populate Redshift tables. Rolling back. {e}")
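
# Illustrative only: for an update run over a hypothetical job_id of "1234", the loop
# above builds a transaction for the cell table shaped roughly like the following
# (bucket name, IAM role ARN and primary-key column are placeholders, not values taken
# from this repo):
#
#   LOCK TABLE write_lock
#   CREATE TEMP TABLE cell_temp (...);
#   COPY cell_temp FROM 's3://<preload-bucket>/1234/cell/'
#       iam_role '<redshift-iam-role-arn>' GZIP COMPUPDATE ON;
#   DELETE FROM cell USING cell_temp WHERE cell.<pk> = cell_temp.<pk>;
#   INSERT INTO cell SELECT * FROM cell_temp;
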
def _verify_load(es_query):
    dss_client = etl.get_dss_client(deployment_stage=os.environ['DEPLOYMENT_STAGE'])
    response = dss_client.post_search.iterate(es_query=es_query,
                                              replica='aws',
                                              per_page=500)
    expected_bundles = [result['bundle_fqid'] for result in response]

    print(f"Loading {len(expected_bundles)} bundles to {os.environ['DEPLOYMENT_STAGE']} complete.\n"
          f"Verifying row counts in Redshift...")
    redshift = RedshiftHandler()
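    # format_str_list is assumed to render the bundle FQIDs as a parenthesized SQL
    # IN-list, e.g. ('fqid-1', 'fqid-2'); the helper itself is defined elsewhere.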
    count_bundles_query = f"SELECT COUNT(*) FROM analysis WHERE bundle_fqid IN {format_str_list(expected_bundles)}"
    results = redshift.transaction(queries=[count_bundles_query],
                                   return_results=True)
    print(f"Found {results[0][0]} analysis rows for {len(expected_bundles)} expected bundles.")
    assert (results[0][0] == len(expected_bundles))
def _redshift_detail_lookup(name, description):
    type_ = constants.METADATA_FIELD_TO_TYPE[name]
    column_name = constants.METADATA_FIELD_TO_TABLE_COLUMN[name]
    table_name = constants.TABLE_COLUMN_TO_TABLE[column_name]
    fq_name = table_name + "." + column_name

    table_primary_key = RedshiftHandler.PRIMARY_KEY[TableName(table_name)]

    rs_handler = RedshiftHandler()

    if type_ == "categorical":
        query = query_constructor.create_field_detail_query(
            fq_name, table_name, table_primary_key, type_)
        results = dict(
            rs_handler.transaction([query],
                                   return_results=True,
                                   read_only=True))
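        # Redshift can return NULL and boolean category values; coerce them to strings
        # so the cell_counts keys serialize cleanly as JSON.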
        if None in results:
            results[""] = results[None]
            results.pop(None)
        if True in results:
            results["True"] = results[True]
            results.pop(True)
        if False in results:
            results["False"] = results[False]
            results.pop(False)
        return ({
            "field_name": name,
            "field_description": description,
            "field_type": type_,
            "cell_counts": results
        }, requests.codes.ok)

    elif type_ == "numeric":
        query = query_constructor.create_field_detail_query(
            fq_name, table_name, table_primary_key, type_)
        results = rs_handler.transaction([query],
                                         return_results=True,
                                         read_only=True)
        min_ = results[0][0]
        max_ = results[0][1]
        return ({
            "field_name": name,
            "field_description": description,
            "field_type": type_,
            "minimum": min_,
            "maximum": max_
        }, requests.codes.ok)
#!/usr/bin/env python
"""
This script creates a read-only user in the Redshift database.
"""

from matrix.common.aws.redshift_handler import RedshiftHandler
from matrix.common.config import MatrixRedshiftConfig

matrix_config = MatrixRedshiftConfig()
redshift_handler = RedshiftHandler()


def handler():
    readonly_username = matrix_config.readonly_username
    readonly_password = matrix_config.readonly_password
    drop_user_query = f"DROP USER IF EXISTS {readonly_username};"
    add_user_query = f"CREATE USER {readonly_username} WITH PASSWORD '{readonly_password}';"
    grant_public_query = f"GRANT SELECT ON ALL TABLES IN SCHEMA public TO {readonly_username};"
    grant_information_schema_query = f"GRANT SELECT ON ALL TABLES IN SCHEMA information_schema TO {readonly_username};"
    try:
        redshift_handler.transaction([drop_user_query, add_user_query])
    except Exception as e:
        print(e)
    redshift_handler.transaction(
        [grant_public_query, grant_information_schema_query])
    print("permissions applied to readonly user")


if __name__ == '__main__':
    handler()
    def __init__(self):
        self.sqs_handler = SQSHandler()
        self.s3_handler = S3Handler(os.environ["MATRIX_QUERY_BUCKET"])
        self.batch_handler = BatchHandler()
        self.redshift_handler = RedshiftHandler()
        self.matrix_infra_config = MatrixInfraConfig()