Example #1
from typing import Optional, Tuple

# Environment, Session, Schema, DatabaseStorageApi, DataBlockMetadata, etc.
# are assumed to be imported from the surrounding snapflow module.
def create_data_block_from_sql(
    env: Environment,
    sql: str,
    sess: Session,
    db_api: DatabaseStorageApi,
    nominal_schema: Optional[Schema] = None,
    inferred_schema: Optional[Schema] = None,
    created_by_node_key: Optional[str] = None,
) -> Tuple[DataBlockMetadata, StoredDataBlockMetadata]:
    # TODO: we are special-casing sql right now, but we could create another DataFormat
    #       (SqlQueryFormat, non-storable). Not sure how well that fits the paradigm, though:
    #       it's a fundamentally non-python operation, the only one for now -- if we had an
    #       R runtime or any other shell command, they would also be in this bucket.
    #       Fine here for now, but there is a generalization that might make the sql pipe
    #       less awkward (returning the sdb).
    logger.debug("CREATING DATA BLOCK from sql")
    tmp_name = f"_tmp_{rand_str(10)}".lower()
    sql = db_api.clean_sub_sql(sql)
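    # Materialize the (cleaned) query into a temp table so the result can be
    # counted and schema-inferred before being renamed into place below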
    create_sql = f"""
    create table {tmp_name} as
    select
    *
    from (
    {sql}
    ) as __sub
    """
    db_api.execute_sql(create_sql)
    cnt = db_api.count(tmp_name)
    if not nominal_schema:
        # Fall back to the catch-all "Any" schema when none was supplied
        nominal_schema = env.get_schema("Any", sess)
    if not inferred_schema:
        # Infer a schema from the materialized table and register the
        # generated schema with the environment
        inferred_schema = infer_schema_from_db_table(db_api, tmp_name)
        env.add_new_generated_schema(inferred_schema, sess)
    realized_schema = cast_to_realized_schema(
        env, sess, inferred_schema, nominal_schema
    )
    block = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key=inferred_schema.key if inferred_schema else None,
        nominal_schema_key=nominal_schema.key,
        realized_schema_key=realized_schema.key,
        record_count=cnt,
        created_by_node_key=created_by_node_key,
    )
    storage_url = db_api.url
    sdb = StoredDataBlockMetadata(
        id=get_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=storage_url,
        data_format=DatabaseTableFormat,
    )
    sess.add(block)
    sess.add(sdb)
    # sess.flush([block, sdb])
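    # Rename the temp table to the stored block's permanent name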
    db_api.rename_table(tmp_name, sdb.get_name())
    return block, sdb
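
A minimal usage sketch for the function above (an illustration, not part of the snapflow API surface): it assumes an existing Environment `env`, an open SQLAlchemy Session `sess`, and a `DatabaseStorageApi` `db_api` bound to the target database; the query and node key are made up.

# Hypothetical caller; env, sess, and db_api are assumed to already exist.
block, sdb = create_data_block_from_sql(
    env,
    sql="select customer_id, sum(amount) as total from orders group by customer_id",
    sess=sess,
    db_api=db_api,
    created_by_node_key="aggregate_orders",  # hypothetical node key
)
sess.commit()  # persist the metadata rows added to the session
print(block.record_count, sdb.get_name())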
Example #2
from typing import Any, Optional, Tuple

# Environment, Session, Schema, Storage, DataBlockMetadata, etc.
# are assumed to be imported from the surrounding snapflow module.
def create_data_block_from_records(
    env: Environment,
    sess: Session,
    local_storage: Storage,
    records: Any,
    nominal_schema: Optional[Schema] = None,
    inferred_schema: Optional[Schema] = None,
    created_by_node_key: Optional[str] = None,
) -> Tuple[DataBlockMetadata, StoredDataBlockMetadata]:
    from snapflow.storage.storage import LocalPythonStorageEngine

    logger.debug("CREATING DATA BLOCK")
    if isinstance(records, MemoryDataRecords):
        dro = records
        # Important: override nominal schema with DRO entry if it exists
        if dro.nominal_schema is not None:
            nominal_schema = env.get_schema(dro.nominal_schema, sess)
    else:
        # Wrap an arbitrary records object (e.g. a list of dicts) in a
        # MemoryDataRecords container
        dro = as_records(records, schema=nominal_schema)
    if not nominal_schema:
        # Fall back to the catch-all "Any" schema when none was supplied
        nominal_schema = env.get_schema("Any", sess)
    if not inferred_schema:
        # Let the data format infer a schema from the records themselves,
        # and register the generated schema with the environment
        inferred_schema = dro.data_format.infer_schema_from_records(
            dro.records_object
        )
        env.add_new_generated_schema(inferred_schema, sess)
    realized_schema = cast_to_realized_schema(
        env, sess, inferred_schema, nominal_schema
    )
    # Conform the in-memory records to the realized schema before storing
    dro = dro.conform_to_schema(realized_schema)
    block = DataBlockMetadata(
        id=get_datablock_id(),
        inferred_schema_key=inferred_schema.key if inferred_schema else None,
        nominal_schema_key=nominal_schema.key,
        realized_schema_key=realized_schema.key,
        record_count=dro.record_count,
        created_by_node_key=created_by_node_key,
    )
    sdb = StoredDataBlockMetadata(  # type: ignore
        id=get_datablock_id(),
        data_block_id=block.id,
        data_block=block,
        storage_url=local_storage.url,
        data_format=dro.data_format,
    )
    sess.add(block)
    sess.add(sdb)
    # sess.flush([block, sdb])
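    # Write the conformed records into local python storage under the block's name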
    local_storage.get_api().put(sdb.get_name(), dro)
    return block, sdb
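
A minimal usage sketch for the function above, under the same caveats: `env`, `sess`, and `local_storage` (the local python Storage) are assumed to already exist, and the records and node key are illustrative.

# Hypothetical caller; env, sess, and local_storage are assumed to exist.
records = [
    {"customer_id": 1, "total": 100.0},
    {"customer_id": 2, "total": 250.0},
]
block, sdb = create_data_block_from_records(
    env,
    sess=sess,
    local_storage=local_storage,
    records=records,
    created_by_node_key="import_totals",  # hypothetical node key
)
sess.commit()  # persist the metadata rows added to the session
print(block.record_count, sdb.get_name())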