Example #1
def start_up(search_config_id: str,
             search_config_url: str = None,
             search_config_title: str = None,
             detail_config_id: str = None,
             metadata_config_url: str = None,
             detail_config_url: str = None,
             detail_config_title: str = None):
    conf = get_config()
    search_config_index_name = conf.get_string("search-config.index-name")
    search_config = conf.get_config("presets.file-search.search-config")

    detail_config_index_name = conf.get_string("detail-config.index-name")
    detail_config = conf.get_config("presets.file-search.detail-config")

    if detail_config_id is None:
        detail_config_id = search_config_id

    es_client = start()
    try:
        auth_header = get_admin_auth_header()
        if not es_client.exists(search_config_index_name,
                                search_config_id,
                                headers=auth_header):
            create_search_config(es_client,
                                 search_config_index_name,
                                 search_config_id,
                                 search_config,
                                 search_config_url,
                                 search_config_title,
                                 headers=auth_header)

        if not es_client.exists(detail_config_index_name,
                                detail_config_id,
                                headers=auth_header):
            create_detail_config(es_client,
                                 detail_config_index_name,
                                 detail_config_id,
                                 detail_config,
                                 metadata_config_url,
                                 detail_config_url,
                                 detail_config_title,
                                 headers=auth_header)
    finally:
        es_client.close()
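
For reference, a minimal usage sketch; the config id, URLs, and titles below are made-up placeholder values, not values taken from the project:

# Hypothetical call; every argument value here is a placeholder.
start_up(search_config_id="file-search",
         search_config_url="http://localhost:8000/config/search",
         search_config_title="File search",
         detail_config_url="http://localhost:8000/config/detail",
         detail_config_title="File details")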
Example #2
def start():
    conf = get_config().get_config("elasticsearch")
    return OpenDistro(**conf)
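
A short sketch of how this factory is consumed in the other snippets, assuming OpenDistro exposes the same client surface as elasticsearch-py (only exists, indices.exists, search, and close appear in these examples):

# Sketch based on the calls visible in Examples #1 and #3.
es_client = start()
try:
    print(es_client.indices.exists("sample-data"))
finally:
    es_client.close()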
Example #3
import logging

from config.utils import get_admin_auth_header, get_config
from start_client import start

from .loaddata import (create_sample_index, load_detail_config,
                       load_sample_config, load_sample_data)

index_name = "sample-data"

conf = get_config()


def start_up():
    es_client = start()
    try:
        auth_header = get_admin_auth_header()
        if not es_client.indices.exists(index_name, headers=auth_header):
            create_sample_index(es_client, index_name, headers=auth_header)
            load_sample_data(es_client,
                             index_name,
                             num_docs=30,
                             headers=auth_header)
            load_sample_config(
                es_client,
                index_name=conf.get_string("search-config.index-name"),
                headers=auth_header)
            load_detail_config(
                es_client,
                index_name=conf.get_string("detail-config.index-name"),
                headers=auth_header)
    finally:
        es_client.close()
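
A hedged sketch of wiring a start-up hook like this into a FastAPI application; the app object and the event registration are assumptions, not part of the original snippet:

from fastapi import FastAPI

app = FastAPI()

# Hypothetical wiring; the original project may register the hook differently.
@app.on_event("startup")
def initialize_sample_index() -> None:
    start_up()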
Example #4
def run_job(args, config):
    app_name_template = config.get('global').get('description')
    project_name = args["--project"]
    country = args["--country"]

    # read spark/hadoop information
    save_hdfs = (args["--save-hdfs"].lower() == "true")
    remove_from_local_hdfs = (
        args["--remove-from-local-hdfs"].lower() == "true")

    # file system saving info
    dependent_schema_dir = args["--dependent-schema-dir"]
    base_directory = args["--base-directory"]
    project_schema = "dw_{}_{}".format(project_name, country)
    save_dir = "{}/{}".format(base_directory, project_schema)

    target_database_config = {
        "type": args["--target-type"],
        "hostname": args["--target-hostname"],
        "port": args["--target-port"],
        "username": args["--target-username"],
        "password": args["--target-password"],
        "database": args["--target-database"],
        "schema": args["--target-schema"]
    }

    # s3 bucket write information
    use_s3_dist_cp = (args["--use-s3-dist-cp"].lower() == "true")
    aws_access_key = args["--aws-access-key"]
    aws_secret_key = args["--aws-secret-key"]
    s3_bucket = args["--s3-bucket-name"]

    # file related args
    input_file_type = args["--input-file-type"]
    output_file_type = args["--output-file-type"]
    separator = args["--csv-separator"]
    header = (args["--csv-header"].lower() == "true")
    infer_schema = (args["--infer-schema"].lower() == "true")

    # validate that the s3 interface is consistent with the given options
    validate_s3_interface(use_s3_dist_cp, s3_bucket)

    metadata_config = config.get('metadata')
    project_config_file = '../config/dimensions-facts/{}/{}.yml'.format(
        project_name, country)
    project_config = get_config(project_config_file).get('default')

    app_name = app_name_template.format(project_name, country,
                                        output_file_type)
    spark = create_spark_session(aws_access_key, aws_secret_key, app_name)

    dimensions_config = project_config.get("dimensions")
    facts_config = project_config.get("facts")

    dependent_schemas = project_config.get("dependency")

    # load all tables (stages) from all dependent schemas to calculate dimensions and facts
    for schema in dependent_schemas:
        logging.info("loading tables from {} schema".format(schema))
        schema_name = dependent_schemas.get(schema)
        stage_tables_info = get_table_info(metadata_config, schema_name)

        # for each stage table found, load a temporary view for spark
        for stage_table in stage_tables_info:
            table_name = stage_table[0]
            load_dependent_table(spark, dependent_schema_dir, schema_name,
                                 table_name, input_file_type)

    # load user defined functions (udf) to use as spark sql functions
    load_spark_sql_udf(spark)

    # for each dimension
    for dimension in dimensions_config:
        # unpersist dataframe from cache after write
        unpersist_after_write = True
        # get sql file
        dim_file = dimensions_config.get(dimension).get("file")
        # get partition column list
        dim_partition = dimensions_config.get(dimension).get("partition")
        if dim_partition is not None:
            dim_partition = dim_partition.replace(" ", "").split(',')

        _process_dim_fact(spark=spark,
                          project_name=project_name,
                          project_schema=project_schema,
                          country=country,
                          dim_fact_name=dimension,
                          dim_fact_file=dim_file,
                          dim_fact_partition=dim_partition,
                          output_file_type=output_file_type,
                          separator=separator,
                          header=header,
                          use_s3_dist_cp=use_s3_dist_cp,
                          s3_bucket=s3_bucket,
                          save_hdfs=save_hdfs,
                          remove_from_local_hdfs=remove_from_local_hdfs,
                          save_dir=save_dir,
                          database_config=target_database_config,
                          unpersist_after_write=unpersist_after_write,
                          infer_schema=infer_schema)
        logging.info("Dimension process {} finished".format(dimension))

    # for each fact
    for fact in facts_config:
        # unpersist dataframe from cache after write
        unpersist_after_write = True
        # get sql file
        fact_file = facts_config.get(fact).get("file")
        # get partition column list
        fact_partition = facts_config.get(fact).get("partition")
        if fact_partition is not None:
            fact_partition = fact_partition.replace(" ", "").split(',')

        _process_dim_fact(spark=spark,
                          project_name=project_name,
                          project_schema=project_schema,
                          country=country,
                          dim_fact_name=fact,
                          dim_fact_file=fact_file,
                          dim_fact_partition=fact_partition,
                          output_file_type=output_file_type,
                          separator=separator,
                          header=header,
                          use_s3_dist_cp=use_s3_dist_cp,
                          s3_bucket=s3_bucket,
                          save_hdfs=save_hdfs,
                          remove_from_local_hdfs=remove_from_local_hdfs,
                          save_dir=save_dir,
                          database_config=target_database_config,
                          unpersist_after_write=unpersist_after_write,
                          infer_schema=infer_schema)
        logging.info("Fact process {} finished".format(fact))

    # spark is no longer needed
    stop_spark_context(spark)
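
For orientation, a sketch of the docopt-style argument dictionary that run_job expects; the keys mirror the args[...] lookups above, while every value is a made-up placeholder:

# Placeholder values only; the keys are taken from the lookups inside run_job.
args = {
    "--project": "sales",
    "--country": "br",
    "--save-hdfs": "true",
    "--remove-from-local-hdfs": "false",
    "--dependent-schema-dir": "/data/stage",
    "--base-directory": "/data/dw",
    "--target-type": "postgres",
    "--target-hostname": "localhost",
    "--target-port": "5432",
    "--target-username": "etl",
    "--target-password": "changeme",
    "--target-database": "dw",
    "--target-schema": "public",
    "--use-s3-dist-cp": "false",
    "--aws-access-key": "",
    "--aws-secret-key": "",
    "--s3-bucket-name": "example-bucket",
    "--input-file-type": "parquet",
    "--output-file-type": "parquet",
    "--csv-separator": ";",
    "--csv-header": "true",
    "--infer-schema": "false",
}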
Example #5


if __name__ == '__main__':
    args = docopt(__doc__, version='1')
    # configure log
    set_log(args['--log-level'])

    env = os.getenv('env_type', 'default')
    config = get_config('../config/load-dim-fact.yml').get(env)
    run_job(args, config)
Example #6
def preview_router(index_name: str, cache_path: str, path_property: str, tags: List[str] = ["preview"]):
    router = APIRouter()
    query_builder = ElasticsearchAPIQueryBuilder()

    conf = get_config()
    manager = PreviewManager(cache_path, create_folder=True)

    @query_builder.filter()
    def filter_config(id: str = Path(None,
                                     description="Id of the document to preview.")):
        return {
            "ids": {
                "values": [id]
            }
        }

    @router.get("/preview/{id}", tags=tags)
    async def preview(
            page: Optional[int] = Query(0,
                                        ge=0,
                                        description="The page of the document to generate the preview."),
            width: Optional[int] = Query(300,
                                         ge=1,
                                         le=1024,
                                         description="The width of the generated preview."),
            height: Optional[int] = Query(200,
                                          ge=1,
                                          le=1024,
                                          description="The height of the generated preview."),
            query_body: Dict = Depends(query_builder.build(source=[path_property])),
            es_client: Elasticsearch = Depends(get_client),
            auth_header: Dict = Depends(get_auth_header)) -> FileResponse:
        resp = es_client.search(
            body=query_body,
            headers=auth_header,
            index=index_name
        )
        if resp["hits"]["total"]["value"] > 0:
            document_path = resp["hits"]["hits"][0]["_source"][path_property]
            path_to_preview_image = manager.get_jpeg_preview(document_path,
                                                             page=page,
                                                             width=width,
                                                             height=height,
                                                             )
            return FileResponse(path_to_preview_image)
        else:
            raise HTTPException(status_code=404, detail="Document not found")

    @router.get("/preview/info/{id}", tags=tags, response_model=PreviewInfoModel)
    async def preview_info(
            query_body: Dict = Depends(query_builder.build(source=[path_property])),
            es_client: Elasticsearch = Depends(get_client),
            auth_header: Dict = Depends(get_auth_header)) -> PreviewInfoModel:
        resp = es_client.search(
            body=query_body,
            headers=auth_header,
            index=index_name
        )
        if resp["hits"]["total"]["value"] > 0:
            document_path = resp["hits"]["hits"][0]["_source"][path_property]
            if os.path.isfile(document_path):
                supported = manager.has_jpeg_preview(document_path)
                pages = manager.get_page_nb(document_path)
                return PreviewInfoModel(supported=supported, pages=pages)
            else:
                return PreviewInfoModel(supported=False, pages=0)
        else:
            raise HTTPException(status_code=404, detail="Document not found")

    return router
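
A hedged sketch of mounting this router on a FastAPI application; the index name, cache path, and property name are placeholders, not values from the project:

from fastapi import FastAPI

app = FastAPI()

# Placeholder arguments; only the parameter names come from preview_router's signature.
app.include_router(preview_router(index_name="sample-data",
                                  cache_path="/tmp/preview-cache",
                                  path_property="path"))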
Example #7
import base64

from fastapi import Form
from fastapi.security import HTTPBearer
from opendistro import OpenDistro
from passlib.context import CryptContext
from pydantic import BaseModel
from starlette.responses import JSONResponse

from .basic import auth_header as basic_auth_header

security = HTTPBearer(bearerFormat="JWT")


def load_secret_key(jwt_config):
    secret_key = load_secret(jwt_config, "secret-file", "secret-key")
    return base64.b64decode(secret_key).decode("utf-8")


app_config = get_config()
jwt_config = app_config.get_config("jwt")
secret_key = load_secret_key(jwt_config)
algorithm = jwt_config.get_string("algorithm")
access_token_expires_minutes = jwt_config.get_int(
    "access-token-expire-minutes")


class Token(BaseModel):
    access_token: str
    token_type: str


class UserPasswordForm:
    def __init__(self, username: str = Form(...), password: str = Form(...)):
        self.username = username
        self.password = password
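
The snippet stops before the token-issuing code; as a hedged illustration of how the loaded settings could be used, here is a sketch with python-jose (the library choice, helper name, and claim names are assumptions):

from datetime import datetime, timedelta

from jose import jwt  # assumption: the project may use a different JWT library


def create_access_token(username: str) -> str:
    # Hypothetical helper; the claim names are illustrative only.
    expires = datetime.utcnow() + timedelta(minutes=access_token_expires_minutes)
    claims = {"sub": username, "exp": expires}
    return jwt.encode(claims, secret_key, algorithm=algorithm)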
Example #8
                                        file_type=file_type,
                                        separator=separator,
                                        header=header,
                                        infer_schema=infer_schema)
            if use_s3_dist_cp:
                # write dataframe into s3 through s3-dist-cp binary
                send_s3_using_s3_dist_cp(save_dir=save_dir,
                                         project_name=project_name,
                                         table_name=table_name,
                                         s3_bucket=s3_bucket,
                                         file_type=file_type)

        # delete temporary hdfs files
        delete_hdfs_file(remove_from_local_hdfs, source_schema, table_name)

        logging.info("Extract process for table {} finished".format(table_name))
    # close spark context
    stop_spark_context(spark)

if __name__ == '__main__':
    args = docopt(__doc__, version='1')
    # configure log
    set_log(args['--log-level'])

    # .get('default').get('metadata').get('test')
    config_file_path = args['--config-file']
    env = os.getenv('env_type', 'default')
    config = get_config(config_file_path).get(env)

    run_job(args, config)
Example #9
def init_config(app: web.Application) -> None:
    app['config'] = get_config()
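
A minimal sketch of calling this initializer while building an aiohttp application; only init_config comes from the snippet, the surrounding setup is assumed:

from aiohttp import web

app = web.Application()
init_config(app)  # stores the loaded configuration under app['config']
web.run_app(app)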