linqdm_filter
    3. Creates all tables in the Fact database.
        linqdm_fdn
    4. Creates all tables in the Base database; for now, this database will have only DIH tables.
        dih
       """  
    
    # Set the batch id from Airflow dag run
    setbatch = getpythonoperator("BatchId", getBatchId)
    batchid = "{{ ti.xcom_pull(key='batchId', task_ids='Run_BatchId') }}"

    for db in database:

        with TaskGroup(group_id="{}_Tab".format(db)) as run_stage0:
            stagetaskgrp = []
            with TaskGroup(group_id="{}_S2HS".format(db)) as run_stage1:
                for tabname in database[db]["tabname"]:

                    taskname = "CRT_{}_{}".format(db, tabname)
                    taskid = 'TA_' + taskname
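                    # Decode the base64-encoded password, kinit for a Kerberos ticket, then
                    # ssh to the edge node (forwarding the ticket via GSSAPI delegation) and
                    # run the hiveload script with: table, batch id, 'ddl', database, db type.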
                    commands = "base64 -d <<< {} | kinit {}@{} && ssh -o StrictHostKeyChecking=no -o GSSAPIAuthentication=yes -oGSSAPIDelegateCredentials=yes {}@{} '{}'".format(password, kinitprincipal, kinitdomain, kinitprincipal, edgenodehost, "{} {}  {} {} {} {}".format(scriptpaths["hiveload"], tabname , batchid, 'ddl', db, database[db]["type"]))
                    ssh_create_stage = getbashoperator(taskname, False, commands)
                    stagetaskgrp.append(ssh_create_stage)
        group.append(run_stage0)

    dummyop = DummyOperator(task_id='NoOP')

setbatch >> group >> dummyop
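
# None of these snippets show getBatchId, getpythonoperator or getbashoperator.
# The sketch below is one way such helpers could look, inferred from the
# xcom_pull templates (key 'batchId', task id 'Run_BatchId') and the 'TA_'
# task-id prefix used above; the names, signatures and the unused middle flag
# are assumptions, not the project's actual code.
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator


def getBatchId(**context):
    # Derive a batch id from the triggering dag run and publish it under the
    # XCom key 'batchId' so downstream tasks can template it via xcom_pull.
    context["ti"].xcom_push(key="batchId", value=context["dag_run"].run_id)


def getpythonoperator(name, python_callable):
    # PythonOperator wrapper following the 'Run_<name>' task-id convention.
    return PythonOperator(task_id="Run_{}".format(name),
                          python_callable=python_callable)


def getbashoperator(taskname, trigger_flag, command):
    # BashOperator wrapper following the 'TA_<taskname>' task-id convention;
    # the middle flag's purpose is not visible in these snippets, so it is ignored.
    return BashOperator(task_id="TA_" + taskname, bash_command=command)
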
    
edgenodehost = kinitparms["edgenodehost"]
password = Variable.get("kinit_passwd")

default_args = {
    'owner': dset["owner"],
    'depends_on_past': dset["depends_on_past"],
    'start_date': datetime.utcnow(),
    'email': dset["email"],
    'email_on_failure': dset["email_on_failure"],
    'email_on_retry': dset["email_on_retry"],
    'concurrency': dset["concurrency"],
    'retries': dset["retries"],
    'retry_delay': timedelta(minutes=dset["retry_delay"])
    }
	
with DAG(dset["name"], default_args=default_args, schedule_interval=None, dagrun_timeout=timedelta(minutes=dset['dagrun_timeout'])) as dag:

    # Set the batch id from Airflow dag run
    setbatch = getpythonoperator("BatchId", getBatchId)

    batchid = "{{ ti.xcom_pull(key='batchId', task_ids='Run_BatchId') }}"

    
    # Running spark job
    #command="base64 -d <<< {} | kinit {}@{} && ssh -o StrictHostKeyChecking=no -o GSSAPIAuthentication=yes -o GSSAPIDelegateCredentials=yes {}@{} '{}'".format(password,kinitprincipal, kinitdomain, kinitprincipal, edgenodehost, "cd {} && sh {}".format(scriptpaths["cdcommand"],scriptpaths["sparkscript"]))
    command = "base64 -d <<< {} | kinit {}@{} && ssh -o StrictHostKeyChecking=no -o GSSAPIAuthentication=yes -o GSSAPIDelegateCredentials=yes {}@{} '{}'".format(password, kinitprincipal, kinitdomain, kinitprincipal, edgenodehost, "{}".format(scriptpaths["tag"]))
    
    taskname = "spark_transmission_run"
    atlas_tag = getbashoperator(taskname, False, command)

setbatch >> atlas_tag
Example #3
password = Variable.get("kinit_passwd")

default_args = {
    'owner': dset["owner"],
    'depends_on_past': dset["depends_on_past"],
    'start_date': datetime.utcnow(),
    'email': dset["email"],
    'email_on_failure': dset["email_on_failure"],
    'email_on_retry': dset["email_on_retry"],
    'concurrency': dset["concurrency"],
    'retries': dset["retries"],
    'retry_delay': timedelta(minutes=dset["retry_delay"])
    }
	
with DAG(dset["name"], default_args=default_args, schedule_interval=None, dagrun_timeout=timedelta(minutes=dset['dagrun_timeout'])) as dag:

    # Set the batch id from Airflow dag run
    setbatch = getpythonoperator("BatchId", getBatchId)

    batchid = "{{ ti.xcom_pull(key='batchId', task_ids='Run_BatchId') }}"

    
    # Running spark job
    #command="base64 -d <<< {} | kinit {}@{} && ssh -o StrictHostKeyChecking=no -o GSSAPIAuthentication=yes -o GSSAPIDelegateCredentials=yes {}@{} '{}'".format(password,kinitprincipal, kinitdomain, kinitprincipal, edgenodehost, "cd {} && sh {}".format(scriptpaths["cdcommand"],scriptpaths["sparkscript"]))
    validations = []
    for tabconf in confs:
        command = "base64 -d <<< {} | kinit {}@{} && ssh -o StrictHostKeyChecking=no -o GSSAPIAuthentication=yes -o GSSAPIDelegateCredentials=yes {}@{} '{}'".format(password, kinitprincipal, kinitdomain, kinitprincipal, edgenodehost, "cd {} && sh {} -c {}".format(scriptpaths["cdcommand"], scriptpaths["validationscript"], tabconf))

        # Give each validation task a unique id so Airflow does not reject duplicates
        taskname = "validation_hbase_hdfs_{}".format(tabconf)
        validation = getbashoperator(taskname, False, command)
        validations.append(validation)
    dummyop1 = DummyOperator(task_id='NoOP')
setbatch >> validations >> dummyop1
Example #4
    'retry_delay': timedelta(minutes=dset["retry_delay"])
    }

with DAG(dset["name"], default_args=default_args, schedule_interval=None, dagrun_timeout=timedelta(minutes=dset['dagrun_timeout'])) as dag:

    # Set the batch id from Airflow dag run
    setbatch = getpythonoperator("BatchId", getBatchId)

    batchid = "{{ ti.xcom_pull(key='batchId', task_ids='Run_BatchId') }}"

    
    # Running spark job
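    # Same kinit + ssh pattern as the table loads: decode the password, obtain a
    # Kerberos ticket, then cd into the script directory and run the Spark job
    # script (scriptpaths["sparkscript"]) on the edge node.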
    command="base64 -d <<< {} | kinit {}@{} && ssh -o StrictHostKeyChecking=no -o GSSAPIAuthentication=yes -o GSSAPIDelegateCredentials=yes {}@{} '{}'".format(password,kinitprincipal, kinitdomain, kinitprincipal, edgenodehost, "cd {} && sh {}".format(scriptpaths["cdcommand"],scriptpaths["sparkscript"]))
    
    taskname = "spark_transmission_run"
    ssh_spark = getbashoperator(taskname, False, command)

    # Task Groups for the Spark-to-stage and fact table loads
  
    group =[]
    
    for stagegrp in sparkjobs:

        with TaskGroup(group_id="{}_SparktoFact".format(stagegrp)) as run_stage0:

            stagetaskgrp = []
            with TaskGroup(group_id="{}_SparktoStage".format(stagegrp)) as run_stage1:  
            
                for landtab in sparkjobs[stagegrp]["landtab"]:

                    dbname, tabname = landtab.split('.')
            #    for stagedeptab in snowsqljobs[stagegrp]["depstage"]:
            #        dbname, tabname = stagedeptab.split('.')
            #        taskname = "DEPSTG_{}_{}".format(dbname, tabname)
            #        taskid = 'TA_' + taskname
            #        commands = "base64 -d <<< {} | kinit {}@{} && ssh -o StrictHostKeyChecking=no -o GSSAPIAuthentication=yes -o GSSAPIDelegateCredentials=yes {}@{} '{}'".format(password, kinitprincipal, kinitdomain, kinitprincipal, edgenodehost, "{} {} {} {} {} {}".format(scriptpaths["hiveload"], tabname , batchid,  'dml', dbname, 'stage'))
            #        ssh_stage = getbashoperator(taskname, False, commands)
            #        depstagetaskgrp.append(run_depstage)

            facttaskgrp = []
            with TaskGroup(
                    group_id="{}_FactLoad".format(stagegrp)) as run_fact:

                for table in snowsqljobs[stagegrp]["facttabs"]:

                    dbname, tabname = table.split('.')
                    taskname = "FCT_{}".format(table)
                    taskid = 'TA_' + taskname
                    commands = "base64 -d <<< {} | kinit {}@{} && ssh -o StrictHostKeyChecking=no -o GSSAPIAuthentication=yes -o GSSAPIDelegateCredentials=yes {}@{} '{}'".format(
                        password, kinitprincipal, kinitdomain, kinitprincipal,
                        edgenodehost,
                        "{} {} {} {} {} {}".format(scriptpaths["hiveload"],
                                                   tabname, batchid, 'dml',
                                                   dbname, 'fact'))
                    ssh_fact = getbashoperator(taskname, False, commands)
                    facttaskgrp.append(ssh_fact)

            # stage load must finish before the fact load (the dep-stage group is commented out above)
            run_stage1 >> run_fact
            #run_stage1 >> run_depstage >> run_fact
        group.append(run_stage0)
    dummyop1 = DummyOperator(task_id='DIHLODCMP')
setbatch >> group >> dummyop1
    'retries': dset['retries']
}

with DAG(dset["name"],
         default_args=default_args,
         schedule_interval=None,
         dagrun_timeout=timedelta(minutes=dset['dagrun_timeout'])) as dag:

    # Set the batch id from Airflow dag run
    setbatch = getpythonoperator("BatchId", getBatchId)

    batchid = "{{ ti.xcom_pull(key='batchId', task_ids='Run_BatchId') }}"
    # get the Kinit task
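    # Unlike the ssh-based load commands elsewhere, this pipes the stored password
    # to kinit as-is rather than base64-decoding it first.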
    command = 'echo {} | kinit {}@{}'.format(password, kinitprincipal,
                                             kinitdomain)
    bash_kinit = getbashoperator("KinitEdge", False, command)

    # Task Group for Sqoop, validation and stage table load

    group = []

    for stagegrp in sqoopjobs:

        with TaskGroup(
                group_id="{}_SqooptoFact".format(stagegrp)) as run_stage0:

            stagetaskgrp = []
            with TaskGroup(
                    group_id="{}_SqooptoStage".format(stagegrp)) as run_stage1:

                for landtab in sqoopjobs[stagegrp]["landtab"]:
Example #7
    dag.doc_md = __doc__  # providing that you have a docstring at the beginning of the DAG
    dag.doc_md = """
    This DAG runs the method validation for all the Sqoop jobs. It runs after all the fact tables are loaded. The job performs row-count match and data-mismatch checks, and the results are stored in a Hive table for further analysis. """
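    # The baseval script itself is not shown in this snippet. Purely as an
    # illustration (an assumption about its internals, not the real script),
    # a row-count match for one table amounts to comparing
    #   SELECT COUNT(*) FROM <source table> WHERE batch_id = '<batchid>'
    # against the same count on the landing side, and inserting a pass/fail row
    # plus any mismatched keys into a Hive results table for later analysis.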
    
    # Set the batch id from Airflow dag run
    setbatch = getpythonoperator("BatchId", getBatchId)
    batchid = "{{ ti.xcom_pull(key='batchId', task_ids='Run_BatchId') }}"

    for db in database:

        with TaskGroup(group_id="{}_Db".format(db)) as run_stage0:
            stagetaskgrp = []
            
            with TaskGroup(group_id="{}_MVAL".format(db)) as run_stage1:
                for tabname in database[db]["tables"]:

                    taskname = "MVAL_{}_{}".format(db, tabname)
                    taskid = 'TA_' + taskname
                    command="base64 -d <<< {} | kinit {}@{} && ssh -o StrictHostKeyChecking=no -o GSSAPIAuthentication=yes -o GSSAPIDelegateCredentials=yes {}@{} '{}'".format(password, kinitprincipal, kinitdomain, kinitprincipal, edgenodehost, "{} {} {} {} {}".format(scriptpaths["baseval"], db, tabname, dset["src2land"][db], batchid))
                    ssh_valid = getbashoperator(taskname, False, command)
                    stagetaskgrp.append(ssh_valid)
        group.append(run_stage0)

    dummyop = DummyOperator(task_id='NoOP')

setbatch >> group >> dummyop
    
Example #8
    'concurrency': dset['concurrency'],
    'retries': dset['retries']
}

with DAG(dset["name"],
         default_args=default_args,
         schedule_interval=None,
         dagrun_timeout=timedelta(minutes=dset['dagrun_timeout'])) as dag:

    # Set the batch id from Airflow dag run
    setbatch = getpythonoperator("BatchId", getBatchId)

    batchid = "{{ ti.xcom_pull(key='batchId', task_ids='Run_BatchId') }}"
    # get the Kinit task
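    # (no @REALM suffix here, so kinit falls back to the default realm from krb5.conf)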
    command = 'echo {} | kinit {}'.format(password, kinitprincipal)
    bash_kinit = getbashoperator("KinitEdge", False, command)

    # Task Group for Sqoop, validation and stage table load

    group = []

    for stagegrp in sqoopjobs:

        with TaskGroup(
                group_id="{}_SqooptoFact".format(stagegrp)) as run_stage0:

            stagetaskgrp = []
            with TaskGroup(
                    group_id="{}_SqooptoStage".format(stagegrp)) as run_stage1:

                for landtab in sqoopjobs[stagegrp]["landtab"]: