Example #1
def main():
    env, env_ver, app, user_id, pwd, key_store, jdbc_ref = arg_handle()
    envvars.populate(env, env_ver, "bdh", "data_ingest")
    #keystore_root = get_keystore_root(key_store,app, \
    #                   envvars.list['hdfs_common_keystore_root'], \
    #                   envvars.list['hdfs_service_keystore_root'])
    keystore_root = envvars.list['hdfs_' + key_store + '_keystore_root']
    print "keystore root = ", keystore_root
    alias_name = get_alias_name(key_store, jdbc_ref, user_id)
    key_provider = "jceks://hdfs/" + keystore_root + "/" + user_id.lower(
    ) + ".jceks"

    #command to delete alias
    hadoop_cmd = "hadoop credential delete " + alias_name + " -f -provider " + key_provider
    print "delete mannually (if needed) alias using command ==> "
    print "     " + hadoop_cmd
    rc, status = commands.getstatusoutput(hadoop_cmd)
    print(status)

    #command to create alias
    hadoop_cmd = "hadoop credential create " + alias_name + " -provider " + key_provider
    print "Generating alias using command ==> "
    print "     " + hadoop_cmd + " -v " + pwd.replace("\$", "\\\$")
    hadoop_cmd = hadoop_cmd + " -v " + pwd.replace("$", "\$")
    rc, status = commands.getstatusoutput(hadoop_cmd)
    print(status)
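
The arg_handle() helper used above is not shown. As a rough sketch only (the optparse option names and flags below are assumptions, not the original code), it could look like this:

import optparse

def arg_handle():
    # Hypothetical parser returning the seven values unpacked in main().
    parser = optparse.OptionParser()
    parser.add_option("-e", "--env", dest="env")
    parser.add_option("-r", "--env_ver", dest="env_ver")
    parser.add_option("-a", "--app", dest="app")
    parser.add_option("-u", "--user_id", dest="user_id")
    parser.add_option("-p", "--pwd", dest="pwd")
    parser.add_option("-k", "--key_store", dest="key_store")
    parser.add_option("-j", "--jdbc_ref", dest="jdbc_ref")
    options, _ = parser.parse_args()
    return (options.env, options.env_ver, options.app, options.user_id,
            options.pwd, options.key_store, options.jdbc_ref)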
Example #2
def main():

    options, args = arg_handle()

    envvars.populate(options.env, options.env_ver, options.app,
                     options.sub_app)
    config_file_path = envvars.list[
        'lfs_app_config'] + "/ingest/" + options.config_file
    if not os.path.isfile(config_file_path):
        print "run_gen.py           -> ERROR:   config file " + config_file_path + " does not exists ***"
        sys.exit(1)
    print "**************************************************************************************************"
    args = " ".join([
        envvars.list['lfs_global_scripts'] + "/getmetadata.py ",
        "-e " + options.env.strip(), "-a " + options.app,
        "-u " + options.sub_app, " -v " + options.env_ver,
        " -k " + options.key_store.strip(), " -s " + options.config_file
    ])
    getmetadata_script = "python " + args
    if (options.step == "all") or (options.step == "1"):
        print(
            "run_gen.py           -> STEP-1    : ************************************************************************"
        )
        print("run_gen.py           -> Invoked   : " + getmetadata_script)
        call = subprocess.Popen(getmetadata_script.split(' '),
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        while True:
            line = call.stdout.readline()
            if not line:
                break
            print line.strip()
            sys.stdout.flush()
        call.communicate()

        rc = call.returncode
        if rc != 0:
            print "run_gen.py           -> getting  metadata using " + args + " is not successful."
            sys.exit(1)
        else:
            print "run_gen.py           -> getting  metadata command was successful."
    if (options.step == '1'):
        sys.exit(0)
    print "**************************************************************************************************"

    args = " ".join([
        envvars.list['lfs_global_scripts'] + "/generate.py -s",
        options.config_file, "-m", envvars.list['lfs_app_config'] +
        "/ingest/" + options.config_file + ".meta", "-w",
        envvars.list['lfs_app_workflows'] + "/wf_db_ingest", "-k",
        options.key_store, "-e " + options.env.strip(), "-a " + options.app,
        "-u " + options.sub_app, "-v " + options.env_ver
    ])
    generate_script = "python " + args
    if (options.step == "all") or (options.step == "2"):
        print(
            "run_gen.py           -> STEP-2    : ************************************************************************"
        )
        print("run_gen.py           -> Invoked   : " + generate_script)
        call = subprocess.Popen(generate_script.split(' '),
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        while True:
            line = call.stdout.readline()
            if not line:
                break
            print line.strip()
            sys.stdout.flush()
        call.communicate()

        rc = call.returncode
        if rc != 0:
            print "run_gen.py           -> Generating create scripts and properties file is not successful."
            sys.exit(1)
        else:
            print "run_gen.py           -> generating create scripts and properties files was successful."
    if (options.step == '2'):
        sys.exit(0)
    print "**************************************************************************************************"

    if (options.step == "all") or (options.step == "3"):

        print(
            "run_gen.py           -> STEP-3    : ************************************************************************"
        )
        args = " ".join([
            envvars.list['lfs_global_scripts'] + "/run_hive_create.py",
            "--app " + options.app, "--subapp " + options.sub_app,
            "--env " + options.env, "--op0 " + envvars.list['lfs_app_config'] +
            "/ingest/" + options.config_file + ".list", "--op1 " +
            envvars.list['lfs_global_config'] + "/oozie_global.properties",
            "--env_ver " + options.env_ver,
            "--op2 " + envvars.list['lfs_app_workflows'] + "/wf_db_ingest",
            "--op3 " + envvars.list['lfs_app_src'] + "/hive"
        ])

        hivecreate_script = "python " + args
        print("run_gen.py           -> Invoked   : " + hivecreate_script)
        call = subprocess.Popen(hivecreate_script.split(' '),
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        while True:
            line = call.stdout.readline()
            if not line:
                break
            print line.strip()
            sys.stdout.flush()
        call.communicate()

        rc = call.returncode
        #os.system(hivecreate_script)
        if rc != 0:
            print "run_gen.py           -> Creating hive tables is not successful."
            print rc
            sys.exit(1)
        else:
            print "run_gen.py           -> Completed executing create table scripts."
    print "run_gen.py           -> Completed executing create table scripts."
    print "**************************************************************************************************"
Example #3
def main():
    global return_code, group, start_line
    sys.stdout.flush()
    common_properties, app, sub_app, env, env_ver, workflow_name, custom_date, file_name = arg_handle(
    )
    envvars.populate(env, env_ver, app, sub_app)

    log_time = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d_%H-%M-%S')

    log_file = envvars.list[
        'lfs_app_logs'] + "/run_job-" + workflow_name + '_' + log_time + '.log'
    print("LogFile: " + log_file)
    print("To Kill: kill " + str(os.getpid()))
    sys.stdout = open(log_file, 'a', 0)

    start_line = "".join('*' for i in range(100))
    print(start_line)
    print(
        "run_criss_indx_generatr_wf.py   -> Started   : " +
        datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y-%m-%d %H:%M:%S'))

    print envvars.list['lfs_app_wrk']
    final_properties = envvars.list[
        'lfs_app_wrk'] + '/' + env + '_' + workflow_name + '.properties'
    removefile(final_properties)

    properties_file = open(final_properties, 'wb')

    #  Concatenate global properties file and table properties file
    shutil.copyfileobj(open(common_properties, 'rb'), properties_file)

    appSpecificProperties = envvars.list[
        'lfs_app_workflows'] + '/' + workflow_name + '/job.properties'
    # appSpecificFile = Path(appSpecificProperties)

    #db = "hv_db_" + sub_app
    #dbName = envvars.list[db]

    todayDate = ""
    if custom_date is None:
        todayDate = date.today()
    else:
        todayDate = datetime.datetime.strptime(custom_date, '%Y-%m-%d')

    yesterdayDate = todayDate - timedelta(1)

    curDate_yyyy = todayDate.strftime('%Y')
    curDate_mm = todayDate.strftime('%m')
    curDate_yyyy_mm_dd = todayDate.strftime('%Y-%m-%d')

    yesterday_yyyy_mm_dd = yesterdayDate.strftime('%Y-%m-%d')
    yesterday_yyyymmdd = yesterdayDate.strftime('%Y%m%d')

    user = getpass.getuser()
    hadoop_user = user.upper()
    cmd = "rm " + hadoop_user + ".keytab"
    rcc, status_txt = commands.getstatusoutput(cmd)
    print "removing old keytabs if any...status=", status_txt
    cmd = "hdfs dfs -get /user/" + hadoop_user.lower(
    ) + "/" + hadoop_user + ".keytab"
    rcc, status_txt = commands.getstatusoutput(cmd)
    print "Getting keytab and status = ", status_txt

    rc, rec_cnt = lastPreviousWorkingDay(curDate_yyyy_mm_dd, env, hadoop_user)
    lastPreviousBusinessday_yyyy_mm_dd = rec_cnt.split('\t')[0]
    print "lastPreviousBusinessday_yyyy_mm_dd" + lastPreviousBusinessday_yyyy_mm_dd
    curDate_yyyy_mm_dd = lastPreviousBusinessday_yyyy_mm_dd

    emailList = ""
    rawDb = ""
    dbName = ""
    sourceFileName = ""
    targetTable = ""
    lookUpTable = ""
    mappingTable = ""

    if os.path.exists(appSpecificProperties):
        # load app specific job.properties
        envvars.load_file(appSpecificProperties)
        try:
            with open(appSpecificProperties) as fin:
                for line in fin:
                    args = line.split('=')
                    if args[0].strip() == "sourceFileName":
                        sourceFileName = args[1].strip()
                    if args[0].strip() == "mappingTable":
                        mappingTable = args[1].strip()
                    if args[0].strip() == "targetTable":
                        targetTable = args[1].strip()
                    if args[0].strip() == "lookupTable":
                        lookUpTable = args[1].strip()
                    if args[0].strip() == "email_list":
                        emailList = args[1].strip()

        except IOError as e:
            if e.errno != errno.ENOENT:
                raise IOError("exception file reading error")
            else:
                print("No joblist file found")

        shutil.copyfileobj(open(appSpecificProperties, 'rb'), properties_file)

    jobProperties = ""
    if emailList == "":
        #emailList = envvars.list['email_list']
        emailList = ['*****@*****.**']

    dbName = envvars.list['hv_db_efgifi']
    rawDb = envvars.list['hv_db_efgifi_stage']

    home = '/data/'
    path = os.path.dirname(os.path.realpath(__file__))
    print "path" + path
    root = path.split('efgifi/')[0]
    print root
    targethdfsFilePath = ""
    print "file_name :" + str(file_name)
    if file_name is None:
        print "File Name not specified"
    else:
        sourceFilePath = root + "/landingzone/efgifi/" + str(file_name)
        targetLocalFilePath = root + "/efgifi/wrk/" + str(file_name)
        targethdfsFilePath = '/bdp' + env + '/bdh/' + env_ver + '/str/raw/' + rawDb + "/" + "criss_ifind_delim_stg/"
        with open(sourceFilePath, "rb") as ebcdic:
            ascii_txt = codecs.decode(ebcdic.read(), "cp500")
        print "targetLocalFilePath :" + str(targetLocalFilePath)
        with io.open(targetLocalFilePath, mode='w',
                     encoding='utf-8') as target:
            target.write(
                ascii_txt.replace("DTL:",
                                  "\nDTL:").replace("HDR:", "\nHDR:").replace(
                                      "!",
                                      "|").replace("  ",
                                                   "").replace(" \n", "\n"))

        print "Before running hdfs put command ...."
        hdfs_put_cmd = "hdfs dfs -put -f " + targetLocalFilePath + " " + targethdfsFilePath
        print "--- running hdfs_put command -->   " + hdfs_put_cmd
        rc, status = commands.getstatusoutput(hdfs_put_cmd)
        if (rc > 0):
            print status
        else:
            print "source criss index file copied to hdfs  "

    basePath = '/bdp' + env + '/bdh/' + env_ver + '/str/pub/'

    jobProperties = '\n'.join([
        'app=' + app, 'basePath=' + basePath,
        'sourceFilePath=' + targethdfsFilePath + "/" + str(file_name),
        'mappingFilePath=' + basePath + dbName + "/" + mappingTable,
        'targetFilePath=' + basePath + dbName + "/" + targetTable +
        "/load_date=" + yesterday_yyyy_mm_dd, 'lookupFilePath=' + basePath +
        dbName + "/" + lookUpTable + "/load_date=" + yesterday_yyyy_mm_dd,
        'scriptLocation=/data/bdp' + env + '/bdh/' + env_ver + '/' + sub_app +
        '/code/scripts/', 'curDate_yyyy_mm_dd=' + curDate_yyyy_mm_dd,
        'yesterday_yyyy_mm_dd=' + yesterday_yyyy_mm_dd, 'yesterday_yyyymmdd=' +
        yesterday_yyyymmdd, 'lastPreviousDayBusinessday_yyyy_mm_dd=' +
        lastPreviousBusinessday_yyyy_mm_dd, 'happ=' + app,
        'oozieLib=' + envvars.list['oozie.libpath']
    ])

    properties_file.write(jobProperties)

    user = getpass.getuser()
    hadoop_user = user.upper()
    cmd = "rm " + hadoop_user + ".keytab"
    rcc, status_txt = commands.getstatusoutput(cmd)
    print "removing old keytabs if any...status=", status_txt
    cmd = "hdfs dfs -get /user/" + hadoop_user.lower(
    ) + "/" + hadoop_user + ".keytab"
    rcc, status_txt = commands.getstatusoutput(cmd)
    print "Getting keytab and status = ", status_txt

    properties_file.close()

    print("run_efgifi_workflow.py           -> FinalPrpty : " +
          final_properties)

    workflow = envvars.list['hdfs_app_workflows'] + '/' + workflow_name
    print workflow

    oozie_wf_cmd = "oozie job -oozie " + envvars.list['oozieNode'] + " -config "
    oozie_wf_cmd = oozie_wf_cmd + final_properties
    oozie_wf_cmd = oozie_wf_cmd + ' -Doozie.wf.application.path='
    oozie_wf_cmd = oozie_wf_cmd + workflow
    oozie_wf_cmd = oozie_wf_cmd + ' -debug -run'
    print("run_efgifi_workflow.py   -> Invoked   : " + oozie_wf_cmd)
    rc, jobid_str = commands.getstatusoutput(oozie_wf_cmd)

    if rc == 0:
        jobid_str = jobid_str.split('job: ')
        jobid = jobid_str[1].strip()
    else:
        print("run_efgifi_workflow.py   -> Failed    : " + jobid_str)
        return_code = 8
        sys.exit(return_code)

    print(
        jobid + "-> Started   : " + datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    getStatus(jobid, envvars.list['oozieNode'], curDate_yyyy_mm_dd,
              hadoop_user, dbName, targetTable, emailList)
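
The Oozie submission above builds the CLI command by concatenation and then parses the job id out of the "job: " line in the output. A hedged sketch of that step as a standalone helper (the function name is illustrative, not from the original script):

import commands  # Python 2 only

def submit_oozie_workflow(oozie_node, properties_path, workflow_path):
    # Submit a workflow and return its job id, or None on failure.
    cmd = ("oozie job -oozie " + oozie_node + " -config " + properties_path +
           " -Doozie.wf.application.path=" + workflow_path + " -debug -run")
    rc, out = commands.getstatusoutput(cmd)
    if rc != 0 or 'job: ' not in out:
        print("oozie submit failed: " + out)
        return None
    # A successful submit prints a line such as "job: 0000123-...-oozie-oozi-W".
    return out.split('job: ')[1].strip()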
Example #4
    print("getmetadata.py           -> MetadataLocation : " + outputFilePath) 


if __name__ == "__main__":
     parser = optparse.OptionParser()
     options = validateOptions()

     env = options.env
     env_ver = options.env_ver
     env_root = checkenv(options.env)
     
     common_root_path= env_root + "/bdh/01/global"

     sys.path.append( os.path.expanduser(common_root_path + "/code/scripts/") )
     import envvars
     envvars.populate(env,env_ver,options.app,options.sub_app)
     sqoopParamFilePath = envvars.list['lfs_app_config']+"/ingest/"+options.sqoopparams
     tableName = options.tableName
     print "getmetadata.py           -> sqoopParamFilePath = " + sqoopParamFilePath
     
     config = ConfigParser.ConfigParser()
     config.readfp(open(sqoopParamFilePath))


     appName = config.get("DataSourceInfo","app")
     app = appName
     sub_app = config.get("DataSourceInfo","sub_app")
     if appName.find("/") != -1:
        app = appName.split("/")[0]
        sub_app = appName.split("/")[1]
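
checkenv() and validateOptions() are defined elsewhere in getmetadata.py. A minimal sketch of what checkenv() appears to do, based on how env_root is used above (the accepted environment names and the path layout are assumptions):

import sys

def checkenv(env):
    # Hypothetical mapping of an environment name to its filesystem root.
    valid_envs = ("dev", "sit", "uat", "prd")
    if env not in valid_envs:
        print "getmetadata.py           -> ERROR: unknown environment " + str(env)
        sys.exit(1)
    return "/data/bdp" + env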
     
Example #5
def main():
    global return_code
    return_code = 0
    start_line = "".join('*' for i in range(100))
    print(start_line)
    print(
        "run_solr_index.py           -> Started    : " +
        datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    options = arg_handle()
    env = options.env.strip()
    env_ver = options.env_ver.strip()
    app = options.app.strip()
    sub_app = options.sub_app.strip()
    group = options.group.strip()
    # Get envvars from oozie_common_properties file
    envvars.populate(env, env_ver, app, sub_app)
    collection_name = options.collection_name.strip()
    actions = options.actions.strip()
    try:
        script_name = options.script_name.strip()
    except AttributeError as e:
        script_name = ""
    curr_date = datetime.datetime.fromtimestamp(
        time.time()).strftime("%Y-%m-%d")
    prev_load_date = (datetime.datetime.fromtimestamp(time.time()) +
                      datetime.timedelta(days=-1)).strftime("%Y-%m-%d")
    try:
        lower_bound = options.min_bound.strip()
    except AttributeError as e:
        lower_bound = prev_load_date
    try:
        upper_bound = options.max_bound.strip()
    except AttributeError as e:
        upper_bound = prev_load_date

    if collection_name is None or collection_name == "":
        print(
            "run_solr_index.py           -> Error    : Collection name is required "
        )
        sys.exit(8)
    if actions is None or actions == "":
        print(
            "run_solr_index.py           -> Error    : ACtions are required ex:insert,delete,create "
        )
        sys.exit(8)
    collection_folder = envvars.list['lfs_solr'] + '/' + collection_name
    if not os.path.isdir(collection_folder):
        print("run_solr_index.py           -> Error    : Collection folder " +
              collection_folder + " does not exists..")
        sys.exit(8)
    if not (os.path.exists(collection_folder + "/conf")
            and os.path.exists(collection_folder + "/conf/schema.xml")):
        print(
            "run_solr_index.py           -> Error    : Configuration folder is corrupt.."
        )
        sys.exit(8)

    title_trans = ''.join(
        chr(c) if (chr(c).isupper() or chr(c).islower() or chr(c).isdigit()
                   ) and chr(c) != '-' else '_' for c in range(256))
    solr_table_name = 'solr_' + collection_name + "_" + lower_bound
    solr_table_name = solr_table_name.translate(title_trans)

    if "delete" in actions.lower():
        solr_cmd = " ".join([
            "solrctl --solr", envvars.list['solr_server'], "--zk",
            envvars.list['zookeeper_ensemble'], "collection --delete ",
            collection_name
        ])
        rc, out = commands.getstatusoutput(solr_cmd)
        if rc != 0:
            print("run_solr_index.py           -> Failed      : " + solr_cmd +
                  ";RC : " + str(rc))
            print out
            #sys.exit(10)
        print("run_solr_index.py           -> Command  : " + solr_cmd +
              ";RC : " + str(rc))
        solr_cmd = " ".join([
            "solrctl --solr", envvars.list['solr_server'], "--zk",
            envvars.list['zookeeper_ensemble'], "instancedir --delete ",
            collection_name
        ])
        rc, out = commands.getstatusoutput(solr_cmd)
        if rc != 0:
            print out
            print("run_solr_index.py           -> Failed      : " + solr_cmd +
                  ";RC : " + str(rc))
            #sys.exit(10)
        print("run_solr_index.py           -> Command  : " + solr_cmd +
              ";RC : " + str(rc))

    if "create" in actions.lower():
        solr_cmd = " ".join([
            "solrctl --solr", envvars.list['solr_server'], "--zk",
            envvars.list['zookeeper_ensemble'], "instancedir --create ",
            collection_name, collection_folder
        ])
        rc, out = commands.getstatusoutput(solr_cmd)
        if rc != 0:
            print out
            print("run_solr_index.py           -> Failed      : " + solr_cmd +
                  ";RC : " + str(rc))
            sys.exit(10)
        print("run_solr_index.py           -> Command  : " + solr_cmd +
              ";RC : " + str(rc))
        solr_cmd = " ".join([
            "solrctl --solr", envvars.list['solr_server'], "--zk",
            envvars.list['zookeeper_ensemble'], "collection --create ",
            collection_name, "-s 1 -r 1 -m 1"
        ])
        rc, out = commands.getstatusoutput(solr_cmd)
        if rc != 0:
            print out
            print("run_solr_index.py           -> Failed      : " + solr_cmd +
                  ";RC : " + str(rc))
            sys.exit(10)
        print("run_solr_index.py           -> Command  : " + solr_cmd +
              ";RC : " + str(rc))
    final_properties = envvars.list[
        'lfs_app_wrk'] + '/' + env + '_' + app.replace(
            "/", "_") + '_' + solr_table_name + '.properties'
    # Remove if the file exists
    silentremove(final_properties)

    if "query" in actions.lower():
        # open the final properties file in write append mode
        properties_file = open(final_properties, 'wb')
        shutil.copyfileobj(
            open('/cloudera_nfs1/config/oozie_global.properties', 'rb'),
            properties_file)
        dynamic_properties = ""
        if script_name == "":
            print(
                "run_solr_index.py           -> Error    : Script name required for insert option"
            )
            sys.exit(8)
        else:
            script_name = options.script_name.strip()
            script_file = collection_folder + "/" + script_name
            hdfs_cmd = "hdfs dfs -put -f " + script_file + " " + envvars.list[
                'hdfs_app_workflows'] + '/wf_hive_query/'
            rc, out = commands.getstatusoutput(hdfs_cmd)
            if rc != 0:
                print("run_solr_index.py           -> Failed      : " +
                      hdfs_cmd + ";RC : " + str(rc))
                print out
                #sys.exit(10)

            dynamic_properties = '\n'.join([
                '\nenv=' + env, 'app=' + app, 'sub_app=' + sub_app,
                'group=' + group, 'happ=' + envvars.list['happ'], 'hv_db=' +
                envvars.list['hv_db_' + app + '_' + sub_app + '_stage'],
                'hv_db_stage=' +
                envvars.list['hv_db_' + app + '_' + sub_app + '_stage'],
                'hv_table=' + solr_table_name, 'table=' + solr_table_name,
                'stage_table=' + solr_table_name, 'hdfs_location=' +
                envvars.list['hdfs_str_raw'] + "/solr/" + solr_table_name,
                'hdfs_tmp_dir=' + envvars.list['hdfs_str_raw'] + "/tmp_" +
                solr_table_name, 'prev_load_date=' + prev_load_date,
                'hive_query=' + script_name, 'curr_date=' + curr_date,
                'min_bound=' + lower_bound, 'max_bound=' + upper_bound
            ])
        properties_file.write(dynamic_properties)
        properties_file.close()
        print("run_solr_index.py           -> DynmcPrpty : " +
              dynamic_properties.replace("\n", ", "))
        print("run_solr_index.py           -> FinalPrpty : " +
              final_properties)
        sys.stdout.flush()
        abc_parameter = env + ',' + env_ver + ',' + app + ',' + sub_app + ',' + group + ",run_solr_index.py"
        rc = runoozieworkflow(final_properties, abc_parameter)
        if rc > 0:
            sys.exit(rc)
    if "insert" in actions.lower():
        morphline_file_name = collection_name + ".conf"
        morphline_file_path = collection_folder + "/" + collection_name + ".conf"
        if not (os.path.exists(morphline_file_path)):
            print(
                "run_solr_index.py           -> Error    : Morphline conf file does not exist: "
                + morphline_file_path)
            sys.exit(8)
        solr_mr_cmd = "|".join([
            "hadoop|jar", envvars.list['solr_home'] +
            "/contrib/mr/search-mr-1.0.0-cdh5.5.4-job.jar|org.apache.solr.hadoop.MapReduceIndexerTool",
            "-D|'mapred.child.java.opts=-Xmx4G'",
            "-D|'mapreduce.reduce.memory.mb=8192'", "--morphline-file",
            morphline_file_path, "--output-dir",
            envvars.list['hdfs_str_raw'] + "/tmp_" + solr_table_name,
            "--verbose|--go-live|--zk-host",
            envvars.list['zookeeper_ensemble'], "--collection",
            collection_name,
            envvars.list['hdfs_str_raw'] + "/solr/" + solr_table_name
        ])
        env_var_cmd = ";".join([
            "export myDriverJarDir=/opt/cloudera/parcels/CDH/lib/solr/contrib/crunch",
            "export myDependencyJarDir=/opt/cloudera/parcels/CDH/lib/search/lib/search-crunch",
            "export myDriverJar=$(find $myDriverJarDir -maxdepth 1 -name 'search-crunch-*.jar' ! -name '*-job.jar' ! -name '*-sources.jar')",
            "export myDependencyJarFiles=$(find $myDependencyJarDir -name '*.jar' | sort | tr '\n' ',' | head -c -1)",
            "export myDependencyJarFiles=$myDependencyJarFiles,$(find /opt/cloudera/parcels/CDH/jars -name 'snappy-java-*.jar')",
            "export myDependencyJarPaths=$(find $myDependencyJarDir -name '*.jar' | sort | tr '\n' ':' | head -c -1)",
            'export myJVMOptions="-DmaxConnectionsPerHost=10000 -DmaxConnections=10000 -Dspark.yarn.maxAppAttempts=1"',
            "export SPARK_HOME=/opt/cloudera/parcels/CDH/lib/spark",
            "export SPARK_SUBMIT_CLASSPATH=/opt/cloudera/parcels/CDH/lib/search/lib/search-crunch/commons-codec-*.jar:$SPARK_HOME/assembly/lib/*:/opt/cloudera/parcels/CDH/lib/search/lib/search-crunch/*"
        ])
        #rc, export = commands.getstatusoutput(env_var_cmd)
        #if rc != 0:
        #   print("run_solr_index.py           -> Failed      : "+env_var_cmd+";RC : "+str(rc))
        #   print out
        #   sys.exit(10)
        rc, dependency_jars = commands.getstatusoutput(
            env_var_cmd + ';echo "$myDependencyJarFiles"')
        if rc != 0:
            print("run_solr_index.py           -> Failed      : " +
                  'echo "$myDependencyJarFiles"' + ";RC : " + str(rc))
            print dependency_jars
            sys.exit(10)
        print("run_solr_index.py           -> Dependency Jars : " +
              dependency_jars)
        rc, driver_jar = commands.getstatusoutput(env_var_cmd +
                                                  ';echo "$myDriverJar"')
        if rc != 0:
            print("run_solr_index.py           -> Failed      : " +
                  'echo "$myDriverJar"' + ";RC : " + str(rc))
            print driver_jar
            sys.exit(10)
        print("run_solr_index.py           -> driver_jar : " + driver_jar)
        rc, jvm_options = commands.getstatusoutput(env_var_cmd +
                                                   ';echo "$myJVMOptions"')
        if rc != 0:
            print("run_solr_index.py           -> Failed      : " +
                  'echo "$myJVMOptions"' + ";RC : " + str(rc))
            print jvm_options
            sys.exit(10)
        print("run_solr_index.py           -> jvm_options : " + jvm_options)
        rc, user_name = commands.getstatusoutput("echo $USER")
        token_file_name = env + '_' + app.replace(
            "/", "_") + '_solr_' + user_name.lower(
            ) + '_' + options.group.lower() + '_' + collection_name + '.token'
        token_file_path = envvars.list['lfs_app_wrk'] + '/' + token_file_name
        tokenCmd = " curl --negotiate -u: '" + envvars.list[
            'solr_server'] + "/?op=GETDELEGATIONTOKEN' > " + token_file_path
        rc, token_txt = commands.getstatusoutput(tokenCmd)
        if rc != 0:
            print("run_solr_index.py           -> Failed      : " + tokenCmd +
                  ";RC : " + str(rc))
            print token_txt
            sys.exit(10)
        log4jCmd = "echo $(ls /opt/cloudera/parcels/CDH/share/doc/search-*/search-crunch/log4j.properties)"
        rc, log4j_path = commands.getstatusoutput(log4jCmd)
        if rc != 0:
            print("run_solr_index.py           -> Failed      : " + log4jCmd +
                  ";RC : " + str(rc))
            print log4j_path
            sys.exit(10)
        solr_spark_cmd = "|".join([
            "spark-submit", "--master", "yarn", "--deploy-mode", "cluster",
            "--jars", dependency_jars, '--executor-memory', '6G',
            '--num-executors', '1', '--conf',
            '"spark.executor.extraJavaOptions=' + jvm_options + '"', '--conf',
            '"spark.driver.extraJavaOptions=' + jvm_options + '"', '--class',
            'org.apache.solr.crunch.CrunchIndexerTool', "--files",
            token_file_path + "," + morphline_file_path + "," + log4j_path,
            driver_jar, "-Dhadoop.tmp.dir=/tmp",
            "-Dspark.yarn.maxAppAttempts=1", "-DmorphlineVariable.ZK_HOST=" +
            envvars.list['zookeeper_ensemble'], "-DtokenFile=" +
            token_file_name, "--morphline-file", morphline_file_name,
            "--log4j|log4j.properties", "--pipeline-type|spark|--chatty",
            envvars.list['hdfs_str_raw'] + "/solr/" + solr_table_name
        ])
        solr_cmd = solr_spark_cmd
        call = subprocess.Popen(solr_cmd.split('|'),
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT)
        prev_line = ""
        line_count = 0
        while True:
            line = call.stdout.readline()
            if not line:
                break
            if prev_line[20:] != line.strip()[20:] or line_count > 1000:
                print line.strip()
                line_count = 0
                prev_line = line.strip()
            else:
                print prev_line
                line_count = line_count + 1
            sys.stdout.flush()
        call.communicate()

        if call.returncode != 0:
            print "run_solr_index.py           -> Failed      : " + solr_cmd + ";RC : " + str(
                call.returncode)
            sys.exit(10)
        print("run_solr_index.py           -> Command  : " + solr_cmd +
              ";RC : " + str(call.returncode))

    sys.stdout.flush()

    print(
        "fi_getfreddiemac.py           -> Ended      : " +
        datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    print start_line
    print "Return-Code:" + str(return_code)
    sys.exit(return_code)
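
silentremove() is referenced throughout these scripts but never shown. It is presumably the usual "delete if present" idiom; a minimal sketch:

import errno
import os

def silentremove(filename):
    # Delete a file, ignoring the error raised when it does not exist.
    try:
        os.remove(filename)
    except OSError as e:
        if e.errno != errno.ENOENT:
            raise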
Example #6
def main():
    global return_code
    return_code = 0
    start_line = "".join('*' for i in range(100))
    print(start_line)
    print("put_file.py             -> Started    : " + datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    table,field,field_type,field_rdbms_format,field_hadoop_format,file_reg,upper_bound,common_properties,app,sub_app,env,env_ver,group,ingest_type = arg_handle()
    print "field_type=" , field_type
    

    # Get envvars from oozie_common_properties file
    envvars.populate(env,env_ver,app,sub_app)   
    
    #Get Final Properties final name and path from variables
    final_properties = envvars.list['lfs_app_wrk'] + '/' + env + '_' + app.replace("/","_") + '_' + table + '.properties'
    

    # Remove if the file exists
    silentremove(final_properties)
    
    # open the final properties file in write append mode
    properties_file = open(final_properties, 'wb')
    if 'uzip' in ingest_type:
       unzip_cmd = 'unzip  "'+envvars.list['lfs_app_data'] + '/'+file_reg+'" -d "'+envvars.list['lfs_app_data'] + '/'+file_reg.rsplit('/',1)[0]+'/"'
       print("put_file.py             -> Command : "+unzip_cmd)
       sys.stdout.flush()
       rc, out = commands.getstatusoutput(unzip_cmd)
       if rc == 0:
          print("put_file.py             -> UNZIP command successful ")
          rm_cmd = 'rm '+envvars.list['lfs_app_data'] + '/'+file_reg
          rc, out = commands.getstatusoutput(rm_cmd)
          if rc!=0:
            print("put_file.py             -> File remove command failed" + str(out))
            sys.exit(1)
          else:
            print("put_file.py             -> Removed Files :"+envvars.list['lfs_app_data'] + '/'+file_reg)
       elif rc == 2304:
         print("put_file.py             -> No Files to be processed RC:" + str(rc))
         print(str(out))
       else:
         print("put_file.py             -> "+unzip_cmd+" command failed RC:" + str(rc))
         print(str(out))
         sys.exit(1)
    if 'rmfl' in ingest_type:
       rmfl_cmd = 'rm  '+envvars.list['lfs_app_data'] + '/'+file_reg
       print("put_file.py             -> Command : "+rmfl_cmd)
       sys.stdout.flush()
       rc, out = commands.getstatusoutput(rmfl_cmd)
       if rc == 0:
          print("put_file.py             -> File remove command successful ")
       elif rc == 256:
         print("put_file.py             -> No Files to be processed RC:" + str(rc))
         print(str(out))
       else:
         print("put_file.py             -> "+rmfl_cmd+" command failed RC:" + str(rc))
         print(str(out))
         sys.exit(1)
    sys.stdout.flush()
    if 'gzip' in ingest_type:
       gz_cmd = "gzip  "+envvars.list['lfs_app_data'] + '/'+file_reg
       rc, out = commands.getstatusoutput(gz_cmd)
       if rc == 0:
          print("put_file.py             -> GZIP command successful ")
          #Removing txt is not necessary as GZIPping will remove them
          #rm_cmd = 'rm '+envvars.list['lfs_app_data'] + '/'+file_reg
          #rc, out = commands.getstatusoutput(rm_cmd)
          #if rc!=0:
          #  print("put_file.py             -> File remove command failed" + str(out))
          #  sys.exit(1)
          #else:
          #  print("put_file.py             -> Removed Files :"+envvars.list['lfs_app_data'] + '/'+file_reg)
       elif rc == 256:
         print("put_file.py             -> No Files to be processed RC:" + str(rc))
         print(str(out))
       else:
         print("put_file.py             -> "+gz_cmd+" command failed RC:" + str(rc))
         print(str(out))
         sys.exit(1)
    if 'bzip' in ingest_type:
       bz_cmd = "bzip2  "+envvars.list['lfs_app_data'] + '/'+file_reg
       rc, out = commands.getstatusoutput(bz_cmd)
       if rc == 0:
          print("put_file.py             -> BZIP2 command successful ")
          #Removing txt is not necessary as bzip2 will remove the originals
          #rm_cmd = 'rm '+envvars.list['lfs_app_data'] + '/'+file_reg
          #rc, out = commands.getstatusoutput(rm_cmd)
          #if rc!=0:
          #  print("put_file.py             -> File remove command failed" + str(out))
          #  sys.exit(1)
          #else:
          #  print("put_file.py             -> Removed Files :"+envvars.list['lfs_app_data'] + '/'+file_reg)
       elif rc == 256:
         print("put_file.py             -> No Files to be processed RC:" + str(rc))
         print(str(out))
       else:
         print("put_file.py             -> "+bz_cmd+" command failed RC:" + str(rc))
         print(str(out))
         sys.exit(1)
    sys.stdout.flush()
    if 'rplc' in ingest_type:
       hdfs_loc = envvars.list['hdfs_str_raw'] + '/'+envvars.list['hv_db_'+app+'_'+sub_app+'_stage']+'/'+table+'/'
       put_cmd = 'hdfs dfs -put -f  '+envvars.list['lfs_app_data'] + '/'+file_reg+' '+ hdfs_loc
       rc, out = commands.getstatusoutput(put_cmd)
       if rc == 0:
          print("put_file.py             -> Command Sucessful: "+put_cmd)
          rm_cmd = 'rm '+envvars.list['lfs_app_data'] + '/'+file_reg
          rc, out = commands.getstatusoutput(rm_cmd)
          if rc!=0:
            print("put_file.py             -> File remove command failed" + str(out))
            sys.exit(1)
          else:
            print("put_file.py             -> Removed File :"+envvars.list['lfs_app_data'] + '/'+file_reg)
       elif rc == 256:
         print("put_file.py             -> No Files to be processed RC:" + str(rc))
         print(str(out))
       else:
         print("put_file.py             -> HDFS PUT command failed RC:" + str(rc))
         print(str(out))
         sys.exit(1)
       sys.stdout.flush()
    if 'extp' in ingest_type:
        try:
           quarter_offset = file_reg.replace('{mm}','mm').replace('{YYYY}','YYYY').index('{q}')
        except ValueError:
           quarter_offset = -1
        try:
           year_offset   = file_reg.replace('{mm}','mm').replace('{q}','q').index('{YYYY}')
        except ValueError:
           year_offset = -1
        try:
           month_offset   = file_reg.replace('{YYYY}','YYYY').replace('{q}','q').index('{mm}')
        except ValueError:
           month_offset = -1
        file_reg = file_reg.replace('.','\.')
        file_reg = file_reg.replace('{q}','[1-4]{1}')
        file_reg = file_reg.replace('{YYYY}','[0-9]{4}')
        file_reg = file_reg.replace('{mm}','[0-9]{2}')
        prog = re.compile(file_reg)
        lfs_data = envvars.list['lfs_app_data'] + '/'+table+'/'
        hdfs_data = envvars.list['hdfs_str_raw'] + '/'+table+'/'
        year = ""
        month = ""
        quarter = ""
        for f in listdir(lfs_data):
           if isfile(join(lfs_data, f)):
              if prog.match(f):
                 hdfs_path = hdfs_data
                 if year_offset != -1:
                     year = f[year_offset:year_offset+4]
                    hdfs_path = hdfs_path + "year="+str(f)[year_offset:year_offset+4]+"/"
                 if month_offset != -1:
                     month = f[month_offset:month_offset+2]
                    hdfs_path = hdfs_path +"month="+str(f)[month_offset:month_offset+2]+"/"
                 if quarter_offset != -1:
                     quarter = f[quarter_offset:quarter_offset+1]
                    hdfs_path = hdfs_path + "quarter="+str(f)[quarter_offset:quarter_offset+1]+"/" 
                 put_command = "hdfs dfs -put -f "+ join(lfs_data, f) + " " + hdfs_path
                 print(put_command)
                 rc, out = commands.getstatusoutput(put_command)
                 if rc == 0:
                  rm_command = "rm " + join(lfs_data, f)
                  #rc, out = commands.getstatusoutput(rm_command)
                 else:
                   print("hdfs command failed" + str(out))
                   sys.exit(1)

    sys.stdout.flush()                     

    print("put_file.py             -> Ended      : " + datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    
    print "put_file.py             -> Return-Code:" + str(return_code)
    print start_line
    sys.exit(return_code)
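
The comparisons against 256 and 2304 above work because commands.getstatusoutput() returns the raw wait() status, in which the shell exit code is shifted left by 8 bits; exit code 1 therefore shows up as 256 and exit code 9 as 2304. A small wrapper (a sketch, not part of put_file.py) makes the normalization explicit:

import commands  # Python 2 only

def run_shell(cmd):
    # Return (exit_code, output) with the wait status converted back to the
    # plain shell exit code.
    status, out = commands.getstatusoutput(cmd)
    return status >> 8, out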
Example #7
def main():
    global return_code, group, start_line
    return_code = 0
    start_line = "".join('*' for i in range(100))
    print(start_line)
    print(
        "run_oozie_workflow.py   -> Started   : " +
        datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    sys.stdout.flush()
    abc_parameter, workflow, options_file = arg_handle()
    abc_parameters = abc_parameter.split(',')
    env = abc_parameters[0]
    env_ver = abc_parameters[1]
    app = abc_parameters[2]
    sub_app = abc_parameters[3]
    group = abc_parameters[4]
    parent_script = abc_parameters[5]
    #config path for ABC logging
    # Get envvars from oozie_common_properties file
    envvars.populate(env, env_ver, app, sub_app)
    print("ABC Parameter" + abc_parameter)
    oozie_wf_cmd = "oozie job -oozie " + envvars.list['oozieNode'] + " -config "
    oozie_wf_cmd = oozie_wf_cmd + options_file
    oozie_wf_cmd = oozie_wf_cmd + ' -Doozie.wf.application.path='
    oozie_wf_cmd = oozie_wf_cmd + workflow
    oozie_wf_cmd = oozie_wf_cmd + ' -debug -run'
    print("run_oozie_workflow.py   -> Invoked   : " + oozie_wf_cmd)
    rc, jobid_str = commands.getstatusoutput(oozie_wf_cmd)
    if rc == 0:
        jobid_str = jobid_str.split('job: ')
        jobid = jobid_str[1].strip()
        abc_line = "|".join([
            group, jobid, "oozie", "run_oozie_workflow.py", "", "", "STARTED",
            getpass.getuser(), "oozie workflow started",
            str(datetime.datetime.today())
        ])
        print("**ABC_log**->" + abc_line)
        sys.stdout.flush()
    else:
        print("run_oozie_workflow.py   -> Failed    : " + jobid_str)
        return_code = 8
        sys.exit(return_code)

    print(
        jobid + "-> Started   : " + datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    sys.stdout.flush()
    #ABC logging

    #status = "RUNNING"
    #cnt = 0

    get_status(jobid, envvars.list['oozieNode'], "Main")
    abc_line = "|".join([
        group, jobid, "oozie", "run_oozie_workflow.py", "", "", "ENDED",
        getpass.getuser(), "oozie workflow Ended",
        str(datetime.datetime.today())
    ])
    print("**ABC_log**->" + abc_line)
    sys.stdout.flush()
    print(
        jobid + "-> Ended     : " + datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y-%m-%d %H:%M:%S'))

    abc_line = "|".join([
        group, "run_oozie_workflow.py", "python", parent_script, "", "",
        "ENDED",
        getpass.getuser(), "return-code:" + str(return_code),
        str(datetime.datetime.today())
    ])
    print("**ABC_log**->" + abc_line)
    sys.stdout.flush()
    print(
        "run_oozie_workflow.py   -> Ended     : " +
        datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y-%m-%d %H:%M:%S'))
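
get_status() is defined elsewhere in run_oozie_workflow.py. A hypothetical sketch of a polling implementation built on `oozie job -info` (the polling interval and output parsing are assumptions, not the original code):

import commands  # Python 2 only
import time

def get_status(jobid, oozie_node, label):
    # Poll the workflow until it leaves the RUNNING/PREP states.
    while True:
        rc, out = commands.getstatusoutput(
            "oozie job -oozie " + oozie_node + " -info " + jobid)
        status = "UNKNOWN"
        for line in out.splitlines():
            if line.startswith("Status"):
                status = line.split(":", 1)[1].strip()
        print(label + " -> " + jobid + " status: " + status)
        if status not in ("RUNNING", "PREP"):
            return status
        time.sleep(30)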
Example #8
def main():
    """main() is the driver function for entire parquet table load. From raw table will process table by table"""
    global return_code
    return_code = 0
    start_line = "".join('*' for i in range(100))
    print(start_line)
    print("file_ingest_parquet.py           -> Started    : " + datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    table, field, field_type, field_rdbms_format, field_hadoop_format, common_properties, app, sub_app, env, env_ver, group, wf_name, hive_script, partition_column, partition_frequency, scd_type, natural_keys, custom_date = arg_handle()
    print ("Workflow_Directory= " + wf_name + "Oozie Hive Ation .hql Name= " + str(hive_script))
    

    # Get envvars from oozie_common_properties file
    envvars.populate(env, env_ver, app, sub_app)   
    
    # Get Final Properties final name and path from variables
    final_properties = envvars.list['lfs_app_wrk'] + '/' + env + '_' + app.replace("/", "_") + '_' + table + '.properties'
    

    # Remove if the file exists
    silentremove(final_properties)
    
    # open the final properties file in write append mode
    properties_file = open(final_properties, 'wb')

    # Build the table properties file name and path from variables - file_ingest_parquet.py calls workflow based on the wf_name mentioned in jobnames.list (5th Parameter)
    table_properties = envvars.list['lfs_app_workflows'] + '/' + wf_name + '/' + table + '.properties'


    # load environment variables specific to the app
    envvars.load_file(table_properties) 
      
    #  Concatenate global properties file and table properties file
    shutil.copyfileobj(open(common_properties, 'rb'), properties_file)
    shutil.copyfileobj(open(table_properties, 'rb'), properties_file)
    
    # Get Database name from environment variables
    db = envvars.list['hv_db']
    db_stage = envvars.list['hv_db_stage']
    table = envvars.list['hv_table']
    
    # Raw Table's HDFS Directory
    hdfs_raw_dir = envvars.list['hdfs_str_raw_fileingest'] + "/" + db_stage
    
    # get time stamp to load the table
    hdfs_load_ts = "'" + str(datetime.datetime.now()) + "'"
    partitionColAlias = ""
    sys.stdout.flush() 
       
    todayDate = ""
    if custom_date is None:
        todayDate = date.today()
    else:
        todayDate = datetime.datetime.strptime(custom_date , '%Y-%m-%d')
        todayDate = todayDate + timedelta(1)

    yesterdayDate=todayDate - timedelta(1)
    curDate_yyyy=todayDate.strftime('%Y')
    curDate_mm=todayDate.strftime('%m')
    curDate_yyyy_mm_dd=todayDate.strftime('%Y-%m-%d')
    curDate_yyyymmdd=todayDate.strftime('%Y%m%d')

    user=getpass.getuser()
    hadoop_user=user.upper()
    cmd="rm " + hadoop_user +".keytab"
    rcc,status_txt=commands.getstatusoutput(cmd)
    print "removing old keytabs if any...status=", status_txt
    cmd="hdfs dfs -get /user/" + hadoop_user.lower() + "/" + hadoop_user + ".keytab"
    rcc,status_txt=commands.getstatusoutput(cmd)
    print "Getting keytab and status = ", status_txt

    # For Handling SCD Type Tables
    if(scd_type == "scd1" or scd_type == "merge"):
        partitionColAlias = 'B.'
        keys = natural_keys.split(",")
        numKeys = len(keys)
        for idx in range(0, numKeys):
            if(idx == 0):
                onClause = "on_clause= A." + keys[idx] + "=B." + keys[idx]
                nullClauseParquet = "null_clause_parquet= A." + keys[idx] + " IS NULL "
                nullClauseRaw = "null_clause_raw= B." + keys[idx] + " IS NULL "
            if(idx > 0):
                onClause = onClause + " and " + "A." + keys[idx] + "=B." + keys[idx]
                nullClauseParquet = nullClauseParquet + "and" + " A." + keys[idx] + " IS NULL "
                nullClauseRaw = nullClauseRaw + "and" + " B." + keys[idx] + " IS NULL "
    
     # For Partitioning # File Ingest
    date_string = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d')  
    month_string = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m')
    if (partition_column):
        # Retrieval of Date and Month String START
        mnthDate_cat_cmd = "hdfs dfs -cat " + hdfs_raw_dir + "/" + "load_month_date" + "/" + table + "_mnthDate"
                
        rc, cmd_out = commands.getstatusoutput(mnthDate_cat_cmd)
            
        if (rc == 0):
            print("file_ingest_rawdb.py           -> Retrieval of Date and Month is successful.")
            print ("Command Executed is : " + mnthDate_cat_cmd)
        else:
            print("file_ingest_rawdb.py            -> Retrieval of Date and Month is NOT successful.")
            print cmd_out
            print ("Command Executed is : " + mnthDate_cat_cmd)
            sys.exit(1)
        # Retrieval of Date and Month String END
        
        date_string = cmd_out.split('|')[2]
        month_string = cmd_out.split('|')[1]
        
        if sub_app == 'efgifi' and (table=='criss_stdfld_delim' or table=='criss_ifind_delim'):
            rc,rec_cnt=lastPreviousDay(curDate_yyyy_mm_dd, env, hadoop_user)
            date_string=rec_cnt.split('\t')[0]
            print "Inside efgifi lastPreviousDay loop date_string ="+date_string
        elif sub_app == 'efgifi':
            rc,rec_cnt=lastPreviousWorkingDay(curDate_yyyy_mm_dd, env, hadoop_user)
            date_string=rec_cnt.split('\t')[0]
            print "Inside efgifi lastPreviousWorkingDay loop date_string ="+date_string
            
        print "date_string ="+date_string
        
        partitionColumn = "partition_column=" + partition_column
        partition_clause = "partition_clause=partition ( " + partition_column + " )"
        if (partition_frequency == "daily"):
            partition_column_select = "partition_column_select=,'" + date_string + "'" 
        elif (partition_frequency == "monthly"):
            partition_column_select = "partition_column_select=,'" + month_string + "'"
        elif (partition_frequency == "monthly,daily"):
            partition_column_select = "partition_column_select=,'" + month_string + "','" + date_string + "'"
        elif (partition_frequency):
            partition_column_select = "partition_column_select=,'" + partition_frequency + "'"
        else:
            parCols = partition_column.split(",")
            numParCols = len(parCols)
            for idx in range(0, numParCols):
                if(idx == 0):
                    partition_column_with_qualifier = partitionColAlias + parCols[idx]
                if(idx > 0):
                    partition_column_with_qualifier += "," + partitionColAlias + parCols[idx]
            partition_column_select = "partition_column_select=," + partition_column_with_qualifier     
   
    print("file_ingest_parquet.py           -> DownloadTyp: Full Download of table ")
    
    dynamic_properties = '\n'.join(['\n', 'env=' + env,
                                    'app=' + app,
                                    'sub_app=' + sub_app,
                                    'group=' + group,
                                    'happ=' + envvars.list['happ'] ,
                                    'min_bound=' + "''",
                                    'max_bound=' + "''",
                                    'min_bound_hadoop=' + "''",
                                    'max_bound_hadoop=' + "''",
                                    'hdfs_load_ts=' + hdfs_load_ts])
                                    
    if(hive_script):
        dynamic_properties = dynamic_properties + '\n' + 'hive_script=' + hive_script
    if (partition_column):
        dynamic_properties = dynamic_properties + '\n ' + partitionColumn + '\n ' + partition_clause + '\n ' + partition_column_select
    if(scd_type == "scd1" or scd_type == "merge"):
        dynamic_properties = dynamic_properties + '\n ' + onClause + '\n ' + nullClauseParquet + '\n ' + nullClauseRaw
    
    # DATES and MONTHS
    daysCol = ""
    for dy in range(0, 7):
        daysCol += "day_" + str(dy) + "=" + str((datetime.datetime.fromtimestamp(time.time()) - datetime.timedelta(dy)).strftime('%Y-%m-%d')) + "\n"
    dynamic_properties = dynamic_properties + '\n ' + daysCol
    dynamic_properties = dynamic_properties + '\n ' + "where=1=1"
    dynamic_properties = dynamic_properties + '\n ' + "where_hadoop=1=1"
    abc_parameter = env + ',' + env_ver + ',' + app + ',' + sub_app + ',' + group + "," + table + "," 
                    
    # ABC logging parameter for oozie
    # print "env"+ env
    # abc_parameter = env+','+env_ver+','+app+','+sub_app+','+group+","+table+','+field+ lower_bound_hadoop +"to"+upper_bound_hadoop    

    properties_file.write(dynamic_properties)
    properties_file.close()
    print("file_ingest_parquet.py           -> CommnPrpty : " + common_properties) 
    print("file_ingest_parquet.py           -> TablePrpty : " + table_properties)
    print("file_ingest_parquet.py           -> DynmcPrpty : " + dynamic_properties.replace("\n", ", ")) 
    print("file_ingest_parquet.py           -> FinalPrpty : " + final_properties) 
    sys.stdout.flush()
     
     # ABC Logging Pending
    
    rc = runoozieworkflow(final_properties, abc_parameter, wf_name)
    print "Return-Code:" + str(rc)
    if rc > return_code:
       return_code = rc
    abc_line = "|".join([group, "file_ingest_parquet.py", "python", "run_job.py", str(table), "File_Ingest", "ENDED",
                         getpass.getuser(), "return-code:" + str(return_code), str(datetime.datetime.today())]) 
    print("**ABC_log**->" + abc_line)
    sys.stdout.flush()
    
    if (return_code == 0):
        impala_cmd = envvars.list['impalaConnect'] + ' "REFRESH ' + db + '.' + table + ';"'
        print impala_cmd
        rc, output = commands.getstatusoutput(impala_cmd)
        if (rc != 0):
            print("file_ingest_parquet.py           -> Ooozie Successful But Impala REFRESH FAILED      : " + datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))    
            print start_line
            print "Return-Code:" + str(return_code)
            sys.exit(rc)
    
    print("file_ingest_parquet.py           -> Ended      : " + datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    print start_line
    print "Return-Code:" + str(return_code)
    sys.exit(return_code)
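
The on_clause/null_clause strings for scd1 and merge loads are assembled index by index above. The same construction can be written as a small pure function (a sketch that produces equivalent strings, not the original code):

def build_scd_clauses(natural_keys):
    # Build the join and null-check clauses from a comma-separated key list.
    keys = [k.strip() for k in natural_keys.split(",")]
    on_clause = "on_clause= " + " and ".join("A." + k + "=B." + k for k in keys)
    null_clause_parquet = ("null_clause_parquet= " +
                           " and ".join("A." + k + " IS NULL" for k in keys))
    null_clause_raw = ("null_clause_raw= " +
                       " and ".join("B." + k + " IS NULL" for k in keys))
    return on_clause, null_clause_parquet, null_clause_raw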
Example #9
def main():
    global return_code
    return_code = 0
    start_line = "".join('*' for i in range(100))
    print(start_line)
    print("run_ingest.py           -> Started    : " +
          datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    table, field, field_type, field_rdbms_format, field_hadoop_format, lower_bound, upper_bound, common_properties, app, sub_app, env, env_ver, group, ingest_type, common_date = arg_handle(
    )
    print "field_type=", field_type

    # Get envvars from oozie_common_properties file
    envvars.populate(env, env_ver, app, sub_app)

    #Get Final Properties final name and path from variables
    final_properties = envvars.list[
        'lfs_app_wrk'] + '/' + env + '_' + app.replace(
            "/", "_") + '_' + table + '.properties'

    # Remove if the file exists
    silentremove(final_properties)

    # open the final properties file in write append mode
    properties_file = open(final_properties, 'wb')

    # Build the table properties file name and path from variables run_ingest only calls wf_db_ingest workflow
    table_properties = envvars.list[
        'lfs_app_workflows'] + '/wf_db_ingest/' + table + '.properties'
    rm_ctlM = "sed -i -e 's/\r$//' " + table_properties
    rc, status = commands.getstatusoutput(rm_ctlM)
    print("run_ingest.py           -> removing ^M characters in file: " +
          rm_ctlM + " Status:" + str(rc))
    # get time stamp to load the table
    hdfs_load_ts = "'" + str(common_date).replace("_", " ") + "'"
    common_date_tfmt = datetime.strptime(common_date, '%Y-%m-%d_%H:%M:%S.%f')
    log_time = common_date_tfmt.strftime('%Y-%m-%d_%H-%M-%S')
    log_date = common_date_tfmt.strftime('%Y-%m-%d')
    log_folder = envvars.list['lfs_app_logs'] + "/" + log_date
    log_file = log_folder + "/run_job-" + group + '_' + log_time + '.log'
    envvars.list['hdfs_load_ts'] = hdfs_load_ts

    #load environment variables specific to the app
    envvars.load_file(table_properties)

    #  Concatenate global properties file and table properties file
    shutil.copyfileobj(open(common_properties, 'rb'), properties_file)
    shutil.copyfileobj(open(table_properties, 'rb'), properties_file)

    #Get Database name from environment variables
    db = envvars.list['hv_db']
    table = envvars.list['hv_table']

    sys.stdout.flush()
    if ingest_type == 'sync':
        sourceStats = get_stats_sqoop(table, envvars.list['where_column'])
        targetStats = get_stats_impala(db, table, envvars.list['where_column'])
        #print("Source Result:"+str(sourceStats))
        #print("Target Result:"+str(targetStats))
        whereClause = ""
        whereHClause = ""
        for key in sourceStats:
            if key in targetStats:
                if sourceStats[key] != targetStats[key]:
                    if whereClause == "":
                        whereClause = whereClause + envvars.list[
                            'where_column'] + "=to_timestamp('" + key + "', 'yyyy-mm-dd hh24:mi:ss.FF')"
                    else:
                        whereClause = whereClause + " or " + envvars.list[
                            'where_column'] + "=to_timestamp('" + key + "', 'yyyy-mm-dd hh24:mi:ss.FF')"
            else:
                if whereClause == "":
                    whereClause = whereClause + envvars.list[
                        'where_column'] + "=to_timestamp('" + key + "', 'yyyy-mm-dd hh24:mi:ss.FF')"
                else:
                    whereClause = whereClause + " or " + envvars.list[
                        'where_column'] + "=to_timestamp('" + key + "', 'yyyy-mm-dd hh24:mi:ss.FF')"

        dynamic_properties = '\n'.join([
            '\nenv=' + env, 'app=' + app, 'sub_app=' + sub_app,
            'group=' + group, 'happ=' + envvars.list['happ'],
            'where=' + whereClause, 'log_file=' + log_file,
            'hdfs_load_ts=' + hdfs_load_ts
        ])

    elif ingest_type == 'incr':
        if field is None:
            print(
                "run_ingest.py           -> ERROR: Incremental SQOOP cannot be performed without a where column "
            )
            return_code = 2
            sys.exit(return_code)
        print(
            "run_ingest.py           -> DownloadTyp: Partial Download based on where condition "
        )
        # Check if the lower_date range is passed from jobnames.list file
        if lower_bound is None or lower_bound == "":
            # lower_date range is not found check for presence of exclusions file
            lower_bound, upper_bound = get_exception_args(
                envvars.list['lfs_app_config'], table)

            # lower_date is still none get lower date from impala table
            if lower_bound is None and field is not None and db is not None:
                lower_bound = get_min_bound_impala(db, table, field,
                                                   field_type)
                if lower_bound is None or lower_bound == "":
                    print(
                        "run_ingest.py           -> LowerBound: Cannot be determined. Use Sync option"
                    )
                    return_code = 2
                    sys.exit(return_code)
                else:
                    print("run_ingest.py           -> LowerBound: Min date " +
                          lower_bound + " is determined from Impala table")
            elif lower_bound is None and field is None:
                print(
                    "run_ingest.py           -> Arguments error: lower_bound or field or entry in exception file is expected"
                )
                return_code = 2
                sys.exit(return_code)
            else:
                print(
                    "run_ingest.py           -> LowerBound : Min date is determined from exclusions file"
                )
        else:
            print(
                "run_ingest.py           -> LowerBound : Min date is determined from jobnames.list file"
            )

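        # upper bound defaults when not supplied: BDW-derived date for msrmnt_prd_id,
        # now() for timestamp columns, 99999999 for int columns, otherwise today's date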
        if upper_bound is None or upper_bound == "":
            curr_dt = str(datetime.now().date())
            if field.strip().lower() == "msrmnt_prd_id":
                print "run_ingest.py           -> Upper_bound      : BDW table date used " + str(
                    curr_dt)
                upper_bound = get_bdw_date_from_id(db, curr_dt)
            elif field_type.lower() == "timestamp":
                upper_bound = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
            elif field_type.lower() == "int":
                upper_bound = '99999999'
                print("run_ingest.py           -> UpperBound :  is 99999999")
            else:
                upper_bound = curr_dt
                print(
                    "run_ingest.py           -> UpperBound : Max Date is current date"
                )
        else:
            print(
                "run_ingest.py           -> UpperBound : Max Date source is same as Min date"
            )
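        # for timestamp (or blank) where-column types, optionally shift the lower bound
        # by lower_bound_modifier_days from ingest_special.properties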
        if field_type.strip().lower() == "timestamp" or field_type.lower() == "":
            ingest_special_args = get_ingest_special_args(
                envvars.list['lfs_app_config'], table)
            if "lower_bound_modifier_days" in ingest_special_args:
                try:
                    val = int(ingest_special_args["lower_bound_modifier_days"].
                              strip())
                    print("run_ingest.py           -> LowerBound Modifier:" +
                          str(val))
                    lower_bound = datetime.strptime(
                        lower_bound,
                        "%Y-%m-%d %H:%M:%S.%f") + timedelta(days=val)
                    lower_bound = str(lower_bound)
                    print(
                        "run_ingest.py           -> LowerBound : updated to " +
                        lower_bound + " from ingest_special.properties file")
                except ValueError:
                    print(
                        "lower_bound_modifier is not an int! " +
                        str(ingest_special_args["lower_bound_modifier_days"]) +
                        "!")
        if field_type.lower() == "timestamp" and envvars.list['datasource'] == "oracle":
            lower_bound_f = "to_timestamp('" + lower_bound + "','YYYY-MM-DD HH24:MI:SS.FF')"
            upper_bound_f = "to_timestamp('" + upper_bound + "','YYYY-MM-DD HH24:MI:SS.FF')"
        else:
            lower_bound_f = lower_bound
            upper_bound_f = upper_bound
        dynamic_properties = '\n'.join([
            '\nenv=' + env, 'app=' + app, 'sub_app=' + sub_app,
            'group=' + group, 'log_file=' + log_file,
            'happ=' + envvars.list['happ'], 'min_bound=' + lower_bound_f,
            'max_bound=' + upper_bound_f, 'hdfs_load_ts=' + hdfs_load_ts
        ])

        if field_type.lower() == "int":
            dynamic_properties = dynamic_properties + '\n ' + "where=${where_column} between ${min_bound}  and ${max_bound}"
            dynamic_properties = dynamic_properties + '\n ' + "where_hadoop=${where_column} between ${min_bound} and ${max_bound}"
            abc_parameter = env + ',' + env_ver + ',' + app + ',' + sub_app + ',' + group + "," + table + ',' + field + lower_bound + "to" + upper_bound
        elif (field_type is None or field_type == ""
              or field_type.lower() == "date" or field_type.lower() == "timestamp"):
            field_rdbms_format = determine_default_field_format(
                field_rdbms_format)
            field_hadoop_format = determine_default_field_format(
                field_hadoop_format)
            if field_type.lower() == "timestamp":
                field_rdbms_format = '%Y-%m-%d %H:%M:%S.%f'
                field_hadoop_format = '%Y-%m-%d %H:%M:%S.%f'
            lower_bound_validated = validate_date_format(
                lower_bound, field_rdbms_format)
            upper_bound_validated = validate_date_format(
                upper_bound, field_rdbms_format)
            lower_bound_hadoop = lower_bound_validated.strftime(
                field_hadoop_format)
            upper_bound_hadoop = upper_bound_validated.strftime(
                field_hadoop_format)
            dynamic_properties = '\n'.join([
                dynamic_properties, 'min_bound_hadoop=' + lower_bound_hadoop,
                'max_bound_hadoop=' + upper_bound_hadoop
            ])
            if field_type.lower() == "timestamp" and envvars.list['datasource'] == "oracle":
                dynamic_properties = dynamic_properties + '\n ' + "where=${where_column} between ${min_bound} and ${max_bound}"
                dynamic_properties = dynamic_properties + '\n ' + "where_hadoop=${where_column} between '${min_bound_hadoop}' and '${max_bound_hadoop}'"
            else:
                dynamic_properties = dynamic_properties + '\n ' + "where=${where_column} between '${min_bound}' and '${max_bound}'"
                dynamic_properties = dynamic_properties + '\n ' + "where_hadoop=${where_column} between '${min_bound_hadoop}' and '${max_bound_hadoop}'"
            abc_parameter = env + ',' + env_ver + ',' + app + ',' + sub_app + ',' + group + "," + table + ',' + field + lower_bound_hadoop + "to" + upper_bound_hadoop
        else:
            dynamic_properties = dynamic_properties + '\n ' + "where=${where_column} between ${min_bound} and ${max_bound}"
            dynamic_properties = dynamic_properties + '\n ' + "where_hadoop=${where_column} between ${min_bound} and ${max_bound}"
            abc_parameter = env + ',' + env_ver + ',' + app + ',' + sub_app + ',' + group + "," + table + ',' + field + lower_bound + "to" + upper_bound
    else:
        print(
            "run_ingest.py           -> DownloadTyp: Full Download of table ")
        dynamic_properties = '\n'.join([
            '\nenv=' + env, 'app=' + app, 'sub_app=' + sub_app,
            'group=' + group, 'log_file=' + log_file,
            'happ=' + envvars.list['happ'], 'min_bound=' + "''",
            'max_bound=' + "''", 'min_bound_hadoop=' + "''",
            'max_bound_hadoop=' + "''", 'hdfs_load_ts=' + hdfs_load_ts
        ])
        dynamic_properties = dynamic_properties + '\n ' + "where=1=1"
        if envvars.list['hive_query'].strip().lower() == 'hv_ins_stg_fnl_audit.hql':
            dynamic_properties = dynamic_properties + '\n ' + "where_hadoop=as_of_date=" + hdfs_load_ts
        else:
            dynamic_properties = dynamic_properties + '\n ' + "where_hadoop=1=1"
        abc_parameter = env + ',' + env_ver + ',' + app + ',' + sub_app + ',' + group + "," + table + ","

    #ABC logging parameter for oozie
    #print "env"+ env
    #abc_parameter = env+','+env_ver+','+app+','+sub_app+','+group+","+table+','+field+ lower_bound_hadoop +"to"+upper_bound_hadoop

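    # append the run-specific properties to the concatenated global + table properties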
    properties_file.write(dynamic_properties)
    properties_file.close()
    print("run_ingest.py           -> CommnPrpty : " + common_properties)
    print("run_ingest.py           -> TablePrpty : " + table_properties)
    print("run_ingest.py           -> DynmcPrpty : " +
          dynamic_properties.replace("\n", ", "))
    print("run_ingest.py           -> FinalPrpty : " + final_properties)
    sys.stdout.flush()
    # ABC Logging Started
    parameter_string = ""
    if lower_bound is not None and lower_bound != "":
        parameter_string = field + " " + lower_bound + " " + upper_bound
    comments = "Properties file name :" + final_properties
    abc_line = "|".join([
        group, "run_ingest.py", "python", "run_job.py",
        str(table), parameter_string, "RUNNING",
        getpass.getuser(), comments,
        str(datetime.today())
    ])
    print("**ABC_log**->" + abc_line)
    abc_parameter = env + ',' + env_ver + ',' + app + ',' + sub_app + ',' + group + ",run_ingest.py"
    sys.stdout.flush()
    rc = runoozieworkflow(final_properties, abc_parameter)
    print "Return-Code:" + str(rc)
    if rc > return_code:
        return_code = rc
    abc_line = "|".join([
        group, "run_ingest.py", "python", "run_job.py",
        str(table), parameter_string, "ENDED",
        getpass.getuser(), "return-code:" + str(return_code),
        str(datetime.today())
    ])
    print("**ABC_log**->" + abc_line)
    sys.stdout.flush()

    print("run_ingest.py           -> Ended      : " +
          datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    print start_line
    print "Return-Code:" + str(return_code)
    sys.exit(return_code)
Ejemplo n.º 10
0
def main():
    
    options = arg_handle()
    #BEELINE_URL='jdbc:hive2://lbdp164a.uat.pncint.net:10000/default;principal=hive/[email protected]'
    envvars.populate(options.env,options.env_ver,options.app,options.sub_app) 
    BEELINE_URL="beeline -u '"+envvars.list['hive2JDBC']+"principal="+envvars.list['hive2Principal']+"'"
    envvars.list['hdfs_load_ts'] = str(datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S.%f'))

    print "run_hive_create.py             -> Beeline connect command - "+ BEELINE_URL
    #Read the list of table names from the generated file. 
    try:
        with open(options.inputFile) as fin:
            for line in fin:
                print "run_hive_create.py             -> ***************table name = " +line.strip() +"*********************"
                sys.stdout.flush()
                tableName=line.strip()
                envvars.clearList()
                envvars.populate(options.env,options.env_ver,options.app,options.sub_app)
                envvars.load_file(options.table_properties_path+"/"+tableName+".properties") 
                #stageTargetDir=envvars.list['stg_target_dir']
                #targetDir=envvars.list['target_dir']
                
                
                print "run_hive_create.py             ->       creating stage table - " +envvars.list['hv_db_stage']+"."+envvars.list['stage_table'] 
                print "run_hive_create.py             ->       stage target_dir = " + envvars.list['stg_target_dir']
                # comment/uncomment the print below to show/hide the exact beeline command used to create the Hive stage table
                beeline_cmd = " ".join([BEELINE_URL,
                                        "-hiveconf hv_db_stage="+envvars.list['hv_db_stage'],
                                        " -hiveconf hv_db="+envvars.list['hv_db'],
                                        " -hiveconf stage_table="+tableName.strip(),
                                        " -hiveconf stg_target_dir="+envvars.list['stg_target_dir'],
                                        "-f",
                                        options.table_create_path+"/"+tableName+"_create_stg.hql"])
                print "run_hive_create.py             ->       " + beeline_cmd 
                
                #beeline -u ${BEELINE_URL} -hiveconf hv_db_stage="${hv_db_stage}" -hiveconf hv_db="${hv_db}" -hiveconf stage_table="$tableName" -hiveconf stg_target_dir="${stageTargetDir}" --silent=true -f "${table_create_path}"/${tableName}_create_stg.hql
                #hive -hiveconf hv_db_stage="${hv_db_stage}" -hiveconf hv_db="${hv_db}" -hiveconf stage_table="$tableName" -hiveconf stg_target_dir="${stageTargetDir}" -f "${table_create_path}"/${tableName}_create_stg.hql
                sys.stdout.flush()
                rc = os.system(beeline_cmd)
                sys.stdout.flush()
                #1>/tmp/${USER}.log
                if rc != 0:
                    print "run_hive_create.py             -> Create stage table script failed. Please validate and fix, then re-run."
                    sys.exit(1)
                   #hdfs dfs -chmod -R 777 ${stageTargetDir} 2>&1
                
                print "run_hive_create.py             ->       creating final table - " + envvars.list['hv_db']+"."+envvars.list['hv_table'] 
                #print "run_hive_create.py             ->       final target_dir = " + envvars.list['target_dir']
                # comment/uncomment the print below to show/hide the exact beeline command used to create the final parquet table
                
                beeline_cmd = " ".join([BEELINE_URL,
                                        "-hiveconf hv_db_stage="+envvars.list['hv_db_stage'],
                                        " -hiveconf hv_db="+envvars.list['hv_db'],
                                        " -hiveconf table="+tableName.strip(),
                                        #" -hiveconf target_dir="+envvars.list['target_dir'],
                                        "-f",
                                        options.table_create_path+"/"+tableName+"_create_parquet.hql"])
                
                print "run_hive_create.py             ->      " + beeline_cmd
                
                #beeline -u ${BEELINE_URL} -hiveconf hv_db_stage="${hv_db_stage}" -hiveconf hv_db="${hv_db}" -hiveconf table="$table" -hiveconf target_dir="${targetDir}" --silent=true -f "${table_create_path}"/${tableName}_create_parquet.hql 
                #hive -hiveconf hv_db_stage="${hv_db_stage}" -hiveconf hv_db="${hv_db}" -hiveconf table="$table" -hiveconf target_dir="${targetDir}" -f "${table_create_path}"/${tableName}_create_parquet.hql 
                
                #1>/tmp/${USER}.log
                sys.stdout.flush()
                rc = os.system(beeline_cmd)
                sys.stdout.flush() 
                if rc != 0:
                    print "run_hive_create.py             -> Create table script failed. Please validate and fix, then re-run."
                    sys.exit(1)
                print "run_hive_create.py             ->       Invalidate metadata - " +envvars.list['hv_db']+"."+envvars.list['hv_table'] 
                
                
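                # refresh Impala's metadata cache so the newly created tables are visible immediately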
                impalaCmd=envvars.list['impalaConnect']+"' invalidate metadata "+envvars.list['hv_db']+"."+envvars.list['hv_table']+";invalidate metadata "+envvars.list['hv_db_stage']+"."+envvars.list['stage_table']+"; '"
                print "run_hive_create.py             ->      " + impalaCmd
                sys.stdout.flush()
                rc = os.system(impalaCmd)
                sys.stdout.flush()
                if rc != 0:
                    print "run_hive_create.py             -> Invalidate metadata failed"     
                    sys.exit(1)
       
    except IOError as e:
        if e.errno != errno.ENOENT:
            raise IOError("error reading table list file")
        else:
            print("No table list file found")
Ejemplo n.º 11
0
def main():
    global return_code
    return_code = 0
    start_line = "".join('*' for i in range(100))
    print(start_line)
    print("run_hive_query.py           -> Started    : " +
          datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    options = arg_handle()
    table = options.table.strip().lower()
    app = options.app.lower()
    env = options.env.lower()
    env_ver = options.env_ver.lower()
    sub_app = options.sub_app.lower()
    query = ""
    query_type = options.query_type.strip().lower()
    if query_type == "cstm":
        query = options.query.strip()
    group = options.group
    common_properties = '/cloudera_nfs1/config/oozie_global.properties'
    # Get envvars from oozie_common_properties file
    envvars.populate(env, env_ver, app, sub_app)
    hdfs_load_ts = "'" + str(options.common_date).replace("_", " ") + "'"
    common_date_tfmt = datetime.strptime(options.common_date,
                                         '%Y-%m-%d_%H:%M:%S.%f')
    log_time = common_date_tfmt.strftime('%Y-%m-%d_%H-%M-%S')
    log_date = common_date_tfmt.strftime('%Y-%m-%d')
    log_folder = envvars.list['lfs_app_logs'] + "/" + log_date
    log_file = log_folder + "/run_job-" + group + '_' + log_time + '.log'
    #Get Final Properties final name and path from variables
    final_properties = envvars.list[
        'lfs_app_wrk'] + '/' + env + '_hive_query_' + app.replace(
            "/", "_") + '_' + table + '.properties'

    # Remove if the file exists
    silentremove(final_properties)

    # open the final properties file for writing
    properties_file = open(final_properties, 'wb')

    # Build the table properties file name and path; run_hive_query reuses the wf_db_ingest table properties
    table_properties = envvars.list[
        'lfs_app_workflows'] + '/wf_db_ingest/' + table + '.properties'
    # get time stamp to load the table
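    # (note: this overwrites the hdfs_load_ts built from options.common_date above)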
    hdfs_load_ts = "'" + str(datetime.now()) + "'"
    envvars.list['hdfs_load_ts'] = hdfs_load_ts

    # load app-specific environment variables
    if os.path.isfile(table_properties):
        envvars.load_file(table_properties)

    #  Concatenate global properties file and table properties file
    shutil.copyfileobj(open(common_properties, 'rb'), properties_file)
    if os.path.isfile(table_properties):
        shutil.copyfileobj(open(table_properties, 'rb'), properties_file)
        # Get database and table names from environment variables
        db = envvars.list['hv_db']
        table = envvars.list['hv_table']
    else:
        db = envvars.list['hv_db_' + app + '_' + sub_app]
        table = table_properties

    sys.stdout.flush()

    curr_date = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d")
    prev_load_date = (datetime.fromtimestamp(time.time()) +
                      timedelta(days=-1)).strftime("%Y-%m-%d")

    dynamic_properties = '\n'.join([
        '\nenv=' + env, 'app=' + app, 'sub_app=' + sub_app, 'group=' + group,
        'prev_load_date=' + prev_load_date, 'curr_date=' + curr_date,
        'log_file=' + log_file, 'happ=' + envvars.list['happ']
    ])
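    # 'mgfl' merges small files for a single partition; 'cstm' copies a caller-supplied
    # HQL (and optional Python) script to the HDFS workflow directory and runs it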
    if query_type == 'mgfl':
        print(
            "run_hive_query.py           -> HiveQueryTyp: Merge small files ")
        if envvars.list['partition_column'] != '':
            if not (options.ldate is None or options.ldate.strip() == ""):
                partition_clause = "PARTITION (" + envvars.list[
                    'partition_column'] + "='" + options.ldate.strip() + "')"
                where_clause = " where " + envvars.list[
                    'partition_column'] + "='" + options.ldate.strip() + "'"
            else:
                partition_clause = "PARTITION (" + envvars.list[
                    'partition_column'] + "='" + prev_load_date + "')"
                where_clause = " where " + envvars.list[
                    'partition_column'] + "='" + prev_load_date + "'"
            dynamic_properties = '\n'.join([
                dynamic_properties, 'hive_query=merge_smfl_table.hql',
                'python_script=invalidate_metadata.py',
                'partition_clause=' + partition_clause,
                'where_clause=' + where_clause
            ])
            abc_parameter = env + ',' + env_ver + ',' + app + ',' + sub_app + ',' + group + "," + table + ","
        else:
            print(
                "run_hive_query.py           -> Partition column must be present to merge small files "
            )
            sys.exit()
    elif query_type == 'cstm':
        print("run_hive_query.py           -> HiveQueryTyp: custom query ")
        queries = query
        query = queries.split(",")[0]
        rm_ctlM = "sed -i -e 's/\r$//' " + envvars.list[
            'lfs_app_workflows'] + '/wf_hive_query/' + query
        rc, status = commands.getstatusoutput(rm_ctlM)
        print(
            "run_hive_query.py           -> removing ^M characters in file: " +
            rm_ctlM + " Status:" + str(rc))
        hdfs_put = "hdfs dfs -put -f " + envvars.list[
            'lfs_app_workflows'] + '/wf_hive_query/' + query + " " + envvars.list[
                'hdfs_app_workflows'] + '/wf_hive_query/'
        rc, status = commands.getstatusoutput(hdfs_put)
        print("run_hive_query.py           -> copying file: " + hdfs_put +
              " Status:" + str(rc))

        if len(queries.split(",")) == 2:
            py_query = queries.split(",")[1]
            hdfs_put = "hdfs dfs -put -f " + envvars.list[
                'lfs_app_workflows'] + '/wf_hive_query/' + py_query + " " + envvars.list[
                    'hdfs_app_workflows'] + '/wf_hive_query/'
            rc, status = commands.getstatusoutput(hdfs_put)
            print("run_hive_query.py           -> copying file: " + hdfs_put +
                  " Status:" + str(rc))

        else:
            py_query = 'invalidate_metadata.py'
        dynamic_properties = '\n'.join([
            dynamic_properties, 'hive_query=' + query,
            'python_script=' + py_query
        ])
        abc_parameter = env + ',' + env_ver + ',' + app + ',' + sub_app + ',' + group + "," + table + ","

    properties_file.write(dynamic_properties)
    properties_file.close()
    print("run_hive_query.py           -> CommnPrpty : " + common_properties)
    print("run_hive_query.py           -> TablePrpty : " + table_properties)
    print("run_hive_query.py           -> DynmcPrpty : " +
          dynamic_properties.replace("\n", ", "))
    print("run_hive_query.py           -> FinalPrpty : " + final_properties)
    sys.stdout.flush()
    # ABC Logging Started
    parameter_string = ""
    comments = "Properties file name :" + final_properties
    abc_line = "|".join([
        group, "run_hive_query.py", "python", "run_job.py",
        str(table), parameter_string, "RUNNING",
        getpass.getuser(), comments,
        str(datetime.today())
    ])
    print("**ABC_log**->" + abc_line)
    abc_parameter = env + ',' + env_ver + ',' + app + ',' + sub_app + ',' + group + ",run_hive_query.py"
    sys.stdout.flush()
    rc = runoozieworkflow(final_properties, abc_parameter)
    print "Return-Code:" + str(rc)
    if rc > return_code:
        return_code = rc
    abc_line = "|".join([
        group, "run_hive_query.py", "python", "run_job.py",
        str(table), parameter_string, "ENDED",
        getpass.getuser(), "return-code:" + str(return_code),
        str(datetime.today())
    ])
    print("**ABC_log**->" + abc_line)
    sys.stdout.flush()

    print("run_hive_query.py           -> Ended      : " +
          datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    print start_line
    print "Return-Code:" + str(return_code)
    sys.exit(return_code)
Ejemplo n.º 12
0
def main():
    global return_code, msck_count, msck_command
    return_code = 0
    msck_count = 0
    home = '/data/'
    path = os.path.dirname(os.path.realpath(__file__))
    root = path.split('src/scripts')[0]
    
    #env = path.split('/')[2].split('bdp')[1]
    #env_ver = path.split('/')[4]
    env = 'p'
    env_ver = '01'
    usage = "usage: run_job.py grp_name app sub_app jobnames.list"
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()
    if len(args) < 3:
        parser.error("Arguments - group_job_name and app name are required.")
    global app, sub_app
    grp_name = args[0]
    app = args[1]
    sub_app = args[2]
    jobnames = "jobnames.list"
    
    common_date =""
    if len(args) == 4:
       common_date = args[3].strip()
    else:
       common_date = str(datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H:%M:%S.%f'))
    common_date_tfmt = datetime.strptime(common_date,'%Y-%m-%d_%H:%M:%S.%f')
    asofdate = common_date_tfmt.strftime('%Y-%m-%d')
    log_time = common_date_tfmt.strftime('%Y-%m-%d_%H-%M-%S') 
    rerunjobnames = "jobnames_"+asofdate+".list"
    rerun = "N"

#       rerunjobnames = jobnames
#       rerun = "Y"

    envvars.populate(env,env_ver,app,sub_app)
        

    log_date = common_date_tfmt.strftime('%Y-%m-%d')
    log_folder = envvars.list['lfs_app_logs'] + "/"+log_date
    if not os.path.exists(log_folder):
        os.makedirs(log_folder)
        chmod_log = "chmod 777 "+log_folder
        rc, status = commands.getstatusoutput(chmod_log)
    log_file = log_folder +"/run_job-" + grp_name + '_' + log_time  + '.log'
    global abc_log_file, stdout_file
    abc_log_file = envvars.list['lfs_app_logs'] + "/"+grp_name+".tmp"
    failed_group_name = "@@"+ grp_name + '_' + log_time 
    
    print("LogFile: " + log_file)
    print("To Kill: kill " + str(os.getpid())) 
    f = open(log_file, "a",0)
    f.close()
    stdout_file = open(log_file, "r+",0)
    sys.stdout = stdout_file
    
    global kerb, user_name 
    rc, user_name = commands.getstatusoutput("echo $USER") 
    
    service_user_name = envvars.list['srvc_acct_login_'+app+'_'+sub_app]
    if service_user_name is not None and service_user_name != "":
       user_name = service_user_name
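    # Kerberos: kinit with the user's (or service account's) keytab when one is present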
    keytab_file = envvars.list['lfs_keystore']+"/"+user_name.lower()+".keytab"
    if os.path.isfile(keytab_file):
       kerb = "kinit -k -t "+keytab_file+" "+user_name.lower()+envvars.list['domainName']
       rc, out = commands.getstatusoutput(kerb)
       print("run_job.py              -> Authenticated    : "+kerb+" RC:"+str(rc))
    else:
       print("run_job.py              -> Keytab file missing, not able to authenticate. Using user default authentication")
    
    
    
    
    start_line = "".join('*' for i in range(100))
    print start_line   
    print("run_job.py              -> Started    : " + datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    
    global abc_hdfs_put
    #hdfs_abc_log_file = envvars.list['hdfs_meta_raw']+"/"+envvars.list['hv_db_meta_stage']+"/abc_hadoop/load_date="+str(asofdate)+"/00000.log";
    #abc_hdfs_put = " ".join(["hdfs","dfs","-appendToFile",abc_log_file,
    #     hdfs_abc_log_file]) 
    #hdfs_chmod = "hdfs dfs -chmod -R 777 " + hdfs_abc_log_file 
    #rc, out = commands.getstatusoutput(hdfs_chmod)
    #print("---Output of chmod command of abc_log_file-->"+hdfs_chmod)

    #print("run_job.py              -> Invoked    : " +hdfs_chmod)
    #print out 
    #msck_command =  "beeline -u '" + envvars.list['hive2JDBC'] + ";principal=" + envvars.list['hive2Principal']+"' -e "
    #msck_command = "hive -e "
    #msck_command = msck_command + "'use "+ envvars.list['hv_db_meta_stage']+"; msck repair table abc_hadoop;'"
    
    comments = ""
    # determine joblist file path 
    job_list_file = envvars.list['lfs_app_config'] + '/' + jobnames
    rerun_job_list_file = envvars.list['lfs_app_config'] + '/' + grp_name + "_rerun.list"
    print("run_job.py              -> JobList    : " + job_list_file) 
    
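    # a leftover <group>_rerun.list means the previous run had failures; process only those entries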
    if os.path.isfile(rerun_job_list_file):
       job_list_file = rerun_job_list_file
       print("run_job.py              -> JobList    : Rerun file found, updating joblist lookup file. Please re-run if original entries has to run.")
       print("run_job.py              -> JobList    : " + job_list_file)
       comments = comments + "Rerun file found "+job_list_file
    else:
       comments = comments + "joblist file " + job_list_file
     
    abc_line = "|".join([grp_name,"run_job.py","python","CA-7 Job","",str(args),"STARTED",
                         user_name,comments.replace(os.linesep,"---"),str(datetime.today())+"\n"]) 
    writeabc(abc_line)
    input_scripts_count = 0
    failed_scripts_count = 0
    failed_scripts = ""
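    # jobnames.list entries are pipe-delimited: group|g-or-a|script|op0|...|op9;
    # 'g' runs the script from the global scripts directory, anything else from the app scripts directory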
    try:
        with open(job_list_file) as fin:
            for line in fin:
                args = line.split('|')
                if  args[0].strip().lower() == grp_name.lower() or grp_name.lower() == '*all':
                    options = ' --env ' + env + ' --app ' + app + ' --env_ver ' + env_ver + ' --group ' + grp_name
                    options = options + ' --subapp ' + sub_app + ' --cmmn_dt ' + common_date
                    if  len(args) < 3:
                        print("Error: Table name and script name not defined in config file")
                        return None, None, None, None, None, None, None
                    
                    if  len(args) >= 4:
                        job = args[2].strip()
                        if args[1].strip().lower() == 'g':
                            path = envvars.list['lfs_global_scripts']
                        else:
                            path = envvars.list['lfs_app_scripts'] 
                        options = options + ' --op0 ' + args[3].strip()
                    if len(args) >= 5 and args[4].strip() != "":
                        options = options + ' --op1 ' + args[4].strip()
                    if len(args) >= 6 and args[5].strip() != "":
                        options = options + ' --op2 ' + args[5].strip()
                    if len(args) >= 7 and args[6].strip() != "":
                        options = options + ' --op3 ' + args[6].strip()
                    if len(args) >= 8 and args[7].strip() != "":
                        options = options + ' --op4 ' + args[7].strip()
                    if len(args) >= 9 and args[8].strip() != "":
                        options = options + ' --op5 ' + args[8].strip()
                    if len(args) >= 10 and args[9].strip() != "":
                        options = options + ' --op6 ' + args[9].strip()
                    if len(args) >= 11 and args[10].strip() != "":
                        options = options + ' --op7 ' + args[10].strip()
                    if len(args) >= 12 and args[11].strip() != "":
                        options = options + ' --op8 ' + args[11].strip()
                    if len(args) >= 13 and args[12].strip() != "":
                        options = options + ' --op9 ' + args[12].strip()
                    input_scripts_count = input_scripts_count + 1
                    rc = call_script(path, job, options)
                    if rc != 0:
                       failed_scripts_count = failed_scripts_count + 1
                       fs = line.split('|')
                       fs[0] = failed_group_name                       
                       failed_scripts = failed_scripts + line
                    if rc > return_code:
                       return_code = rc 


        
    except IOError as e:
        if e.errno != errno.ENOENT:
            raise IOError("error reading job list file")
        else:
            print("No job list file found")
    
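    # on failure, persist the failed entries to <group>_rerun.list for re-running;
    # a clean run removes any stale rerun file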
    if return_code > 0:
       #if input_scripts_count != failed_scripts_count:
          with open(rerun_job_list_file, 'w') as myfile:
             myfile.write(failed_scripts)
          chmod_log = "chmod 777 "+rerun_job_list_file
          rc, status = commands.getstatusoutput(chmod_log)
          print "run_job.py              -> Failed Script: Some scripts failed.. Please use below command to rerun.."
          print "run_job.py              -> Re-run Cmd   : "+ " ".join(["python",path+"/run_job.py",grp_name,app,sub_app])
          abc_line = "|".join([grp_name,"run_job.py","python","CA-7 Job","",str(args),"FAILED",
                         user_name,"run_job failed, Some scripts failed.." + str(return_code),str(datetime.today())+"\n"]) 
          writeabc(abc_line)
       #else:
       #   print "run_job.py              -> Failed Script: All scripts failed.. Please use below command to rerun.."
       #   print "run_job.py              -> Re-run Cmd   : "+ " ".join(["python",path+"/run_job.py",grp_name,app,sub_app,jobnames])
       #   abc_line = "|".join([grp_name,"run_job.py","python","CA-7 Job","",str(args),"FAILED",
       #                  user_name,"run_job failed, all scripts failed.." + str(return_code),str(datetime.today())+"\n"]) 
       #   writeabc(abc_line)
    elif os.path.isfile(rerun_job_list_file):
       print "run_job.py              -> Deleting..." + str(rerun_job_list_file)
       os.remove(rerun_job_list_file)
       
    abc_line = "|".join([grp_name,"run_job.py","python","CA-7 Job","",str(args),"ENDED",
                         user_name,"run_job ended,Return-Code:" + str(return_code),str(datetime.today())+"\n"]) 
    writeabc(abc_line)
    print("run_job.py              -> Ended      : Return-Code:" + str(return_code)+" " + datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
    print start_line
    silentremove(abc_log_file)
    sys.exit(return_code)