# Example #1
def druid_indexing(task_id, **kwargs):
    """Submit a Druid indexing job built from a per-task index template.

    Pulls the index-template path from XCom for the given init task,
    derives the task-specific template filename, loads that JSON spec
    and submits it to Druid via ``DruidHook``.

    :param task_id: suffix identifying the ``init_task_<id>`` whose XCom to read
    :param kwargs: Airflow context; must contain ``'ti'`` (task instance)
    """
    task = 'init_task_{}'.format(task_id)
    ti = kwargs['ti']
    # BUG FIX: xcom_keys was referenced here but never defined in this scope
    # (the sibling DruidIndexTask defines it locally before the same call).
    # Only 'index_template' is consumed below, so request just that key.
    xcom_keys = 'index_template'
    xcom_values = pullXcom(ti, task, xcom_keys)  # pull data from xcom object
    index_template = xcom_values['index_template']

    # Per-task copy of the template:
    # foo_template.json -> foo_template_<task_id>.json
    tmp_index_template = index_template.replace(
        "_template.json", "_template_{}.format".replace("format", "json").format(task_id))
    print(tmp_index_template)
    hook = DruidHook(
        druid_ingest_conn_id='druid_ingest_default',
        max_ingestion_time=None  # wait indefinitely for ingestion to finish
    )

    with open(tmp_index_template) as data_file:
        index_spec = json.load(data_file)
    index_spec_str = json.dumps(
        index_spec,
        sort_keys=True,
        indent=4,
        separators=(',', ': ')
    )
    log.info("Submitting %s", index_spec_str)  # fixed "Sumitting" typo
    hook.submit_indexing_job(index_spec_str)
# Example #2
def startPythonBatch(task_id, **kwargs):
    """Run the Phoenix batch load as a subprocess and return its exit code.

    Pulls the enriched-log path and target table from XCom, builds the
    Phoenix submit command, streams the process output through
    ``processPhoenixSubmitLog`` and returns the process return code.

    :param task_id: suffix identifying the ``init_task_<id>`` whose XCom to read
    :param kwargs: Airflow context; must contain ``'ti'`` (task instance)
    :return: subprocess return code (0 on success)
    """
    task = 'init_task_{}'.format(task_id)
    print(kwargs)
    ti = kwargs['ti']
    xcom_values = pullXcom(
        ti, task, "log_enrich_path_phoenix,table")  # pull data from xcom object
    table = xcom_values['table']
    path = xcom_values['log_enrich_path_phoenix']
    command = buildPhoenixSubmitCommand(table, path)
    log.info('Phoenix Batch Command : %s', command)
    # FIX: os.environ (not os.putenv) so the Python-side view of the
    # environment stays consistent; putenv bypasses os.environ entirely.
    os.environ["HADOOP_CLASSPATH"] = (
        "/usr/hdp/current/hbase-client/lib/hbase-protocol.jar:/etc/hbase/conf")
    submitSP = subprocess.Popen(command,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                bufsize=-1,
                                universal_newlines=True)
    try:
        # Feed stdout line-by-line until EOF (empty string sentinel).
        processPhoenixSubmitLog(iter(submitSP.stdout.readline, ''))
    finally:
        submitSP.stdout.close()  # FIX: don't leak the pipe fd
        returncode = submitSP.wait()
    log.info('Starting Python Batch process %s return code %s',
             datetime.now().strftime("%m%d%Y-%H%M"), returncode)

    return returncode
# Example #3
def DruidIndexTask(**kwargs):
    """Submit a Druid batch-indexing job built from an index template.

    Reads the template path, data source, run date and enriched-data
    directory from XCom, resolves the ingestion interval and HDFS input
    path via ``buildDates``, patches the template with those values,
    submits the spec to Druid and writes the patched template back.

    :param kwargs: Airflow context; must contain ``'ti'`` (task instance)
    :return: path of the (rewritten) index template file
    """
    task = 'boot_task'
    ti = kwargs['ti']
    xcom_keys = "index_template,source,date,log_enrich_path"
    xcom_values = pullXcom(ti, task, xcom_keys)  # pull data from xcom object
    index_template = xcom_values['index_template']
    source = xcom_values['source']
    date = xcom_values['date']
    enrichDataPath = xcom_values['log_enrich_path']

    # buildDates returns a JSON string: {"interval": ..., "path": [...]}
    jReturn = json.loads(buildDates(date, enrichDataPath))
    interval = jReturn['interval']
    hdfs_path = jReturn['path'][0]

    hook = DruidHook(
        druid_ingest_conn_id='druid_ingest_default',
        max_ingestion_time=None  # wait indefinitely for ingestion to finish
    )

    # FIX: context manager instead of open/close (no leak on exception).
    with open(index_template, "r") as template_file:
        data = json.load(template_file)

    # FIX: direct assignment replaces the old str(x).replace(x, y)
    # round-trip, which always yielded y anyway.
    data['spec']['dataSchema']['dataSource'] = source
    data['spec']['dataSchema']['granularitySpec']['intervals'] = [interval]
    data['spec']['ioConfig']['inputSpec']['paths'] = hdfs_path

    index_spec_str = json.dumps(
        data,
        sort_keys=True,
        indent=4,
        separators=(',', ': ')
    )
    log.info("Submitting %s", index_spec_str)  # fixed "Sumitting" typo
    hook.submit_indexing_job(index_spec_str)

    # Persist the patched spec back to the template file.
    with open(index_template, "w+") as template_file:
        template_file.write(json.dumps(data, indent=4, sort_keys=True))
    return index_template
# Example #4
def copyFilesToDeltaDir(task_id, **kwargs):
    """Move enriched-data partitions older than the Druid lookback window
    into the HDFS delta directory.

    Pulls the run date, enriched-data path and delta directory from XCom,
    then for each dated partition path older than
    (run date - druidLookbackDays): on the first match, recreates the
    delta dir; then moves the partition there via ``hdfs dfs`` shell
    commands.

    NOTE(review): this function continues beyond the visible excerpt;
    ``retJson`` is presumably returned by the remainder — confirm.
    """

    task = 'init_task_{0}'.format(task_id)
    ti = kwargs['ti']
    xcom_keys = "index_template,source,date,log_enrich_path,enriched_dir_delta"
    xcom_values = pullXcom(ti, task,xcom_keys)  #pull data from xcom object
    datestr = xcom_values['date']
    enrichedDir = xcom_values['log_enrich_path']
    deltaDir = xcom_values['enriched_dir_delta']

    dlist =[]  # run date as "YYYY-MM-DD" (single element)
    pathList=[]  # candidate partition paths from the enriched-dir index
    intervalPathList=[]  # glob paths of moved partitions, folded into retJson

    # Parse YYYYMMDD and zero-pad day/month to build an ISO-style date.
    d = datetime.strptime(datestr,"%Y%m%d")
    day = "0{}".format(d.day) if len(str(d.day)) == 1 else d.day
    month = "0{}".format(d.month) if len(str(d.month)) == 1 else d.month
    dlist.append("{}-{}-{}".format(d.year, month, day))

    print('deltadir:'+ deltaDir)


    fileIndexLen= getIndexFiles(enrichedDir)
    if len(fileIndexLen) >0:
        pathList = pathList + fileIndexLen

    itr = 0;  # flag: 0 until the delta dir has been recreated once
    if dlist:
        # Oldest date still kept out of the delta dir.
        # NOTE(review): druidLookbackDays is defined elsewhere in the module.
        minDate = d - timedelta(druidLookbackDays)

    for path in pathList:

        try:
            # Assumes each path ends in an MM-DD-YYYY suffix — TODO confirm.
            cdate=  datetime.strptime(path.strip()[-10:], "%m-%d-%Y")
            if cdate < minDate :
                if itr == 0 :
                    # First eligible path: wipe and recreate the delta dir.
                    # NOTE(review): these Popen calls are fire-and-forget
                    # (never waited on), so failures go unnoticed.
                    x = subprocess.Popen("hdfs dfs -rm -r "+deltaDir, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                    y = subprocess.Popen("hdfs dfs -mkdir -p "+deltaDir, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                    itr = 1
                intervalPathList.append(path+'/*')
                y=subprocess.Popen("hdfs dfs -mv "+path +" "+ deltaDir, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                print(path)
        except:
            # NOTE(review): bare except silently skips any path whose date
            # suffix fails to parse (and would also mask other errors).
            print('bad input: '+path)


    retJson = { 'path': [ ','.join(intervalPathList)]}