def druid_indexing(task_id, **kwargs):
    task = 'init_task_{}'.format(task_id)
    ti = kwargs['ti']
    # xcom_keys is not defined in this function; it is assumed to exist at
    # module level (the other callables below define it locally). Only
    # 'index_template' is consumed here.
    xcom_values = pullXcom(ti, task, xcom_keys)  # pull data from the XCom object
    index_template = xcom_values['index_template']
    # index_template = kwargs.get('templates_dict').get('index_template', None)
    # site = kwargs.get('templates_dict').get('site', None)
    tmp_index_template = index_template.replace(
        "_template.json", "_template_{}.json".format(task_id))
    print(tmp_index_template)
    hook = DruidHook(
        druid_ingest_conn_id='druid_ingest_default',
        max_ingestion_time=None
    )
    with open(tmp_index_template) as data_file:
        index_spec = json.load(data_file)
    index_spec_str = json.dumps(
        index_spec,
        sort_keys=True,
        indent=4,
        separators=(',', ': ')
    )
    log.info("Submitting %s", index_spec_str)
    hook.submit_indexing_job(index_spec_str)
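# pullXcom is referenced throughout this section but defined elsewhere. A
# minimal sketch consistent with how it is called here -- a comma-separated
# string of key names, values pulled from the named upstream task's XCom --
# might look like the following (an assumption; the real helper may differ):
#
# def pullXcom(ti, task, keys):
#     return {key: ti.xcom_pull(task_ids=task, key=key)
#             for key in keys.split(',')}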
def startPythonBatch(task_id, **kwargs):
    task = 'init_task_{}'.format(task_id)
    print(kwargs)
    ti = kwargs['ti']
    xcom_values = pullXcom(ti, task, "log_enrich_path_phoenix,table")  # pull data from the XCom object
    table = xcom_values['table']
    path = xcom_values['log_enrich_path_phoenix']
    command = buildPhoenixSubmitCommand(table, path)
    log.info('Phoenix Batch Command : ' + str(command))
    os.putenv(
        "HADOOP_CLASSPATH",
        "/usr/hdp/current/hbase-client/lib/hbase-protocol.jar:/etc/hbase/conf")
    submitSP = subprocess.Popen(command,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                bufsize=-1,
                                universal_newlines=True)
    # Stream the submit output line by line until EOF
    processPhoenixSubmitLog(iter(submitSP.stdout.readline, ''))
    returncode = submitSP.wait()
    log.info('Finished Python Batch process ' +
             str(datetime.now().strftime("%m%d%Y-%H%M")) +
             ' return code ' + str(returncode))
    return returncode
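# processPhoenixSubmitLog is defined elsewhere; a plausible minimal version,
# consistent with being handed an iterator of output lines (an assumption --
# the real helper may also scan lines for Phoenix error markers):
#
# def processPhoenixSubmitLog(lines):
#     for line in lines:
#         log.info(line.rstrip())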
def DruidIndexTask(**kwargs):
    task = 'boot_task'
    ti = kwargs['ti']
    xcom_keys = "index_template,source,date,log_enrich_path"
    xcom_values = pullXcom(ti, task, xcom_keys)  # pull data from the XCom object
    index_template = xcom_values['index_template']
    source = xcom_values['source']
    date = xcom_values['date']
    enrichDataPath = xcom_values['log_enrich_path']
    # read arguments
    # index_template = kwargs.get('templates_dict').get('index_template', None)
    # source = kwargs.get('templates_dict').get('source', None)
    # date = kwargs.get('templates_dict').get('date', None)
    # enrichDataPath = kwargs.get('templates_dict').get('enrichedDir', None)
    hdfs_path = buildDates(date, enrichDataPath)
    jReturn = json.loads(hdfs_path)
    interval = jReturn['interval']
    hdfs_path = jReturn['path'][0]
    hook = DruidHook(
        druid_ingest_conn_id='druid_ingest_default',
        max_ingestion_time=None
    )
    # Read the index template JSON into a buffer
    with open(index_template, "r") as jsonFile:
        data = json.load(jsonFile)
    # Patch the buffered spec: data source name, ingestion interval, input paths
    data['spec']['dataSchema']['dataSource'] = source
    data['spec']['dataSchema']['granularitySpec']['intervals'] = [interval]
    data['spec']['ioConfig']['inputSpec']['paths'] = hdfs_path
    index_spec_str = json.dumps(
        data,
        sort_keys=True,
        indent=4,
        separators=(',', ': ')
    )
    log.info("Submitting %s", index_spec_str)
    hook.submit_indexing_job(index_spec_str)
    # Save our changes back to the template JSON file
    with open(index_template, "w+") as jsonFile:
        jsonFile.write(json.dumps(data, indent=4, sort_keys=True))
    return index_template
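# buildDates is defined elsewhere. Judging by the json.loads call above, it is
# expected to return a JSON string of roughly this shape (values illustrative):
#
# {
#     "interval": "2019-01-01T00:00:00.000Z/2019-01-02T00:00:00.000Z",
#     "path": ["/data/enriched/01-01-2019"]
# }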
def copyFilesToDeltaDir(task_id, **kwargs):
    task = 'init_task_{0}'.format(task_id)
    ti = kwargs['ti']
    xcom_keys = "index_template,source,date,log_enrich_path,enriched_dir_delta"
    xcom_values = pullXcom(ti, task, xcom_keys)  # pull data from the XCom object
    datestr = xcom_values['date']
    enrichedDir = xcom_values['log_enrich_path']
    deltaDir = xcom_values['enriched_dir_delta']
    dlist = []
    pathList = []
    intervalPathList = []
    d = datetime.strptime(datestr, "%Y%m%d")
    day = "{:02d}".format(d.day)
    month = "{:02d}".format(d.month)
    dlist.append("{}-{}-{}".format(d.year, month, day))
    print('deltadir:' + deltaDir)
    indexFiles = getIndexFiles(enrichedDir)
    if len(indexFiles) > 0:
        pathList = pathList + indexFiles
    itr = 0
    if dlist:
        minDate = d - timedelta(druidLookbackDays)
        for path in pathList:
            try:
                # Paths are expected to end in an MM-DD-YYYY date directory
                cdate = datetime.strptime(path.strip()[-10:], "%m-%d-%Y")
                if cdate < minDate:
                    if itr == 0:
                        # Recreate the delta directory once, before the first
                        # move; wait() so rm/mkdir/mv cannot race each other
                        subprocess.Popen("hdfs dfs -rm -r " + deltaDir,
                                         shell=True, stdout=subprocess.PIPE,
                                         stderr=subprocess.STDOUT).wait()
                        subprocess.Popen("hdfs dfs -mkdir -p " + deltaDir,
                                         shell=True, stdout=subprocess.PIPE,
                                         stderr=subprocess.STDOUT).wait()
                        itr = 1
                    intervalPathList.append(path + '/*')
                    subprocess.Popen("hdfs dfs -mv " + path + " " + deltaDir,
                                     shell=True, stdout=subprocess.PIPE,
                                     stderr=subprocess.STDOUT).wait()
                    print(path)
            except ValueError:
                print('bad input: ' + path)
    retJson = {'path': [','.join(intervalPathList)]}
    return retJson
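# getIndexFiles is defined elsewhere; a minimal sketch consistent with the
# parsing above (returning enriched-dir paths ending in an MM-DD-YYYY
# directory name) could be the following -- an assumption, not the real helper:
#
# def getIndexFiles(enrichedDir):
#     out = subprocess.check_output("hdfs dfs -ls " + enrichedDir, shell=True,
#                                   universal_newlines=True)
#     # Keep the trailing path column of each listing line
#     return [line.split()[-1] for line in out.splitlines() if '/' in line]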