def index_documents(self, name, documents, bulk=True):
    attempts = 0
    destination_index = ""
    is_box = False
    if name.startswith("boxinfo"):
        destination_index = self.boxinfo_write
        is_box = True
    else:
        destination_index = self.runindex_write
    while True:
        attempts += 1
        try:
            if bulk:
                self.es.bulk_index(destination_index, name, documents)
            else:
                self.es.index(destination_index, name, documents[0])
            return True
        except ElasticHttpError as ex:
            if attempts <= 1:
                continue
            self.logger.error('elasticsearch HTTP error. skipping document ' + name)
            if is_box:
                break
            #self.logger.exception(ex)
            return False
        except (socket.gaierror, ConnectionError, Timeout) as ex:
            if attempts > 100 and self.runMode:
                raise ex
            self.logger.error('elasticsearch connection error. retry.')
            if self.stopping:
                return False
            time.sleep(0.1)
            ip_url = getURLwithIP(self.es_server_url, self.nsslock)
            self.es = ElasticSearch(ip_url, timeout=20, revival_delay=60)
            if is_box:
                break
    return False
def updateIndexMaybe(self, index_name, alias_write, alias_read, settings, mapping):
    connectionAttempts = 0
    retry = False
    while True:
        if self.stopping:
            break
        connectionAttempts += 1
        try:
            if retry or self.ip_url == None:
                self.ip_url = getURLwithIP(self.es_server_url, self.nsslock)
                self.es = ElasticSearch(self.ip_url, timeout=20, revival_delay=60)
            #check if runindex alias exists
            if requests.get(self.es_server_url + '/_alias/' + alias_write).status_code == 200:
                self.logger.info('writing to elastic index ' + alias_write + ' on ' + self.es_server_url + ' - ' + self.ip_url)
                self.createDocMappingsMaybe(alias_write, mapping)
                break
            else:
                time.sleep(.5)
                if (connectionAttempts % 10) == 0:
                    self.logger.error('unable to access elasticsearch alias ' + alias_write + ' on ' + self.es_server_url + ' / ' + self.ip_url)
                continue
        except ElasticHttpError as ex:
            #es error, retry
            self.logger.error(ex)
            if self.runMode and connectionAttempts > 100:
                self.logger.error('elastic (BU): exiting after 100 ElasticHttpError reports from ' + self.es_server_url)
                sys.exit(1)
            elif self.runMode == False and connectionAttempts > 10:
                self.threadEvent.wait(60)
            else:
                self.threadEvent.wait(1)
            retry = True
            continue
        except (socket.gaierror, ConnectionError, Timeout) as ex:
            #try to reconnect with different IP from DNS load balancing
            if self.runMode and connectionAttempts > 100:
                self.logger.error('elastic (BU): exiting after 100 connection attempts to ' + self.es_server_url)
                sys.exit(1)
            elif self.runMode == False and connectionAttempts > 10:
                self.threadEvent.wait(60)
            else:
                self.threadEvent.wait(1)
            retry = True
            continue
def setupES(es_server_url='http://localhost:9200', deleteOld=1, doPrint=False, overrideTests=False, forceReplicas=-1):
    #ip_url=getURLwithIP(es_server_url)
    es = ElasticSearch(es_server_url, timeout=5)
    #get_template
    #es.send_request('GET', ['_template', name],query_params=query_params)
    #list_template
    #res = es.cluster_state(metric='metadata')
    templateList = es.send_request('GET', ['_template'])
    #templateList = res['metadata']['templates']
    TEMPLATES = ["runappliance"]
    for template_name in TEMPLATES:
        if template_name not in templateList:
            printout(template_name + " template not present. It will be created. ", doPrint, False)
            create_template(es, template_name)
        else:
            norm_name = convert(templateList[template_name])
            if deleteOld == 0:
                printout(template_name + " already exists. Add 'replace' parameter to update if different, or forceupdate to always update.", doPrint, False)
            else:
                printout(template_name + " already exists.", doPrint, False)
                loaddoc = load_template(es, template_name)
                if loaddoc != None:
                    if forceReplicas >= 0:
                        loaddoc['settings']['index']['number_of_replicas'] = forceReplicas
                    mappingSame = norm_name['mappings'] == loaddoc['mappings']
                    #settingSame = norm_name['settings']==loaddoc['settings']
                    settingsSame = True
                    if int(norm_name['settings']['index.number_of_replicas']) != int(loaddoc['settings']['index']['number_of_replicas']):
                        settingsSame = False
                    if int(norm_name['settings']['index.number_of_shards']) != int(loaddoc['settings']['index']['number_of_shards']):
                        settingsSame = False
                    #currently analyzer settings are not checked
                    #if norm_name['settings']['index']['analysis']!=loaddoc['settings']['analysis']:
                    #    settingsSame=False
                    if not (mappingSame and settingsSame) or deleteOld > 1:
                        #test if override
                        if overrideTests == False:
                            try:
                                if norm_name['settings']['index.test'] == True:
                                    printout("Template test setting found, skipping update...", doPrint, True)
                                    return
                            except:
                                pass
                        #delete_template(es,template_name)
                        printout("Updating " + template_name + " ES template", doPrint, True)
                        create_template(es, template_name)
                    else:
                        printout('runappliance ES template is up to date', doPrint, True)
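A hedged usage sketch for setupES above, assuming a reachable cluster and the create_template/load_template/printout helpers from the same module:

# deleteOld=0 only reports differences, deleteOld=1 updates the template when
# mapping or shard/replica settings differ, deleteOld>1 forces an update;
# forceReplicas>=0 overrides the replica count before comparison.
setupES(es_server_url='http://localhost:9200', deleteOld=1, doPrint=True)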
def __init__(self, group_name, topic_name, timeout=60, filename='config.txt'):
    self.logger = logging.getLogger(__name__)
    self.logger.setLevel(logging.INFO)
    handler = logging.FileHandler('../_logs/%s.log' % group_name)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s %(message)s')
    handler.setFormatter(formatter)
    self.logger.addHandler(handler)
    try:
        f = open(filename, 'r')
        self.hbasehost = f.readline().split(' ')[0]
        self.eshost = f.readline().split(' ')[0]
        self.kafkahost = f.readline().split(' ')[0]
        self.hdfshost = f.readline().split(' ')[0]
        self.logger.info('All Hosts Loaded')
    except Exception as e:
        self.logger.warning('file load error, %s' % filename)
        self.logger.warning(str(e))
        raise
        # sys.exit(0)
    self.group_name = group_name
    self.topic_name = topic_name
    self.timeout = timeout
    try:
        self.kafka = KafkaClient(self.kafkahost)
        self.pool = happybase.ConnectionPool(size=6, host=self.hbasehost)
        self.es = ElasticSearch(self.eshost)
    except Exception as e:
        self.logger.warning(str(e))
        raise
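The constructor above reads one host per line from config.txt, in the order HBase, Elasticsearch, Kafka, HDFS; only the token before the first space on each line is used. A sketch of the assumed layout (all hostnames are hypothetical):

# config.txt
hbase-master.example.com
es-node1.example.com:9200
kafka-broker1.example.com:9092
namenode.example.com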
def index_documents(self, name, documents, bulk=True):
    attempts = 0
    destination_index = ""
    is_box = False
    if name.startswith("boxinfo") or name == 'resource_summary':
        destination_index = self.boxinfo_write
        is_box = True
    else:
        destination_index = self.runindex_write
    while True:
        attempts += 1
        try:
            if bulk:
                self.es.bulk_index(destination_index, name, documents)
            else:
                self.es.index(destination_index, name, documents[0])
            return True
        except ElasticHttpError as ex:
            if attempts <= 1:
                continue
            self.logger.error('elasticsearch HTTP error ' + str(ex) + '. skipping document ' + name)
            if is_box:
                break
            #self.logger.exception(ex)
            return False
        except (socket.gaierror, ConnectionError, Timeout) as ex:
            if attempts > 100 and self.runMode:
                raise ex
            self.logger.error('elasticsearch connection error ' + str(ex) + '. retry.')
            if self.stopping:
                return False
            ip_url = getURLwithIP(self.es_server_url, self.nsslock)
            self.es = ElasticSearch(ip_url, timeout=20)
            time.sleep(0.1)
            if is_box:
                break
    return False
class IndexData:
    def __init__(self, index_name, settings_path, host="http://127.0.0.1:9200"):
        self.connection = ElasticSearch(host)
        self.index_name = index_name
        self.settings_path = settings_path
        self.create_index()

    def get_settings(self):
        config_file = file(self.settings_path)
        settings = json.load(config_file)
        return settings

    def create_index(self):
        settings = self.get_settings()
        try:
            self.connection.create_index(self.index_name, settings)
        except pyelasticsearch.exceptions.ElasticHttpError as e:
            self.connection.delete_index(self.index_name)
            self.connection.create_index(self.index_name, settings)

    def index_data(self, data_path, index_type):
        if index_type is None:
            raise ValueError("Please enter a valid index type")
        objects = []
        with open(data_path) as f:
            for line in f:
                word_split = line.split("\t")
                cin = word_split[0]
                name = word_split[1].strip()
                doc = {'cin': cin, 'name': name}
                objects.append(doc)
                if len(objects) > 1000:
                    response = self.connection.bulk_index(self.index_name, index_type, objects, id_field='cin')
                    objects = []
        #index the remaining documents from the last partial batch
        self.connection.bulk_index(self.index_name, index_type, objects, id_field='cin')
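A minimal usage sketch for IndexData above; the 'companies' index, 'settings.json' path, 'companies.tsv' file and 'company' doc type are hypothetical:

# each input line is expected to be "<cin>\t<name>"
indexer = IndexData('companies', 'settings.json')
indexer.index_data('companies.tsv', 'company')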
def main():
    if len(sys.argv) > 3:
        print "Invalid argument number"
        sys.exit(1)
    if len(sys.argv) < 2:
        print "Please provide an elasticsearch server url (e.g. http://localhost:9200)"
        sys.exit(1)
    deleteOld = False
    if len(sys.argv) > 2:
        if "replace" in sys.argv[2]:
            deleteOld = True
    es_server_url = sys.argv[1]
    ip_url = getURLwithIP(es_server_url)
    es = ElasticSearch(es_server_url)
    #get_template
    #es.send_request('GET', ['_template', name],query_params=query_params)
    #list_template
    res = es.cluster_state(filter_routing_table=True, filter_nodes=True, filter_blocks=True)
    templateList = res['metadata']['templates']
    for template_name in TEMPLATES:
        if template_name not in templateList:
            print "{0} template not present. It will be created. ".format(template_name)
            create_template(es, template_name)
        else:
            if deleteOld == False:
                print "{0} already exists. Add 'replace' parameter to force update.".format(template_name)
            else:
                print "{0} already exists.".format(template_name)
                delete_template(es, template_name)
                print "Deleted old template and will recreate {0}".format(template_name)
                create_template(es, template_name)
def main():
    if len(sys.argv) > 3:
        print "Invalid argument number"
        sys.exit(1)
    if len(sys.argv) < 2:
        print "Please provide an elasticsearch server url (e.g. http://localhost:9200)"
        sys.exit(1)
    deleteOld = False
    if len(sys.argv) > 2:
        if "replace" in sys.argv[2]:
            deleteOld = True
    es_server_url = sys.argv[1]
    ip_url = getURLwithIP(es_server_url)
    es = ElasticSearch(es_server_url)
    #get_template
    #es.send_request('GET', ['_template', name],query_params=query_params)
    #list_template
    res = es.cluster_state(metric='metadata')
    templateList = res['metadata']['templates']
    for template_name in TEMPLATES:
        if template_name not in templateList:
            print "{0} template not present. It will be created. ".format(template_name)
            create_template(es, template_name)
        else:
            if deleteOld == False:
                print "{0} already exists. Add 'replace' parameter to force update.".format(template_name)
            else:
                print "{0} already exists.".format(template_name)
                delete_template(es, template_name)
                print "Deleted old template and will recreate {0}".format(template_name)
                create_template(es, template_name)
def updateIndexMaybe(self, index_name, alias_write, alias_read, settings, mapping):
    connectionAttempts = 0
    retry = False
    while True:
        if self.stopping:
            break
        connectionAttempts += 1
        try:
            if retry or self.ip_url == None:
                self.ip_url = getURLwithIP(self.es_server_url, self.nsslock)
                self.es = ElasticSearch(self.ip_url, timeout=20)
            #check if runindex alias exists
            if requests.get(self.ip_url + '/_alias/' + alias_write).status_code == 200:
                self.logger.info('writing to elastic index ' + alias_write + ' on ' + self.es_server_url + ' - ' + self.ip_url)
                self.createDocMappingsMaybe(alias_write, mapping)
                break
            else:
                time.sleep(.5)
                if (connectionAttempts % 10) == 0:
                    self.logger.error('unable to access elasticsearch alias ' + alias_write + ' on ' + self.es_server_url + ' / ' + self.ip_url)
                continue
        except ElasticHttpError as ex:
            #es error, retry
            self.logger.error(ex)
            if self.runMode and connectionAttempts > 100:
                self.logger.error('elastic (BU): exiting after 100 ElasticHttpError reports from ' + self.es_server_url)
                sys.exit(1)
            elif self.runMode == False and connectionAttempts > 10:
                self.threadEvent.wait(60)
            else:
                self.threadEvent.wait(1)
            retry = True
            continue
        except (socket.gaierror, ConnectionError, Timeout, RequestsConnectionError, RequestsTimeout) as ex:
            #try to reconnect with different IP from DNS load balancing
            if self.runMode and connectionAttempts > 100:
                self.logger.error('elastic (BU): exiting after 100 connection attempts to ' + self.es_server_url)
                sys.exit(1)
            elif self.runMode == False and connectionAttempts > 10:
                self.threadEvent.wait(60)
            else:
                self.threadEvent.wait(1)
            retry = True
            continue
def __init__(self, es_server_url, runstring, indexSuffix, monBufferSize, fastUpdateModulo):
    self.logger = logging.getLogger(self.__class__.__name__)
    self.istateBuffer = []
    self.prcinBuffer = {}
    self.prcoutBuffer = {}
    self.fuoutBuffer = {}
    self.es = ElasticSearch(es_server_url, timeout=20)
    self.hostname = os.uname()[1]
    self.hostip = socket.gethostbyname_ex(self.hostname)[2][0]
    #self.number_of_data_nodes = self.es.health()['number_of_data_nodes']
    self.settings = {"index.routing.allocation.require._ip": self.hostip}
    self.indexCreated = False
    self.indexFailures = 0
    self.monBufferSize = monBufferSize
    self.fastUpdateModulo = fastUpdateModulo
    aliasName = runstring + "_" + indexSuffix
    self.indexName = aliasName  # + "_" + self.hostname
def index_documents(self, name, documents):
    attempts = 0
    while True:
        attempts += 1
        try:
            self.es.bulk_index(self.index_name, name, documents)
            return True
        except ElasticHttpError as ex:
            if attempts <= 1:
                continue
            self.logger.error('elasticsearch HTTP error. skipping document ' + name)
            #self.logger.exception(ex)
            return False
        except (ConnectionError, Timeout) as ex:
            if attempts > 100 and self.runMode:
                raise ex
            self.logger.error('elasticsearch connection error. retry.')
            if self.stopping:
                return False
            time.sleep(0.1)
            ip_url = getURLwithIP(self.es_server_url)
            self.es = ElasticSearch(ip_url)
    return False
def write_es_geo(self, es_host='http://localhost:9200/', index_name="geos", doc_type='user_geos'):
    # try to connect with ES and delete the index
    es = ElasticSearch(es_host)
    ## uncomment the following code to prompt check
    # print "Will delete all the doc in the [index:type] from ElasticSearch:"
    # print index_name, ":", doc_type
    # confirm = raw_input("Sure?(y/n)")
    # if confirm!="y":
    #     sys.exit(0)
    try:
        es.delete_index(index_name)
    except Exception as e:
        print "Error", e
    else:
        print index_name, ":", doc_type, " deleted!"
    # initializing the documents
    documents = []
    for record in self.userGeos:
        doc = {
            'uid': int(record[0]),
            'location': {
                'lat': record[1],
                'lon': record[2]
            }
        }
        documents.append(doc)
    print "Bulk indexing", len(documents), "documents.."
    es.bulk_index(index_name, doc_type, documents, id_field='uid')
    es.refresh(index_name)
    # test usage
    print "results from ES,"
    query = {"from": 0, "size": 2000, 'query': {"match_all": {}}}
    res = es.search(query, index=index_name)
    print len(res['hits']['hits']), "documents found"
    print "sample result"
    print res['hits']['hits'][0]
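A hedged usage sketch for write_es_geo above: it assumes self.userGeos holds (uid, lat, lon) records, as read off the doc construction loop. The coordinate values here are hypothetical:

self.userGeos = [(101, 40.7128, -74.0060), (102, 34.0522, -118.2437)]
self.write_es_geo(index_name='geos', doc_type='user_geos')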
class elasticBandBU:

    def __init__(self, conf, runnumber, startTime, runMode=True, nsslock=None, box_version=None, update_run_mapping=True, update_box_mapping=True):
        self.logger = logging.getLogger(self.__class__.__name__)
        self.conf = conf
        self.es_server_url = conf.elastic_runindex_url
        self.runindex_write = "runindex_" + conf.elastic_runindex_name + "_write"
        self.runindex_read = "runindex_" + conf.elastic_runindex_name + "_read"
        self.runindex_name = "runindex_" + conf.elastic_runindex_name
        self.boxinfo_write = "boxinfo_" + conf.elastic_runindex_name + "_write"
        self.boxinfo_read = "boxinfo_" + conf.elastic_runindex_name + "_read"
        self.boxinfo_name = "boxinfo_" + conf.elastic_runindex_name
        self.boxdoc_version = box_version
        self.runnumber = str(runnumber)
        self.startTime = startTime
        self.host = os.uname()[1]
        self.stopping = False
        self.threadEvent = threading.Event()
        self.runMode = runMode
        self.boxinfoFUMap = {}
        self.ip_url = None
        self.nsslock = nsslock
        if update_run_mapping:
            self.updateIndexMaybe(self.runindex_name, self.runindex_write, self.runindex_read, mappings.central_es_settings_runindex, mappings.central_runindex_mapping)
        if update_box_mapping:
            self.updateIndexMaybe(self.boxinfo_name, self.boxinfo_write, self.boxinfo_read, mappings.central_es_settings_boxinfo, mappings.central_boxinfo_mapping)
        #silence
        eslib_logger = logging.getLogger('elasticsearch')
        eslib_logger.setLevel(logging.ERROR)
        self.black_list = None
        if self.conf.instance == 'main':
            self.hostinst = self.host
        else:
            self.hostinst = self.host + '_' + self.conf.instance
        #this naturally fits with the 'run' document
        retries = 10
        if runMode == True:
            while retries:
                retries -= 1
                try:
                    version = None
                    arch = None
                    hltmenuname = None
                    with open(os.path.join(mainDir, 'hlt', conf.paramfile_name), 'r') as fp:
                        fffparams = json.load(fp)
                        version = fffparams['CMSSW_VERSION']
                        arch = fffparams['SCRAM_ARCH']
                        self.logger.info("OK")
                    with open(os.path.join(mainDir, 'hlt', 'HltConfig.py'), 'r') as fp:
                        firstline = fp.readline().strip().strip("\n")  #first line
                        if firstline.startswith("#"):
                            hltmenuname = firstline.strip("#").strip()
                    break
                except Exception as ex:
                    self.logger.info("failed to parse run metadata file " + str(ex) + ". retries left " + str(retries))
                    time.sleep(0.2)
        #write run number document
        if runMode == True and self.stopping == False:
            document = {}
            doc_id = self.runnumber
            document['runNumber'] = doc_id
            document['startTime'] = startTime
            document['activeBUs'] = 1
            document['totalBUs'] = 1
            document['rawDataSeenByHLT'] = False
            if version:
                document['CMSSW_version'] = version
            if arch:
                document['CMSSW_arch'] = arch
            if hltmenuname and len(hltmenuname):
                document['HLT_menu'] = hltmenuname
            documents = [document]
            ret = self.index_documents('run', documents, doc_id, bulk=False, overwrite=False)
            if isinstance(ret, tuple) and ret[1] == 409:
                #run document was already created by another BU.
                #In that case increase atomically the active BU counter
                #self.index_documents('run',[{"inline":"ctx._source.activeBUs+=1;ctx._source.totalBUs+=1","lang":"painless"}],doc_id,bulk=False,update_only=True,script=True,retry_on_conflict=300)
                self.index_documents('run', [{"inline": "ctx._source.activeBUs+=1;ctx._source.totalBUs+=1"}], doc_id, bulk=False, update_only=True, script=True, retry_on_conflict=300)

    def updateIndexMaybe(self, index_name, alias_write, alias_read, settings, mapping):
        connectionAttempts = 0
        retry = False
        while True:
            if self.stopping:
                break
            connectionAttempts += 1
            try:
                if retry or self.ip_url == None:
                    self.ip_url = getURLwithIP(self.es_server_url, self.nsslock)
                    self.es = ElasticSearch(self.ip_url, timeout=20)
                #check if index alias exists
                if requests.get(self.ip_url + '/_alias/' + alias_write).status_code == 200:
                    self.logger.info('writing to elastic index ' + alias_write + ' on ' + self.es_server_url + ' - ' + self.ip_url)
                    self.createDocMappingsMaybe(alias_write, mapping)
                    break
                else:
                    time.sleep(.5)
                    if (connectionAttempts % 10) == 0:
                        self.logger.error('unable to access elasticsearch alias ' + alias_write + ' on ' + self.es_server_url + ' / ' + self.ip_url)
                    continue
            except ElasticHttpError as ex:
                #es error, retry
                self.logger.error(ex)
                if self.runMode and connectionAttempts > 100:
                    self.logger.error('elastic (BU): exiting after 100 ElasticHttpError reports from ' + self.es_server_url)
                    sys.exit(1)
                elif self.runMode == False and connectionAttempts > 10:
                    self.threadEvent.wait(60)
                else:
                    self.threadEvent.wait(1)
                retry = True
                continue
            except (socket.gaierror, ConnectionError, Timeout, RequestsConnectionError, RequestsTimeout) as ex:
                #try to reconnect with different IP from DNS load balancing
                if self.runMode and connectionAttempts > 100:
                    self.logger.error('elastic (BU): exiting after 100 connection attempts to ' + self.es_server_url)
                    sys.exit(1)
                elif self.runMode == False and connectionAttempts > 10:
                    self.threadEvent.wait(60)
                else:
                    self.threadEvent.wait(1)
                retry = True
                continue

    def createDocMappingsMaybe(self, index_name, mapping):
        #update in case of new documents added to mapping definition
        for key in mapping:
            doc = {key: mapping[key]}
            res = requests.get(self.ip_url + '/' + index_name + '/' + key + '/_mapping')
            #only update if mapping is empty
            if res.status_code == 200:
                if res.content.strip() == '{}':
                    self.logger.info('inserting new mapping for ' + str(key))
                    requests.post(self.ip_url + '/' + index_name + '/' + key + '/_mapping', json.dumps(doc))
                else:
                    #still check if number of properties is identical in each type
                    inmapping = json.loads(res.content)
                    for indexname in inmapping:
                        properties = inmapping[indexname]['mappings'][key]['properties']
                        self.logger.info('checking mapping ' + indexname + '/' + key + ' which has ' + str(len(mapping[key]['properties'])) + '(index:' + str(len(properties)) + ') entries..')
                        for pdoc in mapping[key]['properties']:
                            if pdoc not in properties:
                                self.logger.info('inserting mapping for ' + str(key) + ' which is missing mapping property ' + str(pdoc))
                                res = requests.post(self.ip_url + '/' + index_name + '/' + key + '/_mapping', json.dumps(doc))
                                if res.status_code != 200:
                                    self.logger.warning('insert mapping reply status code ' + str(res.status_code) + ': ' + res.content)
                                break
            else:
                self.logger.warning('requests error code ' + str(res.status_code) + ' in mapping request')

    def read_line(self, fullpath):
        with open(fullpath, 'r') as fp:
            return fp.readline()

    def elasticize_modulelegend(self, fullpath):
        self.logger.info(os.path.basename(fullpath))
        document = {}
        #document['_parent']= self.runnumber
        doc_id = "microstatelegend_" + self.runnumber
        if fullpath.endswith('.jsn'):
            try:
                with open(fullpath, 'r') as fp:
                    doc = json.load(fp)
                document['stateNames'] = doc['names']
                try:
                    document['reserved'] = doc['reserved']
                except:
                    document['reserved'] = 33
                try:
                    document['special'] = doc['special']
                except:
                    document['special'] = 7
                nstring = ""
                cnt = 0
                outputcnt = 0
                #fill in also old format for now
                for sname in doc['names']:
                    nstring += str(cnt) + "=" + sname + " "
                    cnt += 1
                    if sname.startswith('hltOutput'):
                        outputcnt += 1
                try:
                    document['output'] = doc['output']
                except:
                    document['output'] = outputcnt
                #document['names'] = nstring
            except Exception as ex:
                self.logger.warning("can not parse " + fullpath + ' ' + str(ex))
        else:
            #old format
            stub = self.read_line(fullpath)
            docnames = self.read_line(fullpath)
            document['reserved'] = 33
            document['special'] = 7
            outputcnt = 0
            for sname in docnames.split():
                if "=hltOutput" in sname:
                    outputcnt += 1
            document['output'] = outputcnt
            document['stateNames'] = []
            nameTokens = docnames.split()
            for nameToken in nameTokens:
                if '=' in nameToken:
                    idx, sn = nameToken.split('=')
                    document["stateNames"].append(sn)
        documents = [document]
        doc_pars = {"parent": str(self.runnumber)}
        return self.index_documents('microstatelegend', documents, doc_id, doc_params=doc_pars, bulk=False)

    def elasticize_pathlegend(self, fullpath):
        self.logger.info(os.path.basename(fullpath))
        document = {}
        #document['_parent']= self.runnumber
        doc_id = "pathlegend_" + self.runnumber
        if fullpath.endswith('.jsn'):
            try:
                with open(fullpath, 'r') as fp:
                    doc = json.load(fp)
                document['stateNames'] = doc['names']
                document['reserved'] = doc['reserved']
                #put old name format value
                nstring = ""
                cnt = 0
                for sname in doc['names']:
                    nstring += str(cnt) + "=" + sname + " "
                    cnt += 1
                document['names'] = nstring
            except Exception as ex:
                self.logger.warning("can not parse " + fullpath)
        else:
            stub = self.read_line(fullpath)
            document['names'] = self.read_line(fullpath)
        documents = [document]
        doc_pars = {"parent": str(self.runnumber)}
        return self.index_documents('pathlegend', documents, doc_id, doc_params=doc_pars, bulk=False)

    def elasticize_inputlegend(self, fullpath):
        self.logger.info(os.path.basename(fullpath))
        document = {}
        doc_id = "inputstatelegend_" + self.runnumber
        try:
            with open(fullpath, 'r') as fp:
                doc = json.load(fp)
            document['stateNames'] = doc['names']
        except Exception as ex:
            self.logger.warning("can not parse " + fullpath)
        documents = [document]
        doc_pars = {"parent": str(self.runnumber)}
        return self.index_documents('inputstatelegend', documents, doc_id, doc_params=doc_pars, bulk=False)

    def elasticize_stream_label(self, infile):
        #elasticize stream name information
        self.logger.info(infile.filepath)
        document = {}
        #document['_parent']= self.runnumber
        document['stream'] = infile.stream[6:]
        doc_id = infile.basename
        doc_pars = {"parent": str(self.runnumber)}
        return self.index_documents('stream_label', [document], doc_id, doc_params=doc_pars, bulk=False)

    def elasticize_runend_time(self, endtime):
        self.logger.info(str(endtime) + " going into buffer")
        doc_id = self.runnumber
        #first update: endtime field
        self.index_documents('run', [{"endTime": endtime}], doc_id, bulk=False, update_only=True)
        #second update: decrease atomically the active BU counter
        #self.index_documents('run',[{"inline":"ctx._source.activeBUs-=1","lang":"painless"}],doc_id,bulk=False,update_only=True,script=True,retry_on_conflict=300)
        self.index_documents('run', [{"inline": "ctx._source.activeBUs-=1"}], doc_id, bulk=False, update_only=True, script=True, retry_on_conflict=300)

    def elasticize_resource_summary(self, jsondoc):
        self.logger.debug('injecting resource summary document')
        jsondoc['appliance'] = self.host
        self.index_documents('resource_summary', [jsondoc], bulk=False)

    def elasticize_box(self, infile):
        basename = infile.basename
        self.logger.debug(basename)
        current_time = time.time()
        if infile.data == {}:
            return
        bu_doc = False
        if basename.startswith('bu') or basename.startswith('dvbu'):
            bu_doc = True
        #check box file against blacklist
        if bu_doc or self.black_list == None:
            self.black_list = []
            try:
                with open(os.path.join(self.conf.watch_directory, 'appliance', 'blacklist'), "r") as fi:
                    try:
                        self.black_list = json.load(fi)
                    except ValueError:
                        #file is being written or corrupted
                        return
            except:
                #blacklist file is not present, do not filter
                pass
        if basename in self.black_list:
            return
        if bu_doc == False:
            try:
                if self.boxdoc_version != infile.data['version']:
                    self.logger.info('skipping ' + basename + ' box file version ' + str(infile.data['version']) + ' which is different from ' + str(self.boxdoc_version))
                    return
            except:
                self.logger.warning("didn't find version field in box file " + basename)
                return
            try:
                self.boxinfoFUMap[basename] = [infile.data, current_time]
            except Exception as ex:
                self.logger.warning('box info not injected: ' + str(ex))
                return
        try:
            document = infile.data
            #unique id for separate instances
            if bu_doc:
                doc_id = self.hostinst
            else:
                doc_id = basename
            document['id'] = doc_id
            try:
                document['activeRunList'] = map(int, document['activeRuns'])
            except:
                pass
            try:
                document['activeRuns'] = map(str, document['activeRuns'])
            except:
                pass
            document['appliance'] = self.host
            document['instance'] = self.conf.instance
            if bu_doc == True:
                document['blacklist'] = self.black_list
            #only here
            document['host'] = basename
            try:
                document.pop('version')
            except:
                pass
            try:
                document.pop('ip')
            except:
                pass
            try:
                document.pop('boot_id')
            except:
                pass
            self.index_documents('boxinfo', [document], doc_id, bulk=False)
        except Exception as ex:
            self.logger.warning('box info not injected: ' + str(ex))
            return

    def elasticize_fubox(self, doc):
        try:
            doc_id = self.host
            doc['host'] = doc_id
            self.index_documents('fu-box-status', [doc], doc_id, bulk=False)
        except Exception as ex:
            self.logger.warning('fu box status not injected: ' + str(ex))

    def elasticize_eols(self, infile):
        basename = infile.basename
        self.logger.info(basename)
        data = infile.data['data']
        data.insert(0, infile.mtime)
        data.insert(0, infile.ls[2:])
        values = [int(f) if f.isdigit() else str(f) for f in data]
        try:
            keys = ["ls", "fm_date", "NEvents", "NFiles", "TotalEvents", "NLostEvents", "NBytes"]
            document = dict(zip(keys, values))
        except:
            #try without NBytes
            keys = ["ls", "fm_date", "NEvents", "NFiles", "TotalEvents", "NLostEvents"]
            document = dict(zip(keys, values))
        doc_id = infile.name + "_" + self.host
        document['id'] = doc_id
        #document['_parent']= self.runnumber
        document['appliance'] = self.host
        documents = [document]
        doc_pars = {"parent": str(self.runnumber)}
        self.index_documents('eols', documents, doc_id, doc_params=doc_pars, bulk=False)

    def index_documents(self, name, documents, doc_id=None, doc_params=None, bulk=True, overwrite=True, update_only=False, retry_on_conflict=0, script=False):
        if name == 'fu-box-status' or name.startswith("boxinfo") or name == 'resource_summary':
            destination_index = self.boxinfo_write
            is_box = True
        else:
            destination_index = self.runindex_write
            is_box = False
        attempts = 0
        while True:
            attempts += 1
            try:
                if bulk:
                    self.es.bulk_index(destination_index, name, documents)
                else:
                    if doc_id:
                        if update_only:
                            if script:
                                self.es.update(index=destination_index, doc_type=name, id=doc_id, script=documents[0], upsert=False, retry_on_conflict=retry_on_conflict)
                            else:
                                self.es.update(index=destination_index, doc_type=name, id=doc_id, doc=documents[0], upsert=False, retry_on_conflict=retry_on_conflict)
                        else:
                            #overwrite existing can be used with id specified
                            if doc_params:
                                self.es.index(destination_index, name, documents[0], doc_id, parent=doc_params['parent'], overwrite_existing=overwrite)
                            else:
                                self.es.index(destination_index, name, documents[0], doc_id, overwrite_existing=overwrite)
                    else:
                        self.es.index(destination_index, name, documents[0])
                return True
            except ElasticHttpError as ex:
                if name == 'run' and ex[0] == 409:
                    #create failed because overwrite was forbidden
                    return (False, ex[0])
                if ex[0] == 429:
                    if attempts < 10 and not is_box:
                        self.logger.warning('elasticsearch HTTP error 429 ' + str(ex) + '. retrying..')
                        time.sleep(.1)
                        continue
                else:
                    if attempts <= 1 and not is_box:
                        continue
                if is_box:
                    self.logger.warning('elasticsearch HTTP error ' + str(ex) + '. skipping document ' + name)
                else:
                    self.logger.error('elasticsearch HTTP error ' + str(ex) + '. skipping document ' + name)
                return False
            except (socket.gaierror, ConnectionError, Timeout) as ex:
                if attempts > 100 and self.runMode:
                    raise ex
                if is_box or attempts <= 1:
                    self.logger.warning('elasticsearch connection error ' + str(ex) + '. retry.')
                elif (attempts - 2) % 10 == 0:
                    self.logger.error('elasticsearch connection error ' + str(ex) + '. retry.')
                if self.stopping:
                    return False
                ip_url = getURLwithIP(self.es_server_url, self.nsslock)
                self.es = ElasticSearch(ip_url, timeout=20)
                time.sleep(0.1)
                if is_box == True:
                    #give up on too many box retries as they are indexed again every 5 seconds
                    break
        return False
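A hedged usage sketch for the class above. conf is an hltd-style configuration object and the run number, timestamps and ids shown are hypothetical:

eb = elasticBandBU(conf, 334567, time.time(), runMode=True)
eb.index_documents('eols', [doc], doc_id='run334567_ls0001_host',
                   doc_params={'parent': '334567'}, bulk=False)

Note the design of index_documents: passing update_only=True with script=True routes documents[0] through es.update() as a script body, which is how the activeBUs counter is incremented and decremented atomically, with retry_on_conflict absorbing concurrent updates from other BUs.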
plist = x[1][1]
pdict = {}
for i in range(len(plist)):
    pdict[i] = json.loads(plist[i][1])
with POOL.connection() as connection:
    tagview = connection.table('top_tags')
    rowkey = "%016i" % int(x[0]) + hashlib.md5(str(x[1][0])).digest()
    tagview.put(rowkey, {
        "p:tag": str(x[1][0]),
        "p:dump": json.dumps(pdict)
    })

#sample input
# (u"102", ((5, 5), "{"photo": {"timeposted": 1422939564, "description": "pdes", "tags": "ptag1,ptag3", "URL": "purl", "title": "ptitle", "pid": "102", "location": {"latitude": "plat", "longitude": "plon"}}}"))
ES = ElasticSearch("http://localhost:9200")

def saveESDocuments(x):
    print "writing to es.., pid,", x[0]
    parsedrawdata = json.loads(x[1][1])
    document = {
        "pid": int(x[0]),
        "likes": x[1][0][0],
        "views": x[1][0][1],
        "location": {
            "lat": parsedrawdata["photo"]["location"]["latitude"],
            "lon": parsedrawdata["photo"]["location"]["longitude"]
        }
    }
    ES.index('photo_geos', 'photos', document, id=document['pid'])
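A sketch of driving saveESDocuments with a tuple shaped like the sample input above; the coordinate values are hypothetical, and in the original this function is presumably mapped over a collection of such (pid, ((likes, views), raw_json)) tuples:

sample = (u"102", ((5, 5), '{"photo": {"location": {"latitude": "40.7", "longitude": "-74.0"}}}'))
saveESDocuments(sample)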
if len(sys.argv) >= 4:
    command = sys.argv[1]
    server_url = sys.argv[2]
    index_name = sys.argv[3]
else:
    print "Parameters: command[create,alias,mapping] server url, index.alias name (target index)"
    print " COMMANDS:"
    print " create: create index"
    print " alias: create index *_read and *_write aliases (optional parameter: target index)"
    print " mapping: create missing document mappings for the index"
    sys.exit(1)

if server_url.startswith('http://') == False:
    server_url = 'http://' + server_url

#connection
es = ElasticSearch(server_url)

#pick mapping
if index_name.startswith('runindex'):
    my_settings = mappings.central_es_settings
    my_mapping = mappings.central_runindex_mapping
if index_name.startswith('boxinfo'):
    my_settings = mappings.central_es_settings
    my_mapping = mappings.central_boxinfo_mapping
if index_name.startswith('hltdlogs'):
    my_settings = mappings.central_es_settings_hltlogs
    my_mapping = mappings.central_hltdlogs_mapping

#alias convention
alias_write = index_name + "_write"
alias_read = index_name + "_read"
class elasticBandBU:

    def __init__(self, conf, runnumber, startTime, runMode=True, nsslock=None):
        self.logger = logging.getLogger(self.__class__.__name__)
        self.conf = conf
        self.es_server_url = conf.elastic_runindex_url
        self.runindex_write = "runindex_" + conf.elastic_runindex_name + "_write"
        self.runindex_read = "runindex_" + conf.elastic_runindex_name + "_read"
        self.runindex_name = "runindex_" + conf.elastic_runindex_name
        self.boxinfo_write = "boxinfo_" + conf.elastic_runindex_name + "_write"
        self.boxinfo_read = "boxinfo_" + conf.elastic_runindex_name + "_read"
        self.boxinfo_name = "boxinfo_" + conf.elastic_runindex_name
        self.runnumber = str(runnumber)
        self.startTime = startTime
        self.host = os.uname()[1]
        self.stopping = False
        self.threadEvent = threading.Event()
        self.runMode = runMode
        self.boxinfoFUMap = {}
        self.ip_url = None
        self.nsslock = nsslock
        self.updateIndexMaybe(self.runindex_name, self.runindex_write, self.runindex_read, mappings.central_es_settings, mappings.central_runindex_mapping)
        self.updateIndexMaybe(self.boxinfo_name, self.boxinfo_write, self.boxinfo_read, mappings.central_es_settings, mappings.central_boxinfo_mapping)
        self.black_list = None
        if self.conf.instance == 'main':
            self.hostinst = self.host
        else:
            self.hostinst = self.host + '_' + self.conf.instance
        #write run number document
        if runMode == True and self.stopping == False:
            document = {}
            document['runNumber'] = self.runnumber
            document['startTime'] = startTime
            documents = [document]
            self.index_documents('run', documents)
            #except ElasticHttpError as ex:
            #    self.logger.info(ex)
            #    pass

    def updateIndexMaybe(self, index_name, alias_write, alias_read, settings, mapping):
        connectionAttempts = 0
        retry = False
        while True:
            if self.stopping:
                break
            connectionAttempts += 1
            try:
                if retry or self.ip_url == None:
                    self.ip_url = getURLwithIP(self.es_server_url, self.nsslock)
                    self.es = ElasticSearch(self.ip_url, timeout=20)
                #check if runindex alias exists
                if requests.get(self.ip_url + '/_alias/' + alias_write).status_code == 200:
                    self.logger.info('writing to elastic index ' + alias_write + ' on ' + self.es_server_url + ' - ' + self.ip_url)
                    self.createDocMappingsMaybe(alias_write, mapping)
                    break
                else:
                    time.sleep(.5)
                    if (connectionAttempts % 10) == 0:
                        self.logger.error('unable to access elasticsearch alias ' + alias_write + ' on ' + self.es_server_url + ' / ' + self.ip_url)
                    continue
            except ElasticHttpError as ex:
                #es error, retry
                self.logger.error(ex)
                if self.runMode and connectionAttempts > 100:
                    self.logger.error('elastic (BU): exiting after 100 ElasticHttpError reports from ' + self.es_server_url)
                    sys.exit(1)
                elif self.runMode == False and connectionAttempts > 10:
                    self.threadEvent.wait(60)
                else:
                    self.threadEvent.wait(1)
                retry = True
                continue
            except (socket.gaierror, ConnectionError, Timeout, RequestsConnectionError, RequestsTimeout) as ex:
                #try to reconnect with different IP from DNS load balancing
                if self.runMode and connectionAttempts > 100:
                    self.logger.error('elastic (BU): exiting after 100 connection attempts to ' + self.es_server_url)
                    sys.exit(1)
                elif self.runMode == False and connectionAttempts > 10:
                    self.threadEvent.wait(60)
                else:
                    self.threadEvent.wait(1)
                retry = True
                continue

    def createDocMappingsMaybe(self, index_name, mapping):
        #update in case of new documents added to mapping definition
        for key in mapping:
            doc = {key: mapping[key]}
            res = requests.get(self.ip_url + '/' + index_name + '/' + key + '/_mapping')
            #only update if mapping is empty
            if res.status_code == 200:
                if res.content.strip() == '{}':
                    requests.post(self.ip_url + '/' + index_name + '/' + key + '/_mapping', json.dumps(doc))
                else:
                    #still check if number of properties is identical in each type
                    inmapping = json.loads(res.content)
                    for indexname in inmapping:
                        properties = inmapping[indexname]['mappings'][key]['properties']
                        self.logger.info('checking mapping ' + indexname + '/' + key + ' which has ' + str(len(mapping[key]['properties'])) + '(index:' + str(len(properties)) + ') entries..')
                        #should be size 1
                        for pdoc in mapping[key]['properties']:
                            if pdoc not in properties:
                                requests.post(self.ip_url + '/' + index_name + '/' + key + '/_mapping', json.dumps(doc))
                                break
            else:
                self.logger.warning('requests error code ' + str(res.status_code) + ' in mapping request')

    def read_line(self, fullpath):
        with open(fullpath, 'r') as fp:
            return fp.readline()

    def elasticize_modulelegend(self, fullpath):
        self.logger.info(os.path.basename(fullpath))
        stub = self.read_line(fullpath)
        document = {}
        document['_parent'] = self.runnumber
        document['id'] = "microstatelegend_" + self.runnumber
        document['names'] = self.read_line(fullpath)
        documents = [document]
        return self.index_documents('microstatelegend', documents)

    def elasticize_pathlegend(self, fullpath):
        self.logger.info(os.path.basename(fullpath))
        stub = self.read_line(fullpath)
        document = {}
        document['_parent'] = self.runnumber
        document['id'] = "pathlegend_" + self.runnumber
        document['names'] = self.read_line(fullpath)
        documents = [document]
        return self.index_documents('pathlegend', documents)

    def elasticize_runend_time(self, endtime):
        self.logger.info(str(endtime) + " going into buffer")
        document = {}
        document['runNumber'] = self.runnumber
        document['startTime'] = self.startTime
        document['endTime'] = endtime
        documents = [document]
        self.index_documents('run', documents)

    def elasticize_box(self, infile):
        basename = infile.basename
        self.logger.debug(basename)
        current_time = time.time()
        if infile.data == {}:
            return
        bu_doc = False
        if basename.startswith('bu') or basename.startswith('dvbu'):
            bu_doc = True
        #check box file against blacklist
        if bu_doc or self.black_list == None:
            self.black_list = []
            try:
                with open(os.path.join(self.conf.watch_directory, 'appliance', 'blacklist'), "r") as fi:
                    try:
                        self.black_list = json.load(fi)
                    except ValueError:
                        #file is being written or corrupted
                        return
            except:
                #blacklist file is not present, do not filter
                pass
        if basename in self.black_list:
            return
        if bu_doc == False:
            try:
                self.boxinfoFUMap[basename] = [infile.data, current_time]
            except Exception as ex:
                self.logger.warning('box info not injected: ' + str(ex))
                return
        try:
            document = infile.data
            #unique id for separate instances
            if bu_doc:
                document['id'] = self.hostinst
            else:
                document['id'] = basename
            #both here and in "boxinfo_appliance"
            document['appliance'] = self.host
            document['instance'] = self.conf.instance
            #only here
            document['host'] = basename
            try:
                document['detectedStaleHandle'] = bool(document['detectedStaleHandle'] == 'True')
            except:
                pass
            self.index_documents('boxinfo', [document])
        except Exception as ex:
            self.logger.warning('box info not injected: ' + str(ex))
            return
        if bu_doc:
            try:
                document = infile.data
                try:
                    document.pop('id')
                except:
                    pass
                try:
                    document.pop('host')
                except:
                    pass
                #aggregation from FUs
                document['idles'] = 0
                document['used'] = 0
                document['broken'] = 0
                document['quarantined'] = 0
                document['cloud'] = 0
                document['usedDataDir'] = 0
                document['totalDataDir'] = 0
                document['hosts'] = [basename]
                document['blacklistedHosts'] = []
                for key in self.boxinfoFUMap:
                    dpair = self.boxinfoFUMap[key]
                    d = dpair[0]
                    #check if entry is not older than 10 seconds
                    if current_time - dpair[1] > 10:
                        continue
                    document['idles'] += int(d['idles'])
                    document['used'] += int(d['used'])
                    document['broken'] += int(d['broken'])
                    document['quarantined'] += int(d['quarantined'])
                    document['cloud'] += int(d['cloud'])
                    document['usedDataDir'] += int(d['usedDataDir'])
                    document['totalDataDir'] += int(d['totalDataDir'])
                    document['hosts'].append(key)
                for blacklistedHost in self.black_list:
                    document['blacklistedHosts'].append(blacklistedHost)
                self.index_documents('boxinfo_appliance', [document], bulk=False)
            except Exception as ex:
                #in case of malformed box info
                self.logger.warning('box info not injected: ' + str(ex))
                return

    def elasticize_eols(self, infile):
        basename = infile.basename
        self.logger.info(basename)
        data = infile.data['data']
        data.insert(0, infile.mtime)
        data.insert(0, infile.ls[2:])
        values = [int(f) if f.isdigit() else str(f) for f in data]
        keys = ["ls", "fm_date", "NEvents", "NFiles", "TotalEvents", "NLostEvents"]
        document = dict(zip(keys, values))
        document['id'] = infile.name + "_" + os.uname()[1]
        document['_parent'] = self.runnumber
        documents = [document]
        self.index_documents('eols', documents)

    def index_documents(self, name, documents, bulk=True):
        attempts = 0
        destination_index = ""
        is_box = False
        if name.startswith("boxinfo"):
            destination_index = self.boxinfo_write
            is_box = True
        else:
            destination_index = self.runindex_write
        while True:
            attempts += 1
            try:
                if bulk:
                    self.es.bulk_index(destination_index, name, documents)
                else:
                    self.es.index(destination_index, name, documents[0])
                return True
            except ElasticHttpError as ex:
                if attempts <= 1:
                    continue
                self.logger.error('elasticsearch HTTP error. skipping document ' + name)
                if is_box:
                    break
                #self.logger.exception(ex)
                return False
            except (socket.gaierror, ConnectionError, Timeout) as ex:
                if attempts > 100 and self.runMode:
                    raise ex
                self.logger.error('elasticsearch connection error ' + str(ex) + '. retry.')
                if self.stopping:
                    return False
                ip_url = getURLwithIP(self.es_server_url, self.nsslock)
                self.es = ElasticSearch(ip_url, timeout=20)
                time.sleep(0.1)
                if is_box:
                    break
        return False
class Collation:
    def __init__(self, es_server_url):
        self.server = ElasticSearch(es_server_url)
        self.datadict = {
            'prc-out': {
                "lookup": Query('prc-out', 'source'),
                "action": {
                    'definition': Aggregator('drop'),
                    'data': Aggregator({
                        'in': Aggregator('add'),
                        'out': Aggregator('add'),
                        'file': Aggregator('cat')
                    }),
                    'ls': Aggregator('check'),
                    'stream': Aggregator('check'),
                    'source': Aggregator('match')
                }
            },
            'prc-in': {
                "lookup": Query('prc-in', 'dest'),
                "action": {
                    'definition': Aggregator('drop'),
                    'data': Aggregator({
                        'out': Aggregator('add'),
                    }),
                    'ls': Aggregator('check'),
                    'index': Aggregator('cat'),
                    'source': Aggregator('check'),
                    'dest': Aggregator('check'),
                    'process': Aggregator('cat')
                }
            },
            'prc-s-state': {
                "lookup": Query('prc-s-state'),
                "action": {
                    'macro': Aggregator('histoadd'),
                    'mini': Aggregator('histoadd'),
                    'micro': Aggregator('histoadd'),
                    'tp': Aggregator('add'),
                    'lead': Aggregator('avg'),
                    'nfiles': Aggregator('add'),
                    'ls': Aggregator('check'),
                    'process': Aggregator('cat')
                }
            }
        }

    def lookup(self, doctype):
        return self.datadict[doctype]['lookup']

    def action(self, doctype):
        return self.datadict[doctype]['action']
        #print datadict[type]['lookup']

    def search(self, ind, doctype, ls, stream=None):
        if stream:
            result = self.server.search(self.lookup(doctype)(ls, stream), index=ind)
        else:
            result = self.server.search(self.lookup(doctype)(ls), index=ind)
        return result

    def collate(self, ind, doctype, ls, stream=None):
        result = self.search(ind, doctype, ls, stream)
        for element in result['hits']['hits']:
            for k, v in element['_source'].items():
                self.action(doctype)[k](v)
        retval = dict((k, v.value()) for k, v in self.action(doctype).items())
        for v in self.action(doctype).values():
            v.reset()
        return retval

    def refresh(self, ind):
        self.server.refresh(ind)

    def stash(self, ind, doctype, doc):
        result = self.server.index(ind, doctype, doc)
        return result
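A hedged usage sketch for Collation above; the 'run100' index, lumisection number and stream name are hypothetical:

coll = Collation('http://localhost:9200')
coll.refresh('run100')
# aggregate all prc-out hits for lumisection 42 of stream A into one summary doc
summary = coll.collate('run100', 'prc-out', 42, stream='A')
coll.stash('run100', 'prc-out', summary)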
def __init__(self, es_server_url, runnumber, startTime, runMode=True):
    self.logger = logging.getLogger(self.__class__.__name__)
    self.es_server_url = es_server_url
    self.index_name = conf.elastic_runindex_name
    self.runnumber = str(runnumber)
    self.startTime = startTime
    self.host = os.uname()[1]
    self.stopping = False
    self.threadEvent = threading.Event()
    self.runMode = runMode
    self.settings = {
        "analysis": {
            "analyzer": {
                "prefix-test-analyzer": {
                    "type": "custom",
                    "tokenizer": "prefix-test-tokenizer"
                }
            },
            "tokenizer": {
                "prefix-test-tokenizer": {
                    "type": "path_hierarchy",
                    "delimiter": " "
                }
            }
        },
        "index": {
            'number_of_shards': 10,
            'number_of_replicas': 3
        },
    }
    self.run_mapping = {
        'run': {
            #'_routing': {
            #    'required': True,
            #    'path': 'runNumber'
            #},
            '_id': {
                'path': 'runNumber'
            },
            'properties': {
                'runNumber': {'type': 'integer'},
                'startTimeRC': {'type': 'date'},
                'stopTimeRC': {'type': 'date'},
                'startTime': {'type': 'date'},
                'endTime': {'type': 'date'},
                'completedTime': {'type': 'date'}
            },
            '_timestamp': {
                'enabled': True,
                'store': 'yes'
            }
        },
        'microstatelegend': {
            '_id': {'path': 'id'},
            '_parent': {'type': 'run'},
            'properties': {
                'names': {'type': 'string'},
                'id': {'type': 'string'}
            }
        },
        'pathlegend': {
            '_id': {'path': 'id'},
            '_parent': {'type': 'run'},
            'properties': {
                'names': {'type': 'string'},
                'id': {'type': 'string'}
            }
        },
        'boxinfo': {
            '_id': {'path': 'id'},  #TODO:remove
            'properties': {
                'fm_date': {'type': 'date'},
                'id': {'type': 'string'},
                'broken': {'type': 'integer'},
                'used': {'type': 'integer'},
                'idles': {'type': 'integer'},
                'quarantined': {'type': 'integer'},
                'usedDataDir': {'type': 'integer'},
                'totalDataDir': {'type': 'integer'},
                'usedRamdisk': {'type': 'integer'},
                'totalRamdisk': {'type': 'integer'},
                'usedOutput': {'type': 'integer'},
                'totalOutput': {'type': 'integer'},
                'activeRuns': {'type': 'string'}
            },
            '_timestamp': {
                'enabled': True,
                'store': "yes",
                "path": "fm_date"
            },
            '_ttl': {
                'enabled': True,
                'default': '30d'
            }
        },
        'boxinfo_last': {
            '_id': {'path': 'id'},
            'properties': {
                'fm_date': {'type': 'date'},
                'id': {'type': 'string'},
                'broken': {'type': 'integer'},
                'used': {'type': 'integer'},
                'idles': {'type': 'integer'},
                'quarantined': {'type': 'integer'},
                'usedDataDir': {'type': 'integer'},
                'totalDataDir': {'type': 'integer'},
                'usedRamdisk': {'type': 'integer'},
                'totalRamdisk': {'type': 'integer'},
                'usedOutput': {'type': 'integer'},
                'totalOutput': {'type': 'integer'},
                'activeRuns': {'type': 'string'}
            },
            '_timestamp': {
                'enabled': True,
                'store': "yes",
                "path": "fm_date"
            }
        },
        'eols': {
            '_id': {'path': 'id'},
            '_parent': {'type': 'run'},
            'properties': {
                'fm_date': {'type': 'date'},
                'id': {'type': 'string'},
                'ls': {'type': 'integer'},
                'NEvents': {'type': 'integer'},
                'NFiles': {'type': 'integer'},
                'TotalEvents': {'type': 'integer'}
            },
            '_timestamp': {
                'enabled': True,
                'store': "yes",
                "path": "fm_date"
            },
        },
        'minimerge': {
            '_id': {'path': 'id'},
            '_parent': {'type': 'run'},
            'properties': {
                'fm_date': {'type': 'date'},
                'id': {'type': 'string'},  #run+appliance+stream+ls
                'appliance': {'type': 'string'},
                'stream': {'type': 'string', 'index': 'not_analyzed'},
                'ls': {'type': 'integer'},
                'processed': {'type': 'integer'},
                'accepted': {'type': 'integer'},
                'errorEvents': {'type': 'integer'},
                'size': {'type': 'integer'},
            }
        }
    }
    connectionAttempts = 0
    while True:
        if self.stopping:
            break
        connectionAttempts += 1
        try:
            self.logger.info('writing to elastic index ' + self.index_name)
            ip_url = getURLwithIP(es_server_url)
            self.es = ElasticSearch(es_server_url)
            self.es.create_index(self.index_name, settings={
                'settings': self.settings,
                'mappings': self.run_mapping
            })
            break
        except ElasticHttpError as ex:
            #this is normally fine as the index gets created somewhere across the cluster
            if "IndexAlreadyExistsException" in str(ex):
                self.logger.info(ex)
                break
            else:
                self.logger.error(ex)
                if runMode and connectionAttempts > 100:
                    self.logger.error('elastic (BU): exiting after 100 ElasticHttpError reports from ' + es_server_url)
                    sys.exit(1)
                elif runMode == False and connectionAttempts > 10:
                    self.threadEvent.wait(60)
                else:
                    self.threadEvent.wait(1)
                continue
        except (ConnectionError, Timeout) as ex:
            #try to reconnect with different IP from DNS load balancing
            if runMode and connectionAttempts > 100:
                self.logger.error('elastic (BU): exiting after 100 connection attempts to ' + es_server_url)
                sys.exit(1)
            elif runMode == False and connectionAttempts > 10:
                self.threadEvent.wait(60)
            else:
                self.threadEvent.wait(1)
            continue
    #write run number document
    if runMode == True:
        document = {}
        document['runNumber'] = self.runnumber
        document['startTime'] = startTime
        documents = [document]
        self.index_documents('run', documents)
from pyelasticsearch.client import ElasticSearch
import sys

# by default we connect to localhost:9200
if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Usage: [*.py] [lat] [lon] [R]"
        sys.exit(0)
    es = ElasticSearch("http://localhost:9200/")
    lat = float(sys.argv[1])
    lon = float(sys.argv[2])
    r = float(sys.argv[3])
    print lat, lon, r
    query = {
        "from": 0,
        "size": 10,
        "query": {"match_all": {}},
        "filter": {"geo_distance": {"distance": str(r) + "km", "location": {"lat": lat, "lon": lon}}},
        "sort": [{"_geo_distance": {"location": {"lat": lat, "lon": lon}, "order": "asc", "unit": "km"}}],
    }
    query1 = {"from": 0, "size": 10, "query": {"match_all": {}}, "sort": [{"likes": {"order": "desc"}}, "_score"]}
    query_count = {"facets": {"count_by_type": {"terms": {"field": "_type"}}}}
    # res = es.search(query, index='photo_geos', doc_type=['photos'])
    res = es.search(query_count, index="geos", doc_type=["user_geos"])
    print res
    sys.exit(0)
    uids = [
        (
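An example invocation of the script above (the script name and values are hypothetical): it asks for documents within 25 km of the given point, using a geo_distance filter with results sorted by _geo_distance ascending.

# python geo_query.py 40.7128 -74.0060 25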
from pyelasticsearch.client import ElasticSearch
import pyelasticsearch
import sys
import os

SCRIPT_DIR = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(SCRIPT_DIR + '/..'))
from config import settings

connection = ElasticSearch(settings.ES_HOST)

class QueryBuilder:
    def __init__(self, query, size):
        self.size = size
        self.should_query = []
        self.searchQuery = query
        self.edgeword, self.keyword = self.processText()

    def processText(self):
        text_split = self.searchQuery.split(" ")
        if len(text_split) == 1:
            return self.searchQuery, None
        edgeword = text_split.pop()
        keyword = " ".join(text_split)
        return edgeword, keyword
from datetime import datetime
from functools import partial
# ElasticSearch and PREFIXES (the list of source prefixes) are provided by the
# surrounding module


class Query(object):
    def __init__(self, url='http://localhost:9200/', index='events'):
        self._es = ElasticSearch(url)
        self._index = index

    def last_request_took(self):
        '''Processing time in milliseconds as reported by ElasticSearch,
        excluding our client-side overhead.
        '''
        return self._last_request_took

    def _build_filter(self, event_type, start=None, end=None, source=None,
                      **kwargs):
        '''Build an 'AND' filter that combines filters:

        1. correct `event_type`
        2. timestamp greater than or equal to `start` (if provided)
        3. timestamp less than or equal to `end` (if provided)
        4. filter by values of terms in kwargs
        '''
        filters = []

        # 0. event type
        filters.append({
            'term': {'_type': event_type}
        })

        if source is not None:
            filters.append({
                'term': {'source': source}
            })

        for term_name, term_values in kwargs.iteritems():
            if term_values:
                # kwargs arrive pluralized (venues, posters) while the mapped
                # fields are singular (venue, poster); the field name used to be
                # hardcoded to 'venue', which broke filtering by poster
                field = term_name[:-1] if term_name.endswith('s') else term_name
                terms = {
                    field: term_values,
                    'execution': 'or'
                }
                filters.append({
                    # XXX see if this query speeds up things
                    # UPD: not really
                    # if mapping uses analyzed strings
                    # 'query': {
                    #     'query_string': {
                    #         'query': ' OR '.join(term_values)
                    #     }
                    # }
                    'terms': terms,
                })

        timestamp_range = {}
        if start:
            timestamp_range['gte'] = start
        if end:
            timestamp_range['lte'] = end

        # 1. if a timestamp range is provided - add it
        if timestamp_range:
            filters.append({
                'range': {'timestamp': timestamp_range}
            })

        # BOOL filter is more performant than AND:
        # http://www.elasticsearch.org/blog/all-about-elasticsearch-filter-bitsets/
        # return {'and': filters}
        return {
            'bool': {
                'must': filters
            }
        }

    def total(self, event_type, start=None, end=None, venues=[], posters=[]):
        '''Returns the event's sum of deltas broken down per source:

        {
            'IG': 25.0,
            'FB': 3.0,
            ...
        }

        Can be filtered by start and end dates, venues or posters.
        '''
        filters = self._build_filter(event_type, start=start, end=end,
                                     venues=venues, posters=posters)
        query = {
            'facets': {
                'events_deltas_totals': {
                    'terms_stats': {
                        'key_field': 'source',
                        'value_field': 'delta'
                    },
                    'facet_filter': filters
                }
            }
        }
        result = self._es._search_or_count(
            '_search', query,
            index=self._index,
            query_params={'search_type': 'count'}
        )
        self._last_request_took = result['took']
        facets = result['facets']['events_deltas_totals']['terms']
        return {
            f['term']: f['total'] for f in facets
        }

    def top_terms(self, event_type, term, limit=10, start=None, end=None,
                  venues=[], posters=[]):
        '''Returns `limit` top terms with their count.

        `term` can be one of: `poster`, `venue`, `source`.

        This is a more flexible version of top posters. The rest of the
        arguments do the same as in the `total` function.
        '''
        assert term in ('poster', 'venue', 'source')
        filters = self._build_filter(event_type, start=start, end=end,
                                     venues=venues, posters=posters)
        query = {
            'facets': {
                'top': {
                    'terms': {
                        'field': term,
                        'size': limit
                    },
                    'facet_filter': filters
                }
            }
        }
        result = self._es._search_or_count(
            '_search', query,
            index=self._index,
            query_params={'search_type': 'count'}
        )
        self._last_request_took = result['took']
        facets = result['facets']['top']['terms']
        return facets

    def _format_histogram_facet_values(self, values):
        return [
            {'time': datetime.utcfromtimestamp(v['time'] / 1000),
             'total': v['total']}
            for v in values['entries']
        ]

    def histogram(self, event_type, interval, start=None, end=None,
                  venues=[], posters=[], sources_facets=PREFIXES,
                  include_total=False):
        '''Returns a histogram of events deltas totals in buckets `interval`
        apart.

        {
            'total': [
                {'time': <datetime-1>, 'total': 3.0},
                {'time': <datetime-2>, 'total': 1.0},
            ],
            'FB': [
                ...
            ]
        }

        Filter parameters are the same as in the `total` method.

        Source facets are taken from the `sources_facets` param. If you don't
        want them, just pass an empty list. The aggregate 'total' facet is
        added only when `include_total` is set to `True` (it is off by
        default, matching the signature above).
        '''
        filter_builder = partial(self._build_filter, event_type,
                                 start=start, end=end,
                                 venues=venues, posters=posters)
        date_histogram_value = {
            'key_field': 'timestamp',
            'value_field': 'delta',
            'interval': interval,
        }
        facets = {}
        if include_total:
            filters = filter_builder(source=None)
            facets['total'] = {
                'date_histogram': date_histogram_value,
                'facet_filter': filters
            }
        for source in sources_facets:
            filters = filter_builder(source=source)
            payload = {
                'date_histogram': date_histogram_value,
                'facet_filter': filters
            }
            facets[source] = payload

        result = self._es._search_or_count(
            '_search', query={'facets': facets},
            index=self._index,
            query_params={'search_type': 'count'}
        )
        self._last_request_took = result['took']
        return {
            facet: self._format_histogram_facet_values(values)
            for facet, values in result['facets'].iteritems()
        }
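# Illustrative usage sketch for the Query class above. The URL, index name,
# event type and filter values are hypothetical; a live ES node holding the
# 'events' index is assumed, as in the rest of this module.
if __name__ == '__main__':
    q = Query(url='http://localhost:9200/', index='events')
    # per-source totals for one event type, limited to two venues
    print q.total('like', start='2014-01-01', venues=['FB-venue-1', 'IG-venue-2'])
    # five most active posters for the same event type
    print q.top_terms('like', 'poster', limit=5)
    # daily buckets, with the aggregate 'total' facet explicitly requested
    print q.histogram('like', interval='day', include_total=True)
    print q.last_request_took()  # ES-reported time in ms for the last call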
class Indexer(object):
    def __init__(self, url='http://localhost:9200/', index='events'):
        self._es = ElasticSearch(url)
        self._es.json_encoder = ESJSONEncoder
        self._index = index

    def cleanup(self):
        try:
            self._es.delete_index(self._index)
        except ElasticHttpNotFoundError:
            pass
        self._es.create_index(self._index, settings={
            'index': {
                'mapper': {
                    'dynamic': False
                }
            }
        })
        #three candidate mappings; only the not_analyzed variant is applied below
        not_analyzed_mapping = {
            'properties': {
                'timestamp': {'type': 'date', 'format': 'dateOptionalTime'},
                'source': {'type': 'string', 'index': 'not_analyzed'},
                'venue': {'type': 'string', 'index': 'not_analyzed'},
                'poster': {'type': 'string', 'index': 'not_analyzed'},
                'delta': {'type': 'integer'}
            }
        }
        analyzed_mapping = {
            'properties': {
                'timestamp': {'type': 'date', 'format': 'dateOptionalTime'},
                'source': {'type': 'string', 'analyzer': 'keyword'},
                'venue': {'type': 'string', 'analyzer': 'keyword'},
                'poster': {'type': 'string', 'analyzer': 'keyword'},
                'delta': {'type': 'integer'}
            }
        }
        hybrid_mapping = {
            'properties': {
                'timestamp': {'type': 'date', 'format': 'dateOptionalTime'},
                'source': {'type': 'string', 'analyzer': 'keyword'},
                'venue': {'type': 'string', 'analyzer': 'whitespace'},
                'poster': {'type': 'string', 'analyzer': 'whitespace'},
                'delta': {'type': 'integer'}
            }
        }
        mapping = not_analyzed_mapping
        self._es.put_mapping(self._index, 'post', {'post': mapping})

    def add(self, event):
        data = {
            'timestamp': event['timestamp'],
            'source': event['_id']['source'],
            'venue': '{}-{}'.format(event['_id']['source'], event['venue']),
            'poster': '{}-{}'.format(event['_id']['source'], event['poster']),
            'delta': event.get('delta', 1)
        }
        self._es.index(
            self._index,
            event.get('type').lower(),
            data,
            id='{source}-{id}'.format(**event['_id'])
        )
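# Illustrative usage sketch for Indexer; the event dict below is hypothetical
# but follows the shape add() expects (an '_id' sub-dict with 'source' and
# 'id', plus 'type', 'timestamp', 'venue', 'poster' and an optional 'delta').
if __name__ == '__main__':
    indexer = Indexer(url='http://localhost:9200/', index='events')
    indexer.cleanup()  # drop and recreate the index with the not_analyzed mapping
    indexer.add({
        '_id': {'source': 'FB', 'id': '12345'},
        'type': 'Like',          # lowercased into the doc type 'like'
        'timestamp': '2014-01-01T12:00:00',
        'venue': 'some-venue',   # stored prefixed as 'FB-some-venue'
        'poster': 'some-poster',
        'delta': 2,
    })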
class elasticBandBU:

    def __init__(self, conf, runnumber, startTime, runMode=True, nsslock=None,
                 box_version=None):
        self.logger = logging.getLogger(self.__class__.__name__)
        self.conf = conf
        self.es_server_url = conf.elastic_runindex_url
        self.runindex_write = "runindex_" + conf.elastic_runindex_name + "_write"
        self.runindex_read = "runindex_" + conf.elastic_runindex_name + "_read"
        self.runindex_name = "runindex_" + conf.elastic_runindex_name
        self.boxinfo_write = "boxinfo_" + conf.elastic_runindex_name + "_write"
        self.boxinfo_read = "boxinfo_" + conf.elastic_runindex_name + "_read"
        self.boxinfo_name = "boxinfo_" + conf.elastic_runindex_name
        self.boxdoc_version = box_version
        self.runnumber = str(runnumber)
        self.startTime = startTime
        self.host = os.uname()[1]
        self.stopping = False
        self.threadEvent = threading.Event()
        self.runMode = runMode
        self.boxinfoFUMap = {}
        self.ip_url = None
        self.nsslock = nsslock
        self.updateIndexMaybe(self.runindex_name, self.runindex_write,
                              self.runindex_read, mappings.central_es_settings,
                              mappings.central_runindex_mapping)
        self.updateIndexMaybe(self.boxinfo_name, self.boxinfo_write,
                              self.boxinfo_read, mappings.central_es_settings,
                              mappings.central_boxinfo_mapping)
        self.black_list = None
        if self.conf.instance == 'main':
            self.hostinst = self.host
        else:
            self.hostinst = self.host + '_' + self.conf.instance
        #write run number document
        if runMode == True and self.stopping == False:
            document = {}
            document['runNumber'] = self.runnumber
            document['startTime'] = startTime
            documents = [document]
            self.index_documents('run', documents)
            #except ElasticHttpError as ex:
            #    self.logger.info(ex)
            #    pass

    def updateIndexMaybe(self, index_name, alias_write, alias_read, settings,
                         mapping):
        connectionAttempts = 0
        retry = False
        while True:
            if self.stopping: break
            connectionAttempts += 1
            try:
                if retry or self.ip_url == None:
                    self.ip_url = getURLwithIP(self.es_server_url, self.nsslock)
                    self.es = ElasticSearch(self.ip_url, timeout=20)
                #check if runindex alias exists
                if requests.get(self.ip_url + '/_alias/' + alias_write).status_code == 200:
                    self.logger.info('writing to elastic index ' + alias_write
                                     + ' on ' + self.es_server_url + ' - ' + self.ip_url)
                    self.createDocMappingsMaybe(alias_write, mapping)
                    break
                else:
                    time.sleep(.5)
                    if (connectionAttempts % 10) == 0:
                        self.logger.error('unable to access elasticsearch alias '
                                          + alias_write + ' on ' + self.es_server_url
                                          + ' / ' + self.ip_url)
                    continue
            except ElasticHttpError as ex:
                #es error, retry
                self.logger.error(ex)
                if self.runMode and connectionAttempts > 100:
                    self.logger.error('elastic (BU): exiting after 100 ElasticHttpError reports from '
                                      + self.es_server_url)
                    sys.exit(1)
                elif self.runMode == False and connectionAttempts > 10:
                    self.threadEvent.wait(60)
                else:
                    self.threadEvent.wait(1)
                retry = True
                continue
            except (socket.gaierror, ConnectionError, Timeout,
                    RequestsConnectionError, RequestsTimeout) as ex:
                #try to reconnect with different IP from DNS load balancing
                if self.runMode and connectionAttempts > 100:
                    self.logger.error('elastic (BU): exiting after 100 connection attempts to '
                                      + self.es_server_url)
                    sys.exit(1)
                elif self.runMode == False and connectionAttempts > 10:
                    self.threadEvent.wait(60)
                else:
                    self.threadEvent.wait(1)
                retry = True
                continue

    def createDocMappingsMaybe(self, index_name, mapping):
        #update in case of new documents added to mapping definition
        for key in mapping:
            doc = {key: mapping[key]}
            res = requests.get(self.ip_url + '/' + index_name + '/' + key + '/_mapping')
            #only update if mapping is empty
            if res.status_code == 200:
                if res.content.strip() == '{}':
                    self.logger.info('inserting new mapping for ' + str(key))
                    requests.post(self.ip_url + '/' + index_name + '/' + key + '/_mapping',
                                  json.dumps(doc))
                else:
                    #still check if number of properties is identical in each type
                    inmapping = json.loads(res.content)
                    for indexname in inmapping:
                        properties = inmapping[indexname]['mappings'][key]['properties']
                        self.logger.info('checking mapping ' + indexname + '/' + key
                                         + ' which has '
                                         + str(len(mapping[key]['properties']))
                                         + '(index:' + str(len(properties)) + ') entries..')
                        for pdoc in mapping[key]['properties']:
                            if pdoc not in properties:
                                self.logger.info('inserting mapping for ' + str(key)
                                                 + ' which is missing mapping property '
                                                 + str(pdoc))
                                requests.post(self.ip_url + '/' + index_name + '/' + key + '/_mapping',
                                              json.dumps(doc))
                                break
            else:
                #status_code is an int; convert before concatenating
                self.logger.warning('requests error code ' + str(res.status_code)
                                    + ' in mapping request')

    def read_line(self, fullpath):
        with open(fullpath, 'r') as fp:
            return fp.readline()

    def elasticize_modulelegend(self, fullpath):
        self.logger.info(os.path.basename(fullpath))
        document = {}
        document['_parent'] = self.runnumber
        document['id'] = "microstatelegend_" + self.runnumber
        if fullpath.endswith('.jsn'):
            try:
                with open(fullpath, 'r') as fp:
                    doc = json.load(fp)
                document['stateNames'] = doc['names']
                try: document['reserved'] = doc['reserved']
                except: document['reserved'] = 33
                try: document['special'] = doc['special']
                except: document['special'] = 7
                nstring = ""
                cnt = 0
                outputcnt = 0
                #fill in also old format for now
                for sname in doc['names']:
                    nstring += str(cnt) + "=" + sname + " "
                    cnt += 1
                    if sname.startswith('hltOutput'): outputcnt += 1
                try: document['output'] = doc['output']
                except: document['output'] = outputcnt
                document['names'] = nstring
            except Exception as ex:
                self.logger.warning("can not parse " + fullpath + ' ' + str(ex))
        else:
            #old format
            stub = self.read_line(fullpath)
            document['names'] = self.read_line(fullpath)
            document['reserved'] = 33
            document['special'] = 7
            outputcnt = 0
            for sname in document['names'].split():
                if "=hltOutput" in sname: outputcnt += 1
            document['output'] = outputcnt
            document['stateNames'] = []
            nameTokens = document['names'].split()
            for nameToken in nameTokens:
                if '=' in nameToken:
                    idx, sn = nameToken.split('=')
                    document["stateNames"].append(sn)
        documents = [document]
        return self.index_documents('microstatelegend', documents)

    def elasticize_pathlegend(self, fullpath):
        self.logger.info(os.path.basename(fullpath))
        document = {}
        document['_parent'] = self.runnumber
        document['id'] = "pathlegend_" + self.runnumber
        if fullpath.endswith('.jsn'):
            try:
                with open(fullpath, 'r') as fp:
                    doc = json.load(fp)
                document['stateNames'] = doc['names']
                document['reserved'] = doc['reserved']
                #put old name format value
                nstring = ""
                cnt = 0
                for sname in doc['names']:
                    nstring += str(cnt) + "=" + sname + " "
                    cnt += 1
                document['names'] = nstring
            except Exception as ex:
                self.logger.warning("can not parse " + fullpath)
        else:
            stub = self.read_line(fullpath)
            document['names'] = self.read_line(fullpath)
        documents = [document]
        return self.index_documents('pathlegend', documents)

    def elasticize_stream_label(self, infile):
        #elasticize stream name information
        self.logger.info(infile.filepath)
        document = {}
        document['_parent'] = self.runnumber
        document['stream'] = infile.stream[6:]
        document['id'] = infile.basename
        return self.index_documents('stream_label', [document])

    def elasticize_runend_time(self, endtime):
        self.logger.info(str(endtime) + " going into buffer")
        document = {}
        document['runNumber'] = self.runnumber
        document['startTime'] = self.startTime
        document['endTime'] = endtime
        documents = [document]
        self.index_documents('run', documents)

    def elasticize_resource_summary(self, jsondoc):
        self.logger.debug('injecting resource summary document')
        jsondoc['appliance'] = self.host
        self.index_documents('resource_summary', [jsondoc], bulk=False)

    def elasticize_box(self, infile):
        basename = infile.basename
        self.logger.debug(basename)
        current_time = time.time()
        if infile.data == {}: return
        bu_doc = False
        if basename.startswith('bu') or basename.startswith('dvbu'):
            bu_doc = True
        #check box file against blacklist
        if bu_doc or self.black_list == None:
            self.black_list = []
            try:
                with open(os.path.join(self.conf.watch_directory, 'appliance', 'blacklist'), "r") as fi:
                    try:
                        self.black_list = json.load(fi)
                    except ValueError:
                        #file is being written or corrupted
                        return
            except:
                #blacklist file is not present, do not filter
                pass
        if basename in self.black_list: return
        if bu_doc == False:
            try:
                if self.boxdoc_version < infile.data['version']:
                    self.logger.info('skipping ' + basename + ' box file version '
                                     + str(infile.data['version'])
                                     + ' which is newer than ' + str(self.boxdoc_version))
                    return
            except:
                self.logger.warning("didn't find version field in box file " + basename)
                return
            try:
                self.boxinfoFUMap[basename] = [infile.data, current_time]
            except Exception as ex:
                self.logger.warning('box info not injected: ' + str(ex))
                return
        try:
            document = infile.data
            #unique id for separate instances
            if bu_doc:
                document['id'] = self.hostinst
            else:
                document['id'] = basename
            document['activeRuns'] = str(document['activeRuns']).strip('[]')
            document['appliance'] = self.host
            document['instance'] = self.conf.instance
            if bu_doc == True:
                document['blacklist'] = self.black_list
            #only here
            document['host'] = basename
            try: document.pop('version')
            except: pass
            self.index_documents('boxinfo', [document])
        except Exception as ex:
            self.logger.warning('box info not injected: ' + str(ex))
            return

    def elasticize_eols(self, infile):
        basename = infile.basename
        self.logger.info(basename)
        data = infile.data['data']
        data.insert(0, infile.mtime)
        data.insert(0, infile.ls[2:])
        values = [int(f) if f.isdigit() else str(f) for f in data]
        try:
            keys = ["ls", "fm_date", "NEvents", "NFiles", "TotalEvents",
                    "NLostEvents", "NBytes"]
            document = dict(zip(keys, values))
        except:
            #try without NBytes
            keys = ["ls", "fm_date", "NEvents", "NFiles", "TotalEvents",
                    "NLostEvents"]
            document = dict(zip(keys, values))
        document['id'] = infile.name + "_" + self.host
        document['_parent'] = self.runnumber
        document['appliance'] = self.host
        documents = [document]
        self.index_documents('eols', documents)

    def index_documents(self, name, documents, bulk=True):
        attempts = 0
        destination_index = ""
        is_box = False
        if name.startswith("boxinfo") or name == 'resource_summary':
            destination_index = self.boxinfo_write
            is_box = True
        else:
            destination_index = self.runindex_write
        while True:
            attempts += 1
            try:
                if bulk:
                    self.es.bulk_index(destination_index, name, documents)
                else:
                    self.es.index(destination_index, name, documents[0])
                return True
            except ElasticHttpError as ex:
                if attempts <= 1: continue
                self.logger.error('elasticsearch HTTP error ' + str(ex)
                                  + '. skipping document ' + name)
                if is_box == True: break
                #self.logger.exception(ex)
                return False
            except (socket.gaierror, ConnectionError, Timeout) as ex:
                if attempts > 100 and self.runMode:
                    raise ex
                self.logger.error('elasticsearch connection error ' + str(ex) + '. retry.')
                if self.stopping: return False
                ip_url = getURLwithIP(self.es_server_url, self.nsslock)
                self.es = ElasticSearch(ip_url, timeout=20)
                time.sleep(0.1)
                if is_box == True: break
        return False
from pyelasticsearch.client import ElasticSearch
import sys

# by default we connect to localhost:9200
if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Usage: [*.py] [lat] [lon] [R]"
        sys.exit(0)
    es = ElasticSearch('http://localhost:9200/')
    lat = float(sys.argv[1])
    lon = float(sys.argv[2])
    r = float(sys.argv[3])
    print lat, lon, r
    query = {
        "from": 0,
        "size": 10,
        'query': {
            "match_all": {}
        },
        "filter": {
            "geo_distance": {
                "distance": str(r) + 'km',
                "location": {
                    "lat": lat,
                    "lon": lon
                }
            }
        },
        "sort": [{
    def index_documents(self, name, documents, doc_id=None, doc_params=None,
                        bulk=True, overwrite=True, update_only=False,
                        retry_on_conflict=0, script=False):
        if name == 'fu-box-status' or name.startswith("boxinfo") or name == 'resource_summary':
            destination_index = self.boxinfo_write
            is_box = True
        else:
            destination_index = self.runindex_write
            is_box = False
        attempts = 0
        while True:
            attempts += 1
            try:
                if bulk:
                    self.es.bulk_index(destination_index, name, documents)
                else:
                    if doc_id:
                        if update_only:
                            if script:
                                self.es.update(index=destination_index, doc_type=name,
                                               id=doc_id, script=documents[0],
                                               upsert=False,
                                               retry_on_conflict=retry_on_conflict)
                            else:
                                self.es.update(index=destination_index, doc_type=name,
                                               id=doc_id, doc=documents[0],
                                               upsert=False,
                                               retry_on_conflict=retry_on_conflict)
                        else:
                            #overwrite existing can be used with id specified
                            if doc_params:
                                self.es.index(destination_index, name, documents[0],
                                              doc_id, parent=doc_params['parent'],
                                              overwrite_existing=overwrite)
                            else:
                                self.es.index(destination_index, name, documents[0],
                                              doc_id, overwrite_existing=overwrite)
                    else:
                        self.es.index(destination_index, name, documents[0])
                return True
            except ElasticHttpError as ex:
                if name == 'run' and ex[0] == 409:
                    #create failed because overwrite was forbidden
                    return (False, ex[0])
                if ex[0] == 429:
                    if attempts < 10 and not is_box:
                        self.logger.warning('elasticsearch HTTP error 429 ' + str(ex)
                                            + '. retrying..')
                        time.sleep(.1)
                        continue
                else:
                    if attempts <= 1 and not is_box: continue
                if is_box:
                    self.logger.warning('elasticsearch HTTP error ' + str(ex)
                                        + '. skipping document ' + name)
                else:
                    self.logger.error('elasticsearch HTTP error ' + str(ex)
                                      + '. skipping document ' + name)
                return False
            except (socket.gaierror, ConnectionError, Timeout) as ex:
                if attempts > 100 and self.runMode:
                    raise ex
                if is_box or attempts <= 1:
                    self.logger.warning('elasticsearch connection error ' + str(ex) + '. retry.')
                elif (attempts - 2) % 10 == 0:
                    self.logger.error('elasticsearch connection error ' + str(ex) + '. retry.')
                if self.stopping: return False
                ip_url = getURLwithIP(self.es_server_url, self.nsslock)
                self.es = ElasticSearch(ip_url, timeout=20)
                time.sleep(0.1)
                if is_box == True:
                    #give up on too many box retries as they are indexed again every 5 seconds
                    break
        return False
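    # Illustrative call sketches for the parametrized index_documents above.
    # Document bodies and ids are hypothetical; 'self' stands for the owning
    # instance, which this snippet leaves unnamed.
    #
    #   bulk-append monitoring documents (default path):
    #     self.index_documents('boxinfo', docs)
    #
    #   create-once semantics for the run document; returns (False, 409) if it
    #   already exists and overwriting is forbidden:
    #     self.index_documents('run', [rundoc], doc_id=runnumber,
    #                          bulk=False, overwrite=False)
    #
    #   in-place partial update with conflict retries:
    #     self.index_documents('run', [{'endTime': etime}], doc_id=runnumber,
    #                          bulk=False, update_only=True, retry_on_conflict=300)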
class elasticBandBU: def __init__(self, conf, runnumber, startTime, runMode=True, nsslock=None): self.logger = logging.getLogger(self.__class__.__name__) self.conf = conf self.es_server_url = conf.elastic_runindex_url self.runindex_write = "runindex_" + conf.elastic_runindex_name + "_write" self.runindex_read = "runindex_" + conf.elastic_runindex_name + "_read" self.runindex_name = "runindex_" + conf.elastic_runindex_name self.boxinfo_write = "boxinfo_" + conf.elastic_runindex_name + "_write" self.boxinfo_read = "boxinfo_" + conf.elastic_runindex_name + "_read" self.boxinfo_name = "boxinfo_" + conf.elastic_runindex_name self.runnumber = str(runnumber) self.startTime = startTime self.host = os.uname()[1] self.stopping = False self.threadEvent = threading.Event() self.runMode = runMode self.boxinfoFUMap = {} self.ip_url = None self.nsslock = nsslock self.updateIndexMaybe(self.runindex_name, self.runindex_write, self.runindex_read, mappings.central_es_settings, mappings.central_runindex_mapping) self.updateIndexMaybe(self.boxinfo_name, self.boxinfo_write, self.boxinfo_read, mappings.central_es_settings, mappings.central_boxinfo_mapping) self.black_list = None if self.conf.instance == 'main': self.hostinst = self.host else: self.hostinst = self.host + '_' + self.conf.instance #write run number document if runMode == True and self.stopping == False: document = {} document['runNumber'] = self.runnumber document['startTime'] = startTime documents = [document] self.index_documents('run', documents) #except ElasticHttpError as ex: # self.logger.info(ex) # pass def updateIndexMaybe(self, index_name, alias_write, alias_read, settings, mapping): connectionAttempts = 0 retry = False while True: if self.stopping: break connectionAttempts += 1 try: if retry or self.ip_url == None: self.ip_url = getURLwithIP(self.es_server_url, self.nsslock) self.es = ElasticSearch(self.ip_url, timeout=20, revival_delay=60) #check if runindex alias exists if requests.get(self.es_server_url + '/_alias/' + alias_write).status_code == 200: self.logger.info('writing to elastic index ' + alias_write + ' on ' + self.es_server_url + ' - ' + self.ip_url) self.createDocMappingsMaybe(alias_write, mapping) break else: time.sleep(.5) if (connectionAttempts % 10) == 0: self.logger.error( 'unable to access to elasticsearch alias ' + alias_write + ' on ' + self.es_server_url + ' / ' + self.ip_url) continue except ElasticHttpError as ex: #es error, retry self.logger.error(ex) if self.runMode and connectionAttempts > 100: self.logger.error( 'elastic (BU): exiting after 100 ElasticHttpError reports from ' + self.es_server_url) sys.exit(1) elif self.runMode == False and connectionAttempts > 10: self.threadEvent.wait(60) else: self.threadEvent.wait(1) retry = True continue except (socket.gaierror, ConnectionError, Timeout) as ex: #try to reconnect with different IP from DNS load balancing if self.runMode and connectionAttempts > 100: self.logger.error( 'elastic (BU): exiting after 100 connection attempts to ' + self.es_server_url) sys.exit(1) elif self.runMode == False and connectionAttempts > 10: self.threadEvent.wait(60) else: self.threadEvent.wait(1) retry = True continue def createDocMappingsMaybe(self, index_name, mapping): #update in case of new documents added to mapping definition for key in mapping: doc = {key: mapping[key]} res = requests.get(self.ip_url + '/' + index_name + '/' + key + '/_mapping') #only update if mapping is empty if res.status_code == 200: if res.content.strip() == '{}': requests.post( self.ip_url + '/' + 
index_name + '/' + key + '/_mapping', json.dumps(doc)) else: #still check if number of properties is identical in each type inmapping = json.loads(res.content) for indexname in inmapping: properties = inmapping[indexname]['mappings'][key][ 'properties'] #should be size 1 for pdoc in mapping[key]['properties']: if pdoc not in properties: requests.post( self.ip_url + '/' + index_name + '/' + key + '/_mapping', json.dumps(doc)) break def read_line(self, fullpath): with open(fullpath, 'r') as fp: return fp.readline() def elasticize_modulelegend(self, fullpath): self.logger.info(os.path.basename(fullpath)) stub = self.read_line(fullpath) document = {} document['_parent'] = self.runnumber document['id'] = "microstatelegend_" + self.runnumber document['names'] = self.read_line(fullpath) documents = [document] return self.index_documents('microstatelegend', documents) def elasticize_pathlegend(self, fullpath): self.logger.info(os.path.basename(fullpath)) stub = self.read_line(fullpath) document = {} document['_parent'] = self.runnumber document['id'] = "pathlegend_" + self.runnumber document['names'] = self.read_line(fullpath) documents = [document] return self.index_documents('pathlegend', documents) def elasticize_runend_time(self, endtime): self.logger.info(str(endtime) + " going into buffer") document = {} document['runNumber'] = self.runnumber document['startTime'] = self.startTime document['endTime'] = endtime documents = [document] self.index_documents('run', documents) def elasticize_box(self, infile): basename = infile.basename self.logger.debug(basename) current_time = time.time() if infile.data == {}: return bu_doc = False if basename.startswith('bu') or basename.startswith('dvbu'): bu_doc = True #check box file against blacklist if bu_doc or self.black_list == None: self.black_list = [] try: with open( os.path.join(self.conf.watch_directory, 'appliance', 'blacklist'), "r") as fi: try: self.black_list = json.load(fi) except ValueError: #file is being written or corrupted return except: #blacklist file is not present, do not filter pass if basename in self.black_list: return if bu_doc == False: try: self.boxinfoFUMap[basename] = [infile.data, current_time] except Exception as ex: self.logger.warning('box info not injected: ' + str(ex)) return try: document = infile.data #unique id for separate instances if bu_doc: document['id'] = self.hostinst else: document['id'] = basename #both here and in "boxinfo_appliance" document['appliance'] = self.host document['instance'] = self.conf.instance #only here document['host'] = basename self.index_documents('boxinfo', [document]) except Exception as ex: self.logger.warning('box info not injected: ' + str(ex)) return if bu_doc: try: document = infile.data try: document.pop('id') except: pass try: document.pop('host') except: pass #aggregation from FUs document['idles'] = 0 document['used'] = 0 document['broken'] = 0 document['quarantined'] = 0 document['cloud'] = 0 document['usedDataDir'] = 0 document['totalDataDir'] = 0 document['hosts'] = [basename] document['blacklistedHosts'] = [] for key in self.boxinfoFUMap: dpair = self.boxinfoFUMap[key] d = dpair[0] #check if entry is not older than 10 seconds if current_time - dpair[1] > 10: continue document['idles'] += int(d['idles']) document['used'] += int(d['used']) document['broken'] += int(d['broken']) document['quarantined'] += int(d['quarantined']) document['cloud'] += int(d['cloud']) document['usedDataDir'] += int(d['usedDataDir']) document['totalDataDir'] += int(d['totalDataDir']) 
document['hosts'].append(key) for blacklistedHost in self.black_list: document['blacklistedHosts'].append(blacklistedHost) self.index_documents('boxinfo_appliance', [document], bulk=False) except Exception as ex: #in case of malformed box info self.logger.warning('box info not injected: ' + str(ex)) return def elasticize_eols(self, infile): basename = infile.basename self.logger.info(basename) data = infile.data['data'] data.append(infile.mtime) data.append(infile.ls[2:]) values = [int(f) if f.isdigit() else str(f) for f in data] keys = ["NEvents", "NFiles", "TotalEvents", "fm_date", "ls"] document = dict(zip(keys, values)) document['id'] = infile.name + "_" + os.uname()[1] document['_parent'] = self.runnumber documents = [document] self.index_documents('eols', documents) def index_documents(self, name, documents, bulk=True): attempts = 0 destination_index = "" is_box = False if name.startswith("boxinfo"): destination_index = self.boxinfo_write is_box = True else: destination_index = self.runindex_write while True: attempts += 1 try: if bulk: self.es.bulk_index(destination_index, name, documents) else: self.es.index(destination_index, name, documents[0]) return True except ElasticHttpError as ex: if attempts <= 1: continue self.logger.error( 'elasticsearch HTTP error. skipping document ' + name) if is_box == True: break #self.logger.exception(ex) return False except (socket.gaierror, ConnectionError, Timeout) as ex: if attempts > 100 and self.runMode: raise (ex) self.logger.error('elasticsearch connection error. retry.') if self.stopping: return False time.sleep(0.1) ip_url = getURLwithIP(self.es_server_url, self.nsslock) self.es = ElasticSearch(ip_url, timeout=20, revival_delay=60) if is_box == True: break return False
def __init__(self,es_server_url,runstring,indexSuffix,monBufferSize,fastUpdateModulo): self.logger = logging.getLogger(self.__class__.__name__) self.istateBuffer = [] self.prcinBuffer = {} # {"lsX": doclist} self.prcoutBuffer = {} self.fuoutBuffer = {} self.es = ElasticSearch(es_server_url) self.settings = { "analysis":{ "analyzer": { "prefix-test-analyzer": { "type": "custom", "tokenizer": "prefix-test-tokenizer" } }, "tokenizer": { "prefix-test-tokenizer": { "type": "path_hierarchy", "delimiter": "_" } } }, "index":{ 'number_of_shards' : 16, 'number_of_replicas' : 1 } } self.run_mapping = { 'prc-i-state' : { 'properties' : { 'macro' : {'type' : 'integer'}, 'mini' : {'type' : 'integer'}, 'micro' : {'type' : 'integer'}, 'tp' : {'type' : 'double' }, 'lead' : {'type' : 'double' }, 'nfiles' : {'type' : 'integer'}, 'fm_date' : {'type' : 'date' } }, '_timestamp' : { 'enabled' : True, 'store' : "yes", "path" : "fm_date" }, '_ttl' : { 'enabled' : True, 'default' : '5m' } }, 'prc-s-state' : { 'properties' : { 'macro' : {'type' : 'integer'}, 'mini' : {'type' : 'integer'}, 'micro' : {'type' : 'integer'}, 'tp' : {'type' : 'double'}, 'lead' : {'type' : 'double'}, 'nfiles' : {'type' : 'integer'}, 'ls' : {'type' : 'integer'}, 'process': {'type' : 'string'} }, }, 'fu-s-state' : { 'properties' : { 'macro' : {'type' : 'integer'}, 'mini' : {'type' : 'integer'}, 'micro' : {'type' : 'integer'}, 'tp' : {'type' : 'double'}, 'lead' : {'type' : 'double'}, 'nfiles' : {'type' : 'integer'}, 'ls' : {'type' : 'integer'}, 'machine': {'type' : 'string'} } }, 'prc-out': { '_routing' :{ 'required' : True, 'path' : 'source' }, 'properties' : { #'definition': {'type': 'string'}, 'data' : { 'properties' : { 'in' : { 'type' : 'integer'}, 'out': { 'type' : 'integer'}, 'file': { 'type' : 'string','index' : 'not_analyzed'} } }, 'ls' : { 'type' : 'integer', 'store': "yes" }, 'stream' : {'type' : 'string','index' : 'not_analyzed'}, 'source' : { 'type' : 'string', 'index_analyzer': 'prefix-test-analyzer', 'search_analyzer': "keyword", 'store' : "yes", 'index' : "analyzed" } }, '_timestamp' : { 'enabled' : True, 'store' : "yes" } }, 'prc-in': { '_routing' :{ 'required' : True, 'path' : 'dest' }, 'properties' : { #'definition': {'type': 'string',"index" : "not_analyzed"}, 'data' : { 'properties' : { 'out' : { 'type' : 'integer'} } }, 'ls' : { 'type' : 'integer', 'store': 'yes' }, 'index' : { 'type' : 'integer' }, 'source' : { 'type' : 'string' }, 'dest' : { 'type' : 'string', 'index_analyzer': 'prefix-test-analyzer', 'search_analyzer': "keyword", 'store' : "yes", 'index' : "analyzed", }, 'process' : { 'type' : 'integer' } }, '_timestamp' : { 'enabled' : True, 'store' : "yes" } }, 'fu-out': { '_routing' :{ 'required' : True, 'path' : 'source' }, 'properties' : { #'definition': {'type': 'string',"index" : "not_analyzed"}, 'data' : { 'properties' : { 'in' : { 'type' : 'integer'}, 'out': { 'type' : 'integer'}, 'errorEvents' : {'type' : 'integer'}, 'returnCodeMask': {'type':'string',"index" : "not_analyzed"}, 'fileSize' : {'type':'long'}, 'files': { 'properties' : { 'name' : { 'type' : 'string',"index" : "not_analyzed"} } } } }, 'ls' : { 'type' : 'integer' }, 'stream' : {'type' : 'string','index' : 'not_analyzed'}, 'source' : { 'type' : 'string', 'index_analyzer': 'prefix-test-analyzer', 'search_analyzer': "keyword" } }, '_timestamp' : { 'enabled' : True, 'store' : "yes" } }, 'fu-complete' : { 'properties' : { 'host' : {'type' : 'string'}, 'fm_date' : {'type' : 'date' } }, '_timestamp' : { 'enabled' : True, 'store' : "yes", "path" : 
"fm_date" }, }, 'bu-out': { 'properties' : { #'definition': {'type': 'string',"index" : "not_analyzed"}, 'out': { 'type' : 'integer'}, 'ls' : { 'type' : 'integer' }, 'source' : {'type' : 'string'}#,"index" : "not_analyzed"} } }, 'cmsswlog' : { '_timestamp' : { 'enabled' : True, 'store' : "yes" }, '_ttl' : { 'enabled' : True, 'default' : '30d'} , 'properties' : { 'host' : {'type' : 'string'}, 'pid' : {'type' : 'integer'}, 'type' : {'type' : 'string',"index" : "not_analyzed"}, 'severity' : {'type' : 'string',"index" : "not_analyzed"}, 'severityVal' : {'type' : 'integer'}, 'category' : {'type' : 'string'}, 'fwkState' : {'type' : 'string',"index" : "not_analyzed"}, 'module' : {'type' : 'string',"index" : "not_analyzed"}, 'moduleInstance' : {'type' : 'string',"index" : "not_analyzed"}, 'moduleCall' : {'type' : 'string',"index" : "not_analyzed"}, 'lumi' : {'type' : 'integer'}, 'eventInPrc' : {'type' : 'long'}, 'message' : {'type' : 'string'},#,"index" : "not_analyzed"}, 'lexicalId' : {'type' : 'string',"index" : "not_analyzed"}, 'msgtime' : {'type' : 'date','format':'dd-MMM-YYYY HH:mm:ss'}, 'msgtimezone' : {'type' : 'string'} #'context' : {'type' : 'string'} } } } self.run = runstring self.monBufferSize = monBufferSize self.fastUpdateModulo = fastUpdateModulo self.indexName = runstring + "_"+indexSuffix try: self.es.create_index(self.indexName, settings={ 'settings': self.settings, 'mappings': self.run_mapping }) except ElasticHttpError as ex: # print "Index already existing - records will be overridden" #this is normally fine as the index gets created somewhere across the cluster pass
class Collation:

    def __init__(self, es_server_url):
        self.server = ElasticSearch(es_server_url)
        self.datadict = {
            "prc-out": {
                "lookup": Query("prc-out", "source"),
                "action": {
                    "definition": Aggregator("drop"),
                    "data": Aggregator({"in": Aggregator("add"),
                                        "out": Aggregator("add"),
                                        "file": Aggregator("cat")}),
                    "ls": Aggregator("check"),
                    "stream": Aggregator("check"),
                    "source": Aggregator("match"),
                },
            },
            "prc-in": {
                "lookup": Query("prc-in", "dest"),
                "action": {
                    "definition": Aggregator("drop"),
                    "data": Aggregator({"out": Aggregator("add")}),
                    "ls": Aggregator("check"),
                    "index": Aggregator("cat"),
                    "source": Aggregator("check"),
                    "dest": Aggregator("check"),
                    "process": Aggregator("cat"),
                },
            },
            "prc-s-state": {
                "lookup": Query("prc-s-state"),
                "action": {
                    "macro": Aggregator("histoadd"),
                    "mini": Aggregator("histoadd"),
                    "micro": Aggregator("histoadd"),
                    "tp": Aggregator("add"),
                    "lead": Aggregator("avg"),
                    "nfiles": Aggregator("add"),
                    "ls": Aggregator("check"),
                    "process": Aggregator("cat"),
                },
            },
        }

    def lookup(self, doctype):
        return self.datadict[doctype]["lookup"]

    def action(self, doctype):
        return self.datadict[doctype]["action"]

    # print datadict[type]['lookup']
    def search(self, ind, doctype, ls, stream=None):
        if stream:
            result = self.server.search(self.lookup(doctype)(ls, stream), index=ind)
        else:
            result = self.server.search(self.lookup(doctype)(ls), index=ind)
        return result

    def collate(self, ind, doctype, ls, stream=None):
        #run the per-field aggregators over all hits, then read and reset them
        result = self.search(ind, doctype, ls, stream)
        for element in result["hits"]["hits"]:
            for k, v in element["_source"].items():
                self.action(doctype)[k](v)
        retval = dict((k, v.value()) for k, v in self.action(doctype).items())
        for v in self.action(doctype).values():
            v.reset()
        return retval

    def refresh(self, ind):
        self.server.refresh(ind)

    def stash(self, ind, doctype, doc):
        result = self.server.index(ind, doctype, doc)
        return result
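# Illustrative usage sketch for Collation; the index name, lumisection, stream
# and output doc type below are hypothetical, and Query/Aggregator are assumed
# to come from this module.
if __name__ == '__main__':
    coll = Collation('http://localhost:9200')
    coll.refresh('run100000_testindex')
    # aggregate all prc-out documents of lumisection 4 for one stream
    merged = coll.collate('run100000_testindex', 'prc-out', 4, stream='streamA')
    # write the collated result back as a single summary document
    coll.stash('run100000_testindex', 'prc-out-collated', merged)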
def __init__(self, url='http://localhost:9200/', index='events'):
    self._es = ElasticSearch(url)
    self._es.json_encoder = ESJSONEncoder
    self._index = index
def setupES(es_server_url='http://localhost:9200', deleteOld=1, doPrint=False,
            overrideTests=False, forceReplicas=-1, forceShards=-1,
            create_index_name=None, subsystem="cdaq"):

    #ip_url=getURLwithIP(es_server_url)
    es = ElasticSearch(es_server_url, timeout=5)

    #list_template
    templateList = es.send_request('GET', ['_template'])

    TEMPLATES = ["runappliance_" + subsystem]
    loaddoc = None
    for template_name in TEMPLATES:
        template_label = template_name.split('_')[0]
        if template_name not in templateList:
            printout(template_name + " template not present. It will be created.", doPrint, False)
            loaddoc = create_template(es, template_name, template_label, subsystem,
                                      forceReplicas, forceShards)
        else:
            loaddoc = create_template(es, None, template_label, subsystem,
                                      forceReplicas, forceShards, send=False)
            norm_name = convert(templateList[template_name])
            if deleteOld == 0:
                printout(template_name + " already exists. Add 'replace' parameter to update if different, or forceupdate to always update.", doPrint, False)
            else:
                printout(template_name + " already exists.", doPrint, False)
                if loaddoc != None:
                    mappingSame = norm_name['mappings'] == loaddoc['mappings']
                    #settingSame = norm_name['settings']==loaddoc['settings']
                    settingsSame = True
                    #convert to int before comparison
                    if int(norm_name['settings']['index']['number_of_replicas']) != int(loaddoc['settings']['index']['number_of_replicas']):
                        settingsSame = False
                    if int(norm_name['settings']['index']['number_of_shards']) != int(loaddoc['settings']['index']['number_of_shards']):
                        settingsSame = False
                    #add more here if other settings need to be added
                    if 'translog' not in norm_name['settings']['index'] or norm_name['settings']['index']['translog'] != loaddoc['settings']['index']['translog']:
                        settingsSame = False
                    #currently analyzer settings are not verified
                    if not (mappingSame and settingsSame) or deleteOld > 1:
                        #test is override
                        if overrideTests == False:
                            try:
                                if norm_name['settings']['test'] == True:
                                    printout("Template test setting found, skipping update...", doPrint, True)
                                    break
                            except: pass
                        printout("Updating " + template_name + " ES template", doPrint, True)
                        create_template(es, template_name, template_label, subsystem,
                                        forceReplicas, forceShards)
                    else:
                        printout('runappliance ES template is up to date', doPrint, True)

    #create index
    if create_index_name:
        if loaddoc:
            try:
                c_res = es.send_request('PUT', [create_index_name], body=loaddoc)
                if c_res != {'acknowledged': True}:
                    printout("Result of index " + create_index_name + " create request: " + str(c_res), doPrint, True)
            except ElasticHttpError as ex:
                if ex[1]['type'] == 'index_already_exists_exception':
                    #this is for index pre-creator
                    printout("Attempting to initialize already existing index " + create_index_name, doPrint, True)
                    try:
                        doc_resp = es.send_request('GET', ['_cat', 'indices', create_index_name],
                                                   query_params={'h': 'status'})
                        if doc_resp.strip('\n') == 'close':
                            printout("Index " + create_index_name + " is already closed! Index will be reopened", doPrint, True)
                            c_res = es.send_request('POST', [create_index_name, '_open'])
                    except ElasticHttpError as ex:
                        printout("setupES got ElasticHttpError getting index open/close state: " + str(ex), doPrint, True)
                    except Exception as ex:
                        printout("setupES got Exception getting index open/closed state: " + str(ex), doPrint, True)
            except Exception as ex:
                printout("Index not created: " + str(ex), doPrint, True)
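# Illustrative call sketches for setupES (host and index names are hypothetical):
#
#   check/create the template, report differences only:
#     setupES(es_server_url='http://es-host:9200', deleteOld=0, doPrint=True)
#
#   force-update the template regardless of differences:
#     setupES(es_server_url='http://es-host:9200', deleteOld=2, doPrint=True)
#
#   pre-create (or reopen) a run index from the template document:
#     setupES(es_server_url='http://es-host:9200', doPrint=True,
#             create_index_name='runindex_cdaq_000001', subsystem='cdaq')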
def __init__(self, url='http://localhost:9200/', index='events'):
    self._es = ElasticSearch(url)
    self._index = index
def resetURL(self, url):
    #bound method that assigns to self.es, so it must receive self explicitly
    self.es = None
    self.es = ElasticSearch(url)
#!/usr/bin/python
from flask import Flask, request, json
import flask
import happybase
from pyelasticsearch.client import ElasticSearch
import hashlib

hbasehost = 'c0tl.com'
from struct import *

app = Flask(__name__)
#the ES URL credentials are redacted in the original source
es = ElasticSearch('http://*****:*****')


@app.route('/')
def home():
    return """<html>
<h2>Welcome to the colt API!</h2>
</html>"""
class elasticBandBU: def __init__(self,es_server_url,runnumber,startTime,runMode=True): self.logger = logging.getLogger(self.__class__.__name__) self.es_server_url=es_server_url self.index_name=conf.elastic_runindex_name self.runnumber = str(runnumber) self.startTime = startTime self.host = os.uname()[1] self.stopping=False self.threadEvent = threading.Event() self.runMode=runMode self.settings = { "analysis":{ "analyzer": { "prefix-test-analyzer": { "type": "custom", "tokenizer": "prefix-test-tokenizer" } }, "tokenizer": { "prefix-test-tokenizer": { "type": "path_hierarchy", "delimiter": " " } } }, "index":{ 'number_of_shards' : 10, 'number_of_replicas' : 3 }, } self.run_mapping = { 'run' : { # '_routing' :{ # 'required' : True, # 'path' : 'runNumber' # }, '_id' : { 'path' : 'runNumber' }, 'properties' : { 'runNumber':{ 'type':'integer' }, 'startTimeRC':{ 'type':'date' }, 'stopTimeRC':{ 'type':'date' }, 'startTime':{ 'type':'date' }, 'endTime':{ 'type':'date' }, 'completedTime' : { 'type':'date' } }, '_timestamp' : { 'enabled' : True, 'store' : 'yes' } }, 'microstatelegend' : { '_id' : { 'path' : 'id' }, '_parent':{'type':'run'}, 'properties' : { 'names':{ 'type':'string' }, 'id':{ 'type':'string' } } }, 'pathlegend' : { '_id' : { 'path' : 'id' }, '_parent':{'type':'run'}, 'properties' : { 'names':{ 'type':'string' }, 'id':{ 'type':'string' } } }, 'boxinfo' : { '_id' :{'path':'id'},#TODO:remove 'properties' : { 'fm_date' :{'type':'date'}, 'id' :{'type':'string'}, 'broken' :{'type':'integer'}, 'used' :{'type':'integer'}, 'idles' :{'type':'integer'}, 'quarantined' :{'type':'integer'}, 'usedDataDir' :{'type':'integer'}, 'totalDataDir' :{'type':'integer'}, 'usedRamdisk' :{'type':'integer'}, 'totalRamdisk' :{'type':'integer'}, 'usedOutput' :{'type':'integer'}, 'totalOutput' :{'type':'integer'}, 'activeRuns' :{'type':'string'} }, '_timestamp' : { 'enabled' : True, 'store' : "yes", "path" : "fm_date" }, '_ttl' : { 'enabled' : True, 'default' : '30d' } }, 'boxinfo_last' : { '_id' :{'path':'id'}, 'properties' : { 'fm_date' :{'type':'date'}, 'id' :{'type':'string'}, 'broken' :{'type':'integer'}, 'used' :{'type':'integer'}, 'idles' :{'type':'integer'}, 'quarantined' :{'type':'integer'}, 'usedDataDir' :{'type':'integer'}, 'totalDataDir' :{'type':'integer'}, 'usedRamdisk' :{'type':'integer'}, 'totalRamdisk' :{'type':'integer'}, 'usedOutput' :{'type':'integer'}, 'totalOutput' :{'type':'integer'}, 'activeRuns' :{'type':'string'} }, '_timestamp' : { 'enabled' : True, 'store' : "yes", "path" : "fm_date" } }, 'eols' : { '_id' :{'path':'id'}, '_parent' :{'type':'run'}, 'properties' : { 'fm_date' :{'type':'date'}, 'id' :{'type':'string'}, 'ls' :{'type':'integer'}, 'NEvents' :{'type':'integer'}, 'NFiles' :{'type':'integer'}, 'TotalEvents' :{'type':'integer'} }, '_timestamp' : { 'enabled' : True, 'store' : "yes", "path" : "fm_date" }, }, 'minimerge' : { '_id' :{'path':'id'}, '_parent' :{'type':'run'}, 'properties' : { 'fm_date' :{'type':'date'}, 'id' :{'type':'string'}, #run+appliance+stream+ls 'appliance' :{'type':'string'}, 'stream' :{'type':'string','index' : 'not_analyzed'}, 'ls' :{'type':'integer'}, 'processed' :{'type':'integer'}, 'accepted' :{'type':'integer'}, 'errorEvents' :{'type':'integer'}, 'size' :{'type':'integer'}, } } } connectionAttempts=0 while True: if self.stopping:break connectionAttempts+=1 try: self.logger.info('writing to elastic index '+self.index_name) ip_url=getURLwithIP(es_server_url) self.es = ElasticSearch(es_server_url) self.es.create_index(self.index_name, settings={ 'settings': 
self.settings, 'mappings': self.run_mapping }) break except ElasticHttpError as ex: #this is normally fine as the index gets created somewhere across the cluster if "IndexAlreadyExistsException" in str(ex): self.logger.info(ex) break else: self.logger.error(ex) if runMode and connectionAttempts>100: self.logger.error('elastic (BU): exiting after 100 ElasticHttpError reports from '+ es_server_url) sys.exit(1) elif runMode==False and connectionAttempts>10: self.threadEvent.wait(60) else: self.threadEvent.wait(1) continue except (ConnectionError,Timeout) as ex: #try to reconnect with different IP from DNS load balancing if runMode and connectionAttempts>100: self.logger.error('elastic (BU): exiting after 100 connection attempts to '+ es_server_url) sys.exit(1) elif runMode==False and connectionAttempts>10: self.threadEvent.wait(60) else: self.threadEvent.wait(1) continue #write run number document if runMode == True: document = {} document['runNumber'] = self.runnumber document['startTime'] = startTime documents = [document] self.index_documents('run',documents) #except ElasticHttpError as ex: # self.logger.info(ex) # pass def resetURL(url): self.es = None self.es = ElasticSearch(url) def read_line(self,fullpath): with open(fullpath,'r') as fp: return fp.readline() def elasticize_modulelegend(self,fullpath): self.logger.info(os.path.basename(fullpath)) stub = self.read_line(fullpath) document = {} document['_parent']= self.runnumber document['id']= "microstatelegend_"+self.runnumber document['names']= self.read_line(fullpath) documents = [document] return self.index_documents('microstatelegend',documents) def elasticize_pathlegend(self,fullpath): self.logger.info(os.path.basename(fullpath)) stub = self.read_line(fullpath) document = {} document['_parent']= self.runnumber document['id']= "pathlegend_"+self.runnumber document['names']= self.read_line(fullpath) documents = [document] return self.index_documents('pathlegend',documents) def elasticize_runend_time(self,endtime): self.logger.info(str(endtime)+" going into buffer") document = {} document['runNumber'] = self.runnumber document['startTime'] = self.startTime document['endTime'] = endtime documents = [document] self.index_documents('run',documents) def elasticize_box(self,infile): basename = infile.basename self.logger.debug(basename) try: document = infile.data #TODO:let dynamic ID document['id']= basename + '_' + document['fm_date'].split('.')[0] #TODO:remove documents = [document] except: #in case of malformed box info return self.index_documents('boxinfo',documents) #self.logger.info(str(document))#check that ID is not present... 
#TODO:write unique boxinfo #documents[0]['id']=basename #self.index_documents('boxinfo_last',documents) def elasticize_eols(self,infile): basename = infile.basename self.logger.info(basename) data = infile.data['data'] data.append(infile.mtime) data.append(infile.ls[2:]) values = [int(f) if f.isdigit() else str(f) for f in data] keys = ["NEvents","NFiles","TotalEvents","fm_date","ls"] document = dict(zip(keys, values)) document['id'] = infile.name+"_"+os.uname()[1] document['_parent']= self.runnumber documents = [document] self.index_documents('eols',documents) def elasticize_minimerge(self,infile): basename = infile.basename self.logger.info(basename) data = infile.data['data'] data.append(infile.mtime) data.append(infile.ls[2:]) stream=infile.stream if stream.startswith("stream"): stream = stream[6:] data.append(stream) values = [int(f) if str(f).isdigit() else str(f) for f in data] keys = ["processed","accepted","errorEvents","fname","size","eolField1","eolField2","fm_date","ls","stream"] document = dict(zip(keys, values)) document['id'] = infile.name document['_parent']= self.runnumber documents = [document] self.index_documents('minimerge',documents) def index_documents(self,name,documents): attempts=0 while True: attempts+=1 try: self.es.bulk_index(self.index_name,name,documents) return True except ElasticHttpError as ex: if attempts<=1:continue self.logger.error('elasticsearch HTTP error. skipping document '+name) #self.logger.exception(ex) return False except (ConnectionError,Timeout) as ex: if attempts>100 and self.runMode: raise(ex) self.logger.error('elasticsearch connection error. retry.') if self.stopping:return False time.sleep(0.1) ip_url=getURLwithIP(self.es_server_url) self.es = ElasticSearch(ip_url) return False
#!/bin/env python
import os, sys, time, datetime
import threading
from pyelasticsearch.client import ElasticSearch
import json
from ordereddict import OrderedDict

#es_hosts=['http://fuval-c2a11-02:9200','http://fuval-c2a11-03:9200','http://fuval-c2a11-15:9200']
#es_tribe_hosts=['http://fuval-c2a11-28:9200']
es_hosts = ['http://dvcu-ccsl6-01:9200']
es_tribe_hosts = ['http://dvtu-ccsl6-01:9200']
main_es = ElasticSearch(es_hosts[0])
tribe_es = ElasticSearch(es_tribe_hosts[0])
main_index = 'runindex'
setup = 'daq2val'


class query_maker(threading.Thread):
    def __init__(self, run):
        threading.Thread.__init__(self)
        self.running = True
        self.hostname = os.uname()[1]
        self.ip = {}
        self.runno = run
        self.known_streams = {}
        app_query = {
            "query": {
                "top_children": {
                    "score": "sum",
if len(sys.argv) > 3:
    command = sys.argv[1]
    server_url = sys.argv[2]
    index_name = sys.argv[3]
else:
    print "Parameters: command[create,alias,mapping] server url, index/alias name (target index)"
    print "  COMMANDS:"
    print "  create:  create index"
    print "  alias:   create index *_read and *_write aliases (optional parameter: target index)"
    print "  mapping: create missing document mappings for the index"
    sys.exit(1)

if server_url.startswith('http://') == False:
    server_url = 'http://' + server_url

#connection
es = ElasticSearch(server_url)

#pick mapping
if index_name.startswith('runindex'):
    my_settings = mappings.central_es_settings_runindex
    my_mapping = mappings.central_runindex_mapping
if index_name.startswith('boxinfo'):
    #trailing comma removed: it previously turned my_settings into a 1-tuple
    my_settings = mappings.central_es_settings_boxinfo
    my_mapping = mappings.central_boxinfo_mapping
if index_name.startswith('hltdlogs'):
    my_settings = mappings.central_es_settings_hltlogs
    my_mapping = mappings.central_hltdlogs_mapping

#alias convention
alias_write = index_name + "_write"
alias_read = index_name + "_read"
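# Illustrative invocations (the script name es_ctl.py is a placeholder, and
# host/index names are hypothetical; only the positional parameters follow
# from the usage text above):
#
#   python es_ctl.py create  es-host:9200 runindex_cdaq_000001
#   python es_ctl.py alias   es-host:9200 runindex_cdaq
#   python es_ctl.py mapping es-host:9200 runindex_cdaq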