class IndexData:
    """Create an Elasticsearch index from a JSON settings file and bulk-load
    tab-separated (cin, name) records into it via pyelasticsearch."""

    def __init__(self, index_name, settings_path, host="http://127.0.0.1:9200"):
        self.connection = ElasticSearch(host)
        self.index_name = index_name
        self.settings_path = settings_path
        self.create_index()

    def get_settings(self):
        """Load and return the index settings dict from the JSON settings file."""
        # 'with open(...)' replaces the Python-2-only file() builtin and
        # guarantees the handle is closed even if json.load() raises.
        with open(self.settings_path) as config_file:
            return json.load(config_file)

    def create_index(self):
        """Create the index; if creation fails (e.g. it already exists),
        drop it and recreate it from scratch."""
        settings = self.get_settings()
        try:
            self.connection.create_index(self.index_name, settings)
        except pyelasticsearch.exceptions.ElasticHttpError:
            # Most likely "index already exists": recreate with fresh settings.
            self.connection.delete_index(self.index_name)
            self.connection.create_index(self.index_name, settings)

    def index_data(self, data_path, index_type):
        """Bulk-index the TSV file at data_path (columns: cin, name) under
        the given document type.

        Raises:
            ValueError: if index_type is None.
        """
        if index_type is None:
            # The original raised a bare string, which is itself a TypeError
            # at runtime; raise a real exception type instead.
            raise ValueError("Please enter valid index type")
        objects = []
        with open(data_path) as f:
            for line in f:
                word_split = line.split("\t")
                doc = {'cin': word_split[0], 'name': word_split[1].strip()}
                objects.append(doc)
                # Flush in batches so memory stays bounded on large files.
                if len(objects) > 1000:
                    self.connection.bulk_index(self.index_name, index_type,
                                               objects, id_field='cin')
                    objects = []
        # Flush the final partial batch; skip the call entirely when empty
        # (a bulk request with no documents is an error).
        if objects:
            self.connection.bulk_index(self.index_name, index_type, objects,
                                       id_field='cin')
class IndexData:
    """Manage one Elasticsearch index: create it from a JSON settings file,
    then bulk-load tab-separated (cin, name) records into it."""

    def __init__(self, index_name, settings_path, host="http://127.0.0.1:9200"):
        self.connection = ElasticSearch(host)
        self.index_name = index_name
        self.settings_path = settings_path
        self.create_index()

    def get_settings(self):
        """Return the index settings parsed from the JSON settings file."""
        # Context manager instead of the Python-2-only file() builtin, so the
        # descriptor is always closed.
        with open(self.settings_path) as config_file:
            return json.load(config_file)

    def create_index(self):
        """Create the index, recreating it from scratch when the first
        attempt fails (typically because the index already exists)."""
        settings = self.get_settings()
        try:
            self.connection.create_index(self.index_name, settings)
        except pyelasticsearch.exceptions.ElasticHttpError:
            self.connection.delete_index(self.index_name)
            self.connection.create_index(self.index_name, settings)

    def index_data(self, data_path, index_type):
        """Bulk-index the TSV file at data_path (columns: cin, name).

        Raises:
            ValueError: if index_type is None.
        """
        if index_type is None:
            # A string is not a valid raise target (TypeError at runtime);
            # use a proper exception class.
            raise ValueError("Please enter valid index type")
        batch = []
        with open(data_path) as handle:
            for line in handle:
                fields = line.split("\t")
                batch.append({'cin': fields[0], 'name': fields[1].strip()})
                # Bound memory use: ship a bulk request roughly every 1000 docs.
                if len(batch) > 1000:
                    self._flush(batch, index_type)
                    batch = []
        self._flush(batch, index_type)

    def _flush(self, batch, index_type):
        """Send one bulk request; no-op on an empty batch (an empty bulk
        body is rejected by the server)."""
        if batch:
            self.connection.bulk_index(self.index_name, index_type, batch,
                                       id_field='cin')
class Indexer(object):
    """Owns one Elasticsearch index of event documents: can wipe and
    recreate it with a fixed 'post' mapping, and add single events."""

    def __init__(self, url='http://localhost:9200/', index='events'):
        self._es = ElasticSearch(url)
        self._es.json_encoder = ESJSONEncoder
        self._index = index

    def cleanup(self):
        """Drop the index if present, recreate it with dynamic mapping
        disabled, and install the 'post' type mapping."""
        try:
            self._es.delete_index(self._index)
        except ElasticHttpNotFoundError:
            # Nothing to delete on a fresh cluster -- that is fine.
            pass
        self._es.create_index(
            self._index,
            settings={'index': {'mapper': {'dynamic': False}}})

        def build_mapping(source_field, venue_field, poster_field):
            # Shared skeleton: only the three string fields differ between
            # the candidate mapping flavours below.
            return {
                'properties': {
                    'timestamp': {'type': 'date', 'format': 'dateOptionalTime'},
                    'source': source_field,
                    'venue': venue_field,
                    'poster': poster_field,
                    'delta': {'type': 'integer'}
                }
            }

        not_analyzed = {'type': 'string', 'index': 'not_analyzed'}
        keyword = {'type': 'string', 'analyzer': 'keyword'}
        whitespace = {'type': 'string', 'analyzer': 'whitespace'}

        # Three flavours kept for reference; 'not analyzed' is the one used.
        not_analyzed_mapping = build_mapping(not_analyzed, not_analyzed, not_analyzed)
        analyzed_mapping = build_mapping(keyword, keyword, keyword)
        hybrid_mapping = build_mapping(keyword, whitespace, whitespace)

        chosen = not_analyzed_mapping
        self._es.put_mapping(self._index, 'post', {'post': chosen})

    def add(self, event):
        """Index one event document under its lower-cased 'type', with an id
        of the form '<source>-<id>' and source-prefixed venue/poster."""
        source = event['_id']['source']
        data = {
            'timestamp': event['timestamp'],
            'source': source,
            'venue': '{}-{}'.format(source, event['venue']),
            'poster': '{}-{}'.format(source, event['poster']),
            'delta': event.get('delta', 1)
        }
        self._es.index(
            self._index,
            event.get('type').lower(),
            data,
            id='{source}-{id}'.format(**event['_id'])
        )
# NOTE(review): Python 2 module (see the "except ..., ex" syntax in imbue_jsn);
# relies on module-level names defined elsewhere in this file: logging, json,
# ElasticSearch, ElasticHttpError (and apparently a module-level `logger`).
class elasticBand():
    """Holds per-run monitoring buffers and creates a per-run Elasticsearch
    index whose settings and type mappings are defined inline in __init__."""

    def __init__(self,es_server_url,runstring,indexSuffix,monBufferSize,fastUpdateModulo):
        self.logger = logging.getLogger(self.__class__.__name__)
        # Buffers for documents awaiting indexing.
        self.istateBuffer = []
        self.prcinBuffer = {} # {"lsX": doclist}
        self.prcoutBuffer = {}
        self.fuoutBuffer = {}
        self.es = ElasticSearch(es_server_url)
        # Index settings: custom analyzer built on a path_hierarchy tokenizer
        # splitting on "_", so prefix queries match hierarchical names.
        self.settings = {
            "analysis":{
                "analyzer": {
                    "prefix-test-analyzer": {
                        "type": "custom",
                        "tokenizer": "prefix-test-tokenizer"
                    }
                },
                "tokenizer": {
                    "prefix-test-tokenizer": {
                        "type": "path_hierarchy",
                        "delimiter": "_"
                    }
                }
            },
            "index":{
                'number_of_shards' : 16,
                'number_of_replicas' : 1
            }
        }
        # Type mappings for every document kind written to the run index.
        # Uses legacy _timestamp/_ttl/_routing meta-fields (pre-ES-2.x style).
        self.run_mapping = {
            'prc-i-state' : {
                'properties' : {
                    'macro' : {'type' : 'integer'},
                    'mini' : {'type' : 'integer'},
                    'micro' : {'type' : 'integer'},
                    'tp' : {'type' : 'double' },
                    'lead' : {'type' : 'double' },
                    'nfiles' : {'type' : 'integer'},
                    'fm_date' : {'type' : 'date' }
                },
                '_timestamp' : {
                    'enabled' : True,
                    'store' : "yes",
                    "path" : "fm_date"
                },
                # Short-lived instantaneous-state docs expire after 5 minutes.
                '_ttl' : { 'enabled' : True, 'default' : '5m' }
            },
            'prc-s-state' : {
                'properties' : {
                    'macro' : {'type' : 'integer'},
                    'mini' : {'type' : 'integer'},
                    'micro' : {'type' : 'integer'},
                    'tp' : {'type' : 'double'},
                    'lead' : {'type' : 'double'},
                    'nfiles' : {'type' : 'integer'},
                    'ls' : {'type' : 'integer'},
                    'process': {'type' : 'string'}
                },
            },
            'fu-s-state' : {
                'properties' : {
                    'macro' : {'type' : 'integer'},
                    'mini' : {'type' : 'integer'},
                    'micro' : {'type' : 'integer'},
                    'tp' : {'type' : 'double'},
                    'lead' : {'type' : 'double'},
                    'nfiles' : {'type' : 'integer'},
                    'ls' : {'type' : 'integer'},
                    'machine': {'type' : 'string'}
                }
            },
            'prc-out': {
                # Routed by 'source' so docs for one source land on one shard.
                '_routing' :{
                    'required' : True,
                    'path' : 'source'
                },
                'properties' : {
                    #'definition': {'type': 'string'},
                    'data' : {
                        'properties' : {
                            'in' : { 'type' : 'integer'},
                            'out': { 'type' : 'integer'},
                            'file': { 'type' : 'string','index' : 'not_analyzed'}
                        }
                    },
                    'ls' : {
                        'type' : 'integer',
                        'store': "yes"
                    },
                    'stream' : {'type' : 'string','index' : 'not_analyzed'},
                    # Indexed with the prefix analyzer, searched as keyword,
                    # enabling prefix-style matching on hierarchical names.
                    'source' : {
                        'type' : 'string',
                        'index_analyzer': 'prefix-test-analyzer',
                        'search_analyzer': "keyword",
                        'store' : "yes",
                        'index' : "analyzed"
                    }
                },
                '_timestamp' : {
                    'enabled' : True,
                    'store' : "yes"
                }
            },
            'prc-in': {
                '_routing' :{
                    'required' : True,
                    'path' : 'dest'
                },
                'properties' : {
                    #'definition': {'type': 'string',"index" : "not_analyzed"},
                    'data' : {
                        'properties' : {
                            'out' : { 'type' : 'integer'}
                        }
                    },
                    'ls' : {
                        'type' : 'integer',
                        'store': 'yes'
                    },
                    'index' : { 'type' : 'integer' },
                    'source' : { 'type' : 'string' },
                    'dest' : {
                        'type' : 'string',
                        'index_analyzer': 'prefix-test-analyzer',
                        'search_analyzer': "keyword",
                        'store' : "yes",
                        'index' : "analyzed",
                    },
                    'process' : { 'type' : 'integer' }
                },
                '_timestamp' : {
                    'enabled' : True,
                    'store' : "yes"
                }
            },
            'fu-out': {
                '_routing' :{
                    'required' : True,
                    'path' : 'source'
                },
                'properties' : {
                    #'definition': {'type': 'string',"index" : "not_analyzed"},
                    'data' : {
                        'properties' : {
                            'in' : { 'type' : 'integer'},
                            'out': { 'type' : 'integer'},
                            'errorEvents' : {'type' : 'integer'},
                            'returnCodeMask': {'type':'string',"index" : "not_analyzed"},
                            'fileSize' : {'type':'long'},
                            'files': {
                                'properties' : {
                                    'name' : { 'type' : 'string',"index" : "not_analyzed"}
                                }
                            }
                        }
                    },
                    'ls' : { 'type' : 'integer' },
                    'stream' : {'type' : 'string','index' : 'not_analyzed'},
                    'source' : {
                        'type' : 'string',
                        'index_analyzer': 'prefix-test-analyzer',
                        'search_analyzer': "keyword"
                    }
                },
                '_timestamp' : {
                    'enabled' : True,
                    'store' : "yes"
                }
            },
            'fu-complete' : {
                'properties' : {
                    'host' : {'type' : 'string'},
                    'fm_date' : {'type' : 'date' }
                },
                '_timestamp' : {
                    'enabled' : True,
                    'store' : "yes",
                    "path" : "fm_date"
                },
            },
            'bu-out': {
                'properties' : {
                    #'definition': {'type': 'string',"index" : "not_analyzed"},
                    'out': { 'type' : 'integer'},
                    'ls' : { 'type' : 'integer' },
                    'source' : {'type' : 'string'}#,"index" : "not_analyzed"}
                }
            },
            'cmsswlog' : {
                '_timestamp' : {
                    'enabled' : True,
                    'store' : "yes"
                },
                # Log documents are kept for 30 days.
                '_ttl' : { 'enabled' : True, 'default' : '30d'} ,
                'properties' : {
                    'host' : {'type' : 'string'},
                    'pid' : {'type' : 'integer'},
                    'type' : {'type' : 'string',"index" : "not_analyzed"},
                    'severity' : {'type' : 'string',"index" : "not_analyzed"},
                    'severityVal' : {'type' : 'integer'},
                    'category' : {'type' : 'string'},
                    'fwkState' : {'type' : 'string',"index" : "not_analyzed"},
                    'module' : {'type' : 'string',"index" : "not_analyzed"},
                    'moduleInstance' : {'type' : 'string',"index" : "not_analyzed"},
                    'moduleCall' : {'type' : 'string',"index" : "not_analyzed"},
                    'lumi' : {'type' : 'integer'},
                    'eventInPrc' : {'type' : 'long'},
                    'message' : {'type' : 'string'},#,"index" : "not_analyzed"},
                    'lexicalId' : {'type' : 'string',"index" : "not_analyzed"},
                    'msgtime' : {'type' : 'date','format':'dd-MMM-YYYY HH:mm:ss'},
                    'msgtimezone' : {'type' : 'string'}
                    #'context' : {'type' : 'string'}
                }
            }
        }
        self.run = runstring
        self.monBufferSize = monBufferSize
        self.fastUpdateModulo = fastUpdateModulo
        # Index name convention: "<run>_<suffix>".
        self.indexName = runstring + "_"+indexSuffix
        try:
            self.es.create_index(self.indexName, settings={ 'settings': self.settings, 'mappings': self.run_mapping })
        except ElasticHttpError as ex:
            # print "Index already existing - records will be overridden"
            #this is normally fine as the index gets created somewhere across the cluster
            pass

    def imbue_jsn(self,infile):
        """Parse the JSON file referenced by infile.filepath.

        Returns (document, 0) on success or (None, -1) on a decode error.
        """
        with open(infile.filepath,'r') as fp:
            try:
                document = json.load(fp)
            # NOTE(review): Python-2-only except syntax; also `logger` here is
            # presumably a module-level logger, not self.logger -- confirm.
            except json.scanner.JSONDecodeError,ex:
                logger.exception(ex)
                return None,-1
        return document,0
class elasticBandBU:
    """BU-side Elasticsearch writer: creates/uses the central run index
    (name taken from conf.elastic_runindex_name) with retrying connection
    handling, and converts monitoring files into indexed documents."""

    def __init__(self,es_server_url,runnumber,startTime,runMode=True):
        self.logger = logging.getLogger(self.__class__.__name__)
        self.es_server_url=es_server_url
        self.index_name=conf.elastic_runindex_name
        self.runnumber = str(runnumber)
        self.startTime = startTime
        self.host = os.uname()[1]
        # Set externally to abort the connect/index retry loops.
        self.stopping=False
        self.threadEvent = threading.Event()
        self.runMode=runMode
        # Index settings: path_hierarchy tokenizer splitting on a space here
        # (the run-local elasticBand variant splits on "_").
        self.settings = {
            "analysis":{
                "analyzer": {
                    "prefix-test-analyzer": {
                        "type": "custom",
                        "tokenizer": "prefix-test-tokenizer"
                    }
                },
                "tokenizer": {
                    "prefix-test-tokenizer": {
                        "type": "path_hierarchy",
                        "delimiter": " "
                    }
                }
            },
            "index":{
                'number_of_shards' : 10,
                'number_of_replicas' : 3
            },
        }
        # Type mappings; legacy _id.path / _parent / _timestamp / _ttl
        # meta-fields (pre-ES-2.x style).
        self.run_mapping = {
            'run' : {
                # '_routing' :{
                #     'required' : True,
                #     'path' : 'runNumber'
                # },
                '_id' : {
                    'path' : 'runNumber'
                },
                'properties' : {
                    'runNumber':{ 'type':'integer' },
                    'startTimeRC':{ 'type':'date' },
                    'stopTimeRC':{ 'type':'date' },
                    'startTime':{ 'type':'date' },
                    'endTime':{ 'type':'date' },
                    'completedTime' : { 'type':'date' }
                },
                '_timestamp' : {
                    'enabled' : True,
                    'store' : 'yes'
                }
            },
            'microstatelegend' : {
                '_id' : { 'path' : 'id' },
                '_parent':{'type':'run'},
                'properties' : {
                    'names':{ 'type':'string' },
                    'id':{ 'type':'string' }
                }
            },
            'pathlegend' : {
                '_id' : { 'path' : 'id' },
                '_parent':{'type':'run'},
                'properties' : {
                    'names':{ 'type':'string' },
                    'id':{ 'type':'string' }
                }
            },
            'boxinfo' : {
                '_id' :{'path':'id'},#TODO:remove
                'properties' : {
                    'fm_date' :{'type':'date'},
                    'id' :{'type':'string'},
                    'broken' :{'type':'integer'},
                    'used' :{'type':'integer'},
                    'idles' :{'type':'integer'},
                    'quarantined' :{'type':'integer'},
                    'usedDataDir' :{'type':'integer'},
                    'totalDataDir' :{'type':'integer'},
                    'usedRamdisk' :{'type':'integer'},
                    'totalRamdisk' :{'type':'integer'},
                    'usedOutput' :{'type':'integer'},
                    'totalOutput' :{'type':'integer'},
                    'activeRuns' :{'type':'string'}
                },
                '_timestamp' : {
                    'enabled' : True,
                    'store' : "yes",
                    "path" : "fm_date"
                },
                '_ttl' : { 'enabled' : True, 'default' : '30d' }
            },
            # Same schema as 'boxinfo' but without the TTL: one persistent
            # "latest" document per box.
            'boxinfo_last' : {
                '_id' :{'path':'id'},
                'properties' : {
                    'fm_date' :{'type':'date'},
                    'id' :{'type':'string'},
                    'broken' :{'type':'integer'},
                    'used' :{'type':'integer'},
                    'idles' :{'type':'integer'},
                    'quarantined' :{'type':'integer'},
                    'usedDataDir' :{'type':'integer'},
                    'totalDataDir' :{'type':'integer'},
                    'usedRamdisk' :{'type':'integer'},
                    'totalRamdisk' :{'type':'integer'},
                    'usedOutput' :{'type':'integer'},
                    'totalOutput' :{'type':'integer'},
                    'activeRuns' :{'type':'string'}
                },
                '_timestamp' : {
                    'enabled' : True,
                    'store' : "yes",
                    "path" : "fm_date"
                }
            },
            'eols' : {
                '_id' :{'path':'id'},
                '_parent' :{'type':'run'},
                'properties' : {
                    'fm_date' :{'type':'date'},
                    'id' :{'type':'string'},
                    'ls' :{'type':'integer'},
                    'NEvents' :{'type':'integer'},
                    'NFiles' :{'type':'integer'},
                    'TotalEvents' :{'type':'integer'}
                },
                '_timestamp' : {
                    'enabled' : True,
                    'store' : "yes",
                    "path" : "fm_date"
                },
            },
            'minimerge' : {
                '_id' :{'path':'id'},
                '_parent' :{'type':'run'},
                'properties' : {
                    'fm_date' :{'type':'date'},
                    'id' :{'type':'string'}, #run+appliance+stream+ls
                    'appliance' :{'type':'string'},
                    'stream' :{'type':'string','index' : 'not_analyzed'},
                    'ls' :{'type':'integer'},
                    'processed' :{'type':'integer'},
                    'accepted' :{'type':'integer'},
                    'errorEvents' :{'type':'integer'},
                    'size' :{'type':'integer'},
                }
            }
        }
        # Connect and create the index, retrying until success or a hard
        # limit (100 attempts in runMode, which exits the process).
        connectionAttempts=0
        while True:
            if self.stopping:break
            connectionAttempts+=1
            try:
                self.logger.info('writing to elastic index '+self.index_name)
                ip_url=getURLwithIP(es_server_url)
                self.es = ElasticSearch(es_server_url)
                self.es.create_index(self.index_name, settings={ 'settings': self.settings, 'mappings': self.run_mapping })
                break
            except ElasticHttpError as ex:
                #this is normally fine as the index gets created somewhere across the cluster
                if "IndexAlreadyExistsException" in str(ex):
                    self.logger.info(ex)
                    break
                else:
                    self.logger.error(ex)
                    if runMode and connectionAttempts>100:
                        self.logger.error('elastic (BU): exiting after 100 ElasticHttpError reports from '+ es_server_url)
                        sys.exit(1)
                    elif runMode==False and connectionAttempts>10:
                        # Back off harder when not taking data.
                        self.threadEvent.wait(60)
                    else:
                        self.threadEvent.wait(1)
                    continue
            except (ConnectionError,Timeout) as ex:
                #try to reconnect with different IP from DNS load balancing
                if runMode and connectionAttempts>100:
                    self.logger.error('elastic (BU): exiting after 100 connection attempts to '+ es_server_url)
                    sys.exit(1)
                elif runMode==False and connectionAttempts>10:
                    self.threadEvent.wait(60)
                else:
                    self.threadEvent.wait(1)
                continue
        #write run number document
        if runMode == True:
            document = {}
            document['runNumber'] = self.runnumber
            document['startTime'] = startTime
            documents = [document]
            self.index_documents('run',documents)
            #except ElasticHttpError as ex:
            #    self.logger.info(ex)
            #    pass

    # NOTE(review): missing the `self` parameter -- as written, calling
    # inst.resetURL(url) binds url to the first slot and the `self.es`
    # assignments raise NameError. Looks like it should be
    # `def resetURL(self, url)` -- confirm before relying on it.
    def resetURL(url):
        self.es = None
        self.es = ElasticSearch(url)

    def read_line(self,fullpath):
        """Return the first line of the file at fullpath."""
        with open(fullpath,'r') as fp:
            return fp.readline()

    def elasticize_modulelegend(self,fullpath):
        """Index the microstate legend file as a 'microstatelegend' doc."""
        self.logger.info(os.path.basename(fullpath))
        stub = self.read_line(fullpath)
        document = {}
        document['_parent']= self.runnumber
        document['id']= "microstatelegend_"+self.runnumber
        document['names']= self.read_line(fullpath)
        documents = [document]
        return self.index_documents('microstatelegend',documents)

    def elasticize_pathlegend(self,fullpath):
        """Index the path legend file as a 'pathlegend' doc."""
        self.logger.info(os.path.basename(fullpath))
        stub = self.read_line(fullpath)
        document = {}
        document['_parent']= self.runnumber
        document['id']= "pathlegend_"+self.runnumber
        document['names']= self.read_line(fullpath)
        documents = [document]
        return self.index_documents('pathlegend',documents)

    def elasticize_runend_time(self,endtime):
        """Update the 'run' document with the run end time."""
        self.logger.info(str(endtime)+" going into buffer")
        document = {}
        document['runNumber'] = self.runnumber
        document['startTime'] = self.startTime
        document['endTime'] = endtime
        documents = [document]
        self.index_documents('run',documents)

    def elasticize_box(self,infile):
        """Index a box-info snapshot; silently skip malformed input."""
        basename = infile.basename
        self.logger.debug(basename)
        try:
            document = infile.data
            #TODO:let dynamic ID
            document['id']= basename + '_' + document['fm_date'].split('.')[0]
            #TODO:remove
            documents = [document]
        except:
            #in case of malformed box info
            return
        self.index_documents('boxinfo',documents)
        #self.logger.info(str(document))#check that ID is not present...
        #TODO:write unique boxinfo
        #documents[0]['id']=basename
        #self.index_documents('boxinfo_last',documents)

    def elasticize_eols(self,infile):
        """Index an end-of-lumisection file as an 'eols' document."""
        basename = infile.basename
        self.logger.info(basename)
        data = infile.data['data']
        data.append(infile.mtime)
        # infile.ls is assumed to look like "ls0123" -- strip the prefix.
        data.append(infile.ls[2:])
        values = [int(f) if f.isdigit() else str(f) for f in data]
        keys = ["NEvents","NFiles","TotalEvents","fm_date","ls"]
        document = dict(zip(keys, values))
        document['id'] = infile.name+"_"+os.uname()[1]
        document['_parent']= self.runnumber
        documents = [document]
        self.index_documents('eols',documents)

    def elasticize_minimerge(self,infile):
        """Index a mini-merge summary file as a 'minimerge' document."""
        basename = infile.basename
        self.logger.info(basename)
        data = infile.data['data']
        data.append(infile.mtime)
        data.append(infile.ls[2:])
        stream=infile.stream
        # Normalize "streamX" -> "X".
        if stream.startswith("stream"):
            stream = stream[6:]
        data.append(stream)
        values = [int(f) if str(f).isdigit() else str(f) for f in data]
        keys = ["processed","accepted","errorEvents","fname","size","eolField1","eolField2","fm_date","ls","stream"]
        document = dict(zip(keys, values))
        document['id'] = infile.name
        document['_parent']= self.runnumber
        documents = [document]
        self.index_documents('minimerge',documents)

    def index_documents(self,name,documents):
        """Bulk-index documents of type `name`, retrying once on HTTP errors
        and reconnecting (indefinitely in non-run mode, up to 100 times in
        run mode) on connection errors. Returns True on success."""
        attempts=0
        while True:
            attempts+=1
            try:
                self.es.bulk_index(self.index_name,name,documents)
                return True
            except ElasticHttpError as ex:
                # One silent retry; after that, give up on this batch.
                if attempts<=1:continue
                self.logger.error('elasticsearch HTTP error. skipping document '+name)
                #self.logger.exception(ex)
                return False
            except (ConnectionError,Timeout) as ex:
                if attempts>100 and self.runMode:
                    raise(ex)
                self.logger.error('elasticsearch connection error. retry.')
                if self.stopping:return False
                time.sleep(0.1)
                # Re-resolve DNS: pick a (possibly different) cluster IP.
                ip_url=getURLwithIP(self.es_server_url)
                self.es = ElasticSearch(ip_url)
        return False
# NOTE(review): fragment of an index-administration script; the beginning
# (argument parsing, the 'runindex' branch) and the body of the final `if`
# are outside this view.
my_mapping = mappings.central_runindex_mapping
if index_name.startswith('boxinfo'):
    # NOTE(review): the trailing comma makes my_settings a 1-tuple, which is
    # almost certainly unintended (compare the other branches) -- confirm.
    my_settings = mappings.central_es_settings,
    my_mapping = mappings.central_boxinfo_mapping
if index_name.startswith('hltdlogs'):
    my_settings = mappings.central_es_settings_hltlogs
    my_mapping = mappings.central_hltdlogs_mapping
#alias convention
alias_write = index_name + "_write"
alias_read = index_name + "_read"
if command == 'create':
    es.create_index(index_name, settings={ 'settings': my_settings, 'mappings': my_mapping })
if command == 'alias':
    # Optional explicit alias target; default to the index itself.
    try:
        target_index = sys.argv[4]
    except:
        target_index = index_name
    #check if alias exists
    status1 = requests.get(server_url + '/_alias/' + alias_write).status_code
    status2 = requests.get(server_url + '/_alias/' + alias_read).status_code
    aliases_settings = {"actions": []}
    if status1 != 200:
# NOTE(review): second copy/variant of the index-administration script; it
# runs to the end of this view and is likely truncated.
if index_name.startswith('runindex'):
    my_settings = mappings.central_es_settings_runindex
    my_mapping = mappings.central_runindex_mapping
if index_name.startswith('boxinfo'):
    # NOTE(review): trailing comma makes my_settings a 1-tuple -- almost
    # certainly a bug (compare the runindex/hltdlogs branches); confirm.
    my_settings = mappings.central_es_settings_boxinfo,
    my_mapping = mappings.central_boxinfo_mapping
if index_name.startswith('hltdlogs'):
    my_settings = mappings.central_es_settings_hltlogs
    my_mapping = mappings.central_hltdlogs_mapping
#alias convention
alias_write=index_name+"_write"
alias_read=index_name+"_read"
if command=='create':
    es.create_index(index_name, settings={ 'settings': my_settings, 'mappings': my_mapping })
if command=='alias':
    # Optional explicit alias target; default to the index itself.
    try:
        target_index = sys.argv[4]
    except:
        target_index = index_name
    #check if alias exists
    alias_settings={}
    status1 = requests.get(server_url+'/_alias/'+alias_write).status_code
    status2 = requests.get(server_url+'/_alias/'+alias_read).status_code
    aliases_settings = { "actions": []}
    if status1!=200:
        # NOTE(review): appends to alias_settings (set to {} above, no
        # "actions" key -> KeyError) while aliases_settings holds the
        # "actions" list -- one of the two names looks like a typo; confirm.
        alias_settings["actions"].append({"add": {"index": target_index, "alias": alias_write}})