from elasticsearch import Elasticsearch


class ElasticSearchUtil:
    def __init__(self, host):
        self.host = host
        self.conn = Elasticsearch([self.host])

    def __del__(self):
        self.close()

    def check(self):
        '''
        Return info about the ES cluster we are connected to.
        :return:
        '''
        return self.conn.info()

    def insertDocument(self, index, type, body, id=None):
        '''
        Insert a single document (body) under the given index and type.
        An id may be supplied; if omitted, ES generates one automatically.
        :param index: index to insert into
        :param type: type to insert under
        :param body: the document to insert -> dict
        :param id: custom document id
        :return:
        '''
        return self.conn.index(index=index, doc_type=type, body=body, id=id)

    def insertDataFrame(self, index, type, dataFrame):
        '''
        Bulk-insert interface.
        The bulk API expects the body to be an interleaved list of the form
        [{optionType: {condition}}, {data}, ...], where optionType is one of
        index, delete or update, condition can override the index and type
        of each individual document, and data is the document itself.
        :param index: default index to insert into
        :param type: default type to insert under
        :param dataFrame: the data set to insert (pandas DataFrame)
        :return:
        '''
        dataList = dataFrame.to_dict(orient='records')
        insertHeadInfoList = [{"index": {}} for _ in range(len(dataList))]
        # Interleave action headers and documents: [header, doc, header, doc, ...]
        temp = [None] * (len(dataList) * 2)
        temp[::2] = insertHeadInfoList
        temp[1::2] = dataList
        try:
            return self.conn.bulk(index=index, doc_type=type, body=temp)
        except Exception as e:
            return str(e)

    def deleteDocById(self, index, type, id):
        '''
        Delete the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.delete(index=index, doc_type=type, id=id)

    def deleteDocByQuery(self, index, query, type=None):
        '''
        Delete all documents in the index that match the query.
        :param index:
        :param query: a query in DSL syntax
        :param type:
        :return:
        '''
        return self.conn.delete_by_query(index=index, body=query, doc_type=type)

    def deleteAllDocByIndex(self, index, type=None):
        '''
        Delete all documents under the given index.
        :param index:
        :return:
        '''
        try:
            query = {'query': {'match_all': {}}}
            return self.conn.delete_by_query(index=index, body=query, doc_type=type)
        except Exception as e:
            return str(e) + ' -> ' + index

    def searchDoc(self, index=None, type=None, body=None):
        '''
        Find all documents under the index that match the query body.
        :param index:
        :param type:
        :param body: a filter query in DSL syntax
        :return:
        '''
        return self.conn.search(index=index, doc_type=type, body=body)

    def getDocById(self, index, type, id):
        '''
        Fetch the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.get(index=index, doc_type=type, id=id)

    def updateDocById(self, index, type, id, body=None):
        '''
        Update the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :param body: the values to update
        :return:
        '''
        return self.conn.update(index=index, doc_type=type, id=id, body=body)

    def close(self):
        if self.conn is not None:
            try:
                self.conn.close()
            except Exception:
                pass
            finally:
                self.conn = None
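# Hedged usage sketch (added for illustration, not part of the original
# source): it assumes a local ES node and a hypothetical index "demo_index";
# the doc_type-based calls match the pre-7.x elasticsearch-py client used by
# the class above.
def _demo_elasticsearch_util():
    import pandas as pd

    es_util = ElasticSearchUtil('localhost:9200')
    print(es_util.check())  # cluster info

    # Bulk-insert a small DataFrame; each row becomes one document.
    df = pd.DataFrame([{'name': 'a', 'value': 1}, {'name': 'b', 'value': 2}])
    print(es_util.insertDataFrame('demo_index', 'demo_type', df))

    # Fetch everything back with a match_all query.
    res = es_util.searchDoc('demo_index', 'demo_type', {'query': {'match_all': {}}})
    print(res['hits']['total'])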
import os
import sys
import time
from datetime import datetime
import logging

import requests
from elasticsearch import Elasticsearch

logger = logging.getLogger(__name__)


class Connector:
    def __init__(self, esEndpoint, dmonPort=5001, esInstanceEndpoint=9200, index="logstash-*"):
        self.esInstance = Elasticsearch(esEndpoint)
        self.esEndpoint = esEndpoint
        self.dmonPort = dmonPort
        self.esInstanceEndpoint = esInstanceEndpoint
        self.myIndex = index

    def query(self, queryBody, allm=True, dMetrics=None, debug=False):
        dMetrics = dMetrics if dMetrics is not None else []
        res = self.esInstance.search(index=self.myIndex, body=queryBody, request_timeout=230)
        if debug:
            print("%---------------------------------------------------------%")
            print("Raw JSON Output")
            print(res)
            print("%d documents found" % res['hits']['total'])
            print("%---------------------------------------------------------%")
        termsList = []
        termValues = []
        ListMetrics = []
        for doc in res['hits']['hits']:
            if not allm:
                if not dMetrics:
                    sys.exit("dMetrics argument not set. Please supply a valid list of metrics!")
                for met in dMetrics:
                    # Print the values of the metrics defined in the metrics list
                    if debug:
                        print("%---------------------------------------------------------%")
                        print("Parsed Output -> ES doc id, metrics, metric values.")
                        print("doc id %s) metric %s -> value %s" % (doc['_id'], met, doc['_source'][met]))
                        print("%---------------------------------------------------------%")
                    termsList.append(met)
                    termValues.append(doc['_source'][met])
                dictValues = dict(zip(termsList, termValues))
            else:
                for terms in doc['_source']:
                    # Print the values of every metric found in the document
                    if debug:
                        print("%---------------------------------------------------------%")
                        print("Parsed Output -> ES doc id, metrics, metric values.")
                        print("doc id %s) metric %s -> value %s" % (doc['_id'], terms, doc['_source'][terms]))
                        print("%---------------------------------------------------------%")
                    termsList.append(terms)
                    termValues.append(doc['_source'][terms])
                dictValues = dict(zip(termsList, termValues))
            ListMetrics.append(dictValues)
        return ListMetrics, res

    def info(self):
        try:
            res = self.esInstance.info()
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to ES dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            # The original also called sys.exit(2) here, but it was unreachable
            # after the return, so it has been dropped.
            return "An exception has occurred with type %s at arguments %s" % (type(inst), inst.args)
        return res

    def roles(self):
        nUrl = "http://%s:%s/dmon/v1/overlord/nodes/roles" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get roles url -> %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rRoles = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort))
            sys.exit(2)
        rData = rRoles.json()
        return rData

    def createIndex(self, indexName):
        try:
            # The original called self.esInstance.create(), which indexes a
            # document; creating an index goes through the indices API.
            self.esInstance.indices.create(index=indexName, ignore=400)
            logger.info('[%s] : [INFO] Created index %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Failed to create index %s with %s and %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName, type(inst), inst.args)

    def closeIndex(self, indexName):
        try:
            # Same fix as createIndex: closing an index is an indices API call.
            self.esInstance.indices.close(index=indexName)
            logger.info('[%s] : [INFO] Closed index %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Failed to close index %s with %s and %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName, type(inst), inst.args)

    def deleteIndex(self, indexName):
        try:
            res = self.esInstance.indices.delete(index=indexName, ignore=[400, 404])
            logger.info('[%s] : [INFO] Deleted index %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Failed to delete index %s with %s and %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName, type(inst), inst.args)
            return 0
        return res

    def openIndex(self, indexName):
        res = self.esInstance.indices.open(index=indexName)
        logger.info('[%s] : [INFO] Opened index %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName)
        return res

    def getIndex(self, indexName):
        res = self.esInstance.indices.get(index=indexName, human=True)
        return res

    def getIndexSettings(self, indexName):
        res = self.esInstance.indices.get_settings(index=indexName, human=True)
        return res

    def clusterHealth(self):
        res = self.esInstance.cluster.health(request_timeout=15)
        return res

    def clusterSettings(self):
        res = self.esInstance.cluster.get_settings(request_timeout=15)
        return res

    def clusterState(self):
        res = self.esInstance.cluster.stats(human=True, request_timeout=15)
        return res

    def nodeInfo(self):
        res = self.esInstance.nodes.info(request_timeout=15)
        return res

    def nodeState(self):
        res = self.esInstance.nodes.stats(request_timeout=15)
        return res

    def getStormTopology(self):
        nUrl = "http://%s:%s/dmon/v1/overlord/detect/storm" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get storm topology url -> %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rStormTopology = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort))
            sys.exit(2)
        rData = rStormTopology.json()
        return rData

    def pushAnomaly(self, anomalyIndex, doc_type, body):
        try:
            res = self.esInstance.index(index=anomalyIndex, doc_type=doc_type, body=body)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while pushing anomaly with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't push anomaly to dmon!")
            sys.exit(2)
        return res

    def getModel(self):
        return "getModel"

    def pushModel(self):
        return "push model"

    def localData(self):
        return "use local data"

    def getInterval(self):
        nUrl = "http://%s:%s/dmon/v1/overlord/aux/interval" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get interval url -> %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rInterval = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort))
            sys.exit(2)
        rData = rInterval.json()
        return rData

    def aggQuery(self, queryBody):
        # Read the timeout from the ADP_TIMEOUT environment variable; default 60s.
        adt_timeout = os.environ['ADP_TIMEOUT'] = os.getenv('ADP_TIMEOUT', str(60))
        try:
            res = self.esInstance.search(index=self.myIndex, body=queryBody, request_timeout=float(adt_timeout))
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception while executing ES query with %s and %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            sys.exit(2)
        return res

    def getNodeList(self):
        '''
        :return: -> the list of registered nodes from dmon
        '''
        nUrl = "http://%s:%s/dmon/v1/observer/nodes" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get node url -> %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rdmonNode = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort))
            sys.exit(2)
        rdata = rdmonNode.json()
        nodes = []
        for e in rdata['Nodes']:
            for k in e:
                nodes.append(k)
        return nodes

    def getDmonStatus(self):
        nUrl = "http://%s:%s/dmon/v1/overlord/core/status" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get core status url -> %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rdmonStatus = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort))
            sys.exit(2)
        return rdmonStatus.json()
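# Hedged usage sketch (added for illustration, not part of the original
# source): the endpoint is an assumption. It runs a match_all query against
# the default "logstash-*" index and prints one parsed metrics dict per
# returned document.
def _demo_connector():
    conn = Connector('localhost')
    print(conn.info())
    metrics, raw = conn.query({'query': {'match_all': {}}, 'size': 5}, allm=True)
    for m in metrics:
        print(m)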
import os
import re
from datetime import datetime
from http import client
from json import loads, dump

from jsmin import jsmin
from rdflib import Graph
from pyld import jsonld
from elasticsearch import Elasticsearch, helpers


class Rdfxml2Es:
    def __init__(self, file, frame, host, port, esindex, indctrl, bulksize, devmode, filemode, outsubDir):
        """
        1) Initializes some attributes
        2) Checks if a connection to the ES node can be established
        3) Checks that the ES index does not already exist
        4) If 2) and 3) hold, creates the index and type mappings
        :param file: The RDF-XML file
        :param frame: File containing the JSON-LD frame
        :param host: Host of ES node
        :param port: Port of ES node
        :param esindex: Name of ES index
        :param indctrl: Settings and mapping for ES
        :param bulksize: Size of bulk uploads
        :param devmode: Number of samples for running a performance test on
                        different bulk upload sizes
        :param filemode: Write bulk files to disk instead of indexing into ES
        :param outsubDir: Output directory for the bulk files
        :return: None
        """
        self.file = file
        self.frame = frame
        self.host = host
        self.port = port
        self.index = esindex
        self.indctrl = indctrl
        self.bulksize = bulksize
        self.bulknum = 0
        self.devmode = devmode
        self.filemode = filemode
        self.esdocs = list()
        self.outsubDir = outsubDir
        self.numberOfFilesInSubDir = 300
        self.openedFilesInSubDir = 0
        self.currentSubDir = 1
        self.writtenDocuments = 0
        if self.devmode > 0:
            self.doccounter = 0
        if self.filemode:
            self._openFile()
        else:
            try:
                h1 = client.HTTPConnection(self.host, self.port)
                h1.connect()
                h1.close()
                self.of = Elasticsearch([{'host': self.host, 'port': self.port}])
                if not self.of.indices.exists(self.index):
                    if self.indctrl is not None:
                        self.of.indices.create(index=self.index, body=self.loadjson(self.indctrl))
                    else:
                        self.of.indices.create(index=self.index)
            except Exception as inst:
                # The original read inst.args[1], which raises IndexError for
                # most exceptions; str(inst) is the safe equivalent.
                exit("Error: " + str(inst))

    @staticmethod
    def loadjson(ifile):
        """
        Loads a file containing valid JSON-LD objects and removes comments
        :param ifile:
        :return: Object of type Dictionary
        """
        with open(ifile, 'r') as f:
            raw = f.read()
            jsonstr = jsmin(raw)
        return loads(jsonstr)

    @staticmethod
    def stripchars(string):
        """
        Removes tabs and newlines from string.
        :param string:
        :return: Cleaned string
        """
        return ''.join(re.split(r'\t+|\n+', string))

    def parsexml(self):
        """
        Parses XML and kicks off the transformation and indexing of the
        individual documents. Must be implemented in child classes.
        :return: None
        """
        raise NotImplementedError

    def rdf2es(self, string, bibo):
        """
        Does the really interesting stuff: transformation of the triples
        grouped by subject and indexing in ES
        :param string: The RDF triples as a concatenated string.
        :param bibo: Is the subject a bibo:Document?
        :return: Body for ES indexing
        """
        g = Graph().parse(data=string)
        jldstr = g.serialize(format='json-ld', indent=4)
        if bibo:
            esdoc = jsonld.compact(loads(jldstr.decode('utf-8')), self.loadjson(self.frame))
            doctype = 'document'
        else:
            esdoc = loads(jldstr.decode('utf-8'))
            esdoc = jsonld.frame(esdoc, self.loadjson(self.frame))['@graph'][0]
            esdoc['@context'] = self.loadjson(self.frame)['@context']
            doctype = 'bibliographicResource'
        docid = re.findall(r'\w{9}', esdoc['@id'])[0]
        if self.filemode:
            bulkfile = [{'index': {'_index': self.index, '_type': doctype, '_id': docid}}, esdoc]
            return bulkfile
        else:
            esdoc.update({'_index': self.index, '_type': doctype, '_id': docid})
            return esdoc

    def bulkupload(self, string, bibo):
        """
        Creates a list of single JSON-LD documents and indexes them as a bulk upload
        :param string: The RDF triples as a concatenated string.
        :param bibo: Is the subject a bibo:Document?
        :return:
        """
        if not self.filemode:
            self.bulknum += 1
        self.esdocs.append(self.rdf2es(string, bibo))
        if self.filemode:
            # Output the content to a file. We shouldn't serialize the content
            # in memory in output-file mode, so each action/document pair is
            # written as soon as it arrives.
            for outer in self.esdocs:
                for inner in outer:
                    # json.dump is needed because the content is stored as a
                    # dictionary, so we can't just write a string.
                    dump(inner, self.of)
                    self.writtenDocuments += 1
                    self.of.write('\n')
                    # Perhaps flush only in bigger chunks later: self.of.flush()
            del self.esdocs[:]
            if self.writtenDocuments >= self.bulksize:
                self._closeFile()
                self.writtenDocuments = 0
                self._openFile()
        elif self.bulknum >= self.bulksize:
            # Perform the bulk upload
            helpers.bulk(client=self.of, actions=self.esdocs, stats_only=True)
            # Reset the counter and empty the list
            self.bulknum = 0
            del self.esdocs[:]

    def _openFile(self):
        subDir = os.path.join(self.outsubDir, str(self.currentSubDir))
        if not os.path.isdir(subDir):
            os.mkdir(subDir)
        # Every time the script is started the subdir counter is reset to 1,
        # so we also need to check how many files are already stored in the
        # current subdir.
        elif self.openedFilesInSubDir >= self.numberOfFilesInSubDir or \
                len(os.listdir(subDir)) >= self.numberOfFilesInSubDir:
            self.currentSubDir += 1
            subDir = os.path.join(self.outsubDir, str(self.currentSubDir))
            if not os.path.isdir(subDir):
                os.mkdir(subDir)
            # The original reset numberOfFilesInSubDir (the per-directory
            # capacity) here; the running counter is what has to start over.
            self.openedFilesInSubDir = 0
        outfile = "es." + datetime.now().strftime("%Y%m%d_%H%M%S") + "_" + str(datetime.now().microsecond) + ".json"
        # Writing gzip-compressed output directly clashes with json.dump's
        # interface, so plain .json files are written and gzipped on close.
        absoluteFileName = os.path.join(subDir, outfile)
        self.of = open(absoluteFileName, 'w')
        self.openedFilesInSubDir += 1

    def _closeFile(self):
        if self.of is not None:
            # Trailing newline required by the bulk API
            self.of.write("\n")
            self.of.flush()
            name = self.of.name
            self.of.close()
            os.system("gzip " + name)
            self.of = None
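# Hypothetical subclass sketch (not from the original source): parsexml() is
# abstract above, so a child class has to supply it. This minimal version
# streams rdf:Description elements with lxml's iterparse and hands each one
# to bulkupload(); the tag name and the blanket bibo=True flag are
# assumptions about the input data.
class DemoRdfxml2Es(Rdfxml2Es):
    def parsexml(self):
        from lxml import etree
        rdf_ns = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
        for _, elem in etree.iterparse(self.file, tag=rdf_ns + 'Description'):
            # Serialize the element back to RDF/XML for rdf2es/bulkupload.
            triples = etree.tostring(elem, encoding='unicode')
            self.bulkupload(triples, bibo=True)
            elem.clear()  # free memory as we stream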
from elasticsearch import Elasticsearch
from pymongo import MongoClient


class ElasticSearchUtil:
    def __init__(self, host):
        self.host = host
        self.conn = Elasticsearch([self.host])
        self.dbserver = '172.20.12.80'
        # dbport = settings['MONGODB_PORT']
        dbname = 'nccloud-gateway-elasticsearch-' + "crawler"
        self.dbname = dbname
        collname = 'crawldetailapivo'
        client = MongoClient(self.dbserver)
        db = client.admin
        db.authenticate("ublinker", "nash2017")
        db = client[dbname]
        self.col = db[collname]

    def __del__(self):
        self.close()

    def check(self):
        '''
        Return info about the ES cluster we are connected to.
        :return:
        '''
        return self.conn.info()

    def insertDocument(self, index, type, body, id=None):
        '''
        Insert a single document (body) under the given index and type.
        An id may be supplied; if omitted, ES generates one automatically.
        :param index: index to insert into
        :param type: type to insert under
        :param body: the document to insert -> dict
        :param id: custom document id
        :return:
        '''
        return self.conn.index(index=index, doc_type=type, body=body, id=id)

    def insertDataFrame(self, index, type, dataFrame):
        '''
        Bulk-insert interface.
        The bulk API expects the body to be an interleaved list of the form
        [{optionType: {condition}}, {data}, ...], where optionType is one of
        index, delete or update, condition can override the index and type
        of each individual document, and data is the document itself.
        :param index: default index to insert into
        :param type: default type to insert under
        :param dataFrame: the data set to insert (pandas DataFrame)
        :return:
        '''
        dataList = dataFrame.to_dict(orient='records')
        insertHeadInfoList = [{"index": {}} for _ in range(len(dataList))]
        # Interleave action headers and documents: [header, doc, header, doc, ...]
        temp = [None] * (len(dataList) * 2)
        temp[::2] = insertHeadInfoList
        temp[1::2] = dataList
        try:
            return self.conn.bulk(index=index, doc_type=type, body=temp)
        except Exception as e:
            return str(e)

    def deleteDocById(self, index, type, id):
        '''
        Delete the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.delete(index=index, doc_type=type, id=id)

    def deleteDocByQuery(self, index, query, type=None):
        '''
        Delete all documents in the index that match the query.
        :param index:
        :param query: a query in DSL syntax
        :param type:
        :return:
        '''
        return self.conn.delete_by_query(index=index, body=query, doc_type=type)

    def deleteAllDocByIndex(self, index, type=None):
        '''
        Delete all documents under the given index.
        :param index:
        :return:
        '''
        try:
            query = {'query': {'match_all': {}}}
            return self.conn.delete_by_query(index=index, body=query, doc_type=type)
        except Exception as e:
            return str(e) + ' -> ' + index

    def searchDoc(self, index=None, type=None, body=None):
        '''
        Find all documents under the index that match the query body.
        :param index:
        :param type:
        :param body: a filter query in DSL syntax
        :return:
        '''
        # return self.conn.search(index = "dfndsfyfsr0835468931_201803", body = {"query": {"match_all": {}}})
        return self.conn.search(index=index, doc_type=type, body=body)

    def getDocById(self, index, type, id):
        '''
        Fetch the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.get(index=index, doc_type=type, id=id)

    def updateDocById(self, index, type, id, body=None):
        '''
        Update the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :param body: the values to update
        :return:
        '''
        return self.conn.update(index=index, doc_type=type, id=id, body=body)

    def close(self):
        if self.conn is not None:
            try:
                self.conn.close()
            except Exception:
                pass
            finally:
                self.conn = None

    def collectdata(self):
        host = '10.3.5.61:9200'  # unused in the original; kept for reference
        esAction = self
        query = {"from": 0, "size": 10, 'query': {'match_all': {}}}
        res = esAction.searchDoc('dfndsfyfsr0835468931_201803', 'clMnxIkYIB0838868687_busi', query)
        total = res['hits']['total']
        n = int(total / 1000)
        query = {"from": 1, "size": 10000, 'query': {'match_all': {}}}
        # query = {"from": 10000, "size": 100, 'query': {'match_all': {}}}
        res = esAction.searchDoc('dfndsfyfsr0835468931_201803', 'clMnxIkYIB0838868687_busi', query)
        for num in range(999, n):
            page1 = str(2 * num - 1)  # unused in the original; kept for reference
            query = {
                "from": num * 1000,
                "size": 1000,
                'query': {
                    'match_all': {}
                }
            }
            # query = {"from": 10000, "size": 100, 'query': {'match_all': {}}}
            res = esAction.searchDoc('dfndsfyfsr0835468931_201803', 'clMnxIkYIB0838868687_busi', query)
            if num > 999:
                # Debugging guard from the original: stops after the first page
                break
            for hit in res['hits']['hits']:
                data = hit["_source"]
                # esAction.col.insert(data)
                print(hit["_source"])
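# Optional sketch (not in the original): the from/size paging in collectdata()
# is capped by ES's 10,000-document result window by default; the scan helper
# in elasticsearch-py streams every matching document over the scroll API
# instead. Index and type names are the hard-coded ones used above.
from elasticsearch import helpers

def _scan_all(es_util):
    for hit in helpers.scan(es_util.conn,
                            query={'query': {'match_all': {}}},
                            index='dfndsfyfsr0835468931_201803',
                            doc_type='clMnxIkYIB0838868687_busi'):
        print(hit['_source'])
        # es_util.col.insert(hit['_source'])  # mirror into MongoDB if desired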