Example #1
class ElasticSearchUtil:
    def __init__(self, host):
        self.host = host
        self.conn = Elasticsearch([self.host])

    def __del__(self):
        self.close()

    def check(self):
        '''
        Return information about the current ES cluster.
        :return:
        '''
        return self.conn.info()

    def insertDocument(self, index, type, body, id=None):
        '''
        Insert a single document body into the given index and type; an id may be
        supplied, otherwise ES generates one automatically.
        :param index: index to insert into
        :param type: type to insert into
        :param body: document to insert -> dict
        :param id: custom document id
        :return:
        '''
        return self.conn.index(index=index, doc_type=type, body=body, id=id)

    def insertDataFrame(self, index, type, dataFrame):
        '''
        Bulk-insert interface.
        The bulk API expects an interleaved list of the form
        [{optionType: {condition}}, {data}, ...]
        where optionType is one of index, delete or update,
        condition can set the index and type for each individual document,
        and data is the single document to insert/update.
        :param index: default index to insert into
        :param type: default type to insert into
        :param dataFrame: pandas DataFrame holding the documents to insert
        :return:
        '''
        dataList = dataFrame.to_dict(orient='records')
        insertHeadInfoList = [{"index": {}} for _ in range(len(dataList))]
        # interleave action metadata and documents: [action, doc, action, doc, ...]
        temp = [None] * (len(dataList) * 2)
        temp[::2] = insertHeadInfoList
        temp[1::2] = dataList
        try:
            return self.conn.bulk(index=index, doc_type=type, body=temp)
        except Exception as e:
            return str(e)

    def deleteDocById(self, index, type, id):
        '''
        Delete the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.delete(index=index, doc_type=type, id=id)

    def deleteDocByQuery(self, index, query, type=None):
        '''
        Delete all documents in the index that match the query.
        :param index:
        :param query: query in DSL syntax
        :param type:
        :return:
        '''
        return self.conn.delete_by_query(index=index, body=query, doc_type=type)

    def deleteAllDocByIndex(self, index, type=None):
        '''
        Delete all documents in the given index.
        :param index:
        :return:
        '''
        try:
            query = {'query': {'match_all': {}}}
            return self.conn.delete_by_query(index=index, body=query, doc_type=type)
        except Exception as e:
            return str(e) + ' -> ' + index

    def searchDoc(self, index=None, type=None, body=None):
        '''
        Find all documents in the index that match the given conditions.
        :param index:
        :param type:
        :param body: filter statement in DSL syntax
        :return:
        '''
        return self.conn.search(index=index, doc_type=type, body=body)

    def getDocById(self, index, type, id):
        '''
        Get the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.get(index=index, doc_type=type, id=id)

    def updateDocById(self, index, type, id, body=None):
        '''
        Update the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :param body: values to update
        :return:
        '''
        return self.conn.update(index=index, doc_type=type, id=id, body=body)


    def close(self):
        if self.conn is not None:
            try:
                self.conn.close()
            except Exception:
                pass
            finally:
                self.conn = None
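
A minimal usage sketch for ElasticSearchUtil follows; the host, index, type and
DataFrame contents are placeholder assumptions, not part of the original example.

import pandas as pd

es_util = ElasticSearchUtil('127.0.0.1:9200')  # hypothetical host
print(es_util.check())  # cluster info

df = pd.DataFrame([{'name': 'a', 'value': 1},
                   {'name': 'b', 'value': 2}])
print(es_util.insertDataFrame('demo-index', 'demo-type', df))

body = {'query': {'match_all': {}}}
print(es_util.searchDoc('demo-index', 'demo-type', body))
es_util.close()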
Example #2
class Connector:
    def __init__(self, esEndpoint, dmonPort=5001, esInstanceEndpoint=9200, index="logstash-*"):
        self.esInstance = Elasticsearch(esEndpoint)
        self.esEndpoint = esEndpoint
        self.dmonPort = dmonPort
        self.esInstanceEndpoint = esInstanceEndpoint
        self.myIndex = index

    def query(self, queryBody, allm=True, dMetrics=None, debug=False):
        res = self.esInstance.search(index=self.myIndex, body=queryBody, request_timeout=230)
        if debug:
            print("%---------------------------------------------------------%")
            print("Raw JSON Output")
            print(res)
            print("%d documents found" % res['hits']['total'])
            print("%---------------------------------------------------------%")
        termsList = []
        termValues = []
        dictValues = {}
        ListMetrics = []
        for doc in res['hits']['hits']:
            if not allm:
                if not dMetrics:
                    sys.exit("dMetrics argument not set. Please supply a valid list of metrics!")
                for met in dMetrics:
                    # print the values of the metrics defined in the metrics list
                    if debug:
                        print("%---------------------------------------------------------%")
                        print("Parsed Output -> ES doc id, metrics, metrics values.")
                        print("doc id %s) metric %s -> value %s" % (doc['_id'], met, doc['_source'][met]))
                        print("%---------------------------------------------------------%")
                    termsList.append(met)
                    termValues.append(doc['_source'][met])
                dictValues = dict(zip(termsList, termValues))
            else:
                for terms in doc['_source']:
                    # print every field and value found in the document source
                    if debug:
                        print("%---------------------------------------------------------%")
                        print("Parsed Output -> ES doc id, metrics, metrics values.")
                        print("doc id %s) metric %s -> value %s" % (doc['_id'], terms, doc['_source'][terms]))
                        print("%---------------------------------------------------------%")
                    termsList.append(terms)
                    termValues.append(doc['_source'][terms])
                dictValues = dict(zip(termsList, termValues))
            ListMetrics.append(dictValues)
        return ListMetrics, res

    def info(self):
        try:
            res = self.esInstance.info()
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to ES dmon with type %s at arguments %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            return "An exception has occurred with type %s at arguments %s" % (type(inst), inst.args)
        return res

    def roles(self):
        nUrl = "http://%s:%s/dmon/v1/overlord/nodes/roles" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get roles url -> %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rRoles = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort))
            sys.exit(2)
        rData = rRoles.json()
        return rData

    def createIndex(self, indexName):
        try:
            # indices.create is the index-creation API; Elasticsearch.create indexes a document
            self.esInstance.indices.create(index=indexName, ignore=400)
            logger.info('[%s] : [INFO] Created index %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Failed to create index %s with %s and %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName, type(inst), inst.args)

    def closeIndex(self, indexName):
        try:
            self.esInstance.indices.close(index=indexName)
            logger.info('[%s] : [INFO] Closed index %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Failed to close index %s with %s and %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName, type(inst),
                         inst.args)

    def deleteIndex(self, indexName):
        try:
            res = self.esInstance.indices.delete(index=indexName, ignore=[400, 404])
            logger.info('[%s] : [INFO] Deleted index %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Failed to delete index %s with %s and %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName, type(inst),
                         inst.args)
            return 0
        return res

    def openIndex(self, indexName):
        res = self.esInstance.indices.open(index=indexName)
        logger.info('[%s] : [INFO] Open index %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), indexName)
        return res

    def getIndex(self, indexName):
        res = self.esInstance.indices.get(index=indexName, human=True)
        return res

    def getIndexSettings(self, indexName):
        res = self.esInstance.indices.get_settings(index=indexName, human=True)
        return res

    def clusterHealth(self):
        res = self.esInstance.cluster.health(request_timeout=15)
        return res

    def clusterSettings(self):
        res = self.esInstance.cluster.get_settings(request_timeout=15)
        return res

    def clusterState(self):
        res = self.esInstance.cluster.stats(human=True, request_timeout=15)
        return res

    def nodeInfo(self):
        res = self.esInstance.nodes.info(request_timeout=15)
        return res

    def nodeState(self):
        res = self.esInstance.nodes.stats(request_timeout=15)
        return res

    def getStormTopology(self):
        nUrl = "http://%s:%s/dmon/v1/overlord/detect/storm" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get storm topology url -> %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rStormTopology = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort))
            sys.exit(2)
        rData = rStormTopology.json()
        return rData

    def pushAnomaly(self, anomalyIndex, doc_type, body):
        try:
            res = self.esInstance.index(index=anomalyIndex, doc_type=doc_type, body=body)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while pushing anomaly with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't push anomaly to dmon!")
            sys.exit(2)
        return res

    def getModel(self):
        return "getModel"

    def pushModel(self):
        return "push model"

    def localData(self):
        return "use local data"

    def getInterval(self):
        nUrl = "http://%s:%s/dmon/v1/overlord/aux/interval" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get interval url -> %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rInterval = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort))
            sys.exit(2)
        rData = rInterval.json()
        return rData

    def aggQuery(self, queryBody):
        adt_timeout = os.environ['ADP_TIMEOUT'] = os.getenv('ADP_TIMEOUT', str(60))  # set timeout via env variable ADP_TIMEOUT; if not set, use default 60
        try:
            res = self.esInstance.search(index=self.myIndex, body=queryBody, request_timeout=float(adt_timeout))
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception while executing ES query with %s and %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            sys.exit(2)
        return res

    def getNodeList(self):
        '''
        :return: the list of registered nodes from dmon
        '''
        nUrl = "http://%s:%s/dmon/v1/observer/nodes" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get node url -> %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rdmonNode = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s', datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort))
            sys.exit(2)
        rdata = rdmonNode.json()
        nodes = []
        for e in rdata['Nodes']:
            for k in e:
                nodes.append(k)
        return nodes

    def getDmonStatus(self):
        nUrl = "http://%s:%s/dmon/v1/overlord/core/status" % (self.esEndpoint, self.dmonPort)
        logger.info('[%s] : [INFO] dmon get core status url -> %s',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), nUrl)
        try:
            rdmonStatus = requests.get(nUrl)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Exception has occurred while connecting to dmon with type %s at arguments %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Can't connect to dmon at %s port %s" % (self.esEndpoint, self.dmonPort))
            sys.exit(2)
        return rdmonStatus.json()
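
A usage sketch for Connector; the dmon endpoint and the query body below are
placeholder assumptions, and the calls require a reachable ES/dmon instance.

conn = Connector('dmon.example.com')  # hypothetical endpoint
print(conn.info())

queryBody = {'size': 10, 'query': {'match_all': {}}}
metrics, raw = conn.query(queryBody, allm=True, debug=False)
print('%d metric dicts parsed' % len(metrics))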
Example #3
class ElasticSearchUtil:
    def __init__(self, host):
        self.host = host
        self.conn = Elasticsearch([self.host])

    def __del__(self):
        self.close()

    def check(self):
        '''
        Return information about the current ES cluster.
        :return:
        '''
        return self.conn.info()

    def insertDocument(self, index, type, body, id=None):
        '''
        Insert a single document body into the given index and type; an id may be
        supplied, otherwise ES generates one automatically.
        :param index: index to insert into
        :param type: type to insert into
        :param body: document to insert -> dict
        :param id: custom document id
        :return:
        '''
        return self.conn.index(index=index, doc_type=type, body=body, id=id)

    def insertDataFrame(self, index, type, dataFrame):
        '''
        Bulk-insert interface.
        The bulk API expects an interleaved list of the form
        [{optionType: {condition}}, {data}, ...]
        where optionType is one of index, delete or update,
        condition can set the index and type for each individual document,
        and data is the single document to insert/update.
        :param index: default index to insert into
        :param type: default type to insert into
        :param dataFrame: pandas DataFrame holding the documents to insert
        :return:
        '''
        dataList = dataFrame.to_dict(orient='records')
        insertHeadInfoList = [{"index": {}} for _ in range(len(dataList))]
        # interleave action metadata and documents: [action, doc, action, doc, ...]
        temp = [None] * (len(dataList) * 2)
        temp[::2] = insertHeadInfoList
        temp[1::2] = dataList
        try:
            return self.conn.bulk(index=index, doc_type=type, body=temp)
        except Exception as e:
            return str(e)

    def deleteDocById(self, index, type, id):
        '''
        Delete the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.delete(index=index, doc_type=type, id=id)

    def deleteDocByQuery(self, index, query, type=None):
        '''
        Delete all documents in the index that match the query.
        :param index:
        :param query: query in DSL syntax
        :param type:
        :return:
        '''
        return self.conn.delete_by_query(index=index, body=query, doc_type=type)

    def deleteAllDocByIndex(self, index, type=None):
        '''
        Delete all documents in the given index.
        :param index:
        :return:
        '''
        try:
            query = {'query': {'match_all': {}}}
            return self.conn.delete_by_query(index=index, body=query, doc_type=type)
        except Exception as e:
            return str(e) + ' -> ' + index

    def searchDoc(self, index=None, type=None, body=None):
        '''
        Find all documents in the index that match the given conditions.
        :param index:
        :param type:
        :param body: filter statement in DSL syntax
        :return:
        '''
        return self.conn.search(index=index, doc_type=type, body=body)

    def getDocById(self, index, type, id):
        '''
        Get the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.get(index=index, doc_type=type, id=id)

    def updateDocById(self, index, type, id, body=None):
        '''
        Update the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :param body: values to update
        :return:
        '''
        return self.conn.update(index=index, doc_type=type, id=id, body=body)


    def close(self):
        if self.conn is not None:
            try:
                self.conn.close()
            except Exception:
                pass
            finally:
                self.conn = None
Example #4
class Rdfxml2Es:

    def __init__(self, file, frame, host, port, esindex, indctrl, bulksize, devmode, filemode, outsubDir):
        """
        1) Initializes some attributes
        2) Checks if connection to ES node can be established
        3) Checks if ES index does not already exist
        4) If 2) and 3) are true, then create index and type mappings

        :param file: The RDF-XML file
        :param frame: File containing the JSON-LD framing
        :param host: Host of ES node
        :param port: Port of ES node
        :param esindex: Name of ES index
        :param indctrl: Settings and mapping for ES
        :param bulksize: Size of bulk uploads
        :param devmode: Number of samples for performing a performance
        test on different bulk upload sizes
        :param filemode:
        :param outsubDir:
        :return: None
        """
        self.file = file
        self.frame = frame
        self.host = host
        self.port = port
        self.index = esindex
        self.indctrl = indctrl
        self.bulksize = bulksize
        self.bulknum = 0
        self.devmode = devmode
        self.filemode = filemode
        self.esdocs = list()
        self.outsubDir = outsubDir
        self.numberOfFilesInSubDir = 300
        self.openedFilesInSubDir = 0
        self.currentSubDir = 1
        self.writtenDocuments = 0
        if self.devmode > 0:
            self.doccounter = 0
        if self.filemode:
            self._openFile()
            #self.of = open('output.json', 'w')
        else:
            try:
                h1 = client.HTTPConnection(self.host, self.port)
                h1.connect()
                h1.close()
                self.of = Elasticsearch([{'host': self.host, 'port': self.port}])
                if not self.of.indices.exists(index=self.index):
                    if self.indctrl is not None:
                        self.of.indices.create(index=self.index, body=self.loadjson(self.indctrl))
                    else:
                        self.of.indices.create(index=self.index)
            except Exception as inst:
                exit("Error: " + inst.args[1])

    @staticmethod
    def loadjson(ifile):
        """
        Loads a file containing valid JSON-LD objects and removes comments
        :param ifile:
        :return: Object of type Dictionary
        """
        with open(ifile, 'r') as f:
            raw = f.read()
        jsonstr = jsmin(raw)
        return loads(jsonstr)

    @staticmethod
    def stripchars(string):
        """
        Removes tabs and newlines from string.
        :param string:
        :return: Cleaned string
        """
        return ''.join(re.split(r'\t+|\n+', string))

    def parsexml(self):
        """
        Parses XML and kicks off the transformation and indexing of the individual documents.
        Must be implemented in child classes
        :return: None
        """
        raise NotImplementedError

    def rdf2es(self, string, bibo):
        """
        Does the really interesting stuff: Transformation of the
        triples by subject and indexing in ES
        :param string: The RDF triples as a concatenated string.
        :param bibo: Is subject a bibo:Document?
        :return: Body for ES indexing
        """
        g = Graph().parse(data=string)
        jldstr = g.serialize(format='json-ld',
                             indent=4)
        if bibo:
            esdoc = jsonld.compact(loads(jldstr.decode('utf-8')), self.loadjson(self.frame))
            doctype = 'document'
        else:
            esdoc = loads(jldstr.decode('utf-8'))
            esdoc = jsonld.frame(esdoc, self.loadjson(self.frame))['@graph'][0]
            esdoc['@context'] = self.loadjson(self.frame)['@context']
            doctype = 'bibliographicResource'
        docid = re.findall(r'\w{9}', esdoc['@id'])[0]
        if self.filemode:
            bulkfile = [{'index': {'_index': self.index, '_type': doctype, '_id': docid}}, esdoc]
            return bulkfile
        else:
            esdoc.update({'_index': self.index, '_type': doctype, '_id': docid})
            return esdoc

    def bulkupload(self, string, bibo):
        """
        Creates a list of single JSON-LD documents and indexes them as bulk upload
        :param string: The RDF triples as a concatenated string.
        :param bibo: Is subject a bibo:Document?
        :return:
        """
        if not self.filemode:
            self.bulknum += 1
        self.esdocs.append(self.rdf2es(string, bibo))

        if self.filemode:
            # Output content to file.
            # We shouldn't serialize the content in memory in output-file mode.
            for outer in self.esdocs:
                for inner in outer:
                    # json.dump is needed because the content is stored in a
                    # dictionary structure, so we can't just write a string
                    dump(inner, self.of)
                    self.writtenDocuments += 1
                    self.of.write('\n')
            # perhaps flush only in bigger chunks later
            # self.of.flush()
            del self.esdocs[:]
            if self.writtenDocuments >= self.bulksize:
                self._closeFile()
                self.writtenDocuments = 0
                self._openFile()

        elif self.bulknum >= self.bulksize:
            # Perform bulk upload
            helpers.bulk(client=self.of, actions=self.esdocs, stats_only=True)
            # Reset counter and empty list
            self.bulknum = 0
            del self.esdocs[:]

    def _openFile(self):

        subDir = self.outsubDir + os.sep + str(self.currentSubDir)

        if not os.path.isdir(subDir):
            os.mkdir(subDir)
        # every time the script is started the current subdir counter starts at 1
        # again, so we need to check how many files are already stored in there
        elif self.openedFilesInSubDir >= self.numberOfFilesInSubDir or \
                len(os.listdir(subDir)) >= self.numberOfFilesInSubDir:
            self.currentSubDir += 1
            subDir = self.outsubDir + os.sep + str(self.currentSubDir)
            if not os.path.isdir(subDir):
                os.mkdir(subDir)
            self.numberOfFilesInSubDir = 0

        outfile = "es." + datetime.now().strftime("%Y%m%d_%H%M%S") + "_" + datetime.now().microsecond.__str__() + ".json"

        #using compressed method we are getting difficulties with the gzip interface in combination with the dump method of json module
        #outfile = "es." + datetime.now().strftime("%Y%m%d_%H%M%S") + "_" + datetime.now().microsecond.__str__() + ".json.gz"
        absoluteFileName = "".join([subDir,os.sep, outfile])
        self.of = open(absoluteFileName,'w')
        #self.of = gzip.open(absoluteFileName,'wb')

        self.numberOfFilesInSubDir += 1

    def _closeFile(self):
        if self.of is not None:
            # trailing newline required by the bulk API
            self.of.write("\n")
            self.of.flush()
            name = self.of.name
            self.of.close()
            os.system("gzip " + name)
        self.of = None
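
parsexml is left abstract, so Rdfxml2Es is only usable through a subclass. A
file-mode instantiation sketch with placeholder paths and sizes (calling
parsexml on the base class raises NotImplementedError):

converter = Rdfxml2Es(file='data.rdf.xml', frame='frame.jsonld',  # hypothetical paths
                      host='localhost', port=9200, esindex='rdfdocs',
                      indctrl=None, bulksize=1000, devmode=0,
                      filemode=True, outsubDir='out')
converter.parsexml()  # NotImplementedError: implement this in a subclass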
Example #5
class ElasticSearchUtil:
    def __init__(self, host):
        self.host = host
        self.conn = Elasticsearch([self.host])
        self.dbserver = '172.20.12.80'
        # dbport = settings['MONGODB_PORT']
        dbname = 'nccloud-gateway-elasticsearch-' + "crawler"
        self.dbname = dbname
        collname = 'crawldetailapivo'
        client = MongoClient(self.dbserver)
        db = client.admin
        db.authenticate("ublinker", "nash2017")
        db = client[dbname]
        self.col = db[collname]

    def __del__(self):
        self.close()

    def check(self):
        '''
        Return information about the current ES cluster.
        :return:
        '''
        return self.conn.info()

    def insertDocument(self, index, type, body, id=None):
        '''
        Insert a single document body into the given index and type; an id may be
        supplied, otherwise ES generates one automatically.
        :param index: index to insert into
        :param type: type to insert into
        :param body: document to insert -> dict
        :param id: custom document id
        :return:
        '''
        return self.conn.index(index=index, doc_type=type, body=body, id=id)

    def insertDataFrame(self, index, type, dataFrame):
        '''
        Bulk-insert interface.
        The bulk API expects an interleaved list of the form
        [{optionType: {condition}}, {data}, ...]
        where optionType is one of index, delete or update,
        condition can set the index and type for each individual document,
        and data is the single document to insert/update.
        :param index: default index to insert into
        :param type: default type to insert into
        :param dataFrame: pandas DataFrame holding the documents to insert
        :return:
        '''
        dataList = dataFrame.to_dict(orient='records')
        insertHeadInfoList = [{"index": {}} for _ in range(len(dataList))]
        # interleave action metadata and documents: [action, doc, action, doc, ...]
        temp = [None] * (len(dataList) * 2)
        temp[::2] = insertHeadInfoList
        temp[1::2] = dataList
        try:
            return self.conn.bulk(index=index, doc_type=type, body=temp)
        except Exception as e:
            return str(e)

    def deleteDocById(self, index, type, id):
        '''
        Delete the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.delete(index=index, doc_type=type, id=id)

    def deleteDocByQuery(self, index, query, type=None):
        '''
        Delete all documents in the index that match the query.
        :param index:
        :param query: query in DSL syntax
        :param type:
        :return:
        '''
        return self.conn.delete_by_query(index=index,
                                         body=query,
                                         doc_type=type)

    def deleteAllDocByIndex(self, index, type=None):
        '''
        Delete all documents in the given index.
        :param index:
        :return:
        '''
        try:
            query = {'query': {'match_all': {}}}
            return self.conn.delete_by_query(index=index,
                                             body=query,
                                             doc_type=type)
        except Exception as e:
            return str(e) + ' -> ' + index

    def searchDoc(self, index=None, type=None, body=None):
        '''
        Find all documents in the index that match the given conditions.
        :param index:
        :param type:
        :param body: filter statement in DSL syntax
        :return:
        '''
        # return self.conn.search(index = "dfndsfyfsr0835468931_201803", body = {"query": {"match_all": {}}})
        return self.conn.search(index=index, doc_type=type, body=body)

    def getDocById(self, index, type, id):
        '''
        Get the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.get(index=index, doc_type=type, id=id)

    def updateDocById(self, index, type, id, body=None):
        '''
        Update the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :param body: values to update
        :return:
        '''
        return self.conn.update(index=index, doc_type=type, id=id, body=body)

    def close(self):
        if self.conn is not None:
            try:
                self.conn.close()
            except Exception:
                pass
            finally:
                self.conn = None

    def collectdata(self):
        esAction = self
        query = {"from": 0, "size": 10, 'query': {'match_all': {}}}
        res = esAction.searchDoc('dfndsfyfsr0835468931_201803',
                                 'clMnxIkYIB0838868687_busi', query)
        total = res['hits']['total']
        n = int(total / 1000)  # number of 1000-document pages
        query = {"from": 1, "size": 10000, 'query': {'match_all': {}}}
        # query = {"from": 10000, "size": 100, 'query': {'match_all': {}}}
        res = esAction.searchDoc('dfndsfyfsr0835468931_201803',
                                 'clMnxIkYIB0838868687_busi', query)
        # page through the results 1000 documents at a time, starting at page 999;
        # the break below stops the loop after a single page
        for num in range(999, n):
            query = {
                "from": num * 1000,
                "size": 1000,
                'query': {
                    'match_all': {}
                }
            }
            # query = {"from": 10000, "size": 100, 'query': {'match_all': {}}}
            res = esAction.searchDoc('dfndsfyfsr0835468931_201803',
                                     'clMnxIkYIB0838868687_busi', query)
            if num > 999:
                break
            for hit in res['hits']['hits']:
                data = hit["_source"]
                #         esAction.col.insert(data)
                print(hit["_source"])