class ElasticHelper(object):
    def __init__(self):
        self.es = Elasticsearch(ElasticConfig.uri)

    def index(self, body, index, doc_type):
        self.es.index(body=body, index=index, doc_type=doc_type)

    def bulk(self, body, index, doc_type):
        self.es.bulk(body=body, index=index, doc_type=doc_type)

    def scan(self, body, index, doc_type):
        return helpers.scan(self.es,
                            query=body,
                            index=index,
                            doc_type=doc_type,
                            preserve_order=True)

    def search(self, body, index, doc_type):
        try:
            rsp = self.es.search(body=body,
                                 index=index,
                                 doc_type=doc_type,
                                 request_timeout=100)
            if rsp.get("errors"):
                print("es search error")
                return
            return rsp
        except Exception as e:
            print("es search error: " + str(e))

    def count(self, body, index, doc_type):
        return self.es.count(index=index,
                             doc_type=doc_type,
                             body=body,
                             request_timeout=100)

    def delete_index(self, index):
        return self.es.indices.delete(index=index)
Exemple #2
0
class EsClientConnection:
    host = ''
    errorMessage = ''

    def __init__(self, host, index=None, type=None, body=None):
        '''
        创建的时候需要两个都要存在
        :param host:
        :param index:
        :param type:
        :param body:
        '''
        self.host = host
        self.conn = Elasticsearch([self.host])
        # 初始化mapping设置,即创建index
        indexExists = self.conn.indices.exists(index=index)
        typeExists = self.conn.indices.exists_type(index=index, doc_type=type)
        if body is not None:
            if indexExists is not True:
                if typeExists is not True:
                    self.conn.indices.create(index=index, body=body)
                else:
                    self.errorMessage = 'index not exists and type exists. it is not possible!'
            else:
                if typeExists is not True:
                    self.errorMessage = 'index index exists and type not exists'
                else:
                    self.errorMessage = 'index exists and type exists. you not need create it'

    def __del__(self):
        self.close()

    def check(self):
        '''
        输出当前系统的ES信息
        :return:
        '''
        return self.conn.info()

    def insertDocument(self, index, type, body, id=None):
        '''
        插入一条数据body到指定的index、指定的type下;可指定Id,若不指定,ES会自动生成
        :param index: 待插入的index值
        :param type: 待插入的type值
        :param body: 待插入的数据 -> dict型
        :param id: 自定义Id值
        :return:
        '''
        return self.conn.index(index=index, doc_type=type, body=body, id=id)

    def insertDataFrame(self, index, type, dataFrame):
        '''
        批量插入接口;
        bulk接口所要求的数据列表结构为:[{{optionType}: {Condition}}, {data}]
        其中optionType可为index、delete、update
        Condition可设置每条数据所对应的index值和type值
        data为具体要插入/更新的单条数据
        :param index: 默认插入的index值
        :param type: 默认插入的type值
        :param dataFrame: 待插入数据集
        :return:
        '''
        dataList = dataFrame.to_dict(orient='records')
        insertHeadInfoList = [{"index": {}} for i in range(len(dataList))]
        temp = [dict] * (len(dataList) * 2)
        temp[::2] = insertHeadInfoList
        temp[1::2] = dataList
        try:
            return self.conn.bulk(index=index, doc_type=type, body=temp)
        except Exception as e:
            return str(e)

    def deleteDocById(self, index, type, id):
        '''
        删除指定index、type、id对应的数据
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.delete(index=index, doc_type=type, id=id)

    def deleteDocByQuery(self, index, query, type=None):
        '''
        删除idnex下符合条件query的所有数据
        :param index:
        :param query: 满足DSL语法格式
        :param type:
        :return:
        '''
        return self.conn.delete_by_query(index=index,
                                         body=query,
                                         doc_type=type)

    def deleteAllDocByIndex(self, index, type=None):
        '''
        删除指定index下的所有数据
        :param index:
        :return:
        '''
        try:
            query = {'query': {'match_all': {}}}
            return self.conn.delete_by_query(index=index,
                                             body=query,
                                             doc_type=type)
        except Exception as e:
            return str(e) + ' -> ' + index

    def searchDoc(self, index=None, type=None, body=None):
        '''
        查找index下所有符合条件的数据
        :param index:
        :param type:
        :param body: 筛选语句,符合DSL语法格式
        :return:
        '''
        return self.conn.search(index=index, doc_type=type, body=body)

    def getDocById(self, index, type, id):
        '''
        获取指定index、type、id对应的数据
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.get(index=index, doc_type=type, id=id)

    def updateDocById(self, index, type, id, body=None):
        '''
        更新指定index、type、id所对应的数据
        :param index:
        :param type:
        :param id:
        :param body: 待更新的值
        :return:
        '''
        return self.conn.update(index=index, doc_type=type, id=id, body=body)

    def close(self):
        if self.conn is not None:
            try:
                self.conn.close()
            except Exception as e:
                pass
            finally:
                self.conn = None

    def mysqlToEs(self, mysqlData):
        doc = []
        for value in mysqlData:
            doc.append({"index": {}})
            doc.append(value)
        self.conn.bulk(index='product', doc_type='tour_product', body=doc)
    ids = []
    temp = []
    for x in term_list:
        ids.append(x['_id'])
        if 'product' in x['term_vectors'].keys():
            temp.append([x['term_vectors']['product']['terms']])
        else:
            temp.append(None)

    temp = [sort_term_vectors(term_vector) for term_vector in temp]
    body = [
        gen_bulk_2(pid, term_vector) for pid, term_vector in zip(ids, temp)
    ]
    body = [x for x in chain(*body)]
    body = "\n".join(body)
    es.bulk(body)

#
#
#     # df = df.loc[df.pid.isin(ids), 'term_vectors'].assign(temp)
#     # df['term_vectors'] = df['term_vectors'].mask(df['pid'].isin(ids), temp)
#
# =======
#

print(''' get_sorted_term_vectors ''')

sorted_term_vectors_dict = dict()
#     # TODO ES_INDEX : conf.es_nouns_index or conf.es_adjv_index
sorted_term_vectors = es.search(index=conf.es_nouns_index,
                                size=10000,
Exemple #4
0
class ElasticHelper(object):
    def __init__(self):
        self.es = Elasticsearch(ElasticConfig.uri)
        self._multi_search_results = []
        self.bulk_task_queue = []
        self.bulk_last_time = datetime_now_obj()

    def delay_index(self, body, index, doc_type):
        self.bulk_task_queue.append(
            {"index": {
                "_index": index,
                "_type": doc_type
            }})
        self.bulk_task_queue.append(body)

        if self._can_do_bulk():
            self.bulk(body=self.bulk_task_queue,
                      index=index,
                      doc_type=doc_type)
            self.bulk_task_queue = []

        self.bulk_last_time = datetime_now_obj()

    def _can_do_bulk(self):
        # 任务队列超过100条数据
        if len(self.bulk_task_queue) > 100:
            return True
        # 时间间隔超过1分钟
        if get_n_min_ago(1) > self.bulk_last_time:
            return True
        return False

    def index(self, body, index, doc_type):
        self.es.index(body=body, index=index, doc_type=doc_type)

    def bulk(self, body, index, doc_type):
        self.es.bulk(body=body, index=index, doc_type=doc_type)

    def scan(self, body, index, doc_type):
        return helpers.scan(self.es,
                            query=body,
                            index=index,
                            doc_type=doc_type,
                            preserve_order=True)

    def search(self, body, index, doc_type):
        try:
            rsp = self.es.search(body=body,
                                 index=index,
                                 doc_type=doc_type,
                                 request_timeout=100)
            if rsp.get("error"):
                logger.error(rsp.get("error").get("reason"))
                return
            return rsp
        except Exception as e:
            print(body)
            logger.error("es search error: " + str(e) + index)

    def count(self, body, index, doc_type):
        return self.es.count(index=index,
                             doc_type=doc_type,
                             body=body,
                             request_timeout=100)

    def delete_index(self, index):
        return self.es.indices.delete(index=index)

    def put_template(self, name, body, **kwargs):
        return self.es.indices.put_template(name=name, body=body, **kwargs)

    def exists_template(self, name, **kwargs) -> bool:
        return self.es.indices.exists_template(name=name, **kwargs)

    def delete_template(self, name, **kwargs):
        return self.es.indices.delete_template(name=name, **kwargs)

    def get_template(self, name, **kwargs):
        return self.es.indices.get_template(name=name, **kwargs)

    def wait_log_in_database(self, computer_name, record_number):
        """
            因为消息队列和入库ES是分开进行的,所以可能会出现当消费到某条日志时,ES还没入库,所以需要检查同步
        """
        count = 0
        query = {
            "query":
            get_must_statement(
                get_term_statement("computer_name", computer_name),
                get_term_statement("record_number", record_number)),
            "_source":
            False,
            "size":
            1
        }
        while True:
            try:
                rsp = self.es.search(body=query,
                                     index=ElasticConfig.event_log_index,
                                     doc_type=ElasticConfig.event_log_doc_type,
                                     request_timeout=100)
                if rsp.get("error"):
                    logger.error(rsp.get("error").get("reason"))
                    break
                if len(rsp["hits"]["hits"]) > 0:
                    return rsp["hits"]["hits"][0]["_id"]
                time.sleep(2)
                # 最多等5次,即 2 * 5 = 10秒
                if count == 10:
                    break
                count += 1
            except Exception as e:
                logger.error("es wait_log_in_database search error: " + str(e))
                break

    def multi_search(self, body, index, doc_type):
        try:
            rsp = self.es.msearch(body=body,
                                  index=index,
                                  doc_type=doc_type,
                                  request_timeout=100)
            if rsp.get("error"):
                logger.error(rsp.get("error").get("reason"))
                return
            return rsp
        except Exception as e:
            logger.error("es msearch error: " + str(e))