class ElasticHelper(object):
    """Thin wrapper around the Elasticsearch client configured from ElasticConfig."""

    def __init__(self):
        self.es = Elasticsearch(ElasticConfig.uri)

    def index(self, body, index, doc_type):
        """Index a single document *body* into *index*/*doc_type*."""
        self.es.index(body=body, index=index, doc_type=doc_type)

    def bulk(self, body, index, doc_type):
        """Submit a pre-built bulk request body."""
        self.es.bulk(body=body, index=index, doc_type=doc_type)

    def scan(self, body, index, doc_type):
        """Return a scan/scroll iterator over every hit matching *body*."""
        return helpers.scan(self.es, query=body, index=index,
                            doc_type=doc_type, preserve_order=True)

    def search(self, body, index, doc_type):
        """Run a search; return the raw response dict, or None on error."""
        try:
            rsp = self.es.search(body=body, index=index, doc_type=doc_type,
                                 request_timeout=100)
            # A failed search reports a top-level "error" object; "errors"
            # (checked previously) is the bulk-API flag and never appears here.
            if rsp.get("error"):
                print("es search error")
                return
            return rsp
        except Exception as e:
            print("es search error: " + str(e))

    def count(self, body, index, doc_type):
        """Return the count-API response for *body*."""
        return self.es.count(index=index, doc_type=doc_type, body=body,
                             request_timeout=100)

    def delete_index(self, index):
        """Delete an entire index."""
        return self.es.indices.delete(index=index)
class EsClientConnection:
    """Elasticsearch connection helper.

    On construction it can also create an index: *index*, *type* and *body*
    (the mapping) must all be consistent for creation to happen; otherwise
    ``errorMessage`` records why nothing was created.
    """

    host = ''
    errorMessage = ''

    def __init__(self, host, index=None, type=None, body=None):
        """Connect to *host*; when a mapping *body* is given, create *index*
        if neither the index nor the doc type exists yet.

        :param host: Elasticsearch host address
        :param index: index name to (maybe) create
        :param type: doc type name to check
        :param body: mapping body for index creation
        """
        self.host = host
        self.conn = Elasticsearch([self.host])
        index_exists = self.conn.indices.exists(index=index)
        type_exists = self.conn.indices.exists_type(index=index, doc_type=type)
        if body is not None:
            if index_exists is not True:
                if type_exists is not True:
                    self.conn.indices.create(index=index, body=body)
                else:
                    self.errorMessage = 'index not exists and type exists. it is not possible!'
            else:
                if type_exists is not True:
                    self.errorMessage = 'index index exists and type not exists'
                else:
                    self.errorMessage = 'index exists and type exists. you not need create it'

    def __del__(self):
        self.close()

    def check(self):
        """Return cluster info for the connected node."""
        return self.conn.info()

    def insertDocument(self, index, type, body, id=None):
        """Insert one document *body* under *index*/*type*.

        :param id: explicit document id; when None, ES generates one.
        """
        return self.conn.index(index=index, doc_type=type, body=body, id=id)

    def insertDataFrame(self, index, type, dataFrame):
        """Bulk-insert every row of *dataFrame* under *index*/*type*.

        The bulk API requires an ``{"index": {}}`` action line before each
        source document, so the body interleaves action/source pairs.
        Returns the bulk response, or the error text on failure.
        """
        records = dataFrame.to_dict(orient='records')
        actions = []
        for record in records:
            actions.append({"index": {}})
            actions.append(record)
        try:
            return self.conn.bulk(index=index, doc_type=type, body=actions)
        except Exception as e:
            return str(e)

    def deleteDocById(self, index, type, id):
        """Delete the document identified by *index*/*type*/*id*."""
        return self.conn.delete(index=index, doc_type=type, id=id)

    def deleteDocByQuery(self, index, query, type=None):
        """Delete every document under *index* matching *query* (DSL dict)."""
        return self.conn.delete_by_query(index=index, body=query, doc_type=type)

    def deleteAllDocByIndex(self, index, type=None):
        """Delete all documents under *index*; returns error text on failure."""
        try:
            query = {'query': {'match_all': {}}}
            return self.conn.delete_by_query(index=index, body=query, doc_type=type)
        except Exception as e:
            return str(e) + ' -> ' + index

    def searchDoc(self, index=None, type=None, body=None):
        """Search *index*/*type* with DSL *body*; return the raw response."""
        return self.conn.search(index=index, doc_type=type, body=body)

    def getDocById(self, index, type, id):
        """Fetch the document identified by *index*/*type*/*id*."""
        return self.conn.get(index=index, doc_type=type, id=id)

    def updateDocById(self, index, type, id, body=None):
        """Update the document at *index*/*type*/*id* with *body*."""
        return self.conn.update(index=index, doc_type=type, id=id, body=body)

    def close(self):
        """Close the underlying connection once; safe to call repeatedly."""
        # getattr guard: __del__ may run even when __init__ raised before
        # self.conn was ever assigned.
        conn = getattr(self, 'conn', None)
        if conn is not None:
            try:
                conn.close()
            except Exception:
                pass
            finally:
                self.conn = None

    def mysqlToEs(self, mysqlData):
        """Bulk-load rows fetched from MySQL into product/tour_product."""
        doc = []
        for value in mysqlData:
            doc.append({"index": {}})
            doc.append(value)
        self.conn.bulk(index='product', doc_type='tour_product', body=doc)
# Script fragment: collect per-document 'product' term vectors, sort them,
# and push them back to Elasticsearch through the bulk API.
# NOTE(review): depends on names defined elsewhere in the project
# (term_list, sort_term_vectors, gen_bulk_2, es, conf) and the final
# es.search(...) call is truncated in this chunk — left exactly as found.
ids = []
temp = []
for x in term_list:
    ids.append(x['_id'])
    # Keep the doc's 'product' field term vector, or None when the field
    # produced no term vectors for this document.
    if 'product' in x['term_vectors'].keys():
        temp.append([x['term_vectors']['product']['terms']])
    else:
        temp.append(None)
temp = [sort_term_vectors(term_vector) for term_vector in temp]
# One bulk action/source pair per (doc id, sorted term vector).
body = [
    gen_bulk_2(pid, term_vector)
    for pid, term_vector in zip(ids, temp)
]
# Flatten the per-document pairs and join into a newline-delimited bulk body.
body = [x for x in chain(*body)]
body = "\n".join(body)
es.bulk(body)
#
#
#
# df = df.loc[df.pid.isin(ids), 'term_vectors'].assign(temp)
#
# df['term_vectors'] = df['term_vectors'].mask(df['pid'].isin(ids), temp)
#
# =======
#
print(''' get_sorted_term_vectors ''')
sorted_term_vectors_dict = dict()
#
# TODO ES_INDEX : conf.es_nouns_index or conf.es_adjv_index
sorted_term_vectors = es.search(index=conf.es_nouns_index, size=10000,
class ElasticHelper(object):
    """Elasticsearch helper adding queued ("delayed") bulk indexing, index
    template management and msearch support on top of the raw client."""

    def __init__(self):
        self.es = Elasticsearch(ElasticConfig.uri)
        self._multi_search_results = []
        # Pending bulk action/source entries and the time of the last flush.
        self.bulk_task_queue = []
        self.bulk_last_time = datetime_now_obj()

    def delay_index(self, body, index, doc_type):
        """Queue *body* for bulk indexing into *index*/*doc_type*; the queue
        is flushed when _can_do_bulk() says so."""
        self.bulk_task_queue.append(
            {"index": {
                "_index": index,
                "_type": doc_type
            }})
        self.bulk_task_queue.append(body)
        if self._can_do_bulk():
            self.bulk(body=self.bulk_task_queue, index=index, doc_type=doc_type)
            self.bulk_task_queue = []
            self.bulk_last_time = datetime_now_obj()

    def _can_do_bulk(self):
        """Return True when the queued bulk tasks should be flushed."""
        # The queue holds more than 100 entries.
        if len(self.bulk_task_queue) > 100:
            return True
        # More than one minute has passed since the last flush.
        if get_n_min_ago(1) > self.bulk_last_time:
            return True
        return False

    def index(self, body, index, doc_type):
        """Index a single document immediately."""
        self.es.index(body=body, index=index, doc_type=doc_type)

    def bulk(self, body, index, doc_type):
        """Submit a pre-built bulk request body."""
        self.es.bulk(body=body, index=index, doc_type=doc_type)

    def scan(self, body, index, doc_type):
        """Return a scan/scroll iterator over every hit matching *body*."""
        return helpers.scan(self.es, query=body, index=index,
                            doc_type=doc_type, preserve_order=True)

    def search(self, body, index, doc_type):
        """Run a search; return the raw response dict, or None on error."""
        try:
            rsp = self.es.search(body=body, index=index, doc_type=doc_type,
                                 request_timeout=100)
            if rsp.get("error"):
                logger.error(rsp.get("error").get("reason"))
                return
            return rsp
        except Exception as e:
            # NOTE(review): removed a leftover debug ``print(body)`` here;
            # the logger line already records the failure.
            logger.error("es search error: " + str(e) + index)

    def count(self, body, index, doc_type):
        """Return the count-API response for *body*."""
        return self.es.count(index=index, doc_type=doc_type, body=body,
                             request_timeout=100)

    def delete_index(self, index):
        """Delete an entire index."""
        return self.es.indices.delete(index=index)

    def put_template(self, name, body, **kwargs):
        """Create or update the index template *name*."""
        return self.es.indices.put_template(name=name, body=body, **kwargs)

    def exists_template(self, name, **kwargs) -> bool:
        """Return True when the index template *name* exists."""
        return self.es.indices.exists_template(name=name, **kwargs)

    def delete_template(self, name, **kwargs):
        """Delete the index template *name*."""
        return self.es.indices.delete_template(name=name, **kwargs)

    def get_template(self, name, **kwargs):
        """Fetch the index template *name*."""
        return self.es.indices.get_template(name=name, **kwargs)

    def wait_log_in_database(self, computer_name, record_number):
        """Poll ES until the event log for *computer_name*/*record_number*
        is searchable, then return its ``_id``.

        Queue consumption and ES ingestion run independently, so a consumed
        message may not be indexed yet; this polls until it appears.
        Returns None on timeout or error.
        """
        count = 0
        query = {
            "query": get_must_statement(
                get_term_statement("computer_name", computer_name),
                get_term_statement("record_number", record_number)),
            "_source": False,
            "size": 1
        }
        while True:
            try:
                rsp = self.es.search(body=query,
                                     index=ElasticConfig.event_log_index,
                                     doc_type=ElasticConfig.event_log_doc_type,
                                     request_timeout=100)
                if rsp.get("error"):
                    logger.error(rsp.get("error").get("reason"))
                    break
                if len(rsp["hits"]["hits"]) > 0:
                    return rsp["hits"]["hits"][0]["_id"]
                time.sleep(2)
                # Poll at most 10 times, i.e. 2s * 10 = 20 seconds.
                # (The original comment claimed 5 tries / 10s, which did not
                # match the code; the code's behavior is kept.)
                if count == 10:
                    break
                count += 1
            except Exception as e:
                logger.error("es wait_log_in_database search error: " + str(e))
                break

    def multi_search(self, body, index, doc_type):
        """Run an msearch request; return the raw response, or None on error."""
        try:
            rsp = self.es.msearch(body=body, index=index, doc_type=doc_type,
                                  request_timeout=100)
            if rsp.get("error"):
                logger.error(rsp.get("error").get("reason"))
                return
            return rsp
        except Exception as e:
            logger.error("es msearch error: " + str(e))