class ESChat:
    """Run batched ``utterance`` match queries against a single index."""

    def __init__(self, index_name, kb=True):
        """Create the client and remember the target index.

        ``kb`` is accepted but not read anywhere in this class.
        """
        self.es = Elasticsearch(http_auth=('elastic', 'elastic123'))
        self.index = index_name

    def _send(self, query_bodies):
        """Pair each query body with an index header and submit them as one msearch."""
        lines = []
        for query_body in query_bodies:
            lines.append({'index': self.index})
            lines.append(query_body)
        # Each header/body is serialized on its own line, as msearch expects.
        payload = ''.join(f'{json.dumps(line)} \n' for line in lines)
        return self.es.msearch(body=payload)

    def multi_search(self, topics, samples=10):
        """Issue one ``should``-match query per topic; return the raw msearch response."""
        query_bodies = [
            {
                'query': {
                    'bool': {
                        'should': [
                            {'match': {'utterance': {'query': topic}}},
                        ]
                    }
                },
                'size': samples,
            }
            for topic in topics
        ]
        return self._send(query_bodies)

    def multi_search_edge(self, topics, samples=10):
        """Issue one query per ``(topic1, topic2)`` pair that must match both terms."""
        query_bodies = [
            {
                'query': {
                    'bool': {
                        'must': [
                            {'match': {'utterance': {'query': first}}},
                            {'match': {'utterance': {'query': second}}},
                        ]
                    }
                },
                'size': samples,
            }
            for first, second in topics
        ]
        return self._send(query_bodies)
def start_stop_chr(start, stop, chr):
    """Look up annotations on ``chr`` whose START lies in ``[start, stop]``.

    The matching annotation documents and their START/STOP/CHROM values are
    handed to ``binding_site``, whose result is returned.

    The previous version appended the same header/body pair ``len(chr)``
    times (once per character of the chromosome name) although only the
    first response was ever read; a single sub-request is sent now.
    """
    es = Elasticsearch()
    req_head = {'index': 'annotations', 'type': 'annotations'}
    req_body = {
        "from": 0,
        "size": 100,
        "query": {
            "bool": {
                "must": [
                    {"match": {"CHROM": chr}},
                ],
                "filter": {"range": {"START": {"gte": start, "lte": stop}}},
            }
        },
    }
    resp = es.msearch(body=[req_head, req_body])

    # Separate result lists so the input parameters are not rebound.
    annotations = []
    starts = []
    stops = []
    chroms = []
    for hit in resp["responses"][0]["hits"]["hits"]:
        source = hit["_source"]
        annotations.append(source)
        starts.append(source["START"])
        stops.append(source["STOP"])
        chroms.append(source["CHROM"])
    return binding_site(starts, stops, chroms, annotations)
def binding_site(start, stop, chr, annotations):
    """Find one binding site inside each ``(start, stop, chr)`` interval.

    ``start``, ``stop`` and ``chr`` are parallel sequences. For every
    interval one query is sent to the ``bs`` index; the first hit of each
    non-empty response is collected and passed, together with
    ``annotations``, to ``exon_sgrna_peek``.

    The previous version put both range clauses in one dict literal under
    the same ``"range"`` key, so Python kept only the STOP clause and the
    START constraint was never sent. They are separate ``must`` clauses now.
    """
    request = []
    for lower, upper, chrom in zip(start, stop, chr):
        req_head = {'index': 'bs', 'type': 'bs'}
        req_body = {
            "from": 0,
            "size": 1,
            "query": {
                "bool": {
                    "must": [
                        {"range": {"START": {"gte": lower, "lte": upper}}},
                        {"range": {"STOP": {"lte": upper, "gte": lower}}},
                    ],
                    "filter": {
                        "term": {"CHR": chrom}
                    },
                }
            },
        }
        request.extend([req_head, req_body])

    bs = []
    bs_start = []
    bs_stop = []
    bs_chr = []
    # With no intervals there is nothing to send, so skip the round trip.
    if request:
        es = Elasticsearch()
        resp = es.msearch(body=request)
        for result in resp["responses"]:
            hits = result["hits"]["hits"]
            if hits:
                source = hits[0]["_source"]
                bs.append(source)
                bs_start.append(source["START"])
                bs_stop.append(source["STOP"])
                bs_chr.append(source["CHR"])
    return exon_sgrna_peek(bs_start, bs_stop, bs_chr, annotations, bs)
def msearch_protein(self, host, query):
    """Run ``query`` as a multi-search against ``host`` and return the response.

    A client for ``host`` is stored on ``self.es_client`` and used for the
    request (previously a second, identical client was created just for the
    call). The request timeout is 150 seconds.
    """
    self.es_client = Elasticsearch(host)
    # To restrict the response with filter_path, use a call like this instead:
    # response = self.es_client.msearch(
    #     body=query_text, request_timeout=150,
    #     filter_path=['responses.aggregations.tags.buckets.key',
    #                  'responses.aggregations.tags.buckets.top_tag_hits.hits.hits._score',
    #                  'responses.aggregations.tags.buckets.top_tag_hits.hits.hits._source.name',
    #                  'responses.aggregations.tags.buckets.top_tag_hits.hits.hits._source.normalized_name'])
    return self.es_client.msearch(body=query, request_timeout=150)
class ElasticsearchService(object):
    """Pass-through facade over an ``Elasticsearch`` client.

    Every method forwards its positional and keyword arguments unchanged to
    the client method of the same name and returns that method's result.
    """

    def __init__(self, host, port):
        """Create the underlying client for a single ``host``/``port`` node."""
        node = {'host': host, 'port': port}
        self._es = Elasticsearch([node])

    def search(self, *args, **kwargs):
        """Forward to ``Elasticsearch.search``."""
        result = self._es.search(*args, **kwargs)
        return result

    def create(self, *args, **kwargs):
        """Forward to ``Elasticsearch.create``."""
        result = self._es.create(*args, **kwargs)
        return result

    def get(self, *args, **kwargs):
        """Forward to ``Elasticsearch.get``."""
        result = self._es.get(*args, **kwargs)
        return result

    def exists(self, *args, **kwargs):
        """Forward to ``Elasticsearch.exists``."""
        result = self._es.exists(*args, **kwargs)
        return result

    def msearch(self, *args, **kwargs):
        """Forward to ``Elasticsearch.msearch``."""
        result = self._es.msearch(*args, **kwargs)
        return result

    def index(self, *args, **kwargs):
        """Forward to ``Elasticsearch.index``."""
        result = self._es.index(*args, **kwargs)
        return result

    def update(self, *args, **kwargs):
        """Forward to ``Elasticsearch.update``."""
        result = self._es.update(*args, **kwargs)
        return result

    def delete(self, *args, **kwargs):
        """Forward to ``Elasticsearch.delete``."""
        result = self._es.delete(*args, **kwargs)
        return result

    def put_template(self, *args, **kwargs):
        """Forward to ``Elasticsearch.indices.put_template``."""
        result = self._es.indices.put_template(*args, **kwargs)
        return result
class ESUtils:
    """Batched ``utterance`` search helper with results collapsed on ``keyword``."""

    def __init__(self, index_name):
        """Create the client (30 s timeout) and remember the target index."""
        self.es = Elasticsearch(http_auth=('elastic', 'elastic123'), timeout=30)
        self.index = index_name

    def multi_search(self, msgs, samples=10):
        """Send one query per message and return the raw msearch response.

        Every ``[SEP]`` marker is stripped from a message before it is used
        as the match text.
        """
        lines = []
        for message in msgs:
            header = {'index': self.index}
            # https://elasticsearch.cn/article/132
            query = {
                'query': {
                    'bool': {
                        'should': [
                            {'match': {'utterance': {'query': message.replace('[SEP]', '')}}},
                        ],
                    }
                },
                "collapse": {
                    "field": "keyword",
                },
                'size': samples,
            }
            lines.extend((header, query))
        payload = ''.join(f'{json.dumps(line)} \n' for line in lines)
        return self.es.msearch(body=payload)
def detect_entities(entity_set):
    """Collect the ``names`` of every document matching any given entity.

    One ``multi_match`` query on the ``names`` field is sent per element of
    ``entity_set`` to the ``dbpedia_2015_10`` index. The ``names`` values of
    all hits are merged into a single set, which is returned.
    """
    found = set()  # a set, so duplicates are not added
    client = Elasticsearch()
    target_index = "dbpedia_2015_10"
    # TODO include MLM, this will very likely improve the results
    lines = []
    for mention in entity_set:
        lines.append({'index': target_index, "sort": ["_score"]})
        lines.append({
            "query": {
                "multi_match": {
                    "query": mention,
                    "fields": ["names"],
                }
            }
        })
    payload = ''.join('%s \n' % json.dumps(line) for line in lines)
    result = client.msearch(body=payload, max_concurrent_searches=1000)
    for response in result['responses']:
        for hit in response['hits']['hits']:
            found.update(hit['_source']['names'])
    return found
def batch_insert(session, org_category, mm_result):
    """Map original categories to integrated ids and persist them.

    ``mm_result`` is sent as an msearch body. Each response is paired
    positionally with an entry of ``org_category``:

    * no hits -> an error is logged for that category;
    * otherwise an ``ItemCat`` with a ``CatMap`` pointing at the first hit's
      ``intg_id`` is added to ``session``.

    All additions are committed together. An ``IntegrityError`` is logged
    and rolled back. The session is closed on every exit path, including
    when an unexpected exception propagates.
    """
    try:
        es_config = ConfigManager().get_setting(key='elasticsearch')
        addresses = es_config['address']
        if not isinstance(addresses, list):
            addresses = [addresses]
        es = Elasticsearch(addresses, port=es_config['port'])
        # Named `responses` so the function's own name is not shadowed.
        responses = es.msearch(body=mm_result)['responses']
        if not responses:
            return
        try:
            for org_cat, batch_result in zip(org_category, responses):
                hits = batch_result['hits']['hits']
                if len(hits) == 0:
                    message = 'No Category Mapping: %s' % ' '.join(
                        str(item) for item in org_cat)
                    logging.getLogger(__name__).error(message)
                    continue
                item_cat = ItemCat(org_cat)
                intg_id = hits[0]['_source']['intg_id']
                cat_map = CatMap(
                    INTG_ID=intg_id,
                    UPDATE_TIME=datetime.datetime.now(timezone('Asia/Seoul')))
                item_cat.cat_map.append(cat_map)
                session.add(item_cat)
            session.commit()
        except IntegrityError as e:
            logger.error('Insertion Error %s' % e)
            session.rollback()
    finally:
        session.close()
def findLogs(self, queryCommand, querySize=1):
    """Query today's ``project_app-*`` index and return the response as JSON text.

    ``queryCommand`` is used as a ``query_string`` query and ``querySize``
    as the hit limit. The msearch response is serialized with ``json.dumps``
    and returned as a string.
    """
    logger.debug("es查询采集部分,收到命令:" + queryCommand + ",采集数量:" +
                 str(querySize))
    node = {
        "host": settings.ES_ADDRESS["ip"],
        "port": 80,
        "url_prefix": "elasticsearch",
    }
    request_headers = {
        "kbn-version": "4.5.4",
        "Host": settings.ES_ADDRESS["host"],
        "User-Agent": "Mozilla/5.0 Gecko/20100101 Firefox/68.0",
    }
    client = Elasticsearch([node],
                           headers=request_headers,
                           timeout=10,
                           http_compress=False)
    # ${queryCommand} in the template must be JSON-encoded (escaped) first.
    query_data = '''
    {"index":["project_app-${today}"],"ignore_unavailable":true}
    {"size":${querySize},"sort":[{"@timestamp":{"order":"desc","unmapped_type":"boolean"}}],"query":{"filtered":{"query":{"query_string":{"query":${queryCommand},"analyze_wildcard":true}},"filter":{"bool":{"must":[]}}}},"fields":["message"],"fielddata_fields":["@timestamp"]}
    '''
    rendered = Template(query_data).substitute(
        today=time.strftime("%Y.%m.%d"),
        querySize=querySize,
        queryCommand=json.dumps(queryCommand))
    raw_response = client.msearch(rendered)
    return json.dumps(raw_response)
def query_es_bulk(
    question,
    host,
    port,
    num_hits=25,
    query_field=("text", ),
    highligh_size=400,
    num_highlights=3,
    explain=False,
):
    """Retrieve passages for every answer choice of ``question`` in one msearch.

    For each choice the stem and the choice text are joined with a space
    and used as a ``multi_match`` query over ``query_field``. The index is
    looked up in ``index_mapping`` by the lower-cased question language.

    Returns ``None`` when the question has no choices, otherwise one list
    per choice of ``{"score", "hit": {"title", "text", "url"}}`` dicts.
    """
    elastic = Elasticsearch(host, port=port)

    def map_response(response):
        # Reduce a raw response to the fields the caller consumes.
        mapped = []
        for entry in response["hits"]["hits"]:
            score = entry["_score"]
            source = entry["_source"]
            mapped.append({
                "score": score,
                "hit": {
                    "title": source["title"],
                    "text": source["text"],
                    "url": source["isbn"],
                },
            })
        return mapped

    question_data = question["question"]
    choices = question_data["choices"]
    if not choices:
        return None

    index_name = index_mapping[question["info"]["language"].lower()]
    highlight = {
        "fragment_size": highligh_size,
        "type": "plain",
        "number_of_fragments": num_highlights,
        "fields": {
            "passage": {}
        },
    }

    body = []
    for choice in choices:
        header = {"index": index_name}
        query_text = " ".join((question_data["stem"], choice["text"]))
        search = {
            "explain": explain,
            "query": {
                "multi_match": {
                    "query": query_text,
                    "fields": query_field
                }
            },
            "highlight": highlight,
            "from": 0,
            "size": num_hits,
        }
        body.extend((header, search))

    responses = elastic.msearch(index=index_name,
                                body=body,
                                request_timeout=60)
    return [map_response(res) for res in responses["responses"]]
def es_search(index_name, field_name, search_date):
    """Print all documents of ``index_name`` whose ``field_name`` is >= ``search_date``.

    The date is formatted as ``YYYY-MM-DDTHH:MM:SS``. Both the formatted
    date and the raw msearch response are printed; nothing is returned.
    """
    client = Elasticsearch()
    formatted_date = search_date.strftime("%Y-%m-%dT%H:%M:%S")
    print('Search Date : ', formatted_date)
    header = {'index': index_name}
    query = {'query': {'range': {field_name: {'gte': formatted_date}}}}
    print(client.msearch(body=[header, query]))
def multisearch(es_client: Elasticsearch, search_bodies, index, doc_type):
    """Run every body in ``search_bodies`` against ``index``/``doc_type`` at once.

    Each body is preceded by the same ``{"index", "type"}`` header line.
    Returns the raw msearch response.
    """
    search_header = {"index": index, "type": doc_type}
    lines = []
    for search_body in search_bodies:
        lines.append("%s \n" % json.dumps(search_header))
        lines.append("%s \n" % json.dumps(search_body))
    payload = "".join(lines)
    return es_client.msearch(body=payload, index=index)
def compute_rank(search_arr: List[str],
                 purchase_arr: List[List[Dict[str, List[str]]]],
                 rank_num: List[float], rank_den: List[float],
                 es_client: "Elasticsearch") -> None:
    """
    Sends queries against Elasticsearch and compares results with what
    customers purchased. Accumulates the data needed for the average rank
    position of purchased documents within the retrieved items.

    Args
    ----
      search_arr: List[str]
          Searches made by customers as observed in validation data. They are
          joined with the platform line separator and sent as one msearch.
      purchase_arr: List[List[Dict[str, List[str]]]]
          Purchases aligned positionally with the msearch responses; each
          entry is a list of dicts with a ``purchased`` list of document ids.
      rank_num: List[float]
          Single-element list used as an in/out accumulator for the numerator
          (sum of normalized rank positions).
      rank_den: List[float]
          Single-element list used as an in/out accumulator for the
          denominator (number of purchased documents found).
      es_client: Elasticsearch
          Python Elasticsearch client

    The position into ``purchase_arr`` now follows the response position.
    Before, it was only advanced for responses with at least two documents,
    so one short response shifted every later comparison onto the wrong
    purchases.
    """
    if not search_arr:
        return
    request = os.linesep.join(search_arr)
    response = es_client.msearch(body=request, request_timeout=60)
    for idx, result in enumerate(response['responses']):
        docs = [doc['_id'] for doc in result['hits'].get('hits', [])]
        # With fewer than two documents the normalization below would
        # divide by zero (or there is nothing to rank).
        if len(docs) < 2:
            continue
        purchased_docs = [
            doc_id for purch in purchase_arr[idx]
            for doc_id in purch['purchased']
        ]
        ranks = np.where(np.isin(docs, purchased_docs))[0]
        if ranks.size == 0:
            continue
        # Normalize positions to [0, 1] by the last retrievable position.
        rank_num[0] += ranks.sum() / (len(docs) - 1)
        rank_den[0] += ranks.size
    print('rank num: ', rank_num[0])
    print('rank den: ', rank_den[0])
def add_snp(mod, species):
    """Annotate each modification record in ``mod`` with a dbSNP link.

    ``mod`` is a list of dicts with ``Start`` and ``Chr`` keys; the string
    ``"!"`` may appear as a placeholder entry and is skipped. For every real
    record one query is sent to the SNP index (``snp`` with position field
    ``POS`` for ``species == "human"``, otherwise ``mouse_snp`` / ``START``)
    looking for a SNP at exactly the record's start position on its
    chromosome. The record's ``"SNP"`` key is set to a link to the first hit,
    or to ``"-"`` when nothing matches.

    The records are updated in place and ``mod`` is returned.

    Responses are matched to the records that were actually queried. The
    earlier code wrote response ``i`` to ``mod[i]``, which misaligned (or
    failed on a ``"!"`` string) as soon as a placeholder was present.
    """
    if not mod:
        return mod

    if species == "human":
        index = "snp"
        position = "POS"
    else:
        index = "mouse_snp"
        position = "START"

    targets = [entry for entry in mod if entry != "!"]
    if not targets:
        # Only placeholders: nothing to look up.
        return mod

    request = []
    for entry in targets:
        site = int(entry['Start'])
        req_head = {'index': index, 'type': index}
        req_body = {
            "size": 1,
            "query": {
                "bool": {
                    "must": [
                        # Exclusive bounds on both sides select exactly `site`.
                        {"range": {position: {"gt": site - 1, "lt": site + 1}}},
                        {"term": {"CHR": entry['Chr']}},
                    ]
                }
            },
        }
        request.extend([req_head, req_body])

    es = Elasticsearch('https://SECRET/elasticsearch', verify_certs=False,
                       timeout=50, max_retries=10, retry_on_timeout=True)
    resp = es.msearch(body=request)

    for entry, result in zip(targets, resp['responses']):
        hits = result['hits']['hits']
        if hits:
            snp_id = hits[0]['_source']['ID']
            entry["SNP"] = Markup(
                "<a href='https://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=%s' target='_blank'>%s</a>"
                % (str(snp_id).lstrip("rs"), snp_id))
        else:
            entry["SNP"] = "-"
    return mod
def multi_search(text):
    """Search recipes for ``text`` by ingredient name and by title/description.

    Two fuzzy (fuzziness 1) queries are sent to ``recipe_index`` in a single
    msearch: a nested match on ``ingredients.name`` and a ``multi_match`` on
    ``title`` and ``description``. Returns the raw msearch response.
    """
    client = Elasticsearch()
    header = {'index': 'recipe_index', 'type': 'recipe_index'}
    by_ingredient = {
        'query': {
            'nested': {
                'path': 'ingredients',
                'query': {
                    'match': {
                        'ingredients.name': {'fuzziness': 1, 'query': text}
                    }
                },
            }
        }
    }
    by_recipe = {
        'query': {
            'multi_match': {
                'fields': ['title', 'description'],
                'query': text,
                'fuzziness': 1,
            }
        }
    }
    payload = [header, by_ingredient, header, by_recipe]
    return client.msearch(body=payload)
class ElasticsearchService(object):
    """Pass-through facade over an ``Elasticsearch`` client.

    Every method forwards its positional and keyword arguments unchanged to
    the client method of the same name and returns that method's result.
    """

    def __init__(self, host, port):
        """Create the underlying client for a single ``host``/``port`` node."""
        node = {'host': host, 'port': port}
        self._es = Elasticsearch([node])

    def search(self, *args, **kwargs):
        """Forward to ``Elasticsearch.search``."""
        result = self._es.search(*args, **kwargs)
        return result

    def create(self, *args, **kwargs):
        """Forward to ``Elasticsearch.create``."""
        result = self._es.create(*args, **kwargs)
        return result

    def get(self, *args, **kwargs):
        """Forward to ``Elasticsearch.get``."""
        result = self._es.get(*args, **kwargs)
        return result

    def exists(self, *args, **kwargs):
        """Forward to ``Elasticsearch.exists``."""
        result = self._es.exists(*args, **kwargs)
        return result

    def msearch(self, *args, **kwargs):
        """Forward to ``Elasticsearch.msearch``."""
        result = self._es.msearch(*args, **kwargs)
        return result
def run(self, es_client: Elasticsearch, debug):
    """Send the prepared per-type queries as one msearch and return the response.

    For every ``(type, index)`` pair from ``self.types`` / ``self.indexes``
    whose type is in ``self.filtered_type_names``, a header line naming the
    index and the type's query from ``self.q`` are emitted. When ``debug``
    is truthy the first (and, if there is more than one, the last) type's
    query is logged first.
    """
    if debug:
        first_type = self.types[0]
        logger.debug(
            'QUERY (for %s):\n%s', first_type,
            json.dumps(self.q[first_type], indent=2, ensure_ascii=False))
        if len(self.types) > 1:
            last_type = self.types[-1]
            logger.debug(
                'QUERY (for %s):\n%s', last_type,
                json.dumps(self.q[last_type], indent=2, ensure_ascii=False))

    chunks = []
    for type_name, index in zip(self.types, self.indexes):
        if type_name not in self.filtered_type_names:
            continue
        header_line = json.dumps(dict(index=index))
        query_line = json.dumps(self.q[type_name])
        chunks.append('{}\n{}\n'.format(header_line, query_line))
    body = ''.join(chunks)
    return es_client.msearch(body)
def findErrorLogs(self, job):
    """Query ``project_err-*`` for recent errors and return the response as JSON text.

    ``job.es_query`` is used as a ``query_string`` query and
    ``job.es_query_num`` as the hit limit. The time window ends now and
    starts ``job.delay_sec`` seconds plus two minutes earlier (epoch
    milliseconds). The msearch response is serialized with ``json.dumps``.
    """
    logger.debug("es查询采集部分,收到命令:" + job.es_query + ",采集数量:" + str(job.es_query_num))
    node = {
        "host": settings.ES_ADDRESS["ip"],
        "port": 80,
        "url_prefix": "elasticsearch",
    }
    request_headers = {
        "kbn-version": "4.5.4",
        "Host": settings.ES_ADDRESS["host"],
        "User-Agent": "Mozilla/5.0 Gecko/20100101 Firefox/68.0",
    }
    client = Elasticsearch(
        [node],
        headers=request_headers,
        timeout=10,
        http_compress=False
    )
    # ${queryCommand} in the template must be JSON-encoded (escaped) first.
    query_data = '''
    {"index":["project_err-*"],"ignore_unavailable":true}
    {"size":${querySize},"sort":[{"@timestamp":{"order":"desc","unmapped_type":"boolean"}}],"query":{"filtered":{"query":{"query_string":{"analyze_wildcard":true,"query":${queryCommand}}},"filter":{"bool":{"must":[{"range":{"@timestamp":{"gte":${timestampMsStart},"lte":${timestampMsEnd},"format":"epoch_millis"}}}],"must_not":[]}}}},"fields":["*","_source"],"script_fields":{},"fielddata_fields":["@timestamp"]}
    '''
    size = job.es_query_num
    window_start_ms = round((time.time() - float(job.delay_sec) - 2 * 60) * 1000)
    window_end_ms = round(time.time() * 1000)
    rendered = Template(query_data).substitute(
        querySize=size,
        timestampMsStart=window_start_ms,
        timestampMsEnd=window_end_ms,
        queryCommand=json.dumps(job.es_query)
    )
    raw_response = client.msearch(rendered)
    return json.dumps(raw_response)
class Elasticsearch_service:
    """Index management, bulk indexing and multi-search for one index.

    Failures of the Elasticsearch calls in ``create_index``, ``indexing``
    and ``search`` are reported by returning ``False`` instead of raising.
    """

    def __init__(self, ip="localhost", timeout=1000, index_name='finger'):
        """Create the client for ``ip``; ``timeout`` is used per request."""
        self.ip = ip
        self.es = Elasticsearch(hosts=ip)
        self.timeout = timeout
        self.index_name = index_name

    def create_index(self):
        """Create the index; return the ``acknowledged`` flag, or ``False`` on error."""
        request_body = request_create_index()
        try:
            ret = self.es.indices.create(index=self.index_name,
                                         body=request_body,
                                         request_timeout=self.timeout)
        except Exception:
            # Narrower than a bare `except:` so Ctrl-C / SystemExit still work.
            return False
        return ret['acknowledged']

    def indexing(self, list_json_text):
        """Bulk-index ``list_json_text``; return ``True`` on success, ``False`` on error.

        Each item is converted with ``create_request_minutia`` in a worker
        pool. The pool is released once the conversion is done; it used to
        be left open, leaking worker processes on every call.
        """
        with multiprocessing.Pool() as pool_index:
            request_body = pool_index.map(create_request_minutia,
                                          list_json_text)
        try:
            helpers.bulk(self.es, request_body, request_timeout=self.timeout)
        except Exception:
            return False
        return True

    def delete_index(self):
        """Delete the index. Errors from the client propagate."""
        self.es.indices.delete(index=self.index_name)

    def search(self, list_text):
        """Run a multi-search for ``list_text``; return the response, or ``False`` on error."""
        request_body = request_msearch(list_text)
        try:
            return self.es.msearch(body=request_body,
                                   request_timeout=self.timeout)
        except Exception:
            return False
class Search(object):
    """Helpers for reading documents and counts from the log indices."""

    def __init__(self):
        """Create the client for the module-level ``hosts`` with HTTP compression."""
        self.es = Elasticsearch(hosts, http_compress=True)

    def multi_get(self):
        """Return every hit of a ``match_all`` query on ``log-2018.03.21``.

        Results are fetched in pages of ``LIMIT`` documents (doc type
        ``message``) until the reported total is reached.

        The earlier paging loop could not work: it called an undefined
        ``es``, sent the literal text ``LIMIT*(1+i)`` instead of a number,
        appended the hit list to itself instead of adding the new page, and
        formatted the index list with Python's repr (not valid JSON).
        """
        index = ["log-2018.03.21"]

        def fetch_page(offset):
            # One header/body pair; the client serializes the list itself.
            body = [
                {"index": index},
                {"query": {"match_all": {}}, "from": offset, "size": LIMIT},
            ]
            res = self.es.msearch(body, doc_type='message')
            return res['responses'][0]['hits']

        first_page = fetch_page(0)
        total = first_page['total']
        if isinstance(total, dict):
            # Newer servers report {"value": n, "relation": ...}.
            total = total['value']
        hits = list(first_page['hits'])
        for offset in range(LIMIT, total, LIMIT):
            hits.extend(fetch_page(offset)['hits'])
        return hits

    def _count(self, index=None, item=None, value=None):
        """Return how many documents in ``index`` have ``item`` equal to ``value``."""
        body = {
            "query": {
                "term": {
                    item: value,
                }
            }
        }
        res = self.es.count(index=index, body=body)
        return res['count']
def get(self, request, params, format=None):
    """Build a faceted search from the ``params`` query string and return the results.

    ``params`` is parsed as a URL query string (after encoding `` & `` and
    ``;`` so they survive parsing). The base query from ``es_functions`` is
    extended with the text query, sort, optional date range, advanced-field
    filters and facet filters, then sent as a multi-search. The
    ``responses`` list of the result is returned with HTTP 200.
    """
    encoded = params.replace(' & ', ' %26 ')
    encoded = encoded.replace(';', '%3B')
    search_options = urllib.parse.parse_qs(encoded)

    es = Elasticsearch([ELASTICSEARCH_ADDRESS])
    body = es_functions.create_base_query()
    filters = []
    advanced_filters = []

    body['query']['bool']['must'] = es_functions.create_query_string(search_options.get('q'))
    body['sort'] = es_functions.create_sort_query(search_options.get('sort'))

    date_from = search_options.get('date_gte')
    date_to = search_options.get('date_lte')
    if date_from or date_to:
        date_query = es_functions.create_date_query(date_from, date_to)
        body['query']['bool']['filter']['bool']['filter'].append(date_query)
        filters.append(date_query)

    for field in self.advanced_fields:
        field_values = search_options.get(field)
        if not field_values:
            continue
        for clause in es_functions.create_advanced_filters(field, field_values):
            body['query']['bool']['filter']['bool']['filter'].append(clause)
            advanced_filters.append(clause)
    if advanced_filters:
        filters.append(advanced_filters)

    for category in self.facet_categories:
        selected = search_options.get(category)
        if not selected:
            continue
        facet_filters = es_functions.create_facet_filters(category, selected)
        body['query']['bool']['filter']['bool']['must'].append(facet_filters)
        # A facet's own aggregation is not narrowed by its own selection.
        for other_category in self.facet_categories:
            if other_category == category:
                continue
            body['aggregations'][other_category]['filter']['bool']['must'].append(facet_filters)

    query = es_functions.create_multisearch(body, search_options.get('from'), search_options.get('size'), filters)
    response = es.msearch(body=query)
    # Round-trip through JSON to get plain JSON-compatible data.
    data = json.loads(json.dumps(response))
    return Response(data['responses'], status=status.HTTP_200_OK)
def findErrorLogs(self, queryCommand, querySize = 1):
    """Query ``err-prod-*`` for the configured project and return the response as JSON text.

    ``queryCommand`` is the look-back window in seconds (it is passed to
    ``int``); the window ends at the current UTC time. ``querySize`` is the
    hit limit. The msearch response is serialized with ``json.dumps``.
    """
    logger.debug("es查询采集部分,收到命令:" + queryCommand + ",采集数量:" + str(querySize))
    node = {
        "host": settings.MLES_ADDRESS["ip"],
        "port": 443,
        "url_prefix": "elasticsearch",
    }
    request_headers = {
        "kbn-version": "7.5.2",
        "Host": settings.MLES_ADDRESS["host"],
        "User-Agent": "Mozilla/5.0 Gecko/20100101 Firefox/68.0",
        "Referer": "https://" + settings.XES_ADDRESS["host"] + "/app/kibana",
    }
    client = Elasticsearch(
        [node],
        headers=request_headers,
        timeout=30,
        http_compress=False,
        use_ssl=True,
        verify_certs=False,
        http_auth=settings.MLES_ADDRESS["http_auth"]
    )
    query_data = '''
    {"index":"err-prod-*","ignore_unavailable":true}
    {"size":${querySize},"query":{"bool":{"filter":[{"match_phrase":{"project":{"query":"${project}"}}},{"range":{"timestamp":{"format":"strict_date_optional_time","gte":"${timestampMsStart}","lte":"${timestampMsEnd}"}}}]}}}
    '''
    lookback = datetime.timedelta(seconds = int(queryCommand))
    window_start = (datetime.datetime.utcnow() - lookback).isoformat() + 'Z'
    window_end = datetime.datetime.utcnow().isoformat() + 'Z'
    rendered = Template(query_data).substitute(
        querySize = querySize,
        timestampMsStart = window_start,
        timestampMsEnd = window_end,
        project = settings.MLES_ADDRESS["project"]
    )
    raw_response = client.msearch(rendered)
    return json.dumps(raw_response)
def findLogs(self, queryCommand, querySize=1):
    """Query ``err-prod*`` for recent ``ms-order-crm`` prod entries; return JSON text.

    ``queryCommand`` is the look-back window in seconds (it is passed to
    ``float``); the window ends now and is expressed in epoch milliseconds.
    ``querySize`` is the hit limit. The msearch response is serialized with
    ``json.dumps``.
    """
    logger.debug("es查询采集部分,收到命令:" + queryCommand + ",采集数量:" +
                 str(querySize))
    node = {
        "host": settings.XES_ADDRESS["ip"],
        "port": 443,
        "url_prefix": "elasticsearch",
    }
    request_headers = {
        "kbn-version": "6.3.2",
        "Host": settings.XES_ADDRESS["host"],
        "User-Agent": "Mozilla/5.0 Gecko/20100101 Firefox/68.0",
        "Referer": "https://" + settings.XES_ADDRESS["host"] + "/app/kibana",
    }
    client = Elasticsearch([node],
                           headers=request_headers,
                           timeout=30,
                           http_compress=False,
                           use_ssl=True,
                           verify_certs=False,
                           http_auth=settings.XES_ADDRESS["http_auth"])
    # Placeholders in the template must be filled with JSON-safe values.
    query_data = '''
    {"index":"err-prod*","ignore_unavailable":true,"timeout":30000}
    {"size":${querySize},"sort":[{"@timestamp":{"order":"desc","unmapped_type":"boolean"}}],"query":{"bool":{"must":[{"match_phrase":{"project":{"query":"ms-order-crm"}}},{"match_phrase":{"env":{"query":"prod"}}},{"range":{"timestamp":{"gte":${timestampMsStart},"lte":${timestampMsEnd},"format":"epoch_millis"}}}]}}}
    '''
    window_start_ms = round((time.time() - float(queryCommand)) * 1000)
    window_end_ms = round(time.time() * 1000)
    rendered = Template(query_data).substitute(
        querySize=querySize,
        timestampMsStart=window_start_ms,
        timestampMsEnd=window_end_ms)
    raw_response = client.msearch(rendered)
    return json.dumps(raw_response)
def _normalize_expression_row(row):
    """Scale one row of expression values by its largest value.

    ``None`` entries (values that could not be parsed) are kept as ``None``.
    A row whose maximum is 0 is returned unscaled.
    """
    if not row:
        return []
    if None in row:
        # Largest positive value among the parsed entries (0 if none).
        peak = 0
        for value in row:
            if value is not None and value > peak:
                peak = value
        if peak == 0:
            return list(row)
        return [value / peak if value is not None else value for value in row]
    peak = max(row)
    if peak == 0:
        return list(row)
    return [value / peak for value in row]


def transcript_expression(start, stop, chr, species):
    """Fetch transcript expression records starting in ``[start, stop]`` on ``chr``.

    ``species == "human"`` selects the ``human_transcript_exp`` index and
    the human tissue list; anything else selects ``mouse_transcript_exp``
    and the mouse tissue list.

    Returns a 4-tuple:
      * the list of matching ``_source`` documents,
      * a JSON string with one row per document holding the tissue values as
        floats (``null`` where a value is missing or not numeric),
      * a JSON string with the same rows scaled by each row's maximum,
      * the column names for display.

    If the response cannot be read (e.g. it carries no hits structure),
    empty results are returned.
    """
    if species == "human":
        index = "human_transcript_exp"
        keys = [
            "Thyroid", "Testis", "Brain - Anterior cingulate cortex (BA24)",
            "Skin - Not Sun Exposed (Suprapubic)", "Esophagus - Mucosa",
            "Heart - Atrial Appendage", "Brain - Caudate (basal ganglia)",
            "Esophagus - Muscularis", "Brain - Putamen (basal ganglia)",
            "Small Intestine - Terminal Ileum", "Breast - Mammary Tissue",
            "Cervix - Ectocervix", "Cervix - Endocervix", "Fallopian Tube",
            "Brain - Cerebellum", "Bladder", "Brain - Cerebellar Hemisphere",
            "Brain - Spinal cord (cervical c_1)", "Artery - Coronary", "Liver",
            "Esophagus - Gastroesophageal Junction", "Brain - Hypothalamus",
            "Colon - Transverse", "Brain - Amygdala", "Pancreas",
            "Adipose - Subcutaneous", "Cells - Leukemia cell line (CML)",
            "Spleen", "Brain - Hippocampus", "Whole Blood", "Brain - Cortex",
            "Artery - Tibial", "Uterus", "Stomach", "Ovary", "Artery - Aorta",
            "Heart - Left Ventricle", "Kidney - Cortex",
            "Brain - Nucleus accumbens (basalganglia)", "Prostate",
            "Brain - Frontal Cortex (BA9)", "V****a",
            "Adipose - Visceral (Omentum)", "Adrenal Gland", "Lung",
            "Cells - Transformed fibroblasts", "Muscle - Skeletal",
            "Colon - Sigmoid", "Nerve - Tibial", "Brain - Substantia nigra",
            "Cells - EBV-transformed lymphocytes",
        ]
        return_keys = [
            'Chr', 'Start', 'Stop', 'Transcript_stable_ID', 'Strand',
            'Thyroid', 'Testis', 'Brain - Anterior cingulate cortex (BA24)',
            'Skin - Not Sun Exposed (Suprapubic)', 'Esophagus - Mucosa',
            'Heart - Atrial Appendage', 'Brain - Caudate (basal ganglia)',
            'Esophagus - Muscularis', 'Brain - Putamen (basal ganglia)',
            'Small Intestine - Terminal Ileum', 'Breast - Mammary Tissue',
            'Cervix - Ectocervix', 'Cervix - Endocervix', 'Fallopian Tube',
            'Brain - Cerebellum', 'Bladder', 'Brain - Cerebellar Hemisphere',
            'Brain - Spinal cord (cervical c_1)', 'Artery - Coronary', 'Liver',
            'Esophagus - Gastroesophageal Junction', 'Brain - Hypothalamus',
            'Colon - Transverse', 'Brain - Amygdala', 'Strand',
            'Pancreas', 'Adipose - Subcutaneous',
            'Cells - Leukemia cell line (CML)', 'Spleen',
            'Brain - Hippocampus', 'Whole Blood', 'Brain - Cortex',
            'Artery - Tibial', 'Uterus', 'Stomach', 'Ovary', 'Artery - Aorta',
            'Heart - Left Ventricle', 'Kidney - Cortex',
            'Brain - Nucleus accumbens (basalganglia)', 'Prostate',
            'Brain - Frontal Cortex (BA9)', 'V****a',
            'Adipose - Visceral (Omentum)', 'Adrenal Gland', 'Lung',
            'Cells - Transformed fibroblasts', 'Muscle - Skeletal',
            'Colon - Sigmoid', 'Nerve - Tibial', 'Brain - Substantia nigra',
            'Cells - EBV-transformed lymphocytes',
        ]
    else:
        index = "mouse_transcript_exp"
        keys = [
            "embryo", "heart", "bone marrow macrophage", "fat pad",
            "neural tube", "embryonic fibroblast", "brain", "hindbrain",
            "limb", "stomach", "erythroblast", "midbrain", "kidney", "B cell",
            "MEL cell line", "testis", "vesicular gland", "G1E",
            "subcutaneous adipose tissue", "adrenal gland", "gonadal fat pad",
            "telencephalon", "brown adipose tissue", "placenta", "intestine",
            "forestomach", "CH12.LX", "ES-Bruce4",
            "activated regulatory T-cells", "cortical plate",
            "regulatory T cell", "skeletal muscle tissue", "urinary bladder",
            "cerebellum", "small intestine", "416B", "NIH3T3", "pancreas",
            "A20", "Patski", "G1E-ER4", "embryonic facial prominence",
            "bone marrow", "spleen", "thymus", "splenic B cell",
            "inflammation-experienced regulatory T-cells", "forebrain",
            "uterus", "lung", "ovary", "muscle", "olfactory bulb", "liver",
        ]
        return_keys = [
            'Chr', 'Start', 'Stop', 'Transcript_ID', 'Strand', 'embryo',
            'heart', 'neural tube', 'bone marrow macrophage', 'CH12.LX',
            'fat pad', 'embryonic fibroblast', 'brain', 'hindbrain', 'limb',
            'stomach', 'erythroblast', 'kidney', 'B cell', 'MEL cell line',
            'testis', 'vesicular gland', 'ES-Bruce4', 'G1E',
            'subcutaneous adipose tissue', 'adrenal gland', 'gonadal fat pad',
            'forestomach', 'brown adipose tissue', 'placenta', 'uterus',
            'activated regulatory T-cells', 'intestine', 'cortical plate',
            'regulatory T cell', 'skeletal muscle tissue', 'urinary bladder',
            'embryonic facial prominence', 'small intestine', '416B', 'NIH3T3',
            'midbrain', 'pancreas', 'cerebellum', 'Patski', 'G1E-ER4',
            'bone marrow', 'spleen', 'thymus', 'A20', 'splenic B cell',
            'telencephalon', 'forebrain',
            'inflammation-experienced regulatory T-cells', 'lung', 'ovary',
            'muscle', 'olfactory bulb', 'liver',
        ]

    req_head = {'index': index, 'type': index}
    req_body = {
        "size": 9000,
        "query": {
            "bool": {
                "must": {
                    "range": {"Start": {"gte": start, "lte": stop}}
                },
                "filter": {
                    "term": {"Chr": chr}
                },
            }
        },
    }
    request = [req_head, req_body]
    es = Elasticsearch('https://SECRET/elasticsearch', verify_certs=False,
                       timeout=30, max_retries=10, retry_on_timeout=True)
    resp = es.msearch(body=request)

    response = []
    expression_list = []
    try:
        for hit in resp['responses'][0]['hits']['hits']:
            source = hit["_source"]
            response.append(source)
            row = []
            for tissue in keys:
                try:
                    row.append(float(source[tissue]))
                except (KeyError, TypeError, ValueError):
                    # Missing or non-numeric value for this tissue.
                    row.append(None)
            expression_list.append(row)
    except (KeyError, IndexError, TypeError):
        # Unexpected response shape: report no data rather than partial rows.
        response = []
        expression_list = []

    normalized_list = [_normalize_expression_row(row) for row in expression_list]
    return response, json.dumps(expression_list), json.dumps(normalized_list), return_keys
def mod_function_mouse(start, stop, chr, mod_indices, gene):
    """Fetch mouse RNA-modification records and decorate them for display.

    One query is sent per index in ``mod_indices``: by ``Gene`` when
    ``gene != 0``, otherwise by position (Start and Stop both within
    ``[start, stop]`` on ``chr``). For every hit the ENSG/ENST ids and
    PubMed ids are turned into links, and the tissue text is tidied. When an
    index reports more total hits than were returned, ``"!"`` is appended
    for each missing one. Each per-index list is then passed through
    ``add_snp`` (in a thread pool).

    Returns a 12-tuple of lists in this order: a_to_i, m1a, m5c, m6a, nm,
    pseudou, dihydrouridine, m1g, m2g, m7g, other, t6a (all ``*_mouse``).
    ``mod_indices`` must therefore contain all twelve of those index names.

    The positional query used to put both range clauses under one
    ``"range"`` dict key, so only the Stop constraint was sent; they are
    separate clauses now.
    """
    ensembl_link = "<a href='https://asia.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=%s;r=' target='_blank'>%s</a>"
    pubmed_link = "<a href='https://www.ncbi.nlm.nih.gov/pubmed/%s' target='_blank'>%s</a>"

    if gene != 0:
        query = {"term": {"Gene": gene}}
    else:
        query = {
            "bool": {
                "must": [
                    {"range": {"Start": {"gte": start, "lte": stop}}},
                    {"range": {"Stop": {"lte": stop, "gte": start}}},
                ],
                "filter": {
                    "term": {"Chr": chr}
                },
            }
        }

    request = []
    for index_name in mod_indices:
        req_head = {'index': index_name, 'type': index_name}
        req_body = {"size": 9000, "query": query}
        request.extend([req_head, req_body])

    es = Elasticsearch('https://SECRET/elasticsearch', verify_certs=False,
                       timeout=30, max_retries=10, retry_on_timeout=True)
    resp = es.msearch(body=request)

    response = {}
    for index_name, result in zip(mod_indices, resp["responses"]):
        records = []
        response[index_name] = records
        hits = result["hits"]["hits"]
        for j in range(result["hits"]["total"]):
            try:
                source = hits[j]["_source"]
                source["ENSG_ID"] = Markup(ensembl_link % (source['ENSG_ID'], source['ENSG_ID']))
                source["ENST_ID"] = Markup(ensembl_link % (source['ENST_ID'], source['ENST_ID']))
                # Known bad id in the data (one digit too many).
                if "280771691" in source["Pubmed_ID"]:
                    source["Pubmed_ID"] = "28077169"
                if "/" in source["Tissue"]:
                    source["Tissue"] = source["Tissue"].replace("/", ", ")
                source["Tissue"] = source["Tissue"].capitalize()
                if "|" in source["Pubmed_ID"]:
                    links = [
                        pubmed_link % (pubmed_id, pubmed_id)
                        for pubmed_id in source["Pubmed_ID"].split("|")
                    ]
                    source["Pubmed_ID"] = Markup(", ".join(links))
                else:
                    source["Pubmed_ID"] = Markup(pubmed_link % (source['Pubmed_ID'], source['Pubmed_ID']))
                records.append(source)
            except IndexError:
                # More hits reported than returned: keep a placeholder.
                records.append("!")

    # 'c_to_u_mouse' is intentionally not part of the SNP annotation/result.
    snp_indices = (
        'a_to_i_mouse', 'm1a_mouse', 'm5c_mouse', 'm6a_mouse', 'nm_mouse',
        'pseudou_mouse', 'dihydrouridine_mouse', 'm1g_mouse', 'm2g_mouse',
        'm7g_mouse', 'other_mouse', 't6a_mouse',
    )
    pool_query = ThreadPool(processes=6)
    try:
        pending = [
            pool_query.apply_async(add_snp, (response[name], 'mouse'))
            for name in snp_indices
        ]
        annotated = [task.get() for task in pending]
    finally:
        # Release the worker threads even if a lookup failed.
        pool_query.terminate()
    return tuple(annotated)
class ES_DB:
    """Wrapper around an Elasticsearch client for per-case indices.

    Almost every method reports its outcome as a list whose first element
    is a boolean success flag and whose second element is either the
    result or an error description.
    """

    es_db = None

    # ================================ initializer
    def __init__(self, es_ip, es_port):
        """Connect to ``http://<es_ip>:<es_port>`` with a 120 second timeout."""
        self.es_ip = es_ip
        self.es_port = es_port
        # str() lets callers pass the port either as a string or as an int.
        self.es_db = Elasticsearch('http://' + self.es_ip + ':' +
                                   str(self.es_port),
                                   timeout=120)

    # ================================ Create Index
    # create new index (case)
    def create_index(self, index_name):
        """Create ``index_name`` with the default string mapping and analyzers.

        Returns ``[True, index_name]`` or ``[False, "Error: ..."]``.
        """
        try:
            self.es_db.indices.create(
                index=index_name,
                body={
                    "mappings": {
                        "dynamic_templates": [{
                            "strings": {
                                "match_mapping_type": "string",
                                "mapping": {
                                    "type": "text",
                                    "fields": {
                                        "keyword": {
                                            "type": "keyword",
                                            "ignore_above": 256
                                        }
                                    },
                                    "copy_to": "catch_all"
                                }
                            }
                        }]
                    },
                    "settings": {
                        "analysis": {
                            "analyzer": {
                                "default": {
                                    "tokenizer": "keyword",
                                    "filter": ["lowercase"]
                                },
                                "default_search": {
                                    "tokenizer": "keyword",
                                    "filter": ["lowercase"]
                                }
                            }
                        }
                    }
                })
            return [True, index_name]
        except Exception as e:
            return [False, "Error: " + str(e)]

    # ================================ Delete Index
    # delete index (case)
    def delete_index(self, index_name):
        """Delete ``index_name``; returns ``[True, index_name]`` or ``[False, error]``."""
        try:
            self.es_db.indices.delete(index=index_name)
            return [True, index_name]
        except Exception as e:
            return [False, "Error: " + str(e)]

    # ================================ get max results window
    def get_max_result_window(self, indx):
        """Return the index's ``max_result_window`` setting (10000 if unset)."""
        settings = self.es_db.indices.get_settings(index=indx)
        settings = settings[indx]['settings']['index']
        return settings.get('max_result_window', 10000)  # default value

    # ================================ get max query fields
    def get_max_fields_num(self, indx):
        """Return the index's ``query.default_field`` setting (1024 if unset).

        The default is returned on every path where the setting is absent,
        so the method never falls through and returns ``None``.
        """
        settings = self.es_db.indices.get_settings(index=indx)
        settings = settings[indx]['settings']['index']
        return settings.get('query', {}).get('default_field', 1024)

    # ================================ bulk query
    # this except index and bodies (list of single body requests)
    def multiqueries(self, index, bodies):
        """Run several search bodies against ``index`` in one msearch call.

        ``track_total_hits`` is switched on in every body (the passed dicts
        are modified in place).  Returns ``[True, responses]`` or, when any
        sub-query failed, ``[False, reason_of_first_failure]``.
        """
        request_header = json.dumps({'index': index})
        requests = []
        for b in bodies:
            b["track_total_hits"] = True
            requests.extend([request_header, b])

        resp = self.es_db.msearch(body=requests)

        # check if there are failed queries
        for result in resp["responses"]:
            if "error" in result:
                return [False, result["error"]["root_cause"][0]["reason"]]
        return [True, resp['responses']]

    # ================================ query
    def query(self, indexname, body, count=3):
        """Search ``indexname`` (lower-cased) with ``body``.

        Some request errors are repaired automatically and the query is
        retried: sorting on a text field (switches to the ``.keyword``
        sub-field), too many expanded fields, and a result window that is
        too small.  ``count`` is the remaining number of attempts.

        Returns ``[True, search_result]`` or ``[False, reason]``.
        """
        count -= 1
        indexname = indexname.lower()
        body["track_total_hits"] = True
        logger.logger(level=logger.DEBUG,
                      type="elasticsearch",
                      message="Query to index [" + indexname + "]",
                      reason=json.dumps(body))
        try:
            search_res = self.es_db.search(index=indexname, body=body)
            return [True, search_res]
        except elasticsearch.RequestError as e:
            reason = e.info['error']['reason']
            logger.logger(level=logger.WARNING,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [RequestError]",
                          reason=reason)
            # if the problem in shards
            if reason == "all shards failed":
                for shard in e.info['error']['failed_shards']:
                    if 'caused_by' in shard['reason']:
                        shard_reason = shard['reason']['caused_by']['reason']
                    else:
                        shard_reason = shard['reason']['reason']

                    # if the reason is that the field used for key is text and
                    # is not sortable, then try it sub-field ".keyword"
                    if shard_reason.startswith(
                            "Text fields are not optimised for operations that require per-document field data like aggregations and sorting, so these operations are disabled by default"
                    ):
                        if "sort" in body:
                            # list() keeps this working on Python 3, where
                            # dict.keys() is a view and cannot be indexed.
                            field = list(body['sort'].keys())[0]
                            order = body['sort'][field]['order']
                            body['sort'] = {
                                field + ".keyword": {
                                    'order': order
                                }
                            }
                            logger.logger(
                                level=logger.INFO,
                                type="elasticsearch",
                                message="Query [" + indexname +
                                "], the sort is not a sortable field, try using sub-field .keyword"
                            )
                            return self.query(indexname, body, count)

                    # if the reason is the result has too many fields
                    match = re.match(
                        r'field expansion (for \[.*\] )?matches too many fields, limit: ([0-9]+), got: ([0-9]+)',
                        shard_reason)
                    if match is not None:
                        # the number of fields is more than the default max
                        # number of fields in query
                        max_field_num = int(match.groups()[1]) + 100
                        inc = self.es_db.indices.put_settings(
                            index=indexname,
                            body='{ "index" : { "query": { "default_field" : '
                            + str(max_field_num) + '} } }')
                        if inc["acknowledged"]:
                            logger.logger(
                                level=logger.INFO,
                                type="elasticsearch",
                                message="Query [" + indexname +
                                "] max query fields number increased " +
                                str(max_field_num))
                            if count != 0:
                                return self.query(indexname, body, count)
                            else:
                                return [
                                    False,
                                    "exceeded the number of tries to fix the issue, field expansion matches too many fields"
                                ]
                        else:
                            logger.logger(
                                level=logger.ERROR,
                                type="elasticsearch",
                                message="Query [" + indexname +
                                "] Failed increasing the max query fields number"
                            )
                            continue

                    # if the result window is too large, increase the window
                    match = re.match(
                        r'Result window is too large, from \+ size must be less than or equal to: \[([0-9]+)\] but was \[([0-9]+)\].*',
                        shard_reason)
                    if match is not None:
                        max_result_window = int(match.groups()[1]) + 1000
                        inc = self.es_db.indices.put_settings(
                            index=indexname,
                            body='{ "index" : { "max_result_window" : ' +
                            str(max_result_window) + ' } }')
                        if inc["acknowledged"]:
                            logger.logger(
                                level=logger.INFO,
                                type="elasticsearch",
                                message="Query [" + indexname +
                                "] result window increased to " +
                                str(self.get_max_result_window(indexname)))
                            if count != 0:
                                return self.query(indexname, body, count)
                            else:
                                return [
                                    False,
                                    "exceeded the number of tries to fix the issue, Result window is too large"
                                ]
                        else:
                            logger.logger(
                                level=logger.ERROR,
                                type="elasticsearch",
                                message="Query [" + indexname +
                                "] Failed increasing the result window")
                            continue
                    else:
                        logger.logger(level=logger.ERROR,
                                      type="elasticsearch",
                                      message="Query [" + indexname +
                                      "] failed [RequestError]",
                                      reason=shard_reason)
            else:
                logger.logger(level=logger.ERROR,
                              type="elasticsearch",
                              message="Query [" + indexname +
                              "] failed [RequestError]",
                              reason=json.dumps(e.info))
            res = [False, reason]
        except elasticsearch.ConnectionError as e:
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [ConnectionError]",
                          reason=e.info)
            res = [False, 'Failed to connect to elasticsearch']
        except elasticsearch.TransportError as e:
            reason = str(e)
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [TransportError]",
                          reason=reason)
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [TransportError]",
                          reason=json.dumps(e.info))
            res = [False, reason]
        except elasticsearch.ElasticsearchException as e:
            reason = str(e)
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [ElasticsearchException]",
                          reason=reason)
            # The base exception is not guaranteed to carry ``info``.
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [ElasticsearchException]",
                          reason=json.dumps(getattr(e, 'info', None)))
            res = [False, reason]
        except Exception as e:
            print(str(e))
            res = [False, str(e)]
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [Exception]",
                          reason=str(e))
        return res

    # ================================ get max fields limit
    # get the total_fields.limit from settings
    def get_total_fields_limit(self, indx):
        """Return ``index.mapping.total_fields.limit`` (1000 if unset)."""
        settings = self.es_db.indices.get_settings(index=indx)
        # The response is keyed by index name; only the first entry is read.
        index_settings = settings[list(settings.keys())[0]]['settings']['index']
        total_fields = index_settings.get('mapping', {}).get('total_fields', {})
        return total_fields.get('limit', 1000)  # default fields limit

    # ================================ push records to elasticsearch
    # data: is a list of json data
    def bulk_queue_push(self,
                        data,
                        case_id,
                        source=None,
                        machine=None,
                        data_type=None,
                        data_path=None,
                        chunk_size=500):
        """Wrap every item of ``data`` in a bulk action and push it to ``case_id``.

        Each record is stored under the ``Data`` key with a fresh UUID as
        ``_id``; the optional metadata arguments are added to ``_source``
        when given.

        Returns ``[True, message, failed, succeeded]`` or
        ``[False, message, bulk_queue, succeeded]``.
        """
        case_id = case_id.lower()
        bulk_queue = []
        for d in data:
            di = {
                "_index": case_id,
                "_source": {
                    "Data": d
                },
                '_id': str(uuid.uuid4())
            }
            if source is not None:
                di['_source']['data_source'] = source
            if machine is not None:
                di['_source']['machine'] = machine
            if data_type is not None:
                di['_source']['data_type'] = data_type
            if data_path is not None:
                di['_source']['data_path'] = data_path
            bulk_queue.append(di)

        logger.logger(level=logger.DEBUG,
                      type="elasticsearch",
                      message="Index [" + case_id + "]: Pushing [" +
                      str(len(bulk_queue)) + "] records")
        push_es = self.bulk_to_elasticsearch(bulk_queue, case_id, chunk_size)
        if push_es[0]:
            logger.logger(level=logger.INFO,
                          type="elasticsearch",
                          message="Index [" + case_id + "]: Pushed [" +
                          str(len(bulk_queue) - len(push_es[2])) +
                          "] records successfully")
            return [
                True, "Pushed [" + str(len(bulk_queue)) + "] records",
                push_es[2], push_es[3]
            ]
        else:
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Index [" + case_id +
                          "]: Failed pusheing [" + str(len(bulk_queue)) +
                          "] records",
                          reason=push_es[1])
            return [
                False,
                'Failed to bulk data to Elasticsearch: ' + str(push_es[1]),
                bulk_queue, push_es[3]
            ]

    # ================================ push records to elasticsearch
    # return list of records ids successed or failed
    def bulk_to_elasticsearch(self, bulk_queue, indx, chunk_size):
        """Push ``bulk_queue`` with ``helpers.parallel_bulk`` and retry fixable failures.

        Failed records are handed to ``bulk_to_elasticsearch_fix_errors``;
        the ones it repairs are pushed again.

        Returns ``[True, message, failed, succeeded]`` or, on an unexpected
        error, ``[False, message, bulk_queue, []]``.
        """
        try:
            errors = {}  # failed data (origin data and error info) by record id
            failed = []  # IDs of the failed records
            successed = []  # IDs of the successfully pushed records
            logger.logger(level=logger.DEBUG,
                          type="elasticsearch",
                          message="Index [" + indx +
                          "]: bulk push to ES, default chunk[" +
                          str(chunk_size) + "]: ",
                          reason="number of records: " + str(len(bulk_queue)))
            # use helpers to push the data to elasticsearch
            for ok, item in helpers.parallel_bulk(self.es_db,
                                                  bulk_queue,
                                                  chunk_size=chunk_size,
                                                  raise_on_error=False,
                                                  raise_on_exception=False):
                if not ok:
                    errors[item['index']['_id']] = item
                    logger.logger(level=logger.WARNING,
                                  type="elasticsearch",
                                  message="Index [" + indx +
                                  "]: Failed pushing record: ",
                                  reason=str(item))
                    failed.append(item['index']['_id'])
                else:
                    successed.append(item['index']['_id'])

            if len(failed):
                logger.logger(level=logger.WARNING,
                              type="elasticsearch",
                              message="Index [" + indx +
                              "]: Failed pushing [" + str(len(failed)) +
                              "] records, try to fix the issue")
                # get origin data from ID
                for data in bulk_queue:
                    try:
                        errors[data['_id']]['index']['data'] = data['_source']
                        logger.logger(level=logger.DEBUG,
                                      type="elasticsearch",
                                      message="Index [" + indx +
                                      "]: get data for failed record [" +
                                      data['_id'] + "]",
                                      reason=str(errors[data['_id']]))
                    except Exception:
                        # if record not in the errors list, continue
                        continue
                    logger.logger(level=logger.WARNING,
                                  type="elasticsearch",
                                  message="Index [" + indx +
                                  "]: Failed pushing record: ",
                                  reason=str(data['_id']))

                fixed_errors, nonfixed_errors = self.bulk_to_elasticsearch_fix_errors(
                    indx, errors)
                failed = nonfixed_errors
                if len(fixed_errors):
                    logger.logger(
                        level=logger.DEBUG,
                        type="elasticsearch",
                        message="Index [" + indx + "]: fixed issue of [" +
                        str(len(fixed_errors)) +
                        "] records, retry to push it")
                    repush_failed_errors = self.bulk_to_elasticsearch(
                        fixed_errors, indx, chunk_size)
                    if repush_failed_errors[0]:
                        successed += repush_failed_errors[3]
                    # Element 2 holds the records that are still failing
                    # whether or not the re-push itself succeeded, so they
                    # are always reported instead of silently vanishing.
                    failed += repush_failed_errors[2]

            return [
                True, "Pushed [" + str(len(successed)) + "] records to [" +
                indx + "] index", failed, successed
            ]
        # if connection timeout to elasticsearch occurred
        except elasticsearch.exceptions.ConnectionTimeout as e:
            logger.logger(level=logger.WARNING,
                          type="elasticsearch",
                          message="Index [" + indx +
                          "]: Failed to push the records, retry again",
                          reason="Connection to Elasticsearch timeout")
            # NOTE(review): the retry is unbounded; a persistent timeout
            # recurses until the interpreter's recursion limit is hit.
            return self.bulk_to_elasticsearch(bulk_queue, indx, chunk_size)
        except Exception as e:
            logger.logger(
                level=logger.ERROR,
                type="elasticsearch",
                message="Failed pushing the records, unexpected error",
                reason=str(e))
            return [
                False, "Failed pushing [" + str(len(bulk_queue)) +
                "] records to [" + indx + "] index", bulk_queue, []
            ]

    @staticmethod
    def _rebuild_action(doc):
        """Build a bulk action from a failed bulk item enriched with its data."""
        return {
            "_index": doc['index']['_index'],
            "_type": doc['index']['_type'],
            "_id": doc['index']['_id'],
            "_source": doc['index']['data']
        }

    # ================================ fix the errors faced during build_to_elasticsearch
    def bulk_to_elasticsearch_fix_errors(self, indx, errors):
        """Try to repair the records that failed during a bulk push.

        ``errors`` maps record ids to failed bulk items whose
        ``['index']['data']`` holds the original ``_source``.  Known
        failure reasons are handled: total-fields limit reached (the limit
        is raised once by 1000), a concrete value where an object is
        mapped, and unparsable ``date``/``text`` fields.

        Returns ``(fixed_records, nonfixed_records)`` as lists of bulk
        actions.
        """
        logger.logger(level=logger.WARNING,
                      type="elasticsearch",
                      message="Index [" + indx + "]: Failed pushing [" +
                      str(len(errors)) +
                      "] records [BulkIndexError], retry to fix the issue")
        # check the returned error for each document and try to solve it
        fixed_data = []
        nonfixed_data = []
        limit_fields_increased = False
        # items() works on both Python 2 and 3 (iteritems() is Python 2 only)
        for _id, doc in errors.items():
            record_msg_info = "Indx[" + indx + "]"
            if 'machine' in doc['index']['data']:
                record_msg_info += ", machine [" + doc['index']['data'][
                    'machine'] + "]"
            if 'data_type' in doc['index']['data']:
                record_msg_info += ", data_type[" + doc['index']['data'][
                    'data_type'] + "]"
            if '_id' in doc['index']:
                record_msg_info += ", rec_id[" + doc['index']['_id'] + "]"

            try:
                doc_reason = doc['index']['error']['reason']
                logger.logger(level=logger.WARNING,
                              type="elasticsearch",
                              message=record_msg_info + ": record failed",
                              reason=doc_reason)

                # === if the error is the limitation on the fields number,
                # add 1000 to the limitation and try again
                if "Limit of total fields" in doc_reason and not limit_fields_increased:
                    new_limit = int(self.get_total_fields_limit(indx))
                    new_limit = new_limit + 1000
                    inc = self.es_db.indices.put_settings(
                        index=indx,
                        body='{"index.mapping.total_fields.limit": ' +
                        str(new_limit) + '}')
                    if inc["acknowledged"]:
                        logger.logger(
                            level=logger.INFO,
                            type="elasticsearch",
                            message=record_msg_info +
                            " : The total_fields.limit has been increased to "
                            + str(new_limit))
                        limit_fields_increased = True
                    else:
                        logger.logger(
                            level=logger.ERROR,
                            type="elasticsearch",
                            message=record_msg_info +
                            " : failed to increase total_fields.limit")

                # === if already fixed the limit of total fields issue, then
                # add it to the list
                if "Limit of total fields" in doc_reason and limit_fields_increased:
                    fixed_data.append(self._rebuild_action(doc))
                    continue

                # the text field exceeded the maximum number of characters
                # (by default 32766); this is only logged, not repaired
                match = re.match(
                    r'Document contains at least one immense term in field="(.+)" \(whose UTF8 encoding is longer than the max length ([0-9]+)\), all of which were skipped.* original message: bytes can be at most ([0-9]+) in length; got ([0-9]+)',
                    doc_reason)
                if match is not None:
                    field = match.groups()[0]
                    current_max = int(match.groups()[1])
                    data_length = int(match.groups()[3])
                    logger.logger(level=logger.ERROR,
                                  type="elasticsearch",
                                  message=record_msg_info +
                                  " : field data more than the specified",
                                  reason="field " + field +
                                  ", defined max length [" +
                                  str(current_max) + "], field data [" +
                                  str(data_length) + "]")

                # ==== an object is mapped but a concrete value was received
                match = re.match(
                    r"object mapping for \[(.*)\] tried to parse field \[(.*)\] as (.*), but found a concrete value",
                    doc_reason)
                if match is not None:
                    match = match.groups()
                    failed_field = match[0]
                    # if datatype is object but found concrete value
                    if match[2] == 'object':
                        d = json_get_val_by_path(doc['index']['data'],
                                                 failed_field)
                        if d[0]:
                            # if type of field is object but found "None" as string
                            if d[1] == 'None':
                                if json_update_val_by_path(
                                        doc['index']['data'], failed_field,
                                        None)[0]:
                                    fixed_data.append(
                                        self._rebuild_action(doc))
                                    continue
                            # if type of field is object but found string
                            if isinstance(d[1], str):
                                if json_update_val_by_path(
                                        doc['index']['data'], failed_field,
                                    {'value': d[1]})[0]:
                                    fixed_data.append(
                                        self._rebuild_action(doc))
                                    continue

                # ==== failed to parse field as date / text
                match = re.match(
                    r"failed to parse field \[(.*)\] of type \[(.*)\] in document with id .*",
                    doc_reason)
                if match is not None:
                    match = match.groups()
                    failed_field = match[0]
                    failed_field_type = match[1]

                    # if the field mapped as date
                    if failed_field_type == 'date':
                        if json_update_val_by_path(doc['index']['data'],
                                                   failed_field,
                                                   '1700-01-01T00:00:00')[0]:
                            fixed_data.append(self._rebuild_action(doc))
                            continue

                    # if the field mapped as text
                    if failed_field_type == 'text':
                        d = json_get_val_by_path(doc['index']['data'],
                                                 failed_field)
                        if d[0]:
                            d = d[1]
                            try:
                                res_str = None
                                if isinstance(d, list):
                                    # Flatten the list to one line per item.
                                    # (The previous code called d.keys() on
                                    # the list, which always raised, so list
                                    # values were never repaired.)
                                    res_str = '\n'.join(
                                        str(item) for item in d)
                                elif isinstance(d, dict):
                                    res_str = "\n".join([
                                        str(k) + "=" + str(d[k])
                                        for k in d.keys()
                                    ])
                                if res_str is not None and json_update_val_by_path(
                                        doc['index']['data'], failed_field,
                                        res_str)[0]:
                                    fixed_data.append(
                                        self._rebuild_action(doc))
                                    continue
                            except Exception:
                                # best effort: an unconvertible value falls
                                # through to the "no fix found" path below
                                pass

                logger.logger(level=logger.ERROR,
                              type="elasticsearch",
                              message=record_msg_info +
                              " : No fix found for failed record [" +
                              doc['index']['_id'] + "] data",
                              reason=doc['index']['data'])
                nonfixed_data.append(self._rebuild_action(doc))
            except Exception as e:
                logger.logger(level=logger.ERROR,
                              type="elasticsearch",
                              message=record_msg_info +
                              " : unsuspected error in fixing record issue",
                              reason=str(e))
                nonfixed_data.append(self._rebuild_action(doc))

        return fixed_data, nonfixed_data

    # ================================ update record
    # update specific record in elasticsearch
    def update_field(self, data, doc_id, indx):
        """Update document ``doc_id`` in ``indx`` with the update body ``data``."""
        try:
            indx = indx.lower()
            up = self.es_db.update(index=indx,
                                   doc_type="_doc",
                                   id=doc_id,
                                   body=data)
            if up['result'] == 'updated':
                return [True, 'updated']
            else:
                return [
                    False, "Index[" + indx +
                    "]: Failed to update the record [" + str(doc_id) +
                    "] : " + str(json.dumps(data))
                ]
        except Exception as e:
            return [False, str(e)]

    # ================================ add tag
    def es_add_tag(self, data, case_id):
        """Index ``data`` as a new document in ``case_id``."""
        try:
            case_id = case_id.lower()
            ins = self.es_db.index(index=case_id, body=data)
            return [True, ins]
        except Exception as e:
            return [False, str(e)]

    # ================================ get record
    # get specific record by its id
    def get_record_by_id(self, case_id, record_id):
        """Fetch document ``record_id`` from ``case_id``."""
        case_id = case_id.lower()
        try:
            res = self.es_db.get(index=case_id, doc_type="_doc", id=record_id)
            return [True, res]
        except Exception as e:
            return [False, str(e)]

    # ================================ Delete record
    # delete records by id
    def del_record_by_id(self, case_id, record_id):
        """Delete document ``record_id`` from ``case_id``."""
        case_id = case_id.lower()
        try:
            res = self.es_db.delete(index=case_id,
                                    doc_type="_doc",
                                    id=record_id)
            if res['result'] == 'deleted':
                return [True, 'deleted']
            else:
                return [
                    False, "Index[" + case_id +
                    "]: Failed to delete the record [" + str(record_id) + "]"
                ]
        except elasticsearch.NotFoundError as e:
            # str() so a non-string id cannot raise inside the handler
            return [
                False,
                "NotFound: [" + case_id + "] _id[" + str(record_id) + "]"
            ]
        except Exception as e:
            return [False, str(e)]

    # ================================ Delete record
    # delete records by query
    def del_record_by_query(self, case_id, query):
        """Delete every document of ``case_id`` matching ``query``."""
        case_id = case_id.lower()
        try:
            self.es_db.delete_by_query(index=case_id, body=query)
            return [
                True, "Indx[" + case_id + "]: Deleted " + json.dumps(query)
            ]
        except Exception as e:
            return [False, str(e)]

    # ================================ Get fields mapping
    # return the fields mapping (all fields and its properties)
    def get_mapping_fields(self, case_id):
        """Return ``[True, fields]`` with the flattened field list of ``case_id``."""
        try:
            mapping = self.es_db.indices.get_mapping(index=case_id)
            if 'properties' in mapping[case_id]['mappings']:
                fields = mapping[case_id]['mappings']['properties']
                fields_rec = self.get_mapping_fields_rec(fields)
                if not fields_rec[0]:
                    return fields_rec
                fields_list = fields_rec[1]
            else:
                fields_list = []
            return [True, fields_list]
        except Exception as e:
            return [False, str(e)]

    # recursive function for get_mapping_fields
    def get_mapping_fields_rec(self, fields, current_path=None):
        """Flatten a mapping ``properties`` tree into a list of field records.

        Each record has ``type``, the dotted ``field_path`` and ``fields``
        (the name of the first sub-field, or ``''`` when there is none).

        Returns ``[True, records]`` or ``[False, error]``.
        """
        current_path = [] if current_path is None else current_path
        fields_list = []
        try:
            for k in fields.keys():
                if 'properties' in fields[k]:
                    fields_rec = self.get_mapping_fields_rec(
                        fields[k]['properties'], current_path + [k])
                    if not fields_rec[0]:
                        return fields_rec
                    fields_list += fields_rec[1]
                else:
                    current_path_tmp = '.'.join(current_path)
                    if len(current_path) > 0:
                        current_path_tmp += "."
                    r = {
                        'type': fields[k]['type'],
                        'field_path': current_path_tmp + k,
                        # list() keeps the first-key lookup valid on Python 3
                        'fields': list(fields[k]['fields'].keys())[0]
                        if 'fields' in fields[k] else ''
                    }
                    fields_list.append(r)
            return [True, fields_list]
        except Exception as e:
            return [False, str(e)]

    # ============================== get System health information
    # return the nodes information
    def get_nodes_info(self):
        """Return ``[True, nodes_info]`` from the cluster."""
        try:
            return [True, self.es_db.nodes.info()]
        except Exception as e:
            return [False, str(e)]

    def get_indices_settings(self):
        """Return ``[True, settings]`` for all indices."""
        try:
            return [True, self.es_db.indices.get_settings('*')]
        except Exception as e:
            return [False, str(e)]

    def get_indices_stats(self):
        """Return ``[True, stats]`` from the indices stats API."""
        try:
            return [True, self.es_db.indices.stats('')]
        except Exception as e:
            return [False, str(e)]

    def get_index_count(self, index):
        """Return ``[True, count]`` from the cat count API for ``index``."""
        try:
            return [
                True,
                self.es_db.cat.count(index, params={"format": "json"})
            ]
        except Exception as e:
            return [False, str(e)]
class ES:
    """Elasticsearch index for the entity-centric (EC) or type-centric (TC) model.

    The index is named ``<model>_<similarity>`` (lower-cased) and uses the
    similarity settings returned by ``get_<similarity>_settings``.
    """

    def __init__(self, model='EC', similarity='BM25'):
        self.model = model
        self.similarity = similarity
        self._settings = self.get_model_settings()
        self._index_name = f'{model}_{similarity}'.lower()
        # Dispatch on the similarity name: BM25 / LM / Custom.
        self._settings['settings'] = getattr(
            self, f'get_{similarity.lower()}_settings')()
        self.es = Elasticsearch(timeout=120)

    def get_index(self):
        """Return the name of the index this instance works on."""
        return self._index_name

    def get_lm_settings(self):
        """Index settings selecting LMDirichlet as default similarity."""
        return {"similarity": {"default": {"type": "LMDirichlet"}}}

    def get_bm25_settings(self):
        """Index settings selecting BM25 as default similarity."""
        return {"similarity": {"default": {"type": "BM25"}}}

    def get_custom_settings(self):
        """Index settings with LMDirichlet plus a scripted ``custom_LMDirichlet``."""
        return {
            "similarity": {
                "default": {
                    "type": "LMDirichlet"
                },
                "custom_LMDirichlet": {
                    "type": "scripted",
                    "script": {
                        "source": """
                            double freq = doc.freq;
                            double PtC = (term.totalTermFreq+1.0)/(field.sumTotalTermFreq+1.0);
                            double tw = Math.log(1.0 + (freq/(2000.0*PtC)));
                            double norm = Math.log(2000.0 / (doc.length + 2000.0));
                            return Math.max((tw + norm), 0.0);
                        """
                    }
                }
            }
        }

    def get_model_settings(self):
        """Return the mapping section; adds a stored ``weight`` field for 'Custom'."""
        properties = {
            'body': {
                'type': 'text',
                'term_vector': 'yes',
                'analyzer': 'english',
                'similarity': 'default'
            }
        }
        if self.similarity == 'Custom':
            properties['weight'] = {
                'type': 'float',
                'index': False,
                "store": True
            }
        return {'mappings': {'properties': properties}}

    def reset_index(self):
        """Delete the index if it exists, then create it with the stored settings."""
        if self.es.indices.exists(self._index_name):
            self.es.indices.delete(self._index_name)
        self.es.indices.create(self._index_name, self._settings)

    def data_from_generator(self, doc):
        """Yield one bulk action per ``(doc_id, body)`` item of ``doc``.

        Progress is printed every tenth of the collection.  Collections
        with fewer than ten documents have a step of zero, so no progress
        is printed for them (previously this raised ``ZeroDivisionError``).
        """
        num_docs = len(doc) // 10
        for i, (doc_id, body) in enumerate(doc.items()):
            yield {'_index': self._index_name, '_id': doc_id, '_source': body}
            if num_docs and i % num_docs == 0:
                print('{}% done'.format((i // num_docs) * 10))

    def _index_EC(self, documents):
        """Bulk-index ``documents`` in parallel, printing every failed item."""
        for success, info in parallel_bulk(self.es,
                                           self.data_from_generator(documents),
                                           thread_count=12,
                                           chunk_size=5000,
                                           max_chunk_bytes=104857600,
                                           queue_size=6):
            if not success:
                print('A document failed:', info)

    def _index_TC(self, documents):
        """Index ``documents`` one by one, printing progress every tenth.

        As in ``data_from_generator``, no progress is printed when there
        are fewer than ten documents (avoids a modulo by zero).
        """
        num_docs = len(documents) // 10
        for i, (did, body) in enumerate(documents.items()):
            self.es.index(self._index_name, body=body, id=did)
            if num_docs and i % num_docs == 0:
                print('{}% done'.format((i // num_docs) * 10))

    def reindex(self, doc_body='short', ancestors=False):
        """Recreate the index and fill it with the documents of the model."""
        print('Indexing model {} - {}'.format(self.model, self.similarity))
        self.reset_index()
        if self.model == 'EC':
            documents = get_EC_documents(doc_body)
            self._index_EC(documents)
        else:
            documents = get_TC_documents(doc_body, ancestors)
            if self.similarity == 'Custom':
                weights = get_type_weights()
                for t in documents:
                    documents[t]['weight'] = weights.get(t, 1)
            self._index_TC(documents)

    def analyze_query(self, query, field='body'):
        """Analyzes a query with respect to the relevant index.

        Arguments:
            query: String of query terms.
            field: The field with respect to which the query is analyzed.

        Returns:
            A list of query terms that exist in the specified field among the
            documents in the index.
        """
        tokens = self.es.indices.analyze(index=self._index_name,
                                         body={'text': query})['tokens']
        query_terms = []
        for t in sorted(tokens, key=lambda x: x['position']):
            # A term is kept only if at least one document matches it.
            hits = self.es.search(index=self._index_name,
                                  body={'query': {
                                      'match': {
                                          field: t['token']
                                      }
                                  }},
                                  _source=False,
                                  size=1).get('hits', {}).get('hits', {})
            if len(hits) > 0 and hits[0]['_id'] is not None:
                query_terms.append(t['token'])
        return query_terms

    def baseline_EC_retrieval(self, queries, k=100):
        """Performs baseline retrieval on the index.

        Only queries of category ``'resource'`` with at least one known
        term are searched.  Returns a dict mapping query id to a list of
        ``(doc_id, score)`` pairs (at most ``k`` per query).
        """
        ids, body = [], []
        for query in queries:
            if query['category'] != 'resource':
                continue
            q = self.analyze_query(query['question'])
            if not q:
                continue
            ids.append(query['id'])
            body.append({})
            body.append({
                'query': {
                    'match': {
                        'body': ' '.join(q)
                    }
                },
                '_source': False,
                'size': k
            })
        if not body:
            # Nothing qualified: skip the request instead of sending an
            # empty msearch body.
            return {}
        res = self.es.msearch(index=self._index_name, body=body)['responses']
        return {
            qid: [(doc['_id'], doc['_score']) for doc in hits['hits']['hits']]
            for qid, hits in zip(ids, res)
        }

    def baseline_TC_retrieval(self, queries, k=100):
        """Performs baseline retrieval on the index, one search per query term.

        Scores of a document are summed over the terms; the ``k`` best
        ``(doc_id, score)`` pairs are kept per query id.
        """
        results = {}
        for query in queries:
            if query['category'] != 'resource':
                continue
            q = self.analyze_query(query['question'])
            if not q:
                continue
            body = []
            for term in q:
                body.append({})
                body.append({
                    'query': {
                        'match': {
                            'body': term
                        }
                    },
                    '_source': False
                })
            res = self.es.msearch(index=self._index_name,
                                  body=body)['responses']
            scores = defaultdict(int)
            for hits in res:
                for doc in hits['hits']['hits']:
                    scores[doc['_id']] += doc['_score']
            results[query['id']] = sorted(scores.items(),
                                          key=lambda x: x[1],
                                          reverse=True)[:k]
        return results

    def load_baseline_results(self, dataset='train', force=False):
        """Load cached baseline results, or retrieve and cache them.

        Returns ``None`` when the dataset file cannot be loaded.
        """
        fname = f'top100_{self.model}_{self.similarity}_{dataset}'
        if not force:
            results = load_dict_from_json(fname)
            if results:
                return results
        print('Retrieving from index.')
        queries = load_dict_from_json(f'{dataset}_set_fixed.json')
        if not queries:
            print('Cannot find the dataset.')
            return None
        res = getattr(self, f'baseline_{self.model}_retrieval')(queries)
        save_dict_to_json(res, fname)
        return res

    def get_baseline_EC_scores(self, results, k=100):
        """Aggregates scores from EC index and return ranked types

        Args:
            results (dict): baseline entity retrieval
            k (int, optional): Number of documents to aggregate over.
                Defaults to 100.

        Returns:
            dict: Type scores
        """
        type_weights = get_type_weights()
        entity_types = get_all_instance_types(True)
        system_output = {}
        for qid, res in results.items():
            scores = defaultdict(int)
            for entity, score in res[:k]:
                for t in entity_types[entity]:
                    scores[t] += score / type_weights[t]
            system_output[qid] = sorted(scores.items(),
                                        key=lambda x: x[1],
                                        reverse=True)
        return system_output

    def get_baseline_TC_scores(self, results, k=None):
        """TC results are already type scores; return them unchanged."""
        return results

    def generate_baseline_scores(self, dataset='train', k=100, force=False):
        """Load the raw baseline results and convert them to type scores."""
        raw_results = self.load_baseline_results(dataset, force)
        return getattr(self,
                       f'get_baseline_{self.model}_scores')(raw_results, k)
class DB:
    """Application store: MongoDB holds apps and reviews, Elasticsearch the search copy."""

    # Name of the Elasticsearch index that mirrors the apps collection.
    IND = "appsearch"

    def __init__(self, mdb_conn, es_conn, force_delete=False):
        """Connect to both stores and make sure the search index exists.

        Args:
            mdb_conn: MongoDB connection string/host passed to ``MongoClient``.
            es_conn: Elasticsearch host (port 9200 is used).
            force_delete: drop the search index first so it is rebuilt.
        """
        self.db = MongoClient(mdb_conn).main
        self.es = Elasticsearch([{'host': es_conn, 'port': 9200}])
        if self.es.ping():
            print('Elasticsearch connected')
        else:
            print('Elasticsearch not connected')
        self.create_index(force_delete)

    def create_index(self, force):
        """Create, configure and fill the search index when it is missing.

        With ``force`` the existing index is deleted first.
        """
        if force:
            try:
                self.es.indices.delete(self.IND)
            except Exception:
                # Best effort: the index may simply not exist yet.
                pass
        if not self.es.indices.exists(self.IND):
            self.es.indices.create(index=self.IND)
            print('Index created')
            # The settings and mapping are applied while the index is closed.
            self.es.indices.close(self.IND)
            with open('data/es_settings.json') as settings_file:
                self.es.indices.put_settings(index=self.IND,
                                             body=json.load(settings_file))
            with open('data/es_mapping.json') as mapping_file:
                self.es.indices.put_mapping(index=self.IND,
                                            body=json.load(mapping_file))
            print("Index configured")
            self.es.indices.open(self.IND)
            self.es.indices.refresh(index=self.IND)
            print("Index opened")
            self.update_es()

    def insert(self, data):
        """Index ``data`` as a new document (Elasticsearch assigns the id)."""
        return self.es.index(index=self.IND, body=data)

    def update_es(self):
        """Copy every app from MongoDB into the search index.

        The Mongo ``_id`` is stored as the string field ``extid`` and also
        used as the Elasticsearch document id, so running this again
        overwrites the existing documents instead of adding duplicates.
        """
        res = self.db.apps.find({})
        for i in tqdm(res, total=res.count()):
            i["extid"] = str(i["_id"])
            del i["_id"]
            self.es.index(index=self.IND, id=i["extid"], body=i)
        print("Finishing copying records")

    def execute_search(self, **params):
        """Run a search (2s timeout) and return the hit sources with ``_score`` added.

        Any failure results in an empty list.
        """
        def proc(x):
            x["_source"]["_score"] = x["_score"]
            return x["_source"]

        try:
            hits = self.es.search(timeout="2s", **params)['hits']['hits']
            return [proc(hit) for hit in hits]
        except Exception:
            # Search is best effort for callers; degrade to "no results".
            return []

    def execute_msearch(self, allow_remove_source=True, **params):
        """Run all bodies in ``params['body']`` against ``params['index']``.

        Returns the hits of all responses in one flat list.  A hit whose
        ``_source`` has exactly one field is reduced to that field's value
        when ``allow_remove_source`` is set; otherwise the source dict with
        ``_score`` added is returned.  Any failure results in an empty list.
        """
        def proc(x):
            if allow_remove_source and len(x["_source"]) == 1:
                return x["_source"].popitem()[1]
            x["_source"]["_score"] = x["_score"]
            return x["_source"]

        try:
            request = []
            for body in params["body"]:
                request.append({"index": params["index"]})
                request.append(body)
            responses = self.es.msearch(body=request)['responses']
            results = []
            for response in responses:
                results.extend(proc(hit) for hit in response['hits']['hits'])
            return results
        except Exception:
            # Search is best effort for callers; degrade to "no results".
            return []

    def get_pending_review(self):
        """Return one review document (or ``None`` when there is none)."""
        return self.db.reviews.find_one()

    def get_pending_app(self):
        """Return one randomly sampled app document."""
        return list(self.db.apps.aggregate([{"$sample": {"size": 1}}]))[0]

    def save_review(self, aid, text, type):
        """Store an unchecked review ``text`` of kind ``type`` for app ``aid``."""
        self.db.reviews.insert_one({
            "aid": aid,
            "text": text,
            "type": type,
            "checked": 0
        })

    def approve_review(self, id):
        """Apply review ``id`` to its app, refresh the search index, delete the review.

        A truthy review ``type`` appends the text to the app's
        ``feedbacks``; otherwise it is appended to ``tags``.  An unknown
        ``id`` is ignored.
        """
        rev = self.db.reviews.find_one({"_id": ObjectId(id)})
        if rev is None:
            return
        target_field = 'feedbacks' if rev['type'] else 'tags'
        self.db.apps.update_one({"_id": ObjectId(rev['aid'])},
                                {'$push': {
                                    target_field: rev['text']
                                }})
        self.update_es()
        self.db.reviews.delete_one({"_id": ObjectId(id)})

    def category_search(self, text):
        """Search apps by category, best ``pos_feedbacks`` first, then score."""
        return self.execute_search(index=self.IND,
                                   body={
                                       "sort": [{
                                           "pos_feedbacks": {
                                               "order": "desc"
                                           }
                                       }, "_score"],
                                       "query": {
                                           "match": {
                                               "category": text
                                           }
                                       }
                                   })

    def search(self, txt):
        """Search ``txt`` in titles and in descriptions; returns all hits."""
        return self.execute_msearch(index=self.IND,
                                    body=[{
                                        "query": {
                                            "match": {
                                                "title": txt
                                            }
                                        },
                                    }, {
                                        "query": {
                                            "match": {
                                                "description": txt
                                            }
                                        },
                                    }])

    def get_ids_for_query(self, txt):
        """Return the Mongo ``ObjectId`` of every app matching ``txt``."""
        res = self.execute_msearch(index=self.IND,
                                   body=[{
                                       "query": {
                                           "match": {
                                               "title": txt
                                           }
                                       },
                                       "_source": ["extid"],
                                   }, {
                                       "query": {
                                           "match": {
                                               "description": txt
                                           }
                                       },
                                       "_source": ["extid"],
                                   }])
        return [ObjectId(x) for x in res]

    def combine_tags(self, txt):
        """Return the distinct non-empty tags of all apps matching ``txt``."""
        res = self.db.apps.aggregate([{
            "$match": {
                "_id": {
                    "$in": self.get_ids_for_query(txt),
                }
            }
        }, {
            "$unwind": "$tags"
        }, {
            "$group": {
                "_id": "null",
                "tags": {
                    "$addToSet": "$tags"
                }
            }
        }])
        res = list(res)
        if len(res) > 0:
            res = res[0]['tags']
            if '' in res:
                res.remove('')
            return res
        return []

    def query_by_tags(self, txt, tags):
        """Return apps matching ``txt`` (and any of ``tags``, when given).

        Results are sorted by ``pos_feedbacks`` in descending order.
        """
        return list(
            self.db.apps.aggregate([{
                "$match": {
                    "$and": [{
                        "_id": {
                            "$in": self.get_ids_for_query(txt),
                        }
                    }] + ([{
                        "tags": {
                            "$elemMatch": {
                                "$in": tags
                            }
                        }
                    }] if len(tags) > 0 else [])
                }
            }, {
                "$sort": {
                    "pos_feedbacks": -1
                }
            }]))

    def get_app_by_id(self, x):
        """Return the app with id ``x`` (a string is converted to ``ObjectId``)."""
        return self.db.apps.find_one(
            {"_id": ObjectId(x) if isinstance(x, str) else x})
class MUSEUM:
    """Index files into Elasticsearch and search for them by sampled features.

    Feature extraction, request bodies and report formatting are delegated
    to helpers defined elsewhere (``preprocess.action``, ``get_*_request``,
    ``make_report_hits``); this class wires them to the Elasticsearch client.
    """

    def __init__(self, host, port, use_caching=False):
        """Create the client (600 s timeout) and remember the caching flag."""
        self.es = Elasticsearch(hosts=host, port=port, timeout=600)
        self.use_caching = use_caching

    @staticmethod
    def _resolve_file_list(target):
        """Turn ``target`` into a collection of file paths.

        A list or set is used as is; a string naming an existing directory is
        expanded with ``walk_directory``.

        Raises:
            NotADirectoryError: For anything else.
        """
        if isinstance(target, (list, set)):
            return target
        if isinstance(target, str) and os.path.isdir(target):
            return walk_directory(target)
        raise NotADirectoryError("{} is not a directory".format(target))

    @staticmethod
    def _file_stem(file_path):
        """Return the file name without directory and extension."""
        return os.path.splitext(os.path.basename(file_path))[0]

    def create_index(self, index, module, num_hash=128, use_smallest=False,
                     use_mod=False, use_minmax=False, shards=5, replicas=1,
                     interval=10):
        """Create ``index`` from the template built by ``get_index_template``.

        Raises:
            NotDefinedError: If ``index`` is the empty string.
            AlreadyExistError: If the index already exists.

        Returns:
            The response of the index-creation call.
        """
        if index == '':
            raise NotDefinedError("Index parameter is not passed")
        if self.es.indices.exists(index=index):
            raise AlreadyExistError("\"{}\" already exist index".format(index))
        return self.es.indices.create(
            index=index,
            body=get_index_template(module, num_hash, use_smallest, use_mod,
                                    use_minmax, shards, replicas, interval))

    def get_index_info(self, index_name):
        """Return the ``_meta`` section of the index mapping.

        The entry ``module`` is added by passing ``module_info`` to
        ``module_loader``.

        Raises:
            NotExistError: If the index does not exist.
        """
        if not self.es.indices.exists(index=index_name):
            raise NotExistError("Index does not exist")
        mapping = self.es.indices.get_mapping(index=index_name)
        index_info = mapping[index_name]['mappings']['_meta']
        index_info['module'] = module_loader(index_info['module_info'])
        return index_info

    def bulk(self, index_name, target, process_count=8, batch_size=10000,
             disable_tqdm=False, pass_indexed_files=False):
        """Preprocess files in batches and bulk-index them.

        Args:
            index_name: Destination index.
            target: List/set of file paths, or a directory to walk.
            process_count: Passed to ``mp_helper``.
            batch_size: Number of files per batch.
            disable_tqdm: Disable the progress bar.
            pass_indexed_files: When true, files whose name stem is already
                an id in the index are skipped.

        After the last batch the method sleeps for the index's
        ``refresh_interval`` seconds.
        """
        index_info = self.get_index_info(index_name)
        file_list = self._resolve_file_list(target)

        # The context manager closes the bar even if a batch raises.
        with tqdm(total=len(file_list), desc="Bulk index",
                  disable=disable_tqdm) as pbar:
            for batch_file_list in batch_generator(file_list, batch_size):
                if pass_indexed_files:
                    exist_md5_set = self.__check_exists(index_name,
                                                        batch_file_list)
                    remain_file_list = []
                    for file_path in batch_file_list:
                        if self._file_stem(file_path) not in exist_md5_set:
                            remain_file_list.append(file_path)
                        else:
                            # Skipped files still count as progress.
                            pbar.update(1)
                else:
                    remain_file_list = batch_file_list

                bulk_body_list = []
                for file_md5, sampled_data, feature_size, file_name in mp_helper(
                        preprocess.action, remain_file_list, process_count,
                        index_info=index_info, use_caching=self.use_caching):
                    if sampled_data:
                        bulk_body_list.append(
                            get_bulk_request(file_md5, sampled_data,
                                             feature_size, file_name,
                                             index_name))
                    pbar.update(1)

                if bulk_body_list:
                    self.es.bulk(body=bulk_body_list)

        print("Waiting {} sec for index refresh".format(
            index_info["refresh_interval"]))
        time.sleep(int(index_info["refresh_interval"]))

    def search(self, index_name, file_path, limit=1, index_info=None):
        """Search the index for one file.

        Args:
            index_name: Index to query.
            file_path: File to preprocess and look up.
            limit: Passed to ``get_search_body``.
            index_info: Optional pre-fetched result of ``get_index_info``.

        Returns:
            ``{'query': file_name, 'hits': [...]}``; ``hits`` stays empty when
            no samples were produced or the search timed out.
        """
        if not index_info:
            index_info = self.get_index_info(index_name)
        _, query_samples, query_feature_size, file_name = preprocess.action(
            file_path, index_info, self.use_caching)
        report = {'query': file_name, 'hits': []}
        if not query_samples:
            return report

        try:
            response = self.es.search(
                index=index_name,
                body=get_search_body(query_samples, limit),
                search_type='dfs_query_then_fetch')
        except ConnectionTimeout:
            print('Search error detected')
            return report

        report['hits'] = make_report_hits(response, query_samples,
                                          query_feature_size, index_info)
        return report

    def multi_search(self, index_name, target, limit=1, process_count=1,
                     batch_size=100, disable_tqdm=False):
        """Search many files; yield one list of reports per batch.

        This is a generator, so nothing (including argument validation)
        happens until iteration starts. A batch whose msearch times out is
        skipped without yielding.

        Raises:
            NotADirectoryError: If ``target`` is neither a list/set nor an
                existing directory.
        """
        file_list = self._resolve_file_list(target)
        index_info = self.get_index_info(index_name)

        # Closing via the context manager also covers an abandoned generator.
        with tqdm(total=len(file_list), disable=disable_tqdm,
                  desc="Multiple search") as pbar:
            for jobs in batch_generator(file_list, batch_size):
                search_data_list = []
                queries = []  # (file_name, samples, feature_size) per request
                for _, query_samples, query_feature_size, file_name in mp_helper(
                        preprocess.action, jobs, process_count,
                        index_info=index_info, use_caching=self.use_caching):
                    if query_samples:
                        search_data_list.append(
                            get_msearch_request(index_name, query_samples,
                                                limit))
                        queries.append(
                            (file_name, query_samples, query_feature_size))

                report_list = []
                if search_data_list:
                    try:
                        resp = self.es.msearch(
                            body="\n".join(search_data_list))
                    except ConnectionTimeout:
                        print('Search error detected')
                        continue
                    # Responses come back in request order.
                    for (file_name, samples, feature_size), response in zip(
                            queries, resp['responses']):
                        report_list.append({
                            'query': file_name,
                            'hits': make_report_hits(response, samples,
                                                     feature_size, index_info),
                        })
                pbar.update(len(report_list))
                yield report_list

    def __check_exists(self, index_name, batch_file_list):
        """Return the ids of the batch's files that are already indexed.

        Each file's name stem is looked up with ``get_exists_request``; the
        ``_id`` of the first hit of every non-empty response is collected.
        """
        md5_list = [self._file_stem(file_path)
                    for file_path in batch_file_list]
        if not md5_list:
            # Nothing to ask; an empty msearch body would be rejected.
            return set()

        exist_query_list = [get_exists_request(index_name, md5)
                            for md5 in md5_list]
        responses = self.es.msearch(
            body="\n".join(exist_query_list))['responses']

        exist_md5_set = set()
        for response in responses:
            hits = response['hits']['hits']
            if hits:
                exist_md5_set.add(hits[0]['_id'])
        return exist_md5_set
class RaintankFinder(object):
    """Finder that resolves metric paths in Elasticsearch and reads datapoints
    from the "tank" HTTP service.
    """

    __fetch_multi__ = "raintank"

    def __init__(self, config):
        """Read the ``raintank`` section of ``config`` and connect to ES.

        Defaults: tank ``http://localhost:6060``, ES ``http://localhost:9200``.
        """
        cfg = config.get('raintank', {})
        es = cfg.get('es', {})
        rt = cfg.get('tank', {})
        self.config = {
            "tank": {
                "url": rt.get('url', 'http://localhost:6060')
            },
            "es": {
                "url": es.get('url', 'http://localhost:9200')
            }
        }
        logger.info("initialize RaintankFinder", config=self.config)
        self.es = Elasticsearch([self.config['es']['url']])

    def find_nodes(self, query):
        """Yield a leaf node per matching metric name, then the branch nodes."""
        # Query Elasticsearch for paths.
        matches = self.search_series(query)
        for name, metrics in matches['leafs'].items():
            yield RaintankLeafNode(name, RaintankReader(self.config, metrics))
        for branch_name in matches['branches']:
            yield BranchNode(branch_name)

    def search_series(self, query):
        """Look up leaves and branches matching ``query.pattern``.

        The dotted pattern is split into parts; part ``i`` is matched against
        field ``nodes.n<i>`` with a ``term`` query, or a ``regexp`` query when
        ``is_pattern`` says the part is a glob. Two searches are sent in one
        msearch: leaves (``node_count`` equal to the number of parts) and
        branches (``node_count`` greater, aggregated on the last part's field).

        Returns:
            ``{'leafs': {name: [RaintankMetric, ...]}, 'branches': [str, ...]}``
        """
        parts = query.pattern.split(".")
        part_len = len(parts)

        must_clauses = []
        for pos, part in enumerate(parts):
            node = "nodes.n%d" % pos
            if is_pattern(part):
                q_type = "regexp"
                # Translate glob syntax into a regular expression.
                value = (part.replace('*', '.*').replace('{', '(')
                         .replace(',', '|').replace('}', ')'))
            else:
                q_type = "term"
                value = part
            must_clauses.append({q_type: {node: value}})
        es_query = {"bool": {"must": must_clauses}}

        def search_body(node_count_clause):
            # Both searches share everything except the node_count clause.
            return {
                "query": {
                    "filtered": {
                        "filter": {
                            "bool": {
                                "must": [node_count_clause],
                                "should": [
                                    {"term": {"org_id": g.org}},
                                    {"term": {"org_id": -1}}
                                ]
                            }
                        },
                        "query": es_query
                    }
                }
            }

        leaf_query = json.dumps(
            search_body({"term": {"node_count": part_len}}))

        branch_search_body = search_body(
            {"range": {"node_count": {"gt": part_len}}})
        branch_search_body["aggs"] = {
            "branches": {
                "terms": {
                    "field": "nodes.n%d" % (part_len - 1),
                    "size": 500
                }
            }
        }
        branch_query = json.dumps(branch_search_body)

        search_body_text = (
            '{"index": "metric", "type": "metric_index", "size": 500}' + "\n"
            + leaf_query + "\n"
            + '{"index": "metric", "type": "metric_index", "search_type": "count"}' + "\n"
            + branch_query + "\n"
        )

        branches = []
        leafs = {}
        with statsd.timer("graphite-api.search_series.es_search.query_duration"):
            ret = self.es.msearch(index="metric", doc_type="metric_index",
                                  body=search_body_text)
            for hit in ret['responses'][0]["hits"]["hits"]:
                source = hit['_source']
                leafs.setdefault(source['name'], []).append(
                    RaintankMetric(source, True))

            # The aggregation key is the value at the LAST part's position,
            # so it replaces only that part of the queried path.
            prefix = parts[:-1]
            for agg in ret['responses'][1]['aggregations']['branches']['buckets']:
                branches.append(".".join(prefix + [str(agg['key'])]))

        return dict(leafs=leafs, branches=branches)

    def fetch_multi(self, nodes, start_time, end_time):
        """Fetch datapoints for ``nodes`` and align them on a common step.

        The step is the smallest ``interval`` among all metrics of all nodes.
        Gaps are filled with ``None``.

        Returns:
            ``(time_info, series)`` where ``time_info`` is
            ``(start_time + delta, end_time, step)`` and ``series`` maps each
            path to its list of values.
        """
        step = None
        for node in nodes:
            for metric in node.reader.metrics:
                if step is None or metric.interval < step:
                    step = metric.interval

        with statsd.timer("graphite-api.fetch.raintank_query.query_duration"):
            data = self.fetch_from_tank(nodes, start_time, end_time)

        series = {}
        delta = None
        with statsd.timer("graphite-api.fetch.unmarshal_raintank_resp.duration"):
            for path, points in data.items():
                max_pos = len(points)
                if max_pos == 0:
                    # No data at all: one None per step in the window.
                    series[path] = [None] * int((end_time - start_time) / step)
                    continue

                if delta is None:
                    # Offset of the first timestamp; taken from the first
                    # non-empty series only.
                    delta = (points[0][1] % start_time) % step
                    if delta == 0:
                        delta = step

                datapoints = []
                next_time = start_time
                pos = 0
                while next_time <= end_time:
                    # Pad missing values at the end of the time window.
                    if pos >= max_pos:
                        datapoints.append(None)
                        next_time += step
                        continue

                    # Each point is (value, timestamp).
                    ts = points[pos][1]
                    v = points[pos][0]

                    # Pad missing points with null.
                    while ts > (next_time + step):
                        datapoints.append(None)
                        next_time += step

                    datapoints.append(v)
                    next_time += step
                    pos += 1
                    if (ts + step) > end_time:
                        break

                series[path] = datapoints

        if delta is None:
            delta = 1
        time_info = (start_time + delta, end_time, step)
        return time_info, series

    def fetch_from_tank(self, nodes, start_time, end_time):
        """Request datapoints for every metric of ``nodes`` from the tank.

        Returns:
            Dict mapping metric name to its datapoints; results of several
            metric ids sharing a name are merged and sorted by timestamp.
        """
        params = {"render": [], "from": start_time, "to": end_time}
        path_map = {}
        for node in nodes:
            for metric in node.reader.metrics:
                params['render'].append(metric.id)
                path_map[metric.id] = metric.name

        # Works whether or not the configured URL ends with a slash.
        url = self.config['tank']['url'].rstrip('/') + '/get'
        resp = requests.get(url, params=params)
        logger.debug('fetch_from_tank', url=url,
                     status_code=resp.status_code, body=resp.text)

        data_map = {}
        for result in resp.json():
            path = path_map[result['Target']]
            if path in data_map:
                # Merge the datapoints, then re-sort by timestamp.
                data_map[path].extend(result['Datapoints'])
                data_map[path].sort(key=lambda point: point[1])
            else:
                data_map[path] = result['Datapoints']
        return data_map
def search_similarity_master_data(similar_row_hash, index_name, doc_type='details'):
    """Replace each row with its best-matching document from Elasticsearch.

    For every row a ``bool``/``must`` query is built with one ``match``
    clause per non-empty field, asking for a single hit. Queries are sent
    with msearch in batches of 1000 rows (2000 header/body entries).

    Args:
        similar_row_hash: Dict mapping a row key to a dict of
            ``field -> text``. It is modified in place.
        index_name: Index named in every msearch header.
        doc_type: Value placed under ``doc_type`` in every msearch header.

    Returns:
        The same ``similar_row_hash`` object; rows with at least one hit now
        hold the ``_source`` of the first hit, other rows are unchanged.
    """
    # Open the connection.
    es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])

    # Build one header + query pair per row, remembering the row order.
    row_keys = []
    msearch_dsl = []
    for row_key, master in similar_row_hash.items():
        # Required conditions: every non-empty field must match.
        must_list = [
            {"match": {key: {"query": query_name}}}
            for key, query_name in master.items()
            if query_name != ''
        ]
        msearch_dsl.append({"index": index_name, "doc_type": doc_type})
        msearch_dsl.append({"query": {"bool": {"must": must_list}}, "size": 1})
        row_keys.append(row_key)

    # Batched querying: 1000 rows, i.e. 2000 msearch entries, per request.
    rows_per_request = 1000
    for offset in range(0, len(row_keys), rows_per_request):
        batch_keys = row_keys[offset:offset + rows_per_request]
        batch_dsl = msearch_dsl[2 * offset:2 * (offset + rows_per_request)]
        responses = es.msearch(body=batch_dsl)

        # Responses come back in request order, so they pair up with the keys.
        for row_key, response in zip(batch_keys, responses['responses']):
            hits = response['hits']['hits']
            if hits:
                similar_row_hash[row_key] = hits[0]['_source']

    return similar_row_hash
class QueryLogRhythm:
    """Build query strings from MISP attribute types and run them in
    Elasticsearch.

    ``elastic_misp_mapping`` maps a MISP attribute type to the log field
    names that are searched for values of that type.
    """

    # NOTE(review): 'test' and 'test2' do not map to lists of field names and
    # look like leftover debug entries - confirm and remove.
    elastic_misp_mapping = {
        'authentihash': ['hash', 'object'],
        'cdhash': ['hash', 'object'],
        'domain': ['domain', 'domainOrigin', 'impactedHostName', 'originHostName'],
        'email-dst': ['recipient'],
        'email-reply-to': ['recipient'],
        'email-src': ['sender'],
        'email-subject': ['subject'],
        'filename': ['object', 'objectName'],
        'impfuzzy': ['hash', 'object'],
        'imphash': ['hash', 'object'],
        'md5': ['hash', 'object'],
        'pehash': ['hash', 'object'],
        'sha1': ['hash', 'object'],
        'sha224': ['hash', 'object'],
        'sha256': ['hash', 'object'],
        'sha384': ['hash', 'object'],
        'sha512': ['hash', 'object'],
        'sha512/224': ['hash', 'object'],
        'sha512/256': ['hash', 'object'],
        'ssdeep': ['hash', 'object'],
        'tlsh': ['hash', 'object'],
        'hassh-md5': ['hash', 'object'],
        'hasshserver-md5': ['hash', 'object'],
        'ja3-fingerprint-md5': ['hash', 'object'],
        'hostname': ['domain', 'domainOrigin', 'impactedHostName', 'originHostName'],
        'http-method': ['action', 'command'],
        'port': ['originPort', 'impactedPort'],
        'o-port': ['originPort'],
        'i-port': ['impactedPort'],
        'ip-dst': ['impactedIp', 'impactedIpV6'],
        'ip-src': ['originIp', 'originIpV6'],
        'link': ['url'],
        'mac-address': ['impactedMac', 'originMac'],
        'mime-type': ['object', 'objectName', 'objectType'],
        'mutex': ['object', 'parentProcessName', 'parentProcessPath', 'process'],
        'named pipe': ['object', 'parentProcessName', 'parentProcessPath', 'process'],
        'regkey': ['object', 'objectName'],
        'target-email': ['recipient'],
        'target-machine': ['domain', 'domainOrigin', 'impactedHostName', 'originHostName'],
        'target-user': ['login', 'account'],
        'uri': ['object', 'url', 'objectName'],
        'url': ['url'],
        'user-agent': ['userAgent'],
        'vulnerability': ['CVE', 'object'],
        'windows-scheduled-task': ['parentProcessName', 'parentProcessPath', 'process'],
        'windows-service-name': ['parentProcessName', 'parentProcessPath', 'process', 'serviceName'],
        'test': 'test',
        'test2': 9200,
        'windows-service-displayname': ['parentProcessName', 'parentProcessPath', 'process', 'serviceName']
    }

    def __init__(self, elastic_host='localhost', elastic_port=9200):
        """Store the connection settings and create the client."""
        self.elastic_host = elastic_host
        self.elastic_port = elastic_port
        self.elastic_client = Elasticsearch([{
            'host': elastic_host,
            'port': elastic_port
        }])

    def build_query(self, parameters):
        """Turn ``{attribute_type: value_or_list}`` into a query string.

        Each attribute type becomes one parenthesised group in which every
        mapped field is OR-ed with every value; the groups are AND-ed.
        List values are rendered as ``field:value``, scalar values as
        ``field: value``.

        Args:
            parameters: Dict of attribute type to a value or list of values,
                or ``None``.

        Returns:
            The query string, or ``None`` when ``parameters`` is ``None``.

        Raises:
            KeyError: If an attribute type is not in ``elastic_misp_mapping``.
        """
        if parameters is None:
            return None

        and_groups = []
        for parameter, data in parameters.items():
            terms = self.elastic_misp_mapping[parameter]
            if isinstance(terms, str):
                # A bare string is one field name, not a sequence of characters.
                terms = [terms]

            or_clauses = []
            for term in terms:
                if isinstance(data, list):
                    or_clauses.extend(term + ':' + str(value) for value in data)
                else:
                    or_clauses.append(term + ': ' + str(data))
            and_groups.append('(' + ' OR '.join(or_clauses) + ')')

        return ' AND '.join(and_groups)

    def query_ec(self,
                 str_query,
                 q_fields,
                 start_date=0,
                 end_date=0,
                 index='logs-*',
                 doc_type='logs',
                 hours=24,
                 debug=False):
        """Run ``str_query`` restricted to a ``normalDate`` range.

        Args:
            str_query: Query-string expression (e.g. from ``build_query``).
            q_fields: Value placed under ``stored_fields`` in the request.
            start_date: Range start in epoch milliseconds; 0 means "derive".
            end_date: Range end in epoch milliseconds; 0 means "derive".
            index: Index pattern to search.
            doc_type: Passed to ``ElasticQuery``.
            hours: Window length used when either date is 0; the range then
                ends now and starts ``hours`` earlier.
            debug: Currently unused; the query is always printed.

        Returns:
            ``(search_arr, hits_data)`` - the msearch header/body pair that
            was sent and the list of hits - or ``None`` when the client
            returned no responses.

        Raises:
            Exception: If ``start_date`` is greater than ``end_date``.
        """
        if start_date > end_date:
            raise Exception(
                'The start_date can\'t be greater than the end_date')

        if start_date == 0 or end_date == 0:
            now = datetime.now()
            start_date = int((now - timedelta(hours=hours)).timestamp()) * 1000
            end_date = int(now.timestamp()) * 1000

        elastic_qry = ElasticQuery(es=self.elastic_client,
                                   index=index,
                                   doc_type=doc_type)
        elastic_qry.query(
            Query.bool(must=[
                Query.query_string(str_query),
                Query.range('normalDate', gte=start_date, lte=end_date)
            ]))
        elastic_qry.aggregate(
            Aggregate.date_histogram('2', 'normalDate', '12h'))

        my_qry = elastic_qry.dict()
        my_qry['stored_fields'] = q_fields

        # The header carries the index actually searched by msearch.
        header_qry = {"index": [index], "ignore_unavailable": True}
        search_arr = [header_qry, my_qry]

        print('Elastic Query: ' + str(search_arr))
        print(
            '------------------------------------------------------------------------------------'
        )
        print('Lucene Query: ' + str_query)

        request = ''.join('%s \n' % json.dumps(each) for each in search_arr)
        resp = self.elastic_client.msearch(body=request)

        if resp is None or not resp.get('responses'):
            return None

        # An empty result has an empty hits list, so no check of hits.total
        # is needed (its shape differs between Elasticsearch versions).
        response = resp['responses'][0]
        hits_data = list(response['hits']['hits'])
        return search_arr, hits_data