def find_es(idx_name, domain, status):
    # Match documents for the given host and status seen in the last two minutes.
    body = {
        "query": {
            "bool": {
                "filter": {"range": {"@timestamp": {"gt": "now-2m"}}},
                "must": [
                    # {"match": {"http_host": "flight.01zhuanche.com"}},
                    # {"match": {"status": "404"}},
                    {"match": {"http_host": domain}},
                    {"match": {"status": status}},
                ],
            }
        }
    }
    es = Elasticsearch(hosts='http://10.66.5.28:9200', timeout=300)
    es_data = es.search(index=idx_name, body=body)
    print(es_data)
    print(es_data["hits"]["hits"][0]['_source']['remote_addr'])
# Imports used by this handler (AWS4Auth is provided by the requests_aws4auth package).
import json

import boto3
from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth


def lambda_handler(event, context):
    # Amazon Elasticsearch Service endpoint,
    # e.g. my-test-domain.us-east-1.es.amazonaws.com
    host = 'vpc-photos-djta6afabridi6a46k6zz6mq44.us-east-1.es.amazonaws.com'
    service = 'es'
    credentials = boto3.Session().get_credentials()
    awsauth = AWS4Auth(credentials.access_key, credentials.secret_key,
                       'us-east-1', service, session_token=credentials.token)
    es = Elasticsearch(hosts=[{'host': host, 'port': 443}],
                       http_auth=awsauth,
                       use_ssl=True,
                       verify_certs=True,
                       connection_class=RequestsHttpConnection)
    res = es.search(index="photos", doc_type="photos",
                    body={"query": {"match_all": {}}})
    print(json.dumps(res["hits"]["hits"], indent=2))
    return {'statusCode': 200, 'body': json.dumps('Hello from Lambda!')}
def Get_Error(stime, dtime, idx_name, domain, status):
    # Earlier draft of this query, kept commented out for reference:
    # body = {
    #     "size": 10000,
    #     "sort": {"@timestamp": {"order": "desc", "unmapped_type": "boolean"}},
    #     "_source": {"excludes": []},
    #     "stored_fields": ["*"],
    #     "docvalue_fields": ["@timestamp"],
    #     "query": {
    #         "constant_score": {
    #             "filter": {
    #                 "bool": {
    #                     # "must": {"exists": {"field": "%s" % field_name}},
    #                     # "must_not": {"term": {"%s.keyword" % field_name: ""}},
    #                     # {"match": {"http_host": domain}},
    #                     # {"match": {"status": status}},
    #                     "must": {"range": {"@timestamp": {"gte": stime, "lte": dtime}}}
    #                 }
    #             }
    #         }
    #     }
    # }
    body = {
        "size": 10000,
        "query": {
            "bool": {
                "filter": {"range": {"@timestamp": {"gte": stime, "lte": dtime}}},
                "must": [
                    # {"match": {"http_host": "flight.01zhuanche.com"}},
                    # {"match": {"status": "404"}},
                    {"match_phrase": {"http_host": domain}},
                    {"match": {"status": status}},
                ],
            }
        }
    }
    es = Elasticsearch(hosts='http://10.66.5.28:9200', timeout=300)
    es_data = es.search(index=idx_name, body=body)
    print(es_data)
def find_es(stime, dtime, idx_name, domain, status):
    body = {
        "size": 10000,
        "query": {
            "bool": {
                "filter": {"range": {"@timestamp": {"gte": stime, "lte": dtime}}},
                "must": [
                    {"match_phrase": {"http_host": domain}},
                    {"match": {"status": status}},
                ],
            }
        }
    }
    es = Elasticsearch(hosts='http://10.66.5.28:9200', timeout=300)
    es_data = es.search(index=idx_name, body=body)
    print(es_data)
    print('Type Es_Data: %s' % type(es_data))
    print(es_data["hits"]["hits"][0]['_source']['remote_addr'])
    return es_data
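# A minimal usage sketch for find_es() above. The index pattern, domain and
# status value are hypothetical examples, not values taken from the snippet;
# the time bounds use Elasticsearch date math accepted by range queries.
from elasticsearch import Elasticsearch  # required by find_es above

if __name__ == '__main__':
    hits = find_es(stime='now-15m', dtime='now',
                   idx_name='nginx-access-*',   # hypothetical index pattern
                   domain='example.com',        # hypothetical http_host
                   status='404')
    for hit in hits["hits"]["hits"]:
        print(hit['_source']['remote_addr'])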
class ESClient(DBClient):

    @property
    def parameter_definitions(self):
        return query_parameter_mappers

    def setup_connection(self):
        self.store = Elasticsearch(hosts=[self.endpoint],
                                   http_auth=(self.user, self.passwd))

    def get_content(self, mapped_params):
        query_body = get_content.build_query_body(**mapped_params)
        es_response = self.query(query_body, index='pips', doc_type='clip', scroll='1m')
        clips = map_hits_to_api_spec(es_response)
        return {'results': clips}

    def get_item(self, mapped_params):
        query_body = get_item.build_query_body(**mapped_params)
        es_response = self.query(query_body, index='pips', doc_type='clip', scroll='1m')
        clips = map_hits_to_api_spec(es_response)
        if len(clips) == 0:
            raise NoResultsFoundError(f'No results for URI: {mapped_params}')
        return clips[0]

    def get_similar(self, mapped_params):
        query_body = get_similar.build_query_body(**mapped_params)
        es_response = self.query(query_body, index='pips', doc_type='clip', scroll='1m')
        clips = map_hits_to_api_spec(es_response)
        return {'results': clips}

    def query(self, query, **params):
        return self.store.search(body=query, **params)

    def close_connection(self):
        # Connection cleanup is handled by garbage collection.
        pass
class ElasticHelper(object):

    def __init__(self):
        self.es = Elasticsearch(ElasticConfig.uri)

    def index(self, body, index, doc_type):
        self.es.index(body=body, index=index, doc_type=doc_type)

    def bulk(self, body, index, doc_type):
        self.es.bulk(body=body, index=index, doc_type=doc_type)

    def scan(self, body, index, doc_type):
        return helpers.scan(self.es, query=body, index=index,
                            doc_type=doc_type, preserve_order=True)

    def search(self, body, index, doc_type):
        try:
            rsp = self.es.search(body=body, index=index, doc_type=doc_type,
                                 request_timeout=100)
            # Search responses report failures under "error", not "errors".
            if rsp.get("error"):
                print("es search error")
                return
            return rsp
        except Exception as e:
            print("es search error: " + str(e))

    def count(self, body, index, doc_type):
        return self.es.count(index=index, doc_type=doc_type, body=body,
                             request_timeout=100)

    def delete_index(self, index):
        return self.es.indices.delete(index=index)
"excludes": [] }, "aggs": { "aggdata": { "terms": { "field": "myxxid.keyword", "size": 100, "order": { "_count": "desc" } } } } } resp = es.search(index='logstash-*', body=query) aggs['views'] = resp['aggregations']['aggdata']['buckets'] # get shares query = { "query": { "bool": { "must": [{ "query_string": { "query": "share", "analyze_wildcard": True } }, { "range": { "@timestamp": { "gte": epoch(date),
class es02:

    def __init__(self):
        self.es = Elasticsearch([
            'https://search-el-dev-znz7hdtpcgghjcq4vatwtc3xiu.ap-northeast-2.es.amazonaws.com:443'
        ])
        self.set_service()

    def load(self, fname="es01.pkl"):
        self.data = pickle.load(open(fname, "rb"))

    def load_datas(self, start_date=date(2017, 12, 1), end_date=date(2018, 1, 9)):
        d = start_date
        delta = datetime.timedelta(days=1)
        while d <= end_date:
            es_date = d.strftime("%Y.%m.%d")
            print(es_date)
            try:
                self.load_data(es_date)
                self.parse()
                # print(es.dset)
            except Exception:
                print(traceback.format_exc())
            d += delta

    def load_data(self, dt="2018.01.08"):
        es_index = 'slowquery-' + dt
        page = self.es.search(index=es_index, doc_type='elltdev',
                              body={'query': {'match_all': {}}})
        self.data = page

    def load_datas2(self, start_date=date(2017, 12, 1), end_date=date(2018, 1, 9)):
        d = start_date
        delta = datetime.timedelta(days=1)
        while d <= end_date:
            es_date = d.strftime("%Y.%m.%d")
            print(es_date)
            try:
                self.load_data2(es_date)
                self.parse()
                # print(es.dset)
            except Exception:
                print("can't find data")
            d += delta

    def load_data2(self, dt):
        url = ('https://search-el-dev-znz7hdtpcgghjcq4vatwtc3xiu.ap-northeast-2.es.amazonaws.com:443'
               '/slowquery-' + dt + '/elltdev/_search')
        resp = requests.get(url=url)
        self.data = json.loads(resp.text)
        # Sample response kept for reference:
        # data = {'took': 1, '_shards': {'total': 5, 'successful': 5, 'failed': 0}, 'timed_out': False, 'hits': {'max_score': 1.0, 'total': 1550, 'hits': [{'_source': {'host': 'omuser[omuser] @ [10.125.224.9] Id: 1005635', 'Rows_examined': 514, 'query': '''SELECT \t/*+ [goods-api].GoodsDetailDAO.getGdItemInfo */\t\t\t\titemT.GOODS_NO\t\t , GROUP_CONCAT(DISTINCT itemT.ITEM_NO separator ',') AS ITEM_NO\t\t , itemT.OPT_NM\t\t , itemT.OPT_VAL\t\t\t\t, optT.OPT_SEQ\t\t\t\t \t\t FROM (\t\t\t\tSELECT /*+ [goods-api].GoodsDetailDAO.getGdItemInfo */\t\t\t\t\t\tgd_item_opt.ITEM_NO\t\t\t , GOODS_NO\t\t\t\t\t\t, OPT_NM\t\t\t\t\t\t, OPT_VAL\t\t\t\t FROM gd_item , gd_item_opt\t\t\t\t WHERE gd_item_opt.ITEM_NO = gd_item.ITEM_NO\t\t\t\t ) itemT\t\t INNER JOIN gd_goods_opt optT\t ON itemT.GOODS_NO = optT.GOODS_NO\t\t AND itemT.OPT_NM = optT.OPT_NM\t\t \t\t AND optT.GOODS_NO = '1000000644'\t\t \t \t\t \t\t AND optT.OPT_SEQ = '1'\t\t GROUP BY itemT.GOODS_NO, itemT.OPT_NM, itemT.OPT_VAL, optT.OPT_SEQ;'''}}] }}

    # Storage area for parsed results.
    dset = []
    dtmp = {}

    def set_service(self, svc="goods"):
        self.svc = svc

    def get_dbio(self, sql):
        pat = re.compile(r"\[\w+\-api][\w|.]+")
        m = pat.findall(sql)
        if len(m) > 0:
            return m[0].strip()
        else:
            return None

    def get_tables(self, sql):
        pat = re.compile(r"(?<=\W)(?:GD|AT|CC|CH|DP|ET|MB|OM|PR|ST)\_[\_\w\.]+(?=\W)", re.I)
        tables = pat.findall(sql)
        if len(sql) > 0:
            return [x.upper() for x in tables if x.find(".") == -1]
        else:
            return None

    def print_kv(self, k, v):
        if k in ['host', 'Rows_examined', 'Query_time', '@timestamp', 'service', 'Lock_time']:
            # print(k, ":", v)
            self.dtmp[k] = v
        elif k in ['query']:
            # print("dbio :", get_dbio(v))
            self.dtmp['dbio'] = self.get_dbio(v)
            # print("tables :", get_tables(v))
            self.dtmp['tables'] = self.get_tables(v)
        elif k in ['_source']:
            # print("=" * 80)
            self.print_data(v)
            if self.dtmp['dbio'] is not None and len(self.dtmp['tables']) > 0:
                # self.dset[self.dtmp['dbio']] = self.dtmp['tables']
                if self.dtmp['service'] == self.svc:
                    self.dset.append(self.dtmp['tables'])
            self.dtmp = {}
        else:
            # print(k, ":")
            self.print_data(v)

    def print_data(self, d):
        if type(d) == dict:
            for k, v in d.items():
                self.print_kv(k, v)
        elif type(d) == list:
            for item in d:
                self.print_data(item)
        elif type(d) in [str, int, bool, float]:
            pass
        else:
            print("=" * 80)
            print(type(d))

    # Helper that runs print_data on the loaded data without arguments.
    def parse(self):
        self.print_data(self.data)
    'date': None,
    'item': None,
    'default_brand': None,
    'scraped_brand': None,
    'ingprod': None,
    'recipeimpressions': 0,
    'cartimpressions': 0,
    'favs': 0,
    'addToCarts': 0,
    'prints': 0,
    'shares': 0,
    'ingsToCarts': 0
}

for metric in metrics:
    resp = es.search(index='logstash-*', body=elastic_query(metric))
    aggs[metric] = resp['aggregations']['aggdata']['buckets']
    if metric == "ingsToCarts":
        reportData['ingsToCarts'] = resp['hits']['total']
    else:
        ids = [i['key'] for i in aggs[metric]]
        mongo_query = {
            "myxxid": {"$in": ids},
            "ingredients_edited.mappedingredient": item
        }
        docs = list(coll.find(mongo_query))
        df1 = pd.DataFrame(docs)
        df2 = pd.DataFrame(aggs[metric])
# The rest of the lines have no special characters (we cut 4 slots for the tag)
if counter == 0:
    tag = line[:5]
    tag = tag[1:4]
    line = line[5:]
    counter = 1
else:
    tag = line[:4]
    tag = tag[:3]
    line = line[4:]

# We need k+1 results for each test
result20 = es.search(index='test', doc_type='project',
                     body={'query': {'match': {'text': line}}, 'size': 21})
result30 = es.search(index='test', doc_type='project',
                     body={'query': {'match': {'text': line}}, 'size': 31})
print(tag + " - 20:")
esPort = os.environ['esport']
esPass = os.environ['espass']
esUser = os.environ['esuser']
rootOrg = os.environ['rootOrg']
org = os.environ['org']

esObj = Elasticsearch([{"host": esUrl, "port": esPort}],
                      http_auth=(esUser, esPass))

# Initial scroll request: fetch the first page of documents.
response = esObj.search(
    "mlsearch_*",
    "searchresources",
    '''{
        "size": 1000,
        "_source": ["locale", "keywords", "catalogPaths", "name", "sourceName", "sourceShortName"]
    }''',
    scroll="5s")

result_pending = [response]
cnt = 1
indexDocs = []
stData = {}
while result_pending:
    print("OBJECT NUMBER " + str(cnt))
    cnt += 1
    curr_obj = result_pending.pop()
    scroll_id = curr_obj.get("_scroll_id")
    for hit in curr_obj["hits"]["hits"]:
        hitSource = hit["_source"]
        for key, val in hitSource.items():
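# A minimal, self-contained sketch of draining an Elasticsearch scroll cursor,
# shown here because the fragment above captures _scroll_id but never requests
# the next page. The host, index pattern, page size and scroll window below are
# assumptions for illustration, not values taken from the snippet.
from elasticsearch import Elasticsearch

es_sketch = Elasticsearch([{"host": "localhost", "port": 9200}])  # hypothetical endpoint
page = es_sketch.search(index="mlsearch_*",
                        body={"size": 1000, "query": {"match_all": {}}},
                        scroll="5s")
scroll_id = page["_scroll_id"]
while page["hits"]["hits"]:
    for hit in page["hits"]["hits"]:
        pass  # process hit["_source"] here
    # Fetch the next page and carry the scroll id forward.
    page = es_sketch.scroll(scroll_id=scroll_id, scroll="5s")
    scroll_id = page.get("_scroll_id", scroll_id)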
        return None
    term_dict = {}
    for term, val in term_vectors[0].items():
        for pos_info in val['tokens']:
            term_dict[pos_info['position']] = term
    sorted_terms = sorted(term_dict.items())
    sorted_terms = [tup[1] for tup in sorted_terms]
    return sorted_terms


if __name__ == '__main__':
    count_list = [x for x in range(0, count, 10000)]
    count_list.append(count)
    results = list()
    results.append(es.search(index='prd_review', size=10000, scroll='1m'))
    scroll_id = results[0]['_scroll_id']
    results = results[0]['hits']['hits']
    for _ in range(count // 10000):
        results.extend(es.scroll(scroll_id=scroll_id, scroll='1m')['hits']['hits'])
    results = [result['_source'] for result in results]
    data = []
    for result in results:
        data.append({})
        data[-1]['m_id'] = result['message_id']
        data[-1]['score'] = result['prd_satisfact']
        data[-1]['cus_grade'] = result['cus_grade']
        data[-1]['best_flag'] = result['best_flag']
from elasticsearch5 import Elasticsearch

useIndex = 'tw_user_database_*'
TWEETSINDEX = "tweets_database*"
# host = "192.168.209.113"
# port = "9200"
host = "192.168.8.200"
port = "9201"
es_client = Elasticsearch([{"host": host, "port": port}])
info = es_client.info()

userid = "25073877"
body = {"query": {"match": {"user.id": userid}}}
rs = es_client.search(index=TWEETSINDEX, body=body)
print(rs)
print(type(rs))
class ESStorage(Storage):
    """Elasticsearch storage backend."""

    NAME = "es"
    _MESSAGE_FIELD_NAME = "_source.message"

    def __init__(self, configuration):
        """Initialize Elasticsearch storage backend."""
        self.config = configuration
        self._connect()

    def _connect(self):
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        if len(self.config.ES_CERT_DIR) and os.path.isdir(self.config.ES_CERT_DIR):
            _LOGGER.warn(
                "Using cert and key in %s for connection to %s (verify_certs=%s)." % (
                    self.config.ES_CERT_DIR,
                    self.config.ES_ENDPOINT,
                    self.config.ES_VERIFY_CERTS,
                ))
            self.es = Elasticsearch(
                self.config.ES_ENDPOINT,
                use_ssl=self.config.ES_USE_SSL,
                verify_certs=self.config.ES_VERIFY_CERTS,
                client_cert=os.path.join(self.config.ES_CERT_DIR, "es.crt"),
                client_key=os.path.join(self.config.ES_CERT_DIR, "es.key"),
                timeout=60,
                max_retries=2,
            )
        else:
            _LOGGER.warn("Connecting to Elasticsearch without authentication.")
            print(self.config.ES_USE_SSL)
            self.es = Elasticsearch(
                self.config.ES_ENDPOINT,
                use_ssl=self.config.ES_USE_SSL,
                verify_certs=self.config.ES_VERIFY_CERTS,
                timeout=60,
                max_retries=2,
            )

    def _prep_index_name(self, prefix):
        # Append the current date to the index prefix.
        now = datetime.datetime.now()
        date = now.strftime("%Y.%m.%d")
        index = prefix + date
        return index

    def retrieve(self, time_range: int, number_of_entries: int, false_data=None):
        """Retrieve data from ES."""
        index_in = self._prep_index_name(self.config.ES_INPUT_INDEX)
        query = {
            "sort": {"@timestamp": {"order": "desc"}},
            "query": {
                "bool": {
                    "must": [
                        {"query_string": {"analyze_wildcard": True, "query": ""}},
                        {"range": {"@timestamp": {"gte": "now-900s", "lte": "now"}}},
                    ],
                    "must_not": [],
                }
            },
        }
        _LOGGER.info(
            "Reading in max %d log entries in last %d seconds from %s",
            number_of_entries,
            time_range,
            self.config.ES_ENDPOINT,
        )
        query["size"] = number_of_entries
        query["query"]["bool"]["must"][1]["range"]["@timestamp"]["gte"] = "now-%ds" % time_range
        query["query"]["bool"]["must"][0]["query_string"]["query"] = self.config.ES_QUERY
        es_data = self.es.search(index_in, body=json.dumps(query))
        if es_data["hits"]["total"] == 0:
            return pandas.DataFrame(), es_data
        # Only use the _source sub-dict.
        es_data = [x["_source"] for x in es_data["hits"]["hits"]]
        es_data_normalized = pandas.DataFrame(json_normalize(es_data)["message"])
        _LOGGER.info("%d logs loaded in from last %d seconds",
                     len(es_data_normalized), time_range)
        self._preprocess(es_data_normalized)
        # bad solution, this is how Entry objects could come in.
        return es_data_normalized, es_data

    def store_results(self, data):
        """Store results back to ES."""
        index_out = self._prep_index_name(self.config.ES_TARGET_INDEX)
        actions = [{"_index": index_out, "_type": "log", "_source": data[i]}
                   for i in range(len(data))]
        helpers.bulk(self.es, actions, chunk_size=int(len(data) / 4) + 1)
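# A minimal sketch of driving ESStorage above. It assumes the rest of the
# project is importable (Storage base class with _preprocess, _LOGGER, pandas,
# json_normalize). The Config class is a hypothetical stand-in whose attribute
# names mirror what ESStorage reads; every value below is an assumption.
class Config:
    ES_ENDPOINT = "https://localhost:9200"  # hypothetical endpoint
    ES_USE_SSL = True
    ES_VERIFY_CERTS = False
    ES_CERT_DIR = ""                        # empty: connect without client certificates
    ES_INPUT_INDEX = "logstash-"            # date suffix appended by _prep_index_name
    ES_TARGET_INDEX = "anomalies-"
    ES_QUERY = "*"


storage = ESStorage(Config())
frame, raw = storage.retrieve(time_range=900, number_of_entries=500)
print(len(frame), "log messages loaded")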
class Searcher():
    """Searches papers from the elasticsearch database."""

    def __init__(self, index_name='paperdb', doc_type='papers', host='10.1.114.114', port=9200):
        """Initialize a search engine.

        Args:
            host: host name of elasticsearch
            port: port number of elasticsearch
            index_name: name of the index you want to search
            doc_type: name of the doc_type under that index
        """
        self.es = Elasticsearch([{'host': host, 'port': port}])
        self.index = index_name
        self.doc_type = doc_type

    def generate_dsl(self, search_info):
        """Generate DSL given a query and search settings.

        Args:
            search_info: a dict including a query and other settings.
                Note that 'query_type' must be consistent with 'match'!
                Example:
                {
                    'query_type': 'integrated_search',
                    'query': 'attention network',
                    'match': {'title': True, 'abstract': True,
                              'paperContent': True, 'videoContent': True},
                    'filter': {'yearfrom': 1000, 'yearbefore': 3000},
                    'sort': 'year',
                    'is_filter': True, 'is_rescore': True, 'is_cited': False
                }
                or
                {
                    'query_type': 'advanced_search',
                    'match': {'title': 'attention', 'abstract': 'attention',
                              'paperContent': 'attention', 'videoContent': None},
                    'filter': {'yearfrom': 1000, 'yearbefore': 3000},
                    'sort': 'relevance',
                    'is_filter': False, 'is_rescore': True, 'is_cited': False
                }

        Return:
            dsl: a DSL dict translated from search_info
        """
        # check search_info
        if 'integrated' in search_info['query_type']:
            assert 'query' in search_info, "Integrated search must have query !"
            assert isinstance(search_info['match']['title'], bool), "Here needs bool type !"
        else:
            assert isinstance(search_info['match']['title'], (str, type(None))), \
                "Here needs a string or None !"

        if search_info['is_cited'] is False:
            dsl = Vividict()
            dsl['query']['bool']['must'] = []
            dsl['query']['bool']['should'] = []
            dsl['rescore'] = []
            if 'integrated' in search_info['query_type']:
                match = self.get_integrated_match(search_info['query'], search_info['match'])
                dsl['query']['bool']['should'] = match
                if search_info['is_filter'] is True:
                    filter = self.get_filter_query(search_info['query'])
                    dsl['query']['bool']['must'].append(filter)
                if search_info['is_rescore'] is True:
                    rescore = self.get_rescore_query(match)
                    dsl['rescore'] = rescore
            else:  # 'advanced_search'
                match = self.get_advanced_match(search_info['match'])
                dsl['query']['bool']['must'] = match
                if search_info['is_rescore'] is True:
                    rescore = self.get_rescore_query(match)
                    dsl['rescore'] = rescore
            year_range = Vividict()
            year_range['range']['year']['gte'] = search_info['filter'].get('yearfrom', 1000)
            year_range['range']['year']['lte'] = search_info['filter'].get('yearbefore', 3000)
            dsl['query']['bool']['must'].append(year_range)
        else:  # cited: wrap the query in a function_score
            dsl = Vividict()
            dsl['query']['function_score']['query']['bool']['must'] = []
            dsl['query']['function_score']['query']['bool']['should'] = []
            dsl['query']['function_score']['field_value_factor'] = []
            dsl['rescore'] = []
            if 'integrated' in search_info['query_type']:
                match = self.get_integrated_match(search_info['query'], search_info['match'])
                dsl['query']['function_score']['query']['bool']['should'] = match
                cited = self.get_function_factor()
                dsl['query']['function_score']['field_value_factor'] = cited
                if search_info['is_filter'] is True:
                    filter = self.get_filter_query(search_info['query'])
                    dsl['query']['function_score']['query']['bool']['must'].append(filter)
                if search_info['is_rescore'] is True:
                    rescore = self.get_rescore_query(match)
                    dsl['rescore'] = rescore
            else:  # 'advanced_search'
                match = self.get_advanced_match(search_info['match'])
                # must clauses belong under function_score in this branch
                dsl['query']['function_score']['query']['bool']['must'] = match
                if search_info['is_rescore'] is True:
                    rescore = self.get_rescore_query(match)
                    dsl['rescore'] = rescore
            year_range = Vividict()
            year_range['range']['year']['gte'] = search_info['filter'].get('yearfrom', 1000)
            year_range['range']['year']['lte'] = search_info['filter'].get('yearbefore', 3000)
            dsl['query']['function_score']['query']['bool']['must'].append(year_range)

        if search_info['sort'] == 'year':
            dsl['sort']['year'] = 'desc'
        elif search_info['sort'] == 'cited':
            dsl['sort']['cited'] = 'asc'
        return dsl

    def get_integrated_match(self, query, match):
        """Get match clauses for integrated search.

        Args:
            query: query string from the user
            match: a dict containing title, abstract, ...

        Return:
            res: a list of match clauses
        """
        res = []
        if match['title'] or match['abstract']:
            tmp = Vividict()
            tmp['multi_match']['query'] = query
            fields = []
            if match['title']:
                fields.append('title^3')
            if match['abstract']:
                fields.append('abstract^2')
            tmp['multi_match']['fields'] = fields
            res.append(tmp)
        if match['paperContent']:
            nest = self.get_nested_query_paperContent(query)
            res.append(nest)
        if match['videoContent']:
            nest = self.get_nested_query_videoContent(query)
            res.append(nest)
        if match['authors']:
            nest = self.get_nested_query_authors(query)
            res.append(nest)
        return res

    def get_advanced_match(self, match):
        """Get match clauses for advanced search.

        Args:
            match: a dict containing title, abstract, paper_content, ...

        Return:
            res: a list of match clauses
        """
        res = []
        if match['title']:
            res.append({'match': {'title': match['title']}})
        if match['abstract']:
            res.append({'match': {'abstract': match['abstract']}})
        if match['paperContent']:
            res.append(self.get_nested_query_paperContent(match['paperContent']))
        if match['videoContent']:
            res.append(self.get_nested_query_videoContent(match['videoContent']))
        if match['authors']:
            res.append(self.get_nested_query_authors(match['authors']))
        return res

    def get_nested_query_authors(self, query):
        nest = Vividict()
        nest['nested']['path'] = 'authors'
        nest['nested']['score_mode'] = 'max'
        tmp = Vividict()
        fields = ['authors.firstName', 'authors.lastName']
        tmp['multi_match']['fields'] = fields
        tmp['multi_match']['query'] = query
        nest['nested']['query']['bool']['must'] = tmp
        return nest

    def get_nested_query_paperContent(self, query):
        nest = Vividict()
        nest['nested']['path'] = 'paperContent'
        nest['nested']['score_mode'] = 'max'
        tmp = Vividict()
        fields = ['paperContent.text', 'paperContent.subtitles^2', 'paperContent.subtexts']
        tmp['multi_match']['fields'] = fields
        tmp['multi_match']['query'] = query
        nest['nested']['query']['bool']['must'] = tmp
        return nest

    def get_nested_query_videoContent(self, query):
        nest = Vividict()
        nest['nested']['path'] = 'videoContent'
        nest['nested']['score_mode'] = 'max'
        tmp = Vividict()
        tmp['match']['videoContent.textEnglish'] = query
        nest['nested']['query']['bool']['must'] = tmp
        return nest

    def get_function_factor(self):
        cited = Vividict()
        cited['field'] = 'cited'
        cited['modifier'] = 'log1p'
        cited['factor'] = 0.5
        cited['missing'] = 0
        return cited

    def get_filter_query(self, query):
        filter = Vividict()
        tag_list = []
        word_list = query.split()
        for word in word_list:
            tag_list.append(word.capitalize())
            tag_list.append(word.lower())
        filter['terms']['abstract'] = tag_list
        return filter

    def get_rescore_query(self, match):
        rescore = Vividict()
        rescore['window_size'] = 100
        rescore['query']['rescore_query'] = match[0]
        rescore['query']['query_weight'] = 1.5
        rescore['query']['rescore_query_weight'] = 0.5
        return rescore

    def search_paper_by_name(self, search_info, only_top_k=True, size=100):
        """Search papers.

        Args:
            search_info: the same dict as in self.generate_dsl()

        Return:
            paper_list: a list of paper information
            paper_id: a list of paper ids
            paper_num: the number of returned papers
        """
        dsl = self.generate_dsl(search_info)
        result = self.es.search(index=self.index, doc_type=self.doc_type, body=dsl, size=size)
        return self.get_paper_info(result)

    def get_video_pos_by_paper_id(self, search_info, paper_id, threshold=0.8):
        """
        Args:
            search_info: the same as that in self.generate_dsl()
            paper_id: a string id assigned by es

        Return:
            a video captions' list sorted by similarity between captions and query
        """
        assert isinstance(paper_id, str), "paper_id must be a string, here need only one id !"
        paper = self.es.get_source(index=self.index, doc_type=self.doc_type, id=paper_id)
        return self.get_video_pos_by_paper(search_info=search_info, paper=paper, threshold=threshold)

    def get_video_pos_by_paper(self, search_info, paper, threshold=0.8):
        """
        Args:
            paper: a dict containing title, abstract, ...

        Return:
            a video captions' list sorted by similarity between captions and query
        """
        assert isinstance(paper, dict), "paper must be a dict, here need only one paper !"
        if 'integrated' in search_info['query_type']:
            query = search_info['query']
        else:
            query = search_info['match']['videoContent']
        assert (query is not None)
        if 'videoContent' not in paper:
            return [None]
        pos = self.get_video_pos(query=query, videoContent=paper['videoContent'], threshold=threshold)
        return pos

    @staticmethod
    def get_paper_info(res):
        """Return raw paper info given an es search result.

        Args:
            res: a dict of results from es.search

        Return:
            paper_list: a list of dicts, each storing information of one paper
            paper_id: a list of paper ids
            num: length of paper_list
        """
        paper_list = []
        paper_id = []
        hits = res['hits']['hits']
        num = res['hits']['total']
        # import pdb; pdb.set_trace()
        for hit in hits:
            paper_list.append(hit['_source'])
            paper_id.append(hit['_id'])
        return paper_list, paper_id, num

    @staticmethod
    def remove_text_embedding(papers):
        """Remove textEmbedding entries from videoContent.

        Args:
            papers: a list of papers
        """
        for paper in papers:
            if 'videoContent' in paper:
                for v in paper['videoContent']:
                    if 'textEmbedding' in v:
                        v.pop('textEmbedding')

    @staticmethod
    def get_video_pos(query, videoContent, threshold=0.8):
        """Return a list of video captions related to the user's query.

        Args:
            query: English query text
            videoContent: a list of video caption information
            threshold: captions whose similarity score is > threshold are returned

        Return:
            res_list: a video captions' list sorted by similarity between captions and query
        """
        emd_list = [v.pop('textEmbedding') for v in videoContent]
        sim_list = test_similarity(query, emd_list)
        if sim_list == '__ERROR__':
            return sim_list
        res_list = []
        for s, v in zip(sim_list, videoContent):
            v['score'] = s
            if v['score'] > threshold:
                res_list.append(v)
            elif query in v['textEnglish']:
                res_list.append(v)
        # print('query:' + query)
        # pprint(res_list)
        return res_list
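# A minimal sketch of driving the Searcher class above with an integrated
# search. It assumes Elasticsearch and Vividict are importable as in the class;
# the query text, match flags and year filter are illustrative assumptions only.
searcher = Searcher(index_name='paperdb', doc_type='papers')
search_info = {
    'query_type': 'integrated_search',
    'query': 'attention network',
    'match': {'title': True, 'abstract': True, 'paperContent': False,
              'videoContent': False, 'authors': False},
    'filter': {'yearfrom': 2015, 'yearbefore': 2020},
    'sort': 'year',
    'is_filter': False,
    'is_rescore': False,
    'is_cited': False,
}
papers, paper_ids, total = searcher.search_paper_by_name(search_info, size=10)
print(total, 'papers matched;', len(papers), 'returned')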
# res = es.get(index="test-index", id=1)
# print(res['_source'])
# es.indices.refresh(index="test-index")

indexes = es.indices.get('*')
# print(indexes)
for j in range(0, 10):
    print("value of j is: ", j)
    for i in indexes:
        print(i)
        print(" ")
        res = es.search(index=i, body={"query": {"match_all": {}}, "size": 1000})
        # res = es.search(index="fx-testsuite-responses", body={"query": {"match_all": {}}, "size": 1000})
        # print(res)
        # print("Got %d Hits:" % res['hits']['total']['value'])
        for hit in res['hits']['hits']:
            # print("%(timestamp)s %(author)s: %(text)s" % hit["_source"])
            # es.index(index="fx-testsuite-responses", body={hit})
            a = hit["_source"]
            e = es.index(index=i, doc_type="test", body=a)
            # e = es.index(index="fx-testsuite-responses", doc_type="test", body=a)
fd2 = open("es_results_20_b.txt", 'w', encoding='utf-8') line = fd.readline() # For each line-query it sends a search request to elasticsearch while line: # Removes the tag (Q#) from the line-query # We cut 4 slots for the tag tag = line[:4] tag = tag[:3] line = line[4:] # We need 21 results if part == "a": result = es.search(index='test', doc_type='project', body={ 'query': { 'match': { 'text': line } }, 'size': 21 }) elif part == "b": result = es.search(index='test2', doc_type='project', body={ 'query': { 'match': { 'text': line } }, 'size': 21 })
    sorted_terms = sorted(term_dict.items())
    sorted_terms = [tup[1] for tup in sorted_terms]
    return sorted_terms


df = pd.read_pickle(dataset_dir + 'df_product_dataset.pkl')
df.pid = df.pid.str.strip()
df['term_vectors'] = None
# df = dd.from_pandas(df, npartitions=50)

count_list = list(range(0, len(df), 10000)) + [len(df)]
sorted_term_vectors = list()
sorted_term_vectors.append(
    es.search(index='nori_with_adjv',
              size=10000,
              scroll='1m',
              filter_path=['hits.hits._source.sorted_term', 'hits.hits._source.pid']))
scroll_id = sorted_term_vectors[0]['_scroll_id']

# def gen_bulk_2(pid, sorted_term):
#     _head = {"update": {"_id": pid, "_type": "_doc", "_index": conf.es_adjv_index, "retry_on_conflict": 3}}
# =======
# # print(''' get_parsed_token and upload sorted term vectors ''')
# def get_mtermvectors(ids):
#     body = dict()
#     body['ids'] = ids
#     body['parameters'] = {"fields": ["product"]}
#     # TODO ES_INDEX : conf.es_nouns_index or conf.es_adjv_index
#     res = es.mtermvectors(index=conf.es_nouns_index, doc_type='_doc', body=body)['docs']
class EsClientConnection:
    host = ''
    errorMessage = ''

    def __init__(self, host, index=None, type=None, body=None):
        '''
        Both index and type must exist (or be created) when the connection is built.
        :param host:
        :param index:
        :param type:
        :param body:
        '''
        self.host = host
        self.conn = Elasticsearch([self.host])
        # Initialize the mapping, i.e. create the index if needed.
        indexExists = self.conn.indices.exists(index=index)
        typeExists = self.conn.indices.exists_type(index=index, doc_type=type)
        if body is not None:
            if indexExists is not True:
                if typeExists is not True:
                    self.conn.indices.create(index=index, body=body)
                else:
                    self.errorMessage = 'index does not exist but type exists: this is not possible!'
            else:
                if typeExists is not True:
                    self.errorMessage = 'index exists but type does not exist'
                else:
                    self.errorMessage = 'index and type both exist; no need to create them'

    def __del__(self):
        self.close()

    def check(self):
        '''
        Return information about the current ES cluster.
        :return:
        '''
        return self.conn.info()

    def insertDocument(self, index, type, body, id=None):
        '''
        Insert one document (body) into the given index and type.
        An id may be supplied; if omitted, ES generates one automatically.
        :param index: target index
        :param type: target type
        :param body: document to insert -> dict
        :param id: custom id value
        :return:
        '''
        return self.conn.index(index=index, doc_type=type, body=body, id=id)

    def insertDataFrame(self, index, type, dataFrame):
        '''
        Bulk-insert interface.
        The bulk API expects a list shaped like [{optionType: {condition}}, {data}],
        where optionType is one of index/delete/update, condition can set the index
        and type for each row, and data is the single document to insert/update.
        :param index: default index to insert into
        :param type: default type to insert into
        :param dataFrame: DataFrame holding the rows to insert
        :return:
        '''
        dataList = dataFrame.to_dict(orient='records')
        insertHeadInfoList = [{"index": {}} for i in range(len(dataList))]
        temp = [dict] * (len(dataList) * 2)
        temp[::2] = insertHeadInfoList
        temp[1::2] = dataList
        try:
            return self.conn.bulk(index=index, doc_type=type, body=temp)
        except Exception as e:
            return str(e)

    def deleteDocById(self, index, type, id):
        '''
        Delete the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.delete(index=index, doc_type=type, id=id)

    def deleteDocByQuery(self, index, query, type=None):
        '''
        Delete all documents under the index that match the query.
        :param index:
        :param query: query in DSL format
        :param type:
        :return:
        '''
        return self.conn.delete_by_query(index=index, body=query, doc_type=type)

    def deleteAllDocByIndex(self, index, type=None):
        '''
        Delete all documents under the given index.
        :param index:
        :return:
        '''
        try:
            query = {'query': {'match_all': {}}}
            return self.conn.delete_by_query(index=index, body=query, doc_type=type)
        except Exception as e:
            return str(e) + ' -> ' + index

    def searchDoc(self, index=None, type=None, body=None):
        '''
        Search all matching documents under the index.
        :param index:
        :param type:
        :param body: filter statement in DSL format
        :return:
        '''
        return self.conn.search(index=index, doc_type=type, body=body)

    def getDocById(self, index, type, id):
        '''
        Get the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.get(index=index, doc_type=type, id=id)

    def updateDocById(self, index, type, id, body=None):
        '''
        Update the document identified by index, type and id.
        :param index:
        :param type:
        :param id:
        :param body: values to update
        :return:
        '''
        return self.conn.update(index=index, doc_type=type, id=id, body=body)

    def close(self):
        if self.conn is not None:
            try:
                self.conn.close()
            except Exception:
                pass
            finally:
                self.conn = None

    def mysqlToEs(self, mysqlData):
        doc = []
        for value in mysqlData:
            doc.append({"index": {}})
            doc.append(value)
        self.conn.bulk(index='product', doc_type='tour_product', body=doc)
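# A minimal usage sketch for EsClientConnection above. The host, index name,
# type name and mapping body are hypothetical values for illustration, and the
# typed mapping format assumes a pre-7.x cluster like the class itself does.
import pandas as pd

mapping = {"mappings": {"tour_product": {"properties": {"name": {"type": "text"}}}}}
client = EsClientConnection('http://localhost:9200',  # hypothetical host
                            index='product',
                            type='tour_product',
                            body=mapping)
if client.errorMessage:
    print(client.errorMessage)
df = pd.DataFrame([{"name": "demo product", "price": 10}])
print(client.insertDataFrame('product', 'tour_product', df))
print(client.searchDoc('product', 'tour_product', {'query': {'match_all': {}}}))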
def db_es():
    es = Elasticsearch(
        ['113.107.166.14'],
        # http_auth=('elastic', 'passwd'),
        port=19200)
    # platID=1&roomID=2384875205&startTime=1561482671&endTime=1561523909
    # query = {'query': {'match_all': {}}}
    # query = {
    #     "size": 10000,
    #     "query": {
    #         "match": {
    #             "platform_id": "2"
    #             # "timestamp": "1561735836"
    #         }
    #     }
    # }
    # query = {
    #     "size": 10000,
    #     "query": {
    #         "bool": {
    #             "must": [
    #                 {"match": {"platform_id": "20"}},
    #                 {"match": {"room_id": "281147838"}},
    #                 {"match": {"gift_type": "0"}},
    #                 {"range": {"timestamp": {"lte": "1562638920"}}},  # gte, lte
    #                 # {"sort": {"from_id": {"order by": "desc"}}}
    #             ]
    #         }
    #     }
    # }
    query = {
        "size": 10,
        "query": {
            "bool": {
                "must": [
                    {"match": {"platform_id": "1"}},
                    # {"match": {"from_id": "cc_1333_38802060_1477136798"}},
                    {"match": {"gift_name": ""}},
                    {"match": {"gift_type": "1"}},
                    # {"range": {"count": {"gte": 2}}},  # gte, lte
                    # {"range": {"timestamp": {"gte": "1563798600"}}},  # gte, lte; "lte": "1563728700", "gte": "1562567226"
                ]
            }
        }
    }
    # query = {
    #     "query": {
    #         "terms": {
    #             "room_id": ["432863", "432863"]
    #         }
    #     }
    # }
    # res = es.get(index="liveshow-2018-07-20", doc_type='gift', id='1')
    # Fetch all matching data.
    # res = es.search(index='liveshow-online-page-2019-06-29', doc_type='page', body=query)
    res = es.search(index='xiaohulu-liveshow-2019-08-26', doc_type='gift', body=query)
    return res
from elasticsearch5 import Elasticsearch
import chardet
import pickle

es = Elasticsearch([
    'https://search-el-dev-znz7hdtpcgghjcq4vatwtc3xiu.ap-northeast-2.es.amazonaws.com:443'
])
print(es.info())

page = es.search(index='slowquery-2018.01.09', doc_type='elltdev',
                 body={'query': {'match_all': {}}})
pickle.dump(page, open('es01.pkl', 'wb'))

# f = open("es01.json", 'w')
# data = str(page)
# print(chardet.detect(page))
# f.write(data)
# f.close()
class ElasticHelper(object):

    def __init__(self):
        self.es = Elasticsearch(ElasticConfig.uri)
        self._multi_search_results = []
        self.bulk_task_queue = []
        self.bulk_last_time = datetime_now_obj()

    def delay_index(self, body, index, doc_type):
        # Queue the action/body pair; flush with one bulk request once the
        # queue is large enough or enough time has passed.
        self.bulk_task_queue.append({"index": {"_index": index, "_type": doc_type}})
        self.bulk_task_queue.append(body)
        if self._can_do_bulk():
            self.bulk(body=self.bulk_task_queue, index=index, doc_type=doc_type)
            self.bulk_task_queue = []
            self.bulk_last_time = datetime_now_obj()

    def _can_do_bulk(self):
        # Flush if more than 100 entries are queued.
        if len(self.bulk_task_queue) > 100:
            return True
        # Flush if more than one minute has passed since the last bulk write.
        if get_n_min_ago(1) > self.bulk_last_time:
            return True
        return False

    def index(self, body, index, doc_type):
        self.es.index(body=body, index=index, doc_type=doc_type)

    def bulk(self, body, index, doc_type):
        self.es.bulk(body=body, index=index, doc_type=doc_type)

    def scan(self, body, index, doc_type):
        return helpers.scan(self.es, query=body, index=index, doc_type=doc_type,
                            preserve_order=True)

    def search(self, body, index, doc_type):
        try:
            rsp = self.es.search(body=body, index=index, doc_type=doc_type,
                                 request_timeout=100)
            if rsp.get("error"):
                logger.error(rsp.get("error").get("reason"))
                return
            return rsp
        except Exception as e:
            print(body)
            logger.error("es search error: " + str(e) + index)

    def count(self, body, index, doc_type):
        return self.es.count(index=index, doc_type=doc_type, body=body,
                             request_timeout=100)

    def delete_index(self, index):
        return self.es.indices.delete(index=index)

    def put_template(self, name, body, **kwargs):
        return self.es.indices.put_template(name=name, body=body, **kwargs)

    def exists_template(self, name, **kwargs) -> bool:
        return self.es.indices.exists_template(name=name, **kwargs)

    def delete_template(self, name, **kwargs):
        return self.es.indices.delete_template(name=name, **kwargs)

    def get_template(self, name, **kwargs):
        return self.es.indices.get_template(name=name, **kwargs)

    def wait_log_in_database(self, computer_name, record_number):
        """
        Message-queue consumption and ES indexing run separately, so a log may be
        consumed before it has been written to ES; poll until it shows up.
        """
        count = 0
        query = {
            "query": get_must_statement(
                get_term_statement("computer_name", computer_name),
                get_term_statement("record_number", record_number)),
            "_source": False,
            "size": 1
        }
        while True:
            try:
                rsp = self.es.search(body=query,
                                     index=ElasticConfig.event_log_index,
                                     doc_type=ElasticConfig.event_log_doc_type,
                                     request_timeout=100)
                if rsp.get("error"):
                    logger.error(rsp.get("error").get("reason"))
                    break
                if len(rsp["hits"]["hits"]) > 0:
                    return rsp["hits"]["hits"][0]["_id"]
                time.sleep(2)
                # Poll at most 10 times, i.e. about 2 * 10 = 20 seconds.
                if count == 10:
                    break
                count += 1
            except Exception as e:
                logger.error("es wait_log_in_database search error: " + str(e))
                break

    def multi_search(self, body, index, doc_type):
        try:
            rsp = self.es.msearch(body=body, index=index, doc_type=doc_type,
                                  request_timeout=100)
            if rsp.get("error"):
                logger.error(rsp.get("error").get("reason"))
                return
            return rsp
        except Exception as e:
            logger.error("es msearch error: " + str(e))
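# A minimal sketch of using ElasticHelper above for delayed bulk indexing.
# ElasticConfig, logger, datetime_now_obj and get_n_min_ago are expected to be
# provided by the surrounding project; the index name, doc_type and documents
# below are illustrative assumptions only.
helper = ElasticHelper()
for i in range(250):
    helper.delay_index(body={"event_id": i, "message": "example log line"},
                       index="event-log-demo",
                       doc_type="doc")
# Any leftover entries below the flush threshold can be flushed manually:
if helper.bulk_task_queue:
    helper.bulk(body=helper.bulk_task_queue, index="event-log-demo", doc_type="doc")
    helper.bulk_task_queue = []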