########### ES ###############
# NOTE: imports inferred from usage; AWS4Auth typically comes from requests_aws4auth.
import boto3
from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth

index = 'trips'
type = 'tripcontent'
host = "search-travelgramsearch-te6bi4dkwyzudmadanw26s4yni.us-east-1.es.amazonaws.com"
region = 'us-east-1'
service = 'es'

credentials = boto3.Session().get_credentials()
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service,
                   session_token=credentials.token)
es = Elasticsearch(hosts=[{'host': host, 'port': 443}],
                   http_auth=awsauth,
                   use_ssl=True,
                   verify_certs=True,
                   connection_class=RequestsHttpConnection)


def modifyFavorite(tripid):
    # Scan the DynamoDB favorites table for users who favorited this trip.
    response = favorite_table.scan(AttributesToGet=['UserID'],
                                   ScanFilter={
                                       "TripID": {
                                           "AttributeValueList": [tripid],
                                           "ComparisonOperator": "CONTAINS"
                                       }
                                   })
    for item in response['Items']:
        user_id = item['UserID']
def create_app(env_type, enable_config_file=False):
    """
    Create the flask application and initialize its components.
    :param env_type: environment type
    :param enable_config_file: whether config files in the runtime environment may override already-loaded settings
    :return: flask application
    """
    app = create_flask_app(env_type, enable_config_file)

    # Register custom regex URL converters
    from utils.converters import register_converters
    register_converters(app)

    # Create the redis sentinel
    from redis.sentinel import Sentinel
    _sentinel = Sentinel(app.config['REDIS_SENTINELS'])
    # Obtain redis master/slave connection objects
    app.redis_master = _sentinel.master_for(app.config['REDIS_SENTINEL_SERVICE_NAME'])
    app.redis_slave = _sentinel.slave_for(app.config['REDIS_SENTINEL_SERVICE_NAME'])

    # Create the redis cluster client
    from rediscluster import StrictRedisCluster
    app.redis_cluster = StrictRedisCluster(startup_nodes=app.config['REDIS_CLUSTER'])

    # Configure the MySQL database
    from models import db
    db.init_app(app)

    # Configure logging
    from utils.logging import create_logger
    create_logger(app)

    # Rate limiter
    from utils.limiter import limiter as lmt
    lmt.init_app(app)

    # Create the Snowflake ID worker
    from utils.snowflake.id_worker import IdWorker
    app.id_worker = IdWorker(app.config['DATACENTER_ID'],
                             app.config['WORKER_ID'],
                             app.config['SEQUENCE'])

    # Create the executor
    from apscheduler.executors.pool import ThreadPoolExecutor
    executor = ThreadPoolExecutor()

    # Create the background job scheduler
    from apscheduler.schedulers.background import BackgroundScheduler
    app.scheduler = BackgroundScheduler(executors={'default': executor})

    from scheduler.cache_schedule import fix_statistic
    # Add the scheduled job: sync statistics data at 3 a.m. every day
    # app.scheduler.add_job(fix_statistic, 'cron', hour=3)
    app.scheduler.add_job(fix_statistic, 'date', args=[app])
    # Start the scheduler
    app.scheduler.start()

    # Open the grpc channel
    app.channel = grpc.insecure_channel(app.config['RPC'].RECOMMEND)

    # Create the socketio message-queue manager (requires the flask app to run
    # in production mode); messages are saved into the message queue
    import socketio
    app.siomgr = socketio.KombuManager(app.config['RABBIT_MQ'])

    # Create the es client
    from elasticsearch5 import Elasticsearch
    app.es = Elasticsearch(
        app.config['ES_HOST'],
        # sniff the es cluster before doing anything
        sniff_on_start=True,
        # refresh es node info when a cluster node fails to respond
        sniff_on_connection_fail=True,
        # and also refresh node info every 60 seconds
        sniffer_timeout=60
    )

    # Register the request hook
    from utils.middlewares import jwt_authentication
    app.before_request(jwt_authentication)

    # Register the user module blueprint
    from .resources.user import user_bp
    app.register_blueprint(user_bp)

    # Register the news module blueprint
    from .resources.news import news_bp
    app.register_blueprint(news_bp)

    # Register the search module blueprint
    from .resources.search import search_bp
    app.register_blueprint(search_bp)

    return app
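# --- Usage sketch (illustrative, not from the original source) ---
# A minimal way to boot the app built above; the 'development' env_type value
# and the run() arguments are assumptions for illustration, since the source
# does not show how create_flask_app() names its environments.
if __name__ == '__main__':
    app = create_app('development', enable_config_file=True)
    app.run(host='0.0.0.0', port=5000)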
from elasticsearch5 import Elasticsearch
import datetime

es = Elasticsearch('http://fx-elasticsearch:9200')

#doc = {
#    'author': 'kimchy',
#    'text': 'Elasticsearch: cool. bonsai cool.',
#    'timestamp': datetime.datetime.now()
#}
#res = es.index(index="test-index", doc_type="test", id=1, body=doc)
#print(res['result'])
#res = es.get(index="test-index", id=1)
#print(res['_source'])
#es.indices.refresh(index="test-index")

indexes = es.indices.get('*')
#print(indexes)

for j in range(0, 10):
    print("value of j is: ", j)
    for i in indexes:
        print(i)
        print(" ")
        res = es.search(index=i,
                        body={"query": {"match_all": {}}, "size": 1000})
class Searcher():
    """Searches papers from elasticsearch database

    Longer class information....
    Longer class information....
    """

    def __init__(self, index_name, doc_type, host='10.1.114.114', port=9200):
        """Initialize a search engine

        Args:
            host: A host name of elasticsearch
            port: A port number of elasticsearch
            index_name: name of the index you want to search
            doc_type: name of the doc_type under a certain index
        """
        self.es = Elasticsearch([{'host': host, 'port': port}])
        self.index = index_name
        self.doc_type = doc_type

    def generate_dsl(self, search_info):
        """Generate DSL given a query and search settings

        Args:
            search_info: a dict including a query and other settings.
                Note that 'query_type' must be consistent with 'match'!
                Example:
                    {
                        'query_type': 'integrated_search',
                        'query': 'attention network',
                        'match': {
                            'title': True,
                            'abstract': True,
                            'paperContent': True,
                            'videoContent': True,
                        },
                        'filter': {
                            'yearfrom': 1000,
                            'yearbefore': 3000,
                        },
                        'sort': 'year',
                        'is_filter': True,
                        'is_rescore': True,
                        'is_cited': False
                    }
                or
                    {
                        'query_type': 'advanced_search',
                        'match': {
                            'title': 'attention',
                            'abstract': 'attention',
                            'paperContent': 'attention',
                            'videoContent': None,
                        },
                        'filter': {
                            'yearfrom': 1000,
                            'yearbefore': 3000,
                        },
                        'sort': 'relevance',
                        'is_filter': False,
                        'is_rescore': True,
                        'is_cited': False
                    }

        Return:
            dsl: a dsl translated from search_info
        """
        # check search_info
        if 'integrated' in search_info['query_type']:
            assert 'query' in search_info, "Integrated search must have query !"
            assert isinstance(search_info['match']['title'], bool), "Here needs bool type !"
        else:
            # NOTE: the original checked isinstance(..., (str, None)), which raises
            # TypeError because None is not a type; type(None) is the correct spelling.
            assert isinstance(search_info['match']['title'], (str, type(None))), \
                "Here needs a string or None !"

        if search_info['is_cited'] is False:
            dsl = Vividict()
            dsl['query']['bool']['must'] = []
            dsl['query']['bool']['should'] = []
            dsl['rescore'] = []
            if 'integrated' in search_info['query_type']:
                match = self.get_integrated_match(search_info['query'], search_info['match'])
                dsl['query']['bool']['should'] = match
                if search_info['is_filter'] is True:
                    filter = self.get_filter_query(search_info['query'])
                    dsl['query']['bool']['must'].append(filter)
                if search_info['is_rescore'] is True:
                    rescore = self.get_rescore_query(match)
                    dsl['rescore'] = rescore
            else:  # 'advanced_search'
                match = self.get_advanced_match(search_info['match'])
                dsl['query']['bool']['must'] = match
                if search_info['is_rescore'] is True:
                    rescore = self.get_rescore_query(match)
                    dsl['rescore'] = rescore
            year_range = Vividict()
            year_range['range']['year']['gte'] = search_info['filter'].get('yearfrom', 1000)
            year_range['range']['year']['lte'] = search_info['filter'].get('yearbefore', 3000)
            dsl['query']['bool']['must'].append(year_range)
        else:  # cited: wrap the query in a function_score
            dsl = Vividict()
            dsl['query']['function_score']['query']['bool']['must'] = []
            dsl['query']['function_score']['query']['bool']['should'] = []
            dsl['query']['function_score']['field_value_factor'] = []
            dsl['rescore'] = []
            if 'integrated' in search_info['query_type']:
                match = self.get_integrated_match(search_info['query'], search_info['match'])
                dsl['query']['function_score']['query']['bool']['should'] = match
                cited = self.get_function_factor()
                dsl['query']['function_score']['field_value_factor'] = cited
                if search_info['is_filter'] is True:
                    filter = self.get_filter_query(search_info['query'])
                    dsl['query']['function_score']['query']['bool']['must'].append(filter)
                if search_info['is_rescore'] is True:
                    rescore = self.get_rescore_query(match)
                    dsl['rescore'] = rescore
            else:  # 'advanced_search'
                match = self.get_advanced_match(search_info['match'])
                # NOTE: the original assigned to dsl['query']['bool']['must'] here,
                # which bypasses the function_score wrapper this branch builds; the
                # year_range append below targets the function_score path, so the
                # match list must as well.
                dsl['query']['function_score']['query']['bool']['must'] = match
                if search_info['is_rescore'] is True:
                    rescore = self.get_rescore_query(match)
                    dsl['rescore'] = rescore
            year_range = Vividict()
            year_range['range']['year']['gte'] = search_info['filter'].get('yearfrom', 1000)
            year_range['range']['year']['lte'] = search_info['filter'].get('yearbefore', 3000)
            dsl['query']['function_score']['query']['bool']['must'].append(year_range)

        if search_info['sort'] == 'year':
            dsl['sort']['year'] = 'desc'
        elif search_info['sort'] == 'cited':
            dsl['sort']['cited'] = 'asc'
        return dsl

    def get_integrated_match(self, query, match):
        """Get match clauses for integrated search

        Args:
            query: query string from the user
            match: a dict containing title, abstract...

        Return:
            res: a list of match clauses
        """
        res = []
        if match['title'] or match['abstract']:
            tmp = Vividict()
            tmp['multi_match']['query'] = query
            fields = []
            if match['title']:
                fields.append('title^3')
            if match['abstract']:
                fields.append('abstract^2')
            tmp['multi_match']['fields'] = fields
            res.append(tmp)
        if match['paperContent']:
            nest = self.get_nested_query_paperContent(query)
            res.append(nest)
        if match['videoContent']:
            nest = self.get_nested_query_videoContent(query)
            res.append(nest)
        return res

    def get_advanced_match(self, match):
        """Get match clauses for advanced search

        Args:
            match: a dict containing title, abstract, paper_content...

        Return:
            res: a list of match clauses
        """
        res = []
        if match['title']:
            _match = {'match': {'title': match['title']}}
            res.append(_match)
        if match['abstract']:
            _match = {'match': {'abstract': match['abstract']}}
            res.append(_match)
        if match['paperContent']:
            nest = self.get_nested_query_paperContent(match['paperContent'])
            res.append(nest)
        if match['videoContent']:
            nest = self.get_nested_query_videoContent(match['videoContent'])
            res.append(nest)
        return res

    def get_nested_query_paperContent(self, query):
        nest = Vividict()
        nest['nested']['path'] = 'paperContent'
        nest['nested']['score_mode'] = 'max'
        tmp = Vividict()
        fields = ['paperContent.text', 'paperContent.subtitles^2', 'paperContent.subtexts']
        tmp['multi_match']['fields'] = fields
        tmp['multi_match']['query'] = query
        nest['nested']['query']['bool']['must'] = tmp
        return nest

    def get_nested_query_videoContent(self, query):
        nest = Vividict()
        nest['nested']['path'] = 'videoContent'
        nest['nested']['score_mode'] = 'max'
        tmp = Vividict()
        tmp['match']['videoContent.textEnglish'] = query
        nest['nested']['query']['bool']['must'] = tmp
        return nest

    def get_function_factor(self):
        cited = Vividict()
        cited['field'] = 'cited'
        cited['modifier'] = 'log1p'
        cited['factor'] = 0.5
        cited['missing'] = 0
        return cited

    def get_filter_query(self, query):
        filter = Vividict()
        tag_list = query.split()
        filter['terms']['abstract'] = tag_list
        return filter

    def get_rescore_query(self, match):
        rescore = Vividict()
        rescore['window_size'] = 100
        rescore['query']['rescore_query'] = match[0]
        rescore['query']['query_weight'] = 1.5
        rescore['query']['rescore_query_weight'] = 0.5
        return rescore

    def search_paper_by_name(self, search_info, only_top_k=True):
        """Search papers by name

        Args:
            search_info: the same dict as in self.generate_dsl()
            only_top_k: stop after the first page if True

        Return:
            res_list: a list of paper information
            paper_id: the ids of the returned papers
            num: the total number of matching papers
        """
        dsl = self.generate_dsl(search_info)
        result = self.es.search(index=self.index, doc_type=self.doc_type,
                                body=dsl, scroll="5m", size=100)
        # import pdb; pdb.set_trace();
        sid = result['_scroll_id']
        scroll_size = result['hits']['total']
        res_list, paper_id, num = [], [], scroll_size
        # NOTE: the original scrolled before reading any hits, silently dropping
        # the first page; collect the initial response before scrolling.
        paper, p_id, _ = self.get_paper_info(result)
        res_list += paper
        paper_id += p_id
        while scroll_size > 0 and not only_top_k:
            result = self.es.scroll(scroll_id=sid, scroll="5m")
            sid = result['_scroll_id']
            scroll_size = len(result["hits"]["hits"])
            paper, p_id, _ = self.get_paper_info(result)
            res_list += paper
            paper_id += p_id
        return res_list, paper_id, num

    def get_video_pos_by_paper_id(self, search_info, paper_id, threshold=0.6):
        """
        Args:
            search_info: the same as that in self.generate_dsl()
            paper_id: a string id, given by es

        Return:
            a list of video captions sorted by similarity between captions and query
        """
        assert isinstance(paper_id, str), "paper_id must be a string, here need only one id !"
        paper = self.es.get_source(index=self.index, doc_type=self.doc_type, id=paper_id)
        return self.get_video_pos_by_paper(search_info=search_info, paper=paper,
                                           threshold=threshold)

    def get_video_pos_by_paper(self, search_info, paper, threshold=0.6):
        """
        Args:
            paper: a dict containing title, abstract ...

        Return:
            a list of video captions sorted by similarity between captions and query
        """
        assert isinstance(paper, dict), "paper must be a dict, here need only one paper !"
        if 'integrated' in search_info['query_type']:
            query = search_info['query']
        else:
            query = search_info['match']['videoContent']
        if 'videoContent' not in paper:
            return [None]
        pos = get_video_pos(query=query, videoContent=paper['videoContent'],
                            threshold=threshold)
        return pos

    @staticmethod
    def get_paper_info(res):
        """Return raw paper info given an es search result

        Args:
            res: a dict of results from es.search

        Return:
            paper_list: a list of dicts, each storing the information of one paper
            paper_id: the ids of the papers
            num: the total number of hits
        """
        paper_list = []
        paper_id = []
        hits = res['hits']['hits']
        num = res['hits']['total']
        # import pdb; pdb.set_trace();
        for hit in hits:
            paper_list.append(hit['_source'])
            paper_id.append(hit['_id'])
        return paper_list, paper_id, num

    @staticmethod
    def remove_text_embedding(papers):
        """Remove textEmbedding entries in videoContent

        Args:
            papers: a list of papers
        """
        for paper in papers:
            if 'videoContent' in paper:
                for v in paper['videoContent']:
                    if 'textEmbedding' in v:
                        v.pop('textEmbedding')
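# --- Usage sketch (illustrative, not from the original source) ---
# A minimal integrated search against the Searcher above; the index and
# doc_type names are placeholders, and search_info mirrors the shape
# documented in generate_dsl().
if __name__ == '__main__':
    searcher = Searcher(index_name='papers', doc_type='paper')  # hypothetical names
    search_info = {
        'query_type': 'integrated_search',
        'query': 'attention network',
        'match': {'title': True, 'abstract': True,
                  'paperContent': False, 'videoContent': False},
        'filter': {'yearfrom': 2010, 'yearbefore': 2020},
        'sort': 'year',
        'is_filter': False,
        'is_rescore': False,
        'is_cited': False,
    }
    papers, paper_ids, total = searcher.search_paper_by_name(search_info)
    print(total)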
class ESStorage(Storage):
    """Elasticsearch storage backend."""

    NAME = "es"
    _MESSAGE_FIELD_NAME = "_source.message"

    def __init__(self, configuration):
        """Initialize Elasticsearch storage backend."""
        self.config = configuration
        self._connect()

    def _connect(self):
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        if len(self.config.ES_CERT_DIR) and os.path.isdir(self.config.ES_CERT_DIR):
            _LOGGER.warning(
                "Using cert and key in %s for connection to %s (verify_certs=%s)." % (
                    self.config.ES_CERT_DIR,
                    self.config.ES_ENDPOINT,
                    self.config.ES_VERIFY_CERTS,
                )
            )
            self.es = Elasticsearch(
                self.config.ES_ENDPOINT,
                use_ssl=self.config.ES_USE_SSL,
                verify_certs=self.config.ES_VERIFY_CERTS,
                client_cert=os.path.join(self.config.ES_CERT_DIR, "es.crt"),
                client_key=os.path.join(self.config.ES_CERT_DIR, "es.key"),
                timeout=60,
                max_retries=2,
            )
        else:
            _LOGGER.warning("Connecting to Elasticsearch without authentication.")
            print(self.config.ES_USE_SSL)
            self.es = Elasticsearch(
                self.config.ES_ENDPOINT,
                use_ssl=self.config.ES_USE_SSL,
                verify_certs=self.config.ES_VERIFY_CERTS,
                timeout=60,
                max_retries=2,
            )

    def _prep_index_name(self, prefix):
        # appends the current date to the index prefix
        now = datetime.datetime.now()
        date = now.strftime("%Y.%m.%d")
        index = prefix + date
        return index

    def retrieve(self, storage_attribute: ESStorageAttribute):
        """Retrieve data from ES."""
        index_in = self._prep_index_name(self.config.ES_INPUT_INDEX)

        query = {
            "sort": {"@timestamp": {"order": "desc"}},
            "query": {
                "bool": {
                    "must": [
                        {"query_string": {"analyze_wildcard": True, "query": ""}},
                        {"range": {"@timestamp": {"gte": "now-900s", "lte": "now"}}},
                    ],
                    "must_not": [],
                }
            },
        }

        _LOGGER.info(
            "Reading in max %d log entries in last %d seconds from %s",
            storage_attribute.number_of_entries,
            storage_attribute.time_range,
            self.config.ES_ENDPOINT,
        )

        query["size"] = storage_attribute.number_of_entries
        query["query"]["bool"]["must"][1]["range"]["@timestamp"]["gte"] = \
            "now-%ds" % storage_attribute.time_range
        query["query"]["bool"]["must"][0]["query_string"]["query"] = self.config.ES_QUERY

        es_data = self.es.search(index_in, body=json.dumps(query))
        if es_data["hits"]["total"] == 0:
            return pandas.DataFrame(), es_data

        # only use the _source sub-dict
        es_data = [x["_source"] for x in es_data["hits"]["hits"]]
        es_data_normalized = pandas.DataFrame(json_normalize(es_data)["message"])

        _LOGGER.info("%d logs loaded in from last %d seconds",
                     len(es_data_normalized), storage_attribute.time_range)

        self._preprocess(es_data_normalized)
        # bad solution, this is how Entry objects could come in.
        return es_data_normalized, es_data

    def store_results(self, data):
        """Store results back to ES."""
        index_out = self._prep_index_name(self.config.ES_TARGET_INDEX)
        actions = [{"_index": index_out, "_type": "log", "_source": data[i]}
                   for i in range(len(data))]
        helpers.bulk(self.es, actions, chunk_size=int(len(data) / 4) + 1)
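# --- Usage sketch (illustrative, not from the original source) ---
# Wiring the backend above to a hypothetical configuration object; the
# attribute names mirror what the class reads, but the values here are
# placeholders, and ESStorageAttribute's constructor signature is assumed.
class _DemoConfig:
    ES_ENDPOINT = "https://localhost:9200"
    ES_USE_SSL = False
    ES_VERIFY_CERTS = False
    ES_CERT_DIR = ""
    ES_INPUT_INDEX = "logstash-"
    ES_TARGET_INDEX = "anomalies-"
    ES_QUERY = "*"


storage = ESStorage(_DemoConfig())
frame, raw = storage.retrieve(
    ESStorageAttribute(number_of_entries=100, time_range=900))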
args = vars(get_args())

if not args['comparecsvs']:
    if not args['index'] or not args['rootdir']:
        print('--eshost1, --index and --rootdir cli args required (unless using --comparecsvs), use -h for help')
        sys.exit(1)

    # set up elasticsearch connections
    es = Elasticsearch(hosts=args['eshost1'],
                       port=args['esport1'],
                       http_auth=(args['esuser1'], args['espass1']),
                       connection_class=Urllib3HttpConnection,
                       timeout=config['es_timeout'],
                       maxsize=config['es_maxsize'],
                       max_retries=config['es_max_retries'],
                       retry_on_timeout=True)
    if args['eshost2']:
        es2 = Elasticsearch(hosts=args['eshost2'],
                            port=args['esport2'],
                            http_auth=(args['esuser2'], args['espass2']),
                            connection_class=Urllib3HttpConnection,
                            timeout=config['es_timeout'],
                            maxsize=config['es_maxsize'],
                            max_retries=config['es_max_retries'],
                            retry_on_timeout=True)
    else:
        es2 = es
#!/usr/bin/env python3
import os
import sys

from elasticsearch5 import Elasticsearch

es = Elasticsearch()
es_options = {
    "index": "article_test",
    "doc_type": "article"
}
query = {"query": {"match_all": {}}}
es.delete_by_query(**es_options, body=query)
# NOTE: imports inferred from usage; the original snippet did not show them.
import datetime
import json
import pickle
import re
import traceback
from datetime import date

import requests
from elasticsearch import Elasticsearch


class es02:
    def __init__(self):
        self.es = Elasticsearch([
            'https://search-el-dev-znz7hdtpcgghjcq4vatwtc3xiu.ap-northeast-2.es.amazonaws.com:443'
        ])
        self.set_service()

    def load(self, fname="es01.pkl"):
        self.data = pickle.load(open(fname, "rb"))

    def load_datas(self, start_date=date(2017, 12, 1), end_date=date(2018, 1, 9)):
        d = start_date
        delta = datetime.timedelta(days=1)
        while d <= end_date:
            es_date = d.strftime("%Y.%m.%d")
            print(es_date)
            try:
                self.load_data(es_date)
                self.parse()
                #print(es.dset)
            except Exception:
                print(traceback.format_exc())
            d += delta

    def load_data(self, dt="2018.01.08"):
        es_index = 'slowquery-' + dt
        page = self.es.search(index=es_index,
                              doc_type='elltdev',
                              body={'query': {'match_all': {}}})
        self.data = page
        # print("test")

    def load_datas2(self, start_date=date(2017, 12, 1), end_date=date(2018, 1, 9)):
        d = start_date
        delta = datetime.timedelta(days=1)
        while d <= end_date:
            es_date = d.strftime("%Y.%m.%d")
            print(es_date)
            try:
                self.load_data2(es_date)
                self.parse()
                #print(es.dset)
            except Exception:
                print("can't find data")
            d += delta

    def load_data2(self, dt):
        url = ('https://search-el-dev-znz7hdtpcgghjcq4vatwtc3xiu.ap-northeast-2.es.amazonaws.com:443'
               '/slowquery-' + dt + '/elltdev/_search')
        resp = requests.get(url=url)
        self.data = json.loads(resp.text)
        #data = {'took': 1, '_shards': {'total': 5, 'successful': 5, 'failed': 0}, 'timed_out': False, 'hits': {'max_score': 1.0, 'total': 1550, 'hits': [{'_source': {'host': 'omuser[omuser] @ [10.125.224.9] Id: 1005635', 'Rows_examined': 514, 'query': '''SELECT \t/*+ [goods-api].GoodsDetailDAO.getGdItemInfo */\t\t\t\titemT.GOODS_NO\t\t , GROUP_CONCAT(DISTINCT itemT.ITEM_NO separator ',') AS ITEM_NO\t\t , itemT.OPT_NM\t\t , itemT.OPT_VAL\t\t\t\t, optT.OPT_SEQ\t\t\t\t \t\t FROM (\t\t\t\tSELECT /*+ [goods-api].GoodsDetailDAO.getGdItemInfo */\t\t\t\t\t\tgd_item_opt.ITEM_NO\t\t\t , GOODS_NO\t\t\t\t\t\t, OPT_NM\t\t\t\t\t\t, OPT_VAL\t\t\t\t FROM gd_item , gd_item_opt\t\t\t\t WHERE gd_item_opt.ITEM_NO = gd_item.ITEM_NO\t\t\t\t ) itemT\t\t INNER JOIN gd_goods_opt optT\t ON itemT.GOODS_NO = optT.GOODS_NO\t\t AND itemT.OPT_NM = optT.OPT_NM\t\t \t\t AND optT.GOODS_NO = '1000000644'\t\t \t \t\t \t\t AND optT.OPT_SEQ = '1'\t\t GROUP BY itemT.GOODS_NO, itemT.OPT_NM, itemT.OPT_VAL, optT.OPT_SEQ;'''}}] }}

    # area for storing the parsed data
    dset = []
    dtmp = {}

    def set_service(self, svc="goods"):
        self.svc = svc

    def get_dbio(self, sql):
        pat = re.compile(r"\[\w+\-api][\w|.]+")
        m = pat.findall(sql)
        if len(m) > 0:
            return (m[0]).strip()
        else:
            return None

    def get_tables(self, sql):
        pat = re.compile(r"(?<=\W)(?:GD|AT|CC|CH|DP|ET|MB|OM|PR|ST)\_[\_\w\.]+(?=\W)", re.I)
        tables = pat.findall(sql)
        if len(sql) > 0:
            return [x.upper() for x in tables if x.find(".") == -1]
        else:
            return None

    def print_kv(self, k, v):
        if k in ['host', 'Rows_examined', 'Query_time', '@timestamp', 'service', 'Lock_time']:
            #print(k, ":", v)
            self.dtmp[k] = v
        elif k in ['query']:
            #print("dbio :", get_dbio(v))
            self.dtmp['dbio'] = self.get_dbio(v)
            #print("tables :", get_tables(v))
            self.dtmp['tables'] = self.get_tables(v)
        elif k in ['_source']:
            #print("=" * 80)
            self.print_data(v)
            if self.dtmp.get('dbio') is not None and len(self.dtmp['tables']) > 0:
                #self.dset[self.dtmp['dbio']] = self.dtmp['tables']
                if self.dtmp['service'] == self.svc:
                    self.dset.append(self.dtmp['tables'])
            self.dtmp = {}
        else:
            #print(k, ":")
            self.print_data(v)

    def print_data(self, d):
        if type(d) == dict:
            for k, v in d.items():
                self.print_kv(k, v)
        elif type(d) == list:
            for item in d:
                self.print_data(item)
        elif type(d) in [str, int, bool, float]:
            pass
        else:
            print("=" * 80)
            print(type(d))

    # calls print_data on the loaded payload, without arguments
    def parse(self):
        self.print_data(self.data)
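# --- Usage sketch (illustrative, not from the original source) ---
# Collecting the table lists used by slow queries of the "goods" service;
# the date range simply mirrors the defaults above.
if __name__ == '__main__':
    parser = es02()
    parser.set_service("goods")
    parser.load_datas(start_date=date(2017, 12, 1), end_date=date(2018, 1, 9))
    print(len(parser.dset))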
class ElasticSearchSearvice:
    # load the local instance by default
    if ENV != 'HP':
        _es = Elasticsearch(hosts='127.0.0.1:9200', sniffer_timeout=60)
    else:
        _es = Elasticsearch(hosts='192.168.31.250:9200', sniffer_timeout=60)
        # _es = Elasticsearch(hosts='192.168.31.16:9200')

    @classmethod
    def is_available(cls):
        return cls._es.ping()

    # create dsl
    # single-field match with highlighting (phrase match)
    @classmethod
    def __create_dsl_for_single_field(cls, field, text, size=None, highlight={}):
        dsl = {
            'query': {
                'match': {
                    field: text.lower()
                }
            },
            "highlight": {
                "order": "score",
                "require_field_match": False,
                "fields": {
                    field: highlight
                }
            }
        }
        if size:
            # NOTE: the original wrote dsl['query']['match']['size'], which is not
            # valid ES DSL; 'size' belongs at the top level of the request body.
            dsl['size'] = size
        return dsl

    # multi-field global search (phrase match, no single-character matching)
    @classmethod
    def __create_dsl_for_multi_fields(cls, text, fields, operator='or', size=None):
        dsl = {
            "query": {
                "multi_match": {
                    "query": text,
                    "type": "most_fields",
                    "operator": operator,
                    "fields": fields
                }
            }
        }
        if size:
            # NOTE: same top-level 'size' fix as above.
            dsl['size'] = size
        return dsl

    # single-field search using regular expressions
    @classmethod
    def __create_dsl_for_single_field_with_regex(cls, field, texts, start=0, size=3,
                                                 highlight={}):
        dsl = {
            "query": {
                "constant_score": {
                    "filter": {
                        'bool': {
                            'should': []
                        }
                    }
                }
            },
            "highlight": {
                "order": "score",
                "require_field_match": False,
                "fields": {
                    field: highlight
                }
            }
        }
        if size:
            dsl['size'] = size
        if start:
            dsl['from'] = start
        for text in texts:
            dsl['query']['constant_score']['filter']['bool']['should'].append(
                {"regexp": {
                    field: {
                        "value": '.*' + text.lower() + '.*'
                    }
                }})
        return dsl

    # multi-field, multi-value match with highlighting (regex mode, matches
    # single-character containment)
    @classmethod
    def __create_dsl_for_multi_field_with_regex(cls, fields, texts, start=0, size=3,
                                                highlight={}):
        dsl = {
            "query": {
                "constant_score": {
                    "filter": {
                        'bool': {
                            'should': []
                        }
                    }
                }
            },
            "highlight": {
                "order": "score",
                "require_field_match": False,
                "fields": {}
            }
        }
        if size:
            dsl['size'] = size
        if start:
            dsl['from'] = start
        for field in fields:
            # highlighting
            dsl['highlight']['fields'][field] = highlight
            for text in texts:
                dsl['query']['constant_score']['filter']['bool']['should'].append({
                    "regexp": {
                        field: {
                            "value": '.*' + text.lower() + '.*'
                        }
                    }
                })
        return dsl

    # exact lookup
    @classmethod
    # exact lookup by tags
    def __create_dsl_for_tags(cls, tags, type, start=0, size=3):
        dsl = {
            'from': start,
            'size': size,
            'query': {
                "constant_score": {
                    "filter": {
                        'bool': {
                            'must': [{
                                'term': {
                                    'type': type
                                }
                            }, {
                                'terms': {
                                    'tags': tags
                                }
                            }]
                        }
                    }
                }
            }
        }
        # return the tags DSL
        return dsl

    # multi-field search with highlighting
    @classmethod
    def match_fields_with_highlight(cls, fields, text, operator='or', size=None,
                                    highlight={}):
        dsl = {
            "query": {
                "multi_match": {
                    "query": text,
                    "type": "most_fields",
                    "operator": operator,
                    # "fields": fields
                }
            },
            "highlight": {
                "order": "score",
                "require_field_match": False,
                "fields": {}
            }
        }
        weights_fields = []
        for field in fields:
            # highlighting
            dsl['highlight']['fields'][field] = highlight
            # field boosts
            if field in ALLTITLES:
                field = field + '^3'
            elif field == 'tags':
                field = field + '^2'
            else:
                field = field + '^1'
            weights_fields.append(field)
        dsl['query']['multi_match']['fields'] = weights_fields
        return dsl

    @classmethod
    def __execute_dsl(cls, dsl, index):
        if index == 'apps':
            print(dsl)
            dsl['query']['constant_score']['filter']['bool']['must'] = {
                "term": {
                    "type": 'app'
                }
            }
            return cls._es.search(index='projects', doc_type='project', body=dsl)
        if index == 'modules':
            dsl['query']['constant_score']['filter']['bool']['must'] = {
                "term": {
                    "type": 'module'
                }
            }
            return cls._es.search(index='projects', doc_type='project', body=dsl)
        if index == 'datasets':
            dsl['query']['constant_score']['filter']['bool']['must'] = {
                "term": {
                    "type": 'dataset'
                }
            }
            return cls._es.search(index='projects', doc_type='project', body=dsl)
        if index == 'users':
            return cls._es.search(index='users', doc_type='user', body=dsl)
        if index == 'requests':
            return cls._es.search(index='requests', doc_type='request', body=dsl)
        if index == 'projects':
            return cls._es.search(index='projects', doc_type='project', body=dsl)
        if index == 'code_snippets':
            return cls._es.search(index='code_snippets', doc_type='code_snippet', body=dsl)

    @classmethod
    def search_title(cls, search_value=None, index=None):
        if not isinstance(search_value, list):
            search_value = [search_value]
        if index:  # a specific index
            if not search_value:
                return {'msg': 'failed'}
            fields = TABLEFORFIELDS[index]
            '''
            cls.__create_dsl_for_single_field_with_regex(field=title,
                                                         texts=search_value,
                                                         size=20,
                                                         highlight=HIGHTLIGHT)
            '''
            dsl = cls.__create_dsl_for_multi_field_with_regex(
                fields=fields,
                texts=search_value,
                start=0,
                size=3,
                highlight=HIGHTLIGHT)
            results = cls.__execute_dsl(dsl, index=index)
        else:  # all indexes
            # indexes = list(TITLESFORTABLES.values())
            results = []
            for idx in range(len(INDEXES)):
                '''
                cls.__create_dsl_for_single_field_with_regex(field=titles[title_idx],
                                                             texts=search_value,
                                                             size=3,
                                                             highlight=HIGHTLIGHT)
                '''
                fields = TABLEFORFIELDS[INDEXES[idx]]
                dsl = cls.__create_dsl_for_multi_field_with_regex(
                    fields=fields,
                    texts=search_value,
                    start=0,
                    size=3,
                    highlight=HIGHTLIGHT)
                # for index in indexes[title_idx]:
                search_result = cls.__execute_dsl(dsl, INDEXES[idx])
                results.append({INDEXES[idx]: search_result})
        # fetch the count information
        count_info = cls.count(search_value)
        return results, count_info

    @classmethod
    def __remove_for_count_api(cls, dsl):
        del dsl['highlight']
        del dsl['size']
        return dsl

    @classmethod
    def count(cls, search_text):
        # count each category separately
        # number of requests
        request_dsl = cls.__create_dsl_for_multi_field_with_regex(
            fields=['request_title', 'description'], texts=search_text)
        request_dsl = cls.__remove_for_count_api(request_dsl)
        # request_count = cls._es.count(index='requests', doc_type='request', body=request_dsl)
        request_count = {}
        total_request_count = 0
        # projects
        project_dsl = cls.__create_dsl_for_multi_field_with_regex(
            fields=['display_name', 'description'], texts=search_text)
        project_dsl = cls.__remove_for_count_api(project_dsl)
        total_project_count = 0
        # project_count = cls._es.count(index='projects', doc_type='project', body=project_dsl)
        project_count = {}
        for project_type in ['app', 'module', 'dataset']:
            project_type_dsl = copy.deepcopy(project_dsl)
            request_type_dsl = copy.deepcopy(request_dsl)
            project_type_dsl['query']['constant_score']['filter']['bool']['must'] = {
                "term": {
                    "type": project_type
                }
            }
            request_type_dsl['query']['constant_score']['filter']['bool']['must'] = {
                "term": {
                    "type": project_type
                }
            }
            project_type_count = cls._es.count(
                index='projects', doc_type='project',
                body=project_type_dsl).get('count')
            request_type_count = cls._es.count(
                index='requests', doc_type='request',
                body=request_type_dsl).get('count')
            request_count[project_type] = request_type_count
            project_count[project_type] = project_type_count
            total_request_count += request_type_count
            total_project_count += project_type_count
        # users
        user_dsl = cls.__create_dsl_for_multi_field_with_regex(
            fields=['username', 'bio'], texts=search_text)
        user_dsl = cls.__remove_for_count_api(user_dsl)
        user_count = cls._es.count(index='users', doc_type='user', body=user_dsl)
        return {
            'project_nums': total_project_count,
            'request_nums': total_request_count,
            'user_nums': user_count.get('count'),
            'projects': project_count,
            'requests': request_count
        }
    @classmethod
    def search_code_snippet(cls, fields=[], search_values=[], index=None, start=0, size=3):
        # multi-field search
        if not isinstance(fields, list):
            return {'message': "Please input the valid data"}
        # allow multi-value search
        if not isinstance(search_values, list):
            search_values = [search_values]
        # build the DSL statement
        dsl = cls.__create_dsl_for_multi_field_with_regex(fields=fields,
                                                          texts=search_values,
                                                          start=start,
                                                          size=size,
                                                          highlight=HIGHTLIGHT)
        results = cls.__execute_dsl(dsl, index=index)
        return results

    @classmethod
    def search_fields(cls, fields=[], search_values=[], index=None, start=0, size=3,
                      request_type=None):
        # multi-field search
        if not isinstance(fields, list):
            return {'message': "Please input the valid data"}
        # allow multi-value search
        if not isinstance(search_values, list):
            search_values = [search_values]
        # build the DSL statement
        dsl = cls.__create_dsl_for_multi_field_with_regex(fields=fields,
                                                          texts=search_values,
                                                          start=start,
                                                          size=size,
                                                          highlight=HIGHTLIGHT)
        # count the other categories
        count_info = cls.count(search_values)
        # execute and get the results
        # if 'request' in index and request_type:
        #     print(dsl)
        #     dsl['query']['constant_score']['filter']['bool']['must'] = {"term": {"type": request_type}}
        results = cls.__execute_dsl(dsl, index=index)
        # additional filtering may still be needed here
        return results, count_info

    @classmethod
    def search_tags(cls, tags, start=0, size=10, index='all', project_type='app'):
        dsl = cls.__create_dsl_for_tags(tags, project_type, start=start, size=size)
        if index == 'project':
            results = cls._es.search(index='projects', doc_type='project', body=dsl)
        elif index == 'request':
            results = cls._es.search(index='requests', doc_type='request', body=dsl)
        else:
            # searching everything at once is basically unused
            project_results = cls._es.search(index='projects', doc_type='project', body=dsl)
            request_results = cls._es.search(index='requests', doc_type='request', body=dsl)
            return project_results, request_results
        return results

    @classmethod
    def operation_with_index(cls, body, index, doc_type, index_id):
        cls.create_index(index=index)
        print(
            cls._es.index(index=index,
                          doc_type=doc_type,
                          body=body,
                          id=index_id,
                          refresh='true',
                          request_timeout=60))

    @classmethod
    def create_index(cls, index):
        if not cls._es.indices.exists(index):
            body = {
                "mappings": {
                    index[:-1]: {
                        "properties": {
                            "description": {
                                "type": "text",
                                "analyzer": "ik_max_word",
                                "search_analyzer": "ik_max_word"
                            }
                        }
                    }
                }
            }
            if index == 'requests':
                body['mappings'][index[:-1]]['properties']['request_title'] = {
                    "type": "text",
                    "analyzer": "ik_max_word",
                    "search_analyzer": "ik_max_word"
                }
                cls._es.indices.create(index=index, body=body, ignore=400)
            else:
                if index == 'projects':
                    body['mappings'][index[:-1]]['properties']['display_name'] = {
                        "type": "text",
                        "analyzer": "ik_max_word",
                        "search_analyzer": "ik_max_word"
                    }
                    cls._es.indices.create(index=index, body=body, ignore=400)
                elif index == 'code_snippets':
                    body['mappings'][index[:-1]]['properties']['code_name'] = {
                        "type": "text",
                        "analyzer": "ik_max_word",
                        "search_analyzer": "ik_max_word"
                    }
                    body['mappings'][index[:-1]]['properties']['code_source'] = {
                        "type": "text",
                        "analyzer": "ik_max_word",
                        "search_analyzer": "ik_max_word"
                    }
                    body['mappings'][index[:-1]]['properties']['code_tags'] = {
                        "type": "text",
                        "analyzer": "ik_max_word",
                        "search_analyzer": "ik_max_word"
                    }
                    cls._es.indices.create(index=index, body=body, ignore=400)
                else:
                    body['mappings'][index[:-1]]['properties']['username'] = {
                        "type": "text",
                        "analyzer": "standard"
                    }
                    cls._es.indices.create(index=index, body=body, ignore=400)

    @classmethod
    def delete_all(cls):
        if cls._es.indices.exists('apps'):
            cls._es.indices.delete(index='apps')
        if cls._es.indices.exists('modules'):
            cls._es.indices.delete(index='modules')
        if cls._es.indices.exists('datasets'):
            cls._es.indices.delete(index='datasets')
        if cls._es.indices.exists('projects'):
            print('deleting projects')
            cls._es.indices.delete(index='projects')
        if cls._es.indices.exists('requests'):
            print('deleting requests')
            cls._es.indices.delete(index='requests')
        if cls._es.indices.exists('users'):
            print('deleting users')
            cls._es.indices.delete(index='users')
        if cls._es.indices.exists('code_snippets'):
            print('deleting code_snippets')
            cls._es.indices.delete(index='code_snippets')

    @classmethod
    def refresh_index(cls):
        cls._es.indices.refresh()

    @classmethod
    def add_user(cls, user_ID, username, bio, avatarV, avatar_url):
        body = {
            'username': username,
            'bio': bio,
            'avatarV': avatarV,
            'avatar_url': avatar_url
        }
        cls.operation_with_index(index='users',
                                 doc_type='user',
                                 body=body,
                                 index_id=user_ID)

    @classmethod
    def add_project(cls, project_id, display_name, description, tags, project_type,
                    img_v, photo_url, username):
        body = {
            'display_name': display_name,
            'description': description,
            'tags': [tag for tag in tags],
            'type': project_type,
            'img_v': img_v,
            'photo_url': photo_url,
            'username': username
        }
        cls.operation_with_index(index='projects',
                                 doc_type='project',
                                 body=body,
                                 index_id=project_id)

    @classmethod
    def add_request(cls, request_title, description, request_type, username, request_id):
        body = {
            'request_title': request_title,
            'description': description,
            'type': request_type,
            'username': username
        }
        cls.operation_with_index(index='requests',
                                 doc_type='request',
                                 body=body,
                                 index_id=request_id)

    @classmethod
    def add_code_snippet(cls, code_snippet_id, code_name, code_des, code_tags,
                         code_source, detail_url, insert_num):
        body = {
            'code_name': code_name,
            'code_des': code_des,
            'code_tags': [tag for tag in code_tags],
            'code_source': code_source,
            'detail_url': detail_url,
            'insert_num': insert_num
            # 'key_words': [key_word for key_word in key_words],
        }
        cls.operation_with_index(index='code_snippets',
                                 doc_type='code_snippet',
                                 body=body,
                                 index_id=code_snippet_id)

    # add all public projects
    @classmethod
    def add_projects(cls):
        from server3.business.project_business import ProjectBusiness
        # privacy='public'
        projects = ProjectBusiness.repo.objects()
        projects = projects(privacy='public')
        if len(projects) <= 0:
            cls.create_index(index='projects')
        for project in projects:
            try:
                body = {
                    'display_name': project.display_name,
                    'description': project.description,
                    'tags': [tag.id for tag in project.tags],
                    'type': project.type,
                    'img_v': project.img_v if project.img_v else '',
                    'photo_url': project.photo_url if project.photo_url else '',
                    'username': project.user.username,
                    'create_time': project.create_time
                }
                if project.privacy == 'private':
                    continue
                cls.operation_with_index(index='projects',
                                         doc_type='project',
                                         body=body,
                                         index_id=project.id)
            except Exception as e:
                # project.delete()
                print('project was deleted, cannot add it to elasticsearch')
                continue

    # add all requests
    @classmethod
    def add_requests(cls):
        from server3.business.user_request_business import UserRequestBusiness
        requests = UserRequestBusiness.repo.objects()
        if len(requests) <= 0:
            cls.create_index(index='requests')
        for request in requests:
            try:
                body = {
                    'request_title': request.title,
                    'description': request.description,
                    'type': request.type,
                    'username': request.user.username,
                    'tags': [tag.id for tag in request.tags],
                    'create_time': request.create_time
                }
                cls.operation_with_index(index='requests',
                                         doc_type='request',
                                         body=body,
                                         index_id=request.id)
            except Exception as e:
                print('request was deleted, cannot add it to elasticsearch')
                continue

    @classmethod
    def add_code_snippets(cls):
        from server3.business.code_snippet_business import CodeSnippetBusiness
        code_snippets = CodeSnippetBusiness.repo.objects()
        if len(code_snippets) <= 0:
            cls.create_index(index='code_snippets')
        for code_snippet in code_snippets:
            try:
                body = {
                    'code_name': code_snippet.code_name,
                    'code_des': code_snippet.code_des,
                    'code_tags': [tag for tag in code_snippet.code_tags],
                    'code_source': code_snippet.code_source,
                    'detail_url': code_snippet.detail_url,
                    'insert_num': code_snippet.insert_num,
                    # 'key_words': [key_word for key_word in code_snippet.key_words],
                }
                cls.operation_with_index(index='code_snippets',
                                         doc_type='code_snippet',
                                         body=body,
                                         index_id=code_snippet.id)
            except Exception as e:
                print('code snippet does not exist')
                continue

    @classmethod
    # add all users
    def add_users(cls):
        from server3.business.user_business import UserBusiness
        users = UserBusiness.repo.objects()
        if len(users) <= 0:
            cls.create_index(index='users')
        for user in users:
            try:
                body = {
                    'username': user.username,
                    'bio': user.bio,
                    'avatarV': user.avatarV,
                    'avatar_url': user.avatar_url if user.avatar_url else ''
                }
                cls.operation_with_index(index='users',
                                         doc_type='user',
                                         body=body,
                                         index_id=user.user_ID)
            except Exception as e:
                print('user does not exist, cannot add to elasticsearch')
                continue

    @classmethod
    def delete_project(cls, project_id):
        try:
            print(
                cls._es.delete(index='projects',
                               doc_type='project',
                               id=project_id,
                               refresh='true',
                               ignore=['400', '404']))
        except NotFoundError as e:
            print('Elastic not found this project with id: ' + project_id)

    @classmethod
    def delete_user(cls, user_ID):
        try:
            print(
                cls._es.delete(index='users',
                               doc_type='user',
                               id=user_ID,
                               refresh='true',
                               ignore=['400', '404']))
        except NotFoundError as e:
            print('Elastic not found this user with id: ' + user_ID)

    @classmethod
    def delete_request(cls, request_id):
        try:
            print(
                cls._es.delete(index='requests',
                               doc_type='request',
                               id=request_id,
                               refresh='true',
                               ignore=['400', '404']))
        except NotFoundError as e:
            print('Elastic not found this request with id: ' + request_id)

    # add all data
    @classmethod
    def add_all(cls):
        if not cls._es.indices.exists('requests'):
            cls.create_index('requests')
            cls.add_requests()
        if not cls._es.indices.exists('projects'):
            cls.create_index('projects')
            cls.add_projects()
        if not cls._es.indices.exists('users'):
            cls.create_index('users')
            cls.add_users()
        if not cls._es.indices.exists('code_snippets'):
            cls.create_index('code_snippets')
            cls.add_code_snippets()

    @classmethod
    def clear_indices(cls):
        cls._es.indices.clear_cache()
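# --- Usage sketch (illustrative, not from the original source) ---
# A minimal title search across all indexes; INDEXES, TABLEFORFIELDS and
# HIGHTLIGHT are module-level constants the class above references, assumed
# to be defined elsewhere in this project.
results, count_info = ElasticSearchSearvice.search_title(search_value='demo')
print(count_info['project_nums'])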
class ElasticHelper(object):
    def __init__(self):
        self.es = Elasticsearch(ElasticConfig.uri)
        self._multi_search_results = []
        self.bulk_task_queue = []
        self.bulk_last_time = datetime_now_obj()

    def delay_index(self, body, index, doc_type):
        self.bulk_task_queue.append({"index": {"_index": index, "_type": doc_type}})
        self.bulk_task_queue.append(body)
        if self._can_do_bulk():
            self.bulk(body=self.bulk_task_queue, index=index, doc_type=doc_type)
            self.bulk_task_queue = []
            self.bulk_last_time = datetime_now_obj()

    def _can_do_bulk(self):
        # the task queue holds more than 100 entries
        if len(self.bulk_task_queue) > 100:
            return True
        # more than one minute has passed since the last bulk
        if get_n_min_ago(1) > self.bulk_last_time:
            return True
        return False

    def index(self, body, index, doc_type):
        self.es.index(body=body, index=index, doc_type=doc_type)

    def bulk(self, body, index, doc_type):
        self.es.bulk(body=body, index=index, doc_type=doc_type)

    def scan(self, body, index, doc_type):
        return helpers.scan(self.es,
                            query=body,
                            index=index,
                            doc_type=doc_type,
                            preserve_order=True)

    def search(self, body, index, doc_type):
        try:
            rsp = self.es.search(body=body,
                                 index=index,
                                 doc_type=doc_type,
                                 request_timeout=100)
            if rsp.get("error"):
                logger.error(rsp.get("error").get("reason"))
                return
            return rsp
        except Exception as e:
            print(body)
            logger.error("es search error: " + str(e) + index)

    def count(self, body, index, doc_type):
        return self.es.count(index=index, doc_type=doc_type, body=body,
                             request_timeout=100)

    def delete_index(self, index):
        return self.es.indices.delete(index=index)

    def put_template(self, name, body, **kwargs):
        return self.es.indices.put_template(name=name, body=body, **kwargs)

    def exists_template(self, name, **kwargs) -> bool:
        return self.es.indices.exists_template(name=name, **kwargs)

    def delete_template(self, name, **kwargs):
        return self.es.indices.delete_template(name=name, **kwargs)

    def get_template(self, name, **kwargs):
        return self.es.indices.get_template(name=name, **kwargs)

    def wait_log_in_database(self, computer_name, record_number):
        """
        Message consumption and ES indexing happen separately, so a log may be
        consumed before ES has indexed it; poll until the document shows up.
        """
        count = 0
        query = {
            "query": get_must_statement(
                get_term_statement("computer_name", computer_name),
                get_term_statement("record_number", record_number)),
            "_source": False,
            "size": 1
        }
        while True:
            try:
                rsp = self.es.search(body=query,
                                     index=ElasticConfig.event_log_index,
                                     doc_type=ElasticConfig.event_log_doc_type,
                                     request_timeout=100)
                if rsp.get("error"):
                    logger.error(rsp.get("error").get("reason"))
                    break
                if len(rsp["hits"]["hits"]) > 0:
                    return rsp["hits"]["hits"][0]["_id"]
                time.sleep(2)
                # wait at most 10 polls, i.e. 2 * 10 = 20 seconds
                if count == 10:
                    break
                count += 1
            except Exception as e:
                logger.error("es wait_log_in_database search error: " + str(e))
                break

    def multi_search(self, body, index, doc_type):
        try:
            rsp = self.es.msearch(body=body,
                                  index=index,
                                  doc_type=doc_type,
                                  request_timeout=100)
            if rsp.get("error"):
                logger.error(rsp.get("error").get("reason"))
                return
            return rsp
        except Exception as e:
            logger.error("es msearch error: " + str(e))
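# --- Usage sketch (illustrative, not from the original source) ---
# Queueing documents for batched indexing; the index and doc_type names are
# placeholders, and the bulk flush fires once the queue-size or one-minute
# threshold in _can_do_bulk() is crossed.
helper = ElasticHelper()
for i in range(150):
    helper.delay_index(body={"seq": i}, index="event_log", doc_type="doc")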
def create_app(config, enable_config_file=False):
    """
    Create the application.
    :param config: configuration object
    :param enable_config_file: whether config files in the runtime environment may override already-loaded settings
    :return: the application
    """
    app = create_flask_app(config, enable_config_file)

    # Create the Snowflake ID worker
    from utils.snowflake.id_worker import IdWorker
    app.id_worker = IdWorker(app.config['DATACENTER_ID'],
                             app.config['WORKER_ID'],
                             app.config['SEQUENCE'])

    # Rate limiter
    from utils.limiter import limiter as lmt
    lmt.init_app(app)

    # Configure logging
    from utils.logging import create_logger
    create_logger(app)

    # Register url converters
    from utils.converters import register_converters
    register_converters(app)

    from redis.sentinel import Sentinel
    _sentinel = Sentinel(app.config['REDIS_SENTINELS'])
    app.redis_master = _sentinel.master_for(app.config['REDIS_SENTINEL_SERVICE_NAME'])
    app.redis_slave = _sentinel.slave_for(app.config['REDIS_SENTINEL_SERVICE_NAME'])

    from rediscluster import StrictRedisCluster
    app.redis_cluster = StrictRedisCluster(startup_nodes=app.config['REDIS_CLUSTER'])

    # rpc
    app.rpc_reco_channel = grpc.insecure_channel(app.config['RPC'].RECOMMEND)
    app.rpc_reco = app.rpc_reco_channel

    # Elasticsearch
    app.es = Elasticsearch(
        app.config['ES'],
        # sniff before doing anything
        sniff_on_start=True,
        # refresh nodes after a node fails to respond
        sniff_on_connection_fail=True,
        # and also every 60 seconds
        sniffer_timeout=60)

    # socket.io
    # the sio mgr object publishes push-notification tasks; the socketio server
    # pops them from rabbitmq and pushes the messages
    app.sio_mgr = socketio.KombuManager(app.config['RABBITMQ'], write_only=True)

    # Initialize the MySQL database connection
    from models import db
    db.init_app(app)

    # Create the APScheduler background scheduler
    executors = {'default': ThreadPoolExecutor(10)}
    app.scheduler = BackgroundScheduler(executors=executors)

    # Add the "static" scheduled jobs
    from .schedule.statistic import fix_statistics
    # app.scheduler.add_job(fix_statistics, 'date', args=[app])
    app.scheduler.add_job(fix_statistics, 'cron', hour=3, args=[app])

    # Start the scheduler
    app.scheduler.start()

    # Deprecated: error handlers registered this way do not work with flask-restful
    # from utils.error_handlers import handle_redis_error, handler_mysql_error
    # app.register_error_handler(RedisError, handle_redis_error)
    # app.register_error_handler(SQLAlchemyError, handler_mysql_error)

    # Register the request hook
    from utils.middlewares import jwt_authentication
    app.before_request(jwt_authentication)

    # Register the user module blueprint
    from .resources.user import user_bp
    app.register_blueprint(user_bp)

    # Register the news module blueprint
    from .resources.news import news_bp
    app.register_blueprint(news_bp)

    # Register the notice module blueprint
    from .resources.notice import notice_bp
    app.register_blueprint(notice_bp)

    # Register the search module blueprint
    from .resources.search import search_bp
    app.register_blueprint(search_bp)

    return app
from elasticsearch5 import Elasticsearch

useIndex = 'tw_user_database_*'
TWEETSINDEX = "tweets_database*"

# host = "192.168.209.113"
# port = "9200"
host = "192.168.8.200"
port = "9201"
es_client = Elasticsearch([{"host": host, "port": port}])

info = es_client.info()

userid = "25073877"
body = {"query": {"match": {"user.id": userid}}}
rs = es_client.search(index=TWEETSINDEX, body=body)
print(rs)
print(type(rs))
from elasticsearch5 import Elasticsearch
from .constants import Constants
from .text import get_file_list
import json

es = Elasticsearch()


def create_index():
    """Create new index"""
    es.indices.create(Constants.INDEX_NAME, body=get_es_script('index_create'))


def delete_index():
    """Delete existing index"""
    es.indices.delete(index=Constants.INDEX_NAME, ignore=[400, 404])


def get_es_script(script_name):
    """Read es json file, return dictionary of the body"""
    with open(Constants.ES_SCRIPTS_PATH + script_name + '.json') as s:
        body = json.load(s)
    return body


def get_doc_count():
    """Retrieve document count from ES
def create_app(config, enable_config_file=False):
    """
    Create the application.
    :param config: configuration object
    :param enable_config_file: whether config files in the runtime environment may override already-loaded settings
    :return: the application
    """
    app = create_flask_app(config, enable_config_file)

    # Create the Snowflake ID worker
    from utils.snowflake.id_worker import IdWorker
    app.id_worker = IdWorker(app.config['DATACENTER_ID'],
                             app.config['WORKER_ID'],
                             app.config['SEQUENCE'])
    # To generate a distributed ID inside a view:
    # id = current_app.id_worker.get_id()

    # Rate limiter
    from utils.limiter import limiter as lmt
    lmt.init_app(app)

    # Configure logging
    from utils.logging import create_logger
    create_logger(app)

    # Register url converters
    from utils.converters import register_converters
    register_converters(app)

    from redis.sentinel import Sentinel
    _sentinel = Sentinel(app.config['REDIS_SENTINELS'])
    app.redis_master = _sentinel.master_for(app.config['REDIS_SENTINEL_SERVICE_NAME'])
    app.redis_slave = _sentinel.slave_for(app.config['REDIS_SENTINEL_SERVICE_NAME'])

    from rediscluster import StrictRedisCluster
    app.redis_cluster = StrictRedisCluster(startup_nodes=app.config['REDIS_CLUSTER'])
    # In views:
    # current_app.redis_master.set()
    # current_app.redis_cluster.get()

    # rpc
    app.rpc_reco = grpc.insecure_channel(app.config['RPC'].RECOMMEND)

    # Elasticsearch
    app.es = Elasticsearch(
        app.config['ES'],
        # sniff before doing anything
        sniff_on_start=True,
        # refresh nodes after a node fails to respond
        sniff_on_connection_fail=True,
        # and also every 60 seconds
        sniffer_timeout=60)

    # socket.io
    # app.sio = socketio.KombuManager(app.config['RABBITMQ'], write_only=True)

    # Initialize the MySQL database connection
    from models import db
    db.init_app(app)
    # db = SQLAlchemy(app)
    # db = SQLAlchemy()
    # db.init_app(app)

    # Create the scheduler object.
    # Saving the scheduler on the flask app makes it easy for views to add new
    # scheduled jobs at any time via current_app.scheduler.add_job().
    executors = {
        'default': ThreadPoolExecutor(10),
    }
    app.scheduler = BackgroundScheduler(executors=executors)

    # Jobs added here are fixed at startup and independent of view execution
    from .schedule import statistic
    # run at 3 a.m. every day
    app.scheduler.add_job(statistic.fix_statistics, 'cron', hour=3, args=[app])
    # run immediately, for easier testing
    # app.scheduler.add_job(statistic.fix_statistics, 'date', args=[app])
    # app.scheduler.add_job()
    app.scheduler.start()

    # Register the request hook
    from utils.middlewares import jwt_authentication
    app.before_request(jwt_authentication)

    # Register the user module blueprint
    from .resources.user import user_bp
    app.register_blueprint(user_bp)

    # Register the news module blueprint
    from .resources.news import news_bp
    app.register_blueprint(news_bp)

    # Register the notice module blueprint
    from .resources.notice import notice_bp
    app.register_blueprint(notice_bp)

    # Register the search module blueprint
    from .resources.search import search_bp
    app.register_blueprint(search_bp)

    return app
import os
import time

import xmltodict
from elasticsearch5 import Elasticsearch

es = Elasticsearch()

# Set the path for the XML files
path = "Parsed files/"
# Set the path for the queries txt
Qfile = "testingQueries.txt"

counter = 0
mode = "other"
# Set mode to 'all' if you want to read the XML files, turn them into
# dictionaries and upload them to elasticsearch.
# Set mode to 'other' if you have already uploaded the data and you want to
# do the queries part only.

if mode == "all":
    # Creates the index.
    # Sets the english analyzer in elasticsearch before inserting the data.
    es.indices.create(index='test',
                      ignore=400,
                      body={
                          'mappings': {
                              'project': {
                                  'properties': {
                                      'rcn': {
                                          'type': 'integer'
                                      },
                                      'acronym': {
                                          'type': 'string'
                                      },
                                      'text': {
                                          'type': 'string',
from acmappings import mappings
from elasticsearch5 import Elasticsearch
from elasticsearch5 import helpers
import os

esUrl = os.environ['esurl']
esPort = os.environ['esport']
esPass = os.environ['espass']
esUser = os.environ['esuser']
aliases = os.environ['aliases']
indices = os.environ['indices']

esObj = Elasticsearch([{"host": esUrl, "port": esPort}], http_auth=(esUser, esPass))

version = ""
oldversion = ""
with open("sacindexcreator_count", "r") as file:
    version, oldversion = file.readline().split(",")
version = str(int(version) + 1)
oldversion = str(int(oldversion) + 1)

locales = {
    "da": "danish_rebuilt", "hr": "standard", "pl": "standard", "sl": "standard",
    "el": "greek_rebuilt", "ja": "cjk_rebuilt", "ko": "cjk_rebuilt",
    "ar": "arabic_rebuilt", "de": "german_rebuilt", "zh": "cjk_rebuilt",
    "id": "indonesian_rebuilt", "th": "thai_rebuilt", "sv": "swedish_rebuilt",
    "tr": "turkish_rebuilt", "ru": "russian_rebuilt", "pt": "portuguese_rebuilt",
    "br": "brazilian_rebuilt", "it": "italian_rebuilt", "hu": "hungarian_rebuilt",
    "nl": "dutch_rebuilt", "no": "norwegian_rebuilt", "es": "spanish_rebuilt",
    "fr": "french_rebuilt", "fr_ca": "french_rebuilt", "cz": "czech_rebuilt",
    "en": "english_rebuilt"
}

if indices == "yes":
    for locale in locales:
        tempmapping = mappings
        for item in tempmapping["mappings"]["autocomplete"]["properties"]:
            if tempmapping["mappings"]["autocomplete"]["properties"][item].get("type", False):
                if tempmapping["mappings"]["autocomplete"]["properties"][item]["type"] == "text":
from elasticsearch5 import Elasticsearch  # use the class from the matching module version

# addresses of the elasticsearch cluster servers
ES = ['127.0.0.1:9200']

# create the elasticsearch client
es = Elasticsearch(
    ES,
    # sniff the es cluster before doing anything
    sniff_on_start=True,
    # refresh es node info when a cluster node fails to respond
    sniff_on_connection_fail=True,
    # refresh node info every 60 seconds
    sniffer_timeout=60)


def search():
    search_name = input("Enter the keyword to search for: ")
    query = {
        "from": 0,
        "size": 10000,  # from + size must be less than or equal to: [10000]
        "query": {
            "bool": {
                # match the title or the author
                "should": [{
                    "match_phrase": {
                        "title": search_name
                    }
                }, {
                    "match_phrase": {
import pandas as pd
from elasticsearch5 import Elasticsearch

#pid = 2337
es = Elasticsearch(hosts=ES_HOST)
count = es.count(index="prd_review")['count']


def get_mtermvectors(ids):
    body = dict()
    body["ids"] = ids
    body["parameters"] = {"fields": ["title"]}
    res = es.mtermvectors(index='prd_review', doc_type='_doc', body=body)['docs']
    return res


def get_termvectors(id):
    res = es.termvectors(index='prd_review', doc_type='_doc', id=id)['term_vectors']
    if 'title' in res.keys():
        return res
    else:
        return None


def sort_terms_vector(term_vectors):
    if not term_vectors:
        return None
    term_dict = {}
def create_app(config, enable_config_file=False):
    """
    Create the application.
    :param config: configuration object
    :param enable_config_file: whether config files in the runtime environment may override already-loaded settings
    :return: the application
    """
    app = create_flask_app(config, enable_config_file)

    # Create the Snowflake ID worker
    from utils.snowflake.id_worker import IdWorker
    app.id_worker = IdWorker(app.config['DATACENTER_ID'],
                             app.config['WORKER_ID'],
                             app.config['SEQUENCE'])

    # Rate limiter
    from utils.limiter import limiter as lmt
    lmt.init_app(app)

    # Configure logging
    from utils.logging import create_logger
    create_logger(app)

    # Register url converters
    from utils.converters import register_converters
    register_converters(app)

    from redis.sentinel import Sentinel
    _sentinel = Sentinel(app.config['REDIS_SENTINELS'])
    app.redis_master = _sentinel.master_for(app.config['REDIS_SENTINEL_SERVICE_NAME'])
    app.redis_slave = _sentinel.slave_for(app.config['REDIS_SENTINEL_SERVICE_NAME'])

    from rediscluster import StrictRedisCluster
    app.redis_cluster = StrictRedisCluster(startup_nodes=app.config['REDIS_CLUSTER'])

    # rpc
    app.rpc_reco = grpc.insecure_channel(app.config['RPC'].RECOMMEND)

    # Elasticsearch
    app.es = Elasticsearch(
        app.config['ES'],
        # sniff before doing anything
        sniff_on_start=True,
        # refresh nodes after a node fails to respond
        sniff_on_connection_fail=True,
        # and also every 60 seconds
        sniffer_timeout=60)

    # socket.io
    # app.sio = socketio.KombuManager(app.config['RABBITMQ'], write_only=True)

    # Initialize the MySQL database connection
    from models import db
    db.init_app(app)

    # Register the request hook
    from utils.middlewares import jwt_authentication
    app.before_request(jwt_authentication)

    # Register the user module blueprint
    from .resources.user import user_bp
    app.register_blueprint(user_bp)

    # Register the news module blueprint
    from .resources.news import news_bp
    app.register_blueprint(news_bp)

    # Register the notice module blueprint
    from .resources.notice import notice_bp
    app.register_blueprint(notice_bp)

    # Register the search module blueprint
    from .resources.search import search_bp
    app.register_blueprint(search_bp)

    # Define the apscheduler scheduler object.
    # It is saved on the flask app so views can add new scheduled jobs through it.
    executors = {
        'default': ThreadPoolExecutor(20),
    }
    app.scheduler = BackgroundScheduler(executors=executors)

    # Jobs managed by the scheduler come in two kinds.
    # The first kind is known up front, e.g. fixing the redis statistics;
    # those add_job calls go here.
    # app.scheduler.add_job()

    # Add the job that fixes the statistics data
    from .schedulers.statistic import fix_statistics
    # Run at 3 a.m. every day; `args` passes arguments into the job function.
    # app.scheduler.add_job(fix_statistics, 'cron', hour=3, args=[app])
    # Run immediately, for easier testing
    app.scheduler.add_job(fix_statistics, 'date', args=[app])

    # The second kind is added dynamically by view functions while flask runs,
    # via current_app.scheduler.add_job().
    app.scheduler.start()  # non-blocking; timing runs in the background, not in the flask request path

    return app
from elasticsearch5 import Elasticsearch
from elasticsearch5 import helpers
import os
import uuid
import datetime

esUrl = os.environ['esurl']
esPort = os.environ['esport']
esPass = os.environ['espass']
esUser = os.environ['esuser']
rootOrg = os.environ['rootOrg']
org = os.environ['org']

esObj = Elasticsearch([{
    "host": esUrl,
    "port": esPort
}], http_auth=(esUser, esPass))

response = esObj.search("mlsearch_*",
                        "searchresources",
                        '''{
    "size": 1000,
    "_source": ["locale","keywords","catalogPaths","name","sourceName","sourceShortName"]
}''',
                        scroll="5s")
result_pending = [response]
cnt = 1
indexDocs = []
stData = {}
while result_pending:
def create_app(config, enable_config_file=False):
    """
    Create the application.
    :param config: configuration object
    :param enable_config_file: whether a config file in the runtime environment may override the loaded configuration
    :return: the application
    """
    app = create_flask_app(config, enable_config_file)

    # Create the Snowflake ID worker
    from utils.snowflake.id_worker import IdWorker
    app.id_worker = IdWorker(app.config['DATACENTER_ID'],
                             app.config['WORKER_ID'],
                             app.config['SEQUENCE'])

    # Rate limiter
    from utils.limiter import limiter as lmt
    lmt.init_app(app)

    # Logging
    from utils.logging import create_logger
    create_logger(app)

    # Register URL converters
    from utils.converters import register_converters
    register_converters(app)

    from redis.sentinel import Sentinel
    _sentinel = Sentinel(app.config['REDIS_SENTINELS'])
    app.redis_master = _sentinel.master_for(app.config['REDIS_SENTINEL_SERVICE_NAME'])
    app.redis_slave = _sentinel.slave_for(app.config['REDIS_SENTINEL_SERVICE_NAME'])

    from rediscluster import StrictRedisCluster
    app.redis_cluster = StrictRedisCluster(startup_nodes=app.config['REDIS_CLUSTER'])

    # rpc
    # app.rpc_reco = grpc.insecure_channel(app.config['RPC'].RECOMMEND)

    # Elasticsearch
    app.es = Elasticsearch(
        app.config['ES'],
        # sniff before doing anything
        sniff_on_start=True,
        # refresh nodes after a node fails to respond
        sniff_on_connection_fail=True,
        # and also every 60 seconds
        sniffer_timeout=60)

    # socket.io
    # Create the socketio helper object that writes task messages into rabbitmq
    app.sio_mgr = socketio.KombuManager(app.config['RABBITMQ'], write_only=True)

    # Initialize the MySQL database connection
    from models import db
    db.init_app(app)

    # Add request hooks
    from utils.middlewares import jwt_authentication
    app.before_request(jwt_authentication)

    # Register the user module blueprint
    from .resources.user import user_bp
    app.register_blueprint(user_bp)
    # Register the news module blueprint
    from .resources.news import news_bp
    app.register_blueprint(news_bp)
    # Register the notification module
    from .resources.notice import notice_bp
    app.register_blueprint(notice_bp)
    # Search
    from .resources.search import search_bp
    app.register_blueprint(search_bp)

    return app
def create_app(config, enable_config_file=False):
    """
    Create the application.
    :param config: configuration object
    :param enable_config_file: whether a config file in the runtime environment may override the loaded configuration
    :return: the application
    """
    app = create_flask_app(config, enable_config_file)

    # Create the Snowflake ID worker
    from utils.snowflake.id_worker import IdWorker
    app.id_worker = IdWorker(app.config['DATACENTER_ID'],
                             app.config['WORKER_ID'],
                             app.config['SEQUENCE'])

    # Rate limiter
    from utils.limiter import limiter as lmt
    lmt.init_app(app)

    # Logging
    from utils.logging import create_logger
    create_logger(app)

    # Register URL converters
    from utils.converters import register_converters
    register_converters(app)

    from redis.sentinel import Sentinel
    _sentinel = Sentinel(app.config['REDIS_SENTINELS'])
    app.redis_master = _sentinel.master_for(app.config['REDIS_SENTINEL_SERVICE_NAME'])
    app.redis_slave = _sentinel.slave_for(app.config['REDIS_SENTINEL_SERVICE_NAME'])

    from rediscluster import StrictRedisCluster
    app.redis_cluster = StrictRedisCluster(startup_nodes=app.config['REDIS_CLUSTER'])

    # rpc
    app.rpc_reco_channel = grpc.insecure_channel(app.config['RPC'].RECOMMEND)
    # app.rpc_reco = grpc.insecure_channel(app.config['RPC'].RECOMMEND)

    # Elasticsearch
    app.es = Elasticsearch(
        app.config['ES'],
        # sniff before doing anything
        sniff_on_start=True,
        # refresh nodes after a node fails to respond
        sniff_on_connection_fail=True,
        # and also every 60 seconds
        sniffer_timeout=60)

    # socket.io
    # app.sio = socketio.KombuManager(app.config['RABBITMQ'], write_only=True)

    # Initialize the MySQL database connection
    from models import db
    db.init_app(app)

    # Add request hooks
    from utils.middleware import jwt_authorization
    app.before_request(jwt_authorization)

    # Add scheduled jobs with APScheduler
    from apscheduler.schedulers.background import BackgroundScheduler
    from apscheduler.executors.pool import ThreadPoolExecutor
    # triggers
    from apscheduler.triggers import date, interval, cron
    from toutiao.schedule.statistics import fix_statistics

    # 1. Create the executors
    executors = {
        # By default jobs run on threads drawn from a pool, at most 10 concurrently
        "default": ThreadPoolExecutor(max_workers=10)
    }
    # 2. Create the scheduler, configured with the executors
    scheduler = BackgroundScheduler(executors=executors)
    # 2.1 Save the scheduler on the app; anywhere that needs to add a
    #     `dynamic job` calls current_app.scheduler.add_job(...)
    app.scheduler = scheduler
    # 3. Add the job that corrects the statistics -- a `static job`
    # app.scheduler.add_job(func=<job function>, trigger=<trigger>, args=[<args>])
    # app.scheduler.add_job(func=fix_statistics, trigger=cron.CronTrigger(hour=4), args=[app])  # run at 4 a.m.
    # app.scheduler.add_job(func=fix_statistics, trigger="cron", hour=4, args=[app])
    app.scheduler.add_job(func=fix_statistics, trigger="date", args=[app])
    # 4. Start the scheduler
    app.scheduler.start()

    # Register the user module blueprint
    from .resources.user import user_bp
    app.register_blueprint(user_bp)
    # Register the news module blueprint
    from .resources.news import news_bp
    app.register_blueprint(news_bp)
    # Register the notification module
    from .resources.notice import notice_bp
    app.register_blueprint(notice_bp)
    # Search
    from .resources.search import search_bp
    app.register_blueprint(search_bp)

    return app
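Both create_app variants above note that "dynamic" jobs can be added from view functions through current_app.scheduler. A minimal sketch of that pattern, assuming APScheduler is wired up as above; the blueprint, route, and job function are hypothetical, not part of the original code:

from datetime import datetime, timedelta

from flask import Blueprint, current_app

demo_bp = Blueprint('demo', __name__)  # hypothetical blueprint


def remind(user_id):
    # hypothetical job body
    print('remind user', user_id)


@demo_bp.route('/remind/<int:user_id>')
def schedule_reminder(user_id):
    # Add a one-shot job to the scheduler created in create_app;
    # it runs once, 30 seconds from now, on the scheduler's thread pool.
    current_app.scheduler.add_job(
        remind,
        trigger='date',
        run_date=datetime.now() + timedelta(seconds=30),
        args=[user_id])
    return 'scheduled'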
from elasticsearch5 import Elasticsearch
from tools.config import Config
import json

config = Config("./config.yml")
index_name = config.get("index_name")
es_dir = config.get("es_dir")
es = Elasticsearch(timeout=2000)
scroll_id = ""


def get_es_script(script_name):
    """Read an es json file and return the body as a dictionary."""
    with open(es_dir + script_name + '.json') as s:
        body = json.load(s)
    return body


def search(keywords, body={}):
    """ES built-in search command.
    parameter: string keyword to search
    return: list of matched documents
    """
    global scroll_id
    print("Passed: " + keywords)
    if len(body) == 0 and keywords != "":
        # body is reassigned before being mutated, so the shared default dict is safe here
        body = get_es_script('search')
        body['query']['match']['text'] = keywords
    # The original snippet is truncated here; a minimal completion that runs
    # the query and remembers the scroll id (the '1m' scroll window is an assumption):
    res = es.search(index=index_name, body=body, scroll='1m')
    scroll_id = res.get('_scroll_id', "")
    return res['hits']['hits']
# [ASCII-art StockSight banner string omitted; the flattened original only
#  preserved its tail. It renders the logo, the legend ":) = +$  :( = -$",
#  the version, and a link to the project site. A simplified stand-in:]
banner = """%sstocksight v%s   :) = +$   :( = -$
Join the StockSight website https://stocksight.diskoverspace.com\033[0m""" % (color, STOCKSIGHT_VERSION)
print(banner + '\n')

if not args.noelasticsearch:
    # create instance of elasticsearch
    es = Elasticsearch(
        hosts=[{'host': elasticsearch_host, 'port': elasticsearch_port}],
        http_auth=(elasticsearch_user, elasticsearch_password))

    # set up elasticsearch mappings and create index
    mappings = {
        "mappings": {
            "tweet": {
                "properties": {
                    "author": {
                        "type": "string",
                        "fields": {
                            "keyword": {
                                "type": "keyword"
                            }
                        }
                    }
                    # the original excerpt is truncated here; the remaining
                    # field mappings are omitted
                }
            }
        }
    }
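The comment above says the mappings are used to create the index, but the excerpt stops before that call. A sketch of the step that plausibly follows, assuming the index is named "stocksight" (the name is not in the excerpt); ignore=400 suppresses the error when the index already exists:

es.indices.create(index="stocksight", body=mappings, ignore=400)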
import os
import json
from pathlib import Path

import click
from elasticsearch5 import Elasticsearch, helpers

from utils import convert_message
from config import INDEX_NAME, TYPE_NAME

ELASTICSEARCH_URL = os.environ['ELASTICSEARCH_URL']
es = Elasticsearch([ELASTICSEARCH_URL])


@click.group()
def cmd():
    pass


def parse_file(fname, channel):
    # Read a JSON export, normalize each message, and tag it with its channel
    results = []
    for data in json.load(fname.open()):
        data = convert_message(data)
        if data is None:
            continue
        data['channel'] = channel
        results.append(data)
    return results
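helpers, INDEX_NAME, and TYPE_NAME are imported above but unused in the excerpt. A hedged sketch of a click subcommand that would use them to bulk-index parsed messages; the command name, option, and default are assumptions, not part of the original:

@cmd.command()
@click.argument('path', type=click.Path(exists=True))
@click.option('--channel', default='general')  # hypothetical option
def index(path, channel):
    # Parse a JSON export and bulk-index the resulting documents.
    docs = parse_file(Path(path), channel)
    actions = [
        {
            '_index': INDEX_NAME,
            '_type': TYPE_NAME,
            '_source': doc,
        }
        for doc in docs
    ]
    helpers.bulk(es, actions)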