Example #1
import datetime
import json
import pickle
import re
import traceback
from datetime import date

import requests
from elasticsearch import Elasticsearch


class es02:
    def __init__(self):
        self.es = Elasticsearch(['https://search-el-dev-znz7hdtpcgghjcq4vatwtc3xiu.ap-northeast-2.es.amazonaws.com:443'])
        self.set_service()
        pass
    
    def load(self,fname="es01.pkl"):
        self.data = pickle.load( open( fname, "rb" ))

    def load_datas(self,start_date=date(2017, 12, 1),end_date=date(2018,1,9)):
        d = start_date
        delta = datetime.timedelta(days=1)
        while d <= end_date:
            es_date = d.strftime("%Y.%m.%d")
            print(es_date)
            try:
                self.load_data(es_date)
                self.parse()
                #print(es.dset)            
            except Exception:
                print(traceback.format_exc())
            d += delta

    def load_data(self,dt="2018.01.08"):
        es_index = 'slowquery-'+dt
        page = self.es.search(
            index = es_index,
            doc_type = 'elltdev',
            body = { 
                'query' : { 'match_all' : {}}
            }
        )
        self.data = page
    # print("test")


    def load_datas2(self,start_date=date(2017, 12, 1),end_date=date(2018,1,9)):
        d = start_date
        delta = datetime.timedelta(days=1)
        while d <= end_date:
            es_date = d.strftime("%Y.%m.%d")
            print(es_date)
            try:
                self.load_data2(es_date)
                self.parse()
                #print(es.dset)            
            except Exception:
                print("can't find data")
            d += delta

    def load_data2(self, dt):
        url = 'https://search-el-dev-znz7hdtpcgghjcq4vatwtc3xiu.ap-northeast-2.es.amazonaws.com:443/slowquery-'+dt+'/elltdev/_search'
        resp = requests.get(url=url)
        self.data = json.loads(resp.text)

    #data = {'took': 1, '_shards': {'total': 5, 'successful': 5, 'failed': 0}, 'timed_out': False, 'hits': {'max_score': 1.0, 'total': 1550, 'hits': [{'_source': {'host': 'omuser[omuser] @  [10.125.224.9]  Id: 1005635', 'Rows_examined': 514, 'query': '''SELECT \t/*+ [goods-api].GoodsDetailDAO.getGdItemInfo */\t\t\t\titemT.GOODS_NO\t\t        , GROUP_CONCAT(DISTINCT itemT.ITEM_NO separator ',') AS ITEM_NO\t\t        , itemT.OPT_NM\t\t        , itemT.OPT_VAL\t\t\t\t, optT.OPT_SEQ\t\t\t\t \t\t  FROM (\t\t\t\tSELECT /*+ [goods-api].GoodsDetailDAO.getGdItemInfo */\t\t\t\t\t\tgd_item_opt.ITEM_NO\t\t\t            , GOODS_NO\t\t\t\t\t\t, OPT_NM\t\t\t\t\t\t, OPT_VAL\t\t\t\t  FROM gd_item , gd_item_opt\t\t\t\t WHERE gd_item_opt.ITEM_NO = gd_item.ITEM_NO\t\t\t\t ) itemT\t\t INNER JOIN gd_goods_opt optT\t        ON itemT.GOODS_NO = optT.GOODS_NO\t\t   AND itemT.OPT_NM = optT.OPT_NM\t\t \t\t   AND optT.GOODS_NO = '1000000644'\t\t   \t \t\t    \t\t   AND optT.OPT_SEQ = '1'\t\t GROUP BY itemT.GOODS_NO, itemT.OPT_NM, itemT.OPT_VAL, optT.OPT_SEQ;'''}}] }}

    # Area for accumulating parsed data
    dset = []
    dtmp = {}

    def set_service(self,svc="goods"):
        self.svc = svc

    def get_dbio(self,sql):
        pat = re.compile(r"\[\w+-api\][\w.]+")
        m = pat.findall(sql)
        if  len(m) > 0 :
            return (m[0]).strip()
        else:
            return None

    def get_tables(self,sql):
        pat = re.compile(r"(?<=\W)(?:GD|AT|CC|CH|DP|ET|MB|OM|PR|ST)_[_\w.]+(?=\W)", re.I)
        tables = pat.findall(sql)
        if len(tables) > 0:
            return [x.upper() for x in tables if x.find(".") == -1]
        else:
            return None

    def print_kv(self,k,v):
        if ( k in ['host','Rows_examined','Query_time','@timestamp','service','Lock_time']):
            #print(k,":",v)
            self.dtmp[k] = v
        elif ( k in ['query']):
            #print("dbio :", get_dbio(v))
            self.dtmp['dbio'] = self.get_dbio(v)
            #print("tables :",get_tables(v))
            self.dtmp['tables'] = self.get_tables(v)
        elif ( k in ['_source']):
            #print("="*80)
            self.print_data(v)
            if self.dtmp.get('dbio') is not None and len(self.dtmp['tables']) > 0:
                #self.dset[self.dtmp['dbio']] = self.dtmp['tables']
                if self.dtmp.get('service') == self.svc:
                    self.dset.append(self.dtmp['tables'])
            self.dtmp = {}
        else:
            #print(k,":")
            self.print_data(v)


    def print_data(self, d):
        if isinstance(d, dict):
            for k, v in d.items():
                self.print_kv(k, v)
        elif isinstance(d, list):
            for item in d:
                self.print_data(item)
        elif isinstance(d, (str, int, bool, float)):
            pass
        else:
            print("=" * 80)
            print(type(d))

    # Calls print_data without arguments.
    def parse(self):
        self.print_data(self.data)
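
A minimal usage sketch of the es02 class above (an added illustration; the dates are arbitrary):

es = es02()
es.load_datas(start_date=date(2017, 12, 1), end_date=date(2017, 12, 3))
print(len(es.dset))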
Example #2
import boto3
from elasticsearch import Elasticsearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth

########### ES ###############
index = 'trips'
type = 'tripcontent'
host = "search-travelgramsearch-te6bi4dkwyzudmadanw26s4yni.us-east-1.es.amazonaws.com"
region = 'us-east-1'
service = 'es'
credentials = boto3.Session().get_credentials()
awsauth = AWS4Auth(credentials.access_key,
                   credentials.secret_key,
                   region,
                   service,
                   session_token=credentials.token)
es = Elasticsearch(hosts=[{
    'host': host,
    'port': 443
}],
                   http_auth=awsauth,
                   use_ssl=True,
                   verify_certs=True,
                   connection_class=RequestsHttpConnection)


# NOTE: favorite_table is assumed to be a boto3 DynamoDB Table resource,
# e.g. boto3.resource('dynamodb').Table('Favorite'); the table name is a guess.
def modifyFavorite(tripid):
    response = favorite_table.scan(AttributesToGet=['UserID'],
                                   ScanFilter={
                                       "TripID": {
                                           "AttributeValueList": [tripid],
                                           "ComparisonOperator": "CONTAINS"
                                       }
                                   })
    for item in response['Items']:
        user_id = item['UserID']
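
A minimal usage sketch of the signed client configured above, listing a few documents from the trips index (an added illustration; assumes the index exists):

res = es.search(index=index, doc_type=type, body={'query': {'match_all': {}}, 'size': 5})
for hit in res['hits']['hits']:
    print(hit['_id'], hit['_source'])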
Example #3
import grpc


# create_flask_app and the utils.* / scheduler.* / models imports below are
# project-local modules.
def create_app(env_type, enable_config_file=False):
    """
    Create the flask application and initialize its components.

    :param env_type: environment type
    :param enable_config_file: whether config files in the runtime environment may
        override already-loaded settings
    :return: the flask application
    """
    app = create_flask_app(env_type, enable_config_file)

    # register the custom regex URL converters
    from utils.converters import register_converters
    register_converters(app)

    # create the redis sentinel
    from redis.sentinel import Sentinel
    _sentinel = Sentinel(app.config['REDIS_SENTINELS'])
    # get the redis master/slave connection objects
    app.redis_master = _sentinel.master_for(app.config['REDIS_SENTINEL_SERVICE_NAME'])
    app.redis_slave = _sentinel.slave_for(app.config['REDIS_SENTINEL_SERVICE_NAME'])

    # create the redis cluster client
    from rediscluster import StrictRedisCluster
    app.redis_cluster = StrictRedisCluster(startup_nodes=app.config['REDIS_CLUSTER'])

    # configure the MySQL database
    from models import db
    db.init_app(app)

    # configure logging
    from utils.logging import create_logger
    create_logger(app)

    # rate limiter
    from utils.limiter import limiter as lmt
    lmt.init_app(app)

    # create the Snowflake ID worker
    from utils.snowflake.id_worker import IdWorker
    app.id_worker = IdWorker(app.config['DATACENTER_ID'],
                             app.config['WORKER_ID'],
                             app.config['SEQUENCE'])

    # create the executor
    from apscheduler.executors.pool import ThreadPoolExecutor
    executor = ThreadPoolExecutor()
    # create the background job scheduler
    from apscheduler.schedulers.background import BackgroundScheduler
    app.scheduler = BackgroundScheduler(executors={'default': executor})
    from scheduler.cache_schedule import fix_statistic
    # add the job that syncs statistics (the cron variant runs daily at 3 a.m.)
    # app.scheduler.add_job(fix_statistic, 'cron', hour=3)
    app.scheduler.add_job(fix_statistic, 'date', args=[app])
    # start the scheduler
    app.scheduler.start()

    # open the grpc channel
    app.channel = grpc.insecure_channel(app.config['RPC'].RECOMMEND)

    # create the socketio message-queue manager (requires the flask app to run in
    # production mode); messages are pushed into the message queue
    import socketio
    app.siomgr = socketio.KombuManager(app.config['RABBIT_MQ'])

    # create the es client
    from elasticsearch5 import Elasticsearch
    app.es = Elasticsearch(
        app.config['ES_HOST'],
        # sniff the es cluster nodes on startup
        sniff_on_start=True,
        # refresh the node list when a connection to an es node fails
        sniff_on_connection_fail=True,
        # refresh node info every 60 seconds
        sniffer_timeout=60
    )

    # register the request hooks
    from utils.middlewares import jwt_authentication
    app.before_request(jwt_authentication)

    # register the user module blueprint
    from .resources.user import user_bp
    app.register_blueprint(user_bp)

    # register the news module blueprint
    from .resources.news import news_bp
    app.register_blueprint(news_bp)

    # register the search module blueprint
    from .resources.search import search_bp
    app.register_blueprint(search_bp)

    return app
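
A minimal usage sketch, assuming the project defines a 'dev' environment type and the backing services are reachable:

app = create_app('dev', enable_config_file=True)
app.run(host='0.0.0.0', port=5000)

Example #4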
from elasticsearch5 import Elasticsearch
import datetime
es = Elasticsearch('http://fx-elasticsearch:9200')

#doc = {
#    'author': 'kimchy',
#    'text': 'Elasticsearch: cool. bonsai cool.',
#    'timestamp': datetime.datetime.now()
#}
#res = es.index(index="test-index",doc_type="test", id=1, body=doc)
#print(res['result'])

#res = es.get(index="test-index", id=1)
#print(res['_source'])

#es.indices.refresh(index="test-index")
indexes = es.indices.get('*')
#print(indexes)
for j in range(0, 10):
    print("value of j is: ", j)
    for i in indexes:
        print(i)
        print(" ")

        res = es.search(index=i,
                        body={
                            "query": {
                                "match_all": {}
                            },
                            "size": 1000
                        })
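
        # res is not used above; a minimal sketch of inspecting the response
        # (an added illustration, not part of the original script)
        print('hits:', len(res['hits']['hits']), 'of total', res['hits']['total'])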
Example #5
from elasticsearch import Elasticsearch

# get_video_pos is a project-local helper (scores video captions against a
# query) and is assumed to be importable; Vividict below is a sketch of the
# auto-vivifying dict this example relies on.


class Vividict(dict):
    def __missing__(self, key):
        value = self[key] = type(self)()
        return value


class Searcher():
    """Searches papers from elasticsearch database

    Longer class information....
    Longer class information....

    """
    def __init__(self, index_name, doc_type, host='10.1.114.114', port=9200):
        """Initialize a search engine

        Args:
            host: A host name of elasticsearch
            port: A port number of elasticsearch
            index_name: name of the index you want to search for
            doc_type: name of the doc_type under certain index

        """
        self.es = Elasticsearch([{'host': host, 'port': port}])
        self.index = index_name
        self.doc_type = doc_type

    def generate_dsl(self, search_info):
        """Generate DSL given query and search settings

        Args:
            search_info: a dict including a query and other settings.
            Note that 'query_type' must be consistent with 'match'!
        Example:
            {
                'query_type': 'integrated_search',
                'query': 'attention network',
                'match': {
                    'title': True,
                    'abstract': True,
                    'paperContent': True,
                    'videoContent': True,
                },
                'filter': {
                    'yearfrom': 1000,
                    'yearbefore': 3000,
                },
                'sort': 'year',
                'is_filter': True,
                'is_rescore': True,
                'is_cited': False
            }
            or
            {
                'query_type': 'advanced_search',
                'match': {
                    'title': 'attention',
                    'abstract': 'attention',
                    'paperContent': 'attention',
                    'videoContent': None,
                },
                'filter': {
                    'yearfrom': 1000,
                    'yearbefore': 3000,
                },
                'sort': 'relevance',
                'is_filter': False,
                'is_rescore': True,
                'is_cited': False
            }
        Return:
            dsl: a dsl translated from search info
        """

        # check search_info
        if 'integrated' in search_info['query_type']:
            assert 'query' in search_info, "Integrated search must have a query!"
            assert isinstance(search_info['match']['title'], bool), "Here a bool is needed!"
        else:
            assert isinstance(search_info['match']['title'], (str, type(None))), \
                "Here a string or None is needed!"

        if search_info['is_cited'] is False:
            dsl = Vividict()
            dsl['query']['bool']['must'] = []
            dsl['query']['bool']['should'] = []
            dsl['rescore'] = []

            if 'integrated' in search_info['query_type']:
                match = self.get_integrated_match(search_info['query'], search_info['match'])
                dsl['query']['bool']['should'] = match
                if search_info['is_filter'] is True:
                    filter = self.get_filter_query(search_info['query'])
                    dsl['query']['bool']['must'].append(filter)
                if search_info['is_rescore'] is True:
                    rescore = self.get_rescore_query(match)
                    dsl['rescore'] = rescore

            else:  # 'advanced_search'
                match = self.get_advanced_match(search_info['match'])
                dsl['query']['bool']['must'] = match
                if search_info['is_rescore'] is True:
                    rescore = self.get_rescore_query(match)
                    dsl['rescore'] = rescore

            year_range = Vividict()
            year_range['range']['year']['gte'] = search_info['filter'].get('yearfrom', 1000)
            year_range['range']['year']['lte'] = search_info['filter'].get('yearbefore', 3000)
            dsl['query']['bool']['must'].append(year_range)

        else:  # cited-function_score
            dsl = Vividict()
            dsl['query']['function_score']['query']['bool']['must'] = []
            dsl['query']['function_score']['query']['bool']['should'] = []
            dsl['query']['function_score']['field_value_factor'] = []
            dsl['rescore'] = []

            if 'integrated' in search_info['query_type']:
                match = self.get_integrated_match(search_info['query'], search_info['match'])
                dsl['query']['function_score']['query']['bool']['should'] = match
                cited = self.get_function_factor()
                dsl['query']['function_score']['field_value_factor'] = cited
                if search_info['is_filter'] is True:
                    filter = self.get_filter_query(search_info['query'])
                    dsl['query']['function_score']['query']['bool']['must'].append(filter)
                if search_info['is_rescore'] is True:
                    rescore = self.get_rescore_query(match)
                    dsl['rescore'] = rescore

            else:  # 'advanced_search'
                match = self.get_advanced_match(search_info['match'])
                # write into the function_score query so the match is not orphaned
                dsl['query']['function_score']['query']['bool']['must'] = match
                if search_info['is_rescore'] is True:
                    rescore = self.get_rescore_query(match)
                    dsl['rescore'] = rescore

            year_range = Vividict()
            year_range['range']['year']['gte'] = search_info['filter'].get('yearfrom', 1000)
            year_range['range']['year']['lte'] = search_info['filter'].get('yearbefore', 3000)
            dsl['query']['function_score']['query']['bool']['must'].append(year_range)

        if search_info['sort'] == 'year':
            dsl['sort']['year'] = 'desc'
        elif search_info['sort'] == 'cited':
            dsl['sort']['cited'] = 'asc'

        return dsl

    def get_integrated_match(self, query, match):
        """get match of intergrated search

        Args:
            query: query string from user
            match: A dict containing title, abstract...

        Return:
            res: A list of match
        """
        res = []

        if match['title'] or match['abstract']:
            tmp = Vividict()
            tmp['multi_match']['query'] = query

            fields = []
            if match['title']:
                fields.append('title^3')

            if match['abstract']:
                fields.append('abstract^2')

            tmp['multi_match']['fields'] = fields
            res.append(tmp)

        if match['paperContent']:
            nest = self.get_nested_query_paperContent(query)
            res.append(nest)

        if match['videoContent']:
            nest = self.get_nested_query_videoContent(query)
            res.append(nest)

        return res

    def get_advanced_match(self, match):
        """get match of advanced search

        Args:
            match: A dict containing title, abstract, paper_content...

        Return:
            res: A list of match
        """
        res = []
        if match['title']:
            _match = {'match': {'title': match['title']}}
            res.append(_match)

        if match['abstract']:
            _match = {'match': {'abstract': match['abstract']}}
            res.append(_match)

        if match['paperContent']:
            nest = self.get_nested_query_paperContent(match['paperContent'])
            res.append(nest)

        if match['videoContent']:
            nest = self.get_nested_query_videoContent(match['videoContent'])
            res.append(nest)

        return res

    def get_nested_query_paperContent(self, query):

        nest = Vividict()
        nest['nested']['path'] = 'paperContent'
        nest['nested']['score_mode'] = 'max'

        tmp = Vividict()
        fields = ['paperContent.text', 'paperContent.subtitles^2', 'paperContent.subtexts']
        tmp['multi_match']['fields'] = fields
        tmp['multi_match']['query'] = query
        nest['nested']['query']['bool']['must'] = tmp

        return nest

    def get_nested_query_videoContent(self, query):

        nest = Vividict()
        nest['nested']['path'] = 'videoContent'
        nest['nested']['score_mode'] = 'max'

        tmp = Vividict()
        tmp['match']['videoContent.textEnglish'] = query
        nest['nested']['query']['bool']['must'] = tmp

        return nest

    def get_function_factor(self):
        cited = Vividict()
        cited['field'] = 'cited'
        cited['modifier'] = 'log1p'
        cited['factor'] = 0.5
        cited['missing'] = 0

        return cited

    def get_filter_query(self, query):
        filter = Vividict()
        tag_list = query.split()
        filter['terms']['abstract'] = tag_list

        return filter

    def get_rescore_query(self, match):
        rescore = Vividict()
        rescore['window_size'] = 100
        rescore['query']['rescore_query'] = match[0]
        rescore['query']['query_weight'] = 1.5
        rescore['query']['rescore_query_weight'] = 0.5

        return rescore

    def search_paper_by_name(self, search_info, only_top_k=True):
        """Search papers by name
        Args:
            search_info: a dict of query and settings, as in generate_dsl()
            only_top_k: if True, only the first batch of 100 hits is returned

        Return:
            res_list: A list of paper information
            paper_id: A list of the matching papers' ids
            num: The total number of matching papers
        """
        dsl = self.generate_dsl(search_info)
        result = self.es.search(index=self.index, doc_type=self.doc_type, body=dsl, scroll="5m", size=100)
        sid = result['_scroll_id']
        num = result['hits']['total']
        res_list, paper_id = [], []
        # consume the batch returned by search() itself before scrolling;
        # otherwise the first 100 hits are silently skipped
        scroll_size = len(result['hits']['hits'])
        while scroll_size > 0:
            paper, p_id, _ = self.get_paper_info(result)
            res_list += paper
            paper_id += p_id

            if only_top_k:
                break

            result = self.es.scroll(scroll_id=sid, scroll="5m")
            sid = result['_scroll_id']
            scroll_size = len(result["hits"]["hits"])

        return res_list, paper_id, num

    def get_video_pos_by_paper_id(self, search_info, paper_id, threshold=0.6):
        """
        Args:
            search_info: the same as that in self.generate_dsl()
            paper_id: A string, given by es

        Return:
            a sorted video captions' list according to similarity between
            captions and query
        """
        
        assert isinstance(paper_id, str), "paper_id must be a string; only one id is expected!"

        paper = self.es.get_source(index=self.index, doc_type=self.doc_type, id=paper_id)

        return self.get_video_pos_by_paper(search_info=search_info,
                                           paper=paper,
                                           threshold=threshold)

    def get_video_pos_by_paper(self, search_info, paper, threshold=0.6):
        """
        Args:
            paper: A dict contained title, abstract ...

        Return:
            a sorted video captions' list according to similarity between
            captions and query
        """

        assert isinstance(paper, dict), "paper must be a dict; only one paper is expected!"

        if 'integrated' in search_info['query_type']:
            query = search_info['query']
        else:
            query = search_info['match']['videoContent']

        if 'videoContent' not in paper:
            return [None]

        pos = get_video_pos(query=query,
                            videoContent=paper['videoContent'],
                            threshold=threshold)
        return pos

    @staticmethod
    def get_paper_info(res):
        """Return raw paper info given es search result
        Args:
            res: A dict of result from es.search

        Return:
            paper_list: A list of dicts, each storing the information of one paper
            paper_id: A list of the corresponding document ids
            num: total number of hits reported by es
        """
        paper_list = []
        paper_id = []
        hits = res['hits']['hits']
        num = res['hits']['total']
        for hit in hits:
            paper_list.append(hit['_source'])
            paper_id.append(hit['_id'])
        return paper_list, paper_id, num

    @staticmethod
    def remove_text_embedding(papers):
        """Remove textEmbedding in videoContent
        Args:
            papers: A list of paper
        """
        for paper in papers:
            if 'videoContent' in paper:
                for v in paper['videoContent']:
                    if 'textEmbedding' in v:
                        v.pop('textEmbedding')
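
A minimal usage sketch of the Searcher class, reusing the integrated-search settings from the generate_dsl docstring (the index and doc_type names here are placeholders, not from the original source):

searcher = Searcher(index_name='papers', doc_type='paper')
search_info = {
    'query_type': 'integrated_search',
    'query': 'attention network',
    'match': {'title': True, 'abstract': True, 'paperContent': False, 'videoContent': False},
    'filter': {'yearfrom': 2015, 'yearbefore': 2020},
    'sort': 'year',
    'is_filter': False,
    'is_rescore': True,
    'is_cited': False,
}
papers, ids, total = searcher.search_paper_by_name(search_info)

Example #6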
import datetime
import json
import logging
import os

import pandas
import urllib3
from elasticsearch import Elasticsearch, helpers
from pandas.io.json import json_normalize

_LOGGER = logging.getLogger(__name__)

# Storage and ESStorageAttribute are project-local (the storage base class and
# its attribute container) and are assumed to be importable from the project.


class ESStorage(Storage):
    """Elasticsearch storage backend."""

    NAME = "es"
    _MESSAGE_FIELD_NAME = "_source.message"

    def __init__(self, configuration):
        """Initialize Elasticsearch storage backend."""
        self.config = configuration
        self._connect()

    def _connect(self):
        urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        if len(self.config.ES_CERT_DIR) and os.path.isdir(self.config.ES_CERT_DIR):
            _LOGGER.warning(
                "Using cert and key in %s for connection to %s (verify_certs=%s)."
                % (
                    self.config.ES_CERT_DIR,
                    self.config.ES_ENDPOINT,
                    self.config.ES_VERIFY_CERTS,
                )
            )
            self.es = Elasticsearch(
                self.config.ES_ENDPOINT,
                use_ssl=self.config.ES_USE_SSL,
                verify_certs=self.config.ES_VERIFY_CERTS,
                client_cert=os.path.join(self.config.ES_CERT_DIR, "es.crt"),
                client_key=os.path.join(self.config.ES_CERT_DIR, "es.key"),
                timeout=60,
                max_retries=2,
            )
        else:
            _LOGGER.warn("Conecting to ElasticSearch without authentication.")
            print(self.config.ES_USE_SSL)
            self.es = Elasticsearch(
                self.config.ES_ENDPOINT,
                use_ssl=self.config.ES_USE_SSL,
                verify_certs=self.config.ES_VERIFY_CERTS,
                timeout=60,
                max_retries=2,
            )

    def _prep_index_name(self, prefix):
        # appends the correct date to the index prefix
        now = datetime.datetime.now()
        date = now.strftime("%Y.%m.%d")
        index = prefix + date
        return index

    def retrieve(self, storage_attribute: ESStorageAttribute):
        """Retrieve data from ES."""
        index_in = self._prep_index_name(self.config.ES_INPUT_INDEX)

        query = {
            "sort": {"@timestamp": {"order": "desc"}},
            "query": {
                "bool": {
                    "must": [
                        {"query_string": {"analyze_wildcard": True, "query": ""}},
                        {"range": {"@timestamp": {"gte": "now-900s", "lte": "now"}}},
                    ],
                    "must_not": [],
                }
            },
        }
        _LOGGER.info(
            "Reading in max %d log entries in last %d seconds from %s",
            storage_attribute.number_of_entries,
            storage_attribute.time_range,
            self.config.ES_ENDPOINT,
        )

        query["size"] = storage_attribute.number_of_entries
        query["query"]["bool"]["must"][1]["range"]["@timestamp"]["gte"] = "now-%ds" % storage_attribute.time_range
        query["query"]["bool"]["must"][0]["query_string"]["query"] = self.config.ES_QUERY

        es_data = self.es.search(index_in, body=json.dumps(query))
        if es_data["hits"]["total"] == 0:
            return pandas.DataFrame(), es_data
        # only use _source sub-dict
        es_data = [x["_source"] for x in es_data["hits"]["hits"]]
        es_data_normalized = pandas.DataFrame(json_normalize(es_data)["message"])

        _LOGGER.info("%d logs loaded in from last %d seconds", len(es_data_normalized), storage_attribute.time_range)

        self._preprocess(es_data_normalized)

        return es_data_normalized, es_data  # bad solution, this is how Entry objects could come in.

    def store_results(self, data):
        """Store results back to ES."""
        index_out = self._prep_index_name(self.config.ES_TARGET_INDEX)

        actions = [{"_index": index_out, "_type": "log", "_source": data[i]} for i in range(len(data))]

        helpers.bulk(self.es, actions, chunk_size=int(len(data) / 4) + 1)
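
A minimal usage sketch (an added illustration): the constructor needs a config object exposing the ES_* settings read in _connect(), and retrieve() takes an ESStorageAttribute; the attribute names follow the fields accessed above, but the constructor signature is an assumption.

storage = ESStorage(config)
frame, raw = storage.retrieve(ESStorageAttribute(number_of_entries=500, time_range=900))
print(len(frame))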
Example #7
import sys

from elasticsearch import Elasticsearch, Urllib3HttpConnection

# get_args() (argparse CLI parsing) and config (a settings dict) are defined
# elsewhere in the original script.
args = vars(get_args())

if not args['comparecsvs']:
    if not args['index'] or not args['rootdir']:
        print(
            '--eshost1, --index and --rootdir cli args required (unless using --comparecsvs), use -h for help'
        )
        sys.exit(1)

    # set up elasticsearch connections
    es = Elasticsearch(hosts=args['eshost1'],
                       port=args['esport1'],
                       http_auth=(args['esuser1'], args['espass1']),
                       connection_class=Urllib3HttpConnection,
                       timeout=config['es_timeout'],
                       maxsize=config['es_maxsize'],
                       max_retries=config['es_max_retries'],
                       retry_on_timeout=True)

    if args['eshost2']:
        es2 = Elasticsearch(hosts=args['eshost2'],
                            port=args['esport2'],
                            http_auth=(args['esuser2'], args['espass2']),
                            connection_class=Urllib3HttpConnection,
                            timeout=config['es_timeout'],
                            maxsize=config['es_maxsize'],
                            max_retries=config['es_max_retries'],
                            retry_on_timeout=True)
    else:
        es2 = es
Example #8
#!/usr/bin/env python3
from elasticsearch5 import Elasticsearch

es = Elasticsearch()
es_options = {
        "index": "article_test",
        "doc_type": "article" }

query = {"query": {"match_all": {}}}

es.delete_by_query(**es_options, body=query)
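
A short follow-up sketch (an added illustration): refresh the index and confirm the matching documents are gone.

es.indices.refresh(index=es_options['index'])
print(es.count(**es_options, body=query)['count'])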
Example #10
import copy

from elasticsearch import Elasticsearch

# ENV, ALLTITLES, TABLEFORFIELDS, INDEXES and HIGHTLIGHT are module-level
# constants from the original project and are assumed to be defined elsewhere.


class ElasticSearchSearvice:
    # load the local instance by default
    if ENV != 'HP':
        _es = Elasticsearch(hosts='127.0.0.1:9200', sniffer_timeout=60)
    else:
        _es = Elasticsearch(hosts='192.168.31.250:9200', sniffer_timeout=60)
    # _es = Elasticsearch(hosts='192.168.31.16:9200')

    @classmethod
    def is_available(cls):
        return cls._es.ping()

    # create dsl
    # single-field match with highlighting (phrase match)
    @classmethod
    def __create_dsl_for_single_field(cls,
                                      field,
                                      text,
                                      size=None,
                                      highlight={}):
        dsl = {
            'query': {
                'match': {
                    field: text.lower()
                }
            },
            "highlight": {
                "order": "score",
                "require_field_match": False,
                "fields": {
                    field: highlight
                }
            }
        }

        if size:
            # 'size' is a top-level search option, not part of the match clause
            dsl['size'] = size

        return dsl

    # multi-field global search (phrase match, no single-character partial matches)
    @classmethod
    def __create_dsl_for_multi_fields(cls,
                                      text,
                                      fields,
                                      operator='or',
                                      size=None):
        dsl = {
            "query": {
                "multi_match": {
                    "query": text,
                    "type": "most_fields",
                    "operator": operator,
                    "fields": fields
                }
            }
        }
        if size:
            dsl['size'] = size

        return dsl

    # single-field search using a regular expression
    @classmethod
    def __create_dsl_for_single_field_with_regex(cls,
                                                 field,
                                                 texts,
                                                 start=0,
                                                 size=3,
                                                 highlight={}):
        dsl = {
            "query": {
                "constant_score": {
                    "filter": {
                        'bool': {
                            'should': []
                        }
                    }
                }
            },
            "highlight": {
                "order": "score",
                "require_field_match": False,
                "fields": {
                    field: highlight
                }
            }
        }
        if size:
            dsl['size'] = size
        if start:
            dsl['from'] = start
        for text in texts:
            dsl['query']['constant_score']['filter']['bool']['should'].append(
                {"regexp": {
                    field: {
                        "value": '.*' + text.lower() + '.*'
                    }
                }})
        return dsl

    # multi-field, multi-value match with highlighting (regex mode; matches single-character containment)
    @classmethod
    def __create_dsl_for_multi_field_with_regex(cls,
                                                fields,
                                                texts,
                                                start=0,
                                                size=3,
                                                highlight={}):
        dsl = {
            "query": {
                "constant_score": {
                    "filter": {
                        'bool': {
                            'should': []
                        }
                    }
                }
            },
            "highlight": {
                "order": "score",
                "require_field_match": False,
                "fields": {}
            }
        }
        if size:
            dsl['size'] = size
        if start:
            dsl['from'] = start
        for field in fields:
            # highlight
            dsl['highlight']['fields'][field] = highlight
            for text in texts:
                dsl['query']['constant_score']['filter']['bool'][
                    'should'].append({
                        "regexp": {
                            field: {
                                "value": '.*' + text.lower() + '.*'
                            }
                        }
                    })
        return dsl

    # exact lookup
    @classmethod
    # exact lookup by tags
    def __create_dsl_for_tags(cls, tags, type, start=0, size=3):
        dsl = {
            'from': start,
            'size': size,
            'query': {
                "constant_score": {
                    "filter": {
                        'bool': {
                            'must': [{
                                'term': {
                                    'type': type
                                }
                            }, {
                                'terms': {
                                    'tags': tags
                                }
                            }]
                        }
                    }
                }
            }
        }
        # return the tags DSL
        return dsl

    # multi-field search with highlighting
    @classmethod
    def match_fields_with_highlight(cls,
                                    fields,
                                    text,
                                    operator='or',
                                    size=None,
                                    highlight={}):
        dsl = {
            "query": {
                "multi_match": {
                    "query": text,
                    "type": "most_fields",
                    "operator": operator,
                    # "fields": fields
                }
            },
            "highlight": {
                "order": "score",
                "require_field_match": False,
                "fields": {}
            }
        }
        weights_fields = []
        for field in fields:
            # highlight
            dsl['highlight']['fields'][field] = highlight
            # field weighting
            if field in ALLTITLES:
                field = field + '^3'
            elif field == 'tags':
                field = field + '^2'
            else:
                field = field + '^1'
            weights_fields.append(field)
        dsl['query']['multi_match']['fields'] = weights_fields
        return dsl

    @classmethod
    def __execute_dsl(cls, dsl, index):
        if index == 'apps':
            dsl['query']['constant_score']['filter']['bool']['must'] = {
                "term": {
                    "type": 'app'
                }
            }
            return cls._es.search(index='projects',
                                  doc_type='project',
                                  body=dsl)
        if index == 'modules':
            dsl['query']['constant_score']['filter']['bool']['must'] = {
                "term": {
                    "type": 'module'
                }
            }
            return cls._es.search(index='projects',
                                  doc_type='project',
                                  body=dsl)
        if index == 'datasets':
            dsl['query']['constant_score']['filter']['bool']['must'] = {
                "term": {
                    "type": 'dataset'
                }
            }
            return cls._es.search(index='projects',
                                  doc_type='project',
                                  body=dsl)
        if index == 'users':
            return cls._es.search(index='users', doc_type='user', body=dsl)
        if index == 'requests':
            return cls._es.search(index='requests',
                                  doc_type='request',
                                  body=dsl)
        if index == 'projects':
            return cls._es.search(index='projects',
                                  doc_type='project',
                                  body=dsl)
        if index == 'code_snippets':
            return cls._es.search(index='code_snippets',
                                  doc_type='code_snippet',
                                  body=dsl)

    @classmethod
    def search_title(cls, search_value=None, index=None):
        if not isinstance(search_value, list):
            search_value = [search_value]
        if index:
            # a designated index
            if not search_value:
                return {'msg': 'failed'}
            fields = TABLEFORFIELDS[index]
            dsl = cls.__create_dsl_for_multi_field_with_regex(
                fields=fields,
                texts=search_value,
                start=0,
                size=3,
                highlight=HIGHTLIGHT)
            results = cls.__execute_dsl(dsl, index=index)
        else:
            # all
            # indexes = list(TITLESFORTABLES.values())
            results = []
            for idx in range(len(INDEXES)):
                fields = TABLEFORFIELDS[INDEXES[idx]]
                dsl = cls.__create_dsl_for_multi_field_with_regex(
                    fields=fields,
                    texts=search_value,
                    start=0,
                    size=3,
                    highlight=HIGHTLIGHT)
                # for index in indexes[title_idx]:
                search_result = cls.__execute_dsl(dsl, INDEXES[idx])
                results.append({INDEXES[idx]: search_result})
        # fetch the count statistics
        count_info = cls.count(search_value)
        return results, count_info

    @classmethod
    def __remove_for_count_api(cls, dsl):
        del dsl['highlight']
        del dsl['size']
        return dsl

    @classmethod
    def count(cls, search_text):
        # compute each count separately:
        # number of requests
        request_dsl = cls.__create_dsl_for_multi_field_with_regex(
            fields=['request_title', 'description'], texts=search_text)
        request_dsl = cls.__remove_for_count_api(request_dsl)

        #        request_count = cls._es.count(index='requests', doc_type='request', body=request_dsl)
        request_count = {}

        total_request_count = 0
        # projects
        project_dsl = cls.__create_dsl_for_multi_field_with_regex(
            fields=['display_name', 'description'], texts=search_text)
        project_dsl = cls.__remove_for_count_api(project_dsl)

        total_project_count = 0
        #        project_count = cls._es.count(index='projects', doc_type='project', body=project_dsl)
        # build the per-type count dicts
        project_count = {}
        for project_type in ['app', 'module', 'dataset']:
            project_type_dsl = copy.deepcopy(project_dsl)
            request_type_dsl = copy.deepcopy(request_dsl)
            project_type_dsl['query']['constant_score']['filter']['bool'][
                'must'] = {
                    "term": {
                        "type": project_type
                    }
                }
            request_type_dsl['query']['constant_score']['filter']['bool'][
                'must'] = {
                    "term": {
                        "type": project_type
                    }
                }
            project_type_count = cls._es.count(
                index='projects', doc_type='project',
                body=project_type_dsl).get('count')
            request_type_count = cls._es.count(
                index='requests', doc_type='request',
                body=request_type_dsl).get('count')
            request_count[project_type] = request_type_count
            project_count[project_type] = project_type_count
            total_request_count += request_type_count
            total_project_count += project_type_count
        # users
        user_dsl = cls.__create_dsl_for_multi_field_with_regex(
            fields=['username', 'bio'], texts=search_text)
        user_dsl = cls.__remove_for_count_api(user_dsl)
        user_count = cls._es.count(index='users',
                                   doc_type='user',
                                   body=user_dsl)

        return {
            'project_nums': total_project_count,
            'request_nums': total_request_count,
            'user_nums': user_count.get('count'),
            'projects': project_count,
            'requests': request_count
        }

    @classmethod
    def search_code_snippet(cls,
                            fields=[],
                            search_values=[],
                            index=None,
                            start=0,
                            size=3):
        # multi-field search
        if not isinstance(fields, list):
            return {'message': "Please input the valid data"}
        # multi-value search: normalize to a list
        if not isinstance(search_values, list):
            search_values = [search_values]
        # build the DSL
        dsl = cls.__create_dsl_for_multi_field_with_regex(fields=fields,
                                                          texts=search_values,
                                                          start=start,
                                                          size=size,
                                                          highlight=HIGHTLIGHT)
        results = cls.__execute_dsl(dsl, index=index)

        return results

    @classmethod
    def search_fields(cls,
                      fields=[],
                      search_values=[],
                      index=None,
                      start=0,
                      size=3,
                      request_type=None):
        # multi-field search
        if not isinstance(fields, list):
            return {'message': "Please input the valid data"}
        # multi-value search: normalize to a list
        if not isinstance(search_values, list):
            search_values = [search_values]
        # build the DSL
        dsl = cls.__create_dsl_for_multi_field_with_regex(fields=fields,
                                                          texts=search_values,
                                                          start=start,
                                                          size=size,
                                                          highlight=HIGHTLIGHT)
        # count the other categories
        count_info = cls.count(search_values)
        # execute and get the results
        # if 'request' in index and request_type:
        #     print(dsl)
        #     dsl['query']['constant_score']['filter']['bool']['must'] = { "term": {"type": request_type}}
        results = cls.__execute_dsl(dsl, index=index)
        # filtering may still need to be applied here
        return results, count_info

    @classmethod
    def search_tags(cls,
                    tags,
                    start=0,
                    size=10,
                    index='all',
                    project_type='app'):
        dsl = cls.__create_dsl_for_tags(tags,
                                        project_type,
                                        start=start,
                                        size=size)
        if index == 'project':
            results = cls._es.search(index='projects',
                                     doc_type='project',
                                     body=dsl)
        elif index == 'request':
            results = cls._es.search(index='requests',
                                     doc_type='request',
                                     body=dsl)
        else:
            # the 'all' case basically never occurs
            project_results = cls._es.search(index='projects',
                                             doc_type='project',
                                             body=dsl)
            request_results = cls._es.search(index='requests',
                                             doc_type='request',
                                             body=dsl)
            return project_results, request_results
        return results

    @classmethod
    def operation_with_index(cls, body, index, doc_type, index_id):
        cls.create_index(index=index)
        print(
            cls._es.index(index=index,
                          doc_type=doc_type,
                          body=body,
                          id=index_id,
                          refresh='true',
                          request_timeout=60))

    @classmethod
    def create_index(cls, index):
        if not cls._es.indices.exists(index):
            body = {
                "mappings": {
                    index[:-1]: {
                        "properties": {
                            "description": {
                                "type": "text",
                                "analyzer": "ik_max_word",
                                "search_analyzer": "ik_max_word"
                            }
                        }
                    }
                }
            }
            if index == 'requests':
                body['mappings'][index[:-1]]['properties']['request_title'] = {
                    "type": "text",
                    "analyzer": "ik_max_word",
                    "search_analyzer": "ik_max_word"
                }
                cls._es.indices.create(index=index, body=body, ignore=400)
            else:
                if index == 'projects':
                    body['mappings'][
                        index[:-1]]['properties']['display_name'] = {
                            "type": "text",
                            "analyzer": "ik_max_word",
                            "search_analyzer": "ik_max_word"
                        }
                    cls._es.indices.create(index=index, body=body, ignore=400)
                elif index == 'code_snippets':
                    body['mappings'][index[:-1]]['properties']['code_name'] = {
                        "type": "text",
                        "analyzer": "ik_max_word",
                        "search_analyzer": "ik_max_word"
                    }
                    body['mappings'][
                        index[:-1]]['properties']['code_source'] = {
                            "type": "text",
                            "analyzer": "ik_max_word",
                            "search_analyzer": "ik_max_word"
                        }
                    body['mappings'][index[:-1]]['properties']['code_tags'] = {
                        "type": "text",
                        "analyzer": "ik_max_word",
                        "search_analyzer": "ik_max_word"
                    }
                    cls._es.indices.create(index=index, body=body, ignore=400)
                else:
                    body['mappings'][index[:-1]]['properties']['username'] = {
                        "type": "text",
                        "analyzer": "standard"
                    }
                    # moved into this branch so the projects/code_snippets cases
                    # do not call indices.create a second time
                    cls._es.indices.create(index=index, body=body, ignore=400)

    @classmethod
    def delete_all(cls):
        if cls._es.indices.exists('apps'):
            cls._es.indices.delete(index='apps')
        if cls._es.indices.exists('modules'):
            cls._es.indices.delete(index='modules')
        if cls._es.indices.exists('datasets'):
            cls._es.indices.delete(index='datasets')
        if cls._es.indices.exists('projects'):
            print('deleting projects')
            cls._es.indices.delete(index='projects')
        if cls._es.indices.exists('requests'):
            print('deleting requests')
            cls._es.indices.delete(index='requests')
        if cls._es.indices.exists('users'):
            print('deleting users')
            cls._es.indices.delete(index='users')
        if cls._es.indices.exists('code_snippets'):
            print('deleting code_snippets')
            cls._es.indices.delete(index='code_snippets')

    @classmethod
    def refresh_index(cls):
        cls._es.indices.refresh()

    @classmethod
    def add_user(cls, user_ID, username, bio, avatarV, avatar_url):
        body = {
            'username': username,
            'bio': bio,
            'avatarV': avatarV,
            'avatar_url': avatar_url
        }
        cls.operation_with_index(index='users',
                                 doc_type='user',
                                 body=body,
                                 index_id=user_ID)

    @classmethod
    def add_project(cls, project_id, display_name, description, tags,
                    project_type, img_v, photo_url, username):
        body = {
            'display_name': display_name,
            'description': description,
            'tags': [tag for tag in tags],
            'type': project_type,
            'img_v': img_v,
            'photo_url': photo_url,
            'username': username
        }
        cls.operation_with_index(index='projects',
                                 doc_type='project',
                                 body=body,
                                 index_id=project_id)

    @classmethod
    def add_request(cls, request_title, description, request_type, username,
                    request_id):
        body = {
            'request_title': request_title,
            'description': description,
            'type': request_type,
            'username': username
        }
        cls.operation_with_index(index='requests',
                                 doc_type='request',
                                 body=body,
                                 index_id=request_id)

    @classmethod
    def add_code_snippet(cls, code_snippet_id, code_name, code_des, code_tags,
                         code_source, detail_url, insert_num):
        body = {
            'code_name': code_name,
            'code_des': code_des,
            'code_tags': [tag for tag in code_tags],
            'code_source': code_source,
            'detail_url': detail_url,
            'insert_num': insert_num
            # 'key_words': [key_word for key_word in key_words],
        }
        cls.operation_with_index(index='code_snippets',
                                 doc_type='code_snippet',
                                 body=body,
                                 index_id=code_snippet_id)

    # Add projects
    @classmethod
    def add_projects(cls):
        # index the Project data
        from server3.business.project_business import ProjectBusiness
        # privacy='public'
        projects = ProjectBusiness.repo.objects()
        projects = projects(privacy='public')
        if len(projects) <= 0:
            cls.create_index(index='projects')
        for project in projects:
            try:
                body = {
                    'display_name': project.display_name,
                    'description': project.description,
                    'tags': [tag.id for tag in project.tags],
                    'type': project.type,
                    'img_v': project.img_v if project.img_v else '',
                    'photo_url': project.photo_url if project.photo_url else '',
                    'username': project.user.username,
                    'create_time': project.create_time
                }
                if project.privacy == 'private':
                    continue
                cls.operation_with_index(index='projects',
                                         doc_type='project',
                                         body=body,
                                         index_id=project.id)
            except Exception as e:
                # project.delete()
                print('project was deleted; cannot add it to elasticsearch')
                continue

    # Add requests
    @classmethod
    def add_requests(cls):
        from server3.business.user_request_business import UserRequestBusiness
        requests = UserRequestBusiness.repo.objects()
        if len(requests) <= 0:
            cls.create_index(index='requests')
        for request in requests:
            try:
                body = {
                    'request_title': request.title,
                    'description': request.description,
                    'type': request.type,
                    'username': request.user.username,
                    'tags': [tag.id for tag in request.tags],
                    'create_time': request.create_time
                }
                cls.operation_with_index(index='requests',
                                         doc_type='request',
                                         body=body,
                                         index_id=request.id)
            except Exception as e:
                print('request was deleted; cannot add it to elasticsearch')
                continue

    @classmethod
    def add_code_snippets(cls):
        from server3.business.code_snippet_business import CodeSnippetBusiness
        code_snippets = CodeSnippetBusiness.repo.objects()
        if len(code_snippets) <= 0:
            cls.create_index(index='code_snippets')
        for code_snippet in code_snippets:
            try:
                body = {
                    'code_name': code_snippet.code_name,
                    'code_des': code_snippet.code_des,
                    'code_tags': [tag for tag in code_snippet.code_tags],
                    'code_source': code_snippet.code_source,
                    'detail_url': code_snippet.detail_url,
                    'insert_num': code_snippet.insert_num,
                    # 'key_words': [key_word for key_word in code_snippet.key_words],
                }
                cls.operation_with_index(index='code_snippets',
                                         doc_type='code_snippet',
                                         body=body,
                                         index_id=code_snippet.id)
            except Exception as e:
                print('code snippet does not exist')
                continue

    # Add users
    @classmethod
    def add_users(cls):
        from server3.business.user_business import UserBusiness
        users = UserBusiness.repo.objects()
        if len(users) <= 0:
            cls.create_index(index='users')
        for user in users:
            try:
                body = {
                    'username': user.username,
                    'bio': user.bio,
                    'avatarV': user.avatarV,
                    'avatar_url': user.avatar_url if user.avatar_url else ''
                }
                cls.operation_with_index(index='users',
                                         doc_type='user',
                                         body=body,
                                         index_id=user.user_ID)
            except Exception as e:
                print('user does not exist; cannot add it to elasticsearch')
                continue

    @classmethod
    def delete_project(cls, project_id):
        try:
            print(
                cls._es.delete(index='projects',
                               doc_type='project',
                               id=project_id,
                               refresh='true',
                               ignore=[400, 404]))
        except NotFoundError as e:
            print('Elasticsearch could not find a project with id: ' + project_id)

    @classmethod
    def delete_user(cls, user_ID):
        try:
            print(
                cls._es.delete(index='users',
                               doc_type='user',
                               id=user_ID,
                               refresh='true',
                               ignore=[400, 404]))
        except NotFoundError as e:
            print('Elasticsearch could not find a user with id: ' + user_ID)

    @classmethod
    def delete_request(cls, request_id):
        try:
            print(
                cls._es.delete(index='requests',
                               doc_type='request',
                               id=request_id,
                               refresh='true',
                               ignore=[400, 404]))
        except NotFoundError as e:
            print('Elasticsearch could not find a request with id: ' + request_id)

    # Add all data
    @classmethod
    def add_all(cls):
        if not cls._es.indices.exists('requests'):
            cls.create_index('requests')
            cls.add_requests()
        if not cls._es.indices.exists('projects'):
            cls.create_index('projects')
            cls.add_projects()
        if not cls._es.indices.exists('users'):
            cls.create_index('users')
            cls.add_users()
        if not cls._es.indices.exists('code_snippets'):
            cls.create_index('code_snippets')
            cls.add_code_snippets()

    @classmethod
    def clear_indices(cls):
        cls._es.indices.clear_cache()
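
A minimal driver for the index-manager class above — a sketch only, assuming the enclosing class is importable as SearchIndexManager (the name is hypothetical; the class definition begins earlier in this listing):

# Hypothetical rebuild routine using the classmethods defined above.
SearchIndexManager.delete_all()      # drop any stale indices
SearchIndexManager.add_all()         # recreate indices and load public documents
SearchIndexManager.refresh_index()   # make the new documents searchable at once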
Exemple #12
0
 def __init__(self):
     self.es = Elasticsearch(ElasticConfig.uri)
     self._multi_search_results = []
     self.bulk_task_queue = []
     self.bulk_last_time = datetime_now_obj()
Exemple #13
0
class ElasticHelper(object):
    def __init__(self):
        self.es = Elasticsearch(ElasticConfig.uri)
        self._multi_search_results = []
        self.bulk_task_queue = []
        self.bulk_last_time = datetime_now_obj()

    def delay_index(self, body, index, doc_type):
        self.bulk_task_queue.append(
            {"index": {
                "_index": index,
                "_type": doc_type
            }})
        self.bulk_task_queue.append(body)

        if self._can_do_bulk():
            self.bulk(body=self.bulk_task_queue,
                      index=index,
                      doc_type=doc_type)
            self.bulk_task_queue = []

        self.bulk_last_time = datetime_now_obj()

    def _can_do_bulk(self):
        # more than 100 entries queued for bulk indexing
        if len(self.bulk_task_queue) > 100:
            return True
        # more than one minute has passed since the last flush
        if get_n_min_ago(1) > self.bulk_last_time:
            return True
        return False

    def index(self, body, index, doc_type):
        self.es.index(body=body, index=index, doc_type=doc_type)

    def bulk(self, body, index, doc_type):
        self.es.bulk(body=body, index=index, doc_type=doc_type)

    def scan(self, body, index, doc_type):
        return helpers.scan(self.es,
                            query=body,
                            index=index,
                            doc_type=doc_type,
                            preserve_order=True)

    def search(self, body, index, doc_type):
        try:
            rsp = self.es.search(body=body,
                                 index=index,
                                 doc_type=doc_type,
                                 request_timeout=100)
            if rsp.get("error"):
                logger.error(rsp.get("error").get("reason"))
                return
            return rsp
        except Exception as e:
            print(body)
            logger.error("es search error: " + str(e) + index)

    def count(self, body, index, doc_type):
        return self.es.count(index=index,
                             doc_type=doc_type,
                             body=body,
                             request_timeout=100)

    def delete_index(self, index):
        return self.es.indices.delete(index=index)

    def put_template(self, name, body, **kwargs):
        return self.es.indices.put_template(name=name, body=body, **kwargs)

    def exists_template(self, name, **kwargs) -> bool:
        return self.es.indices.exists_template(name=name, **kwargs)

    def delete_template(self, name, **kwargs):
        return self.es.indices.delete_template(name=name, **kwargs)

    def get_template(self, name, **kwargs):
        return self.es.indices.get_template(name=name, **kwargs)

    def wait_log_in_database(self, computer_name, record_number):
        """
        Queue consumption and ES indexing run separately, so a log record may
        be consumed before ES has finished indexing it; poll until it appears.
        """
        count = 0
        query = {
            "query": get_must_statement(
                get_term_statement("computer_name", computer_name),
                get_term_statement("record_number", record_number)),
            "_source": False,
            "size": 1
        }
        while True:
            try:
                rsp = self.es.search(body=query,
                                     index=ElasticConfig.event_log_index,
                                     doc_type=ElasticConfig.event_log_doc_type,
                                     request_timeout=100)
                if rsp.get("error"):
                    logger.error(rsp.get("error").get("reason"))
                    break
                if len(rsp["hits"]["hits"]) > 0:
                    return rsp["hits"]["hits"][0]["_id"]
                time.sleep(2)
                # poll at most 10 times, i.e. 2 * 10 = 20 seconds
                if count == 10:
                    break
                count += 1
            except Exception as e:
                logger.error("es wait_log_in_database search error: " + str(e))
                break

    def multi_search(self, body, index, doc_type):
        try:
            rsp = self.es.msearch(body=body,
                                  index=index,
                                  doc_type=doc_type,
                                  request_timeout=100)
            if rsp.get("error"):
                logger.error(rsp.get("error").get("reason"))
                return
            return rsp
        except Exception as e:
            logger.error("es msearch error: " + str(e))
Exemple #14
0
def create_app(config, enable_config_file=False):
    """
    创建应用
    :param config: 配置信息对象
    :param enable_config_file: 是否允许运行环境中的配置文件覆盖已加载的配置信息
    :return: 应用
    """
    app = create_flask_app(config, enable_config_file)

    # Create the Snowflake ID worker
    from utils.snowflake.id_worker import IdWorker
    app.id_worker = IdWorker(app.config['DATACENTER_ID'],
                             app.config['WORKER_ID'], app.config['SEQUENCE'])

    # Rate limiter
    from utils.limiter import limiter as lmt
    lmt.init_app(app)

    # Configure logging
    from utils.logging import create_logger
    create_logger(app)

    # Register URL converters
    from utils.converters import register_converters
    register_converters(app)

    from redis.sentinel import Sentinel
    _sentinel = Sentinel(app.config['REDIS_SENTINELS'])
    app.redis_master = _sentinel.master_for(
        app.config['REDIS_SENTINEL_SERVICE_NAME'])
    app.redis_slave = _sentinel.slave_for(
        app.config['REDIS_SENTINEL_SERVICE_NAME'])

    from rediscluster import StrictRedisCluster
    app.redis_cluster = StrictRedisCluster(
        startup_nodes=app.config['REDIS_CLUSTER'])

    # rpc
    app.rpc_reco_channel = grpc.insecure_channel(app.config['RPC'].RECOMMEND)
    app.rpc_reco = app.rpc_reco_channel

    # Elasticsearch
    app.es = Elasticsearch(
        app.config['ES'],
        # sniff before doing anything
        sniff_on_start=True,
        # refresh nodes after a node fails to respond
        sniff_on_connection_fail=True,
        # and also every 60 seconds
        sniffer_timeout=60)

    # socket.io
    # Through the sio_mgr object, instant-push tasks can be published; the socketio server takes them from rabbitmq and pushes the messages
    app.sio_mgr = socketio.KombuManager(app.config['RABBITMQ'],
                                        write_only=True)

    # Initialize the MySQL database connection
    from models import db

    db.init_app(app)

    # Create the APScheduler background scheduler
    executors = {'default': ThreadPoolExecutor(10)}

    app.scheduler = BackgroundScheduler(executors=executors)

    # 添加"静态的"定时任务
    from .schedule.statistic import fix_statistics
    # app.scheduler.add_job(fix_statistics, 'date', args=[app])
    app.scheduler.add_job(fix_statistics, 'cron', hour=3, args=[app])

    # Start the scheduler
    app.scheduler.start()

    # Deprecated: registering these error handlers has no effect with flask-restful
    # from utils.error_handlers import handle_redis_error, handler_mysql_error
    # app.register_error_handler(RedisError, handle_redis_error)
    # app.register_error_handler(SQLAlchemyError, handler_mysql_error)

    # Add request hooks
    from utils.middlewares import jwt_authentication
    app.before_request(jwt_authentication)

    # Register the user module blueprint
    from .resources.user import user_bp
    app.register_blueprint(user_bp)

    # Register the news module blueprint
    from .resources.news import news_bp
    app.register_blueprint(news_bp)

    # Register the notice module
    from .resources.notice import notice_bp
    app.register_blueprint(notice_bp)

    # Search
    from .resources.search import search_bp
    app.register_blueprint(search_bp)

    return app
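
A hedged usage sketch for the factory above; ProductionConfig and the settings module are hypothetical, since the snippet shows only the factory itself:

# Hypothetical entry point: build the app from a config object and serve it.
from settings import ProductionConfig  # stand-in config module

app = create_app(ProductionConfig, enable_config_file=True)

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8000)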
Exemple #15
0
from elasticsearch5 import Elasticsearch

useIndex = 'tw_user_database_*'
TWEETSINDEX = "tweets_database*"
# host = "192.168.209.113"
# port = "9200"
host = "192.168。8.200"
port = "9201"
es_client = Elasticsearch([{"host": host, "port": port}])
info = es_client.info()

userid = "25073877"
body = {"query": {"match": {"user.id": userid}}}

rs = es_client.search(index=TWEETSINDEX, body=body)
print(rs)
print(type(rs))
Exemple #16
0
from elasticsearch5 import Elasticsearch
from .constants import Constants
from .text import get_file_list
import json

es = Elasticsearch()


def create_index():
    """Create new index
    """
    es.indices.create(Constants.INDEX_NAME, body=get_es_script('index_create'))


def delete_index():
    """Delete existing index
    """
    es.indices.delete(index=Constants.INDEX_NAME, ignore=[400, 404])


def get_es_script(script_name):
    """Read es json file
    return dictionary of the body
    """
    with open(Constants.ES_SCRIPTS_PATH + script_name + '.json') as s:
        body = json.load(s)
    return body


def get_doc_count():
    """Retrieve document count from ES
    """
    # assumed completion: the source listing truncates the snippet here
    return es.count(index=Constants.INDEX_NAME)['count']
Exemple #17
0
def create_app(config, enable_config_file=False):
    """
    创建应用
    :param config: 配置信息对象
    :param enable_config_file: 是否允许运行环境中的配置文件覆盖已加载的配置信息
    :return: 应用
    """
    app = create_flask_app(config, enable_config_file)

    # Create the Snowflake ID worker
    from utils.snowflake.id_worker import IdWorker
    app.id_worker = IdWorker(app.config['DATACENTER_ID'],
                             app.config['WORKER_ID'], app.config['SEQUENCE'])

    # To generate a distributed ID inside a view:
    # id = current_app.id_worker.get_id()

    # Rate limiter
    from utils.limiter import limiter as lmt
    lmt.init_app(app)

    # Configure logging
    from utils.logging import create_logger
    create_logger(app)

    # Register URL converters
    from utils.converters import register_converters
    register_converters(app)

    from redis.sentinel import Sentinel
    _sentinel = Sentinel(app.config['REDIS_SENTINELS'])
    app.redis_master = _sentinel.master_for(
        app.config['REDIS_SENTINEL_SERVICE_NAME'])
    app.redis_slave = _sentinel.slave_for(
        app.config['REDIS_SENTINEL_SERVICE_NAME'])

    from rediscluster import StrictRedisCluster
    app.redis_cluster = StrictRedisCluster(
        startup_nodes=app.config['REDIS_CLUSTER'])

    # In views:
    # current_app.redis_master.set()
    # current_app.redis_cluster.get()

    # rpc
    app.rpc_reco = grpc.insecure_channel(app.config['RPC'].RECOMMEND)

    # Elasticsearch
    app.es = Elasticsearch(
        app.config['ES'],
        # sniff before doing anything
        sniff_on_start=True,
        # refresh nodes after a node fails to respond
        sniff_on_connection_fail=True,
        # and also every 60 seconds
        sniffer_timeout=60)

    # socket.io
    # app.sio = socketio.KombuManager(app.config['RABBITMQ'], write_only=True)

    # Initialize the MySQL database connection
    from models import db

    db.init_app(app)

    # db = SQLAlchmey(app)

    # db = SQLAlchemy()
    # db.init_app(app_

    # Create the scheduler object
    # The scheduler is saved on the flask app object so that, whenever a view
    # needs a new scheduled task, it can add one dynamically via current_app.scheduler.add_job()

    executors = {
        'default': ThreadPoolExecutor(10),
    }

    app.scheduler = BackgroundScheduler(executors=executors)

    # Jobs added here are independent of the views; they are fixed when the program starts
    from .schedule import statistic
    # runs every day at 3 a.m.
    app.scheduler.add_job(statistic.fix_statistics, 'cron', hour=3, args=[app])

    # for easier testing, run immediately
    # app.scheduler.add_job(statistic.fix_statistics, 'date', args=[app])

    # app.scheduler.add_job()

    app.scheduler.start()

    # Add request hooks
    from utils.middlewares import jwt_authentication
    app.before_request(jwt_authentication)

    # Register the user module blueprint
    from .resources.user import user_bp
    app.register_blueprint(user_bp)

    # Register the news module blueprint
    from .resources.news import news_bp
    app.register_blueprint(news_bp)

    # Register the notice module
    from .resources.notice import notice_bp
    app.register_blueprint(notice_bp)

    # Search
    from .resources.search import search_bp
    app.register_blueprint(search_bp)

    return app
Exemple #18
0
import os
import time
import xmltodict
from elasticsearch5 import Elasticsearch

es = Elasticsearch()
#Set the path for the XML files
path = "Parsed files/"
#Set the path for the queries txt
Qfile = "testingQueries.txt"
counter = 0
mode = "other"
#Set mode to 'all' if you want to read the XML files, turn them into dictionaries and upload them to elasticsearch
#Set the mode to 'other' if you have already uploaded the data and you want to do the queries part only
if mode == "all":
    #Creates the index form
    #Sets the english analyzer to elasticsearch before inserting the data
    es.indices.create(index='test',
                      ignore=400,
                      body={
                          'mappings': {
                              'project': {
                                  'properties': {
                                      'rcn': {
                                          'type': 'integer'
                                      },
                                      'acronym': {
                                          'type': 'string'
                                      },
                                      'text': {
                                          'type': 'string',
Exemple #19
0
from acmappings import mappings

from elasticsearch5 import Elasticsearch
from elasticsearch5 import helpers
import os

esUrl = os.environ['esurl']
esPort = os.environ['esport']
esPass = os.environ['espass']
esUser = os.environ['esuser']
aliases = os.environ['aliases']
indices = os.environ['indices']

esObj = Elasticsearch([{"host":esUrl,"port":esPort}],http_auth=(esUser,esPass))

version = ""
oldversion = ""

with open("sacindexcreator_count","r") as file:
  version,oldversion = file.readline().split(",")
  version = str(int(version) + 1)
  oldversion = str(int(oldversion) + 1)

locales = {"da":"danish_rebuilt","hr":"standard","pl":"standard","sl":"standard","el":"greek_rebuilt","ja":"cjk_rebuilt","ko":"cjk_rebuilt","ar": "arabic_rebuilt","de": "german_rebuilt","zh":"cjk_rebuilt","id": "indonesian_rebuilt","th": "thai_rebuilt","sv": "swedish_rebuilt","tr": "turkish_rebuilt","ru": "russian_rebuilt","pt": "portuguese_rebuilt","br": "brazilian_rebuilt","it": "italian_rebuilt","hu": "hungarian_rebuilt","nl": "dutch_rebuilt","no": "norwegian_rebuilt","es": "spanish_rebuilt","fr": "french_rebuilt","fr_ca": "french_rebuilt","cz": "czech_rebuilt","en": "english_rebuilt"}

if indices == "yes":
  for locale in locales:
      tempmapping = mappings
      for item in tempmapping["mappings"]["autocomplete"]["properties"]:
          if tempmapping["mappings"]["autocomplete"]["properties"][item].get("type",False):
              if tempmapping["mappings"]["autocomplete"]["properties"][item]["type"] == "text":
Exemple #20
0
from elasticsearch5 import Elasticsearch  # use the client class from the version-matched module

# addresses of the elasticsearch cluster nodes
ES = ['127.0.0.1:9200']

# create the elasticsearch client
es = Elasticsearch(
    ES,
    # sniff the cluster before doing anything
    sniff_on_start=True,
    # refresh node info when a connection to a node fails
    sniff_on_connection_fail=True,
    # refresh node info every 60 seconds
    sniffer_timeout=60)


def search():
    search_name = input("Enter the keyword to search for: ")

    query = {
        "from": 0,
        "size": 10000,  # from + size must be less than or equal to: [10000]
        "query": {
            "bool": {
                # match on title or author
                "should": [{
                    "match_phrase": {
                        "title": search_name
                    }
                }, {
                    "match_phrase": {
Exemple #21
0
import pandas as pd

from elasticsearch5 import Elasticsearch

#pid = 2337

es = Elasticsearch(hosts=ES_HOST)
count = es.count(index="prd_review")['count']


def get_mtermvectors(ids):
    body = dict()
    body["ids"] = ids
    body["parameters"] = {"fields": ["title"]}

    res = es.mtermvectors(index='prd_review', doc_type='_doc', body=body)['docs']
    return res


def get_termvectors(id):
    res = es.termvectors(index='prd_review', doc_type='_doc', id=id)['term_vectors']
    if 'title' in res.keys():
        return res
    else:
        return None


def sort_terms_vector(term_vectors):
    if not term_vectors:
        return None
    term_dict = {}
Exemple #22
0
def create_app(config, enable_config_file=False):
    """
    创建应用
    :param config: 配置信息对象
    :param enable_config_file: 是否允许运行环境中的配置文件覆盖已加载的配置信息
    :return: 应用
    """
    app = create_flask_app(config, enable_config_file)

    # Create the Snowflake ID worker
    from utils.snowflake.id_worker import IdWorker
    app.id_worker = IdWorker(app.config['DATACENTER_ID'],
                             app.config['WORKER_ID'], app.config['SEQUENCE'])

    # Rate limiter
    from utils.limiter import limiter as lmt
    lmt.init_app(app)

    # Configure logging
    from utils.logging import create_logger
    create_logger(app)

    # Register URL converters
    from utils.converters import register_converters
    register_converters(app)

    from redis.sentinel import Sentinel
    _sentinel = Sentinel(app.config['REDIS_SENTINELS'])
    app.redis_master = _sentinel.master_for(
        app.config['REDIS_SENTINEL_SERVICE_NAME'])
    app.redis_slave = _sentinel.slave_for(
        app.config['REDIS_SENTINEL_SERVICE_NAME'])

    from rediscluster import StrictRedisCluster
    app.redis_cluster = StrictRedisCluster(
        startup_nodes=app.config['REDIS_CLUSTER'])

    # rpc
    app.rpc_reco = grpc.insecure_channel(app.config['RPC'].RECOMMEND)

    # Elasticsearch
    app.es = Elasticsearch(
        app.config['ES'],
        # sniff before doing anything
        sniff_on_start=True,
        # refresh nodes after a node fails to respond
        sniff_on_connection_fail=True,
        # and also every 60 seconds
        sniffer_timeout=60)

    # socket.io
    # app.sio = socketio.KombuManager(app.config['RABBITMQ'], write_only=True)

    # Initialize the MySQL database connection
    from models import db

    db.init_app(app)

    # Add request hooks
    from utils.middlewares import jwt_authentication
    app.before_request(jwt_authentication)

    # Register the user module blueprint
    from .resources.user import user_bp
    app.register_blueprint(user_bp)

    # Register the news module blueprint
    from .resources.news import news_bp
    app.register_blueprint(news_bp)

    # Register the notice module
    from .resources.notice import notice_bp
    app.register_blueprint(notice_bp)

    # Search
    from .resources.search import search_bp
    app.register_blueprint(search_bp)

    # Define the APScheduler scheduler object
    # Save it on the flask app so that views can conveniently add new scheduled jobs
    executors = {
        'default': ThreadPoolExecutor(20),
    }
    app.scheduler = BackgroundScheduler(executors=executors)

    # Jobs managed by the scheduler come in two kinds:
    # one kind is fixed up front, e.g. correcting the redis statistics;
    # define those add_job calls here
    # app.scheduler.add_job()

    # Add the job that periodically fixes the statistics
    from .schedulers.statistic import fix_statistics
    # runs every day at 3 a.m.
    # args passes parameters to the job function when the scheduler invokes it
    # app.scheduler.add_job(fix_statistics, 'cron', hour=3, args=[app])

    # for easier testing, run immediately
    app.scheduler.add_job(fix_statistics, 'date', args=[app])

    # The other kind is added dynamically while flask runs, by view functions
    # calling current_app.scheduler.add_job (see the sketch after this snippet)

    app.scheduler.start()  # non-blocking: timing happens in a background thread, so the flask program keeps running

    return app
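
A minimal sketch of the dynamic case described above: a view adds a one-off job at request time. The route, the send_reminder job, and the timing are illustrative assumptions, not part of the original snippet:

# Hypothetical view that schedules a one-off job via the app's scheduler.
from datetime import datetime, timedelta
from flask import current_app

def send_reminder(user_id):
    print('reminding user', user_id)  # stand-in job body

@user_bp.route('/users/<int:user_id>/remind', methods=['POST'])
def schedule_reminder(user_id):
    run_at = datetime.now() + timedelta(minutes=5)
    current_app.scheduler.add_job(send_reminder, 'date',
                                  run_date=run_at, args=[user_id])
    return {'scheduled': True}, 202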
Exemple #23
0
from elasticsearch5 import Elasticsearch
from elasticsearch5 import helpers
import os
import uuid
import datetime

esUrl = os.environ['esurl']
esPort = os.environ['esport']
esPass = os.environ['espass']
esUser = os.environ['esuser']
rootOrg = os.environ['rootOrg']
org = os.environ['org']

esObj = Elasticsearch([{
    "host": esUrl,
    "port": esPort
}],
                      http_auth=(esUser, esPass))

response = esObj.search("mlsearch_*",
                        "searchresources",
                        '''{
        "size":1000,
        "_source":["locale","keywords","catalogPaths","name","sourceName","sourceShortName"]
    }''',
                        scroll="5s")
result_pending = [response]
cnt = 1
indexDocs = []
stData = {}
while result_pending:
Exemple #24
0
def create_app(config, enable_config_file=False):
    """
    创建应用
    :param config: 配置信息对象
    :param enable_config_file: 是否允许运行环境中的配置文件覆盖已加载的配置信息
    :return: 应用
    """
    app = create_flask_app(config, enable_config_file)

    # Create the Snowflake ID worker
    from utils.snowflake.id_worker import IdWorker
    app.id_worker = IdWorker(app.config['DATACENTER_ID'],
                             app.config['WORKER_ID'], app.config['SEQUENCE'])

    # Rate limiter
    from utils.limiter import limiter as lmt
    lmt.init_app(app)

    # Configure logging
    from utils.logging import create_logger
    create_logger(app)

    # Register URL converters
    from utils.converters import register_converters
    register_converters(app)

    from redis.sentinel import Sentinel
    _sentinel = Sentinel(app.config['REDIS_SENTINELS'])
    app.redis_master = _sentinel.master_for(
        app.config['REDIS_SENTINEL_SERVICE_NAME'])
    app.redis_slave = _sentinel.slave_for(
        app.config['REDIS_SENTINEL_SERVICE_NAME'])

    from rediscluster import StrictRedisCluster
    app.redis_cluster = StrictRedisCluster(
        startup_nodes=app.config['REDIS_CLUSTER'])

    # rpc
    # app.rpc_reco = grpc.insecure_channel(app.config['RPC'].RECOMMEND)

    # Elasticsearch
    app.es = Elasticsearch(
        app.config['ES'],
        # sniff before doing anything
        sniff_on_start=True,
        # refresh nodes after a node fails to respond
        sniff_on_connection_fail=True,
        # and also every 60 seconds
        sniffer_timeout=60)

    # socket.io
    # create the socketio helper object that writes task messages to rabbitmq
    app.sio_mgr = socketio.KombuManager(app.config['RABBITMQ'],
                                        write_only=True)

    # Initialize the MySQL database connection
    from models import db

    db.init_app(app)

    # Add request hooks
    from utils.middlewares import jwt_authentication
    app.before_request(jwt_authentication)

    # Register the user module blueprint
    from .resources.user import user_bp
    app.register_blueprint(user_bp)

    # Register the news module blueprint
    from .resources.news import news_bp
    app.register_blueprint(news_bp)

    # Register the notice module
    from .resources.notice import notice_bp
    app.register_blueprint(notice_bp)

    # Search
    from .resources.search import search_bp
    app.register_blueprint(search_bp)

    return app
Exemple #25
0
def create_app(config, enable_config_file=False):
    """
    创建应用
    :param config: 配置信息对象
    :param enable_config_file: 是否允许运行环境中的配置文件覆盖已加载的配置信息
    :return: 应用
    """
    app = create_flask_app(config, enable_config_file)

    # Create the Snowflake ID worker
    from utils.snowflake.id_worker import IdWorker
    app.id_worker = IdWorker(app.config['DATACENTER_ID'],
                             app.config['WORKER_ID'], app.config['SEQUENCE'])

    # Rate limiter
    from utils.limiter import limiter as lmt
    lmt.init_app(app)

    # Configure logging
    from utils.logging import create_logger
    create_logger(app)

    # Register URL converters
    from utils.converters import register_converters
    register_converters(app)

    from redis.sentinel import Sentinel
    _sentinel = Sentinel(app.config['REDIS_SENTINELS'])
    app.redis_master = _sentinel.master_for(
        app.config['REDIS_SENTINEL_SERVICE_NAME'])
    app.redis_slave = _sentinel.slave_for(
        app.config['REDIS_SENTINEL_SERVICE_NAME'])

    from rediscluster import StrictRedisCluster
    app.redis_cluster = StrictRedisCluster(
        startup_nodes=app.config['REDIS_CLUSTER'])

    # rpc

    app.rpc_reco_channel = grpc.insecure_channel(app.config['RPC'].RECOMMEND)

    # app.rpc_reco = grpc.insecure_channel(app.config['RPC'].RECOMMEND)

    # Elasticsearch
    app.es = Elasticsearch(
        app.config['ES'],
        # sniff before doing anything
        sniff_on_start=True,
        # refresh nodes after a node fails to respond
        sniff_on_connection_fail=True,
        # and also every 60 seconds
        sniffer_timeout=60)

    # socket.io
    # app.sio = socketio.KombuManager(app.config['RABBITMQ'], write_only=True)

    # Initialize the MySQL database connection
    from models import db

    db.init_app(app)

    # Add request hooks
    from utils.middleware import jwt_authorization
    app.before_request(jwt_authorization)

    # Add APScheduler scheduled tasks
    from apscheduler.schedulers.background import BackgroundScheduler
    from apscheduler.executors.pool import ThreadPoolExecutor
    # triggers
    from apscheduler.triggers import date, interval, cron
    from toutiao.schedule.statistics import fix_statistics

    # 1. Create the executors object
    executors = {
        # by default, jobs run on pooled threads, at most 10 concurrently
        "default": ThreadPoolExecutor(max_workers=10)
    }

    # 2. Create the scheduler object, configured with the executors
    scheduler = BackgroundScheduler(executors=executors)

    # 2.1 Save the scheduler on the app; anywhere else that needs a dynamic job can call current_app.scheduler.add_job(...)
    app.scheduler = scheduler

    # 3. Add the static job that fixes the statistics
    # app.scheduler.add_job(func=<job function reference>, trigger=<trigger>, args=[...])
    # app.scheduler.add_job(func=fix_statistics, trigger=cron.CronTrigger(hour=4), args=[...])
    # cron trigger: run the job at 4 a.m.
    # app.scheduler.add_job(func=fix_statistics, trigger="cron", hour=4, args=[app])
    app.scheduler.add_job(func=fix_statistics, trigger="date", args=[app])

    # 4. Start the scheduled tasks
    app.scheduler.start()

    # Register the user module blueprint
    from .resources.user import user_bp

    app.register_blueprint(user_bp)

    # Register the news module blueprint
    from .resources.news import news_bp

    app.register_blueprint(news_bp)

    # Register the notice module
    from .resources.notice import notice_bp

    app.register_blueprint(notice_bp)

    # Search
    from .resources.search import search_bp

    app.register_blueprint(search_bp)

    return app
Exemple #26
0
from elasticsearch5 import Elasticsearch
from tools.config import Config
import json

config = Config("./config.yml")
index_name = config.get("index_name")
es_dir = config.get("es_dir")
es = Elasticsearch(timeout=2000)

scroll_id = ""


def get_es_script(script_name):
    """Read es json file
    return dictionary of the body
    """
    with open(es_dir + script_name + '.json') as s:
        body = json.load(s)
    return body


def search(keywords, body=None):
    """ES built-in search command
    parameter: string keyword to search for
    return: list of matched documents
    """
    global scroll_id
    print("Passed: " + keywords)
    if not body and keywords != "":
        body = get_es_script('search')
        body['query']['match']['text'] = keywords
Exemple #27
0
|  $$$$$$   | $$    | $$  \ $$| $$      | $$$$$$/ |  $$$$$$ | $$| $$  \ $$| $$  \ $$  | $$    
 \____  $$  | $$ /$$| $$  | $$| $$      | $$_  $$  \____  $$| $$| $$  | $$| $$  | $$  | $$ /$$
 /$$$$$$$/  |  $$$$/|  $$$$$$/|  $$$$$$$| $$ \  $$ /$$$$$$$/| $$|  $$$$$$$| $$  | $$  |  $$$$/
|_______/    \___/   \______/  \_______/|__/  \__/|_______/ |__/ \____  $$|__/  |__/   \___/  
                                                                 /$$  \ $$                    
                       :) = +$   :( = -$                        |  $$$$$$/                    
                                                                 \______/  v%s
    Join the StockSight website https://stocksight.diskoverspace.com
        \033[0m""" % (color, STOCKSIGHT_VERSION)
        print(banner + '\n')

    if not args.noelasticsearch:
        # create instance of elasticsearch
        es = Elasticsearch(hosts=[{
            'host': elasticsearch_host,
            'port': elasticsearch_port
        }],
                           http_auth=(elasticsearch_user,
                                      elasticsearch_password))

        # set up elasticsearch mappings and create index
        mappings = {
            "mappings": {
                "tweet": {
                    "properties": {
                        "author": {
                            "type": "string",
                            "fields": {
                                "keyword": {
                                    "type": "keyword"
                                }
                            }
Exemple #28
0
 def __init__(self):
     self.es = Elasticsearch(['https://search-el-dev-znz7hdtpcgghjcq4vatwtc3xiu.ap-northeast-2.es.amazonaws.com:443'])
     self.set_service()
     pass
Exemple #29
0
import os
import json
from pathlib import Path

import click
from elasticsearch5 import Elasticsearch, helpers

from utils import convert_message
from config import INDEX_NAME, TYPE_NAME

ELASTICSEARCH_URL = os.environ['ELASTICSEARCH_URL']
es = Elasticsearch([ELASTICSEARCH_URL])


@click.group()
def cmd():
    pass


def parse_file(fname, channel):
    results = []
    for data in json.load(fname.open()):
        data = convert_message(data)
        if data is None:
            continue

        data['channel'] = channel
        results.append(data)
    return results