Example #1
class ESChat:

    def __init__(self, index_name, kb=True):
        self.es = Elasticsearch(http_auth=('elastic', 'elastic123'))
        self.index = index_name

    def multi_search(self, topics, samples=10):
        # limit the query length
        search_arr = []
        for topic in topics:
            search_arr.append({'index': self.index})
            search_arr.append({'query': {'bool': {'should': [{'match': {'utterance': {'query': topic}}}]}}, 'size': samples})
        request = ''
        for each in search_arr:
            request += f'{json.dumps(each)} \n'
        rest = self.es.msearch(body=request)
        return rest

    def multi_search_edge(self, topics, samples=10):
        # limit the query length
        search_arr = []
        for topic1, topic2 in topics:
            search_arr.append({'index': self.index})
            search_arr.append({'query': {'bool': {'must': [{'match': {'utterance': {'query': topic1}}}, {'match': {'utterance': {'query': topic2}}}]}}, 'size': samples})
        request = ''
        for each in search_arr:
            request += f'{json.dumps(each)} \n'
        rest = self.es.msearch(body=request)
        return rest
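A minimal usage sketch for the class above (the index name, topics, and `utterance` contents are assumptions; msearch returns one response per submitted query, in request order):

chat = ESChat('dialogue_index')
result = chat.multi_search(['weather', 'holidays'], samples=5)
for topic, response in zip(['weather', 'holidays'], result['responses']):
    # each response carries its own hit list
    hits = response['hits']['hits']
    print(topic, [hit['_source']['utterance'] for hit in hits])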
Example #2
def start_stop_chr(start, stop, chr):
    es = Elasticsearch()
    request = []
    for i in range(len(chr)):
        req_head = {'index': 'annotations', 'type': 'annotations'}
        req_body = {"from": 0, "size": 100, "query": {
            "bool": {
                "must": [
                    {"match": {"CHROM": chr}}
                ],
                "filter":
                    {"range": {"START": {"gte": start, "lte": stop}}}

            }
        }
                    }
        request.extend([req_head, req_body])
    resp = es.msearch(body=request)
    annotations = []
    start = []
    stop = []
    chr = []
    for i in resp["responses"][0]["hits"]["hits"]:
        annotations.append(i["_source"])
        start.append(i["_source"]["START"])
        stop.append(i["_source"]["STOP"])
        chr.append(i["_source"]["CHROM"])
    return binding_site(start, stop, chr, annotations)
Example #3
def binding_site(start, stop, chr, annotations):
    es = Elasticsearch()
    request = []
    for i in range(len(chr)):
        req_head = {'index': 'bs', 'type': 'bs'}
        req_body = {"from": 0, "size": 1, "query": {
            "bool": {
                "must": [{
                    "range": {"START": {"gte": start[i], "lte": stop[i]}},
                    "range": {"STOP": {"lte": stop[i], "gte": start[i]}}
                }],
                "filter": {
                    "term": {"CHR": chr[i]}
                }
            }
        }
                    }
        request.extend([req_head, req_body])
    resp = es.msearch(body=request)
    bs = []
    bs_start = []
    bs_stop = []
    bs_chr = []
    for i in resp["responses"]:
        if i["hits"]["hits"] != []:
            bs.append(i["hits"]["hits"][0]["_source"])
            bs_start.append(i["hits"]["hits"][0]["_source"]["START"])
            bs_stop.append(i["hits"]["hits"][0]["_source"]["STOP"])
            bs_chr.append(i["hits"]["hits"][0]["_source"]["CHR"])
    return exon_sgrna_peek(bs_start, bs_stop, bs_chr, annotations, bs)
Example #4
    def msearch_protein(self, host, query):
        self.es_client = Elasticsearch(host)
        es = Elasticsearch(host)
        response = es.msearch(body=query, request_timeout=150)
        # use the following when setting filter_path
        #response = es.msearch(body=query_text, request_timeout=150, filter_path=['responses.aggregations.tags.buckets.key','responses.aggregations.tags.buckets.top_tag_hits.hits.hits._score','responses.aggregations.tags.buckets.top_tag_hits.hits.hits._source.name','responses.aggregations.tags.buckets.top_tag_hits.hits.hits._source.normalized_name'])
        return response
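The `query` argument above is assumed to already be an msearch payload: alternating header and body lines, newline-delimited. A hypothetical sketch of building such a payload (the `proteins` index and `name` field are placeholders):

import json

lines = [
    {'index': 'proteins'},
    {'query': {'match': {'name': 'kinase'}}, 'size': 5},
    {'index': 'proteins'},
    {'query': {'match': {'name': 'phosphatase'}}, 'size': 5},
]
payload = ''.join(json.dumps(line) + '\n' for line in lines)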
Example #5
class ElasticsearchService(object):
    def __init__(self, host, port):
        self._es = Elasticsearch([{'host': host, 'port': port}])

    def search(self, *args, **kwargs):
        return self._es.search(*args, **kwargs)

    def create(self, *args, **kwargs):
        return self._es.create(*args, **kwargs)

    def get(self, *args, **kwargs):
        return self._es.get(*args, **kwargs)

    def exists(self, *args, **kwargs):
        return self._es.exists(*args, **kwargs)

    def msearch(self, *args, **kwargs):
        return self._es.msearch(*args, **kwargs)

    def index(self, *args, **kwargs):
        return self._es.index(*args, **kwargs)

    def update(self, *args, **kwargs):
        return self._es.update(*args, **kwargs)

    def delete(self, *args, **kwargs):
        return self._es.delete(*args, **kwargs)

    def put_template(self, *args, **kwargs):
        return self._es.indices.put_template(*args, **kwargs)
Example #6
class ESUtils:
    
    def __init__(self, index_name):
        self.es = Elasticsearch(http_auth=('elastic', 'elastic123'), timeout=30)
        self.index = index_name

    def multi_search(self, msgs, samples=10):
        search_arr = []
        for msg in msgs:
            search_arr.append({'index': self.index})
            # https://elasticsearch.cn/article/132
            search_arr.append({
                'query': {
                    'bool': {
                        'should': [{'match': {'utterance': {'query': msg.replace('[SEP]', '')}}}],
                    }
                },
                "collapse": {
                    "field": "keyword",     
                },
                'size': samples,
            })
        request = ''
        for each in search_arr:
            request += f'{json.dumps(each)} \n'
        rest = self.es.msearch(body=request)
        return rest
Example #7
def detect_entities(entity_set):
    entities = set([])  # using a set so duplicates are not added
    es = Elasticsearch()
    index = "dbpedia_2015_10"

    # TODO include MLM, this will very likely improve the results
    search_array = []
    for entity in entity_set:
        search_array.append({'index': index, "sort": ["_score"]})
        # search_array.append({"query": {"match": {"names": entity}}})
        search_array.append(
            {"query": {
                "multi_match": {
                    "query": entity,
                    "fields": ["names"]
                }
            }})

    request = ''
    for each in search_array:
        request += '%s \n' % json.dumps(each)
    res = es.msearch(body=request, max_concurrent_searches=1000)
    # print(res)
    # # print("Got %d Hits" % res['hits']['total'])
    for response in res['responses']:
        for hit in response['hits']['hits']:
            # print(hit['_source'].keys())
            entities |= set(hit['_source']['names'])
    # print("%d entities related to %s" % (len(entities), text))
    # print(entities)
    return entities
Example #8
def batch_insert(session, org_category, mm_result):

    es_config = ConfigManager().get_setting(key='elasticsearch')

    addresses = es_config['address']
    if not isinstance(addresses, list):
        addresses = [addresses]
    es = Elasticsearch(addresses, port=es_config['port'])

    batch_insert = es.msearch(body=mm_result)['responses']

    if batch_insert:
        try:
            for org_cat, batch_result in zip(org_category, batch_insert):
                if len(batch_result['hits']['hits']) == 0:
                    message = 'No Category Mapping: %s' % ' '.join(
                        str(item) for item in org_cat)
                    logging.getLogger(__name__).error(message)
                else:
                    item_cat = ItemCat(org_cat)
                    intg_id = batch_result['hits']['hits'][0]['_source'][
                        'intg_id']
                    cat_map = CatMap(INTG_ID=intg_id,
                                     UPDATE_TIME=datetime.datetime.now(
                                         timezone('Asia/Seoul')))
                    item_cat.cat_map.append(cat_map)
                    session.add(item_cat)
            session.commit()
        except IntegrityError as e:
            logger.error('Insertion Error %s' % e)
            session.rollback()
    session.close()
Example #9
    def findLogs(self, queryCommand, querySize=1):
        """find方法返回字符串,字符串内容是查询的消息文本"""
        logger.debug("es查询采集部分,收到命令:" + queryCommand + ",采集数量:" +
                     str(querySize))
        es = Elasticsearch(
            [{
                "host": settings.ES_ADDRESS["ip"],
                "port": 80,
                "url_prefix": "elasticsearch"
            }],
            headers={
                "kbn-version": "4.5.4",
                "Host": settings.ES_ADDRESS["host"],
                "User-Agent": "Mozilla/5.0 Gecko/20100101 Firefox/68.0"
            },
            timeout=10,
            http_compress=False)
        # ${queryCommand} in the template must be JSON-encoded (escaped)
        query_data = '''
{"index":["project_app-${today}"],"ignore_unavailable":true}
{"size":${querySize},"sort":[{"@timestamp":{"order":"desc","unmapped_type":"boolean"}}],"query":{"filtered":{"query":{"query_string":{"query":${queryCommand},"analyze_wildcard":true}},"filter":{"bool":{"must":[]}}}},"fields":["message"],"fielddata_fields":["@timestamp"]}
'''
        tpl = Template(query_data)
        return json.dumps(
            es.msearch(
                tpl.substitute(today=time.strftime("%Y.%m.%d"),
                               querySize=querySize,
                               queryCommand=json.dumps(queryCommand))))
Example #10
def query_es_bulk(
    question,
    host,
    port,
    num_hits=25,
    query_field=("text", ),
    highligh_size=400,
    num_highlights=3,
    explain=False,
):
    elastic = Elasticsearch(host, port=port)

    def map_response(response):
        return [{
            "score": x["_score"],
            "hit": {
                "title": x["_source"]["title"],
                "text": x["_source"]["text"],
                "url": x["_source"]["isbn"],
            },
        } for x in response["hits"]["hits"]]

    question_data = question["question"]

    if not question_data["choices"]:
        return None
    index_name = index_mapping[question["info"]["language"].lower()]

    body = []
    for choice in question_data["choices"]:
        body.append({"index": index_name})
        query = " ".join((question_data["stem"], choice["text"]))
        body.append({
            "explain": explain,
            "query": {
                "multi_match": {
                    "query": query,
                    "fields": query_field
                }
            },
            "highlight": {
                "fragment_size": highligh_size,
                "type": "plain",
                "number_of_fragments": num_highlights,
                "fields": {
                    "passage": {}
                },
            },
            "from": 0,
            "size": num_hits,
        })

    responses = elastic.msearch(index=index_name,
                                body=body,
                                request_timeout=60)
    return [map_response(res) for res in responses["responses"]]
Example #11
def es_search(index_name, field_name, search_date):
    
    es = Elasticsearch()
    request = []
    print('Search Date : ', search_date.strftime("%Y-%m-%dT%H:%M:%S"))
    req_head = {'index': index_name}
    req_body = {'query':{'range':{field_name:{'gte': search_date.strftime("%Y-%m-%dT%H:%M:%S") }}}}
    request.extend([req_head, req_body])
    resp = es.msearch(body = request)
    print(resp)
Example #12
def multisearch(es_client: Elasticsearch, search_bodies, index, doc_type):
    search_header = {"index": index, "type": doc_type}
    request = "".join(
        [
            "%s \n" % json.dumps(each)
            for search_body in search_bodies
            for each in [search_header, search_body]
        ]
    )
    out = es_client.msearch(body=request, index=index)
    return out
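A hypothetical call of the helper above (index, doc_type, and field names are placeholders, assuming the usual `from elasticsearch import Elasticsearch` import); each search body is paired with the shared header before being serialized:

bodies = [
    {'query': {'match': {'title': 'salmon'}}, 'size': 3},
    {'query': {'match': {'title': 'pasta'}}, 'size': 3},
]
result = multisearch(Elasticsearch(), bodies, index='recipes', doc_type='_doc')
for response in result['responses']:
    print(response['hits']['hits'])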
Example #13
def compute_rank(search_arr: List[str],
                 purchase_arr: List[List[Dict[str, List[str]]]],
                 rank_num: List[float], rank_den: List[float],
                 es_client: Elasticsearch) -> None:
    """
    Sends queries against Elasticsearch and compares results with what customers
    purchased. Computes the average rank position of where the purchased document falls
    within the retrieved items.

    Args
    ----
      search_arr: List[str]
          Searches made by customers as observed in validation data. We send those
          against Elasticsearch and compare results with purchased data
      purchase_arr: List[List[Dict[str, List[str]]]]
          List of documents that were purchased by customers
      rank_num: List[float]
          Numerator value of the rank equation. Defined as list to emulate a pointer
      rank_den: List[float]
          Denominator value of the rank equation. Defined as list to emulate a pointer
      es_client: Elasticsearch
          Python Elasticsearch client
    """
    idx = 0
    if not search_arr:
        return

    request = os.linesep.join(search_arr)
    response = es_client.msearch(body=request, request_timeout=60)

    for hit in response['responses']:
        docs = [doc['_id'] for doc in hit['hits'].get('hits', [])]

        if not docs or len(docs) < 2:
            continue

        purchased_docs = [
            docs for purch in purchase_arr[idx] for docs in purch['purchased']
        ]
        ranks = np.where(np.in1d(docs, purchased_docs))[0]
        idx += 1

        if ranks.size == 0:
            continue

        rank_num[0] += ranks.sum() / (len(docs) - 1)
        rank_den[0] += ranks.size

    print('rank num: ', rank_num[0])
    print('rank den: ', rank_den[0])
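A toy illustration of the rank arithmetic above, with made-up document ids; np.in1d marks which retrieved positions were purchased, and np.where turns that mask into rank positions:

import numpy as np

docs = ['d1', 'd2', 'd3', 'd4', 'd5']                # retrieved ids, ranked order
purchased_docs = ['d3', 'd5']                        # what the customer bought
ranks = np.where(np.in1d(docs, purchased_docs))[0]   # array([2, 4])
rank_num_contrib = ranks.sum() / (len(docs) - 1)     # (2 + 4) / 4 = 1.5
rank_den_contrib = ranks.size                        # 2
print(rank_num_contrib, rank_den_contrib)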
Example #14
def add_snp(mod, species):
    if mod!=[]:
        if species == "human":
            index = "snp"
            position = "POS"
        else:
            index = "mouse_snp"
            position = "START"
        chr = []
        start = []
        request = []
        for i in mod:
            if i!="!":
                start.append(i['Start'])
                chr.append(i['Chr'])
        for i in range(len(start)):
            req_head = {'index': index, 'type': index}
            req_body = {"size": 1,
                        "query": {
                            "bool": {
                                "must": [
                                    {
                                        "range": {
                                            position: {
                                                "gt": int(start[i])-1, "lt": int(start[i])+1
                                            }
                                        }
                                    },
                                    {
                                        "term": {
                                            "CHR": chr[i]
                                        }
                                    }]
                            }
                        }
                        }
            request.extend([req_head, req_body])
        es = Elasticsearch('https://SECRET/elasticsearch', verify_certs=False, timeout=50,
                           max_retries=10, retry_on_timeout=True)
        resp = es.msearch(body=request)
        response = {}
        for i in range(len(resp['responses'])):
            if resp['responses'][i]['hits']['hits'] != []:
                mod[i]["SNP"] = Markup("<a href='https://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=%s' target='_blank'>%s</a>"%(str(resp['responses'][i]['hits']['hits'][0]['_source']['ID']).lstrip("rs"),resp['responses'][i]['hits']['hits'][0]['_source']['ID']))
            else:
                mod[i]["SNP"] = "-"
        return mod
    else:
        return mod
Example #15
def multi_search(text):
    es = Elasticsearch()
    body = []
    index = {'index': 'recipe_index', 'type': 'recipe_index'}
    search_ings = {'query':
                   {'nested':
                    {'path': 'ingredients', 'query':
                     {'match': {'ingredients.name':
                                {'fuzziness': 1, 'query': text}}}}}}
    search_recipe = {'query': {'multi_match':
                     {'fields': ['title', 'description'],
                      'query': text, 'fuzziness': 1}}}
    body.extend([index, search_ings, index, search_recipe])
    response = es.msearch(body=body)
    return response
Example #16
class ElasticsearchService(object):
    def __init__(self, host, port):
        self._es = Elasticsearch([{'host': host, 'port': port}])

    def search(self, *args, **kwargs):
        return self._es.search(*args, **kwargs)

    def create(self, *args, **kwargs):
        return self._es.create(*args, **kwargs)

    def get(self, *args, **kwargs):
        return self._es.get(*args, **kwargs)

    def exists(self, *args, **kwargs):
        return self._es.exists(*args, **kwargs)

    def msearch(self, *args, **kwargs):
        return self._es.msearch(*args, **kwargs)
Example #17
    def run(self, es_client: Elasticsearch, debug):
        if debug:
            logger.debug(
                'QUERY (for %s):\n%s', self.types[0],
                json.dumps(self.q[self.types[0]], indent=2,
                           ensure_ascii=False))
            if len(self.types) > 1:
                logger.debug(
                    'QUERY (for %s):\n%s', self.types[-1],
                    json.dumps(self.q[self.types[-1]],
                               indent=2,
                               ensure_ascii=False))

        body = ''.join('{}\n{}\n'.format(json.dumps(dict(
            index=index)), json.dumps(self.q[t]))
                       for t, index in zip(self.types, self.indexes)
                       if t in self.filtered_type_names)
        return es_client.msearch(body)
Example #18
    def findErrorLogs(self, job):
        """The find method returns a string; the string contains the text of the matched messages."""
        logger.debug("ES query/collection step, received command: " + job.es_query + ", number to collect: " + str(job.es_query_num))
        es = Elasticsearch(
            [{"host": settings.ES_ADDRESS["ip"], "port": 80, "url_prefix": "elasticsearch"}],
            headers={"kbn-version":"4.5.4","Host":settings.ES_ADDRESS["host"],"User-Agent":"Mozilla/5.0 Gecko/20100101 Firefox/68.0"},
            timeout=10,
            http_compress=False
        )
        # ${queryCommand} in the template must be JSON-encoded (escaped)
        query_data = '''
{"index":["project_err-*"],"ignore_unavailable":true}
{"size":${querySize},"sort":[{"@timestamp":{"order":"desc","unmapped_type":"boolean"}}],"query":{"filtered":{"query":{"query_string":{"analyze_wildcard":true,"query":${queryCommand}}},"filter":{"bool":{"must":[{"range":{"@timestamp":{"gte":${timestampMsStart},"lte":${timestampMsEnd},"format":"epoch_millis"}}}],"must_not":[]}}}},"fields":["*","_source"],"script_fields":{},"fielddata_fields":["@timestamp"]}
'''
        tpl = Template(query_data)
        return json.dumps(es.msearch(tpl.substitute(
            querySize = job.es_query_num,
            timestampMsStart = round((time.time() - float(job.delay_sec) - 2 * 60) * 1000),
            timestampMsEnd = round(time.time() * 1000),
            queryCommand = json.dumps(job.es_query)
        )))
Example #19
class Elasticsearch_service:
    def __init__(self, ip="localhost", timeout=1000, index_name='finger'):
        self.ip = ip
        self.es = Elasticsearch(hosts=ip)
        self.timeout = timeout
        self.index_name = index_name

    def create_index(self):
        request_body = request_create_index()
        try:
            ret = self.es.indices.create(index=self.index_name,
                                         body=request_body,
                                         request_timeout=self.timeout)
        except:
            return False
        return ret['acknowledged']

    def indexing(self, list_json_text):
        pool_index = multiprocessing.Pool()
        request_body = pool_index.map(create_request_minutia, list_json_text)
        try:

            res = helpers.bulk(self.es,
                               request_body,
                               request_timeout=self.timeout)
        except:
            return False
        return True

    def delete_index(self):
        self.es.indices.delete(index=self.index_name)

    def search(self, list_text):
        request_body = request_msearch(list_text)
        try:
            text_return = self.es.msearch(body=request_body,
                                          request_timeout=self.timeout)
            return text_return
        except:
            return False
Example #20
class Search(object):
    def __init__(self):
        self.es = Elasticsearch(hosts, http_compress=True)

    def multi_get(self):
        #health_status = es.cluster.health()
        #print health_status
        #res = es.mget(params)
        #body = {"query":{"term":{}}}

        #number = es.count(body=body)
        index = ["log-2018.03.21"]
        from_ = 0
        # json.dumps renders the index list as valid JSON for the msearch header line
        body = """
            {"index":%(index)s}
            {"query":{"match_all":{}},"from":%(from_)d, "size":%(limit)d}
            """ % dict(index=json.dumps(index), from_=from_, limit=LIMIT)
        res = self.es.msearch(body, doc_type='message')
        total = res['responses'][0]['hits']['total']
        hits = res['responses'][0]['hits']['hits']
        # page through the remaining results, LIMIT documents at a time
        for i in range(total // LIMIT):
            body = """
                {"index":%(index)s}
                {"query":{"match_all":{}},"from":%(from_)d, "size":%(limit)d}
                """ % dict(index=json.dumps(index), from_=LIMIT * (1 + i), limit=LIMIT)
            res = self.es.msearch(body, doc_type='message')
            hits.extend(res['responses'][0]['hits']['hits'])
        return hits

    def _count(self, index=None, item=None, value=None):
        body = {
            "query": {
                "term": {
                    item: value,
                }
            }
        }
        res = self.es.count(index=index, body=body)
        return res['count']
Example #21
    def get(self, request, params, format=None):
        params = params.replace(' & ', ' %26 ')
        params = params.replace(';', '%3B')
        search_options = urllib.parse.parse_qs(params)
        es = Elasticsearch([ELASTICSEARCH_ADDRESS])
        body = es_functions.create_base_query()
        filters = []
        advanced_filters = []
        body['query']['bool']['must'] = es_functions.create_query_string(search_options.get('q'))
        body['sort'] = es_functions.create_sort_query(search_options.get('sort'))

        if search_options.get('date_gte') or search_options.get('date_lte'):
            date_query = es_functions.create_date_query(search_options.get('date_gte'), search_options.get('date_lte'))
            body['query']['bool']['filter']['bool']['filter'].append(date_query)
            filters.append(date_query)

        for field in self.advanced_fields:
            if search_options.get(field):
                adv_filters = es_functions.create_advanced_filters(field, search_options.get(field))
                for filter in adv_filters:
                    body['query']['bool']['filter']['bool']['filter'].append(filter)
                    advanced_filters.append(filter)

        if len(advanced_filters):
            filters.append(advanced_filters)

        for category in self.facet_categories:
            if search_options.get(category):
                facet_filters = es_functions.create_facet_filters(category, search_options.get(category))
                body['query']['bool']['filter']['bool']['must'].append(facet_filters)
                for other_category in self.facet_categories:
                    if other_category != category:
                        body['aggregations'][other_category]['filter']['bool']['must'].append(facet_filters)

        query = es_functions.create_multisearch(body, search_options.get('from'), search_options.get('size'), filters)
        response = es.msearch(body=query)
        data = json.loads(json.dumps(response))
        return Response(data['responses'], status=status.HTTP_200_OK)
Example #22
    def findErrorLogs(self, queryCommand, querySize=1):
        """The find method returns a string; the string contains the text of the matched messages."""
        logger.debug("ES query/collection step, received command: " + queryCommand + ", number to collect: " + str(querySize))
        es = Elasticsearch(
            [{"host": settings.MLES_ADDRESS["ip"], "port": 443, "url_prefix": "elasticsearch"}],
            headers={"kbn-version":"7.5.2","Host":settings.MLES_ADDRESS["host"],"User-Agent":"Mozilla/5.0 Gecko/20100101 Firefox/68.0","Referer":"https://"+settings.XES_ADDRESS["host"]+"/app/kibana"},
            timeout=30,
            http_compress=False,
            use_ssl=True,
            verify_certs=False,
            http_auth=settings.MLES_ADDRESS["http_auth"]
        )
        query_data='''
{"index":"err-prod-*","ignore_unavailable":true}
{"size":${querySize},"query":{"bool":{"filter":[{"match_phrase":{"project":{"query":"${project}"}}},{"range":{"timestamp":{"format":"strict_date_optional_time","gte":"${timestampMsStart}","lte":"${timestampMsEnd}"}}}]}}}
'''
        tpl = Template(query_data)
        return json.dumps(es.msearch(tpl.substitute(
            querySize = querySize,
            timestampMsStart = (datetime.datetime.utcnow() - datetime.timedelta(seconds = int(queryCommand))).isoformat() + 'Z',
            timestampMsEnd = datetime.datetime.utcnow().isoformat() + 'Z',
            project = settings.MLES_ADDRESS["project"]
        )))
Example #23
    def findLogs(self, queryCommand, querySize=1):
        """The find method returns a string; the string contains the text of the matched messages."""
        logger.debug("ES query/collection step, received command: " + queryCommand + ", number to collect: " +
                     str(querySize))
        es = Elasticsearch(
            [{
                "host": settings.XES_ADDRESS["ip"],
                "port": 443,
                "url_prefix": "elasticsearch"
            }],
            headers={
                "kbn-version":
                "6.3.2",
                "Host":
                settings.XES_ADDRESS["host"],
                "User-Agent":
                "Mozilla/5.0 Gecko/20100101 Firefox/68.0",
                "Referer":
                "https://" + settings.XES_ADDRESS["host"] + "/app/kibana"
            },
            timeout=30,
            http_compress=False,
            use_ssl=True,
            verify_certs=False,
            http_auth=settings.XES_ADDRESS["http_auth"])
        # ${queryCommand} in the template must be JSON-encoded (escaped)
        query_data = '''
{"index":"err-prod*","ignore_unavailable":true,"timeout":30000}
{"size":${querySize},"sort":[{"@timestamp":{"order":"desc","unmapped_type":"boolean"}}],"query":{"bool":{"must":[{"match_phrase":{"project":{"query":"ms-order-crm"}}},{"match_phrase":{"env":{"query":"prod"}}},{"range":{"timestamp":{"gte":${timestampMsStart},"lte":${timestampMsEnd},"format":"epoch_millis"}}}]}}}
'''
        tpl = Template(query_data)
        return json.dumps(
            es.msearch(
                tpl.substitute(querySize=querySize,
                               timestampMsStart=round(
                                   (time.time() - float(queryCommand)) * 1000),
                               timestampMsEnd=round(time.time() * 1000))))
Example #24
def transcript_expression(start, stop, chr, species):
    if species=="human":
        index = "human_transcript_exp"
        keys = ["Thyroid", "Testis", "Brain - Anterior cingulate cortex (BA24)", "Skin - Not Sun Exposed (Suprapubic)",
                "Esophagus - Mucosa", "Heart - Atrial Appendage", "Brain - Caudate (basal ganglia)",
                "Esophagus - Muscularis",
                "Brain - Putamen (basal ganglia)", "Small Intestine - Terminal Ileum", "Breast - Mammary Tissue",
                "Cervix - Ectocervix", "Cervix - Endocervix", "Fallopian Tube", "Brain - Cerebellum", "Bladder",
                "Brain - Cerebellar Hemisphere", "Brain - Spinal cord (cervical c_1)", "Artery - Coronary", "Liver",
                "Esophagus - Gastroesophageal Junction", "Brain - Hypothalamus", "Colon - Transverse",
                "Brain - Amygdala",
                "Pancreas", "Adipose - Subcutaneous", "Cells - Leukemia cell line (CML)", "Spleen",
                "Brain - Hippocampus", "Whole Blood", "Brain - Cortex", "Artery - Tibial", "Uterus", "Stomach", "Ovary",
                "Artery - Aorta", "Heart - Left Ventricle", "Kidney - Cortex",
                "Brain - Nucleus accumbens (basalganglia)",
                "Prostate", "Brain - Frontal Cortex (BA9)", "V****a", "Adipose - Visceral (Omentum)", "Adrenal Gland",
                "Lung",
                "Cells - Transformed fibroblasts", "Muscle - Skeletal", "Colon - Sigmoid", "Nerve - Tibial",
                "Brain - Substantia nigra", "Cells - EBV-transformed lymphocytes"]
        return_keys = ['Chr', 'Start', 'Stop', 'Transcript_stable_ID', 'Strand', 'Thyroid', 'Testis', 'Brain - Anterior cingulate cortex (BA24)', 'Skin - Not Sun Exposed (Suprapubic)', 'Esophagus - Mucosa', 'Heart - Atrial Appendage', 'Brain - Caudate (basal ganglia)', 'Esophagus - Muscularis', 'Brain - Putamen (basal ganglia)', 'Small Intestine - Terminal Ileum', 'Breast - Mammary Tissue', 'Cervix - Ectocervix', 'Cervix - Endocervix', 'Fallopian Tube', 'Brain - Cerebellum', 'Bladder', 'Brain - Cerebellar Hemisphere', 'Brain - Spinal cord (cervical c_1)', 'Artery - Coronary', 'Liver', 'Esophagus - Gastroesophageal Junction', 'Brain - Hypothalamus', 'Colon - Transverse', 'Brain - Amygdala', 'Strand', 'Pancreas', 'Adipose - Subcutaneous', 'Cells - Leukemia cell line (CML)', 'Spleen', 'Brain - Hippocampus', 'Whole Blood', 'Brain - Cortex', 'Artery - Tibial', 'Uterus', 'Stomach', 'Ovary', 'Artery - Aorta', 'Heart - Left Ventricle', 'Kidney - Cortex', 'Brain - Nucleus accumbens (basalganglia)', 'Prostate', 'Brain - Frontal Cortex (BA9)', 'V****a', 'Adipose - Visceral (Omentum)', 'Adrenal Gland', 'Lung', 'Cells - Transformed fibroblasts', 'Muscle - Skeletal', 'Colon - Sigmoid', 'Nerve - Tibial', 'Brain - Substantia nigra', 'Cells - EBV-transformed lymphocytes']
    else:
        index = "mouse_transcript_exp"
        keys = ["embryo", "heart", "bone marrow macrophage", "fat pad", "neural tube", "embryonic fibroblast", "brain", "hindbrain", "limb", "stomach", "erythroblast", "midbrain", "kidney", "B cell", "MEL cell line", "testis", "vesicular gland", "G1E", "subcutaneous adipose tissue", "adrenal gland", "gonadal fat pad", "telencephalon", "brown adipose tissue", "placenta", "intestine", "forestomach", "CH12.LX", "ES-Bruce4", "activated regulatory T-cells", "cortical plate", "regulatory T cell", "skeletal muscle tissue", "urinary bladder", "cerebellum", "small intestine", "416B", "NIH3T3", "pancreas", "A20", "Patski", "G1E-ER4", "embryonic facial prominence", "bone marrow", "spleen", "thymus", "splenic B cell", "inflammation-experienced regulatory T-cells", "forebrain", "uterus", "lung", "ovary", "muscle", "olfactory bulb", "liver"]
        return_keys = ['Chr','Start','Stop','Transcript_ID', 'Strand', 'embryo', 'heart', 'neural tube', 'bone marrow macrophage', 'CH12.LX', 'fat pad', 'embryonic fibroblast', 'brain', 'hindbrain', 'limb', 'stomach', 'erythroblast', 'kidney', 'B cell', 'MEL cell line', 'testis', 'vesicular gland', 'ES-Bruce4', 'G1E', 'subcutaneous adipose tissue', 'adrenal gland', 'gonadal fat pad', 'forestomach', 'brown adipose tissue', 'placenta', 'uterus', 'activated regulatory T-cells', 'intestine', 'cortical plate', 'regulatory T cell', 'skeletal muscle tissue', 'urinary bladder', 'embryonic facial prominence', 'small intestine', '416B', 'NIH3T3', 'midbrain', 'pancreas', 'cerebellum', 'Patski', 'G1E-ER4', 'bone marrow', 'spleen', 'thymus', 'A20', 'splenic B cell', 'telencephalon', 'forebrain', 'inflammation-experienced regulatory T-cells', 'lung', 'ovary', 'muscle', 'olfactory bulb', 'liver']

    request = []
    req_head = {'index': index, 'type': index}
    req_body = {
        "size": 9000,
        "query": {
            "bool": {
                "must": {
                    "range": {"Start": {"gte": start, "lte": stop}}
                },
                "filter": {
                    "term": {"Chr": chr}
                }
            }
        }
    }
    request.extend([req_head, req_body])
    es = Elasticsearch('https://SECRET/elasticsearch', verify_certs=False, timeout=30,
                       max_retries=10, retry_on_timeout=True)
    resp = es.msearch(body=request)
    response = []
    expression_list = []
    try:
        for i in range(len(resp['responses'][0]['hits']['hits'])):
            response.append(resp['responses'][0]['hits']['hits'][i]["_source"])
            expression_list.append([])
            temp = resp['responses'][0]['hits']['hits'][i]["_source"]
            for j in keys:
                try:
                    expression_list[i].append(float(temp[j]))
                except:
                    expression_list[i].append(None)
    except:
        response = []
        expression_list = []
    normalized_list = []
    for i in range(len(expression_list)):
        normalized_list.append([])
        for j in expression_list[i]:
            if None not in expression_list[i] and max(expression_list[i])!=0:
                normalized_list[i].append(j/max(expression_list[i]))
            elif None in expression_list[i]:
                    maximum = 0
                    for k in expression_list[i]:
                        if k != None and k > maximum:
                            maximum = k
                    if maximum!=0 and j!=None:
                        normalized_list[i].append(j/maximum)
                    else:
                        normalized_list[i].append(j)
            else:
                normalized_list[i].append(j)
    return response, json.dumps(expression_list), json.dumps(normalized_list), return_keys
Example #25
def mod_function_mouse(start, stop, chr, mod_indices, gene):
    request = []
    if gene!=0:
        for i in mod_indices:
            req_head = {'index': i, 'type': i}
            req_body = {
                "size": 9000,
                "query": {
                    "term": {"Gene": gene}
                }
            }
            request.extend([req_head, req_body])
    else:
        for i in mod_indices:
            req_head = {'index': i, 'type': i}
            req_body = {"size": 9000,
                                "query": {
                                    "bool": {
                                        "must": [{
                                            "range": {"Start": {"gte": start, "lte": stop}},
                                            "range": {"Stop": {"lte": stop, "gte": start}}
                                        }],
                                        "filter": {
                                            "term": {"Chr": chr}
                                        }
                                    }
                                }
                            }

            request.extend([req_head, req_body])
    es = Elasticsearch('https://SECRET/elasticsearch', verify_certs=False, timeout=30,
                       max_retries=10, retry_on_timeout=True)
    resp = es.msearch(body=request)
    response = {}
    for i in range(len(resp["responses"])):
        response[mod_indices[i]] = []
        for j in range(resp["responses"][i]["hits"]["total"]):
            try:
                resp["responses"][i]["hits"]["hits"][j]["_source"]["ENSG_ID"] = Markup("<a href='https://asia.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=%s;r=' target='_blank'>%s</a>"%(resp['responses'][i]['hits']['hits'][j]['_source']['ENSG_ID'],resp['responses'][i]['hits']['hits'][j]['_source']['ENSG_ID']))
                resp["responses"][i]["hits"]["hits"][j]["_source"]["ENST_ID"] = Markup("<a href='https://asia.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=%s;r=' target='_blank'>%s</a>"%(resp['responses'][i]['hits']['hits'][j]['_source']['ENST_ID'],resp['responses'][i]['hits']['hits'][j]['_source']['ENST_ID']))
                if "280771691" in resp["responses"][i]["hits"]["hits"][j]["_source"]["Pubmed_ID"]:
                    resp["responses"][i]["hits"]["hits"][j]["_source"]["Pubmed_ID"] = "28077169"
                if "/" in resp["responses"][i]["hits"]["hits"][j]["_source"]["Tissue"]:
                    resp["responses"][i]["hits"]["hits"][j]["_source"]["Tissue"] = resp["responses"][i]["hits"]["hits"][j]["_source"]["Tissue"].replace("/", ", ")
                resp["responses"][i]["hits"]["hits"][j]["_source"]["Tissue"] = resp["responses"][i]["hits"]["hits"][j]["_source"]["Tissue"].capitalize()

                if "|" in resp["responses"][i]["hits"]["hits"][j]["_source"]["Pubmed_ID"]:
                    ids = resp["responses"][i]["hits"]["hits"][j]["_source"]["Pubmed_ID"]
                    ids = ids.split("|")
                    ids_final = []
                    for id in ids:
                        ids_final.append("<a href='https://www.ncbi.nlm.nih.gov/pubmed/%s' target='_blank'>%s</a>" % (id, id))
                    resp["responses"][i]["hits"]["hits"][j]["_source"]["Pubmed_ID"] = Markup(", ".join(ids_final))
                else:
                    resp["responses"][i]["hits"]["hits"][j]["_source"]["Pubmed_ID"] = Markup("<a href='https://www.ncbi.nlm.nih.gov/pubmed/%s' target='_blank'>%s</a>"%(resp['responses'][i]['hits']['hits'][j]['_source']['Pubmed_ID'],resp['responses'][i]['hits']['hits'][j]['_source']['Pubmed_ID']))
                response[mod_indices[i]].append(resp["responses"][i]["hits"]["hits"][j]["_source"])
            except IndexError:
                response[mod_indices[i]].append("!")
    pool_query = ThreadPool(processes=6)
    response['a_to_i_mouse'] = pool_query.apply_async(add_snp, (response['a_to_i_mouse'], 'mouse'))
    response['m1a_mouse'] = pool_query.apply_async(add_snp, (response['m1a_mouse'], 'mouse'))
    response['m5c_mouse'] = pool_query.apply_async(add_snp, (response['m5c_mouse'], 'mouse'))
    response['m6a_mouse'] = pool_query.apply_async(add_snp, (response['m6a_mouse'], 'mouse'))
    response['nm_mouse'] = pool_query.apply_async(add_snp, (response['nm_mouse'], 'mouse'))
    response['pseudou_mouse'] = pool_query.apply_async(add_snp, (response['pseudou_mouse'], 'mouse'))
    #response['c_to_u_mouse'] = pool_query.apply_async(add_snp, (response['c_to_u_mouse'], 'mouse'))
    response['dihydrouridine_mouse'] = pool_query.apply_async(add_snp, (response['dihydrouridine_mouse'], 'mouse'))
    response['m1g_mouse'] = pool_query.apply_async(add_snp, (response['m1g_mouse'], 'mouse'))
    response['m2g_mouse'] = pool_query.apply_async(add_snp, (response['m2g_mouse'], 'mouse'))
    response['m7g_mouse'] = pool_query.apply_async(add_snp, (response['m7g_mouse'], 'mouse'))
    response['other_mouse'] = pool_query.apply_async(add_snp, (response['other_mouse'], 'mouse'))
    response['t6a_mouse'] = pool_query.apply_async(add_snp, (response['t6a_mouse'], 'mouse'))
    response['a_to_i_mouse'] = response['a_to_i_mouse'].get()
    response['m1a_mouse'] = response['m1a_mouse'].get()
    response['m5c_mouse'] = response['m5c_mouse'].get()
    response['m6a_mouse'] = response['m6a_mouse'].get()
    response['nm_mouse'] = response['nm_mouse'].get()
    response['pseudou_mouse'] = response['pseudou_mouse'].get()
    #response['c_to_u_mouse'] = response['c_to_u_mouse'].get()
    response['dihydrouridine_mouse'] = response['dihydrouridine_mouse'].get()
    response['m1g_mouse'] = response['m1g_mouse'].get()
    response['m2g_mouse'] = response['m2g_mouse'].get()
    response['m7g_mouse'] = response['m7g_mouse'].get()
    response['other_mouse'] = response['other_mouse'].get()
    response['t6a_mouse'] = response['t6a_mouse'].get()
    pool_query.terminate()
    pool_query.close()
    return response['a_to_i_mouse'],response['m1a_mouse'],response['m5c_mouse'],response['m6a_mouse'],response['nm_mouse'],response['pseudou_mouse'], response['dihydrouridine_mouse'], response['m1g_mouse'],response['m2g_mouse'], response['m7g_mouse'], response['other_mouse'], response['t6a_mouse']
Example #26
class ES_DB:

    es_db = None

    # ================================ initializer
    def __init__(self, es_ip, es_port):
        self.es_ip = es_ip
        self.es_port = es_port
        self.es_db = Elasticsearch('http://' + self.es_ip + ':' + self.es_port,
                                   timeout=120)
        #print inspect.getargspec(self.es_db.indices.put_settings())
        # setting

    # ================================ Create Index
    # create new index (case)
    def create_index(self, index_name):
        try:
            self.es_db.indices.create(
                index=index_name,
                body={
                    "mappings": {
                        "dynamic_templates": [{
                            "strings": {
                                "match_mapping_type": "string",
                                "mapping": {
                                    "type": "text",
                                    "fields": {
                                        "keyword": {
                                            "type": "keyword",
                                            "ignore_above": 256
                                        }
                                    },
                                    "copy_to": "catch_all"
                                }
                            }
                        }]
                    },
                    "settings": {
                        "analysis": {
                            "analyzer": {
                                "default": {
                                    "tokenizer": "keyword",
                                    "filter": ["lowercase"]
                                },
                                "default_search": {
                                    "tokenizer": "keyword",
                                    "filter": ["lowercase"]
                                }
                            }
                        }
                    }
                })
            return [True, index_name]
        except Exception as e:
            return [False, "Error: " + str(e)]

    # ================================ Delete Index
    # delete index (case)
    def delete_index(self, index_name):
        try:
            self.es_db.indices.delete(index=index_name)
            return [True, index_name]
        except Exception as e:
            return [False, "Error: " + str(e)]

    # ================================ get max results window
    # get the setting for the maximum number of records that can be retrieved from elasticsearch
    def get_max_result_window(self, indx):
        settings = self.es_db.indices.get_settings(index=indx)
        settings = settings[indx]['settings']['index']
        if "max_result_window" in settings.keys():
            return settings['max_result_window']
        else:
            return 10000  # default value

    # ================================ get max query fields
    # get the query default_field setting (the maximum number of fields a query expands to) from elasticsearch
    def get_max_fields_num(self, indx):
        settings = self.es_db.indices.get_settings(index=indx)
        settings = settings[indx]['settings']['index']
        if "query" in settings.keys():
            if "default_field" in settings['query']:
                return settings['query']['default_field']
        else:
            return 1024  # default value

    # ================================ bulk query
    # this expects an index and bodies (a list of single request bodies)
    def multiqueries(self, index, bodies):
        request_header = json.dumps({'index': index})
        requests = []
        for b in bodies:
            b["track_total_hits"] = True
            requests.extend([request_header, b])
        resp = self.es_db.msearch(body=requests)

        # check if there are failed queries
        for result in resp["responses"]:
            if "error" in result.keys():
                return [False, result["error"]["root_cause"][0]["reason"]]
        return [True, resp['responses']]

    # ================================ query
    # query the elasticsearch db, index is the index name of the case, and body is the query body
    # count: number of times the function recursive
    def query(self, indexname, body, count=3):
        count -= 1

        indexname = indexname.lower()
        body["track_total_hits"] = True
        logger.logger(level=logger.DEBUG,
                      type="elasticsearch",
                      message="Query to index [" + indexname + "]",
                      reason=json.dumps(body))
        filter_path = [
            'hits.hits._source.Data', 'hits.total.value',
            'aggregations.*.buckets'
        ]
        try:
            #search_res = self.es_db.search(index=indexname,body=body , filter_path=filter_path)
            search_res = self.es_db.search(index=indexname, body=body)
            return [True, search_res]
        except elasticsearch.RequestError as e:
            reason = e.info['error']['reason']
            logger.logger(level=logger.WARNING,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [RequestError]",
                          reason=reason)
            # if the problem in shards
            if reason == "all shards failed":
                for shard in e.info['error']['failed_shards']:
                    if 'caused_by' in shard['reason'].keys():
                        shard_reason = shard['reason']['caused_by']['reason']
                    else:
                        shard_reason = shard['reason']['reason']

                    # if the reason is that the field used for key is text and is not sortable, then try it sub-field ".keyword"
                    if shard_reason.startswith(
                            "Text fields are not optimised for operations that require per-document field data like aggregations and sorting, so these operations are disabled by default"
                    ):
                        if "sort" in body.keys():
                            field = list(body['sort'].keys())[0]
                            order = body['sort'][field]['order']
                            body['sort'] = {
                                field + ".keyword": {
                                    'order': order
                                }
                            }

                            logger.logger(
                                level=logger.INFO,
                                type="elasticsearch",
                                message="Query [" + indexname +
                                "], the sort is not a sortable field, try using sub-field .keyword"
                            )
                            return self.query(indexname, body, count)

                    # if the reason is the result has too many fields
                    match = re.match(
                        'field expansion (for \[.*\] )?matches too many fields, limit: ([0-9]+), got: ([0-9]+)',
                        shard_reason)
                    if match is not None:
                        # if the problem is the number of fields more than the default max number of fields in query
                        max_field_num = int(match.groups()[1]) + 100

                        inc = self.es_db.indices.put_settings(
                            index=indexname,
                            body='{ "index" : { "query": { "default_field" : '
                            + str(max_field_num) + '} } }')
                        if inc["acknowledged"]:
                            logger.logger(
                                level=logger.INFO,
                                type="elasticsearch",
                                message="Query [" + indexname +
                                "] max query fields number increased " +
                                str(max_field_num))
                            if count != 0:
                                return self.query(indexname, body, count)
                            else:
                                return [
                                    False,
                                    "exceeded the number of tries to fix the issue, field expansion matches too many fields"
                                ]
                        else:
                            logger.logger(
                                level=logger.ERROR,
                                type="elasticsearch",
                                message="Query [" + indexname +
                                "] Failed increasing the result window")
                            continue

                    # if the result window is too large, increase the window
                    match = re.match(
                        'Result window is too large, from \+ size must be less than or equal to: \[([0-9]+)\] but was \[([0-9]+)\].*',
                        shard_reason)
                    if match is not None:
                        max_result_window = int(match.groups()[1]) + 1000
                        inc = self.es_db.indices.put_settings(
                            index=indexname,
                            body='{ "index" : { "max_result_window" : ' +
                            str(max_result_window) + ' } }')
                        if inc["acknowledged"]:
                            logger.logger(
                                level=logger.INFO,
                                type="elasticsearch",
                                message="Query [" + indexname +
                                "] result window increased to " +
                                str(self.get_max_result_window(indexname)))
                            if count != 0:
                                return self.query(indexname, body, count)
                            else:
                                return [
                                    False,
                                    "exceeded the number of tries to fix the issue, Result window is too large"
                                ]
                        else:
                            logger.logger(
                                level=logger.ERROR,
                                type="elasticsearch",
                                message="Query [" + indexname +
                                "] Failed increasing the result window")
                            continue

                    else:
                        logger.logger(level=logger.ERROR,
                                      type="elasticsearch",
                                      message="Query [" + indexname +
                                      "] failed [RequestError]",
                                      reason=shard_reason)
            else:
                logger.logger(level=logger.ERROR,
                              type="elasticsearch",
                              message="Query [" + indexname +
                              "] failed [RequestError]",
                              reason=json.dumps(e.info))
            res = [False, reason]
        except elasticsearch.ConnectionError as e:
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [ConnectionError]",
                          reason=e.info)
            res = [False, 'Failed to connect to elasticsearch']
        except elasticsearch.TransportError as e:
            reason = str(e)
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [TransportError]",
                          reason=reason)
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [TransportError]",
                          reason=json.dumps(e.info))
            res = [False, reason]
        except elasticsearch.ElasticsearchException as e:
            reason = str(e)
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [ElasticsearchException]",
                          reason=reason)
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [ElasticsearchException]",
                          reason=json.dumps(e.info))
            res = [False, reason]
        except Exception as e:
            print(str(e))
            res = [False, str(e)]
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Query [" + indexname +
                          "] failed [Exception]",
                          reason=str(e))

        return res

    # ================================ get max fields limit
    # get the total_fields.limit from settings
    def get_total_fields_limit(self, indx):
        settings = self.es_db.indices.get_settings(index=indx)
        index_settings = settings[list(settings.keys())[0]]['settings']['index']
        if 'mapping' in index_settings:
            if 'total_fields' in index_settings['mapping']:
                if 'limit' in index_settings['mapping']['total_fields']:
                    return index_settings['mapping']['total_fields']['limit']
        return 1000  # default fields limit

    # ================================ push records to elasticsearch
    # data: is a list of json data
    def bulk_queue_push(self,
                        data,
                        case_id,
                        source=None,
                        machine=None,
                        data_type=None,
                        data_path=None,
                        chunk_size=500):
        case_id = case_id.lower()
        bulk_queue = []
        for d in data:
            di = {
                "_index": case_id,
                "_source": {
                    "Data": d
                },
                '_id': str(uuid.uuid4())
            }
            if source is not None:
                di['_source']['data_source'] = source
            if machine is not None:
                di['_source']['machine'] = machine
            if data_type is not None:
                di['_source']['data_type'] = data_type
            if data_path is not None:
                di['_source']['data_path'] = data_path

            bulk_queue.append(di)
        logger.logger(level=logger.DEBUG,
                      type="elasticsearch",
                      message="Index [" + case_id + "]: Pushing [" +
                      str(len(bulk_queue)) + "] records")

        push_es = self.bulk_to_elasticsearch(bulk_queue, case_id, chunk_size)
        if push_es[0]:
            logger.logger(level=logger.INFO,
                          type="elasticsearch",
                          message="Index [" + case_id + "]: Pushed [" +
                          str(len(bulk_queue) - len(push_es[2])) +
                          "] records successfully")
            return [
                True, "Pushed [" + str(len(bulk_queue)) + "] records",
                push_es[2], push_es[3]
            ]
        else:
            logger.logger(level=logger.ERROR,
                          type="elasticsearch",
                          message="Index [" + case_id +
                          "]: Failed pusheing [" + str(len(bulk_queue)) +
                          "] records",
                          reason=push_es[1])
            return [
                False,
                'Failed to bulk data to Elasticsearch: ' + str(push_es[1]),
                bulk_queue, push_es[3]
            ]

    # ================================ push records to elasticsearch
    # return list of records ids successed or failed
    def bulk_to_elasticsearch(self, bulk_queue, indx, chunk_size):

        try:
            errors = {}  # dictionary of failed data (original data and error info)
            failed = []  # IDs of the failed records
            successed = []  # IDs of the successful records

            logger.logger(level=logger.DEBUG,
                          type="elasticsearch",
                          message="Index [" + indx +
                          "]: bulk push to ES, default chunk[" +
                          str(chunk_size) + "]: ",
                          reason="number of records: " + str(len(bulk_queue)))
            # use helpers to push the data to elasticsearch
            for ok, item in helpers.parallel_bulk(self.es_db,
                                                  bulk_queue,
                                                  chunk_size=chunk_size,
                                                  raise_on_error=False,
                                                  raise_on_exception=False):
                if not ok:
                    errors[item['index']['_id']] = item
                    logger.logger(level=logger.WARNING,
                                  type="elasticsearch",
                                  message="Index [" + indx +
                                  "]: Failed pushing record: ",
                                  reason=str(item))
                    failed.append(item['index']['_id'])
                else:
                    successed.append(item['index']['_id'])

            if len(failed):
                logger.logger(level=logger.WARNING,
                              type="elasticsearch",
                              message="Index [" + indx +
                              "]: Failed pushing [" + str(len(failed)) +
                              "] records, try to fix the issue")
                # get origin data from ID
                for data in bulk_queue:
                    try:
                        errors[data['_id']]['index']['data'] = data['_source']
                        logger.logger(level=logger.DEBUG,
                                      type="elasticsearch",
                                      message="Index [" + indx +
                                      "]: get data for failed record [" +
                                      data['_id'] + "]",
                                      reason=str(errors[data['_id']]))
                    except:
                        # if record not in the errors list, continue
                        continue
                    logger.logger(level=logger.WARNING,
                                  type="elasticsearch",
                                  message="Index [" + indx +
                                  "]: Failed pushing record: ",
                                  reason=str(data['_id']))

                fixed_errors, nonfixed_errors = self.bulk_to_elasticsearch_fix_errors(
                    indx, errors)
                failed = nonfixed_errors
                if len(fixed_errors):
                    logger.logger(
                        level=logger.DEBUG,
                        type="elasticsearch",
                        message="Index [" + indx + "]: fixed issue of [" +
                        str(len(fixed_errors)) + "] records, retry to push it")
                    repush_failed_errors = self.bulk_to_elasticsearch(
                        fixed_errors, indx, chunk_size)
                    if repush_failed_errors[0]:
                        successed += repush_failed_errors[3]
                        failed += repush_failed_errors[2]

            return [
                True, "Pushed [" + str(len(successed)) + "] records to [" +
                indx + "] index", failed, successed
            ]

        # if connection timeout to elasticsearch occurred
        except elasticsearch.exceptions.ConnectionTimeout as e:
            logger.logger(level=logger.WARNING,
                          type="elasticsearch",
                          message="Index [" + indx +
                          "]: Failed to push the records, retry again",
                          reason="Connection to Elasticsearch timeout")
            return self.bulk_to_elasticsearch(bulk_queue, indx, chunk_size)

        except Exception as e:
            logger.logger(
                level=logger.ERROR,
                type="elasticsearch",
                message="Failed pushing the records, unexpected error",
                reason=str(e))

            return [
                False, "Failed pushing [" + str(len(bulk_queue)) +
                "] records to [" + indx + "] index", bulk_queue, []
            ]

    # ================================ fix the errors faced during bulk_to_elasticsearch
    # this receives the failed data from the bulk queue and tries to fix it
    # it returns the lists of fixed records and non-fixed records
    def bulk_to_elasticsearch_fix_errors(self, indx, errors):
        logger.logger(level=logger.WARNING,
                      type="elasticsearch",
                      message="Index [" + indx + "]: Failed pushing [" +
                      str(len(errors)) +
                      "] records [BulkIndexError], retry to fix the issue")

        # check the returned error for each document and try to solve it
        fixed_data = []
        nonfixed_data = []
        limit_fields_increased = False
        for _id, doc in errors.items():

            record_msg_info = "Indx[" + indx + "]"
            if 'machine' in doc['index']['data'].keys():
                record_msg_info += ", machine [" + doc['index']['data'][
                    'machine'] + "]"
            if 'data_type' in doc['index']['data'].keys():
                record_msg_info += ", data_type[" + doc['index']['data'][
                    'data_type'] + "]"
            if '_id' in doc['index'].keys():
                record_msg_info += ", rec_id[" + doc['index']['_id'] + "]"

            try:

                doc_reason = doc['index']['error']['reason']
                logger.logger(level=logger.WARNING,
                              type="elasticsearch",
                              message=record_msg_info + ": record failed",
                              reason=doc_reason)

                # === if the error is the limit on the total number of fields, raise the limit by 1000 and try again
                if "Limit of total fields" in doc_reason and not limit_fields_increased:
                    new_limit = int(self.get_total_fields_limit(indx))
                    new_limit = new_limit + 1000
                    inc = self.es_db.indices.put_settings(
                        index=indx,
                        body='{"index.mapping.total_fields.limit": ' +
                        str(new_limit) + '}')

                    if inc["acknowledged"]:
                        logger.logger(
                            level=logger.INFO,
                            type="elasticsearch",
                            message=record_msg_info +
                            " : The total_fields.limit has been increased to "
                            + str(new_limit))
                        limit_fields_increased = True
                    else:
                        logger.logger(
                            level=logger.ERROR,
                            type="elasticsearch",
                            message=record_msg_info +
                            " : failed to increase total_fields.limit")

                # === if already fixed the limit of total fields issue, then add it to the list
                if "Limit of total fields" in doc_reason and limit_fields_increased:
                    fixed_data.append({
                        "_index": doc['index']['_index'],
                        "_type": doc['index']['_type'],
                        "_id": doc['index']['_id'],
                        "_source": doc['index']['data']
                    })
                    continue

                # if the error is that a text field exceeded the maximum number of characters (32766 by default)
                match = re.match(
                    'Document contains at least one immense term in field="(.+)" \(whose UTF8 encoding is longer than the max length ([0-9]+)\), all of which were skipped.* original message: bytes can be at most ([0-9]+) in length; got ([0-9]+)',
                    doc_reason)
                if match is not None:
                    field = match.groups()[0]
                    current_max = int(match.groups()[1])
                    data_length = int(match.groups()[3])

                    logger.logger(level=logger.ERROR,
                                  type="elasticsearch",
                                  message=record_msg_info +
                                  " : field data more than the specified",
                                  reason="field " + field +
                                  ", defined max length [" + str(current_max) +
                                  "], field data [" + str(data_length) + "]")

                # ==== check if the reason is that the field is mapped as an object but a concrete value was received
                match = re.match(
                    "object mapping for \[(.*)\] tried to parse field \[(.*)\] as (.*), but found a concrete value",
                    doc_reason)
                if match is not None:
                    match = match.groups()
                    failed_field = match[0]

                    # if datatype is object but found concrete value
                    if match[2] == 'object':
                        d = json_get_val_by_path(doc['index']['data'],
                                                 failed_field)

                        if d[0]:
                            # if type of field is object but found "None" as string
                            if d[1] == 'None':

                                if json_update_val_by_path(
                                        doc['index']['data'], failed_field,
                                        None)[0]:

                                    fixed_data.append({
                                        "_index":
                                        doc['index']['_index'],
                                        "_type":
                                        doc['index']['_type'],
                                        "_id":
                                        doc['index']['_id'],
                                        "_source":
                                        doc['index']['data']
                                    })
                                    continue

                            # if type of field is object but found string
                            if isinstance(d[1], str):
                                if json_update_val_by_path(
                                        doc['index']['data'], failed_field,
                                    {'value': d[1]})[0]:

                                    fixed_data.append({
                                        "_index":
                                        doc['index']['_index'],
                                        "_type":
                                        doc['index']['_type'],
                                        "_id":
                                        doc['index']['_id'],
                                        "_source":
                                        doc['index']['data']
                                    })
                                    continue

                # ==== failed to parse field as date
                match = re.match(
                    "failed to parse field \[(.*)\] of type \[(.*)\] in document with id .*",
                    doc_reason)
                if match is not None:
                    match = match.groups()
                    failed_field = match[0]
                    failed_field_type = match[1]

                    # if the field mapped as date
                    if failed_field_type == 'date':
                        if json_update_val_by_path(doc['index']['data'],
                                                   failed_field,
                                                   '1700-01-01T00:00:00')[0]:
                            fixed_data.append({
                                "_index": doc['index']['_index'],
                                "_type": doc['index']['_type'],
                                "_id": doc['index']['_id'],
                                "_source": doc['index']['data']
                            })
                            continue

                    # if the field mapped as text
                    if failed_field_type == 'text':
                        d = json_get_val_by_path(doc['index']['data'],
                                                 failed_field)
                        if d[0]:
                            d = d[1]
                            try:
                                if isinstance(d, list):
                                    # join the list items into one newline-separated string
                                    res_str = '\n'.join([str(x) for x in d])
                                    if json_update_val_by_path(
                                            doc['index']['data'], failed_field,
                                            res_str)[0]:
                                        fixed_data.append({
                                            "_index":
                                            doc['index']['_index'],
                                            "_type":
                                            doc['index']['_type'],
                                            "_id":
                                            doc['index']['_id'],
                                            "_source":
                                            doc['index']['data']
                                        })
                                        continue
                                elif isinstance(d, dict):
                                    res_str = "\n".join([
                                        str(k) + "=" + str(d[k])
                                        for k in d.keys()
                                    ])
                                    if json_update_val_by_path(
                                            doc['index']['data'], failed_field,
                                            res_str)[0]:
                                        fixed_data.append({
                                            "_index":
                                            doc['index']['_index'],
                                            "_type":
                                            doc['index']['_type'],
                                            "_id":
                                            doc['index']['_id'],
                                            "_source":
                                            doc['index']['data']
                                        })
                                        continue

                            except Exception as e:
                                pass

                logger.logger(level=logger.ERROR,
                              type="elasticsearch",
                              message=record_msg_info +
                              " : No fix found for failed record [" +
                              doc['index']['_id'] + "] data",
                              reason=doc['index']['data'])
                nonfixed_data.append({
                    "_index": doc['index']['_index'],
                    "_type": doc['index']['_type'],
                    "_id": doc['index']['_id'],
                    "_source": doc['index']['data']
                })
            except Exception as e:
                logger.logger(level=logger.ERROR,
                              type="elasticsearch",
                              message=record_msg_info +
                              " : unsuspected error in fixing record issue",
                              reason=str(e))
                nonfixed_data.append({
                    "_index": doc['index']['_index'],
                    "_type": doc['index']['_type'],
                    "_id": doc['index']['_id'],
                    "_source": doc['index']['data']
                })

        return fixed_data, nonfixed_data

    # ================================ update record in elasticsearch
    # update a specific record in elasticsearch
    def update_field(self, data, doc_id, indx):
        try:
            indx = indx.lower()

            up = self.es_db.update(index=indx,
                                   doc_type="_doc",
                                   id=doc_id,
                                   body=data)
            if up['result'] == 'updated':
                return [True, 'updated']
            else:
                return [
                    False,
                    "Index[" + indx + "]: Failed to update the record [" +
                    str(doc_id) + "] : " + str(json.dumps(data))
                ]
        except Exception as e:
            return [False, str(e)]

    # ================================ add tag
    def es_add_tag(self, data, case_id):
        try:
            case_id = case_id.lower()
            ins = self.es_db.index(index=case_id, body=data)
            return [True, ins]
        except Exception as e:
            return [False, str(e)]

    # ================================ get record
    # get specific record by its id
    def get_record_by_id(self, case_id, record_id):
        case_id = case_id.lower()
        try:
            res = self.es_db.get(index=case_id, doc_type="_doc", id=record_id)
            return [True, res]
        except Exception as e:
            return [False, str(e)]

    # ================================ Delete record
    # delete records by id
    def del_record_by_id(self, case_id, record_id):
        case_id = case_id.lower()
        try:
            res = self.es_db.delete(index=case_id,
                                    doc_type="_doc",
                                    id=record_id)
            if res['result'] == 'deleted':
                return [True, 'deleted']
            else:
                return [
                    False, "Index[" + case_id +
                    "]: Failed to delete the record [" + str(record_id) + "]"
                ]
        except elasticsearch.NotFoundError as e:
            return [
                False, "NotFound: [" + case_id + "] _id[" + record_id + "]"
            ]
        except Exception as e:
            return [False, str(e)]

    # ================================ Delete record
    # delete records by query
    def del_record_by_query(self, case_id, query):
        case_id = case_id.lower()

        try:
            res = self.es_db.delete_by_query(index=case_id, body=query)
            return [
                True, "Indx[" + case_id + "]: Deleted " + json.dumps(query)
            ]
        except Exception as e:
            return [False, str(e)]

    # ================================ Get fields mapping
    # return the fields mapping (all fields and their properties)
    def get_mapping_fields(self, case_id):
        try:
            mapping = self.es_db.indices.get_mapping(index=case_id)

            if 'properties' in mapping[case_id]['mappings'].keys():
                fields = mapping[case_id]['mappings']['properties']
                fields_rec = self.get_mapping_fields_rec(fields)
                if fields_rec[0] == False:
                    return fields_rec
                else:
                    fields_list = fields_rec[1]
            else:
                fields_list = []

            return [True, fields_list]
        except Exception as e:
            return [False, str(e)]

    # recursive function for get_mapping_fields
    def get_mapping_fields_rec(self, fields, current_path=[]):
        fields_list = []
        try:
            for k in fields.keys():
                if 'properties' in fields[k].keys():
                    fields_rec = self.get_mapping_fields_rec(
                        fields[k]['properties'], current_path + [k])
                    if fields_rec[0] == False:
                        return fields_rec
                    else:
                        fields_list += fields_rec[1]
                else:
                    current_path_tmp = '.'.join(current_path)
                    if len(current_path) > 0:
                        current_path_tmp += "."

                    r = {
                        'type':
                        fields[k]['type'],
                        'field_path':
                        current_path_tmp + k,
                        'fields':
                        list(fields[k]['fields'].keys())[0]
                        if 'fields' in fields[k].keys() else ''
                    }
                    fields_list.append(r)
            return [True, fields_list]
        except Exception as e:
            return [False, str(e)]

    # ============================== get System health information
    # return the nodes information
    def get_nodes_info(self):
        try:
            return [True, self.es_db.nodes.info()]
        except Exception as e:
            return [False, str(e)]

    def get_indices_settings(self):
        try:
            return [True, self.es_db.indices.get_settings('*')]
        except Exception as e:
            return [False, str(e)]

    def get_indices_stats(self):
        try:
            return [True, self.es_db.indices.stats('')]
        except Exception as e:
            return [False, str(e)]

    def get_index_count(self, index):
        #print json_beautifier( self.es_db.indices.stats(index) )
        try:
            return [
                True,
                self.es_db.cat.count(index, params={"format": "json"})
            ]
        except Exception as e:
            return [False, str(e)]
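
# A standalone sketch of the parallel_bulk pattern the class above wraps; the
# host, index name and sample records are assumptions, not part of the
# original project. It pushes records and collects the items that failed.
from elasticsearch import Elasticsearch, helpers

def push_records(records, index_name, es_host='localhost'):
    es = Elasticsearch([{'host': es_host, 'port': 9200}])
    # wrap every record under a "Data" field, mirroring bulk_queue_push above
    actions = ({'_index': index_name, '_source': {'Data': r}} for r in records)
    failed = []
    for ok, item in helpers.parallel_bulk(es, actions,
                                          chunk_size=500,
                                          raise_on_error=False,
                                          raise_on_exception=False):
        if not ok:
            failed.append(item)
    return failed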
Exemple #27
0
class ES:
    def __init__(self, model='EC', similarity='BM25'):
        self.model = model
        self.similarity = similarity

        self._settings = self.get_model_settings()
        self._index_name = f'{model}_{similarity}'.lower()

        self._settings['settings'] = getattr(
            self, f'get_{similarity.lower()}_settings')()

        self.es = Elasticsearch(timeout=120)
        #print(self.es.info())

    def get_index(self):
        return self._index_name

    def get_lm_settings(self):
        return {"similarity": {"default": {"type": "LMDirichlet"}}}

    def get_bm25_settings(self):
        return {"similarity": {"default": {"type": "BM25"}}}

    def get_custom_settings(self):
        return {
            "similarity": {
                "default": {
                    "type": "LMDirichlet"
                },
                "custom_LMDirichlet": {
                    "type": "scripted",
                    "script": {
                        "source":
                        """
                            double freq = doc.freq;
                            double PtC = (term.totalTermFreq+1.0)/(field.sumTotalTermFreq+1.0);
                            double tw = Math.log(1.0 + (freq/(2000.0*PtC)));
                            double norm = Math.log(2000.0 / (doc.length + 2000.0));
                            return Math.max((tw + norm), 0.0);
                            """
                    }
                }
            }
        }

    def get_model_settings(self):
        properties = {
            'body': {
                'type': 'text',
                'term_vector': 'yes',
                'analyzer': 'english',
                'similarity':
                'default'  #'custom_lm' if self.similarity == 'Custom' else 'default'
            }
        }
        if self.similarity == 'Custom':
            properties['weight'] = {
                'type': 'float',
                'index': False,
                "store": True
            }

        return {'mappings': {'properties': properties}}

    def reset_index(self):
        if self.es.indices.exists(self._index_name):
            self.es.indices.delete(self._index_name)

        self.es.indices.create(self._index_name, self._settings)

    def data_from_generator(self, doc):
        num_docs = max(1, len(doc) // 10)  # avoid modulo by zero for tiny corpora
        for i, (doc_id, body) in enumerate(doc.items()):
            yield {'_index': self._index_name, '_id': doc_id, '_source': body}
            if i % num_docs == 0:
                print('{}% done'.format((i // num_docs) * 10))

    def _index_EC(self, documents):
        for success, info in parallel_bulk(self.es,
                                           self.data_from_generator(documents),
                                           thread_count=12,
                                           chunk_size=5000,
                                           max_chunk_bytes=104857600,
                                           queue_size=6):
            if not success:
                print('A document failed:', info)

    def _index_TC(self, documents):
        num_docs = max(1, len(documents) // 10)  # avoid modulo by zero for tiny corpora
        for i, (did, body) in enumerate(documents.items()):
            self.es.index(self._index_name, body=body, id=did)
            if i % num_docs == 0:
                print('{}% done'.format((i // num_docs) * 10))

    def reindex(self, doc_body='short', ancestors=False):
        print('Indexing model {} - {}'.format(self.model, self.similarity))
        self.reset_index()
        if self.model == 'EC':
            documents = get_EC_documents(doc_body)
            self._index_EC(documents)
        else:
            documents = get_TC_documents(doc_body, ancestors)
            if self.similarity == 'Custom':
                weights = get_type_weights()
                for t in documents:
                    documents[t]['weight'] = weights.get(t, 1)
            self._index_TC(documents)
            # self._index_TC({k: v for k, v in list(documents.items())[:20]})

    def analyze_query(self, query, field='body'):
        """Analyzes a query with respect to the relevant index.
        
        Arguments:
            query: String of query terms.
            field: The field with respect to which the query is analyzed.
        
        Returns:
            A list of query terms that exist in the specified field among the documents in the index. 
        """
        tokens = self.es.indices.analyze(index=self._index_name,
                                         body={'text': query})['tokens']
        query_terms = []
        for t in sorted(tokens, key=lambda x: x['position']):
            ## Use a boolean query to find at least one document that contains the term.
            hits = self.es.search(index=self._index_name,
                                  body={
                                      'query': {
                                          'match': {
                                              field: t['token']
                                          }
                                      }
                                  },
                                  _source=False,
                                  size=1).get('hits', {}).get('hits', {})
            doc_id = hits[0]['_id'] if len(hits) > 0 else None
            if doc_id is None:
                continue
            query_terms.append(t['token'])
        return query_terms

    def baseline_EC_retrieval(self, queries, k=100):
        """Performs baseline retrival on index.
        """
        ids, body = [], []
        for query in queries:
            if query['category'] != 'resource':
                continue

            q = self.analyze_query(query['question'])
            if not q:
                continue

            ids.append(query['id'])
            body.append({})
            body.append({
                'query': {
                    'match': {
                        'body': ' '.join(q)
                    }
                },
                '_source': False,
                'size': k
            })
        res = self.es.msearch(index=self._index_name, body=body)['responses']

        return {
            qid: [(doc['_id'], doc['_score']) for doc in hits['hits']['hits']]
            for qid, hits in zip(ids, res)
        }

    def baseline_TC_retrieval(self, queries, k=100):
        """Performs baseline retrival on index.
        """
        results = {}
        for query in queries:
            if query['category'] != 'resource':
                continue

            q = self.analyze_query(query['question'])
            if not q:
                continue

            body = []
            for term in q:
                body.append({})
                body.append({
                    'query': {
                        'match': {
                            'body': term
                        }
                    },
                    '_source': False
                })
            res = self.es.msearch(index=self._index_name,
                                  body=body)['responses']

            scores = defaultdict(int)
            for hits in res:
                for doc in hits['hits']['hits']:
                    scores[doc['_id']] += doc['_score']

            results[query['id']] = sorted(scores.items(),
                                          key=lambda x: x[1],
                                          reverse=True)[:k]

        return results

    def load_baseline_results(self, dataset='train', force=False):
        fname = f'top100_{self.model}_{self.similarity}_{dataset}'
        if not force:
            results = load_dict_from_json(fname)
            if results:
                return results

        print('Retrieving from index.')
        queries = load_dict_from_json(f'{dataset}_set_fixed.json')
        if not queries:
            print('Cannot find the dataset.')
            return None

        res = getattr(self, f'baseline_{self.model}_retrieval')(queries)
        save_dict_to_json(res, fname)
        return res

    def get_baseline_EC_scores(self, results, k=100):
        """Aggregates scores from EC index and return ranked types

        Args:
            results (dict): baseline entity retrieval
            k (int, optional): Number of documents to aggregate over. Defaults to 100.

        Returns:
            dict: Type scores
        """
        type_weights = get_type_weights()
        entity_types = get_all_instance_types(True)
        system_output = {}
        for qid, res in results.items():
            scores = defaultdict(int)
            for entity, score in res[:k]:
                for t in entity_types[entity]:
                    scores[t] += score / type_weights[t]

            system_output[qid] = sorted(scores.items(),
                                        key=lambda x: x[1],
                                        reverse=True)
        return system_output

    def get_baseline_TC_scores(self, results, k=None):
        return results

    def generate_baseline_scores(self, dataset='train', k=100, force=False):
        raw_results = self.load_baseline_results(dataset, force)
        return getattr(self, f'get_baseline_{self.model}_scores')(raw_results,
                                                                  k)
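
# A minimal sketch of the msearch body layout used by baseline_EC_retrieval
# above: an empty header dict followed by one request body per query. The
# host, index name and query strings are assumptions.
from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
queries = ['first query', 'second query']
body = []
for q in queries:
    body.append({})  # empty header: fall back to the index passed to msearch()
    body.append({'query': {'match': {'body': q}}, '_source': False, 'size': 10})
responses = es.msearch(index='ec_bm25', body=body)['responses']
for q, resp in zip(queries, responses):
    print(q, [hit['_id'] for hit in resp['hits']['hits']])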
Exemple #28
0
class DB:
    IND = "appsearch"

    def __init__(self, mdb_conn, es_conn, force_delete=False):
        self.db = MongoClient(mdb_conn).main
        self.es = Elasticsearch([{'host': es_conn, 'port': 9200}])
        if self.es.ping():
            print('Elasticsearch connected')
        else:
            print('Elasticsearch not connected')

        self.create_index(force_delete)

    def create_index(self, force):
        if force:
            try:
                self.es.indices.delete(self.IND)
            except:
                pass
        if not self.es.indices.exists(self.IND):
            self.es.indices.create(index=self.IND)
            print('Index created')
            self.es.indices.close(self.IND)
            self.es.indices.put_settings(index=self.IND, body=json.load(open('data/es_settings.json')))
            self.es.indices.put_mapping(index=self.IND, body=json.load(open('data/es_mapping.json')))
            print("Index configured")
            self.es.indices.open(self.IND)
            self.es.indices.refresh(index=self.IND)
            print("Index opened")
            self.update_es()

    def insert(self, data):
        return self.es.index(index=self.IND, body=data)

    def update_es(self):
        res = self.db.apps.find({})
        for i in tqdm(res, total=res.count()):
            i["extid"] = str(i["_id"])
            del i["_id"]
            self.insert(i)
        print("Finishing copying records")

    def execute_search(self, **params):
        def proc(x):
            x["_source"]["_score"] = x["_score"]
            return x["_source"]

        try:
            return list(map(proc, self.es.search(timeout="2s", **params)['hits']['hits']))
        except:
            return []

    def execute_msearch(self, allow_remove_source=True, **params):
        def proc(x):
            if allow_remove_source and len(x["_source"]) == 1:
                return x["_source"].popitem()[1]
            x["_source"]["_score"] = x["_score"]
            return x["_source"]

        try:
            data = self.es.msearch(body=reduce(
                lambda x, y: x + [{"index": params["index"]}, y],
                params["body"],
                [])
            )['responses']
            return reduce(lambda x, y: x + list(map(proc, y['hits']['hits'])), data, [])
        except:
            return []

    def get_pending_review(self):
        return self.db.reviews.find_one()

    def get_pending_app(self):
        return list(self.db.apps.aggregate([{"$sample": {"size": 1}}]))[0]

    def save_review(self, aid, text, type):
        self.db.reviews.insert_one({
            "aid": aid,
            "text": text,
            "type": type,
            "checked": 0
        })

    def approve_review(self, id):
        rev = self.db.reviews.find_one({
            "_id": ObjectId(id)
        })
        if rev['type']:
            self.db.apps.update_one({
                "_id": ObjectId(rev['aid'])
            }, {
                '$push': {
                    'feedbacks': rev['text']
                }
            })
        else:
            self.db.apps.update_one({
                "_id": ObjectId(rev['aid'])
            }, {
                '$push': {
                    'tags': rev['text']
                }
            })
            self.update_es()
        self.db.reviews.delete_one({
            "_id": ObjectId(id)
        })

    def category_search(self, text):
        return self.execute_search(
            index=self.IND,
            body={
                "sort": [
                    {"pos_feedbacks": {"order": "desc"}},
                    "_score"
                ],
                "query": {
                    "match": {
                        "category": text
                    }
                }
            }
        )

    def search(self, txt):
        return self.execute_msearch(index=self.IND, body=[
            {
                "query": {
                    "match": {
                        "title": txt
                    }
                },
            },
            {
                "query": {
                    "match": {
                        "description": txt
                    }
                },
            }
        ])

    def get_ids_for_query(self, txt):
        res = self.execute_msearch(index=self.IND, body=[
            {
                "query": {
                    "match": {
                        "title": txt
                    }
                },
                "_source": ["extid"],
            },
            {
                "query": {
                    "match": {
                        "description": txt
                    }
                },
                "_source": ["extid"],
            }
        ])
        return list(map(lambda x: ObjectId(x), res))

    def combine_tags(self, txt):
        res = self.db.apps.aggregate([
            {
                "$match": {
                    "_id": {
                        "$in": self.get_ids_for_query(txt),
                    }
                }
            },
            {
                "$unwind": "$tags"
            },
            {
                "$group": {
                    "_id": "null",
                    "tags": {
                        "$addToSet": "$tags"
                    }
                }
            }
        ])
        res = list(res)
        if len(res) > 0:
            res = res[0]['tags']
            if '' in res:
                res.remove('')
            return res
        return []

    def query_by_tags(self, txt, tags):
        return list(self.db.apps.aggregate([
            {
                "$match": {
                    "$and": [{
                        "_id": {
                            "$in": self.get_ids_for_query(txt),
                        }
                    }] + ([{
                        "tags": {
                            "$elemMatch": {
                                "$in": tags
                            }
                        }
                    }] if len(tags) > 0 else [])
                }
            },
            {
                "$sort": {
                    "pos_feedbacks": -1
                }
            }
        ]
        ))

    def get_app_by_id(self, x):
        return self.db.apps.find_one({"_id": ObjectId(x) if type(x) == str else x})
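
# A standalone sketch of how execute_msearch above interleaves an index header
# before every request body via functools.reduce; the index name and query
# bodies are assumptions.
from functools import reduce

index = 'appsearch'
bodies = [
    {'query': {'match': {'title': 'calculator'}}},
    {'query': {'match': {'description': 'calculator'}}},
]
payload = reduce(lambda acc, b: acc + [{'index': index}, b], bodies, [])
# payload now alternates header and body dicts, ready for es.msearch(body=payload)
print(payload)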
Exemple #29
0
class MUSEUM:
    def __init__(self, host, port, use_caching=False):
        self.es = Elasticsearch(hosts=host, port=port, timeout=600)
        self.use_caching = use_caching

    def create_index(self,
                     index,
                     module,
                     num_hash=128,
                     use_smallest=False,
                     use_mod=False,
                     use_minmax=False,
                     shards=5,
                     replicas=1,
                     interval=10):
        if index == '':
            raise NotDefinedError("Index parameter is not passed")
        if self.es.indices.exists(index):
            raise AlreadyExistError("index \"{}\" already exists".format(index))

        res = self.es.indices.create(
            index=index,
            body=get_index_template(module, num_hash, use_smallest, use_mod,
                                    use_minmax, shards, replicas, interval))
        return res

    def get_index_info(self, index_name):
        if not self.es.indices.exists(index_name):
            raise NotExistError("Index does not exist")
        index_info = self.es.indices.get_mapping(
            index=index_name)[index_name]['mappings']['_meta']
        index_info['module'] = module_loader(index_info['module_info'])
        return index_info

    def bulk(self,
             index_name,
             target,
             process_count=8,
             batch_size=10000,
             disable_tqdm=False,
             pass_indexed_files=False):
        index_info = self.get_index_info(index_name)

        if type(target) is list or type(target) is set:
            file_list = target
        elif type(target) is str and os.path.isdir(target):
            file_list = walk_directory(target)
        else:
            raise NotADirectoryError("{} is not a directory".format(target))

        pbar = tqdm(total=len(file_list),
                    desc="Bulk index",
                    disable=disable_tqdm)
        for batch_file_list in batch_generator(file_list, batch_size):
            if not pass_indexed_files:
                remain_file_list = batch_file_list
            else:
                remain_file_list = []
                exist_md5_set = self.__check_exists(index_name,
                                                    batch_file_list)
                for file_path in batch_file_list:
                    if not os.path.splitext(
                            os.path.split(file_path)[1])[0] in exist_md5_set:
                        remain_file_list.append(file_path)
                    else:
                        pbar.update(1)

            bulk_body_list = []
            for file_md5, sampled_data, feature_size, file_name in mp_helper(
                    preprocess.action,
                    remain_file_list,
                    process_count,
                    index_info=index_info,
                    use_caching=self.use_caching):
                if sampled_data:
                    bulk_body_list.append(
                        get_bulk_request(file_md5, sampled_data, feature_size,
                                         file_name, index_name))
                pbar.update(1)

            if bulk_body_list:
                self.es.bulk(body=bulk_body_list)
        pbar.close()
        print("Waiting {} sec for index refresh".format(
            index_info["refresh_interval"]))
        time.sleep(int(index_info["refresh_interval"]))

    def search(self, index_name, file_path, limit=1, index_info=None):
        if not index_info:
            index_info = self.get_index_info(index_name)
        _, query_samples, query_feature_size, file_name = preprocess.action(
            file_path, index_info, self.use_caching)

        report = {'query': file_name, 'hits': []}
        if query_samples:
            try:
                response = self.es.search(index=index_name,
                                          body=get_search_body(
                                              query_samples, limit),
                                          search_type='dfs_query_then_fetch')
            except ConnectionTimeout:
                print('Search error detected')
                return report
            report['hits'] = make_report_hits(response, query_samples,
                                              query_feature_size, index_info)
        return report

    def multi_search(self,
                     index_name,
                     target,
                     limit=1,
                     process_count=1,
                     batch_size=100,
                     disable_tqdm=False):
        if type(target) is list or type(target) is set:
            file_list = target
        elif type(target) is str and os.path.isdir(target):
            file_list = walk_directory(target)
        else:
            raise NotADirectoryError("{} is not a directory".format(target))
        index_info = self.get_index_info(index_name)
        pbar = tqdm(total=len(file_list),
                    disable=disable_tqdm,
                    desc="Multiple search")
        for jobs in batch_generator(file_list, batch_size):
            search_data_list = []
            query_samples_list = []
            query_feature_size_list = []
            file_name_list = []
            for _, query_samples, query_feature_size, file_name in mp_helper(
                    preprocess.action,
                    jobs,
                    process_count,
                    index_info=index_info,
                    use_caching=self.use_caching):
                if query_samples:
                    search_data_list.append(
                        get_msearch_request(index_name, query_samples, limit))
                    query_samples_list.append(query_samples)
                    query_feature_size_list.append(query_feature_size)
                    file_name_list.append(file_name)
            report_list = []
            if search_data_list:
                try:
                    resp = self.es.msearch(body="\n".join(search_data_list))
                except ConnectionTimeout:
                    print('Search error detected')
                    continue
                for i, response in enumerate(resp['responses']):
                    report = {
                        'query':
                        file_name_list[i],
                        'hits':
                        make_report_hits(response, query_samples_list[i],
                                         query_feature_size_list[i],
                                         index_info)
                    }
                    report_list.append(report)
            pbar.update(len(report_list))
            yield report_list
        pbar.close()

    def __check_exists(self, index_name, batch_file_list):
        md5_list = [
            os.path.splitext(os.path.split(file_path)[1])[0]
            for file_path in batch_file_list
        ]
        exist_query_list = []
        for md5 in md5_list:
            exist_query_list.append(get_exists_request(index_name, md5))
        responses = self.es.msearch(
            body="\n".join(exist_query_list))['responses']
        exist_md5_set = set()
        for response in responses:
            hits = response['hits']['hits']
            if hits:
                exist_md5_set.add(hits[0]['_id'])
        return exist_md5_set
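
# A standalone sketch of the newline-delimited msearch payload MUSEUM builds
# (a JSON header line plus a JSON body line per request, joined by "\n").
# The host, index name and document ids are assumptions, and the ids query
# below only stands in for the project's get_exists_request() helper.
import json
from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
doc_ids = ['d41d8cd98f00b204e9800998ecf8427e', '9e107d9d372bb6826bd81d3542a419d6']
lines = []
for doc_id in doc_ids:
    lines.append(json.dumps({'index': 'museum_index'}))
    lines.append(json.dumps({'query': {'ids': {'values': [doc_id]}}, 'size': 1}))
responses = es.msearch(body='\n'.join(lines))['responses']
existing = {r['hits']['hits'][0]['_id'] for r in responses if r['hits']['hits']}
print(existing)
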
class RaintankFinder(object):
    __fetch_multi__ = "raintank"

    def __init__(self, config):
        cfg = config.get('raintank', {})
        es = cfg.get('es', {})
        rt = cfg.get('tank', {})
        self.config = {
            "tank": {
               "url": rt.get('url', 'http://localhost:6060')
            },
            "es": {
                "url": es.get('url', 'http://localhost:9200')
            }
        }
        logger.info("initialize RaintankFinder", config=self.config)
        self.es = Elasticsearch([self.config['es']['url']])

    def find_nodes(self, query):
        seen_branches = set()
        #query Elasticsearch for paths
        matches = self.search_series(query)

        for name, metrics in matches['leafs'].items():
            yield RaintankLeafNode(name, RaintankReader(self.config, metrics))
        for branchName in matches['branches']:
            yield BranchNode(branchName)

    def search_series(self, query):
        parts = query.pattern.split(".")
        part_len = len(parts)
        es_query = {
            "bool": {
                "must": [
                ]
            }
        }
        pos = 0
        for p in parts:
            node = "nodes.n%d" % pos
            value = p
            q_type = "term"
            if is_pattern(p):
                q_type = "regexp"
                value = p.replace('*', '.*').replace('{', '(').replace(',', '|').replace('}', ')')

            es_query['bool']['must'].append({q_type: {node: value}})
            pos += 1

        leaf_search_body = {
          "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [
                                { 
                                    "term" : {
                                        "node_count": part_len
                                    }
                                }
                            ],
                            "should": [
                                {
                                    "term": {
                                        "org_id": g.org
                                    }
                                },
                                {
                                    "term": {
                                       "org_id": -1
                                    }
                                }
                            ]
                        }
                    },
                "query": es_query
                }
            }
        }
        leaf_query = json.dumps(leaf_search_body)

        branch_search_body = leaf_search_body  # reusing the same dict is safe here: leaf_query was already serialized above
        branch_search_body["query"]["filtered"]["filter"]["bool"]["must"][0] = {"range": {"node_count": {"gt": part_len}}}
        branch_search_body["aggs"] = {
            "branches" : {
                "terms": {
                    "field": "nodes.n%d" % (part_len - 1),
                    "size": 500
                }
            }
        }
        branch_query = json.dumps(branch_search_body)

        search_body = '{"index": "metric", "type": "metric_index", "size": 500}' + "\n" + leaf_query +"\n"
        search_body += '{"index": "metric", "type": "metric_index", "search_type": "count"}' + "\n" + branch_query + "\n"

        branches = []
        leafs = {}
        with statsd.timer("graphite-api.search_series.es_search.query_duration"):
            ret = self.es.msearch(index="metric", doc_type="metric_index", body=search_body)
            if len(ret['responses'][0]["hits"]["hits"]) > 0:
                for hit in ret['responses'][0]["hits"]["hits"]:
                    leaf = True
                    source = hit['_source']
                    if source['name'] not in leafs:
                        leafs[source['name']] = []
                    leafs[source['name']].append(RaintankMetric(source, leaf))

            if len(ret['responses'][1]['aggregations']['branches']['buckets']) > 0:
                for agg in ret['responses'][1]['aggregations']['branches']['buckets']:
                    branches.append("%s.%s" % (".".join(parts[:-2]), agg['key']))

        return dict(leafs=leafs, branches=branches)

    def fetch_multi(self, nodes, start_time, end_time):
        step = None
        node_ids = {}
        for node in nodes:
            for metric in node.reader.metrics:
                if step is None or metric.interval < step:
                    step = metric.interval

        with statsd.timer("graphite-api.fetch.raintank_query.query_duration"):
            data = self.fetch_from_tank(nodes, start_time, end_time)
        series = {}
        delta = None
        with statsd.timer("graphite-api.fetch.unmarshal_raintank_resp.duration"):

            for path, points in data.items():
                datapoints = []
                next_time = start_time
                
                max_pos = len(points)

                if max_pos == 0:
                    for i in range(int((end_time - start_time) / step)):
                        datapoints.append(None)
                    series[path] = datapoints
                    continue

                pos = 0

                if delta is None:
                    delta = (points[0][1] % start_time) % step
                    # ts[0] is always greater than start_time.
                    if delta == 0:
                        delta = step

                while next_time <= end_time:
                    # check if there are missing values from the end of the time window
                    if pos >= max_pos:
                        datapoints.append(None)
                        next_time += step
                        continue

                    ts = points[pos][1]
                    # read in the metric value.
                    v = points[pos][0]

                    # pad missing points with null.
                    while ts > (next_time + step):
                        datapoints.append(None)
                        next_time += step

                    datapoints.append(v)
                    next_time += step
                    pos += 1
                    if (ts + step) > end_time:
                        break

                series[path] = datapoints

        if delta is None:
            delta = 1
        time_info = (start_time + delta, end_time, step)
        return time_info, series

    def fetch_from_tank(self, nodes, start_time, end_time):
        params = {"render": [], "from": start_time, "to": end_time}
        pathMap = {}
        for node in nodes:
            for metric in node.reader.metrics:
                params['render'].append(metric.id)
                pathMap[metric.id] = metric.name

        url = "%sget" % self.config['tank']['url']
        resp = requests.get(url, params=params)
        logger.debug('fetch_from_tank', url=url, status_code=resp.status_code, body=resp.text)
        dataMap = {}
        for result in resp.json():
            path = pathMap[result['Target']]
            if path in dataMap:
                #we need to merge the datapoints.
                dataMap[path].extend(result['Datapoints'])
                # sort by timestamp
                dataMap[path].sort(key=lambda x: x[1])
            else:
                dataMap[path] = result['Datapoints']
        return dataMap
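
# A standalone sketch of the graphite-pattern-to-query conversion performed in
# search_series above: each dot-separated node becomes a term or regexp clause
# on nodes.nN. The example pattern and the inline is_pattern() stand-in are
# assumptions.
def pattern_to_es_query(pattern):
    must = []
    for pos, part in enumerate(pattern.split('.')):
        if any(c in part for c in '*{},'):  # crude stand-in for is_pattern()
            value = part.replace('*', '.*').replace('{', '(').replace(',', '|').replace('}', ')')
            must.append({'regexp': {'nodes.n%d' % pos: value}})
        else:
            must.append({'term': {'nodes.n%d' % pos: part}})
    return {'bool': {'must': must}}

print(pattern_to_es_query('collectd.*.cpu.{user,system}'))
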
def search_similarity_master_data(similar_row_hash, index_name, doc_type='details'):
    # establish the connection to Elasticsearch
    es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT }])

    # build the msearch query statements
    msearch_dsl = []
    key_index_list = []
    for row_key in similar_row_hash:
        master = similar_row_hash[row_key]
        # configure the index header
        dsl_index = {"index": index_name, "doc_type": doc_type}

        # configure the must conditions
        must_list = []
        for key in master:
            query_name = master[key]
            if query_name == '':
                continue
            must_list.append({
                "match":
                    { key:
                        {"query":query_name}
                    }
                }
            )

        # build the query body
        dsl_query = {
            "query" : {
              "bool":{
                  "must": must_list
              }
            },
            "size" : 1
        }
        msearch_dsl.append(dsl_index)
        msearch_dsl.append(dsl_query)
        key_index_list.append(row_key)

    # batch the queries: one msearch call per 2000 request lines
    similarity_hash = {}
    queue_size = 2000
    max_queue_count = len(msearch_dsl) // queue_size + 1
    for i in range(0, max_queue_count):
        from_index = i * queue_size
        to_index = (i + 1) * queue_size
        queue_dsl = msearch_dsl[from_index:to_index]
        if len(queue_dsl) == 0:
            continue

        # run the msearch query
        responses = es.msearch(body=queue_dsl)
        response_index = 0
        for response in responses['responses']:
            # parse the result of each query
            masters = response['hits']['hits']
            if len(masters) > 0:
                es_master = masters[0]['_source']
                # map back to the originating row sha1 (two DSL lines per query)
                row_sha1 = key_index_list[i * queue_size // 2 + response_index]
                similar_row_hash[row_sha1] = es_master

            response_index = response_index + 1

    return similar_row_hash
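
# A standalone sketch of the batching above: slice the flat header/body list
# into chunks of queue_size lines (i.e. queue_size // 2 searches) and issue
# one msearch call per chunk. The client and payload are assumptions.
def msearch_in_batches(es, msearch_dsl, queue_size=2000):
    responses = []
    for start in range(0, len(msearch_dsl), queue_size):
        chunk = msearch_dsl[start:start + queue_size]
        if chunk:
            responses.extend(es.msearch(body=chunk)['responses'])
    return responses
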
class QueryLogRhythm:
    elastic_misp_mapping = {
        'authentihash': ['hash', 'object'],
        'cdhash': ['hash', 'object'],
        'domain':
        ['domain', 'domainOrigin', 'impactedHostName', 'originHostName'],
        'email-dst': ['recipient'],
        'email-reply-to': ['recipient'],
        'email-src': ['sender'],
        'email-subject': ['subject'],
        'filename': ['object', 'objectName'],
        'impfuzzy': ['hash', 'object'],
        'imphash': ['hash', 'object'],
        'md5': ['hash', 'object'],
        'pehash': ['hash', 'object'],
        'sha1': ['hash', 'object'],
        'sha224': ['hash', 'object'],
        'sha256': ['hash', 'object'],
        'sha384': ['hash', 'object'],
        'sha512': ['hash', 'object'],
        'sha512/224': ['hash', 'object'],
        'sha512/256': ['hash', 'object'],
        'ssdeep': ['hash', 'object'],
        'tlsh': ['hash', 'object'],
        'hassh-md5': ['hash', 'object'],
        'hasshserver-md5': ['hash', 'object'],
        'ja3-fingerprint-md5': ['hash', 'object'],
        'hostname':
        ['domain', 'domainOrigin', 'impactedHostName', 'originHostName'],
        'http-method': ['action', 'command'],
        'port': ['originPort', 'impactedPort'],
        'o-port': ['originPort'],
        'i-port': ['impactedPort'],
        'ip-dst': ['impactedIp', 'impactedIpV6'],
        'ip-src': ['originIp', 'originIpV6'],
        'link': ['url'],
        'mac-address': ['impactedMac', 'originMac'],
        'mime-type': ['object', 'objectName', 'objectType'],
        'mutex':
        ['object', 'parentProcessName', 'parentProcessPath', 'process'],
        'named pipe':
        ['object', 'parentProcessName', 'parentProcessPath', 'process'],
        'regkey': ['object', 'objectName'],
        'target-email': ['recipient'],
        'target-machine':
        ['domain', 'domainOrigin', 'impactedHostName', 'originHostName'],
        'target-user': ['login', 'account'],
        'uri': ['object', 'url', 'objectName'],
        'url': ['url'],
        'user-agent': ['userAgent'],
        'vulnerability': ['CVE', 'object'],
        'windows-scheduled-task':
        ['parentProcessName', 'parentProcessPath', 'process'],
        'windows-service-name':
        ['parentProcessName', 'parentProcessPath', 'process', 'serviceName'],
        'windows-service-displayname':
        ['parentProcessName', 'parentProcessPath', 'process', 'serviceName']
    }

    def __init__(self, elastic_host='localhost', elastic_port=9200):
        self.elastic_host = elastic_host
        self.elastic_port = elastic_port
        self.elastic_client = Elasticsearch([{
            'host': elastic_host,
            'port': elastic_port
        }])

    def build_query(self, parameters):
        if parameters is None:
            return None

        lst_and_qry = list()
        for parameter in parameters:
            data = parameters[parameter]
            terms = self.elastic_misp_mapping[parameter]
            lst_or_qry = list()
            for term in terms:
                if isinstance(data, list):
                    for value in data:
                        qry = term + ':' + str(value)
                        lst_or_qry.append(qry)
                else:
                    qry = term + ': ' + str(data)
                    lst_or_qry.append(qry)
            or_qry = ' OR '.join(lst_or_qry)
            lst_and_qry.append('(' + or_qry + ')')

        and_qry = ' AND '.join(lst_and_qry)
        return and_qry

    def query_ec(self,
                 str_query,
                 q_fields,
                 start_date=0,
                 end_date=0,
                 index='logs-*',
                 doc_type='logs',
                 hours=24,
                 debug=False):
        if start_date > end_date:
            raise Exception(
                'The start_date can\'t be greater than the end_date')

        if start_date == 0 or end_date == 0:
            dt_end_date = datetime.now().timestamp()
            dt_start_date = (datetime.now() -
                             timedelta(hours=hours)).timestamp()
            start_date = int(dt_start_date) * 1000
            end_date = int(dt_end_date) * 1000

        # print(str(start_date) + ' -- ' + str(end_date))

        elastic_qry = ElasticQuery(es=self.elastic_client,
                                   index=index,
                                   doc_type=doc_type)
        elastic_qry.query(
            Query.bool(must=[
                Query.query_string(str_query),
                Query.range('normalDate', gte=start_date, lte=end_date)
            ]))

        elastic_qry.aggregate(
            Aggregate.date_histogram('2', 'normalDate', '12h'))

        my_qry = elastic_qry.dict()
        my_qry['stored_fields'] = q_fields

        search_arr = list()
        header_qry = {"index": ["logs-*"], "ignore_unavailable": True}
        search_arr.append(header_qry)
        search_arr.append(my_qry)

        print('Elastic Query: ' + str(search_arr))
        print(
            '------------------------------------------------------------------------------------'
        )
        print('Lucene Query: ' + str_query)

        request = ''
        for each in search_arr:
            request += '%s \n' % json.dumps(each)

        # print(request)

        resp = self.elastic_client.msearch(body=request)

        if resp is None or len(resp['responses']) <= 0:
            return None
        else:
            response = resp['responses'][0]
            hits_data = list()
            if response['hits']['total'] > 0:
                for hit in response['hits']['hits']:
                    hits_data.append(hit)

        # print(str(hits_data))

        return search_arr, hits_data
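
# A minimal usage sketch of the class above; the host, attribute values and
# stored fields are assumptions, and the Lucene string in the comment is only
# an approximation of build_query's output.
if __name__ == '__main__':
    qlr = QueryLogRhythm(elastic_host='localhost', elastic_port=9200)
    lucene = qlr.build_query({'ip-dst': ['203.0.113.10'],
                              'url': 'http://example.com/x'})
    # roughly: '(impactedIp:203.0.113.10 OR impactedIpV6:203.0.113.10) AND (url: http://example.com/x)'
    search_arr, hits = qlr.query_ec(lucene,
                                    q_fields=['originIp', 'impactedIp', 'url'],
                                    hours=24)
    for hit in hits:
        print(hit['_id'])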