def get_data_from_es(endpoint, index, service, num=20, time=2, query=DEFAULT_QUERY):
    """Get data from elasticsearch using index name."""
    es = Elasticsearch(endpoint, timeout=30)
    query["size"] = num
    query["filter"]["range"]["@timestamp"]["gte"] = "now-" + str(time) + "s"
    query["query"]["match"]["service"] = service

    return es.search(index, body=json.dumps(query), request_timeout=500)
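DEFAULT_QUERY itself is not part of this snippet; judging by the keys the function mutates above, and the near-identical query built in ESStorage.retrieve further down, it presumably looks something like the following sketch (note that, being a mutable default argument, it is modified in place on every call):

# Hypothetical reconstruction of DEFAULT_QUERY -- not the original constant.
DEFAULT_QUERY = {
    "query": {"match": {"service": "journal"}},
    "filter": {"range": {"@timestamp": {"gte": "now-2s", "lte": "now"}}},
    "sort": {"@timestamp": {"order": "desc"}},
    "size": 20,
}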
Example 2
def client():
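    """Lazily create and cache a module-level Elasticsearch client; exits if the cluster is unreachable."""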
    global CLIENT
    if CLIENT is None:
        try:
            CLIENT = Elasticsearch(settings.ELASTIC_URI,
                                   request_timeout=settings.ELASTIC_TIMEOUT,
                                   retry_on_timeout=True,
                                   **settings.ELASTIC_KWARGS)
            logging.getLogger('elasticsearch').setLevel(logging.WARN)
            logging.getLogger('elasticsearch.trace').setLevel(logging.WARN)
            logging.getLogger('urllib3').setLevel(logging.WARN)
            logging.getLogger('requests').setLevel(logging.WARN)
            CLIENT.cluster.health(wait_for_status='yellow')
        except ConnectionError:
            message = (
                'The SEARCH_ENGINE setting is set to "elastic", but there '
                'was a problem starting the elasticsearch interface. Is '
                'elasticsearch running?')
            if settings.SENTRY_DSN:
                try:
                    sentry.log_exception()
                    sentry.log_message(message)
                except AssertionError:  # App has not yet been initialized
                    logger.exception(message)
            else:
                logger.error(message)
            exit(1)
    return CLIENT
Example 4
def get_es_client(es_config):
    """Get ES client."""
    if es_config['version'] == 2:
        from elasticsearch2 import Elasticsearch
        return Elasticsearch(host='localhost', port=9200)
    else:
        raise Exception('unsupported ES version: {}'.format(
            es_config['version']))
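A minimal usage sketch for the factory above; the shape of es_config is assumed from the version check:

es = get_es_client({'version': 2})  # hypothetical config dict; only the 'version' key is inspected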
def process_date(current_date):
    """
    Enriches all articles for the given date from the given index
    :param current_date:
    :param index_name:
    :return:
    """
    total_docs = 0
    start = current_date
    end = current_date + timedelta(days=1)
    query = {
        "query": {
            "constant_score": {
                "filter": {
                    "bool": {
                        "must": [{"range": {
                            "dateHarvested": {"gte": start.strftime("%Y-%m-%d"),
                                              'lte': end.strftime("%Y-%m-%d")}}}
                        ]
                    }
                }
            }
        }
    }
    batch_size = 2000
    scroller = elastic.scroll(
        Elasticsearch(hosts=[ES_HOST], timeout=120, max_retries=10, retry_on_timeout=True),
        index=ES_INDEX,
        body=query,
        scroll='2m',
        clear_scroll=False,
        size=batch_size)
    docs = elastic.scroll_docs_mapped(scroller, mapper)
    for doc_batch in batch(docs, batch_size):
        doc_batch = list(doc_batch)
        suzi_input = [
            {'title': d['title'], 'snip': d['snip']} for d in doc_batch
        ]
        events = score_articles(suzi_input)
        updates = []
        for doc, doc_events in zip(doc_batch, events):
            for company_events in doc_events['events']:
                exploded = doc.copy()
                exploded['company_id'] = company_events['company_id']
                exploded['sdr_scores'] = company_events['scores']
                updates.append(exploded)
        if len(updates) > 0:
            mongo_collection.insert(updates)
        total_docs += len(doc_batch)
        logging.info("docs in day %i" % total_docs)
    return total_docs
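The batch() helper used in the loop above is not defined in this snippet; a minimal chunking generator with the same call signature could look like this (an assumption, not the original implementation):

from itertools import islice

def batch(iterable, size):
    # Hypothetical helper: yield successive lists of at most `size` items.
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, size))
        if not chunk:
            return
        yield chunk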
class ESSearch(object):
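    """Thin search helper over an Elasticsearch index of company documents."""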
    es_client: Elasticsearch
    es_index: str

    def __init__(self, connect=ES_CONNECT, index=ES_INDEX):
        self.es_client = Elasticsearch(connect)
        self.es_index = index

    def search_by_name(self, search: str):
        res = self.es_client.search(
            index=self.es_index,
            doc_type='company',
            body=dict(query=dict(match_phrase=dict(company_name=search))))
        return [company['_source'] for company in res['hits']['hits']]

    def search_by_location(self, search: str):
        res = self.es_client.search(
            index=self.es_index,
            doc_type='company',
            body=dict(query=dict(match_phrase=dict(location=search))))
        return [company['_source'] for company in res['hits']['hits']]

    def search_by_id(self, company_id: str):
        res = self.es_client.search(
            index=self.es_index,
            doc_type='company',
            body=dict(query=dict(match=dict(company_id=company_id))))
        if len(res['hits']['hits']):
            return res['hits']['hits'][0]['_source']
        else:
            return {}

    def search_by_text(self, text: str):
        res = self.es_client.search(
            index=self.es_index,
            doc_type='company',
            body=dict(query=dict(query_string=dict(query=text))))
        return [company['_source'] for company in res['hits']['hits']]
Example 7
import requests
from elasticsearch2 import Elasticsearch
from flask import render_template
from flask import request

from app import app
from app import docSummary

es = Elasticsearch(['https://73efa8624ce5b1aa7b0636a629e2d9f1.us-west-1.aws.found.io:9243/'],
                   http_auth=('admin', 'jfnN6ArBrfnlD6accc0WatAy'),
                   scheme="https")

solr = 'http://*****:*****@35.230.16.178/solr/wiki/select'


@app.route('/')
@app.route('/index')
def index():
    return render_template('index.html', name='index')


@app.route('/query/es/', methods=['GET'])
def query_es():
    search_word = request.args.get('q')
    term = {
        "query": {
            "filtered": {
                "query": {
                    "query_string": {
                        "query": "(" + search_word + ") AND (NOT(#redirect)) AND (NOT(#REDIRECT)) AND (NOT(.*jpg))",
                        "fields": [
Example 8
from datetime import datetime
from dateutil.relativedelta import relativedelta
from elasticsearch import Elasticsearch
import logging

# Set up some logging
logger = logging.getLogger('myapp')
hdlr = logging.FileHandler('./logs/RestConnect-{:%Y.%m.%d}.log'.format(
    datetime.now()))
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.DEBUG)

# Set connection
try:
    es = Elasticsearch(hosts='host',
                       http_auth=('login', 'pass'),
                       port=9200,
                       timeout=600)
    logger.info('Connected: %s', es.info())
except Exception as ex:
    logger.error(ex)


def get_index(index, period, delta):
    if str(period).upper() == 'D':
        date = datetime.now() + relativedelta(days=-int(delta))
        return '{}-{:%Y.%m.%d}'.format(index, date)
    else:
        date = datetime.now() + relativedelta(months=-int(delta))
        return '{}-{:%Y.%m}'.format(index, date)

Example 9
 def _connect(self):
     self.es = Elasticsearch(self.config.storage.ES_ENDPOINT,
                             timeout=60,
                             max_retries=2)
Example 10
class ESStorage(Storage):
    """Elasticsearch storage backend."""

    NAME = "es"
    _MESSAGE_FIELD_NAME = "_source.message"

    def __init__(self, configuration):
        """Initialize Elasticsearch storage backend."""
        super(ESStorage, self).__init__(configuration)
        self.config.storage = ESConfiguration()
        self._connect()

    def _connect(self):
        self.es = Elasticsearch(self.config.storage.ES_ENDPOINT,
                                timeout=60,
                                max_retries=2)

    def _prep_index_name(self, prefix):
        # appends the correct date to the index prefix
        now = datetime.datetime.now()
        date = now.strftime("%Y.%m.%d")
        index = prefix + date
        return index

    def retrieve(self, time_range: int, number_of_entries: int):
        """Retrieve data from ES."""
        index_in = self._prep_index_name(self.config.storage.ES_INPUT_INDEX)

        query = {
            'query': {
                'match': {
                    'service': 'journal'
                }
            },
            "filter": {
                "range": {
                    "@timestamp": {
                        "gte": "now-2s",
                        "lte": "now"
                    }
                }
            },
            'sort': {
                '@timestamp': {
                    'order': 'desc'
                }
            },
            "size": 20
        }

        _LOGGER.info(
            "Reading in max %d log entries in last %d seconds from %s",
            number_of_entries, time_range, self.config.storage.ES_ENDPOINT)

        query['size'] = number_of_entries
        query['filter']['range']['@timestamp']['gte'] = 'now-%ds' % time_range
        query['query']['match']['service'] = self.config.storage.ES_SERVICE

        es_data = self.es.search(index_in, body=json.dumps(query))

        # only use _source sub-dict
        es_data = [x['_source'] for x in es_data['hits']['hits']]
        es_data_normalized = json_normalize(es_data)

        _LOGGER.info("%d logs loaded in from last %d seconds",
                     len(es_data_normalized), time_range)

        self._preprocess(es_data_normalized)

        return es_data_normalized, es_data  # bad solution, this is how Entry objects could come in.

    def store_results(self, data):
        """Store results back to ES."""
        index_out = self._prep_index_name(self.config.storage.ES_TARGET_INDEX)

        actions = [{
            "_index": index_out,
            "_type": "log",
            "_source": data[i]
        } for i in range(len(data))]

        helpers.bulk(self.es, actions, chunk_size=int(len(data) / 4) + 1)
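A minimal usage sketch for the storage backend above, assuming a configuration object compatible with ESConfiguration and its ES_* settings:

storage = ESStorage(configuration)  # `configuration` is a placeholder for the application config
frame, raw_entries = storage.retrieve(time_range=2, number_of_entries=20)
storage.store_results(raw_entries)  # bulk-indexes the entries into the target index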
Example 11
#coding:utf-8
from elasticsearch2 import Elasticsearch
from datetime import datetime

es = Elasticsearch(hosts="10.10.6.6")

es.index(index="keti10_10", doc_type="keti10_10", id=3, body={"bdcdyh": "123", "lx": '1',\
 'postDate':'2017-12-30 12:11:06','qx':'北京','records':2,'uuid':'00123dfad','zl':'北京海淀区'})

#doc=es.get(index="keti10_10", doc_type="keti10_10", id=1)['_source']

#print "doc is %s" % doc

res = es.search(index="keti10_10",
                body={"query": {
                    "match_phrase": {
                        "zl": '北京'
                    }
                }})

for hit in res['hits']['hits']:
    hitmap = hit['_source']
    print "%(zl)s %(postDate)s" % hitmap
class SQLToESImporter(object):
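    """Import the top companies (by worker count) from a Crunchbase MySQL database into an Elasticsearch index."""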
    company_count: int
    companies_select: str
    sql_engine: Engine
    es_client: Elasticsearch
    es_index: str
    insertions: int

    def __init__(self,
                 company_count: int = 100,
                 cb_connect: str = SQL_CONNECT,
                 es_connect: List[Dict] = ES_CONNECT,
                 es_index: str = ES_INDEX):
        self.company_count = company_count
        # prepare company selection: select the top-<limit> companies with the most workers
        self.companies_select = CMPS_SELECT.format(limit=self.company_count)
        # connect to the MySQL Crunchbase database
        self.sql_engine = create_engine(cb_connect)
        # connect to the ES instance
        self.es_client = Elasticsearch(list(es_connect))
        self.es_index = es_index
        self.insertions = 0
        if not self.es_client.ping():
            raise ValueError("ElasticSearch Ping Failed")

    def pull(self) -> Iterator[Dict]:
        companies_result: ResultProxy
        try:
            with self.sql_engine.connect() as conn:
                companies_result = conn.execute(self.companies_select)
                for i, company in enumerate(companies_result):
                    company_events = []
                    events_select = EVENTS_SELECT.format(
                        company_id=company['company_id'])
                    try:
                        events_result = conn.execute(events_select)
                        for event in events_result:
                            try:
                                company_events.append(
                                    dict(event_date=event['event_date'],
                                         event_code=event['event_code'],
                                         event_desc=event['event_desc'],
                                         event_url=event['event_url']))
                            except KeyError:
                                raise
                    except SQLAlchemyError as sq_e:
                        raise
                    try:
                        company_document = dict(
                            company_id=company['company_id'],
                            company_name=company['company_name'],
                            homepage_url=company['homepage_url'],
                            logo_url=company['logo_url'],
                            founded_date=company['founded_date'],
                            country=company['country'],
                            industry=company['industry'],
                            location=company['location'],
                            worker_count=company['worker_count'],
                            events=company_events,
                        )
                    except KeyError:
                        raise
                    yield company_document
        except SQLAlchemyError as sq_e:
            raise

    def push(self, company_document: Dict) -> bool:
        es_result = self.es_client.index(index=self.es_index,
                                         doc_type='company',
                                         id=self.insertions,
                                         body=company_document)
        if es_result['created']:
            self.insertions += 1
            return True
        else:
            return False

    def delete_index(self):
        self.es_client.indices.delete(index=self.es_index, ignore=(400, 404))

    def reimport(self) -> int:
        self.insertions = 0
        self.delete_index()
        for company_document in self.pull():
            self.push(company_document)
        return self.insertions
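A minimal usage sketch for the importer above, assuming the module-level defaults SQL_CONNECT, ES_CONNECT and ES_INDEX are configured:

importer = SQLToESImporter(company_count=100)
inserted = importer.reimport()  # drops the index, re-pulls from MySQL and pushes each company into ES
print('indexed %d company documents' % inserted)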
Example 13
from django.shortcuts import render, render_to_response
from django.http import HttpResponse, HttpResponseRedirect
import requests
import json
import os

from utils import utils
from utils import dbinfo
from database.base import MongoDatabase

from elasticsearch2 import Elasticsearch
from elasticsearch2 import helpers

# ICD_SOURCE = "icd_source.txt"

es = Elasticsearch()
ES_SERVERS = [{'host': 'localhost', 'port': 9200}]
es_client = Elasticsearch(hosts=ES_SERVERS)

mongod = MongoDatabase()

icds = {}  # icds:{zd_lc:[],zd_gb:[]...}


def icd_page(request):
    return render_to_response("match_icd.html", "")


def icd_code_page(request):
    return render_to_response("match_icd_with_code.html", "")
Example 14
                "category": fields["category"],
                "sutra_body": fields["sutra_body"],
            }
        }
        ACTIONS.append(action)

    # bulk-index the accumulated actions
    success, _ = bulk(es, ACTIONS, index=index_name, raise_on_error=True)
    print('Performed %d actions' % success)


#read command line args
def read_args():
    parser = argparse.ArgumentParser(description="Search Elastic Engine")
    parser.add_argument("-i",
                        dest="input_file",
                        action="store",
                        help="input file",
                        required=True)
    #parser.add_argument("-o", dest="output_file", action="store", help="output file", required=True)
    return parser.parse_args()


if __name__ == '__main__':
    args = read_args()
    es = Elasticsearch(hosts=[settings.host + ':' + settings.port],
                       timeout=5000)
    print(json.dumps(mapping))
    set_mapping(es)
    # set_data(es, args.input_file)
Example 15
def main():
    """
    Main function
    """
    # Elasticsearch config
    es_cfg = {
        'input_index': 'kb-clean',
        'dest_index': 'kb-clean-geo',
        'host': 'localhost',
        'port': 9200,
        'timeout': 1000,
        'size': 1000,
        'scroll': '2m',
        'doc_type': 'kb_clean',
        'query_locations': {
            "query": {
                "terms": {
                    "types": [
                        "Location", "Facility", "GeopoliticalEntity",
                        "Physical.OrganizationLocationOrigin"
                    ]
                }
            }
        },
        'query': {
            "query": {
                "match_all": {}
            }
        },
        'loc_types': [
            "Location", "Facility", "GeopoliticalEntity",
            "Physical.OrganizationLocationOrigin"
        ],
        'body': '''{"settings":{"index":{"number_of_shards":3,"number_of_replicas":0}},"mappings":{"kb_clean":{"properties":{"categories":{"type":"string","index":"not_analyzed"},"docIds":{"type":"string","index":"not_analyzed"},"edgeLabel":{"type":"string","index":"not_analyzed"},"edgeTarget":{"type":"string","index":"not_analyzed"},"hypotheses":{"type":"string","index":"not_analyzed"},"kbid":{"type":"string","index":"not_analyzed"},"name":{"type":"string","index":"not_analyzed"},"types":{"type":"string","index":"not_analyzed"},"x":{"type":"long"},"y":{"type":"long"},"geoLocation":{"properties":{"geohash":{"type":"string"},"lon":{"type":"double"},"lat":{"type":"double"}}}}}}}'''
    }

    #geonames config
    gn_cfg = {
        'user': '******',
        'url': 'http://api.geonames.org/',
        'endpoint': 'searchJSON'
    }

    # establish the Elasticsearch connection
    es = Elasticsearch([{'host': es_cfg['host'], 'port': es_cfg['port']}],
                       timeout=es_cfg['timeout'])

    #create new destination index
    create_dest_index(es, es_cfg['dest_index'], es_cfg['body'])

    #execute process
    process_input_index(es, es_cfg, gn_cfg)
Example 16
from __future__ import unicode_literals
from collections import OrderedDict
import re
from fuzzywuzzy import fuzz
from copy import deepcopy
import codecs
import sys
import requests
import json

import utils
from build_icd import build_icd_norm, build_icd_type_norm, build_icd_code_dict

from elasticsearch2 import Elasticsearch

es = Elasticsearch()

reload(sys)
sys.setdefaultencoding('utf-8')

MATCH_COUNT = 10
ACCURACY = 55
'''
Preprocessing
'''


def get_config(type):
    source_dic = {}
    for line in open("config.txt").readlines():
        t, k, v = line.strip().split(" ")
Example 18
            "latitude": 31.231706,
            "longitude": 121.472644
        },
        "rule_id": "EPM7J8KR6723",
        "src_port": 60417,
        "event_content": "",
        "response": "/accept",
        "dst_port": 80,
        "event_level": 0
    }
}

if __name__ == "__main__":
    logging.info("==================== Start ====================")
    dst_es = Elasticsearch(hosts=config["dst_es"],
                           sniff_on_start=True,
                           sniff_on_connection_fail=True,
                           timeout=120)
    bat = []
    _id = 1
    while True:
        item = copy.deepcopy(data)
        item["_id"] = _id
        bat.append(item)
        if (_id % 1000) == 0:
            helpers.bulk(client=dst_es,
                         actions=bat,
                         chunk_size=1000,
                         max_chunk_bytes=209715200)
            bat = []

        if (_id % 30000) == 0:
Example 19
        + str(close_to))
    for i in es_host.cat.indices().split('\n'):
        index = i.split()
        if len(i) > 4 and index[1] == "open":
            for j in close_to:
                if (j[0] in index[2]) and index[2] <= j[1]:
                    indices.append(index[2])

    indices.sort()
    return indices


if __name__ == '__main__':
    logging.info("==================== Start ====================")
    es = Elasticsearch(hosts=config["es_cluster"],
                       sniff_on_start=True,
                       sniff_on_connection_fail=True,
                       timeout=120)
    prefix = config["indices_prefix"]
    retain = config["retention_time"]
    dryrun = config["dry-run"]

    logging.info("let's do the job, carry small and live large...")
    logging.info(
        "IMPORTANT: the indices should follow the naming pattern xxxx_yyyyMMdd"
    )
    indices_to_close = get_indices_to_be_closed(es_host=es,
                                                prefix_list=prefix,
                                                retention_days=retain)
    if dryrun:
        logging.info("<Dry-run> The following indices will be closed: " +
                     str(indices_to_close))