class ElasticsearchUtils(object):
    def __init__(self, host_ports):
        # host_ports format: [{'host': 'xxx', 'port': 9200}, ...]
        self.host_ports = host_ports
        self.es = None

    def init_connect(self):
        self.es = Elasticsearch(self.host_ports)
        return self.es.ping()


    def get_search_result(self, index_name, type_name, query_body):
        if self.es:
            return self.es.search(index=index_name, doc_type=type_name, body=query_body)
        return

    def get_id_result(self, index_name, type_name, doc_id):
        if self.es:
            return self.es.get(index=index_name, doc_type=type_name, id=doc_id)['_source']
        return


    # if doc_id is None, let Elasticsearch generate the id automatically
    def add_index_doc(self, index_name, type_name, doc_id, doc_body):
        if doc_id:
            self.es.index(index=index_name, doc_type=type_name, id=doc_id, body=doc_body)
        else:
            self.es.index(index=index_name, doc_type=type_name, body=doc_body)

    def batch_index(self, index_name, type_name, doc_body_lines):
        self.es.bulk(index=index_name, doc_type=type_name, body=doc_body_lines)
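
# A minimal usage sketch for ElasticsearchUtils, assuming a single local node on
# localhost:9200; the "articles"/"doc" index and type names are illustrative only.
es_utils = ElasticsearchUtils([{'host': 'localhost', 'port': 9200}])
if es_utils.init_connect():
    es_utils.add_index_doc('articles', 'doc', None, {'title': 'hello world'})
    print(es_utils.get_search_result('articles', 'doc',
                                     {'query': {'match': {'title': 'hello'}}}))
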
# Example 2
def load_files():
    es = Elasticsearch([{'port': 9123}])
    for book in os.listdir("./books/"):
        with open("./books/" + book) as new_file:
            data = new_file.read()
            es.bulk(body=data.lower())
def main(draft_data):
    es_index = 'vtest'
    es_type = 'lottery'

    inserts = ""
    for l in draft_data.split("\n"):
        if not l: continue
        inserts += "{\"index\":{\"_index\":\"%s\",\"_type\":\"%s\"}}\n%s\n" % \
            (es_index, es_type, l)
        
    es = Elasticsearch()
    if es.indices.exists(index=es_index):
        es.indices.delete(index=es_index)
        import time
        time.sleep(2)
    # (re)create the index with an explicit mapping so the numeric fields are typed correctly
    es.indices.create(index=es_index,
                      body={
                        "mappings": {
                          es_type: {
                            "properties": {
                              "Day": {"type": "integer"},
                              "Day_of_year": {"type": "integer"},
                              "Draft_No.": {"type": "integer"},
                              "Mo.Number": {"type": "integer"}
                            }
                          }
                        }
                      })

    es.bulk(index=es_index, doc_type=es_type, body=inserts)
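
# For reference, each line of draft_data is assumed to already be a JSON document
# whose fields match the mapping above; the values below are made up. For one such
# line, the loop emits one action/source pair of the bulk (NDJSON) body:
#
#   {"index":{"_index":"vtest","_type":"lottery"}}
#   {"Day": 14, "Day_of_year": 257, "Draft_No.": 1, "Mo.Number": 9}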
# Example 4
def basic_info_insert(info_data):
    #info_data = basic_info_reader()
    es = Elasticsearch([{'host':'219.224.134.214','port':9202}])
    count = 0
    bulk_action = []
    if len(info_data):
        for pre_item in info_data:
            #basic_es_insert("gongshang","basic_info",item)
            item = {}
            item['stock_id'] = pre_item['stock_id']
            item['title'] = pre_item['title']
            item['publish_time'] = pre_item['publish_time']
            item['url'] = pre_item['url']
            item['type'] = pre_item['type']
            #item['content'] = pre_item['content']
            #print item['id']
            action = {"index":{}} #action  "_id": item['url']
            request_body = item # request body
            bulk_action.extend ([action,request_body]) # 在列表中组织形式   

            count += 1
            if count % 1000 == 0:
                try:
                    es.bulk(bulk_action, index="announcement", doc_type="basic_info",timeout=1000)
                    bulk_action = []
                    print count
                except Exception as e:
                    print e

            
        # insert the remaining items (fewer than 1000) as well
        es.bulk(bulk_action, index="announcement", doc_type="basic_info",timeout=1000)
# Example 5
def word():
    para_dict = config.mysql_config.get_config("../file/mysql_online.conf")

    mysql = db.mysql_common.MysqlCommon(para_dict['host'], para_dict['port'], para_dict['user'], para_dict['pwd'],
                                        para_dict['databases'])
    count = mysql.fetch_count("select count(*) from npc_poi limit 2000")

    es = Elasticsearch(hosts='10.94.48.41:8200')

    for index in range(141000, count[0], 1000):
        print index

        names = mysql.fetch_data("select poi_id,poi_name from npc_poi limit " + str(index) + ",1000")
        data = []
        for name in names:
            json_data = {}
            json_data['poi_id'] = name[0]
            json_data['poi_name'] = name[1]
            seg_list = jieba.cut(name[1])
            json_data['poi_new_name'] = " ".join(seg_list)
            data.append(json_data)
        cache = []
        for d in data:
            new_action = {}
            new_action['_index'] = 'word'
            new_action['_type'] = 'word'
            new_action['_id'] = d['poi_id']
            action = {}
            action['index'] = new_action
            cache.append(action)
            cache.append(d)
        es.bulk(body=cache, index='word', doc_type='word')
# Example 6
def reindex(old_index, new_index, s):
    ''' Function to reindex by scan and scroll combined with a bulk insert.
    old_index is the index to take docs from, new_index is the one the docs go to.
    s is the size of each bulk insert - should set this as high as the RAM
    on the machine you run it on allows.  500-1000 seems reasonable for t2.medium '''
    def create_bulk_insert_string(results, index):
        ret_str = ''
        for hit in results:
            ret_str += '{"create":{"_index":"' + index + '","_type":"variant","_id":"' + hit['_id'] + '"}}\n'
            ret_str += json.dumps(hit) + '\n'
        return ret_str

    es = Elasticsearch('localhost:9200')
    s = es.search(index=old_index, body='{"query": {"match_all": {}}}', search_type='scan', scroll='5m', size=s)
    curr_done = 0

    try:
        while True:  # do this loop until failure
            r = es.scroll(s['_scroll_id'], scroll='5m')
            this_l = [res['_source'] for res in r['hits']['hits']]
            this_str = create_bulk_insert_string(this_l, new_index)
            es.bulk(body=this_str, index=new_index, doc_type='variant')
            curr_done += len(this_l)
    except Exception:
        print('{} documents inserted'.format(curr_done))
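
# The same reindex can also be written with the scan/bulk helpers shipped in
# elasticsearch-py (elasticsearch.helpers). A minimal sketch, assuming the same
# localhost:9200 node and 'variant' doc type as the function above:
from elasticsearch import Elasticsearch, helpers

def reindex_with_helpers(old_index, new_index):
    es = Elasticsearch('localhost:9200')
    actions = ({'_op_type': 'create',
                '_index': new_index,
                '_type': 'variant',
                '_id': hit['_id'],
                '_source': hit['_source']}
               for hit in helpers.scan(es, index=old_index,
                                       query={'query': {'match_all': {}}}))
    # helpers.bulk streams the actions in chunks and returns (success_count, errors)
    return helpers.bulk(es, actions)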
def run_bulk_request(input_file, max_lines):
    """
    NOTE: this is still under test, so do not use it yet.
    It uses the bulk API from Elasticsearch, which would be much faster,
    but needs tuning.
    :param input_file: input json file
    :param max_lines: maximum lines to process
    :return: none
    """
    start_time = time.clock()
    create_index()
    es_host = {"host": "localhost", "port": 9200}
    es = Elasticsearch(hosts=[es_host], timeout=30)
    bulk_data = list()
    max_record_per_request = 500000
    current_total_records = 0
    with open(input_file, 'rb') as f_input:
        for line_number, line in enumerate(f_input):
            if max_lines and (line_number >= max_lines):
                break
            current_entry = json.loads(line)
            current_title = current_entry.get(TITLE_KEY)
            if not current_title:
                continue
            post_id = current_entry.get('@Id', -1)
            popularity = current_entry.get('@Score', 0)
            data = {
                'name': current_title,
                'popularity': float(popularity),
            }
            op_data = {
                "index": {
                    "_index": INDEX_NAME,
                    "_id": post_id,
                    "_type": FIELD_NAME,
                }
            }
            bulk_data.append(op_data)
            bulk_data.append(data)
            current_total_records += 1
            if current_total_records >= max_record_per_request:
                end_time = time.clock()
                elapsed = end_time - start_time
                es.bulk(index=INDEX_NAME, body=bulk_data)
                bulk_data = list()
                current_total_records = 0
                print 'INDEXED: %d documents elapsed ' \
                      '%2.2f seconds' % (line_number, elapsed)

        if current_total_records > 0:
            es.bulk(index=INDEX_NAME, body=bulk_data)
        end_time = time.clock()
        elapsed = end_time - start_time
        print 'INDEXED: %d documents elapsed ' \
              '%2.2f seconds' % (line_number, elapsed)
# Example 8
def bulk_data_to_es(host=None, body=None, port=80):

    if body is None:
        return

    es = Elasticsearch([{'host': host, 'port':port}])

    # Bulk action
    es.bulk(body=body)

    return
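
# A hedged usage example for bulk_data_to_es; the host, port, index name, and
# NDJSON body below are illustrative only.
sample_body = (
    '{"index": {"_index": "demo", "_type": "doc"}}\n'
    '{"field1": "value1"}\n'
)
bulk_data_to_es(host='localhost', body=sample_body, port=9200)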
# Example 9
def heatmap():
    file = open("d:\\000002_0")
    data = []
    i = 0
    es = Elasticsearch(hosts='10.94.48.41:8200')
    for line in file:
        i += 1
        cols = line.split("\n")[0].split("\t")
        json_data = {}
        json_data['firm_id'] = long(cols[0])
        json_data['uid'] = long(cols[1])
        json_data['deal_id'] = long(cols[2])
        json_data['cert_id'] = long(cols[3])
        json_data['location'] = str(float(cols[5]) / 10000) + "," + str(float(cols[4]) / 10000)
        s = datetime.datetime(2016, 8, 1, 0, 0, 0)
        result = time.mktime(s.timetuple())
        json_data['day'] = long(result * 1000)
        data.append(json_data)
        if i >= 1000:
            cache = []
            for d in data:
                new_action = {}
                new_action['_index'] = 'heatmap'
                new_action['_type'] = 'heatmap'
                new_action['_id'] = d['cert_id']
                action = {}
                action['index'] = new_action
                cache.append(action)
                cache.append(d)
            result = es.bulk(body=cache, index='heatmap', doc_type='heatmap')
            print result
            data = []
            i = 0
def worker_main(queue, source_index, destination_index, batch_size):
    """
    Worker main
    :param queue: multiprocessing.Queue
    :param source_index: str
    :param destination_index: str
    :param batch_size: int
    :return: None
    """
    pid = os.getpid()

    client = Elasticsearch()
    print "process {0} connected to elastic search at http://localhost:9200".format(pid)

    count = 0
    total = 0
    bulk_items = []

    while True:
        doc = queue.get(True)

        if doc == Stop:
            break

        action, data = create_index_item(doc, destination_index)
        bulk_items.append(action)
        bulk_items.append(data)

        action = create_delete_item(doc, source_index)
        bulk_items.append(action)

        count += 1
        total += 1

        if count >= batch_size:
            client.bulk(body=bulk_items)
            bulk_items = []
            count = 0
            print "Process {0} total done: {1}".format(pid, total)

    if bulk_items:
        client.bulk(body=bulk_items)
        print "Process {0} total done: {1}".format(pid, total)
    time.sleep(3)

    print "process {0} finished".format(pid)
# Example 11
def update_data_index():
    """Update data index"""
    es = Elasticsearch(hosts=[app.config.get('ES_HOST')])
    bulk_data = create_bulk_data()

    # bulk index the data
    print("bulk indexing...")
    res = es.bulk(index=app.config.get('ES_INDEX_NAME'), body=bulk_data, refresh=True)
def update_data_index():
    """Update data index"""
    es = Elasticsearch(hosts=[ES_HOST])
    bulk_data = create_bulk_data()

    # bulk index the data
    print("bulk indexing...")
    res = es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
# Example 13
class ES_query(object):

    def __init__(self):
        self.es = Elasticsearch()

    # load the schema and create an index
    def create_index(self, index_name):
        with open('sportsman_schema.txt','r') as schema:
            sports_schema = json.load(schema)
        novel_index = self.es.indices.create(index = index_name, body = sports_schema)
        return sports_schema

    #bulk load the data
    def bulk_loading(self):
        with open('rock_climbing.json','r') as j:
            json_text = json.load(j)
        bulk_file = []
        action = { "index": { "_index": "i_sportsman", "_type": "stadium" }}

        for i in range(len(json_text)):
            bulk_file.append(action)
            bulk_file.append(json_text.values()[i])
                #return bulk_file

        # call create_index to create the i_sportsman index
        self.create_index("i_sportsman")
        bulk_load = self.es.bulk(body = bulk_file)
        self.es.indices.refresh(index = "i_sportsman")
        return bulk_load

    def q_place(self,string):
        query_body = {
            "query":{
                "multi_match" : {
                    "query": string,
                    "fields": [ "name", "location" ]}},
            "highlight":{
                "fields":{
                    "locations":{}}}
        }

        res = self.es.search(index = "i_sportsman", doc_type = "stadium", body = query_body,size = 10000)
        self.prints(res)

    #print the required results by order
    def prints(self,res):
        hits = res["hits"]["hits"]
        print 'total number of hits: ' + str(len(hits))
        for i in range(min(10,len(hits))):
            print '\n'
            print 'rank: ' + str(i+1)
            stadium = hits[i]["_source"]
            print 'name: ' + stadium['name']
            highlight = hits[i]["highlight"]
            print 'highlights:'
            for (k,v) in highlight.items():
                print '    '+ k + ': ' + str(v)
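
# A hedged usage sketch for ES_query, assuming sportsman_schema.txt and
# rock_climbing.json exist in the working directory (as the methods above expect):
esq = ES_query()
esq.bulk_loading()           # creates i_sportsman and bulk-loads the JSON file
esq.q_place('climbing gym')  # illustrative query string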
# Example 14
def hit_es(threadNum, times):
    time_outs = 0
    # pdb.set_trace()
    # connect to our cluster
    es = Elasticsearch([{'host': host_es, 'port': 9200}])

    upload_data_txt = ""
    upload_data_count = 0

    with open("./finalLogsDataSet") as f:
        artLogs = f.readlines()

    for i in range(hits_per_thread):  # 0 1 2 3
        if i % 100000 == 0:
            print "On the Way! " + str(i)
        item = random.choice(artLogs)

        cmd = {'index': {'_index': index_name, '_type': 'nova9'}}

        upload_data_txt += json.dumps(cmd) + "\n"

        upload_data_txt += item
        # print 'upload_data_txt is ', upload_data_txt
        upload_data_count += 1
        # print 'upload_data_count is', upload_data_count

        if upload_data_count == batch_size:  # batch_size 3
            start_time = time.time()

            while True:
                try:
                    res = es.bulk(
                        index=index_name,
                        body=upload_data_txt,
                        refresh=False,
                        request_timeout=timeout_value)
                except Exception as e:
                    print e
                    time_outs = time_outs + 1
                    continue
                break

            res_txt = "OK" if not res['errors'] else "FAILED"

            #print (res_txt)
            #finish_time = (time.time() - start_time)
            # print finish_time
            if res:
                real_time = res['took']
            #print (real_time)
            upload_data_txt = ""
            upload_data_count = 0
            if real_time:
                times.append(real_time)
        # print result['hits']['total']
    print ("Thread " + str(threadNum) + " finished... \n\n\n")
    print " Total time-outs: " + str(time_outs)
def create_index():
    """create ES client, create index"""

    es = Elasticsearch(hosts=[ES_HOST])

    if es.indices.exists(INDEX_NAME):
        print("deleting '%s' index..." % (INDEX_NAME))
        res = es.indices.delete(index=INDEX_NAME)
        print(" response: '%s'" % (res))

    request_body = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0,
            "analysis": {
                "analyzer": {
                    "default": {
                        "tokenizer": "standard",
                        "filter": ["lowercase", "asciifolding"]
                    },
                    "vietnamese": {
                        "tokenizer": "vi_tokenizer"
                    },
                    "folding": {
                        "tokenizer": "standard",
                        "filter": ["lowercase", "asciifolding"]
                    }
                }
            }
        }
    }
                        # "type": "custom",
    # request_body = {
    #     "settings": {
    #         "number_of_shards": 1,
    #         "number_of_replicas": 0,
    #         "analysis": {
    #             "analyzer": {
    #                 "folding": {
    #                     "tokenizer": "standard",
    #                     "filter": ["lowercase", "asciifolding"]
    #                 }
    #             }
    #         }
    #     }
    # }


    print("creating '%s' index..." % (INDEX_NAME))
    res = es.indices.create(index=INDEX_NAME, body=request_body)
    print(" response: '%s'" % (res))

    bulk_data = create_bulk_data()

    # bulk index the data
    print("bulk indexing...")
    res = es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
# Example 16
def main():
    ''' Main '''
    try:
        args = parse_args()
        elastic = Elasticsearch(args['elasticsearch_url'])
        solr = pysolr.Solr(args['solr_url'], timeout=args['timeout'])
        query = solr.search(q='*:*', rows=0, fl='numFound')
        create_elastic_index(elastic, args['elasticsearch_index'])
        progress_bar = Bar('Indexing', max=query.hits, suffix='%(index)d/%(max)d - %(percent).1f%% - %(eta)ds')
        for i in islice(count(), 0, query.hits, args['pagesize']):
            solr_response = solr.search(q='*:*', start=i, rows=args['pagesize'])
            data = []
            for document in solr_response.docs:
                progress_bar.next()
                data.append('{"index": {"_id":"%s"}}\n %s \n' % (uuid.uuid4(), json.dumps(document)))
            elastic.bulk(index=args['elasticsearch_index'], doc_type=args['elasticsearch_doctype'], body=''.join(data))
        progress_bar.finish()
    except KeyboardInterrupt:
        print('Interrupted')
# Example 17
def main():
  if len(sys.argv) != 6:
    print 'usage: ./import.py <raw file> <yogi name> <esHost> <esIndex> <esType>'
    sys.exit(1)

  filename = sys.argv[1]
  yogi = sys.argv[2]
  esHost = sys.argv[3]
  esIndex = sys.argv[4]
  esType = sys.argv[5]
  
  tz = timezone('Asia/Hong_Kong')
 
  attnList = readFileAsList(filename, yogi, tz)
  body = composeESBulkIndexBody(attnList, esIndex, esType)
  
  # connect to ES
  es = Elasticsearch([esHost])
  es.bulk(body=body, index=esIndex, doc_type=esType)
def BuildInitialIndex():
    response = urllib2.urlopen(FILE_URL)
    csv_file_object = csv.reader(response)

    header = csv_file_object.next()
    header = [item.lower() for item in header]

    bulk_data = [] 

    for row in csv_file_object:
        data_dict = {}
        for i in range(len(row)):
            data_dict[header[i]] = row[i]
        op_dict = {
            "index": {
                "_index": INDEX_NAME, 
                "_type": TYPE_NAME, 
                "_id": data_dict[ID_FIELD]
            }
        }
        bulk_data.append(op_dict)
        bulk_data.append(data_dict)

    # create ES client, create index
    #es = Elasticsearch(hosts = [ES_HOST])
    es = Elasticsearch([REMOTE_URL])

    if es.indices.exists(INDEX_NAME):
        print("deleting '%s' index..." % (INDEX_NAME))
        res = es.indices.delete(index = INDEX_NAME)
        print(" response: '%s'" % (res))

    # index settings: three shards and one replica
    request_body = {
        "settings" : {
            "number_of_shards": 3,
            "number_of_replicas": 1
        }
    }

    print("creating '%s' index..." % (INDEX_NAME))
    res = es.indices.create(index = INDEX_NAME, body = request_body)
    print(" response: '%s'" % (res))

    # bulk index the data
    print("bulk indexing...")
    res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = True)

    # sanity check
    res = es.search(index = INDEX_NAME, size=4, body={"query": {"match_all": {}}})
    print(" response: '%s'" % (res))

    print("results:")
    for hit in res['hits']['hits']:
        print(hit["_source"])
def certificate_use(dir, month, day):
    for file in os.listdir(dir):
        path = os.path.join(dir, file)
        file = open(path)
        data = []
        i = 0
        es = Elasticsearch(hosts="10.94.48.41:8200")
        for line in file:
            i += 1
            cols = line.split("\n")[0].split("\t")
            json_data = {}
            try:
                json_data["deal_id"] = long(cols[0], 0)
                json_data["current_price"] = long(cols[1])
                json_data["simple_price"] = long(cols[1]) / 10000 * 10000
                json_data["single_price"] = long(cols[2])
                json_data["use_time"] = str(cols[3])
                json_data["firm_id"] = long(cols[4])
                json_data["location"] = str(float(cols[6]) / 10000) + "," + str(float(cols[5]) / 10000)
                json_data["area_id"] = long(cols[7])
                json_data["uid"] = long(cols[8])
                json_data["user_id"] = long(cols[9])
                json_data["mobile"] = long(cols[10])
                json_data["category_id_1"] = long(cols[11])
                json_data["category_name_1"] = str(cols[12]).decode("utf-8")
                json_data["category_id_2"] = long(cols[13])
                json_data["category_name_2"] = str(cols[14]).decode("utf-8")
                json_data["certificate_id"] = long(cols[15])
                json_data["cert_money"] = long(cols[16])
                s = dt.datetime(2016, month, day, 0, 0, 0)
                result = time.mktime(s.timetuple())
                json_data["day"] = long(result * 1000)
            except Exception as e:
                print e
                continue
            data.append(json_data)
            if i >= 1000:
                cache = []
                for d in data:
                    new_action = {}
                    new_action["_index"] = "cert"
                    new_action["_type"] = "cert_info"
                    new_action["_id"] = d["certificate_id"]
                    action = {}
                    action["index"] = new_action
                    cache.append(action)
                    cache.append(d)
                result = es.bulk(body=cache, index="cert", doc_type="cert_info")
                # print result
                data = []
                i = 0
        print path
# Example 20
def api_elastic_sample_build_an_index(request):
    response = urllib.urlopen(FILE_URL)
    csv_file_object = csv.reader(response)
    header = csv_file_object.next()
    header = [item.lower() for item in header]

    es = Elasticsearch(hosts=[ES_HOST])

    if es.indices.exists(INDEX_NAME):
        es.indices.delete(index=INDEX_NAME)

    request_body = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0
        }
    }
    es.indices.create(index=INDEX_NAME, body=request_body)

    bulk_data = []
    for row in csv_file_object:
        data_dict = {}

        for i in range(len(row)):
            data_dict[header[i]] = row[i]

        op_dict = {
            "index": {
                "_index": INDEX_NAME,
                "_type": DOC_TYPE_NAME,
                "_id": data_dict[ID_FIELD]
            }
        }

        bulk_data.append(op_dict)
        bulk_data.append(data_dict)
    es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)

    return JsonResponse({'status': 'Build a sample an Elastic index done.'})
# Example 21
 def ingestDataIntoElasticSearchStore(self , 
                                      inputFilePath='' , 
                                      delimiter='' , 
                                      index_model_path='' ,  
                                      document_model_path='' ,
                                      request_model_path='' , 
                                      refresh=True):
     '''
     This method is charged with indexing the documents created above via Elasticsearch's Bulk API
     
     @param inputFilePath: The path to the CSV file
     @type inputFilePath: str
     @param delimiter: The delimiter used in the CSV file
     @type delimiter: str
     @param index_model_path: The path to the 'index.model.json' file.
     @type index_model_path: str
     @param document_model_path: The path to the 'document.model.json' file.
     @type document_model_path: str
     @param request_model_path: The path to the 'search.engine.model.json' file
     @type request_model_path: str
     @param refresh: A flag to determine if the index should be refreshed.
     @type refresh: bool
     '''
     es = None
     request_model = {}
     try:
         es = Elasticsearch(hosts = self._searchUrl)
         request_model = self.__retrieve_models(request_model_path)
         if self._dropIndexFlag is True:
             self.logger.info('Dropping Index: \'{0}\'.'.format(self._indexName))
             es.indices.delete(index=self._indexName)
             self.logger.info('Creating new index...')
             es.indices.create(index=self._indexName , body=request_model)
             bulk_data = self.__create_documents(inputFilePath, delimiter, index_model_path , document_model_path)
             es.bulk(index=self._indexName , body=bulk_data , refresh=True)
     except Exception as error:
         self.logger.error('SearchEngineIndexer.ingestDataIntoElasticSearchStore: Error occurred - {0}'.format(str(error)))
# Example 22
def bulk_send(FILE_URL):

	ES_HOST = {"host" : "localhost", "port" : 9200}

	INDEX_NAME = 'firebolt'
	TYPE_NAME = 'endpoint'


	with open(FILE_URL) as f:
		csv_file_object = csv.reader(f, delimiter=';')

		header = csv_file_object.next()
		header = [item.lower() for item in header]

		bulk_data = [] 

		for row in csv_file_object:
			data_dict = {}
			for i in range(len(row)):
				data_dict[header[i]] = row[i]
			op_dict = {
				"index": {
					"_index": INDEX_NAME, 
					"_type": TYPE_NAME
				}
			}
			bulk_data.append(op_dict)
			bulk_data.append(data_dict)



	# create ES client, create index
	es = Elasticsearch(hosts = [ES_HOST])


	# since we are running locally, use one shard and no replicas
	request_body = {
		"settings" : {
			"number_of_shards": 1,
			"number_of_replicas": 0
		}
	}


	# create the index with the settings above if it does not already exist
	if not es.indices.exists(INDEX_NAME):
		es.indices.create(index=INDEX_NAME, body=request_body)

	# bulk index the data
	print("bulk indexing...")
	res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = True)

	return True
# Example 23
def load_crimes(filename, es=None, index_name=index.DEFAULT_INDEX_NAME,
                type_name=index.DEFAULT_TYPE):
    if not es:
        es = Elasticsearch()

    with open(filename, 'r') as f:
        data = f.read()

    body = []
    crimes = json.loads(data)

    for c in crimes['features']:
        body.append({"index": {"_index": index_name, "_type": type_name}})
        body.append(c)

    return es.bulk(body)
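
# A hedged usage example for load_crimes; the GeoJSON file name is illustrative,
# and the file is expected to contain a top-level "features" list:
response = load_crimes('crimes.geojson')
print(response.get('errors'))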
# Example 24
def bulk_add_indexes(index, itype, data, host='127.0.0.1', port=9200):
    # https://qbox.io/blog/building-an-elasticsearch-index-with-python
    # https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html
    # curl -s -XPOST localhost:9200/_bulk --data-binary "@requests"; echo

    '''
    { "index" : { "_index" : "test", "_type" : "type1", "_id" : "1" } }
    { "field1" : "value1" }
    { "delete" : { "_index" : "test", "_type" : "type1", "_id" : "2" } }
    { "create" : { "_index" : "test", "_type" : "type1", "_id" : "3" } }
    { "field1" : "value3" }
    { "update" : {"_id" : "1", "_type" : "type1", "_index" : "index1"} }
    { "doc" : {"field2" : "value2"} }
    '''

    bulk_data = []

    for k,v in data.iteritems():
        data_dict = v
        op_dict = {
            "index": {
                "_index": index,
                "_type": itype,
                "_id": k
            }
        }
        bulk_data.append(op_dict)
        bulk_data.append(data_dict)

    es = Elasticsearch()

    success = False
    while not success:
        try:
            res = es.bulk(index=index, body=bulk_data, refresh=True)
            success = True
        except Exception as e:
            print(e)
            time.sleep(60)

    print data.keys()
    print res['took'], res['errors']
    if res['errors']:
        import epdb; epdb.st()
    assert not res['errors']
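
# A hedged usage example for bulk_add_indexes; the index name, type, and data
# dict (keyed by document id) are illustrative only.
bulk_add_indexes('test-index', 'type1', {'1': {'field1': 'value1'},
                                         '2': {'field1': 'value2'}})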
# Example 25
class DatastoreConnection:
    def __init__(self):
        self._es = Elasticsearch()
        self._patients = PatientManager(self)
        self._vocabularies = VocabularyManager(self)

    def index_patients(self, filename):
        return self._patients.index(filename)

    def index_hpo(self, filename):
        return self._vocabularies.index(index='hpo', filename=filename, Parser=OBOParser)

    def index_genes(self, filename):
        return self._vocabularies.index(index='genes', filename=filename, Parser=GeneParser)

    def get_vocabulary_term(self, id, index='_all'):
        return self._vocabularies.get_term(id, index=index)

    def find_similar_patients(self, patient, n=5):
        """Return the n most similar patients to the given query api.Patient"""
        return self._patients.find_similar_patients(patient=patient, n=n)

    def search(self, *args, **kwargs):
        """Expose ElasticSearch method"""
        return self._es.search(*args, **kwargs)

    def bulk(self, *args, **kwargs):
        """Expose ElasticSearch method"""
        return self._es.bulk(*args, **kwargs)

    def index(self, *args, **kwargs):
        """Expose ElasticSearch method"""
        return self._es.index(*args, **kwargs)

    def count(self, *args, **kwargs):
        """Expose ElasticSearch method"""
        return self._es.count(*args, **kwargs)

    @property
    def indices(self):
        """Expose ElasticSearch property"""
        return self._es.indices
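
# A hedged usage sketch for DatastoreConnection; the file names are illustrative
# and assume data in the formats the underlying managers expect.
connection = DatastoreConnection()
connection.index_hpo('hp.obo')
connection.index_patients('patients.json')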
# Example 26
def setRecord( context, hostname, graph_name, historical_data ):
    """
    Store the data in Elasticsearch.
    @param context         configuration settings
    @param hostname        host name
    @param graph_name      graph name
    @param historical_data historical data
    @return 0 on success, non-zero on failure
    """
    es = Elasticsearch( context[ "es_server" ] )
    step = -1

    info = historical_data[ "meta" ]
    index_name  = "%s" % ( hostname.lower() )
    index_lists = []

    meta = {
        "index": {
                "_index": index_name,
                "_type":  context[ "doctype" ]
            }
        }
    meta_json = json.dumps( meta )

    for record in historical_data[ "data" ]:
        step += 1
        insert_data = {
            "hostname":   hostname,
            "graph_name": graph_name,
            "@timestamp": datetime.datetime.fromtimestamp( info[ "start" ] + info[ "step" ] * step ).isoformat(),
            "value":      record[0]
            }
        insert_json = json.dumps( insert_data )
        logging.debug( insert_json )
        index_lists.append( meta_json )
        index_lists.append( insert_json )

    result = es.bulk( body = "\n".join( index_lists ) )
    logging.debug( result )
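
# A hedged illustration of the inputs setRecord expects, inferred from the reads
# above; every value here is made up.
sample_context = {"es_server": "localhost:9200", "doctype": "rrd"}
sample_history = {
    "meta": {"start": 1470000000, "step": 300},  # epoch seconds and step width
    "data": [[0.5], [0.7], [0.2]],               # one value per step
}
setRecord(sample_context, "web01", "load_average", sample_history)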
# Example 27
        'age': 20
    },
    {
        'update': {
            '_index': 'indexName',
            '_type': 'typeName',
            '_id': 'idValue'
        }
    },
    {
        'doc': {
            'age': '100'
        }
    },
]
es.bulk(index=indexName, doc_type=typeName, body=doc)

#%% statistics
count = es.count(index=indexName)["count"]  # total number of documents
es.count(index='logstash-2015.08.21', q='http_status_code:500')

# documents per page
pageLine = 1000
# number of pages
page = count // pageLine if (count % pageLine) == 0 else count // pageLine + 1

#%%
es = Elasticsearch(['http://*****:*****@10.19.133.250:9200/'])
query = {"query": {"match_all": {}}}
#index = "wiseweb.jxg.annual_initiator_contributive_info"
index = 'wiseweb.jxg.company_base_business_merge_new'
# Example 28
        doc['alias'] = val['alias']
    if val.has_key('hotness'):
        doc['hotness'] = val['hotness']
    if val.has_key('ratings'):
        doc['ratings'] = val['ratings']
    if val.has_key('location'):
        doc['location'] = val['location']

    # bulk_data.append({ "desc": val['desc'],"alias":val['alias'],"hotness":val['hotness'],"rating":val['rating'],
    # "location":val['location']})

    bulk_data.append(doc)
    # once 500 documents have accumulated, write them to Elasticsearch in one batch
    if i % 500 == 0:
        print("start bulk: %d" % i)
        res = es_client.bulk(index=index_name, body=bulk_data, refresh=True)
        print(res)
        bulk_data = []
        # break

if bulk_data:
    res = es_client.bulk(index=index_name, body=bulk_data, refresh=True)
    print(res)


# print bulk_data
# print("bulk indexing...")



# Example 29
def exec_query(stmt):

    my_lexer = lex(module=lexer, optimize=True, debug=True)

    my_parser = yacc(debug=True, module=parser)

    val = my_parser.parse(lexer=my_lexer.clone(), debug=False, input=stmt)

    es = Elasticsearch([{'host': "10.68.23.84", "port": 9200}])
    if val.get_type() == TK.TOK_QUERY:

        query = Query(val)

        print(query.dsl())

        print(query._index, query._type)

        res = es.search(index=query._index,
                        doc_type=query._type,
                        body=query.dsl(),
                        request_timeout=100)

        stmt_res = response_hits(res)

        print(json.dumps(stmt_res, indent=4))

    elif val.get_type() == TK.TOK_CREATE_TABLE:

        stmt = Create(val)

        res = es.indices.create(index=stmt._index,
                                body=stmt._options,
                                request_timeout=100,
                                ignore=400)

        res = es.indices.put_mapping(index=stmt._index,
                                     doc_type=stmt._type,
                                     body=stmt.dsl(),
                                     request_timeout=100)

        print(json.dumps(res, indent=4))

    elif val.get_type() == TK.TOK_INSERT_INTO:

        #         val.debug()

        stmt = Insert(val)

        parms = stmt.metas

        res = es.index(index=stmt._index,
                       doc_type=stmt._type,
                       body=stmt.dsl(),
                       **parms)

        print(json.dumps(res, indent=4))

    elif val.get_type() == TK.TOK_BULK_INTO:

        #         val.debug()

        stmt = Bulk(val)

        res = es.bulk(index=stmt._index, doc_type=stmt._type, body=stmt.dsl())

        print(json.dumps(res, indent=4))

    elif val.get_type() == TK.TOK_UPDATE:

        val.debug()

        stmt = Update(val)

        print(json.dumps(stmt.dsl(), indent=4))

        res = es.update(index=stmt._index,
                        doc_type=stmt._type,
                        body=stmt.dsl(),
                        **stmt.conditions)

        print(json.dumps(res, indent=4))

    elif val.get_type() == TK.TOK_UPSERT_INTO:

        val.debug()

        stmt = Upsert(val)

        print(json.dumps(stmt.dsl(), indent=4))

        res = es.update(index=stmt._index,
                        doc_type=stmt._type,
                        body=stmt.dsl(),
                        **stmt.conditions)

        print(json.dumps(res, indent=4))

    elif val.get_type() == TK.TOK_DELETE:

        val.debug()

        stmt = Delete(val)

        res = es.delete(index=stmt._index,
                        doc_type=stmt._type,
                        **stmt.conditions,
                        ignore=404)

        print(json.dumps(res, indent=4))

    elif val.get_type() == TK.TOK_EXPLAIN:
        stmt = Explain(val)
        print(stmt.curl_str)
        print(json.dumps(stmt.dsl(), indent=4))

    elif val.get_type() == TK.TOK_DESC_TABLE:

        stmt = Describe(val)

        res = es.indices.get_mapping(index=stmt._index, doc_type=stmt._type)

        print(res)

    else:
        res = es.cat.indices(index='qs_test*', v=True)
        val.debug()
        print(res)
# Example 30
class ElasticsearchProxy(BaseProxy):
    """
    ElasticSearch connection handler
    """
    def __init__(self,
                 *,
                 host: str = None,
                 user: str = '',
                 password: str = '',
                 client: Elasticsearch = None,
                 page_size: int = 10) -> None:
        """
        Constructs Elasticsearch client for interactions with the cluster.
        Allows caller to pass a fully constructed Elasticsearch client, {client},
        or constructs one from the parameters provided.

        :param host: Elasticsearch host we should connect to
        :param user: user name to use for authentication
        :param password: user password to use for authentication
        :param client: Elasticsearch client to use, if provided
        :param page_size: Number of search results to return per request
        """
        if client:
            self.elasticsearch = client
        else:
            http_auth = (user, password) if user else None
            self.elasticsearch = Elasticsearch(host, http_auth=http_auth)

        self.page_size = page_size

    def _get_search_result(self,
                           page_index: int,
                           client: Search,
                           model: Any,
                           search_result_model: Any = SearchResult) -> Any:
        """
        Common helper function to get result.

        :param page_index:
        :param client:
        :param model: The model to import result(table, user etc)
        :return:
        """
        if model is None:
            raise Exception('ES Doc model must be provided!')

        results = []
        # Use {page_index} to calculate index of results to fetch from
        if page_index != -1:
            start_from = page_index * self.page_size
            end_at = start_from + self.page_size
            client = client[start_from:end_at]
        else:
            # if page index is -1, return everything
            client = client[0:client.count()]

        response = client.execute()

        for hit in response:
            try:
                es_metadata = hit.__dict__.get('meta', {})
                """
                ES hit example:
                {
                    '_d_': {
                        'name': 'name',
                        'database': 'database',
                        'schema': 'schema',
                        'key': 'database://cluster.schema/name',
                        'cluster': 'cluster',
                        'column_descriptions': ['description1', 'description2'],
                        'column_names': ['colname1', 'colname2'],
                        'description': None,
                        'display_name': 'display name',
                        'last_updated_timestamp': 12345678,
                        'programmatic_descriptions': [],
                        'schema_description': None,
                        'tags': ['tag1', 'tag2'],
                        'badges': [],
                        'total_usage': 0
                    },
                    'meta': {
                        'index': 'table index',
                        'id': 'table id',
                        'type': 'type'
                    }
                }
                """
                es_payload = hit.__dict__.get('_d_', {})
                if not es_payload:
                    raise Exception('The ES doc does not contain the required field')
                result = {}
                for attr, val in es_payload.items():
                    if attr in model.get_attrs():
                        result[attr] = self._get_instance(attr=attr, val=val)
                result['id'] = self._get_instance(attr='id',
                                                  val=es_metadata['id'])

                results.append(model(**result))
            except Exception:
                LOGGING.exception("The record doesn't contain the specified field.")

        return search_result_model(total_results=response.hits.total,
                                   results=results)

    def _get_instance(self, attr: str, val: Any) -> Any:
        if attr in TAG_MAPPING:
            # maps a given badge or tag to a tag class
            return [
                TAG_MAPPING[attr](tag_name=property_val)
                for property_val in val
            ]  # type: ignore
        else:
            return val

    def _search_helper(self,
                       page_index: int,
                       client: Search,
                       query_name: dict,
                       model: Any,
                       search_result_model: Any = SearchResult) -> Any:
        """
        Constructs Elasticsearch Query DSL to:
          1. Use function score to customize scoring of search result. It currently uses "total_usage" field to score.
          `Link https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-function-score-query.html`_
          2. Uses multi match query to search term in multiple fields.
          `Link https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html`_

        :param page_index:
        :param client:
        :param query_name: name of query to query the ES
        :return:
        """

        if query_name:
            q = query.Q(query_name)
            client = client.query(q)

        return self._get_search_result(page_index=page_index,
                                       client=client,
                                       model=model,
                                       search_result_model=search_result_model)

    @timer_with_counter
    def fetch_table_search_results(self,
                                   *,
                                   query_term: str,
                                   page_index: int = 0,
                                   index: str = '') -> SearchTableResult:
        """
        Query Elasticsearch and return results as list of Table objects

        :param query_term: search query term
        :param page_index: index of search page user is currently on
        :param index: current index for search. Provide different index for different resource.
        :return: SearchResult Object
        """
        current_index = index if index else \
            current_app.config.get(config.ELASTICSEARCH_INDEX_KEY, DEFAULT_ES_INDEX)
        if not query_term:
            # return empty result for blank query term
            return SearchTableResult(total_results=0, results=[])

        s = Search(using=self.elasticsearch, index=current_index)
        query_name = {
            "function_score": {
                "query": {
                    "multi_match": {
                        "query":
                        query_term,
                        "fields": [
                            "display_name^1000", "name.raw^75", "name^5",
                            "schema^3", "description^3", "column_names^2",
                            "column_descriptions", "tags", "badges",
                            "programmatic_descriptions"
                        ],
                    }
                },
                "field_value_factor": {
                    "field": "total_usage",
                    "modifier": "log2p"
                }
            }
        }

        return self._search_helper(page_index=page_index,
                                   client=s,
                                   query_name=query_name,
                                   model=Table,
                                   search_result_model=SearchTableResult)

    @staticmethod
    def get_model_by_index(index: str) -> Any:
        if index == TABLE_INDEX:
            return Table
        elif index == USER_INDEX:
            return User
        elif index == DASHBOARD_INDEX:
            return Dashboard

        raise Exception('Unable to map given index to a valid model')

    @staticmethod
    def parse_filters(filter_list: Dict, index: str) -> str:
        query_list = []  # type: List[str]
        if index == TABLE_INDEX:
            mapping = TABLE_MAPPING
        elif index == DASHBOARD_INDEX:
            mapping = DASHBOARD_MAPPING
        else:
            raise Exception(
                f'index {index} does not exist or does not support search filters')
        for category, item_list in filter_list.items():
            mapped_category = mapping.get(category)
            if mapped_category is None:
                LOGGING.warn(
                    f'Unsupported filter category: {category} passed in list of filters'
                )
            elif item_list == '' or item_list == ['']:
                LOGGING.warn(
                    f'The filter value cannot be empty. In this case the filter {category} is ignored'
                )
            else:
                query_list.append(mapped_category + ':' + '(' +
                                  ' OR '.join(item_list) + ')')

        if len(query_list) == 0:
            return ''

        return ' AND '.join(query_list)

    @staticmethod
    def validate_filter_values(search_request: dict) -> Any:
        if 'filters' in search_request:
            filter_values_list = search_request['filters'].values()
            # Ensure all values are arrays
            filter_values_list = list(
                map(lambda x: x
                    if type(x) == list else [x], filter_values_list))
            # Flatten the array of arrays
            filter_values_list = list(
                itertools.chain.from_iterable(filter_values_list))
            # Check if / or : exist in any of the values
            if any(("/" in str(item) or ":" in str(item))
                   for item in (filter_values_list)):
                return False
            return True

    @staticmethod
    def parse_query_term(query_term: str, index: str) -> str:
        # TODO: Might be some issue with using wildcard & underscore
        # https://discuss.elastic.co/t/wildcard-search-with-underscore-is-giving-no-result/114010/8
        if index == TABLE_INDEX:
            query_term = f'(name:(*{query_term}*) OR name:({query_term}) ' \
                         f'OR schema:(*{query_term}*) OR schema:({query_term}) ' \
                         f'OR description:(*{query_term}*) OR description:({query_term}) ' \
                         f'OR column_names:(*{query_term}*) OR column_names:({query_term}) ' \
                         f'OR column_descriptions:(*{query_term}*) OR column_descriptions:({query_term}))'
        elif index == DASHBOARD_INDEX:
            query_term = f'(name:(*{query_term}*) OR name:({query_term}) ' \
                         f'OR group_name:(*{query_term}*) OR group_name:({query_term}) ' \
                         f'OR query_names:(*{query_term}*) OR query_names:({query_term}) ' \
                         f'OR description:(*{query_term}*) OR description:({query_term}) ' \
                         f'OR tags:(*{query_term}*) OR tags:({query_term}) ' \
                         f'OR badges:(*{query_term}*) OR badges:({query_term}) ' \
                         f'OR product:(*{query_term}*) OR product:({query_term}))'
        else:
            raise Exception(
                f'index {index} does not exist or does not support search filters')
        return query_term

    @classmethod
    def convert_query_json_to_query_dsl(self, *, search_request: dict,
                                        query_term: str, index: str) -> str:
        """
        Convert the generic query json to query DSL
        e.g
        ```
        {
            'type': 'AND'
            'filters': {
                'database': ['hive', 'bigquery'],
                'schema': ['test-schema1', 'test-schema2'],
                'table': ['*amundsen*'],
                'column': ['*ds*']
                'tag': ['test-tag']
            }
        }

        This generic JSON will convert into DSL depending on the backend engines.

        E.g in Elasticsearch, it will become
        'database':('hive' OR 'bigquery') AND
        'schema':('test-schema1' OR 'test-schema2') AND
        'table':('*amundsen*') AND
        'column':('*ds*') AND
        'tag':('test-tag')
        ```

        :param search_request:
        :param query_term:
        :param index: table_index, dashboard_index
        :return: The search engine query DSL
        """
        filter_list = search_request.get('filters')
        add_query = ''
        query_dsl = ''
        if filter_list:
            valid_filters = self.validate_filter_values(search_request)
            if valid_filters is False:
                raise Exception(
                    'The search filters contain invalid characters and thus cannot be handled by ES'
                )
            query_dsl = self.parse_filters(filter_list, index)

        if query_term:
            add_query = self.parse_query_term(query_term, index)

        if not query_dsl and not add_query:
            raise Exception('Unable to convert parameters to valid query dsl')

        result = ''
        if query_dsl and add_query:
            result = query_dsl + ' AND ' + add_query
        elif add_query and not query_dsl:
            result = add_query
        elif query_dsl and not add_query:
            result = query_dsl

        return result

    @timer_with_counter
    def fetch_search_results_with_filter(
            self,
            *,
            query_term: str,
            search_request: dict,
            page_index: int = 0,
            index: str = ''
    ) -> Union[SearchDashboardResult, SearchTableResult]:
        """
        Query Elasticsearch and return results as list of Table objects
        :param search_request: A json representation of search request
        :param page_index: index of search page user is currently on
        :param index: current index for search. Provide different index for different resource.
        :return: SearchResult Object
        """
        current_index = index if index else \
            current_app.config.get(config.ELASTICSEARCH_INDEX_KEY, DEFAULT_ES_INDEX)  # type: str
        if current_index == DASHBOARD_INDEX:
            search_model = SearchDashboardResult  # type: Any
        elif current_index == TABLE_INDEX:
            search_model = SearchTableResult
        else:
            raise RuntimeError(
                f'the {index} index does not have search filter support')
        if not search_request:
            # return empty result for blank query term
            return search_model(total_results=0, results=[])

        try:
            query_string = self.convert_query_json_to_query_dsl(
                search_request=search_request,
                query_term=query_term,
                index=current_index)  # type: str
        except Exception as e:
            LOGGING.exception(e)
            # return nothing if any exception is thrown under the hood
            return search_model(total_results=0, results=[])

        s = Search(using=self.elasticsearch, index=current_index)

        query_name = {
            "function_score": {
                "query": {
                    "query_string": {
                        "query": query_string
                    }
                },
                "field_value_factor": {
                    "field": "total_usage",
                    "modifier": "log2p"
                }
            }
        }

        model = self.get_model_by_index(current_index)
        return self._search_helper(page_index=page_index,
                                   client=s,
                                   query_name=query_name,
                                   model=model,
                                   search_result_model=search_model)

    @timer_with_counter
    def fetch_user_search_results(self,
                                  *,
                                  query_term: str,
                                  page_index: int = 0,
                                  index: str = '') -> SearchUserResult:
        if not index:
            raise Exception('Index cannot be empty for user search')
        if not query_term:
            # return empty result for blank query term
            return SearchUserResult(total_results=0, results=[])

        s = Search(using=self.elasticsearch, index=index)

        # Don't use any weight(total_follow, total_own, total_use)
        query_name = {
            "function_score": {
                "query": {
                    "multi_match": {
                        "query":
                        query_term,
                        "fields": [
                            "full_name.raw^30", "full_name^5",
                            "first_name.raw^5", "last_name.raw^5",
                            "first_name^3", "last_name^3", "email^3"
                        ],
                        "operator":
                        "and"
                    }
                }
            }
        }

        return self._search_helper(page_index=page_index,
                                   client=s,
                                   query_name=query_name,
                                   model=User,
                                   search_result_model=SearchUserResult)

    @timer_with_counter
    def fetch_dashboard_search_results(self,
                                       *,
                                       query_term: str,
                                       page_index: int = 0,
                                       index: str = ''
                                       ) -> SearchDashboardResult:
        """
        Fetch dashboard search result with fuzzy search

        :param query_term:
        :param page_index:
        :param index:
        :return:
        """
        current_index = index if index else \
            current_app.config.get(config.ELASTICSEARCH_INDEX_KEY, DEFAULT_ES_INDEX)

        if not query_term:
            # return empty result for blank query term
            return SearchDashboardResult(total_results=0, results=[])
        s = Search(using=self.elasticsearch, index=current_index)

        query_name = {
            "function_score": {
                "query": {
                    "multi_match": {
                        "query":
                        query_term,
                        "fields": [
                            "name.raw^75", "name^7", "group_name.raw^15",
                            "group_name^7", "description^3", "query_names^3"
                        ]
                    }
                },
                "field_value_factor": {
                    "field": "total_usage",
                    "modifier": "log2p"
                }
            }
        }

        return self._search_helper(page_index=page_index,
                                   client=s,
                                   query_name=query_name,
                                   model=Dashboard,
                                   search_result_model=SearchDashboardResult)

    # The following methods are related to document API that needs to update
    @timer_with_counter
    def create_document(self, *, data: List[Table], index: str) -> str:
        """
        Creates new index in elasticsearch, then routes traffic to the new index
        instead of the old one
        :return: str
        """

        if not index:
            raise Exception('Index cannot be empty for creating document')
        if not data:
            LOGGING.warn('Received no data to upload to Elasticsearch')
            return ''

        return self._create_document_helper(data=data, index=index)

    @timer_with_counter
    def update_document(self, *, data: List[Table], index: str) -> str:
        """
        Updates the existing index in elasticsearch
        :return: str
        """
        if not index:
            raise Exception('Index cannot be empty for updating document')
        if not data:
            LOGGING.warn('Received no data to upload to Elasticsearch')
            return ''

        return self._update_document_helper(data=data, index=index)

    @timer_with_counter
    def delete_document(self, *, data: List[str], index: str) -> str:
        if not index:
            raise Exception('Index cannot be empty for deleting document')
        if not data:
            LOGGING.warn('Received no data to upload to Elasticsearch')
            return ''

        return self._delete_document_helper(data=data, index=index)

    def _create_document_helper(self, data: List[Table], index: str) -> str:
        # fetch indices that use our chosen alias (should only ever return one in a list)
        indices = self._fetch_old_index(index)

        for i in indices:
            # build a list of elasticsearch actions for bulk upload
            actions = self._build_index_actions(data=data, index_key=i)

            # bulk create or update data
            self._bulk_helper(actions)

        return index

    def _update_document_helper(self, data: List[Table], index: str) -> str:
        # fetch indices that use our chosen alias (should only ever return one in a list)
        indices = self._fetch_old_index(index)

        for i in indices:
            # build a list of elasticsearch actions for bulk update
            actions = self._build_update_actions(data=data, index_key=i)

            # bulk update existing documents in index
            self._bulk_helper(actions)

        return index

    def _delete_document_helper(self, data: List[str], index: str) -> str:
        # fetch indices that use our chosen alias
        indices = self._fetch_old_index(index)

        # set the document type
        type = User.get_type() if index == USER_INDEX else Table.get_type()

        for i in indices:
            # build a list of elasticsearch actions for bulk deletion
            actions = self._build_delete_actions(data=data,
                                                 index_key=i,
                                                 type=type)

            # bulk delete documents in index
            self._bulk_helper(actions)

        return index

    def _build_index_actions(self, data: List[Table],
                             index_key: str) -> List[Dict[str, Any]]:
        actions = list()
        for item in data:
            index_action = {
                'index': {
                    '_index': index_key,
                    '_type': item.get_type(),
                    '_id': item.get_id()
                }
            }
            actions.append(index_action)
            actions.append(item.__dict__)
        return actions

    def _build_update_actions(self, data: List[Table],
                              index_key: str) -> List[Dict[str, Any]]:
        actions = list()

        for item in data:
            actions.append({
                'update': {
                    '_index': index_key,
                    '_type': item.get_type(),
                    '_id': item.get_id()
                }
            })
            actions.append({'doc': item.get_attrs_dict()})
        return actions

    def _build_delete_actions(self, data: List[str], index_key: str,
                              type: str) -> List[Dict[str, Any]]:
        return [{
            'delete': {
                '_index': index_key,
                '_id': id,
                '_type': type
            }
        } for id in data]

    def _bulk_helper(self, actions: List[Dict[str, Any]]) -> None:
        result = self.elasticsearch.bulk(actions)

        if result['errors']:
            # ES's error messages are nested within elasticsearch objects and can
            # fail silently if you aren't careful
            LOGGING.error('Error during Elasticsearch bulk actions')
            LOGGING.debug(result['items'])
            return

    def _fetch_old_index(self, alias: str) -> List[str]:
        """
        Retrieve all indices that are currently tied to alias
        (Can most often expect only one index to be returned in this list)
        :return: list of elasticsearch indices
        """
        try:
            indices = self.elasticsearch.indices.get_alias(alias).keys()
            return indices
        except NotFoundError:
            LOGGING.warn('Received index not found error from Elasticsearch',
                         exc_info=True)

            # create a new index if there isn't already one that is usable
            new_index = self._create_index_helper(alias=alias)
            return [new_index]

    def _create_index_helper(self, alias: str) -> str:
        def _get_mapping(alias: str) -> str:
            if alias == USER_INDEX:
                return USER_INDEX_MAP
            elif alias == TABLE_INDEX:
                return TABLE_INDEX_MAP
            return ''

        index_key = str(uuid.uuid4())
        mapping: str = _get_mapping(alias=alias)
        self.elasticsearch.indices.create(index=index_key, body=mapping)

        # alias our new index
        index_actions = {
            'actions': [{
                'add': {
                    'index': index_key,
                    'alias': alias
                }
            }]
        }
        self.elasticsearch.indices.update_aliases(index_actions)
        return index_key
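# A minimal, self-contained sketch (not part of the proxy above) of the
# interleaved action/source format that _build_index_actions() produces and
# _bulk_helper() sends. Host, index name and documents are illustrative only.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # assumed local cluster

docs = [
    {"id": "dashboard://gold.finance/revenue", "name": "Revenue", "total_usage": 42},
    {"id": "dashboard://gold.finance/churn", "name": "Churn", "total_usage": 7},
]

actions = []
for doc in docs:
    # action line first, then the document body, exactly as _bulk_helper() expects
    actions.append({"index": {"_index": "dashboard_search_index", "_id": doc["id"]}})
    actions.append(doc)

response = es.bulk(body=actions)
if response.get("errors"):
    # failures are reported per item rather than raised as exceptions
    print(response["items"])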
Example #31
class Scraper(object):
    '''
    Base Scraper:
    -------------
    This is the default scraper inherited by the rest.
    '''
    def __init__(self):

        self.small_batch = True if "small_batch" in sys.argv else False

        self.site_url = None
        self.site_pages_no = None
        self.fields = None

        self.doc_id = 1  # Id for each entry, to be incremented
        self.es_index = ES["index"]  # Elasticsearch index
        self.es_doc = None  # Elasticsearch doc_type

        self.s3 = boto3.client(
            "s3", **{
                "aws_access_key_id": AWS["aws_access_key_id"],
                "aws_secret_access_key": AWS["aws_secret_access_key"],
                "region_name": AWS["region_name"]
            })

        self.data_key = DATA_DIR + "data.json"  # Storage key for latest data
        self.data_archive_key = DATA_DIR + "archive/data-{}.json"  # Storage key for data to archive

        try:
            # client host for aws elastic search service
            if "aws" in ES["host"]:
                # set up authentication credentials
                awsauth = AWS4Auth(AWS["aws_access_key_id"],
                                   AWS["aws_secret_access_key"],
                                   AWS["region_name"], "es")
                self.es_client = Elasticsearch(
                    hosts=[{
                        "host": ES["host"],
                        "port": int(ES["port"])
                    }],
                    http_auth=awsauth,
                    use_ssl=True,
                    verify_certs=True,
                    connection_class=RequestsHttpConnection,
                    serializer=JSONSerializerPython2())

            else:
                self.es_client = Elasticsearch("{}:{}".format(
                    ES["host"], ES["port"]))
        except Exception as err:
            self.print_error(
                "- ERROR: ES Client Set Up \n- SOURCE: Invalid parameters for ES Client \n- MESSAGE: {}"
                .format(str(err)))

        self.results = []
        self.results_es = []

    def run_scraper(self):
        '''
        This function works to display some output and run scrape_site()
        '''
        print "[{}] ".format(
            re.sub(r"(\w)([A-Z])", r"\1 \2",
                   type(self).__name__))
        print "[{}] Started Scraper.".format(
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        self.scrape_site()

        print "[{}] Scraper completed. {} documents retrieved.".format(
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"), len(self.results))

        return self.results

    def scrape_site(self):
        '''
        This functions scrapes the entire website by calling each page.
        '''
        self.set_site_pages_no()
        if not self.site_pages_no:
            self.print_error(
                "- ERROR: scrape_site() \n- SOURCE: {} \n- MESSAGE: {}".format(
                    self.site_url, "No pages found."))
            return

        for page_num in range(1, self.site_pages_no + 1):
            # Check if is NHIF and if so just use page_num else format site_url
            nhif = set(re.sub(r"(\w)([A-Z])", r"\1 \2", type(self).__name__).lower().split()) &\
                set(NHIF_SERVICES)

            url = page_num if nhif else self.site_url.format(page_num)

            results, results_es = self.scrape_page(url, 5)

            if type(results) != list:
                self.print_error(
                    "- ERROR: scrape_site() \n- SOURCE: {} \n-MESSAGE: page: {} \ndata: {}"
                    .format(url, page_num, results))
                return

            self.results.extend(results)
            self.results_es.extend(results_es)

        if self.results:
            self.archive_data(json.dumps(self.results))
            self.elasticsearch_delete_docs()
            self.elasticsearch_index(self.results_es)

        return self.results

    def scrape_page(self, page_url, page_retries):
        '''
        Scrape the page for the data.
        '''
        try:
            soup = self.make_soup(page_url)
            table = soup.find("table", {"class": "zebra"}).find("tbody")
            rows = table.find_all("tr")

            results = []
            results_es = []
            for row in rows:
                # only the columns we want
                # -1 because fields/columns has extra index; id
                columns = row.find_all("td")[:len(self.fields) - 1]
                columns = [text.text.strip() for text in columns]
                columns.append(self.doc_id)

                entry = dict(zip(self.fields, columns))
                meta, entry = self.elasticsearch_format(entry)
                results_es.append(meta)
                results_es.append(entry)
                results.append(entry)

                self.doc_id += 1

            return results, results_es

        except Exception as err:
            if page_retries >= 5:
                self.print_error(
                    "- ERROR: scrape_page() \n- SOURCE: {} \n- MESSAGE: {}".
                    format(page_url, str(err)))
                return
            else:
                page_retries += 1
                self.print_error(
                    "- ERROR: Try {}/5 has failed... \n- SOURCE: {} \n- MESSAGE {} \nGoing to sleep for {} seconds."
                    .format(page_retries, page_url, err, page_retries * 5))
                time.sleep(page_retries * 5)
                return self.scrape_page(page_url, page_retries)

    def set_site_pages_no(self):
        '''
        Set the total number of pages to be scraped
        '''
        try:
            soup = self.make_soup(self.site_url.format(1))
            text = soup.find("div", {"id": "tnt_pagination"}).getText()
            # What number of pages looks like
            pattern = re.compile("(\d+) pages?")
            self.site_pages_no = int(pattern.search(text).group(1))
        except Exception as err:
            self.print_error(
                "- ERROR: get_total_page_numbers() \n- SOURCE: {} \n- MESSAGE: {}"
                .format(self.site_url, str(err)))

        # If small batch is set, that would be the number of pages.
        if self.small_batch and self.site_pages_no and self.site_pages_no > SMALL_BATCH:
            self.site_pages_no = SMALL_BATCH

        # TODO: Print how many pages we found

    def make_soup(self, url):
        '''
        Get page, make and return a BeautifulSoup object
        '''
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        return soup

    def elasticsearch_format(self, entry):
        """
        Format entry into elasticsearch ready document
        :param entry: the data to be formatted
        :return: dictionaries of the entry's metadata and the formatted entry
        """
        # all bulk data need meta data describing the data
        meta_dict = {
            "index": {
                "_index": self.es_index,
                "_type": self.es_doc,
                "_id": entry["id"]
            }
        }
        return meta_dict, entry

    def elasticsearch_index(self, results):
        '''
        Upload data to Elastic Search
        '''
        try:
            # sanity check
            if not self.es_client.indices.exists(index=self.es_index):
                self.es_client.indices.create(index=self.es_index)
                print(
                    "[{0}] Elasticsearch: Index successfully created.".format(
                        datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

            # bulk index the data and use refresh to ensure that our data will be immediately available
            response = self.es_client.bulk(index=self.es_index,
                                           body=results,
                                           refresh=True)
            print("[{0}] Elasticsearch: Index successful.".format(
                datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
            return response
        except Exception as err:
            self.print_error(
                "- ERROR: elasticsearch_index() \n- SOURCE: {} \n- MESSAGE: {}"
                .format(type(self).__name__, str(err)))

    def elasticsearch_delete_docs(self):
        '''
        Delete documents that were uploaded to elasticsearch in the last scrape
        '''
        delete_query = {"query": {"match_all": {}}}
        try:
            response = self.es_client.delete_by_query(index=self.es_index,
                                                      doc_type=self.es_doc,
                                                      body=delete_query,
                                                      _source=True)
            return response
        except Exception as err:
            self.print_error(
                "- ERROR: elasticsearch_delete_docs() \n- SOURCE: {} \n- MESSAGE: {}"
                .format(type(self).__name__, str(err)))

    def archive_data(self, payload):
        '''
        Upload scraped data to AWS S3
        '''
        try:
            date = datetime.today().strftime("%Y%m%d")
            # data_key and data_archive_key already include DATA_DIR (set in __init__)
            if AWS["s3_bucket"]:
                old_etag = self.s3.get_object(Bucket=AWS["s3_bucket"],
                                              Key=self.data_key)["ETag"]
                new_etag = hashlib.md5(payload.encode("utf-8")).hexdigest()
                # S3 wraps the ETag in double quotes; strip them instead of using eval()
                if old_etag.strip('"') != new_etag:
                    file_obj = StringIO(payload.encode("utf-8"))
                    self.s3.upload_fileobj(file_obj, AWS["s3_bucket"],
                                           self.data_key)

                    # archive historical data
                    self.s3.copy_object(
                        Bucket=AWS["s3_bucket"],
                        CopySource="{}/".format(AWS["s3_bucket"]) +
                        self.data_key,
                        Key=self.data_archive_key.format(date))
                    print "[{0}] Archive: Data has been updated.".format(
                        datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
                    return
                else:
                    print "[{0}] Archive: Data scraped does not differ from archived data.".format(
                        datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
            else:
                # archive to local dir
                # payload is already a JSON string (json.dumps at the call site)
                with open(self.data_key, "w") as data:
                    data.write(payload)
                # archive historical data to local dir
                with open(self.data_archive_key.format(date), "w") as history:
                    history.write(payload)
                print("[{0}] Archived: Data has been updated.".format(
                    datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

        except Exception as err:
            self.print_error(
                "- ERROR: archive_data() \n- SOURCE: {} \n- MESSAGE: {}".
                format(self.data_key, str(err)))

    def print_error(self, message):
        '''
        Print error messages in the terminal.
        If slack webhook is set up, post the errors to Slack.
        '''
        print colored(
            "[{0}]\n".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")) +
            message, "red")
        response = None
        if SLACK["url"]:
            try:
                err = message.split("-", 3)
                severity = err[3].split(":")[1]
                errors = {
                    "author": err[1].replace("ERROR:", "").strip(),
                    "pretext": err[2].replace("SOURCE:", "").strip(),
                    "message": err[3].replace("MESSAGE:", "").strip(),
                    "severity": severity
                }
            except:
                errors = {
                    "pretext": "",
                    "author": message,
                    "message": message,
                    "severity": message
                }
            response = requests.post(
                SLACK["url"],
                data=json.dumps({
                    "attachments": [{
                        "author_name":
                        "{}".format(errors["author"]),
                        "color":
                        "danger",
                        "pretext":
                        "[SCRAPER] New Alert for {} : {}".format(
                            errors["author"], errors["pretext"]),
                        "fields": [{
                            "title": "Message",
                            "value": "{}".format(errors["message"]),
                            "short": False
                        }, {
                            "title": "Machine Location",
                            "value": "{}".format(getpass.getuser()),
                            "short": True
                        }, {
                            "title":
                            "Time",
                            "value":
                            "{}".format(
                                datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
                            "short":
                            True
                        }, {
                            "title": "Severity",
                            "value": "{}".format(errors["severity"]),
                            "short": True
                        }]
                    }]
                }),
                headers={"Content-Type": "application/json"})
        return response
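# The scraper above builds the action/document pairs for es.bulk() by hand in
# elasticsearch_format(); a hedged alternative sketch using
# elasticsearch.helpers.bulk, which assembles the pairs itself. Index name and
# entries below are illustrative only.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

es_client = Elasticsearch("localhost:9200")  # assumed local cluster

entries = [
    {"id": 1, "name": "Jane Doe", "reg_no": "A123"},
    {"id": 2, "name": "John Doe", "reg_no": "B456"},
]

actions = (
    {"_index": "healthtools", "_id": entry["id"], "_source": entry}
    for entry in entries
)

success, errors = bulk(es_client, actions, refresh=True, raise_on_error=False)
print("indexed %d documents, %d failures" % (success, len(errors)))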
Example #32
class Scraper(object):
    '''
    Base Scraper:
    -------------
    This is the default scraper inherited by the rest.
    '''
    def __init__(self):
        parser = argparse.ArgumentParser()
        parser.add_argument(
            '-sb',
            '--small-batch',
            action="store_true",
            help=
            "Specify option to scrape limited pages from site in development mode"
        )
        parser.add_argument(
            '-scr',
            '--scraper',
            nargs='+',
            choices=[
                "doctors", "foreign_doctors", "nhif_outpatient",
                "nhif_outpatient_cs", "nhif_inpatient", "health_facilities"
            ],
            help=
            "Allows selection of what to scrape instead of the default everything."
        )

        self.args = parser.parse_args()

        self.small_batch = True if self.args.small_batch else False

        # Logging
        self.log = logging.getLogger(__name__)

        self.site_url = None
        self.site_pages_no = None
        self.fields = None

        self.doc_id = 1  # Id for each entry, to be incremented
        self.es_index = ES["index"]  # Elasticsearch index
        self.es_doc = None  # Elasticsearch doc_type

        self.s3 = boto3.client(
            "s3", **{
                "aws_access_key_id": AWS["aws_access_key_id"],
                "aws_secret_access_key": AWS["aws_secret_access_key"],
                "region_name": AWS["region_name"]
            })

        self.s3_handler = S3ObjectHandler(self.s3)

        self.data_key = DATA_DIR + "data.json"  # Storage key for latest data
        # Storage key for data to archive
        self.data_archive_key = DATA_DIR + "archive/data-{}.json"

        try:
            # client host for aws elastic search service
            if "aws" in ES["host"]:
                # set up authentication credentials
                awsauth = AWS4Auth(AWS["aws_access_key_id"],
                                   AWS["aws_secret_access_key"],
                                   AWS["region_name"], "es")
                self.es_client = Elasticsearch(
                    hosts=[{
                        "host": ES["host"],
                        "port": int(ES["port"])
                    }],
                    http_auth=awsauth,
                    use_ssl=True,
                    verify_certs=True,
                    connection_class=RequestsHttpConnection,
                    serializer=JSONSerializerPython2())

            else:
                self.es_client = Elasticsearch("{}:{}".format(
                    ES["host"], ES["port"]))
        except Exception as err:
            error = {
                "ERROR": "ES Client Set Up",
                "SOURCE": "Invalid parameters for ES Client",
                "MESSAGE": str(err)
            }
            self.print_error(error)

        self.results = []
        self.results_es = []

        self.scraping_started = time.time()
        self.scraping_ended = time.time()
        self.stat_log = {}

    def run_scraper(self):
        '''
        This function works to display some output and run scrape_site()
        '''
        self.scraping_started = time.time()
        scraper_name = re.sub(r"(\w)([A-Z])", r"\1 \2", type(self).__name__)

        _scraper_name = re.sub(" Scraper", "", scraper_name).lower()
        _scraper_name = re.sub(" ", "_", _scraper_name)

        if not self.args.scraper or \
                (self.args.scraper and _scraper_name in self.args.scraper):

            self.log.info(
                "[%s]", re.sub(r"(\w)([A-Z])", r"\1 \2",
                               type(self).__name__))
            self.log.info("Started Scraper.")

            self.scrape_site()
            '''
            Log stats
            '''
            self.scraping_ended = time.time()
            time_taken_in_secs = self.scraping_ended - self.scraping_started
            m, s = divmod(time_taken_in_secs, 60)
            h, m = divmod(m, 60)
            time_taken = "%dhr:%02dmin:%02dsec" % (
                h, m, s) if time_taken_in_secs > 60 else '{} seconds'.format(
                    time_taken_in_secs)
            self.stat_log = {
                'Scraping took':
                time_taken,
                'Last successful Scraping was':
                strftime("%Y-%m-%d %H:%M:%S", gmtime()),
                'Total documents scraped':
                len(self.results)
            }
            self.log.info("[%s] Scraper completed. %s documents retrieved.",
                          datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                          len(self.results))

            return self.results

    def scrape_site(self):
        '''
        This functions scrapes the entire website by calling each page.
        '''
        self.set_site_pages_no()

        if not self.site_pages_no:
            error = {
                "ERROR": "scrape_site()",
                "SOURCE": self.site_url,
                "MESSAGE": "No pages found."
            }
            self.print_error(error)
            return

        for page_num in range(1, self.site_pages_no + 1):
            # Check if is NHIF and if so just use page_num else format site_url
            nhif = set(re.sub(r"(\w)([A-Z])", r"\1 \2", type(self).__name__).lower().split()) &\
                set(NHIF_SERVICES)

            url = page_num if nhif else self.site_url.format(page_num)

            results, results_es = self.scrape_page(url, 5)

            if type(results) != list:
                error = {
                    "ERROR": "scrape_site()",
                    "SOURCE": url,
                    "MESSAGE": "page: {} \ndata: {}".format(page_num, results)
                }
                self.print_error(error)
                return

            self.results.extend(results)
            self.results_es.extend(results_es)

        if self.results:
            self.archive_data(self.results)
            self.elasticsearch_delete_docs()
            self.elasticsearch_index(self.results_es)

        return self.results

    def scrape_page(self, page_url, page_retries):
        '''
        Scrape the page for the data.
        '''
        try:
            soup = self.make_soup(page_url)
            table = soup.find("table", {"class": "zebra"}).find("tbody")
            rows = table.find_all("tr")

            results = []
            results_es = []
            for row in rows:
                # only the columns we want
                # -1 because fields/columns has extra index; id
                columns = row.find_all("td")[:len(self.fields) - 1]
                columns = [text.text.strip() for text in columns]
                columns.append(self.doc_id)

                entry = dict(zip(self.fields, columns))

                # Check if name field empty, skip
                if not entry['name']:
                    continue

                meta, entry = self.elasticsearch_format(entry)

                results_es.append(meta)
                results_es.append(entry)
                results.append(entry)

                self.doc_id += 1

            return results, results_es

        except Exception as err:
            # TODO: Check page_retries functionality
            if page_retries >= 5:
                error = {
                    "ERROR": "scrape_page()",
                    "SOURCE": page_url,
                    "MESSAGE": str(err)
                }
                self.print_error(error)
                return
            else:
                page_retries += 1
                error = {
                    "ERROR":
                    "Try {}/5 has failed...".format(page_retries),
                    "SOURCE":
                    page_url,
                    "MESSAGE":
                    "{} \nGoing to sleep for {} seconds.".format(
                        err, page_retries * 5)
                }
                self.print_error(error)

                time.sleep(page_retries * 5)
                return self.scrape_page(page_url, page_retries)

    def set_site_pages_no(self):
        '''
        Set the total number of pages to be scraped
        '''
        try:
            soup = self.make_soup(self.site_url.format(1))
            text = soup.find("div", {"id": "tnt_pagination"}).getText()
            # What number of pages looks like
            pattern = re.compile("(\d+) pages?")
            self.site_pages_no = int(pattern.search(text).group(1))
        except Exception as err:
            error = {
                "ERROR": "get_total_page_numbers()",
                "SOURCE": self.site_url,
                "MESSAGE": str(err)
            }
            self.print_error(error)

        # If small batch is set, that would be the number of pages.
        if self.small_batch and self.site_pages_no and self.site_pages_no > SMALL_BATCH:
            self.site_pages_no = SMALL_BATCH

        self.log.info('{} pages found.'.format(self.site_pages_no))

        return self.site_pages_no

    def make_soup(self, url):
        '''
        Get page, make and return a BeautifulSoup object
        '''
        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")
        return soup

    def elasticsearch_format(self, entry):
        """
        Format entry into elasticsearch ready document
        :param entry: the data to be formatted
        :return: dictionaries of the entry's metadata and the formatted entry
        """
        # all bulk data need meta data describing the data
        meta_dict = {
            "index": {
                "_index": self.es_index,
                "_type": self.es_doc,
                "_id": entry["id"]
            }
        }
        return meta_dict, entry

    def elasticsearch_index(self, results):
        '''
        Upload data to Elastic Search
        '''
        try:
            # sanity check
            if not self.es_client.indices.exists(index=self.es_index):
                self.es_client.indices.create(index=self.es_index)
                self.log.info("Elasticsearch: Index successfully created.")

            # bulk index the data and use refresh to ensure that our data will
            # be immediately available
            response = self.es_client.bulk(index=self.es_index,
                                           body=results,
                                           request_timeout=60,
                                           refresh='true')
            self.log.info("Elasticsearch: Index successful.")
            return response
        except Exception as err:
            error = {
                "ERROR": "elasticsearch_index()",
                "SOURCE": type(self).__name__,
                "MESSAGE": str(err)
            }
            self.print_error(error)

    def elasticsearch_delete_docs(self):
        '''
        Delete documents that were uploaded to elasticsearch in the last scrape
        '''
        delete_query = {"query": {"match_all": {}}}
        try:
            response = self.es_client.delete_by_query(index=self.es_index,
                                                      doc_type=self.es_doc,
                                                      body=delete_query,
                                                      request_timeout=60)
            return response
        except Exception as err:
            error = {
                "ERROR": "elasticsearch_delete_docs()",
                "SOURCE": type(self).__name__,
                "MESSAGE": str(err)
            }
            self.print_error(error)

    def archive_data(self, payload):
        '''
        Upload scraped data to AWS S3
        '''
        try:
            date = datetime.today().strftime("%Y%m%d")
            # data_key and data_archive_key already include DATA_DIR (set in __init__)

            # Encode to JSON
            payload = json.dumps(payload)

            if AWS["s3_bucket"]:
                # TODO: Check if bucket exists and has the expected file structure
                self.s3_handler.handle_s3_objects(bucket_name=AWS["s3_bucket"],
                                                  key=self.data_key)

                old_etag = self.s3.get_object(Bucket=AWS["s3_bucket"],
                                              Key=self.data_key)["ETag"]
                new_etag = hashlib.md5(payload.encode("utf-8")).hexdigest()
                # S3 wraps the ETag in double quotes; strip them instead of using eval()
                if old_etag.strip('"') != new_etag:
                    self.s3.put_object(Bucket=AWS["s3_bucket"],
                                       Key=self.data_key,
                                       Body=payload)
                    # archive historical data
                    self.s3.copy_object(
                        Bucket=AWS["s3_bucket"],
                        CopySource="{}/".format(AWS["s3_bucket"]) +
                        self.data_key,
                        Key=self.data_archive_key.format(date))
                    self.log.info("Archive: Data has been updated.")
                    return
                else:
                    self.log.info(
                        "Archive: Data scraped does not differ from archived data."
                    )
            else:
                # archive to local dir
                with open(self.data_key, "w") as data:
                    data.write(payload)  # payload was JSON-encoded above
                # archive historical data to local dir
                with open(self.data_archive_key.format(date), "w") as history:
                    history.write(payload)
                self.log.info("Archived: Data has been updated.")

        except Exception as err:
            error = {
                "ERROR": "archive_data()",
                "SOURCE": self.data_key,
                "MESSAGE": str(err)
            }

            self.print_error(error)

    def print_error(self, message):
        '''
        Print error messages in the terminal.
        If slack webhook is set up, post the errors to Slack.
        '''

        error = "- ERROR: " + message['ERROR']
        source = ("- SOURCE: " +
                  message['SOURCE']) if "SOURCE" in message else ""
        error_msg = "- MESSAGE: " + message['MESSAGE']
        msg = "\n".join([error, source, error_msg])

        self.log.error(msg)

        response = None
        if SLACK["url"]:
            try:
                errors = {
                    "author": message['ERROR'],
                    "pretext": message['SOURCE'],
                    "message": message['MESSAGE'],
                }
            except:
                errors = {
                    "pretext": "",
                    "author": message,
                    "message": message,
                }

            response = requests.post(
                SLACK["url"],
                data=json.dumps({
                    "attachments": [{
                        "username":
                        "******",
                        "author_name":
                        "{}".format(errors["author"]),
                        "color":
                        "danger",
                        "pretext":
                        "[SCRAPER] New Alert for {} : {}".format(
                            errors["author"], errors["pretext"]),
                        "fields": [
                            {
                                "title": "Message",
                                "value": "{}".format(errors["message"]),
                                "short": False
                            },
                            {
                                "title": "Machine Location",
                                "value": "{}".format(getpass.getuser()),
                                "short": True
                            },
                            {
                                "title":
                                "Time",
                                "value":
                                "{}".format(datetime.now().strftime(
                                    "%Y-%m-%d %H:%M:%S")),
                                "short":
                                True
                            },
                        ]
                    }]
                }),
                headers={"Content-Type": "application/json"})
        return response

    def parse_date(self, datetime_string):
        '''
        Parse a string into a datetime object 
        :param datetime_string: the datetime string to parse
        :return: datetime object
        '''
        from dateutil.parser import parse
        dateobject = parse('1900-01-01')
        try:
            dateobject = parse(datetime_string)
        except Exception as ex:
            self.log.error(
                'Cannot create a datetime object from {}.'.format(
                    datetime_string))
        return dateobject
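# Hedged sketch of the ETag comparison used in archive_data() above: for
# single-part uploads S3's ETag is the object's hex MD5, returned wrapped in
# double quotes. Bucket and key names here are made up.
import hashlib

import boto3

s3 = boto3.client("s3")
payload = '[{"id": 1, "name": "example"}]'

head = s3.head_object(Bucket="my-scraper-bucket", Key="data/data.json")
old_etag = head["ETag"].strip('"')  # strip the surrounding quotes instead of eval()
new_etag = hashlib.md5(payload.encode("utf-8")).hexdigest()

if old_etag != new_etag:
    s3.put_object(Bucket="my-scraper-bucket", Key="data/data.json", Body=payload)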
class Command(BaseCommand):
    def __init__(self, *args, **kwargs):
        ret = super().__init__(*args, **kwargs)

        credentials = boto3.Session().get_credentials()
        awsauth = AWS4Auth(credentials.access_key, credentials.secret_key,
                           region, service)

        self.es = Elasticsearch(hosts=[{
            'host': ES_CONFIG.get('host'),
            'port': 443
        }],
                                http_auth=awsauth,
                                use_ssl=True,
                                verify_certs=True,
                                connection_class=RequestsHttpConnection)

        return ret

    def add_arguments(self, parser):
        # Positional arguments
        parser.add_argument('--index-list', nargs='+', type=str)

        # Named (optional) arguments
        parser.add_argument(
            '--all',
            action='store_true',
            help='Reindex all',
        )

    def handle(self, *args, **options):
        if options.get('all'):
            index_list = ES_CONFIG.get('indices')
        else:
            index_list = filter(
                lambda x: x.get('name') in options.get('index_list')[0].split(
                    ','), ES_CONFIG.get('indices'))

        for index in index_list:
            self.recreate_index(index)

    def recreate_index(self, index):
        if self.es.indices.exists(index.get('name')):
            print("===== Deleting index %s ..." % index.get('name'))
            self.es.indices.delete(index=index.get('name'), ignore=[400, 404])

        print("===== Creating Index %s ..." % index.get('name'))
        self.es.indices.create(index=index.get('name'),
                               body=index.get('index-config'))
        print("===== Index %s created successfully." % index.get('name'))

    def bulk_insert_doc(self, doc_list, index_name):
        bulk_body = []

        for doc in doc_list:
            bulk_body.append(
                {'index': {
                    '_index': index_name,
                    '_id': doc.pop('id')
                }})
            bulk_body.append(doc)

        print(
            self.es.bulk(body=bulk_body,
                         headers={"Content-Type": "application/x-ndjson"}))

    def get_processed_doc(self, index):
        table = index.get('table')
        doc_config = index.get('doc_config')
        id_field = doc_config.get('id_field')
        term_fields = doc_config.get('term_fields')

        docs = []

        for item in self.dynamo.scan(TableName=table).get("Items"):
            for key, val in item.get(doc_config.get('id_field')).items():
                _id = val
                break

            assert _id, "Id field cannot be null"

            _source = {
                attr: val
                for attr in doc_config.get('extra_fields')
                for key, val in item.get(attr, {}).items()
            }

            _source['term'] = " ".join([
                val for attr in doc_config.get('term_fields')
                for key, val in item.get(attr, {}).items()
            ])

            docs.append({
                "_index": index.get('name'),
                "_type": '_doc',
                "_id": _id,
                "_source": _source
            })

        return docs
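# get_processed_doc() above reads a single scan() page; DynamoDB scans are
# paginated, so a full export typically loops with a paginator. Hedged sketch,
# table name is illustrative only.
import boto3

dynamo = boto3.client("dynamodb")

items = []
paginator = dynamo.get_paginator("scan")
for page in paginator.paginate(TableName="my-table"):
    items.extend(page.get("Items", []))

print("scanned %d items" % len(items))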
Example #34
bulk_data = []
for row in response:
    data_dict = row
    op_dict = {
        "index": {
            "_index": INDEX_NAME,
            "_type": TYPE_NAME,
            "_id": data_dict[ID_FIELD]
        }
    }
    bulk_data.append(op_dict)
    bulk_data.append(data_dict)
# # clear index
# if es.indices.exists(INDEX_NAME):
#     print("deleting '%s' index..." % (INDEX_NAME))
#     res = es.indices.delete(index = INDEX_NAME)
#     print(" response: '%s'" % (res))
# # create index and insert bulk data
# res = es.indices.create(index = INDEX_NAME)
res = es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
# verify
res = es.search(index=INDEX_NAME, size=2, body={"query": {"match_all": {}}})
print(" response: '%s'" % (res))

# res = es.index(
#     index="march-mad", doc_type=ts, id=ts, body={
#         'timestamp':ts,
#         'text':tweet["text"]
#     })
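# The fragment above relies on es, response, INDEX_NAME, TYPE_NAME and ID_FIELD
# being defined elsewhere; a hedged, self-contained variant with made-up values:
from elasticsearch import Elasticsearch

es = Elasticsearch("localhost:9200")
INDEX_NAME, TYPE_NAME, ID_FIELD = "people", "_doc", "id"
response = [{"id": 1, "name": "Ada"}, {"id": 2, "name": "Grace"}]  # e.g. rows from a database query

bulk_data = []
for row in response:
    bulk_data.append({"index": {"_index": INDEX_NAME, "_type": TYPE_NAME, "_id": row[ID_FIELD]}})
    bulk_data.append(row)

res = es.bulk(index=INDEX_NAME, body=bulk_data, refresh=True)
print("bulk errors: %s" % res["errors"])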
Example #35
    def post(self, request, *args, **kwargs):
        ret = {"code": -1, "desc": ""}
        es = Elasticsearch(settings.ELASTICSEARCH_HOST)
        myfile = request.FILES['payload']
        books = es.search(index='bookshelf',
                          body={
                              "_source": ["title"],
                              "query": {
                                  "bool": {
                                      "must": [{
                                          "match_phrase": {
                                              "title": myfile.name
                                          }
                                      }, {
                                          "multi_match": {
                                              "query": myfile.name,
                                              "type": "most_fields",
                                              "fields": ["title"]
                                          }
                                      }]
                                  }
                              }
                          })

        if books['hits']['total']['value'] > 0:
            logger.info('same title\'s book detected.')
            if request.POST.get('replace') == "1":
                logger.info('try to delete target book content.')
                es.delete_by_query(
                    index='bookshelf',
                    body={
                        "query": {
                            "bool": {
                                "must": [{
                                    "match_phrase": {
                                        "title": myfile.name
                                    }
                                }, {
                                    "multi_match": {
                                        "query": myfile.name,
                                        "type": "most_fields",
                                        "fields": ["title"]
                                    }
                                }]
                            }
                        }
                    })
            else:
                logger.info('Uploaded book already exists.')
                ret["code"] = -1
                ret["desc"] = "Uploaded book already exists."
                return JsonResponse(ret)

        fs = FileSystemStorage()
        filename = fs.save(myfile.name, myfile)
        parsed = parser.from_file(filename, xmlContent=True)
        paged = parsed["content"].split('<div class="page">')
        body = ""
        iPage = 1
        for page in paged[1:-2]:
            body = body + json.dumps(
                {"index": {
                    "_index": "bookshelf",
                    "_type": "bookshelf_datas"
                }}) + '\n'
            body = body + json.dumps({
                "title": myfile.name,
                "page": iPage,
                "content": page
            }) + '\n'
            iPage = iPage + 1
        es.bulk(body)
        ret["code"] = 1
        ret["desc"] = "The Book has beend saved!"
        return JsonResponse(ret)
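# The view above discards the return value of es.bulk(); a hedged sketch of
# checking it, since bulk reports per-item failures instead of raising. The
# NDJSON body mirrors the one built in the view, with made-up page content.
import json

from elasticsearch import Elasticsearch

es = Elasticsearch("localhost:9200")

body = ""
for page_no, page_html in enumerate(["<p>page one</p>", "<p>page two</p>"], start=1):
    body += json.dumps({"index": {"_index": "bookshelf", "_type": "bookshelf_datas"}}) + "\n"
    body += json.dumps({"title": "example.pdf", "page": page_no, "content": page_html}) + "\n"

result = es.bulk(body)
if result.get("errors"):
    failed = [item for item in result["items"] if item["index"].get("error")]
    print("%d pages failed to index" % len(failed))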
Example #36
def populate_index(inputFilePath,logfile,INDEX_NAME,TYPE_NAME,index_populate_config):
    
    es = Elasticsearch()

    ic = 0
    ir = 0
    
    with open(inputFilePath, "r") as fin: 
        
            start = time.time()
            
            '''
            number of document processed in each bulk index
            '''
            bulk_size = 500
            
            '''
            data in bulk index
            '''
            bulk_data = [] 

            cnt = 0
            '''
            each line is single document
            '''
            for line in fin: 

                    cnt += 1
                    paperInfo = json.loads(line.strip())

                    data_dict = {}

                    '''
                    update PMID
                    '''
                    data_dict["pmid"] = paperInfo.get("PMID", "-1")
                    
                    '''
                    Update title
                    '''
                    if index_populate_config["title"]:
                        data_dict["title"] = paperInfo['ArticleTitle']

                    '''
                    update Abstract
                    '''
                    data_dict["abstract"] = paperInfo.get("Abstract", "").lower().replace('-', ' ')

                    '''
                    Update date
                    '''
                    if index_populate_config['date']:
                        data_dict["date"] = str(paperInfo['PubDate'])
                    
                    '''
                    Update MeSH
                    '''
                    if index_populate_config['MeSH']:
                        data_dict["MeSH"] = paperInfo['MeshHeadingList']
                        
                    '''
                    Update location
                    '''  
                    if index_populate_config['location']:
                        data_dict["location"] = paperInfo['Country']
                        
                    '''
                    Update Author
                    ''' 
                    if index_populate_config['author']:
                        data_dict["author"] = paperInfo['AuthorList']
                        
                    '''
                    Update Journal
                    '''
                    if index_populate_config['journal']:
                        data_dict["journal"] = paperInfo['Journal']
                        
                    
                    '''
                    Put current data into the bulk 
                    '''
                    op_dict = {
                        "index": {
                            "_index": INDEX_NAME,
                            "_type": TYPE_NAME,
                            "_id": data_dict["pmid"]
                        }
                    }

                    bulk_data.append(op_dict)
                    bulk_data.append(data_dict) 


                    '''
                    Start Bulk indexing
                    '''
                    if cnt % bulk_size == 0 and cnt != 0:
                        ic += 1
                        tmp = time.time()
                        es.bulk(index=INDEX_NAME, body=bulk_data, request_timeout = 500)

                        logfile.write("bulk indexing... %s, elapsed time %s (seconds) \n" % (cnt, tmp - start))
                        if ic % 100 == 0:
                            print(" i bulk indexing... %s, elapsed time %s (seconds) " % (cnt, tmp - start))

                        bulk_data = []


            '''
            indexing those left papers
            '''
            if bulk_data:
                ir +=1
                tmp = time.time()
                es.bulk(index=INDEX_NAME, body=bulk_data, request_timeout = 500)

                logfile.write("bulk indexing... %s, elapsed time %s (seconds) \n" % (cnt, tmp - start))
                if ir % 100 == 0:
                    print(" r bulk indexing... %s, elapsed time %s (seconds) " % (cnt, tmp - start))

                bulk_data = []




            end = time.time()
            logfile.write("Finished PubMed meta-data indexing. Total elapsed time %s (seconds) \n" % (end - start))
            print("Finished PubMed meta-data indexing. Total elapsed time %s (seconds) " % (end - start))
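# populate_index() above batches by hand every bulk_size lines; a hedged
# alternative sketch using elasticsearch.helpers.streaming_bulk, which chunks
# and flushes internally. File path, index and field names are illustrative.
import json

from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk

es = Elasticsearch("localhost:9200")

def actions(path, index_name):
    # one JSON document per line, as in the loop above
    with open(path, "r") as fin:
        for line in fin:
            paper = json.loads(line.strip())
            yield {
                "_index": index_name,
                "_id": paper.get("PMID", "-1"),
                "_source": {
                    "title": paper.get("ArticleTitle", ""),
                    "abstract": paper.get("Abstract", "").lower().replace("-", " "),
                },
            }

for ok, item in streaming_bulk(es, actions("pubmed_meta.jsonl", "pubmed"),
                               chunk_size=500, raise_on_error=False):
    if not ok:
        print(item)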
               
Example #37
def StartIndex(txtdir, idxdir="/tmp/index", left=3, right=3, length=7, backend="files"):
    global idx
    global gbackend
    gbackend = backend
    bulkcnt = 0
    bulk = []
    bulk_size = 5000
    if backend == "elastic":
        pw = os.environ["ELASTICPW"]
        from elasticsearch import Elasticsearch
        es = Elasticsearch(hosts = ["http://*****:*****@localhost:9200/" % (pw)])
        INDEX_NAME = 'krp'
        metadata = {'index' : { '_index' : INDEX_NAME, '_type' : 'idx' }}
    elif backend == "rethink":
        pw = os.environ["RETHINKPW"]
        indexdb="krpindex"
        indextb="idx"
        import rethinkdb as r
        #conn=r.connect("localhost", password=pw)
        conn=r.connect("localhost", port=28020)
        if not indexdb in r.db_list().run(conn):
            r.db_create(indexdb).run(conn)
            r.db(indexdb).table_create(indextb).run(conn)
            #ngram index for n=1, n=2 and n=3
            #r.db(indexdb).table(indextb).index_create("ngram", [r.row["text"].slice(0,1),r.row["text"].slice(0,2),r.row["text"].slice(0,3)], multi=True).run(conn) 

        elif teardown:
            r.db(indexdb).table_drop(indextb).run(conn)
            r.db(indexdb).table_create(indextb).run(conn)
            r.db(indexdb).table(indextb).index_create("ngram", [r.row["text"].slice(0,1),r.row["text"].slice(0,2),r.row["text"].slice(0,3)], multi=True).run(conn) 
        rx=r.db(indexdb).table(indextb)
    old = {}
    now = {}
    oldindex = []
    repo = git.Repo(txtdir)
    txtid = os.path.split(txtdir)[-1]
    coll = txtid[0:4]
    #check if a previous run exists:
    for b in [a for a in repo.branches if a.name == a.name.upper() or a.name == "master"]:
        now[b.name] = b.commit.hexsha
    if backend == "files":
        lg = '%s/meta/%s/%s.log' % (idxdir, coll, txtid)
        # if we have a log file, this is an update
        update = os.path.isfile(lg)
    else:
        lg = ""
        update = False
    if debug:
        print lg, update
    changed = True
    if update:
        #        changed = False
        lgf = codecs.open(lg, 'r', 'utf-8')
        for l in lgf:
            if l.startswith('para:'):
                if debug:
                    print "INFO: overwriting parameters from old run: %s" % (l[5:-1])  
                idxdir, left, right, length = eval(l[5:-1])
            elif l.startswith('#'):
                continue
            else:
                branch, version = l[:-1].split('\t', 1)
                old[branch] = version
                if debug:
                    print "Branch check: ", branch, version, now[branch], now[branch] != version
                try:
                    #if a branch exists and has a different version, we need to remake all
                    changed = now[branch] != version
                except:
                    #if we have a new version, we need to do it anyway
                    changed = True
        #if we have a different number of versions, proceed
        lgf.close()
        if not changed:
            changed = len(old) != len(now)
        if changed:
            if debug:
                print "INFO: Something changed, re-indexing."
            oldindex = mdIndexGit(txtdir, repo, old, left, right, length)
    else:
        if idxdir:
            try:
                os.makedirs("%s/meta/%s" % (idxdir, coll))
            except:
                pass
    # now we write the new logfile
    if backend == "files":
        rec = codecs.open(lg, 'w', 'utf-8')
        if changed:
            rec.write(u"# updating index at: %s\n" % (datetime.datetime.now()))
        else:
            rec.write(u"# creating index at %s\n" % (datetime.datetime.now()))
        rec.write(u"para: '%s', %d, %d, %d\n" % (idxdir, left, right, length))
        for b in [a for a in repo.branches if a.name == a.name.upper() or a.name=="master"]:
            rec.write(u"%s\t%s\n" % (b.name.decode('utf-8'), b.commit.hexsha))
            # check for identical hashes:
        rec.close()
    if  len(now) > 0 and changed:
        index = mdIndexGit(txtdir, repo, now, left, right, length)
        if not update:
            if debug:
                dpath=idxdir + "/debug/" + coll
                try:
                    os.makedirs(dpath)
                except:
                    pass
                debfile = codecs.open(dpath + "/" + txtid + ".idx",  "w", "utf-8")
            for ixx in index:
                for i in ixx:
                    if debug:
                        debfile.write("%s\n" % ( i))
                    PrintToIdxfile(idxdir, i, txtid[0:8])
            if debug:
                print "writing index %d keys." % (len(idx))
            for of in idx.keys():                
                if backend == "files":
                    outfile=codecs.open(of, 'a+', 'utf-8')
                    outfile.write("".join(idx[of]))
                    outfile.close()
                elif backend == "elastic" or backend == "rethink":
                    #ch=unichr(int(os.path.split(of)[-1].split(".")[0], 16))
                    #es_out = [u"%s%s" % (ch, a.strip("\n")) for a in idx[of].split("\n")]
                    for mx in idx[of]:
                        if backend == "elastic":
                            bulk.append(metadata)
                        bulk.append(mx)
                        bulkcnt +=1
                    if bulkcnt > bulk_size:
                        if backend == "elastic":
                            resb = es.bulk(index = INDEX_NAME,body = bulk, refresh = True)
                        else:
                            resb = dict(rx.insert(bulk).run(conn))
                            try:
                                print resb['inserted']
                            except:
                                print resb
                        bulk = []
                        bulkcnt = 0
                else:
                    print "No valid backend found, exiting.  Valid backends are:\nfiles\tfile based index_stage\nelastic\telasticsearch index\nrethink\trethinkdb index\n"
                    sys.exit()
            idx={}
        else:
            if len(oldindex) > 0:
                t = SequenceMatcher()
                for x in range(0, len(index) - 1):
                    #this assumes the same number of files...
                    t.set_seqs(index[x], oldindex[x])
                    for tag, i1, i2, j1, j2 in t.get_opcodes():
                        print "update: ", tag, i1, i2, j1, j2
                        if tag == "replace":
                            pass
                            ##write out procedure to replace or insert
                        elif tag == "insert":
                            pass
                            ##write out procedure to replace or insert
                        elif tag == "delete":
                            pass
                             # for i in range(i1, i2):
                             #     print index[i]
        try:
            debfile.close()
        except:
            pass
    else:
        return "INFO: Nothing to do, no new commits in repository."

    
    ## do sth with the index... : either write out new or write patch
#    rec.write("hash: \n")
    repo.git.checkout('master')
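# Hedged sketch of the counter-and-flush pattern used in the elastic/rethink
# branch above, with a trailing flush for the final partial batch (the loop
# above appears to drop entries left over when fewer than bulk_size remain).
# Index name and entries are made up.
from elasticsearch import Elasticsearch

es = Elasticsearch("localhost:9200")
INDEX_NAME = "krp"
bulk, bulk_size = [], 5000

def flush(buf):
    # send whatever has been accumulated; ignore empty buffers
    if buf:
        es.bulk(index=INDEX_NAME, body=buf, refresh=True)

for n, text in enumerate("line %d" % i for i in range(12000)):
    bulk.append({"index": {"_index": INDEX_NAME, "_type": "idx"}})
    bulk.append({"text": text, "seq": n})
    if len(bulk) // 2 >= bulk_size:
        flush(bulk)
        bulk = []

flush(bulk)  # final partial batch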
Example #38
class ErrorlogSender:
    def __init__(self):
        self._ERRORLOG_PREFIX = "error/mysql-error-running.log."

        self._GENERAL_CONFIG = {
            # Elasticsearch host name
            "ES_HOST": "192.168.0.1:4040",

            # Elasticsearch prefix for index name
            "INDEX_PREFIX": "rds_errorlog",

            # Elasticsearch type name is rds instance id
            "RDS_ID": "tb-master",

            # Enabled to change timezone. If you set UTC, this parameter is blank
            "TIMEZONE": "Asia/Seoul",

            # RDS region which you want to crawling error log.
            "AWS_RDS_REGION_ID": "ap-northeast-2",

            # If you have ec2 instances, then It need region and VPC involving instances.
            "AWS_EC2_REGION_ID": "ap-northeast-2",
            "AWS_EC2_VPC_ID": "vpc-XXxxXXxx",
        }

        self._REGEX4REFINE = {
            "QUERYTIME_REGEX":
            re.compile(
                "^[a-zA-Z#:_ ]+([0-9.]+)[a-zA-Z:_ ]+([0-9.]+)[a-zA-Z:_ ]+([0-9.]+).[a-zA-Z:_ ]+([0-9.]+)$"
            ),
            "GENERAL_ERR":
            re.compile(
                "(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) (\w+) \[(\w+)\] (.*)"),
            "ABORTED_CONN":
            re.compile("db: '(\w+)' user: '******' host: '([\w\d\.]+)'"),
            "ACCESS_DENY":
            re.compile("user '(.*)'@'([\d\.]+)' "),
            "DEADLOCK":
            re.compile("(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) (\w+)"),
            "ROLLBACK_TR":
            re.compile("\*\*\* WE ROLL BACK TRANSACTION \((\d+)\)"),
            "HOLD_USER_INFO":
            re.compile(
                "MySQL thread id (\w+), OS thread handle \w+, query id (\d+) ([\d\.]+) (\w+) "
            ),
            "HOLD_LOCK_INFO":
            re.compile("table `(\w+)`\.`(\w+)` trx id (\d+) lock_mode (\w+)")
        }

        self._LOG_CONFIG = {
            "LOG_OUTPUT_DIR": "/var/log/rdslog/errorlog2es.log",
            "RAW_OUTPUT_DIR": "/var/log/rdslog/errorlog"  # (Optional)
        }

        self._ABORTED_CONN_MSG = "Aborted connection"
        self._ACCESS_DENY_MSG = "Access denied"

        self._BEGIN_DEADLOCK = "deadlock detected"
        self._END_DEADLOCK = "WE ROLL BACK TRANSACTION"
        self._BEGIN_TRX = "TRANSACTION"
        self._TRASACTION_LENGTH = 9

        self._es = Elasticsearch(self._GENERAL_CONFIG["ES_HOST"])
        self._ec2dict = dict()
        self._data = list()
        self._num_of_total_doc = 0
        self._reaminer = RawFileRemainer(self._LOG_CONFIG["RAW_OUTPUT_DIR"])

        self._now = datetime.now()

    def validateLogDate(self, lines):
        delta = timedelta(hours=2)

        for line in lines:
            if not line:
                continue
            elif line.startswith("# Time: "):
                log_time = datetime.strptime(line[8:], "%y%m%d %H:%M:%S")
                log_time = log_time.replace(tzinfo=tz.tzutc()).astimezone(
                    zoneinfo.gettz(self._GENERAL_CONFIG["TIMEZONE"]))
                log_time = log_time.replace(tzinfo=None)
                print(self._now, log_time)
                print("diff :", self._now - log_time)
                if (self._now - log_time) > delta:
                    return False
                else:
                    return True

        return True

    def initElasticsearchIndex(self):
        self._ES_INDEX = self._GENERAL_CONFIG[
            "INDEX_PREFIX"] + "-" + datetime.strftime(self._now, "%Y.%m")

    def initEC2InstancesInVpc(self, region, vpc):
        ec2 = boto3.resource("ec2", region_name=region)
        vpc = ec2.Vpc(vpc)
        for i in vpc.instances.all():
            for tag in i.tags:
                if tag['Key'] == 'Name':
                    self._ec2dict[i.private_ip_address] = "".join(
                        tag['Value'].split())

    def getRdsLog(self, log_filename):
        client = boto3.client(
            "rds", region_name=self._GENERAL_CONFIG["AWS_RDS_REGION_ID"])
        db_files = client.describe_db_log_files(
            DBInstanceIdentifier=self._GENERAL_CONFIG["RDS_ID"])

        # Use any(): a filter object is always truthy in Python 3.
        if not any(log["LogFileName"] == log_filename
                   for log in db_files["DescribeDBLogFiles"]):
            return ""

        marker = "0"
        log_data = ""

        # Fetch the first portion, then keep downloading while more data is
        # pending (a do-while style loop).
        ret = client.download_db_log_file_portion(
            DBInstanceIdentifier=self._GENERAL_CONFIG["RDS_ID"],
            LogFileName=log_filename,
            Marker=marker,
            NumberOfLines=500)
        log_data = ret["LogFileData"]
        marker = ret["Marker"]

        while ret["AdditionalDataPending"]:
            ret = client.download_db_log_file_portion(
                DBInstanceIdentifier=self._GENERAL_CONFIG["RDS_ID"],
                LogFileName=log_filename,
                Marker=marker,
                NumberOfLines=500)

            log_data += ret["LogFileData"]
            marker = ret["Marker"]

        self._remainer.clearOutOfDateRawFiles()
        self._remainer.makeRawLog(
            "mysql-error.log." + str(datetime.utcnow().hour), log_data)

        return log_data

    def getRdsLog4Debug(self, path):
        content = ""
        import codecs
        f = codecs.open(path, "r", "utf-8")
        while True:
            l = f.readline()
            content += l
            if not l: break
        return content

    def validate_log_date(self, line):
        delta = timedelta(hours=2)

        m = self._REGEX4REFINE["GENERAL_ERR"].match(line)
        if m:
            log_time = datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
            log_time = log_time.replace(tzinfo=tz.tzutc()).astimezone(
                zoneinfo.gettz(self._GENERAL_CONFIG["TIMEZONE"]))
            log_time = log_time.replace(tzinfo=None)
            if (self._now - log_time) > delta:
                return False
        elif self._BEGIN_DEADLOCK in line:
            m = self._REGEX4REFINE["DEADLOCK"].match(line)
            log_time = datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
            log_time = log_time.replace(tzinfo=tz.tzutc()).astimezone(
                zoneinfo.gettz(self._GENERAL_CONFIG["TIMEZONE"]))
            log_time = log_time.replace(tzinfo=None)
            if (self._now - log_time) > delta:
                return False

        return True

    def createTemplate(self, template_name):
        template_body = {
            "template": "rds_errorlog-*",
            "settings": {
                "number_of_shards": 1
            }
        }

        response = self._es.indices.put_template(name=template_name,
                                                 body=template_body)
        if response["acknowledged"]:
            print("Create template success.")
        else:
            print("Create template failed.")

    def appendDoc2Data(self, doc, flush=False):
        # The bulk body is a flat list of action-metadata / document pairs.
        self._data.append({
            "index": {
                "_index": self._ES_INDEX,
                "_type": self._GENERAL_CONFIG["RDS_ID"]
            }
        })
        self._data.append(doc)

        self._num_of_total_doc += 1

        if len(self._data) > 10000 or flush:
            self._es.bulk(index=self._ES_INDEX, body=self._data, refresh=flush)
            print("%s : Indexed a bulk batch of %s lines" %
                  (str(datetime.now()), len(self._data)))
            # Reset the buffer so already-indexed documents are not re-sent
            # on the next flush.
            self._data = []

    def run(self):
        self.initElasticsearchIndex()
        log_filename = self._ERRORLOG_PREFIX + str(datetime.utcnow().hour)
        log_data = self.getRdsLog(log_filename)

        if not log_data:
            print("%s does not exist!" % (log_filename))
            return -1

        lines = log_data.split("\n")
        if len(lines) > 0:
            if not self.validateLogDate(lines):
                print("%s already read log!" % (log_filename))
                return -2
        else:
            print("%s is empty!" % (log_filename))
            return -3

        self.initEC2InstancesInVpc(self._GENERAL_CONFIG["AWS_EC2_REGION_ID"],
                                   self._GENERAL_CONFIG["AWS_EC2_VPC_ID"])
        self.createTemplate(self._GENERAL_CONFIG["INDEX_PREFIX"])

        print("%s : Ready to write %s in %s" %
              (str(datetime.now()), log_filename, self._ES_INDEX))
        i = 0
        doc = {}

        while i < len(lines):
            line = lines[i]
            if not line:
                i += 1
                continue

            if doc:
                self.appendDoc2Data(doc)

            doc = {}
            m = self._REGEX4REFINE["GENERAL_ERR"].match(line)
            if m:
                doc["type"] = "Errorlog"
                doc["code"] = m.group(2)
                doc["severity"] = m.group(3)

                # The message needs additional parsing for the specific
                # cases handled below.
                message = m.group(4)
                if self._ABORTED_CONN_MSG in message:
                    doc["detail"] = self._ABORTED_CONN_MSG
                    match = self._REGEX4REFINE["ABORTED_CONN"].search(message)
                    doc["db"] = match.group(1)
                    doc["user"] = match.group(2)
                    doc["host"] = match.group(3)
                    ip_addr = match.group(3)
                    if ip_addr not in self._ec2dict:
                        doc["name"] = "Missed"
                    else:
                        doc["name"] = self._ec2dict[ip_addr]
                elif self._ACCESS_DENY_MSG in message:
                    doc["detail"] = self._ACCESS_DENY_MSG
                    match = self._REGEX4REFINE["ACCESS_DENY"].search(message)
                    doc["user"] = match.group(1)
                    doc["host"] = match.group(2)
                else:
                    doc["detail"] = "Other"
                doc["message"] = message

                timestamp = datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
                timestamp = timestamp.replace(tzinfo=tz.tzutc()).astimezone(
                    zoneinfo.gettz(self._GENERAL_CONFIG["TIMEZONE"]))
                doc["timestamp"] = timestamp.isoformat()

            elif self._BEGIN_DEADLOCK in line:
                doc["type"] = "Deadlock"
                i += 1  # skip the "deadlock detected" message
                m = self._REGEX4REFINE["DEADLOCK"].match(lines[i])

                timestamp = datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
                timestamp = timestamp.replace(tzinfo=tz.tzutc()).astimezone(
                    zoneinfo.gettz(self._GENERAL_CONFIG["TIMEZONE"]))
                doc["timestamp"] = timestamp.isoformat()
                doc["code"] = m.group(2)
                i += 1  # get next line

                # This transaction is waiting for the lock.
                tr_a = ""
                for offset in range(self._TRANSACTION_LENGTH):
                    tr_a += lines[i + offset] + "\n"
                i += self._TRANSACTION_LENGTH
                doc["transaction_a"] = tr_a

                # Skip non-readable messages.
                while self._BEGIN_TRX not in lines[i]:
                    i += 1

                # This transaction holds the lock.
                tr_b = ""
                for offset in range(self._TRANSACTION_LENGTH):
                    tr_b += lines[i + offset] + "\n"

                doc["transaction_b"] = tr_b

                m = self._REGEX4REFINE["HOLD_USER_INFO"].search(tr_b)
                doc["hold_lock_thread_id"] = m.group(1)
                doc["hold_lock_query_id"] = m.group(2)
                doc["hold_lock_usr"] = m.group(3)
                doc["hold_lock_ip"] = m.group(4)

                m = self._REGEX4REFINE["HOLD_LOCK_INFO"].search(tr_b)
                doc["hold_lock_db"] = m.group(1)
                doc["hold_lock_tb"] = m.group(2)
                doc["hold_lock_trx_id"] = m.group(3)
                doc["hold_lock_trx_mode"] = m.group(4)

                while self._END_DEADLOCK not in lines[i]:
                    i += 1
                m = self._REGEX4REFINE["ROLLBACK_TR"].match(lines[i])
                rollback = ""
                if m.group(1) == "1": rollback = "a"
                else: rollback = "b"
                doc["rollback"] = rollback
            else:
                print("Parse Error at", i)
                doc["type"] = "Other"
                doc["message"] = line

            i += 1

        if doc:
            print("%s : Write last data that length is %s (%s)" %
                  (str(datetime.now()), len(self._data), len(doc)))
            self.appendDoc2Data(doc, flush=True)

        print("Written Errorlogs : %s" % str(self._num_of_total_doc))
Beispiel #39
0
        }
    })
    docs.append({"data": data, "d_type": "Analysis"})

for file in os.listdir("docs/Contract Summary"):
    f = open("docs/Contract Summary/" + file, "rt").read()
    data = base64.b64encode(f).decode('ascii')
    docs.append({
        "index": {
            "_index": "docs",
            "_type": "doc",
            "pipeline": "attachment"
        }
    })
    docs.append({"data": data, "d_type": "Summary"})

for file in os.listdir("docs/Mineral Correspondence"):
    f = open("docs/Mineral Correspondence/" + file, "rt").read()
    docs.append({
        "index": {
            "_index": "docs",
            "_type": "doc",
            "pipeline": "attachment"
        }
    })
    docs.append({"data": data, "d_type": "Correspondence"})

resp = es.bulk(docs)

if (resp["errors"] == False):
    print("Ingest successfully completed:", len(resp["items"]), "docs")
Beispiel #40
0
#!/usr/bin/env python3

import sys
from elasticsearch import Elasticsearch

es_host = sys.argv[1]
es_port = sys.argv[2]
index_name = sys.argv[3]
json_file = sys.argv[4]

es = Elasticsearch([{'host': es_host, 'port': es_port}])

with open(json_file) as f:
    json_content = f.read()
# Send the data into es
es.bulk(index=index_name, ignore=400, body=json_content)
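
# Note: es.bulk() expects the body in newline-delimited bulk format, i.e. an
# action line followed (for index/create/update actions) by a source line:
#   {"index": {"_id": "1"}}
#   {"field": "value"}
# The file passed on the command line is assumed to already be in that format.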
Beispiel #41
0
class ElasticsearchDataStore(object):
    """Implements the datastore."""

    # Number of events to queue up when bulk inserting events.
    DEFAULT_FLUSH_INTERVAL = 1000
    DEFAULT_SIZE = 100
    DEFAULT_LIMIT = DEFAULT_SIZE  # Max events to return
    DEFAULT_FROM = 0
    DEFAULT_STREAM_LIMIT = 5000  # Max events to return when streaming results

    DEFAULT_FLUSH_RETRY_LIMIT = 3  # Max retries for flushing the queue.
    DEFAULT_EVENT_IMPORT_TIMEOUT = '3m'  # Timeout value for importing events.

    def __init__(self, host='127.0.0.1', port=9200):
        """Create a Elasticsearch client."""
        super().__init__()
        self._error_container = {}

        self.user = current_app.config.get('ELASTIC_USER', 'user')
        self.password = current_app.config.get('ELASTIC_PASSWORD', 'pass')
        self.ssl = current_app.config.get('ELASTIC_SSL', False)
        self.verify = current_app.config.get('ELASTIC_VERIFY_CERTS', True)

        parameters = {}
        if self.ssl:
            parameters['use_ssl'] = self.ssl
            parameters['verify_certs'] = self.verify

        if self.user and self.password:
            parameters['http_auth'] = (self.user, self.password)

        self.client = Elasticsearch([{
            'host': host,
            'port': port
        }], **parameters)

        self.import_counter = Counter()
        self.import_events = []
        self._request_timeout = current_app.config.get(
            'TIMEOUT_FOR_EVENT_IMPORT', self.DEFAULT_EVENT_IMPORT_TIMEOUT)

    @staticmethod
    def _build_labels_query(sketch_id, labels):
        """Build Elasticsearch query for Timesketch labels.

        Args:
            sketch_id: Integer of sketch primary key.
            labels: List of label names.

        Returns:
            Elasticsearch query as a dictionary.
        """
        label_query = {'bool': {'must': []}}

        for label in labels:
            # Increase metrics counter per label
            METRICS['search_filter_label'].labels(label=label).inc()
            nested_query = {
                'nested': {
                    'query': {
                        'bool': {
                            'must': [{
                                'term': {
                                    'timesketch_label.name.keyword': label
                                }
                            }, {
                                'term': {
                                    'timesketch_label.sketch_id': sketch_id
                                }
                            }]
                        }
                    },
                    'path': 'timesketch_label'
                }
            }
            label_query['bool']['must'].append(nested_query)
        return label_query

    @staticmethod
    def _build_events_query(events):
        """Build Elasticsearch query for one or more document ids.

        Args:
            events: List of Elasticsearch document IDs.

        Returns:
            Elasticsearch query as a dictionary.
        """
        events_list = [event['event_id'] for event in events]
        query_dict = {'query': {'ids': {'values': events_list}}}
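        # For example, events = [{'event_id': 'a1'}, {'event_id': 'b2'}]
        # (hypothetical IDs) yields {'query': {'ids': {'values': ['a1', 'b2']}}}.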
        return query_dict

    @staticmethod
    def _build_query_dsl(query_dsl, timeline_ids):
        """Build Elastic Search DSL query by adding in timeline filtering.

        Args:
            query_dsl: A dict with the current query_dsl
            timeline_ids: Either a list of timeline IDs (int) or None.

        Returns:
            Elasticsearch query DSL as a dictionary.
        """
        # Remove any aggregation coming from user supplied Query DSL.
        # We have no way to display this data in a good way today.
        if query_dsl.get('aggregations', None):
            del query_dsl['aggregations']

        if not timeline_ids:
            return query_dsl

        if not isinstance(timeline_ids, (list, tuple)):
            es_logger.error(
                'Attempting to pass in timelines to a query DSL, but the '
                'passed timelines are not a list.')
            return query_dsl

        if not all([isinstance(x, int) for x in timeline_ids]):
            es_logger.error('All timeline IDs need to be an integer.')
            return query_dsl

        old_query = query_dsl.get('query')
        if not old_query:
            return query_dsl

        query_dsl['query'] = {
            'bool': {
                'must': [],
                'should': [{
                    'bool': {
                        'must': old_query,
                        'must_not': [{
                            'exists': {
                                'field': '__ts_timeline_id'
                            },
                        }],
                    }
                }, {
                    'bool': {
                        'must': [{
                            'terms': {
                                '__ts_timeline_id': timeline_ids
                            }
                        }, old_query],
                        'must_not': [],
                        'filter': [{
                            'exists': {
                                'field': '__ts_timeline_id'
                            }
                        }]
                    }
                }],
                'must_not': [],
                'filter': []
            }
        }
        return query_dsl

    @staticmethod
    def _convert_to_time_range(interval):
        """Convert an interval timestamp into start and end dates.

        Args:
            interval: Time frame representation

        Returns:
            Start timestamp in string format.
            End timestamp in string format.
        """
        # return ('2018-12-05T00:00:00', '2018-12-05T23:59:59')
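        # e.g. an interval of "2018-12-05T12:00:00 -5m +5m" yields
        # ('2018-12-05T11:55:00', '2018-12-05T12:05:00').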
        TS_FORMAT = '%Y-%m-%dT%H:%M:%S'
        get_digits = lambda s: int(''.join(filter(str.isdigit, s)))
        get_alpha = lambda s: ''.join(filter(str.isalpha, s))

        ts_parts = interval.split(' ')
        # The start date could be 1 or 2 first items
        start = ' '.join(ts_parts[0:len(ts_parts) - 2])
        minus = get_digits(ts_parts[-2])
        plus = get_digits(ts_parts[-1])
        interval = get_alpha(ts_parts[-1])

        start_ts = parser.parse(start)

        rd = relativedelta.relativedelta
        if interval == 's':
            start_range = start_ts - rd(seconds=minus)
            end_range = start_ts + rd(seconds=plus)
        elif interval == 'm':
            start_range = start_ts - rd(minutes=minus)
            end_range = start_ts + rd(minutes=plus)
        elif interval == 'h':
            start_range = start_ts - rd(hours=minus)
            end_range = start_ts + rd(hours=plus)
        elif interval == 'd':
            start_range = start_ts - rd(days=minus)
            end_range = start_ts + rd(days=plus)
        else:
            raise RuntimeError('Unable to parse the timestamp: ' +
                               str(interval))

        return start_range.strftime(TS_FORMAT), end_range.strftime(TS_FORMAT)

    def build_query(self,
                    sketch_id,
                    query_string,
                    query_filter,
                    query_dsl=None,
                    aggregations=None,
                    timeline_ids=None):
        """Build Elasticsearch DSL query.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            aggregations: Dict of Elasticsearch aggregations
            timeline_ids: Optional list of IDs of Timeline objects that should
                be queried as part of the search.

        Returns:
            Elasticsearch DSL query as a dictionary
        """

        if query_dsl:
            if not isinstance(query_dsl, dict):
                query_dsl = json.loads(query_dsl)

            if not query_dsl:
                query_dsl = {}

            return self._build_query_dsl(query_dsl, timeline_ids)

        if query_filter.get('events', None):
            events = query_filter['events']
            return self._build_events_query(events)

        query_dsl = {
            'query': {
                'bool': {
                    'must': [],
                    'must_not': [],
                    'filter': []
                }
            }
        }

        if query_string:
            query_dsl['query']['bool']['must'].append(
                {'query_string': {
                    'query': query_string
                }})

        # New UI filters
        if query_filter.get('chips', None):
            labels = []
            must_filters = query_dsl['query']['bool']['must']
            must_not_filters = query_dsl['query']['bool']['must_not']
            datetime_ranges = {
                'bool': {
                    'should': [],
                    'minimum_should_match': 1
                }
            }

            for chip in query_filter['chips']:
                # Exclude chips that the user disabled
                if not chip.get('active', True):
                    continue

                # Increase metrics per chip type
                METRICS['search_filter_type'].labels(type=chip['type']).inc()
                if chip['type'] == 'label':
                    labels.append(chip['value'])

                elif chip['type'] == 'term':
                    term_filter = {
                        'match_phrase': {
                            '{}'.format(chip['field']): {
                                'query': "{}".format(chip['value'])
                            }
                        }
                    }

                    if chip['operator'] == 'must':
                        must_filters.append(term_filter)

                    elif chip['operator'] == 'must_not':
                        must_not_filters.append(term_filter)

                elif chip['type'].startswith('datetime'):
                    range_filter = lambda start, end: {
                        'range': {
                            'datetime': {
                                'gte': start,
                                'lte': end
                            }
                        }
                    }
                    if chip['type'] == 'datetime_range':
                        start, end = chip['value'].split(',')
                    elif chip['type'] == 'datetime_interval':
                        start, end = self._convert_to_time_range(chip['value'])
                    else:
                        continue
                    datetime_ranges['bool']['should'].append(
                        range_filter(start, end))

            label_filter = self._build_labels_query(sketch_id, labels)
            must_filters.append(label_filter)
            must_filters.append(datetime_ranges)

        # Pagination
        if query_filter.get('from', None):
            query_dsl['from'] = query_filter['from']

        # Number of events to return
        if query_filter.get('size', None):
            query_dsl['size'] = query_filter['size']

        # Make sure we are sorting.
        if not query_dsl.get('sort', None):
            query_dsl['sort'] = {'datetime': query_filter.get('order', 'asc')}

        # Add any pre defined aggregations
        if aggregations:
            # post_filter happens after aggregation so we need to move the
            # filter to the query instead.
            if query_dsl.get('post_filter', None):
                query_dsl['query']['bool']['filter'] = query_dsl['post_filter']
                query_dsl.pop('post_filter', None)
            query_dsl['aggregations'] = aggregations

        # TODO: Simplify this when we don't have to support both timelines
        # that have __ts_timeline_id set and those that don't.
        # (query_string AND timeline_id NOT EXISTS) OR (
        #       query_string AND timeline_id in LIST)
        if timeline_ids and isinstance(timeline_ids, (list, tuple)):
            must_filters_pre = copy.copy(query_dsl['query']['bool']['must'])
            must_not_filters_pre = copy.copy(
                query_dsl['query']['bool']['must_not'])

            must_filters_post = copy.copy(query_dsl['query']['bool']['must'])
            must_not_filters_post = copy.copy(
                query_dsl['query']['bool']['must_not'])

            must_not_filters_pre.append({
                'exists': {
                    'field': '__ts_timeline_id'
                },
            })

            must_filters_post.append(
                {'terms': {
                    '__ts_timeline_id': timeline_ids
                }})

            query_dsl['query'] = {
                'bool': {
                    'must': [],
                    'should': [{
                        'bool': {
                            'must': must_filters_pre,
                            'must_not': must_not_filters_pre,
                        }
                    }, {
                        'bool': {
                            'must': must_filters_post,
                            'must_not': must_not_filters_post,
                            'filter': [{
                                'exists': {
                                    'field': '__ts_timeline_id'
                                }
                            }]
                        }
                    }],
                    'must_not': [],
                    'filter': []
                }
            }

        return query_dsl

    # pylint: disable=too-many-arguments
    def search(self,
               sketch_id,
               query_string,
               query_filter,
               query_dsl,
               indices,
               count=False,
               aggregations=None,
               return_fields=None,
               enable_scroll=False,
               timeline_ids=None):
        """Search ElasticSearch. This will take a query string from the UI
        together with a filter definition. Based on this it will execute the
        search request on ElasticSearch and get result back.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            indices: List of indices to query
            count: Boolean indicating if we should only return result count
            aggregations: Dict of Elasticsearch aggregations
            return_fields: List of fields to return
            enable_scroll: If Elasticsearch scroll API should be used
            timeline_ids: Optional list of IDs of Timeline objects that should
                be queried as part of the search.

        Returns:
            Set of event documents in JSON format
        """
        scroll_timeout = None
        if enable_scroll:
            scroll_timeout = '1m'  # Default to 1 minute scroll timeout

        # Exit early if we have no indices to query
        if not indices:
            return {'hits': {'hits': [], 'total': 0}, 'took': 0}

        # Make sure that the list of index names is unique.
        indices = list(set(indices))

        # Check if we have specific events to fetch and get indices.
        if query_filter.get('events', None):
            indices = {
                event['index']
                for event in query_filter['events']
                if event['index'] in indices
            }

        query_dsl = self.build_query(sketch_id=sketch_id,
                                     query_string=query_string,
                                     query_filter=query_filter,
                                     query_dsl=query_dsl,
                                     aggregations=aggregations,
                                     timeline_ids=timeline_ids)

        # Default search type for elasticsearch is query_then_fetch.
        search_type = 'query_then_fetch'

        # Only return how many documents matches the query.
        if count:
            if 'sort' in query_dsl:
                del query_dsl['sort']
            try:
                count_result = self.client.count(body=query_dsl,
                                                 index=list(indices))
            except NotFoundError:
                es_logger.error(
                    'Unable to count due to an index not found: {0:s}'.format(
                        ','.join(indices)))
                return 0
            METRICS['search_requests'].labels(type='count').inc()
            return count_result.get('count', 0)

        if not return_fields:
            # Suppress the lint error because elasticsearch-py adds parameters
            # to the function with a decorator and this makes pylint sad.
            # pylint: disable=unexpected-keyword-arg
            return self.client.search(body=query_dsl,
                                      index=list(indices),
                                      search_type=search_type,
                                      scroll=scroll_timeout)

        # The argument " _source_include" changed to "_source_includes" in
        # ES version 7. This check add support for both version 6 and 7 clients.
        # pylint: disable=unexpected-keyword-arg
        try:
            if self.version.startswith('6'):
                _search_result = self.client.search(
                    body=query_dsl,
                    index=list(indices),
                    search_type=search_type,
                    _source_include=return_fields,
                    scroll=scroll_timeout)
            else:
                _search_result = self.client.search(
                    body=query_dsl,
                    index=list(indices),
                    search_type=search_type,
                    _source_includes=return_fields,
                    scroll=scroll_timeout)
        except RequestError as e:
            root_cause = e.info.get('error', {}).get('root_cause')
            if root_cause:
                error_items = []
                for cause in root_cause:
                    error_items.append('[{0:s}] {1:s}'.format(
                        cause.get('type', ''), cause.get('reason', '')))
                cause = ', '.join(error_items)
            else:
                cause = str(e)

            es_logger.error('Unable to run search query: {0:s}'.format(cause),
                            exc_info=True)
            raise ValueError(cause) from e

        METRICS['search_requests'].labels(type='single').inc()
        return _search_result

    # pylint: disable=too-many-arguments
    def search_stream(self,
                      sketch_id=None,
                      query_string=None,
                      query_filter=None,
                      query_dsl=None,
                      indices=None,
                      return_fields=None,
                      enable_scroll=True,
                      timeline_ids=None):
        """Search ElasticSearch. This will take a query string from the UI
        together with a filter definition. Based on this it will execute the
        search request on ElasticSearch and get result back.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            indices: List of indices to query
            return_fields: List of fields to return
            enable_scroll: Boolean determining whether scrolling is enabled.
            timeline_ids: Optional list of IDs of Timeline objects that should
                be queried as part of the search.

        Returns:
            Generator of event documents in JSON format
        """
        # Make sure that the list of index names is unique.
        indices = list(set(indices))

        METRICS['search_requests'].labels(type='stream').inc()

        if not query_filter.get('size'):
            query_filter['size'] = self.DEFAULT_STREAM_LIMIT

        if not query_filter.get('terminate_after'):
            query_filter['terminate_after'] = self.DEFAULT_STREAM_LIMIT

        result = self.search(sketch_id=sketch_id,
                             query_string=query_string,
                             query_dsl=query_dsl,
                             query_filter=query_filter,
                             indices=indices,
                             return_fields=return_fields,
                             enable_scroll=enable_scroll,
                             timeline_ids=timeline_ids)

        if enable_scroll:
            scroll_id = result['_scroll_id']
            scroll_size = result['hits']['total']
        else:
            scroll_id = None
            scroll_size = 0

        # Elasticsearch version 7.x returns total hits as a dictionary.
        # TODO: Refactor when version 6.x has been deprecated.
        if isinstance(scroll_size, dict):
            scroll_size = scroll_size.get('value', 0)

        for event in result['hits']['hits']:
            yield event

        while scroll_size > 0:
            # pylint: disable=unexpected-keyword-arg
            result = self.client.scroll(scroll_id=scroll_id, scroll='5m')
            scroll_id = result['_scroll_id']
            scroll_size = len(result['hits']['hits'])
            for event in result['hits']['hits']:
                yield event

    def get_filter_labels(self, sketch_id, indices):
        """Aggregate labels for a sketch.

        Args:
            sketch_id: The Sketch ID
            indices: List of indices to aggregate on

        Returns:
            List with label names.
        """
        # This is a workaround to return all labels by setting the max buckets
        # to something big. If a sketch has more than this amount of labels
        # the list will be incomplete but it should be uncommon to have >10k
        # labels in a sketch.
        max_labels = 10000

        # pylint: disable=line-too-long
        aggregation = {
            'aggs': {
                'nested': {
                    'nested': {
                        'path': 'timesketch_label'
                    },
                    'aggs': {
                        'inner': {
                            'filter': {
                                'bool': {
                                    'must': [{
                                        'term': {
                                            'timesketch_label.sketch_id':
                                            sketch_id
                                        }
                                    }]
                                }
                            },
                            'aggs': {
                                'labels': {
                                    'terms': {
                                        'size': max_labels,
                                        'field':
                                        'timesketch_label.name.keyword'
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }

        # Make sure that the list of index names is unique.
        indices = list(set(indices))

        labels = []
        # pylint: disable=unexpected-keyword-arg
        try:
            result = self.client.search(index=indices,
                                        body=aggregation,
                                        size=0)
        except NotFoundError:
            es_logger.error('Unable to find the index/indices: {0:s}'.format(
                ','.join(indices)))
            return labels

        buckets = result.get('aggregations',
                             {}).get('nested',
                                     {}).get('inner',
                                             {}).get('labels',
                                                     {}).get('buckets', [])
        for bucket in buckets:
            # Filter out special labels like __ts_star etc.
            if bucket['key'].startswith('__'):
                continue
            labels.append(bucket['key'])
        return labels

    # pylint: disable=inconsistent-return-statements
    def get_event(self, searchindex_id, event_id):
        """Get one event from the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id

        Returns:
            Event document in JSON format
        """
        METRICS['search_get_event'].inc()
        try:
            # Suppress the lint error because elasticsearch-py adds parameters
            # to the function with a decorator and this makes pylint sad.
            # pylint: disable=unexpected-keyword-arg
            if self.version.startswith('6'):
                event = self.client.get(index=searchindex_id,
                                        id=event_id,
                                        doc_type='_all',
                                        _source_exclude=['timesketch_label'])
            else:
                event = self.client.get(index=searchindex_id,
                                        id=event_id,
                                        doc_type='_all',
                                        _source_excludes=['timesketch_label'])

            return event

        except NotFoundError:
            abort(HTTP_STATUS_CODE_NOT_FOUND)

    def count(self, indices):
        """Count number of documents.

        Args:
            indices: List of indices.

        Returns:
            Tuple containing number of documents and size on disk.
        """
        if not indices:
            return 0, 0

        # Make sure that the list of index names is unique.
        indices = list(set(indices))

        try:
            es_stats = self.client.indices.stats(index=indices,
                                                 metric='docs, store')

        except NotFoundError:
            es_logger.error('Unable to count indices (index not found)')
            return 0, 0

        except RequestError:
            es_logger.error('Unable to count indices (request error)',
                            exc_info=True)
            return 0, 0

        doc_count_total = es_stats.get('_all',
                                       {}).get('primaries',
                                               {}).get('docs',
                                                       {}).get('count', 0)
        doc_bytes_total = es_stats.get('_all', {}).get('primaries', {}).get(
            'store', {}).get('size_in_bytes', 0)

        return doc_count_total, doc_bytes_total

    def set_label(self,
                  searchindex_id,
                  event_id,
                  event_type,
                  sketch_id,
                  user_id,
                  label,
                  toggle=False,
                  remove=False,
                  single_update=True):
        """Set label on event in the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id
            event_type: String of ElasticSearch document type
            sketch_id: Integer of sketch primary key
            user_id: Integer of user primary key
            label: String with the name of the label
            remove: Optional boolean value if the label should be removed
            toggle: Optional boolean value if the label should be toggled
            single_update: Boolean if the label should be indexed immediately.

        Returns:
            Dict with updated document body, or None if this is a single update.
        """
        # Elasticsearch painless script.
        update_body = {
            'script': {
                'lang': 'painless',
                'source': UPDATE_LABEL_SCRIPT,
                'params': {
                    'timesketch_label': {
                        'name': str(label),
                        'user_id': user_id,
                        'sketch_id': sketch_id
                    },
                    'remove': remove
                }
            }
        }

        if toggle:
            update_body['script']['source'] = TOGGLE_LABEL_SCRIPT

        if not single_update:
            script = update_body['script']
            return dict(source=script['source'],
                        lang=script['lang'],
                        params=script['params'])

        doc = self.client.get(index=searchindex_id,
                              id=event_id,
                              doc_type='_all')
        try:
            doc['_source']['timesketch_label']
        except KeyError:
            doc = {'doc': {'timesketch_label': []}}
            self.client.update(index=searchindex_id,
                               doc_type=event_type,
                               id=event_id,
                               body=doc)

        self.client.update(index=searchindex_id,
                           id=event_id,
                           doc_type=event_type,
                           body=update_body)

        return None

    def create_index(self,
                     index_name=uuid4().hex,
                     doc_type='generic_event',
                     mappings=None):
        """Create index with Timesketch settings.

        Args:
            index_name: Name of the index. Default is a generated UUID.
            doc_type: Name of the document type. Default is generic_event.
            mappings: Optional dict with the document mapping for Elastic.

        Returns:
            Index name in string format.
            Document type in string format.
        """
        if mappings:
            _document_mapping = mappings
        else:
            _document_mapping = {
                'properties': {
                    'timesketch_label': {
                        'type': 'nested'
                    },
                    'datetime': {
                        'type': 'date'
                    }
                }
            }

        # TODO: Remove when we deprecate Elasticsearch version 6.x
        if self.version.startswith('6'):
            _document_mapping = {doc_type: _document_mapping}

        if not self.client.indices.exists(index_name):
            try:
                self.client.indices.create(
                    index=index_name, body={'mappings': _document_mapping})
            except ConnectionError as e:
                raise RuntimeError(
                    'Unable to connect to Timesketch backend.') from e
            except RequestError:
                index_exists = self.client.indices.exists(index_name)
                es_logger.warning(
                    'Attempting to create an index that already exists '
                    '({0:s} - {1:s})'.format(index_name, str(index_exists)))

        return index_name, doc_type

    def delete_index(self, index_name):
        """Delete Elasticsearch index.

        Args:
            index_name: Name of the index to delete.
        """
        if self.client.indices.exists(index_name):
            try:
                self.client.indices.delete(index=index_name)
            except ConnectionError as e:
                raise RuntimeError(
                    'Unable to connect to Timesketch backend: {}'.format(
                        e)) from e

    def import_event(self,
                     index_name,
                     event_type,
                     event=None,
                     event_id=None,
                     flush_interval=DEFAULT_FLUSH_INTERVAL,
                     timeline_id=None):
        """Add event to Elasticsearch.

        Args:
            index_name: Name of the index in Elasticsearch
            event_type: Type of event (e.g. plaso_event)
            event: Event dictionary
            event_id: Event Elasticsearch ID
            flush_interval: Number of events to queue up before indexing
            timeline_id: Optional ID number of a Timeline object this event
                belongs to. If supplied an additional field will be added to
                the store indicating the timeline this belongs to.
        """
        if event:
            for k, v in event.items():
                if not isinstance(k, six.text_type):
                    k = codecs.decode(k, 'utf8')

                # Make sure we have decoded strings in the event dict.
                if isinstance(v, six.binary_type):
                    v = codecs.decode(v, 'utf8')

                event[k] = v

            # Header needed by Elasticsearch when bulk inserting.
            header = {
                'index': {
                    '_index': index_name,
                }
            }
            update_header = {'update': {'_index': index_name, '_id': event_id}}

            # TODO: Remove when we deprecate Elasticsearch version 6.x
            if self.version.startswith('6'):
                header['index']['_type'] = event_type
                update_header['update']['_type'] = event_type

            if event_id:
                # Event has "lang" defined if there is a script used for import.
                if event.get('lang'):
                    event = {'script': event}
                else:
                    event = {'doc': event}
                header = update_header

            if timeline_id:
                event['__ts_timeline_id'] = timeline_id

            self.import_events.append(header)
            self.import_events.append(event)
            self.import_counter['events'] += 1

            if self.import_counter['events'] % int(flush_interval) == 0:
                _ = self.flush_queued_events()
                self.import_events = []
        else:
            # Import the remaining events in the queue.
            if self.import_events:
                _ = self.flush_queued_events()

        return self.import_counter['events']

    def flush_queued_events(self, retry_count=0):
        """Flush all queued events.

        Args:
            retry_count: Optional integer indicating which retry attempt
                this is; used internally when a flush times out.

        Returns:
            dict: A dict that contains the number of events that were
                sent to Elastic, whether there were any errors, and the
                details of those errors, if any.
        """
        if not self.import_events:
            return {}

        return_dict = {
            'number_of_events': len(self.import_events) // 2,
            'total_events': self.import_counter['events'],
        }

        try:
            # pylint: disable=unexpected-keyword-arg
            results = self.client.bulk(body=self.import_events,
                                       timeout=self._request_timeout)
        except (ConnectionTimeout, socket.timeout):
            if retry_count >= self.DEFAULT_FLUSH_RETRY_LIMIT:
                es_logger.error('Unable to add events, retry limit reached.',
                                exc_info=True)
                return {}

            es_logger.error('Unable to add events (retry {0:d}/{1:d})'.format(
                retry_count, self.DEFAULT_FLUSH_RETRY_LIMIT))
            return self.flush_queued_events(retry_count + 1)

        errors_in_upload = results.get('errors', False)
        return_dict['errors_in_upload'] = errors_in_upload

        if errors_in_upload:
            items = results.get('items', [])
            return_dict['errors'] = []

            es_logger.error('Errors while attempting to upload events.')
            for item in items:
                index = item.get('index', {})
                index_name = index.get('_index', 'N/A')

                _ = self._error_container.setdefault(index_name, {
                    'errors': [],
                    'types': Counter(),
                    'details': Counter()
                })

                error_counter = self._error_container[index_name]['types']
                error_detail_counter = self._error_container[index_name][
                    'details']
                error_list = self._error_container[index_name]['errors']

                error = index.get('error', {})
                status_code = index.get('status', 0)
                doc_id = index.get('_id', '(unable to get doc id)')
                caused_by = error.get('caused_by', {})

                caused_reason = caused_by.get('reason',
                                              'Unknown Detailed Reason')

                error_counter[error.get('type')] += 1
                detail_msg = '{0:s}/{1:s}'.format(
                    caused_by.get('type', 'Unknown Detailed Type'),
                    ' '.join(caused_reason.split()[:5]))
                error_detail_counter[detail_msg] += 1

                error_msg = '<{0:s}> {1:s} [{2:s}/{3:s}]'.format(
                    error.get('type', 'Unknown Type'),
                    error.get('reason', 'No reason given'),
                    caused_by.get('type', 'Unknown Type'),
                    caused_reason,
                )
                error_list.append(error_msg)
                try:
                    es_logger.error(
                        'Unable to upload document: {0:s} to index {1:s} - '
                        '[{2:d}] {3:s}'.format(doc_id, index_name, status_code,
                                               error_msg))
                # We need to catch all exceptions here, since this is a crucial
                # call that we do not want to break operation.
                except Exception:  # pylint: disable=broad-except
                    es_logger.error(
                        'Unable to upload document, and unable to log the '
                        'error itself.',
                        exc_info=True)

        return_dict['error_container'] = self._error_container

        self.import_events = []
        return return_dict

    @property
    def version(self):
        """Get Elasticsearch version.

        Returns:
          Version number as a string.
        """
        version_info = self.client.info().get('version')
        return version_info.get('number')
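
# Typical usage sketch (an assumption drawn from the methods above; the class
# needs a Flask application context because __init__ reads current_app.config):
#
#     datastore = ElasticsearchDataStore(host='127.0.0.1', port=9200)
#     index_name, _ = datastore.create_index()
#     for event in events:  # `events` is a hypothetical iterable of dicts
#         datastore.import_event(index_name, 'generic_event', event=event)
#     datastore.import_event(index_name, 'generic_event')  # flush the remainder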
Beispiel #42
0
class ElasticsearchDataStore(datastore.DataStore):
    """Implements the datastore."""
    def __init__(self, host=u'127.0.0.1', port=9200):
        """Create a Elasticsearch client."""
        super(ElasticsearchDataStore, self).__init__()
        self.client = Elasticsearch([{u'host': host, u'port': port}])
        self.import_counter = Counter()
        self.import_events = []

    @staticmethod
    def _build_label_query(sketch_id, label_name):
        """Build Elasticsearch query for Timesketch labels.

        Args:
            sketch_id: Integer of sketch primary key.
            label_name: Name of the label to search for.

        Returns:
            Elasticsearch query as a dictionary.
        """
        query_dict = {
            u'query': {
                u'nested': {
                    u'query': {
                        u'bool': {
                            u'must': [{
                                u'term': {
                                    u'timesketch_label.name': label_name
                                }
                            }, {
                                u'term': {
                                    u'timesketch_label.sketch_id': sketch_id
                                }
                            }]
                        }
                    },
                    u'path': u'timesketch_label'
                }
            }
        }
        return query_dict

    @staticmethod
    def _build_events_query(events):
        """Build Elasticsearch query for one or more document ids.

        Args:
            events: List of Elasticsearch document IDs.

        Returns:
            Elasticsearch query as a dictionary.
        """
        events_list = [event[u'event_id'] for event in events]
        query_dict = {u'query': {u'ids': {u'values': events_list}}}
        return query_dict

    @staticmethod
    def _build_field_aggregator(field_name):
        """Build Elasticsearch query for aggregation based on field.

        Args:
            field_name: Field to aggregate.

        Returns:
            Elasticsearch aggregation as a dictionary.
        """
        field_aggregation = {
            u'field_aggregation': {
                u'terms': {
                    u'field': u'{0:s}.keyword'.format(field_name)
                }
            }
        }
        return field_aggregation

    def build_query(self,
                    sketch_id,
                    query_string,
                    query_filter,
                    query_dsl,
                    aggregations=None):
        """Build Elasticsearch DSL query.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            aggregations: Dict of Elasticsearch aggregations

        Returns:
            Elasticsearch DSL query as a dictionary
        """
        if not query_dsl:
            if query_filter.get(u'star', None):
                query_dsl = self._build_label_query(sketch_id, u'__ts_star')

            if query_filter.get(u'events', None):
                events = query_filter[u'events']
                query_dsl = self._build_events_query(events)

            if not query_dsl:
                query_dsl = {
                    u'query': {
                        u'bool': {
                            u'must': [{
                                u'query_string': {
                                    u'query': query_string
                                }
                            }]
                        }
                    }
                }
            if query_filter.get(u'time_start', None):
                # TODO(jberggren): Add support for multiple time ranges.
                query_dsl[u'query'][u'bool'][u'filter'] = {
                    u'bool': {
                        u'should': [{
                            u'range': {
                                u'datetime': {
                                    u'gte': query_filter[u'time_start'],
                                    u'lte': query_filter[u'time_end']
                                }
                            }
                        }]
                    }
                }
            if query_filter.get(u'exclude', None):
                query_dsl[u'post_filter'] = {
                    u'bool': {
                        u'must_not': {
                            u'terms': {
                                u'data_type': query_filter[u'exclude']
                            }
                        }
                    }
                }
        else:
            query_dsl = json.loads(query_dsl)

        # Make sure we are sorting.
        if not query_dsl.get(u'sort', None):
            query_dsl[u'sort'] = {
                u'datetime': query_filter.get(u'order', u'asc')
            }

        # Remove any aggregation coming from user supplied Query DSL. We have
        # no way to display this data in a good way today.
        # TODO: Revisit this and figure out if we can display the data.
        if query_dsl.get(u'aggregations', None):
            del query_dsl[u'aggregations']

        # Add any pre defined aggregations
        if aggregations:
            # post_filter happens after aggregation so we need to move the
            # filter to the query instead.
            if query_dsl.get(u'post_filter', None):
                query_dsl[u'query'][u'bool'][u'filter'] = query_dsl[
                    u'post_filter']
                query_dsl.pop(u'post_filter', None)
            query_dsl[u'aggregations'] = aggregations
        return query_dsl

    def search(self,
               sketch_id,
               query_string,
               query_filter,
               query_dsl,
               indices,
               aggregations=None,
               return_results=True,
               return_fields=None,
               enable_scroll=False):
        """Search ElasticSearch. This will take a query string from the UI
        together with a filter definition. Based on this it will execute the
        search request on ElasticSearch and get result back.

        Args:
            sketch_id: Integer of sketch primary key
            query_string: Query string
            query_filter: Dictionary containing filters to apply
            query_dsl: Dictionary containing Elasticsearch DSL query
            indices: List of indices to query
            aggregations: Dict of Elasticsearch aggregations
            return_results: Boolean indicating if results should be returned
            return_fields: List of fields to return
            enable_scroll: If Elasticsearch scroll API should be used

        Returns:
            Set of event documents in JSON format
        """
        # Limit the number of returned documents.
        DEFAULT_LIMIT = 500  # Maximum events to return
        LIMIT_RESULTS = query_filter.get(u'limit', DEFAULT_LIMIT)

        scroll_timeout = None
        if enable_scroll:
            scroll_timeout = u'1m'  # Default to 1 minute scroll timeout

        # Use default fields if none is provided
        default_fields = [
            u'datetime', u'timestamp', u'message', u'timestamp_desc',
            u'timesketch_label', u'tag'
        ]
        if not return_fields:
            return_fields = default_fields

        # Exit early if we have no indices to query
        if not indices:
            return {u'hits': {u'hits': [], u'total': 0}, u'took': 0}

        # Check if we have specific events to fetch and get indices.
        if query_filter.get(u'events', None):
            indices = {
                event[u'index']
                for event in query_filter[u'events']
                if event[u'index'] in indices
            }

        query_dsl = self.build_query(sketch_id, query_string, query_filter,
                                     query_dsl, aggregations)

        # Default search type for elasticsearch is query_then_fetch.
        search_type = u'query_then_fetch'

        # Set limit to 0 to not return any results
        if not return_results:
            LIMIT_RESULTS = 0

        # Suppress the lint error because elasticsearch-py adds parameters
        # to the function with a decorator and this makes pylint sad.
        # pylint: disable=unexpected-keyword-arg
        return self.client.search(body=query_dsl,
                                  index=list(indices),
                                  size=LIMIT_RESULTS,
                                  search_type=search_type,
                                  _source_include=return_fields,
                                  scroll=scroll_timeout)

    def get_event(self, searchindex_id, event_id):
        """Get one event from the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id

        Returns:
            Event document in JSON format
        """
        try:
            # Suppress the lint error because elasticsearch-py adds parameters
            # to the function with a decorator and this makes pylint sad.
            # pylint: disable=unexpected-keyword-arg
            return self.client.get(index=searchindex_id,
                                   id=event_id,
                                   _source_exclude=[u'timesketch_label'])
        except NotFoundError:
            abort(HTTP_STATUS_CODE_NOT_FOUND)

    def count(self, indices):
        """Count number of documents.

        Args:
            indices: List of indices.

        Returns:
            Number of documents.
        """
        if not indices:
            return 0
        result = self.client.count(index=indices)
        return result.get(u'count', 0)

    def set_label(self,
                  searchindex_id,
                  event_id,
                  event_type,
                  sketch_id,
                  user_id,
                  label,
                  toggle=False):
        """Set label on event in the datastore.

        Args:
            searchindex_id: String of ElasticSearch index id
            event_id: String of ElasticSearch event id
            event_type: String of ElasticSearch document type
            sketch_id: Integer of sketch primary key
            user_id: Integer of user primary key
            label: String with the name of the label
            toggle: Optional boolean value if the label should be toggled
            (add/remove). The default is False.
        """
        doc = self.client.get(index=searchindex_id, id=event_id)
        try:
            doc[u'_source'][u'timesketch_label']
        except KeyError:
            # pylint: disable=redefined-variable-type
            doc = {u'doc': {u'timesketch_label': []}}
            self.client.update(index=searchindex_id,
                               doc_type=event_type,
                               id=event_id,
                               body=doc)

        # Choose the correct script.
        script_name = u'add_label'
        if toggle:
            script_name = u'toggle_label'
        script = {
            u'script': {
                u'lang': u'groovy',
                u'file': script_name,
                u'params': {
                    u'timesketch_label': {
                        u'name': str(label),
                        u'user_id': user_id,
                        u'sketch_id': sketch_id
                    }
                }
            }
        }
        self.client.update(index=searchindex_id,
                           id=event_id,
                           doc_type=event_type,
                           body=script)

    def create_index(self, index_name=uuid4().hex, doc_type=u'generic_event'):
        """Create index with Timesketch settings.

        Args:
            index_name: Name of the index. Default is a generated UUID.
            doc_type: Name of the document type. Default is generic_event.

        Returns:
            Index name in string format.
            Document type in string format.
        """
        _document_mapping = {
            doc_type: {
                u'properties': {
                    u'timesketch_label': {
                        u'type': u'nested'
                    }
                }
            }
        }

        if not self.client.indices.exists(index_name):
            try:
                self.client.indices.create(
                    index=index_name, body={u'mappings': _document_mapping})
            except ConnectionError:
                raise RuntimeError(u'Unable to connect to Timesketch backend.')
        # We want to return unicode here to keep SQLalchemy happy.
        index_name = unicode(index_name.decode(encoding=u'utf-8'))
        doc_type = unicode(doc_type.decode(encoding=u'utf-8'))
        return index_name, doc_type

    def import_event(self, flush_interval, index_name, event_type, event=None):
        """Add event to Elasticsearch.

        Args:
            flush_interval: Number of events to queue up before indexing
            index_name: Name of the index in Elasticsearch
            event_type: Type of event (e.g. plaso_event)
            event: Event dictionary
        """
        if event:
            # Make sure we have decoded strings in the event dict.
            event = {
                k.decode(u'utf8'): v.decode(u'utf8')
                for k, v in event.items()
            }

            # Header needed by Elasticsearch when bulk inserting.
            self.import_events.append(
                {u'index': {
                    u'_index': index_name,
                    u'_type': event_type
                }})
            self.import_events.append(event)
            self.import_counter[u'events'] += 1
            if self.import_counter[u'events'] % int(flush_interval) == 0:
                self.client.bulk(index=index_name,
                                 doc_type=event_type,
                                 body=self.import_events)
                self.import_events = []
        else:
            if self.import_events:
                self.client.bulk(index=index_name,
                                 doc_type=event_type,
                                 body=self.import_events)

        return self.import_counter[u'events']
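
# A minimal, self-contained sketch (not part of the original code above) of
# the bulk request body that import_event() assembles: every event document
# is preceded by an action header naming the target index and document type.
# The index name and events below are made up purely for illustration.
example_bulk_body = []
for example_event in [{u'message': u'event one'}, {u'message': u'event two'}]:
    example_bulk_body.append(
        {u'index': {u'_index': u'example_index', u'_type': u'generic_event'}})
    example_bulk_body.append(example_event)
# example_bulk_body now alternates action headers and documents and could be
# passed as body= to Elasticsearch.bulk(), exactly as import_event() does.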
Beispiel #43
0
class DBOperation:
    def __init__(self):
        self.logger = get_logger()
        self.es = Elasticsearch(ES_ADDRESS)

    def operate_data(self,
                     data,
                     operation='',
                     table_name='',
                     index='',
                     type='mysql'):
        if type == 'mysql':
            if table_name:
                if operation:
                    self._operate_from_db(operation, data, table_name)
                else:
                    self.logger.warning('Please provide a valid operation')
            else:
                self.logger.warning('Please provide the database table name')
        elif type == 'es':
            if index:
                if operation == 'update':
                    self._update_data_from_es(data, index)
                elif operation == 'get':
                    self._get_data_from_es(data, index)
                elif operation == 'delete':
                    self._delete_data_from_es(data, index)
                else:
                    self.logger.warning('Please provide a valid operation')
            else:
                self.logger.warning('Please provide the es index')
        else:
            self.logger.warning('Unsupported database type: {}, please check the input.'.format(type))

    def _operate_from_db(self, operation, data, table_name, retry=3):
        '''
        Execute a MySQL operation (insert / get / update / delete).
        :param operation: name of the operation to perform
        :param data: data to write, or the query parameters
        :param table_name: database table name
        :param retry: number of remaining reconnect retries
        :return: None (the 'get' operation returns the query result)
        '''
        # Resolve the table model class by name from the module namespace;
        # locals() inside a method would not contain module-level classes.
        db_table = globals()[table_name]
        try:
            # Create the table if it does not exist yet
            if not db_table.table_exists():
                db_table.create_table()

            # Execute the requested database operation
            if operation == 'insert':
                db_table.create(**data)
            elif operation == 'get':
                xxx = db_table.select(*data).limit(1 * 2)
                return xxx
            elif operation == 'update':
                db_table.update(**data).where(db_table.xxx == 'xxx').execute()
            elif operation == 'delete':
                db_table.delete().where(db_table.xxx == 'xxx').execute()

        except Exception as e:
            if str(e.args[0]) == '2013' or str(e.args[0]) == '2006' or str(
                    e.args[0]) == '0':
                self.logger.info("重试次数还剩:{}次".format(retry))
                if retry < 1:
                    self.logger.warning('数据库重连次数到达{}次,退出。'.format(retry))
                    return

                self.logger.error('数据库已断开:{}'.format(e))
                self.logger.error('重连数据库')
                db.close()
                db.get_conn().ping(True)
                return self._operate_from_db(operation,
                                             data,
                                             table_name,
                                             retry=retry - 1)
            else:
                self.logger.warning('Not a database reconnection error')
                raise e

    def _get_data_from_es(self, data, index, doc_type='data'):
        '''
        Run a search query against ES.
        :param data: ES query body
        :param index: index of the target data
        :param doc_type: document type, defaults to 'data'
        :return: result dict
        '''
        body = data  # ...
        return self.es.search(index, doc_type, body=body)

    def _update_data_from_es(self, data, index, doc_type='data'):
        '''
        Bulk update (upsert) documents in ES.
        :param data: list of dicts like {'id': ..., 'data': ...}
        :param index: index of the target data
        :param doc_type: document type, defaults to 'data'
        :return: None
        '''
        body = list()
        for j in data:
            if j:
                my_id = j['id']
                data = j['data']
                body.append({
                    'update': {
                        '_id': my_id,
                        '_type': doc_type,
                        '_index': index
                    }
                })
                body.append({'doc': data, 'doc_as_upsert': True})
        if body:
            self.es.bulk(body)

    def _delete_data_from_es(self, data, index, doc_type='data'):
        '''
        Delete a document from ES by id.
        :param data: dict containing the 'id' of the document to delete
        :param index: index of the target data
        :param doc_type: document type, defaults to 'data'
        :return: None
        '''
        self.es.delete(index=index, doc_type=doc_type, id=data['id'])
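
# A small, self-contained sketch (not part of the original class above) of the
# `data` shape that DBOperation.operate_data(..., operation='update', type='es')
# expects: a list of dicts with an 'id' and a 'data' payload. The loop below
# mirrors what _update_data_from_es() builds; the index name and values are
# illustrative only.
example_update_batch = [
    {'id': '1', 'data': {'status': 'done'}},
    {'id': '2', 'data': {'status': 'pending'}},
]
example_bulk_body = []
for item in example_update_batch:
    example_bulk_body.append(
        {'update': {'_id': item['id'], '_type': 'data', '_index': 'my_index'}})
    example_bulk_body.append({'doc': item['data'], 'doc_as_upsert': True})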
Beispiel #44
0
class ErrorlogSender:
  def __init__(self):
    self._ERRORLOG_PREFIX = "error/mysql-error-running.log."

    self._GENERAL_CONFIG = {
      # Elasticsearch host name
      "ES_HOST": "192.168.0.1:4040",
      
      # Elasticsearch prefix for index name
      "INDEX_PREFIX": "rds_errorlog",
      
      # Elasticsearch type name is rds instance id
      "RDS_ID": "tb-master",
      
      # Timezone to convert log timestamps to. Leave this blank to keep UTC.
      "TIMEZONE": "Asia/Seoul",

      # RDS region from which to crawl the error log.
      "AWS_RDS_REGION_ID": "ap-northeast-2",

      # If you have EC2 instances, the region and VPC containing them are needed.
      "AWS_EC2_REGION_ID": "ap-northeast-2",
      "AWS_EC2_VPC_ID": "vpc-XXxxXXxx",
      }

    self._REGEX4REFINE = {
      "QUERYTIME_REGEX": re.compile("^[a-zA-Z#:_ ]+([0-9.]+)[a-zA-Z:_ ]+([0-9.]+)[a-zA-Z:_ ]+([0-9.]+).[a-zA-Z:_ ]+([0-9.]+)$"),
      "GENERAL_ERR": re.compile("(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) (\w+) \[(\w+)\] (.*)"),
      "ABORTED_CONN": re.compile("db: '(\w+)' user: '******' host: '([\w\d\.]+)'"),
      "ACCESS_DENY": re.compile("user '(.*)'@'([\d\.]+)' "),
      "DEADLOCK": re.compile("(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) (\w+)"),
      "ROLLBACK_TR": re.compile("\*\*\* WE ROLL BACK TRANSACTION \((\d+)\)"),
      "HOLD_USER_INFO": re.compile("MySQL thread id (\w+), OS thread handle \w+, query id (\d+) ([\d\.]+) (\w+) "),
      "HOLD_LOCK_INFO": re.compile("table `(\w+)`\.`(\w+)` trx id (\d+) lock_mode (\w+)")
      }
      
    self._LOG_CONFIG = {
      "LOG_OUTPUT_DIR": "/var/log/rdslog/errorlog2es.log",
      "RAW_OUTPUT_DIR": "/var/log/rdslog/errorlog" # (Optional)
      }

    self._ABORTED_CONN_MSG = "Aborted connection"
    self._ACCESS_DENY_MSG = "Access denied"

    self._BEGIN_DEADLOCK = "deadlock detected"
    self._END_DEADLOCK = "WE ROLL BACK TRANSACTION"
    self._BEGIN_TRX = "TRANSACTION"
    self._TRASACTION_LENGTH = 9

    self._es = Elasticsearch(self._GENERAL_CONFIG["ES_HOST"])
    self._ec2dict = dict()
    self._data = list()
    self._num_of_total_doc = 0
    self._reaminer = RawFileRemainer(self._LOG_CONFIG["RAW_OUTPUT_DIR"])

    self._now = datetime.now()

  def validateLogDate(self, lines):
    delta = timedelta(hours=2)
  
    for line in lines:
      if not line:
        continue
      elif line.startswith("# Time: "):
        log_time = datetime.strptime(line[8:], "%y%m%d %H:%M:%S")
        log_time = log_time.replace(tzinfo=tz.tzutc()).astimezone(zoneinfo.gettz(self._GENERAL_CONFIG["TIMEZONE"]))
        log_time = log_time.replace(tzinfo=None)
        print(self._now, log_time)
        print("diff :", self._now - log_time)
        if (self._now - log_time) > delta:
          return False
        else:
          return True
      
    return True
    
  def initElasticsearchIndex(self):
    self._ES_INDEX = self._GENERAL_CONFIG["INDEX_PREFIX"] + "-" + datetime.strftime(self._now, "%Y.%m")

  def initEC2InstancesInVpc(self, region, vpc):
    ec2 = boto3.resource("ec2", region_name=region)
    vpc = ec2.Vpc(vpc)
    for i in vpc.instances.all():
      for tag in i.tags:
        if tag['Key'] == 'Name':
          self._ec2dict[i.private_ip_address] = "".join(tag['Value'].split())

  def getRdsLog(self, log_filename):
    client = boto3.client("rds", region_name=self._GENERAL_CONFIG["AWS_RDS_REGION_ID"])
    db_files = client.describe_db_log_files(DBInstanceIdentifier=self._GENERAL_CONFIG["RDS_ID"])

    if not any(log["LogFileName"] == log_filename for log in db_files["DescribeDBLogFiles"]):
      return ""

    marker = "0"
    log_data = ""

    # Used like a do-while loop: fetch the first chunk, then keep fetching while more data is pending.
    ret = client.download_db_log_file_portion(
        DBInstanceIdentifier=self._GENERAL_CONFIG["RDS_ID"],
        LogFileName=log_filename,
        Marker=marker,
        NumberOfLines=500)
    log_data = ret["LogFileData"]
    marker = ret["Marker"]

    while ret["AdditionalDataPending"]:
      ret = client.download_db_log_file_portion(
        DBInstanceIdentifier=self._GENERAL_CONFIG["RDS_ID"],
        LogFileName=log_filename,
        Marker=marker,
        NumberOfLines=500)

      log_data += ret["LogFileData"]
      marker = ret["Marker"]
      
    self._reaminer.clearOutOfDateRawFiles()
    self._reaminer.makeRawLog("mysql-error.log." + str(datetime.now().utcnow().hour), log_data)

    return log_data

  def getRdsLog4Debug(self, path):
    content = ""
    import codecs
    f = codecs.open(path, "r", "utf-8")
    while True:
      l = f.readline()
      content += l
      if not l: break
    return content

  def validate_log_date(self, line):
    delta = timedelta(hours=2)

    m = self._REGEX4REFINE["GENERAL_ERR"].match(line)
    if m:
      log_time = datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
      log_time = log_time.replace(tzinfo=tz.tzutc()).astimezone(zoneinfo.gettz(self._GENERAL_CONFIG["TIMEZONE"]))
      log_time = log_time.replace(tzinfo=None)
      if (self._now - log_time) > delta:
        return False
    elif self._BEGIN_DEADLOCK in line:
      m = self._REGEX4REFINE["DEADLOCK"].match(line)
      log_time = datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
      log_time = log_time.replace(tzinfo=tz.tzutc()).astimezone(zoneinfo.gettz(self._GENERAL_CONFIG["TIMEZONE"]))
      log_time = log_time.replace(tzinfo=None)
      if (self._now - log_time) > delta:
        return False

    return True

  def createTemplate(self, template_name):
    template_body = {
      "template" : "rds_errorlog-*",
      "settings" : {
        "number_of_shards": 1 }
    }

    response = self._es.indices.put_template(name=template_name, body=template_body)
    if response["acknowledged"]:
      print("Create template success.")
    else:
      print("Create template failed.")

  def appendDoc2Data(self, doc, flush=False):
    self._data.append({"index": {
                         "_index": self._ES_INDEX,
                         "_type": self._GENERAL_CONFIG["RDS_ID"] }})
    self._data.append(doc)

    self._num_of_total_doc += 1

    
    if len(self._data) > 10000 or flush:
      self._es.bulk(index=self._ES_INDEX, body=self._data, refresh=flush)
      print("%s : Writing %s bulk lines to Elasticsearch" % (str(datetime.now()), len(self._data)))
      # Reset the buffer so documents that were already indexed are not re-sent.
      self._data = []

  def run(self):
    self.initElasticsearchIndex()
    log_filename = self._ERRORLOG_PREFIX + str((self._now).utcnow().hour)
    log_data = self.getRdsLog(log_filename)

    if not log_data:
      print("%s does not exist!" % (log_filename))
      return -1

    lines = log_data.split("\n")
    if len(lines) > 0:
      if not self.validateLogDate(lines):
        print("%s already read log!" % (log_filename))
        return -2
    else:
      print("%s is empty!" % (log_filename))
      return -3

    self.initEC2InstancesInVpc(
      self._GENERAL_CONFIG["AWS_EC2_REGION_ID"],
      self._GENERAL_CONFIG["AWS_EC2_VPC_ID"])
    self.createTemplate(self._GENERAL_CONFIG["INDEX_PREFIX"])

    print("%s : Ready to write %s in %s" % (str(datetime.now()), log_filename, self._ES_INDEX))
    i = 0
    doc = {}

    while i < len(lines):
      line = lines[i]
      if not line:
        i += 1
        continue

      if doc:
        self.appendDoc2Data(doc)

      doc = {}
      m = self._REGEX4REFINE["GENERAL_ERR"].match(line)
      if m:
        doc["type"] = "Errorlog"
        doc["code"] = m.group(2)
        doc["severity"] = m.group(3)

        # It need to be parse message additionally.
        # Specific cases as below.
        message = m.group(4)
        if self._ABORTED_CONN_MSG in message:
          doc["detail"] = self._ABORTED_CONN_MSG
          match = self._REGEX4REFINE["ABORTED_CONN"].search(message)
          doc["db"] = match.group(1)
          doc["user"] = match.group(2)
          doc["host"] = match.group(3)
          ip_addr = match.group(3)
          if ip_addr not in self._ec2dict:
            doc["name"] = "Missed"
          else:
            doc["name"] = self._ec2dict[ip_addr]
        elif self._ACCESS_DENY_MSG in message:
          doc["detail"] = self._ACCESS_DENY_MSG
          match = self._REGEX4REFINE["ACCESS_DENY"].search(message)
          doc["user"] = match.group(1)
          doc["host"] = match.group(2)
        else:
          doc["detail"] = "Other"
        doc["message"] = message

        timestamp = datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
        timestamp = timestamp.replace(tzinfo=tz.tzutc()).astimezone(zoneinfo.gettz(self._GENERAL_CONFIG["TIMEZONE"]))
        doc["timestamp"] = timestamp.isoformat()

      elif self._BEGIN_DEADLOCK in line:
        doc["type"] = "Deadlock"
        i += 1  # skip the "deadlock detected" message line
        m = self._REGEX4REFINE["DEADLOCK"].match(lines[i])

        timestamp = datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
        timestamp = timestamp.replace(tzinfo=tz.tzutc()).astimezone(zoneinfo.gettz(self._GENERAL_CONFIG["TIMEZONE"]))
        doc["timestamp"] = timestamp.isoformat()
        doc["code"] = m.group(2)
        i += 1 # get next line

        # This transaction wait for using the lock.
        tr_a = ""
        for offset in range(self._TRASACTION_LENGTH):
          tr_a += lines[i + offset] + "\n"
        i += self._TRASACTION_LENGTH
        doc["transaction_a"] = tr_a

        # Skip non-readable messages.
        while self._BEGIN_TRX not in lines[i]:
          i += 1

        # This transaction hold the lock.
        tr_b = ""
        for offset in range(self._TRASACTION_LENGTH):
          tr_b += lines[i + offset] + "\n"

        doc["transaction_b"] = tr_b

        m = self._REGEX4REFINE["HOLD_USER_INFO"].search(tr_b)
        doc["hold_lock_thread_id"] = m.group(1)
        doc["hold_lock_query_id"] = m.group(2)
        doc["hold_lock_usr"] = m.group(3)
        doc["hold_lock_ip"] = m.group(4)

        m = self._REGEX4REFINE["HOLD_LOCK_INFO"].search(tr_b)
        doc["hold_lock_db"] = m.group(1)
        doc["hold_lock_tb"] = m.group(2)
        doc["hold_lock_trx_id"] = m.group(3)
        doc["hold_lock_trx_mode"] = m.group(4)

        while self._END_DEADLOCK not in lines[i]:
          i += 1
        m = self._REGEX4REFINE["ROLLBACK_TR"].match(lines[i])
        rollback = ""
        if m.group(1) == "1": rollback = "a"
        else: rollback = "b"
        doc["rollback"] = rollback
      else:
        print("Parse Error at", i)
        doc["type"] = "Other"
        doc["message"] = line

      i += 1

    if doc:
      print("%s : Write last data that length is %s (%s)" % (str(datetime.now()), len(self._data), len(doc)))
      self.appendDoc2Data(doc, flush=True)

    print("Written Errorlogs : %s" % str(self._num_of_total_doc))
    # create ES client, create index
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    if es.indices.exists('booklib'):
        print("deleting '%s' index..." % ('booklib'))
        res = es.indices.delete(index='booklib')
        print(" response: '%s'" % (res))
    # since we are running locally, use one shard and no replicas
    request_body = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0
        },
        'mappings': {
            'Book': {
                'properties': {
                    "text": {
                        "type": "text",
                        "index_options": "offsets",
                        "term_vector": "with_positions_offsets"
                    }
                }
            }
        }
    }
    print("creating '%s' index..." % ('booklib'))
    res = es.indices.create(index='booklib', body=request_body)
    print(" response: '%s'" % (res))
    # bulk index the data
    print("bulk indexing...")
    res = es.bulk(index='booklib', body=bulk_data, refresh=True)
                store['images'] = find_images(store['placeid'])
                #print store
                op_dict = {
                    "index": {
                        "_index": INDEX_NAME,
                        "_type": TYPE_NAME,
                        "_id": store[ID_FIELD]
                    }
                }

                bulk_data.append(op_dict)
                bulk_data.append(store)
                # Sleep to respect the Google query rate limit.
                time.sleep(1)
        try:
            # bulk index the data
            print("bulk indexing...")
            res = es.bulk(index = INDEX_NAME, body = bulk_data, refresh = True)
        except Exception as ex:
            print("Failed to index %s: %s" % (afile, ex))

    print("One file done: %s ..." % afile)

    # sanity check
    res = es.search(index = INDEX_NAME, size=2, body={"query": {"match_all": {}}})
    print(" response: '%s'" % (res))

    print("results:")
    for hit in res['hits']['hits']:
        print(hit["_source"])
from elasticsearch import Elasticsearch
import json

with open('./news_data/news.json') as f:
    Data = json.load(f)

es = Elasticsearch()

action = ''
cnt = 0
for data in Data:
    query = '{"index": {"_index": "hw1", "_type": "news"}}\n' + json.dumps(
        data, ensure_ascii=False)
    action += query + '\n'
    cnt += 1
    print(cnt)
    if cnt % 1000 == 0:
        es.bulk(body=action)
        action = ''

if action:
    es.bulk(body=action)
    with table.batch_writer() as batch:
        for business in business_data['businesses']:
            print(business['id'])
            table.put_item(
                Item={
                    'id':
                    business['id'],
                    'name':
                    business['name'],
                    'rating':
                    str(business['rating']),
                    'coordinates': {
                        'latitude': str(business['coordinates']['latitude']),
                        'longitude': str(business['coordinates']['longitude']),
                    },
                    'location':
                    business['location'],
                    'cuisine':
                    term,
                    'review_count':
                    business['review_count'],
                    'insertedAtTimestamp':
                    time.strftime("%d/%m/%Y-%H:%M:%S", time.localtime())
                })
            # ES
            payload = {"id": business['id'], "cuisine": term}
            bulk_file += '{ "index" : { "_index" : "restaurant", "_type" : "_doc", "_id" : "' + \
                str(business['id']) + '" } }\n'
            bulk_file += json.dumps(payload) + '\n'
    es.bulk(bulk_file)
Beispiel #49
0
def main():
    # es = Elasticsearch(hosts=[{'host': 'elasticsearch.aws.blahblah.com', 'port': '9200'}])
    local_es = Elasticsearch()
    local_client = client.IndicesClient(local_es)

    # ### Analyzers, Defaults, and Preventing Analysis
    #
    # Analysis is the process of chopping up your text and storing it in a form that can be searched efficiently against.
    #
    # #### Read this:
    #
    # https://www.elastic.co/guide/en/elasticsearch/guide/current/custom-analyzers.html
    #
    # An analyzer is, in order, a sequence of optional
    # * character filters
    # * tokenizers
    # * token filters
    #
    # To prevent analysis, you can specify "not_analyzed" on the index itself.  The Interwebs also suggest "keyword" as the analyzer for a field, but some folks claim it does some simple analysis.
    #
    # The default analyzer (if unspecified!) for string fields is "standard."  In a custom analyzer, it would be defined:
    #
    #     {
    #         "type":      "custom",
    #         "tokenizer": "standard",
    #         "filter":  [ "lowercase", "stop" ]
    #     }
    #
    # More on default analysis from the docs (https://www.elastic.co/guide/en/elasticsearch/guide/current/_controlling_analysis.html):
    #
    # >While we can specify an analyzer at the field level, how do we determine which analyzer is used for a field if none is specified at the field level?
    # >
    # >Analyzers can be specified at several levels. Elasticsearch works through each level until it finds an analyzer that it can use. At index time, the order is as follows:
    # >
    # >1. The analyzer defined in the field mapping, else
    # >2. The analyzer named default in the index settings, which defaults to
    # >3. The standard analyzer
    # >
    # >...At search time, the sequence is slightly different:...
    # >
    # >1. The analyzer defined in the query itself, else
    # >2. The search_analyzer defined in the field mapping, else
    # >3. The analyzer defined in the field mapping, else
    # >4. The analyzer named default_search in the index settings, which defaults to
    # >5. The analyzer named default in the index settings, which defaults to
    # >6. The standard analyzer
    #
    # #### We can inspect analysis with the "analyze" function (or "_analyze" in the curl style).
    if local_es.indices.exists('my_index'):
        local_es.indices.delete(index='my_index')
    local_es.indices.create(index='my_index')

    # this is the default analyzer ES will use if you don't specify one! Specify one!
    print(
        local_client.analyze(index='my_index',
                             analyzer='standard',
                             text='My kitty-cat is adorable.'))

    # A utility to make analysis results easier to read:
    def get_analyzer_tokens(result):
        ''' Utility to combine tokens in an analyzer result. '''
        tokens = result[u'tokens']
        print(tokens)
        return ' '.join([token['token'] for token in tokens])

    get_analyzer_tokens(
        local_client.analyze(index='my_index',
                             analyzer="standard",
                             text='My kitty-cat\'s a pain in the neck.'))

    # **NB: Prevent analysis with "keyword" analyzer, or set the index itself as "not_analyzed" in settings.**
    #
    # But if you do this, you need to match on EXACT field contents to search for it.  Best to keep an analyzed copy too, if it's meant to be english searchable text.
    get_analyzer_tokens(
        local_client.analyze(index='my_index',
                             analyzer='keyword',
                             text='My kitty-cat\'s a pain in the neck.'))

    # ## The Built-In ES "English" Analyzer:
    # ### A useful analyzer for text is the built-in English one, which does this, approximately:
    #
    # https://www.elastic.co/guide/en/elasticsearch/guide/current/language-intro.html
    #
    # See:
    # https://simpsora.wordpress.com/2014/05/02/customizing-elasticsearch-english-analyzer/
    #
    # >Tokenizer: Standard tokenizer
    #
    # >TokenFilters:
    # >* Standard token filter
    # >* English possessive filter, which removes trailing 's from words
    # >* Lowercase token filter
    # >* Stop token filter
    # >* Keyword marker filter, which protects certain tokens from modification by stemmers
    # >* Porter stemmer filter, which reduces words down to a base form (“stem”)
    #
    #
    # These are the stop-words defined for English:
    #
    #     a, an, and, are, as, at, be, but, by, for, if, in, into, is, it,
    #     no, not, of, on, or, such, that, the, their, then, there, these,
    #     they, this, to, was, will, with
    #
    # If you want to customize you can create a new filter yourself or use a file in your config directory for ES.
    # Try it on some text and see...
    get_analyzer_tokens(
        local_client.analyze(index='my_index',
                             analyzer='english',
                             text='My kitty-cat\'s a pain in the neck.'))

    # If you wanted to customize the 'english' analyzer with your own special rules (extra stopwords etc), see here: https://www.elastic.co/guide/en/elasticsearch/guide/current/configuring-language-analyzers.html
    #

    # ## Analyzers and Custom Analyzers

    # You want to make sure you are explicit about types in your data, so that ES doesn't just guess and maybe get it wrong. Also, this is how you set explicit analysis.

    #
    #
    # Create a setting for the index:
    #
    #     PUT /my_index
    #     {
    #         "settings": {
    #             "analysis": {
    #                 "char_filter": { ... custom character filters ... },
    #                 "tokenizer":   { ...    custom tokenizers     ... },
    #                 "filter":      { ...   custom token filters   ... },
    #                 "analyzer":    { ...    custom analyzers referring to the definitions above ... }
    #             }
    #         }
    #     }
    #
    # For example - this saves a bunch of analysis components into an analyzer called 'my_analyzer':
    #
    #     PUT /my_index
    #     {
    #         "settings": {
    #             "analysis": {
    #                 "char_filter": {
    #                     "&_to_and": {
    #                         "type":       "mapping",
    #                         "mappings": [ "&=> and "]
    #                 }},
    #                 "filter": {
    #                     "my_stopwords": {
    #                         "type":       "stop",
    #                         "stopwords": [ "the", "a" ]
    #                 }},
    #                 "analyzer": {
    #                     "my_analyzer": {
    #                         "type":         "custom",
    #                         "char_filter":  [ "html_strip", "&_to_and" ],
    #                         "tokenizer":    "standard",
    #                         "filter":       [ "lowercase", "my_stopwords" ]
    #                 }}
    #     }}}
    #
    #  Then you **use it**, by referring to it in a mapping for a document in this index:
    #
    #      PUT /my_index/_mapping/my_type
    #     {
    #         "properties": {
    #             "title": {
    #                 "type":      "string",
    #                 "analyzer":  "my_analyzer"
    #             }
    #         }
    #     }
    #
    # #### Remember: If you don't assign it to a field in a mapping, you aren't using it.
    #
    # In Python:

    MY_SETTINGS = {
        "settings": {
            "analysis": {
                "char_filter": {
                    "&_to_and": {
                        "type": "mapping",
                        "mappings": ["&=> and "]
                    }
                },
                "filter": {
                    "my_stopwords": {
                        "type": "stop",
                        "stopwords": ["the", "a"]
                    }
                },
                "analyzer": {
                    "my_analyzer": {
                        "type": "custom",
                        "char_filter": ["html_strip", "&_to_and"],
                        "tokenizer": "standard",
                        "filter": ["lowercase", "my_stopwords"]
                    }
                }
            }
        }
    }

    MAPPING = {
        "my_doc_type": {
            "properties": {
                "title": {
                    "type": "string",
                    "analyzer": "my_analyzer"
                }
            }
        }
    }

    # ## Stopwords Note
    #
    # The default list of stopwords is indicated thusly:
    #
    # >"stopwords": "\_english\_"
    #
    # So you can specify both that filter and a custom stopwords list, if you want.
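    #
    # A sketch (not in the original notebook) of combining the built-in English
    # stopword set with extra custom stopwords: one stop filter uses the
    # predefined "_english_" list, a second one adds your own words, and a
    # custom analyzer chains both. Illustrative only; not applied to any index
    # in this script.
    _STOPWORD_SETTINGS = {
        "settings": {
            "analysis": {
                "filter": {
                    "english_stop": {"type": "stop", "stopwords": "_english_"},
                    "extra_stop": {"type": "stop", "stopwords": ["kitty", "cat"]}
                },
                "analyzer": {
                    "my_stop_analyzer": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["lowercase", "english_stop", "extra_stop"]
                    }
                }
            }
        }
    }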
    if local_es.indices.exists('my_index'):
        local_es.indices.delete(index='my_index')
    local_es.indices.create(index='my_index', body=json.dumps(MY_SETTINGS))
    local_es.indices.put_mapping(index='my_index',
                                 doc_type="my_doc_type",
                                 body=json.dumps(MAPPING))

    # Check that your mapping looks right!
    print(local_client.get_mapping(index='my_index'))

    res = local_client.analyze(
        index='my_index',
        analyzer='my_analyzer',
        text="<p>This is the title & a Capitalized Word!</p>")
    get_analyzer_tokens(res)

    # ## Tokenizers vs. Analyzers - Be Careful.
    #
    # Some of the names in ES are confusing.  There is a **"standard" analyzer** and a **"standard" tokenizer**. https://www.elastic.co/guide/en/elasticsearch/guide/current/standard-tokenizer.html#standard-tokenizer
    #
    # Check them out:
    get_analyzer_tokens(
        local_client.analyze(
            index='my_index',
            analyzer='standard',
            text='My kitty-cat\'s not a pain in the \'neck\'!'))

    #  The difference is subtle but there.
    get_analyzer_tokens(
        local_client.analyze(
            index='my_index',
            tokenizer="standard",
            text='My kitty-cat\'s not a pain in the \'neck\'!'))

    # However, if you use the english analyzer it will lowercase those tokens and also remove the negation,
    # because "not" is in the stopwords list:
    get_analyzer_tokens(
        local_client.analyze(
            index='my_index',
            analyzer="english",
            tokenizer="standard",
            text='My kitty-cat\'s not a pain in the \'neck\'!'))

    # ## Indexing Yelp Data
    df = pd.read_msgpack("./data/yelp_df_forES.msg")
    print(df.head())

    # test with a small sample if you want
    dfshort = df.query('stars >= 5 and net_sentiment > 35')
    print(len(dfshort))
    print(dfshort.head())

    # filter out any rows with a nan for sent_per_token, which breaks bulk load:
    df = df[df.sent_per_token.isnull() != True]

    MAPPING = {
        'review': {
            'properties': {
                'business_id': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                'date': {
                    'index': 'not_analyzed',
                    'format': 'dateOptionalTime',
                    'type': 'date'
                },
                'review_id': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                'stars': {
                    'index': 'not_analyzed',
                    'type': 'integer'
                },
                'text': {
                    'index': 'analyzed',
                    'analyzer': 'english',
                    'store': 'yes',
                    "term_vector": "with_positions_offsets_payloads",
                    'type': 'string'
                },
                'fake_name': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                'text_orig': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                'user_id': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                'net_sentiment': {
                    'index': 'not_analyzed',
                    'type': 'integer'
                },
                'sent_per_token': {
                    'index': 'not_analyzed',
                    'type': 'float'
                }
            }
        }
    }

    if local_es.indices.exists('yelp'):
        local_es.indices.delete(index='yelp')
    local_es.indices.create(index='yelp')
    local_es.indices.put_mapping(index='yelp',
                                 doc_type='review',
                                 body=json.dumps(MAPPING))

    # Bulk data is structured as alternating opt_dict and data dicts.
    bulk_data = []

    for index, row in df.iterrows():
        data_dict = {}
        data_dict['text_orig'] = row['text']
        data_dict['text'] = row['text']
        data_dict['net_sentiment'] = row['net_sentiment']
        data_dict['sent_per_token'] = row['sent_per_token']
        data_dict['stars'] = row['stars']
        data_dict['fake_name'] = row['fake_name']
        data_dict['user_id'] = row['user_id']
        data_dict['business_id'] = row['business_id']
        data_dict['date'] = row['date']
        data_dict['review_id'] = row['review_id']
        op_dict = {
            "index": {
                "_index": 'yelp',
                "_type": 'review',
                "_id": row['review_id']
            }
        }
        bulk_data.append(op_dict)
        bulk_data.append(data_dict)

    pprint(bulk_data[0])
    pprint(bulk_data[1])
    print(len(bulk_data))

    # May time out with a large bulk_data payload, or error out without a clear reason.  Mine did, so see below.
    # res = local_es.bulk(index = 'yelp', body = bulk_data)

    # In order to find the error, I did them one-by-one, with a try.
    for ind, obj in enumerate(bulk_data):
        # every other one is the data, so use those to do it one by one
        if ind % 2 != 0:
            try:
                local_es.index(index='yelp',
                               doc_type='review',
                               id=obj['review_id'],
                               body=json.dumps(obj))
            except Exception:
                print(obj)
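
    # An alternative sketch (not in the original notebook) for the timeout
    # problem noted above: send the bulk body in smaller chunks rather than one
    # giant request or one document at a time. chunk_pairs is the number of
    # (action, document) pairs per request. The helper is only a sketch and is
    # not called by the rest of this script.
    def bulk_in_chunks(es_conn, index_name, pairs, chunk_pairs=500):
        '''Send an alternating action/document bulk body in chunks.'''
        for start in range(0, len(pairs), chunk_pairs * 2):
            chunk = pairs[start:start + chunk_pairs * 2]
            if chunk:
                es_conn.bulk(index=index_name, body=chunk)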

    local_es.search(index='yelp', doc_type='review', q='pizza-cookie')

    # Remember that score relevancy results are based on the indexed TF-IDF for the doc and docs:
    #     https://www.elastic.co/guide/en/elasticsearch/guide/current/relevance-intro.html

    # Want to explain why something matched?  You need the id of the matched doc.
    local_es.explain(index='yelp',
                     doc_type='review',
                     q='pizza-cookie',
                     id=u'fmn5yGrPChOYMR2vGOIrYA')

    # ### More Like This
    #
    # A variety of options for finding similar documents, including term counts and custom stop words:
    # https://www.elastic.co/guide/en/elasticsearch/reference/2.3/query-dsl-mlt-query.html
    #
    #
    text = df.iloc[0].text
    print(text)

    QUERY = {
        "query": {
            "more_like_this": {
                "fields": ["text"],
                "like_text": text,
                "analyzer": "english",
                "min_term_freq": 2
            }
        }
    }

    # Result is not brilliant, though.  You could limit the hits unless a score threshold is hit.
    pprint(
        local_es.search(index='yelp',
                        doc_type='review',
                        body=json.dumps(QUERY)))

    # ### Suggestions: For Mispellings
    #
    # Can be added to queries too, to help if there are no matches.  Still in development, though. See: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters.html#search-suggesters
    SUGGESTION = {
        "my-suggestion": {
            "text": "cheese piza",
            "term": {
                "field": "text"
            }
        }
    }

    # I don't love the results, tbh.  Fail on cheese.
    pprint(local_es.suggest(index='yelp', body=SUGGESTION))

    # ## Reminders:
    # * check your mapping on your fields
    # * check your analyzer results - they can be mysterious and hidden; if you configure wrong, it will use defaults...
    # * check your document tokenization
    # * use multi-fields to be sure of matches that may need stopwords too (see the sketch below)
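    #
    # A sketch (not in the original notebook) of the multi-field idea from the
    # last reminder: index the same string once through the english analyzer
    # and once untouched ("raw"), so both analyzed and exact matches work.
    # Field and type names below are illustrative only.
    _MULTIFIELD_MAPPING = {
        "my_doc_type": {
            "properties": {
                "title": {
                    "type": "string",
                    "analyzer": "english",
                    "fields": {
                        "raw": {"type": "string", "index": "not_analyzed"}
                    }
                }
            }
        }
    }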

    # ## Let's Index the Businesses too
    biz = pd.read_msgpack("data/biz_stats_df.msg")
    print(len(biz))
    pprint(biz[0:2])

    B_MAPPING = {
        'business': {
            'properties': {
                'business_id': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                'reviews': {
                    'index': 'not_analyzed',
                    'type': 'integer'
                },
                'stars_median': {
                    'index': 'not_analyzed',
                    'type': 'float'
                },
                'stars_mean': {
                    'index': 'not_analyzed',
                    'type': 'float'
                },
                'text_length_median': {
                    'index': 'not_analyzed',
                    'type': 'float'
                },
                'fake_name': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                'net_sentiment_median': {
                    'index': 'not_analyzed',
                    'type': 'float'
                },
                'sent_per_token_median': {
                    'index': 'not_analyzed',
                    'type': 'float'
                }
            }
        }
    }

    # local_es.indices.delete(index='yelp')  # nb: this errors the first time you run it. comment out.
    # local_es.indices.create(index='yelp')  # do not do this is you already made the reviews!
    local_es.indices.put_mapping(index='yelp',
                                 doc_type='business',
                                 body=json.dumps(B_MAPPING))

    bulk_data = []

    for index, row in biz.iterrows():
        data_dict = {}
        data_dict['net_sentiment_median'] = row['net_sentiment_median']
        data_dict['sent_per_token_median'] = row['sent_per_token_median']
        data_dict['stars_median'] = row['stars_median']
        data_dict['stars_mean'] = row['stars_mean']
        data_dict['fake_name'] = row['fake_name']
        data_dict['text_length_median'] = row['text_length_median']
        data_dict['business_id'] = row['business_id']
        data_dict['reviews'] = row['reviews']
        op_dict = {
            "index": {
                "_index": 'yelp',
                "_type": 'business',
                "_id": row['business_id']
            }
        }
        bulk_data.append(op_dict)
        bulk_data.append(data_dict)

    # May time out with a large bulk_data payload or fail without a clear reason; if so, fall back to indexing one-by-one as was done for the reviews above.
    res = local_es.bulk(index='yelp', body=bulk_data)

    print(
        local_es.search(index='yelp',
                        doc_type='business',
                        q='JokKtdXU7zXHcr20Lrk29A'))

    # ## Aggregate Queries to get Business ID's and More
    #
    #
    # Here we are using the operator "and" to make sure all words in the search match, and then getting counts of matching business id's.
    QUERY = {
        "query": {
            "match": {
                "text": {
                    "query": "good pizza",
                    "operator": "and"
                }
            }
        },
        "aggs": {
            "businesses": {
                "terms": {
                    "field": "business_id"
                }
            }
        }
    }

    pprint(
        local_es.search(index="yelp",
                        doc_type="review",
                        body=json.dumps(QUERY)))

    # exact match on field: https://www.elastic.co/guide/en/elasticsearch/guide/master/_finding_exact_values.html
    # requires not indexed field for the match
    QUERY = {
        "query": {
            "constant_score": {
                "filter": {
                    "term": {
                        "business_id": "VVeogjZya58oiTxK7qUjAQ"
                    }
                }
            }
        }
    }

    pprint(
        local_es.search(index="yelp",
                        doc_type="business",
                        body=json.dumps(QUERY)))
Beispiel #50
0
class Esql():
    def __init__(self):
        conf_file = open(os.path.join(ESQL_HOME, 'conf', 'esql.yml'), 'r')
        app_conf = yaml.load(conf_file)
        conf_file.close()
        self.es_hosts = app_conf['elastic'].get('hosts')
        self.es_handler = Elasticsearch(self.es_hosts)
        self.lexer = lex(module=lexer, optimize=True, debug=False)
        self.parser = yacc(debug=False, module=parser)

    def get_host_url(self):
        return "http://" + self.es_hosts[0]['host'] + ":" + str(
            self.es_hosts[0]['port'])

    def _exec_query(self, ast):

        try:
            stmt = Query(ast)
        except Exception:
            return http_response_error('Parse statement to dsl error!')

        try:
            hits = self.es_handler.search(index=stmt._index,
                                          doc_type=stmt._type,
                                          body=stmt.dsl(),
                                          request_timeout=100)
        except ElasticsearchException as e:
            return http_response_error(str(e))

        stmt_res = None
        try:
            stmt_res = response_hits(hits)
        except Exception as e:
            return http_response_nor(str(e))
        return http_response_succes(stmt_res)

    def _exec_create_table(self, ast):

        start_time = time.time()
        try:
            stmt = Create(ast)
        except Exception:
            return http_response_error('Parse statement to dsl error!')
        try:
            res = self.es_handler.indices.create(index=stmt._index,
                                                 body=stmt._options,
                                                 request_timeout=100,
                                                 ignore=400)
            if stmt._type == None:
                stmt._type = 'base'
            res = self.es_handler.indices.put_mapping(index=stmt._index,
                                                      doc_type=stmt._type,
                                                      body=stmt.dsl(),
                                                      request_timeout=100)
        except ElasticsearchException as e:
            return http_response_nor(str(e))

        stmt_res = None

        end_time = time.time()

        took = int((end_time - start_time) * 1000)
        try:
            stmt_res = response_nor(res, took)
        except Exception as e:
            return http_response_error(str(e))
        return http_response_succes(stmt_res)

    def _exec_show_tables(self, ast):

        start_time = time.time()

        try:
            res = self.es_handler.cat.indices(v=True,
                                              bytes='b',
                                              h=[
                                                  'index', 'status', 'pri',
                                                  'rep', 'docs.count',
                                                  'store.size'
                                              ])
        except ElasticsearchException as e:
            return http_response_error(str(e))

        stmt_res = res

        end_time = time.time()

        took = int((end_time - start_time) * 1000)
        try:
            stmt_res = response_cat(res, took)
        except Exception as e:
            return http_response_error(str(e))
        return http_response_succes(stmt_res)

    def _exec_desc_table(self, ast):
        start_time = time.time()
        try:
            stmt = Describe(ast)
        except Exception:
            return http_response_error('Parse statement to dsl error!')
        try:
            res = self.es_handler.indices.get_mapping(index=stmt._index,
                                                      doc_type=stmt._type)
        except ElasticsearchException as e:
            return http_response_error(e.error)

        stmt_res = None

        end_time = time.time()

        took = int((end_time - start_time) * 1000)
        try:
            stmt_res = response_mappings(res, took)
        except Exception as e:
            return http_response_error(str(e))
        return http_response_succes(stmt_res)

    def _exec_drop_table(self, ast):
        start_time = time.time()
        try:
            stmt = Drop(ast)
        except Exception:
            return http_response_error('Parse statement to dsl error!')
        try:
            res = self.es_handler.indices.delete(index=stmt._index)
        except ElasticsearchException as e:
            return http_response_error(e.error)

        stmt_res = None

        end_time = time.time()

        took = int((end_time - start_time) * 1000)
        try:
            stmt_res = response_nor(res, took)
        except Exception as e:
            return http_response_error(str(e))
        return http_response_succes(stmt_res)

    def _exec_insert_into(self, ast):
        start_time = time.time()
        try:
            stmt = Insert(ast)
        except Exception:
            return http_response_error('Parse statement to dsl error!')
        try:
            parms = stmt.metas
            if stmt._type == None:
                stmt._type = 'base'
            res = self.es_handler.index(index=stmt._index,
                                        doc_type=stmt._type,
                                        body=stmt.dsl(),
                                        **parms)

        except ElasticsearchException as e:
            return http_response_error(str(e))

        stmt_res = None
        end_time = time.time()
        took = int((end_time - start_time) * 1000)
        try:
            stmt_res = response_nor(res, took)
        except Exception as e:
            return http_response_error(str(e))
        return http_response_succes(stmt_res)

    def _exec_bulk_into(self, ast):

        try:
            stmt = Bulk(ast)
        except Exception:
            return http_response_error('Parse statement to dsl error!')
        try:
            if stmt._type == None:
                stmt._type = 'base'
            res = self.es_handler.bulk(index=stmt._index,
                                       doc_type=stmt._type,
                                       body=stmt.dsl())

        except ElasticsearchException as e:
            return http_response_error(str(e))

        stmt_res = None

        try:
            stmt_res = response_bulk(res)
        except Exception as e:
            return http_response_error(str(e))
        return http_response_succes(stmt_res)

    def _exec_update(self, ast):
        start_time = time.time()
        try:
            stmt = Update(ast)
        except Exception:
            return http_response_error('Parse statement to dsl error!')
        try:
            if stmt._type == None:
                stmt._type = 'base'
            res = self.es_handler.update(index=stmt._index,
                                         doc_type=stmt._type,
                                         body=stmt.dsl(),
                                         **stmt.conditions)

        except ElasticsearchException as e:
            return http_response_error(str(e))

        stmt_res = None

        end_time = time.time()
        took = int((end_time - start_time) * 1000)
        try:
            stmt_res = response_nor(res, took)
        except Exception as e:
            return http_response_error(str(e))
        return http_response_succes(stmt_res)

    def _exec_upsert(self, ast):
        start_time = time.time()
        try:
            stmt = Upsert(ast)
        except Exception:
            return http_response_error('Parse statement to dsl error!')
        try:
            if stmt._type == None:
                stmt._type = 'base'
            res = self.es_handler.update(index=stmt._index,
                                         doc_type=stmt._type,
                                         body=stmt.dsl(),
                                         **stmt.conditions)

        except ElasticsearchException as e:
            return http_response_error(str(e))

        stmt_res = None

        end_time = time.time()
        took = int((end_time - start_time) * 1000)
        try:
            stmt_res = response_nor(res, took)
        except Exception as e:
            return http_response_error(str(e))
        return http_response_succes(stmt_res)

    def _exec_delete(self, ast):
        start_time = time.time()
        try:
            stmt = Delete(ast)
        except Exception:
            return http_response_error('Parse statement to dsl error!')
        try:
            if stmt._type == None:
                stmt._type = 'base'
            res = self.es_handler.delete(index=stmt._index,
                                         doc_type=stmt._type,
                                         **stmt.conditions)

        except ElasticsearchException as e:
            return http_response_error(str(e))

        stmt_res = None

        end_time = time.time()
        took = int((end_time - start_time) * 1000)
        try:
            stmt_res = response_nor(res, took)
        except Exception as e:
            return http_response_error(str(e))
        return http_response_succes(stmt_res)

    def _exec_explain(self, ast):
        try:
            stmt = Explain(ast)
        except Exception:
            return http_response_error('Parse statement to dsl error!')
        return http_response_nor(stmt.dsl(), 202)

    def exec_statement(self, sql):
        ast = None
        try:
            ast = self.parser.parse(lexer=self.lexer.clone(),
                                    debug=False,
                                    input=sql)
        except Exception as e:
            return http_response_error(str(e))

        if ast == None:
            return http_response_error('parse statement error')

        if ast.get_type() == TK.TOK_QUERY:
            return self._exec_query(ast)
        elif ast.get_type() == TK.TOK_CREATE_TABLE:
            return self._exec_create_table(ast)
        elif ast.get_type() == TK.TOK_SHOW_TABLES:
            return self._exec_show_tables(ast)
        elif ast.get_type() == TK.TOK_DESC_TABLE:
            return self._exec_desc_table(ast)
        elif ast.get_type() == TK.TOK_INSERT_INTO:
            return self._exec_insert_into(ast)
        elif ast.get_type() == TK.TOK_BULK_INTO:
            return self._exec_bulk_into(ast)
        elif ast.get_type() == TK.TOK_UPDATE:
            return self._exec_update(ast)
        elif ast.get_type() == TK.TOK_UPSERT_INTO:
            return self._exec_upsert(ast)
        elif ast.get_type() == TK.TOK_DELETE:
            return self._exec_delete(ast)
        elif ast.get_type() == TK.TOK_DROP_TABLE:
            return self._exec_drop_table(ast)
        elif ast.get_type() == TK.TOK_EXPLAIN:
            return self._exec_explain(ast)
        else:
            return http_response_error('Syntax not supported!')
Beispiel #51
0
            if count_index % 2000 == 0:
                test_speed(es, count_index, bulk_action)
                bulk_action = []

            if count_index % 10000 == 0:
                ts = time.time()
                print "%s  per  %s  second"  %(count_index, ts-tb)
                tb = ts

        else:
            exist_item = es.get(index="activity", doc_type="manage", id=user_id, _source=True)['_source']
            update_item = compare_activity(item, exist_item)
            xdata = expand_index_action(update_item)
            bulk_action.extend([xdata[0], xdata[1]])
            count_index += 1
            if count_index % 2000 == 0:
                test_speed(es, count_index, bulk_action)
                bulk_action = []

            if count_index % 10000 == 0:
                ts = time.time()
                print "%s  per  %s  second"  %(count_index, ts-tb)
                tb = ts

    es.bulk(bulk_action, index="activity", doc_type="manage", timeout=30)





Beispiel #52
0
class ElasticObj:
    def __init__(self, ip="127.0.0.1"):
        '''
        :param ip: host (or host:port) of the Elasticsearch node to connect to
        '''
        # Without username/password
        self.es = Elasticsearch([ip])
        # With username/password
        # self.es = Elasticsearch([ip], http_auth=('elastic', 'password'), port=9200)

    def check(self):
        '''
        Return basic information about the connected ES cluster.
        :return:
        '''
        return self.es.info()

    def create_index(self, index_name, index_mappings):
        '''
        Create an index named index_name if it does not exist yet.
        :param index_name: name of the index to create
        :param index_mappings: mappings (and settings) body for the new index
        :return:
        '''
        if not self.es.indices.exists(index=index_name):
            _created = self.es.indices.create(index=index_name,
                                              body=index_mappings)
            print(_created)
            return _created

    def insert_one_document(self, index_name, index_type, body, id=None):
        '''
        Insert a single document (body) into the given index and type.
        If no id is given, ES generates one automatically.
        :param index_name: target index
        :param index_type: target type
        :param body: document to insert (dict)
        :param id: optional custom document id
        :return:
        '''
        _inserted = self.es.index(index=index_name,
                                  doc_type=index_type,
                                  body=body,
                                  id=id)
        print(_inserted['result'])
        return _inserted

    def index_data_fromCSV(self, index_name, index_type, csvfile):
        '''
        Read rows from a CSV file and index them into ES one by one.
        :param csvfile: path to the CSV file (full path)
        :return:
        '''
        '''
        data_list = csvop.read_csv(csvfile)
        index = 0
        doc = {}
        title = []
        title_num = len(data_list[0])
        for i in range(title_num):  # the first row holds the column titles
            title.append(data_list[0][i])
        for item in data_list:
            if index >= 1:
                for i in range(title_num):
                    doc[title[i]] = item[i]
                res = self.es.index(index=index_name,
                                    doc_type=index_type,
                                    body=doc)
                print(res['result'])
            index += 1
            #print(index)

    def insert_DataFrame(self, index_name, index_type, dataFrame):
        '''
        Bulk insert via the bulk API.
        The bulk body is a flat list of [{optionType: condition}, data, ...] pairs,
        where optionType can be index, delete or update, the condition can override
        the index/type per item, and data is the document to insert or update.
        :param index_name: default index to insert into
        :param index_type: default type to insert into
        :param dataFrame: pandas DataFrame holding the documents
        :return:
        '''
        dataList = dataFrame.to_dict(orient='records')
        # Interleave an empty {"index": {}} action line before every document.
        insertHeadInfoList = [{"index": {}} for i in range(len(dataList))]
        temp = [dict] * (len(dataList) * 2)
        temp[::2] = insertHeadInfoList
        temp[1::2] = dataList
        try:
            return self.es.bulk(index=index_name,
                                doc_type=index_type,
                                body=temp)
        except Exception as e:
            return str(e)
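

# Hedged usage sketch (not part of the original example): it assumes pandas is
# installed, an ES node answers on 127.0.0.1:9200, and the index/type names
# "demo_docs"/"base" are made up for illustration. It exercises the interleaved
# [action, document, action, document, ...] bulk body built by insert_DataFrame.
if __name__ == "__main__":
    import pandas as pd

    obj = ElasticObj("127.0.0.1")
    obj.create_index("demo_docs", {
        "mappings": {"base": {"properties": {"title": {"type": "text"}}}}
    })
    df = pd.DataFrame([{"title": "first doc"}, {"title": "second doc"}])
    res = obj.insert_DataFrame("demo_docs", "base", df)
    # "errors": False in the bulk response means every item was accepted.
    print(res.get("errors") if isinstance(res, dict) else res)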
Beispiel #53
0
    def scrape(self):
        i = 0
        courseNumList = self.writeToCourseNumList()
        coursePrefixList = self.writeToCoursePrefixList()
        courses = []
        course_state = [["0"] * 2 for i in range(54)]

        #elasticsearch host variable
        ES_HOST = {
            "host": "localhost",  #added
            "port": 9200  #added
        }
        INDEX_NAME = 'fall'  #added
        TYPE_NAME = 'somecscourse'  #added

        self.driver.find_element_by_xpath(
            '//select[@id="CLASS_SRCH_WRK2_STRM$35$"]/option[@value="1178"]'
        ).click()
        sleep(1)

        #While loop returns a list of dict objects that will be bulk indexed
        #into a newly created index
        while (i < len(courseNumList)):
            self.clearAndSearch(courseNumList[i], coursePrefixList[i])
            course_state[i][0] = coursePrefixList[i] + "_" + courseNumList[i]

            if self.checkSearch():
                i = i + 1
                continue
            if self.checkOverflow():
                self.driver.find_element_by_xpath('//*[@id="#ICSave"]').click()
            #Added index information as parameter to build dict objects
            self.scrapeAndModifySearch(courses, INDEX_NAME, TYPE_NAME)
            course_state[i][1] = "1"
            i = i + 1

        # Create the ES client
        esearch = Elasticsearch(hosts=[ES_HOST])

        # Replace existing index
        if esearch.indices.exists(INDEX_NAME):
            print("deleting %s index..." % INDEX_NAME)
            res = esearch.indices.delete(index=INDEX_NAME)
            print("response: %s" % res)

        # Index settings: single shard, no replicas
        request_body = {
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 0
            }
        }

        #create index
        print("creating %s index..." % INDEX_NAME)
        res = esearch.indices.create(index=INDEX_NAME, body=request_body)
        print("response %s" % res)

        #bulk index
        print("bulk indexing...")
        res = esearch.bulk(index=INDEX_NAME, body=courses, refresh=True)

        return course_state
Beispiel #54
0
es.indices.create(index='xinpei001', ignore=[400, 404])

INDEX = "xinpei001"
TYPE = "house"


# Turn the DataFrame records into bulk action/source line pairs for ES
def rec_to_actions(df):
    i = 0
    for rec in df.to_dict(orient="records"):
        i += 1
        yield ('{"index":{"_index":"%s","_type":"%s","_id":"%d"}}' %
               (INDEX, TYPE, i))
        yield (json.dumps(rec, default=int))

    # Note: this check only runs after all records have been yielded.
    if not es.indices.exists(INDEX):
        raise RuntimeError('Index does not exist')


# Import the data with a bulk request
r = es.bulk(rec_to_actions(data))
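
# Hedged follow-up sketch (not in the original notebook cell): after a bulk call it
# is worth checking the response for per-item failures before trusting the import.
# "r", "es", and INDEX are the response, client and index name defined above.
if r.get("errors"):
    failed = [item for item in r["items"] if item["index"].get("error")]
    print("bulk import finished with %d failed items" % len(failed))
else:
    es.indices.refresh(index=INDEX)
    print("indexed %d documents" % es.count(index=INDEX)["count"])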

Beispiel #55
0
targetList = ['Url','Target','Time Email Received', 'Recipient Address','Sender']
for hit in hitsList:
    data_dict = {}
    count = 0
    for item in hit:
        data_dict[(targetList[count])] = item
        count += 1
    op_dict = {
        "index": {
            "_index": 'threatelligence',
            "_type": 'PhishingAttacks',
        }
    }
    bulk_data.append(op_dict)
    bulk_data.append(data_dict)

endTime = time.time()
timeElapsed = endTime - startTime
print ("Time Taken: "),(timeElapsed)


# Let's create our index using the Python ES client.
# By default we assume the server is running on http://localhost:9200
es = Elasticsearch(hosts=['localhost:9200'])
# bulk index the data
res = es.bulk(index = 'threatelligence', body = bulk_data, refresh = True)

endTime = time.time()
# sends email notification
email = IntelNotify()
email.send_mail(len(hitsList),(endTime - startTime), os.path.basename(__file__))
Beispiel #56
0
class ElasticsearchDataStore(object):
    """Implements the datastore."""

    # Number of events to queue up when bulk inserting events.
    DEFAULT_FLUSH_INTERVAL = 20000
    DEFAULT_SIZE = 1000  # Max events to return

    def __init__(self, host='127.0.0.1', port=9200):
        """Create an Elasticsearch client."""
        super(ElasticsearchDataStore, self).__init__()
        self.client = Elasticsearch([{'host': host, 'port': port}], timeout=30)
        self.import_counter = collections.Counter()
        self.import_events = []

    def create_index(self, index_name):
        """Create an index.

    Args:
      index_name: Name of the index

    Returns:
      Index name in string format.
      Document type in string format.
    """
        if not self.client.indices.exists(index_name):
            try:
                self.client.indices.create(index=index_name)
            except exceptions.ConnectionError:
                raise RuntimeError('Unable to connect to backend datastore.')

        if not isinstance(index_name, six.text_type):
            index_name = codecs.decode(index_name, 'utf8')

        return index_name

    def delete_index(self, index_name):
        """Delete Elasticsearch index.

    Args:
      index_name: Name of the index to delete.
    """
        if self.client.indices.exists(index_name):
            try:
                self.client.indices.delete(index=index_name)
            except exceptions.ConnectionError as e:
                raise RuntimeError(
                    'Unable to connect to backend datastore: {}'.format(e))

    def import_event(self,
                     index_name,
                     event=None,
                     event_id=None,
                     flush_interval=DEFAULT_FLUSH_INTERVAL):
        """Add event to Elasticsearch.

    Args:
      index_name: Name of the index in Elasticsearch
      event: Event dictionary
      event_id: Event Elasticsearch ID
      flush_interval: Number of events to queue up before indexing

    Returns:
      The number of events processed.
    """
        if event:
            for k, v in event.items():
                if not isinstance(k, six.text_type):
                    k = codecs.decode(k, 'utf8')

                # Make sure we have decoded strings in the event dict.
                if isinstance(v, six.binary_type):
                    v = codecs.decode(v, 'utf8')

                event[k] = v

            # Header needed by Elasticsearch when bulk inserting.
            header = {'index': {'_index': index_name}}
            update_header = {'update': {'_index': index_name, '_id': event_id}}

            if event_id:
                # Event has "lang" defined if there is a script used for import.
                if event.get('lang'):
                    event = {'script': event}
                else:
                    event = {'doc': event}
                header = update_header

            self.import_events.append(header)
            self.import_events.append(event)
            self.import_counter['events'] += 1

            if self.import_counter['events'] % int(flush_interval) == 0:
                self.client.bulk(body=self.import_events)
                self.import_events = []
        else:
            # Import the remaining events in the queue.
            if self.import_events:
                self.client.bulk(body=self.import_events)

        return self.import_counter['events']

    @staticmethod
    def build_query(query_string):
        """Build Elasticsearch DSL query.

    Args:
      query_string: Query string

    Returns:
      Elasticsearch DSL query as a dictionary
    """

        query_dsl = {
            'query': {
                'bool': {
                    'must': [{
                        'query_string': {
                            'query': query_string
                        }
                    }]
                }
            }
        }

        return query_dsl

    def search(self, index_id, query_string, size=DEFAULT_SIZE):
        """Search ElasticSearch.

    This will take a query string from the UI together with a filter definition.
    Based on this it will execute the search request on ElasticSearch and get
    the result back.

    Args:
      index_id: Index to be searched
      query_string: Query string
      size: Maximum number of results to return

    Returns:
      Set of event documents in JSON format
    """

        query_dsl = self.build_query(query_string)

        # Default search type for elasticsearch is query_then_fetch.
        search_type = 'query_then_fetch'

        return self.client.search(body=query_dsl,
                                  index=index_id,
                                  size=size,
                                  search_type=search_type)
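

# Hedged usage sketch for ElasticsearchDataStore (not part of the original class).
# The index name and event contents below are made up; the pattern follows the
# import_event() contract: call it once per event, then once more with no event to
# flush whatever is still queued.
if __name__ == "__main__":
    store = ElasticsearchDataStore(host="127.0.0.1", port=9200)
    index_name = store.create_index("example-events")
    for i in range(50):
        store.import_event(index_name, event={"message": "event %d" % i})
    total = store.import_event(index_name)  # no event -> flush the remaining queue
    print("imported %d events" % total)
    print(store.search(index_name, "message:event*"))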
Beispiel #57
0
class SlowquerySender:
    def __init__(self):
        self._SLOWQUERYLOG_PREFIX = "slowquery/mysql-slowquery.log."

        self._GENERAL_CONFIG = {
            # Elasticsearch host name
            "ES_HOST": "192.168.0.1:4040",

            # Elasticsearch prefix for index name
            "INDEX_PREFIX": "rds_slowquery",

            # Elasticsearch type name is rds instance id
            "RDS_ID": "tb-master",

            # Timezone to convert log timestamps to. Leave blank to keep UTC.
            "TIMEZONE": "Asia/Seoul",

            # RDS region from which to fetch the slow query log.
            "AWS_RDS_REGION_ID": "ap-northeast-2",

            # If you have EC2 instances, the region and VPC containing them are
            # needed to map client IPs to instance names.
            "AWS_EC2_REGION_ID": "ap-northeast-2",
            "AWS_EC2_VPC_ID": "vpc-XXxxXXxx"
        }

        self._REGEX4REFINE = {
            "REG_TIME": re.compile(
                "^[a-zA-Z#:_ ]+([0-9.]+)[a-zA-Z:_ ]+([0-9.]+)[a-zA-Z:_ ]+([0-9.]+).[a-zA-Z:_ ]+([0-9.]+)$"),
        }

        self._LOG_CONFIG = {
            "LOG_OUTPUT_DIR": "/var/log/rdslog/slowquery2es.log",
            "RAW_OUTPUT_DIR": "/var/log/rdslog/slowquery"  # (Optional)
        }

        self._es = Elasticsearch(self._GENERAL_CONFIG["ES_HOST"])
        self._ec2dict = dict()
        self._last_time = ""
        self._data = list()
        self._new_doc = True
        self._num_of_total_doc = 0
        self._now = datetime.now()

        self._reaminer = RawFileRemainer(self._LOG_CONFIG["RAW_OUTPUT_DIR"])

    # Get raw data.
    def getRdsSlowQlog(self, log_filename):
        client = boto3.client("rds", region_name=self._GENERAL_CONFIG["AWS_RDS_REGION_ID"])
        db_files = client.describe_db_log_files(DBInstanceIdentifier=self._GENERAL_CONFIG["RDS_ID"])

        # filter() is lazy on Python 3, so use any() for the existence check.
        if not any(log["LogFileName"] == log_filename for log in db_files["DescribeDBLogFiles"]):
            return ""

        marker = "0"
        log_data = ""

        # Used like a do-while loop: fetch the first chunk, then keep fetching
        # while more data is pending.
        ret = client.download_db_log_file_portion(
            DBInstanceIdentifier=self._GENERAL_CONFIG["RDS_ID"],
            LogFileName=log_filename,
            Marker=marker,
            NumberOfLines=500)
        log_data = ret["LogFileData"]
        marker = ret["Marker"]

        while ret["AdditionalDataPending"]:
            ret = client.download_db_log_file_portion(
                DBInstanceIdentifier=self._GENERAL_CONFIG["RDS_ID"],
                LogFileName=log_filename,
                Marker=marker,
                NumberOfLines=500)
            print("keep going...")

            log_data += ret["LogFileData"]
            marker = ret["Marker"]

        # Delete old log files.
        self._reaminer.clearOutOfDateRawFiles()
        self._reaminer.makeRawLog("mysql-slowquery.log." + str((datetime.now().utcnow()).hour), log_data)
        return log_data

    def getRdsSlowQlog4Debug(self, path):
        content = ""
        import codecs
        f = codecs.open(path, "r", "utf-8")
        while True:
            l = f.readline()
            content += l
            if not l: break
        return content

    def validateLogDate(self, lines):
        delta = timedelta(hours=2)

        for line in lines:
            if not line:
                continue
            elif line.startswith("# Time: "):
                log_time = datetime.strptime(line[8:], "%y%m%d %H:%M:%S")
                log_time = log_time.replace(tzinfo=tz.tzutc()).astimezone(
                    zoneinfo.gettz(self._GENERAL_CONFIG["TIMEZONE"]))
                log_time = log_time.replace(tzinfo=None)
                print(self._now, log_time)
                print("diff :", self._now - log_time)
                if (self._now - log_time) > delta:
                    return False
                else:
                    return True

        return True

    # Initialization.
    def initLastTime(self, path):
        if not os.path.exists(path):
            cur_time = self._now.strftime("%y%m%d %H:%M:%S")
            self._last_time = datetime.strptime(cur_time, "%y%m%d %H:%M:%S").isoformat()
            return False

        last_re = re.compile("last_time : (.*)\\n?")

        ifs = open(path, "r")
        lines = ifs.readlines()

        for l in reversed(lines):
            if not l: continue
            m = last_re.match(l)
            if m is not None:
                self._last_time = (m.groups(0))[0]
                ifs.close()
                return True
        ifs.close()

        cur_time = self._now.strftime("%y%m%d %H:%M:%S")
        self._last_time = datetime.strptime(cur_time, "%y%m%d %H:%M:%S").isoformat()
        return False

    def initEC2InstancesInVpc(self, region, vpc):
        for attempt in range(3):
            try:
                ec2 = boto3.resource("ec2", region_name=region)
                vpc = ec2.Vpc(vpc)
                for i in vpc.instances.all():
                    for tag in i.tags:
                        if tag['Key'] == 'Name':
                            self._ec2dict[i.private_ip_address] = "".join(tag['Value'].split())
            except:
                time.sleep(3)
                print("sleeping..., because DescribeInstances have been failed.")
            else:
                break

    def setTargetIndex(self):
        self._ES_INDEX = self._GENERAL_CONFIG["INDEX_PREFIX"] + "-" + datetime.strftime(self._now, "%Y.%m")

    def createTemplate(self, template_name):
        template_body = {
            "template": "test_slowquerylog-*",
            "mappings": {
                self._GENERAL_CONFIG["RDS_ID"]: {
                    "properties": {
                        "query_time": {
                            "type": "float",
                            "index": "not_analyzed"},
                        "row_sent": {
                            "type": "integer",
                            "index": "not_analyzed"},
                        "rows_examined": {
                            "type": "integer",
                            "index": "not_analyzed"},
                        "lock_time": {
                            "type": "float",
                            "index": "not_analyzed"}
                    }
                }
            },
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 0 }
        }

        response = self._es.indices.put_template(name=template_name, body=template_body)
        if response["acknowledged"]:
            print("Create template success.")
        else:
            print("Create template failed.")

    def isNewDoc(self, line):
        if (self._new_doc) and (line.startswith("# Time: ") or line.startswith("# User@Host: ")):
            return True
        else:
            return False

    def refreshLastTime(self, line):
        timestamp = datetime.strptime(line[8:], "%y%m%d %H:%M:%S")
        if self._GENERAL_CONFIG["TIMEZONE"]:
            timestamp = timestamp.replace(tzinfo=tz.tzutc()).astimezone(
                zoneinfo.gettz(self._GENERAL_CONFIG["TIMEZONE"]))
        self._last_time = timestamp.isoformat()

    def removeDuplicatedLineFeed(self, s):
        stripped = s.strip()
        # Collapse multiple line feeds into a single line feed.
        stripped = re.sub(r"(\n)+", r"\n", stripped)
        return stripped

    def appendDoc2Data(self, doc, flush=False):
        doc["timestamp"] = self._last_time
        doc["sql"] = self.removeDuplicatedLineFeed(doc["sql"])
        self._data.append({"index": {
            "_index": self._ES_INDEX,
            "_type": self._GENERAL_CONFIG["RDS_ID"]}})
        self._data.append(doc)

        self._num_of_total_doc += 1
        if len(self._data) > 100000 or flush:
            print("Sending bulk request...")
            self._es.bulk(index=self._ES_INDEX, body=self._data, refresh=flush)
            # Clear the buffer so already-sent documents are not indexed again.
            self._data = []

    def initNewDoc(self, doc, l1, l2, i):
        if l1.startswith("# Time: "):
            self.refreshLastTime(l1)
            doc["timestamp"] = self._last_time
            i += 2  # Because we use two lines above.
        elif l1.startswith("# User@Host: "):
            doc["timestamp"] = self._last_time
            l2 = l1
            i += 1
        else:
            doc["timestamp"] = self._last_time
            print("There is an another pattern!")

        doc["user"] = l2.split("[")[1].split("]")[0]
        doc["client"] = l2.split("[")[2].split("]")[0]
        doc["client_id"] = l2.split(" Id: ")[1]
        ip_addr = doc["client"]
        if ip_addr not in self._ec2dict:
            doc["name"] = "Missed"
        else:
            doc["name"] = self._ec2dict[ip_addr]

        self._new_doc = False

        return doc, i

    def run(self):
        log_filename = self._SLOWQUERYLOG_PREFIX + str((self._now.utcnow()).hour)
        log_data = self.getRdsSlowQlog(log_filename)

        if not log_data:
            print("%s does not exist!" % (log_filename))
            return -1

        lines = log_data.split("\n")
        if len(lines) > 0:
            if not self.validateLogDate(lines):
                print("%s already read log!" % (log_filename))
                return -2
        else:
            print("%s is empty!" % (log_filename))
            return -3

        # Get ready for extracting log file.
        self.initLastTime(self._LOG_CONFIG["LOG_OUTPUT_DIR"])
        self.initEC2InstancesInVpc(
            self._GENERAL_CONFIG["AWS_EC2_REGION_ID"],
            self._GENERAL_CONFIG["AWS_EC2_VPC_ID"])
        self.setTargetIndex()
        self.createTemplate(self._GENERAL_CONFIG["INDEX_PREFIX"])

        print("%s : Ready to write %s in %s" % (str(datetime.now()), log_filename, self._ES_INDEX))
        i = 0
        doc = {}

        # Skip leading lines until the first new-document marker appears.
        while not self.isNewDoc(lines[i]):
            i += 1

        while i < len(lines):
            line = lines[i]

            if self.isNewDoc(line):
                if doc:
                    self.appendDoc2Data(doc)
                    doc = {}

                doc, i = self.initNewDoc(doc, lines[i], lines[i + 1], i)
                line = lines[i]

            if line.startswith("# Query_time: "):
                m = self._REGEX4REFINE["REG_TIME"].match(line).groups(0)
                doc["query_time"] = m[0]
                doc["lock_time"] = m[1]
                doc["rows_sent"] = m[2]
                doc["rows_examined"] = m[3]
            else:
                if doc.get("sql"):
                    doc["sql"] += "\n" + line
                else:
                    doc["sql"] = line
                self._new_doc = True

            i += 1

        if doc:
            self.appendDoc2Data(doc, flush=True)

        print("Written Slow Queries : %s" % str(self._num_of_total_doc))
        print("last_time : %s" % (self._last_time))
Beispiel #58
0
                      'mappings': {
                          'dictionary_datas': {
                              'properties': {
                                  'id': {
                                      'type': 'long'
                                  },
                                  'title': {
                                      'type': 'text',
                                      'analyzer': 'my_analyzer'
                                  }
                              }
                          }
                      }
                  })

# Open the JSON file
with open(DICT_DIR, encoding='utf-8') as json_file:
    json_data = json.loads(json_file.read())

# Pack the dictionary entries into the bulk request body
body = ""
for i in json_data:
    body = body + json.dumps(
        {'index': {
            '_index': 'dictionary',
            '_type': 'dictionary_datas'
        }}) + '\n'
    body = body + json.dumps(i, ensure_ascii=False) + '\n'

es.bulk(body)
Beispiel #59
0
def elasticsearch_create(event, _):
    # Conditionally creates and loads Elasticsearch products index
    # If the products index already exists, this function does nothing.
    # Otherwise, this function will create the products index and add
    # all products from the bundled products.yaml file.

    es_domain_endpoint = event['ResourceProperties'][
        'ElasticsearchDomainEndpoint']
    logger.info('Elasticsearch endpoint: ' + es_domain_endpoint)

    es_host = {
        'host': es_domain_endpoint,
        'port': 443,
        'scheme': 'https',
    }

    # For testing: specify 'ForceIndex' to force existing index to be deleted and products indexed.
    force_index = event['ResourceProperties'].get(
        'ForceIndex', 'no').lower() in ['true', 'yes', '1']

    es = Elasticsearch(hosts=[es_host],
                       timeout=30,
                       max_retries=10,
                       retry_on_timeout=True)

    create_index_and_bulk_load = True

    if es.indices.exists(INDEX_NAME):
        logger.info(f'{INDEX_NAME} already exists')
        create_index_and_bulk_load = False

        if force_index:
            logger.info(f'Deleting "{INDEX_NAME}"...')
            res = es.indices.delete(index=INDEX_NAME)
            logger.debug(" response: '%s'" % (res))
            create_index_and_bulk_load = True
    else:
        logger.info('Index does not exist')

    if create_index_and_bulk_load:
        request_body = {
            "settings": {
                "number_of_shards": 1,
                "number_of_replicas": 0
            }
        }
        logger.info(f'Creating "{INDEX_NAME}" index...')
        res = es.indices.create(index=INDEX_NAME, body=request_body)
        logger.debug(" response: '%s'" % (res))

        logger.info('Downloading products.yaml...')
        s3.meta.client.download_file(event['ResourceProperties']['Bucket'],
                                     event['ResourceProperties']['File'],
                                     '/tmp/products.yaml')
        with open('/tmp/products.yaml') as file:
            logger.info('Loading products.yaml...')
            products_list = yaml.safe_load(file)

            logger.info(
                f'Bulk indexing {len(products_list)} products in batches...')
            bulk_data = []

            for product in products_list:
                bulk_data.append({
                    "index": {
                        "_index": INDEX_NAME,
                        "_type": TYPE_NAME,
                        "_id": product[ID_FIELD]
                    }
                })
                bulk_data.append(product)

                if len(bulk_data) >= MAX_BULK_BATCH_SIZE:
                    es.bulk(index=INDEX_NAME, body=bulk_data)
                    bulk_data = []

            if len(bulk_data) > 0:
                es.bulk(index=INDEX_NAME, body=bulk_data)

        logger.info('Products successfully indexed!')

        helper.Data['Output'] = 'Elasticsearch product index populated'
    else:
        helper.Data['Output'] = 'Elasticsearch product index already exists'

    return es_domain_endpoint
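

# Hedged local-invocation sketch (not part of the original handler): the function
# expects a CloudFormation-style custom resource event; the endpoint, bucket and
# key below are placeholders, not real resources.
if __name__ == "__main__":
    fake_event = {
        "ResourceProperties": {
            "ElasticsearchDomainEndpoint": "search-example.us-east-1.es.amazonaws.com",
            "Bucket": "example-bucket",
            "File": "data/products.yaml",
            "ForceIndex": "no",
        }
    }
    print(elasticsearch_create(fake_event, None))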
Beispiel #60
0
class ElasticSearchUtil:
    def __init__(self, host):
        self.host = host
        self.conn = Elasticsearch([self.host])

    def __del__(self):
        self.close()

    def check(self):
        '''
        Return basic information about the connected ES cluster.
        :return:
        '''
        return self.conn.info()

    def insertDocument(self, index, type, body, id=None):
        '''
        Insert a single document (body) into the given index and type.
        If no id is given, ES generates one automatically.
        :param index: target index
        :param type: target type
        :param body: document to insert (dict)
        :param id: optional custom document id
        :return:
        '''
        return self.conn.index(index=index, doc_type=type, body=body, id=id)

    def insertDataFrame(self, index, type, dataFrame):
        '''
        Bulk insert via the bulk API.
        The bulk body is a flat list of [{optionType: condition}, data, ...] pairs,
        where optionType can be index, delete or update, the condition can override
        the index/type per item, and data is the document to insert or update.
        :param index: default index to insert into
        :param type: default type to insert into
        :param dataFrame: pandas DataFrame holding the documents
        :return:
        '''
        dataList = dataFrame.to_dict(orient='records')
        print(dataList)
        insertHeadInfoList = [{"index": {}} for i in range(len(dataList))]
        print(insertHeadInfoList)
        temp = [dict] * (len(dataList) * 2)
        temp[::2] = insertHeadInfoList
        temp[1::2] = dataList
        print(temp)
        try:
            return self.conn.bulk(index=index, doc_type=type, body=temp)
        except Exception as e:
            return str(e)

    def deleteDocById(self, index, type, id):
        '''
        Delete the document with the given index, type and id.
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.delete(index=index, doc_type=type, id=id)

    def deleteDocByQuery(self, index, query, type=None):
        '''
        Delete all documents under the index that match the query.
        :param index:
        :param query: query body in DSL format
        :param type:
        :return:
        '''
        return self.conn.delete_by_query(index=index, body=query, doc_type=type)

    def deleteAllDocByIndex(self, index, type=None):
        '''
        Delete all documents under the given index.
        :param index:
        :return:
        '''
        try:
            query = {'query': {'match_all': {}}}
            return self.conn.delete_by_query(index=index, body=query, doc_type=type)
        except Exception as e:
            return str(e) + ' -> ' + index

    def searchDoc(self, index=None, type=None, body=None):
        '''
        Search the index for all documents matching the query body.
        :param index:
        :param type:
        :param body: query body in DSL format
        :return:
        '''
        return self.conn.search(index=index, doc_type=type, body=body)

    def getDocById(self, index, type, id):
        '''
        Get the document with the given index, type and id.
        :param index:
        :param type:
        :param id:
        :return:
        '''
        return self.conn.get(index=index, doc_type=type, id=id)

    def updateDocById(self, index, type, id, body=None):
        '''
        Update the document with the given index, type and id.
        :param index:
        :param type:
        :param id:
        :param body: fields to update
        :return:
        '''
        return self.conn.update(index=index, doc_type=type, id=id, body=body)


    def close(self):
        if self.conn is not None:
            try:
                self.conn.close()
            except Exception:
                pass
            finally:
                self.conn = None

    def test(self, index, type, body):
        query = self.conn.update_by_query(index=index, doc_type=type, body=body, request_timeout=60)
        print(query)
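

# Hedged usage sketch for ElasticSearchUtil (not in the original example). The host
# and the index/type names are placeholders; it exercises the single-document insert
# and the DataFrame bulk path shown above.
if __name__ == "__main__":
    import pandas as pd

    util = ElasticSearchUtil("127.0.0.1:9200")
    print(util.check())
    util.insertDocument("demo_index", "base", {"title": "hello"}, id=1)
    df = pd.DataFrame([{"title": "bulk one"}, {"title": "bulk two"}])
    print(util.insertDataFrame("demo_index", "base", df))
    print(util.getDocById("demo_index", "base", 1))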