class ESLayers(object):
    """Implementation of Elastic Search as layers backend"""

    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def _transform(self, layer, version, layer_name):
        """Add some meta data fields which are ES specific"""
        layer = dict(layer)     # copy
        label = layer['label']
        del layer['label']
        return {
            'id': '%s/%s/%s' % (version, layer_name, label),
            'version': version,
            'name': layer_name,
            'label': label,
            'layer': layer
        }

    def bulk_put(self, layers, version, layer_name, root_label):
        """Store all layer objects"""
        self.es.bulk_index(
            settings.ELASTIC_SEARCH_INDEX, 'layer',
            map(lambda l: self._transform(l, version, layer_name), layers))

    def get(self, name, label, version):
        """Find the layer that matches these parameters"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'layer',
                                 version + '/' + name + '/' + label)
            return result['_source']['layer']
        except ElasticHttpNotFoundError:
            return None

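A minimal usage sketch of the layers backend above. The settings values and the sample layer data are invented for illustration; only the ESLayers API as defined here and a reachable Elasticsearch node are assumed.

# Hypothetical wiring: the sample layer dict is made up; each layer must
# carry a 'label' key, which _transform() splits out into the document id.
layers_backend = ESLayers()
sample_layers = [{'label': '1005-1-a', 'title': 'Definitions'}]
layers_backend.bulk_put(sample_layers, 'v1', 'toc', root_label='1005')

# Retrieval uses the same (name, label, version) triple that built the id.
layer = layers_backend.get('toc', '1005-1-a', 'v1')
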
def delete(config, tree_names, all, force):
    """Delete indices and their catalog entries.

    This deletes the indices that have the format version of the copy of DXR
    this runs under.
    """
    es = ElasticSearch(config.es_hosts)
    if all:
        echo('Deleting catalog...')
        es.delete_index(config.es_catalog_index)
        # TODO: Delete tree indices as well.
    else:
        for tree_name in tree_names:
            frozen_id = '%s/%s' % (FORMAT, tree_name)
            try:
                frozen = es.get(config.es_catalog_index, TREE, frozen_id)
            except ElasticHttpNotFoundError:
                raise ClickException('No tree "%s" in catalog.' % tree_name)
            # Delete the index first. That way, if that fails, we can still
            # try again; we won't have lost the catalog entry. Refresh is
            # infrequent enough that we wouldn't avoid a race around a
            # catalogued but deleted instance the other way around.
            try:
                es.delete_index(frozen['_source']['es_alias'])
            except ElasticHttpNotFoundError:
                # It's already gone. Fine. Just remove the catalog entry.
                pass
            es.delete(config.es_catalog_index, TREE, frozen_id)

class ESDiffs(object):
    """Implementation of Elastic Search as diff backend"""

    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    @staticmethod
    def to_id(label, old, new):
        return "%s/%s/%s" % (label, old, new)

    def put(self, label, old_version, new_version, diff):
        """Store a diff between two versions of a regulation node"""
        struct = {
            'label': label,
            'old_version': old_version,
            'new_version': new_version,
            'diff': diff
        }
        self.es.index(settings.ELASTIC_SEARCH_INDEX, 'diff', struct,
                      id=self.to_id(label, old_version, new_version))

    def get(self, label, old_version, new_version):
        """Find the associated diff"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'diff',
                                 self.to_id(label, old_version, new_version))
            return result['_source']['diff']
        except ElasticHttpNotFoundError:
            return None

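For context, a hypothetical round trip through ESDiffs; the label, version strings, and diff payload are invented, and only the class above is assumed.

diffs = ESDiffs()
diffs.put('1005-2', '2011-12121', '2012-01010', {'op': 'modified'})

# Returns the stored diff dict, or None when that (label, old, new) trio
# was never indexed.
stored = diffs.get('1005-2', '2011-12121', '2012-01010')
missing = diffs.get('1005-2', 'no-such', 'versions')   # -> None
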
class ESNotices(object):
    """Implementation of Elastic Search as notice backend"""

    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def put(self, doc_number, notice):
        """Store a single notice"""
        self.es.index(settings.ELASTIC_SEARCH_INDEX, 'notice', notice,
                      id=doc_number)

    def get(self, doc_number):
        """Find the associated notice"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'notice',
                                 doc_number)
            return result['_source']
        except ElasticHttpNotFoundError:
            return None

    def listing(self, part=None):
        """All notices or filtered by cfr_part"""
        if part:
            query = {'match': {'cfr_part': part}}
        else:
            query = {'match_all': {}}
        query = {'fields': ['effective_on', 'fr_url', 'publication_date'],
                 'query': query}
        notices = []
        results = self.es.search(query, doc_type='notice', size=100,
                                 index=settings.ELASTIC_SEARCH_INDEX)
        for notice in results['hits']['hits']:
            notice['fields']['document_number'] = notice['_id']
            notices.append(notice['fields'])
        return notices

def export_serie(id, output):
    result = []
    es = ElasticSearch(CONTEXT["datahub-store"])
    series = es.get(CONTEXT["datahub-index"], "_all",
                    id)["_source"]["data"]["series"][0]["data"]
    for key, value in series:
        date = datetime.datetime.strptime(key, "%Y-%m-%d")
        output.write("%s/%s/%s,%s\n" % (date.month, date.day, date.year,
                                        value))

class ESBase(object):
    """Shared code for Elastic Search storage models"""

    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def safe_fetch(self, doc_type, id):
        """Attempt to retrieve a document from Elastic Search.
        :return: Found document, if it exists, otherwise None"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, doc_type, id)
            return result['_source']
        except ElasticHttpNotFoundError:
            return None

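ESBase centralizes the try/except around es.get(), so a backend that inherits it can reduce its lookups to a single call. A sketch under that assumption; the subclass name and doc type are hypothetical, not part of the code above.

class ESNoticesFromBase(ESBase):
    """Hypothetical notice backend built on the shared safe_fetch helper."""

    def get(self, doc_number):
        # Returns the notice source dict, or None when the id is absent.
        return self.safe_fetch('notice', doc_number)
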
class ESRegulations(object):
    """Implementation of Elastic Search as regulations backend"""

    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def get(self, label, version):
        """Find the regulation label + version"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                                 version + '/' + label)
            reg_node = result['_source']
            del reg_node['regulation']
            del reg_node['version']
            del reg_node['label_string']
            del reg_node['id']
            return reg_node
        except ElasticHttpNotFoundError:
            return None

    def _transform(self, reg, version):
        """Add some meta data fields which are ES specific"""
        node = dict(reg)    # copy
        node['version'] = version
        node['label_string'] = '-'.join(node['label'])
        node['regulation'] = node['label'][0]
        node['id'] = version + '/' + node['label_string']
        node['root'] = len(node['label']) == 1
        return node

    def bulk_put(self, regs, version, root_label):
        """Store all reg objects"""
        self.es.bulk_index(settings.ELASTIC_SEARCH_INDEX, 'reg_tree',
                           map(lambda r: self._transform(r, version), regs))

    def listing(self, label=None):
        """List regulation version-label pairs that match this label (or are
        root, if label is None)"""
        if label is None:
            query = {'match': {'root': True}}
        else:
            query = {'match': {'label_string': label}}
        query = {'fields': ['label_string', 'version'], 'query': query}
        result = self.es.search(query, index=settings.ELASTIC_SEARCH_INDEX,
                                doc_type='reg_tree', size=100)
        return sorted((res['fields']['version'],
                       res['fields']['label_string'])
                      for res in result['hits']['hits'])

class GetLastUpdate():

    def __init__(self):
        self.es = ElasticSearch(config.DATABASE_URL)

    def get(self, query):
        query = query.split(',')
        last_update = 0
        for q in query:
            l = self.es.get(q, 'seq', 'last_seq')['_source']
            if last_update == 0 or l['time'] < last_update:
                last_update = l['time']
        lu = {}
        lu['last_update'] = last_update
        return json.dumps({"results": lu})

class ESNotices(object):
    """Implementation of Elastic Search as notice backend"""

    def __init__(self):
        self.es = ElasticSearch(settings.ELASTIC_SEARCH_URLS)

    def put(self, doc_number, notice):
        """Store a single notice"""
        self.es.index(settings.ELASTIC_SEARCH_INDEX, 'notice', notice,
                      id=doc_number)

    def get(self, doc_number):
        """Find the associated notice"""
        try:
            result = self.es.get(settings.ELASTIC_SEARCH_INDEX, 'notice',
                                 doc_number)
            return result['_source']
        except ElasticHttpNotFoundError:
            return None

    def listing(self, part=None):
        """All notices or filtered by cfr_part"""
        if part:
            query = {'match': {'cfr_parts': part}}
        else:
            query = {'match_all': {}}
        query = {
            'fields': ['effective_on', 'fr_url', 'publication_date'],
            'query': query
        }
        notices = []
        results = self.es.search(query, doc_type='notice', size=100,
                                 index=settings.ELASTIC_SEARCH_INDEX)
        for notice in results['hits']['hits']:
            notice['fields']['document_number'] = notice['_id']
            notices.append(notice['fields'])
        return notices

class ElasticConnector(Connector):
    """Class for connectors that operate with an elasticsearch database"""
    MAX_SIZE = 1000

    def __init__(self, database, host='http://localhost:9200/'):
        self.client = ElasticSearch(host)
        self.index = database
        self.create_index()

    def query_to_id(self, query):
        """Returns an id representation of the specified query.

        This is a temporary method used as a replacement for an
        elasticsearch query search.
        """
        return "_".join(str(k) + "_" + str(v)
                        for k, v in query.items()).replace("/", "_")

    def create_index(self):
        """Creates the specified index, or catches the exception raised if it
        has already been created"""
        try:
            self.client.create_index(self.index)
        except Exception as e:
            pass

    def set_dynamic_mapping(self, collection):
        """Sets dynamic mapping for a specified document type"""
        self.client.put_mapping(self.index, collection, {'dynamic': True})

    def save_block(self, block):
        """Saves operation info in the database"""
        super().save_block(block)
        collection = block.get_collection()
        dictionary = block.to_dict()
        query = block.get_query()
        self.update_by_query(collection, query, block)

    def update_by_query(self, collection, query, document):
        """Sets dynamic mapping for the specified collection, then derives a
        new id for the document from its query and saves it in the database
        as a new object"""
        try:
            self.set_dynamic_mapping(collection)
            document_id = document.get_id()
            document_body = document.to_dict()
            if "_id" in document_body.keys():
                del document_body['_id']
            self.client.index(self.index, collection, document_body,
                              id=self.query_to_id(query))
        except Exception as e:
            print(e)
            pass

    def find_last_block(self):
        """Finds the last block index stored as the value field of the status
        document with the fixed id"""
        try:
            document = self.client.get(self.index, 'status',
                                       'height_all_tsx')['_source']
            return document['value']
        except ElasticHttpNotFoundError as e:
            return 0

    def update_last_block(self, last_block):
        """Updates the last block index stored as the value field of the
        status document with the fixed id"""
        self.client.index(self.index, 'status', {'value': last_block},
                          id='height_all_tsx')

    def save_instance(self, instance):
        """Saves an account or comment object"""
        self.update_by_query(instance.get_collection(), instance.get_query(),
                             instance)

    def get_instances_to_update(self, collection):
        """Finds and returns all dictionaries with objects that should be
        updated"""
        hits = self.client.search("need_update:true", index=self.index,
                                  doc_type=collection,
                                  size=self.MAX_SIZE)['hits']['hits']
        return [{**hit['_source'], **{"_id": hit["_id"]}} for hit in hits]

    def update_instances(self, collection, instances):
        """Resets the need_update flag for all instances in a list, keyed by
        their ids in the _id field"""
        for instance in instances:
            self.client.update(self.index, collection, instance["_id"],
                               doc={'need_update': False})

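A short usage sketch of the connector's status bookkeeping. The 'blocks' database name is invented, and the Connector base class is assumed to come from the surrounding project; only the methods defined above are used.

connector = ElasticConnector('blocks')      # hypothetical index name
height = connector.find_last_block()        # 0 on a fresh index
# ... process blocks height .. height + n, then persist the new cursor ...
connector.update_last_block(height + 100)
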
        'age': 31,
        'title': 'Programmer'
    }, {
        'id': 3,
        'name': 'Freddy Coder抽',
        'age': 29,
        'title': 'Office Assistant'
    }]

es.bulk((es.index_op(doc, id=doc.pop('id')) for doc in docs),
        index='test', doc_type='test')
es.refresh('test')

res1 = es.get('test', 'test', 1)

# Full-text match; note that Chinese and English are tokenized differently.
# https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html
res8 = es.search(index='test', size=2,
                 query={"query": {"query_string": {"query": "抽"}}})

# Prefix match query; only accepts lowercase.
res12 = es.search(index='test', query={"query": {"prefix": {"title": "p"}}})

class Elastic(DataLayer):
    """ElasticSearch data layer."""

    serializers = {
        'integer': int,
        'datetime': parse_date
    }

    def init_app(self, app):
        app.config.setdefault('ELASTICSEARCH_URL', 'http://localhost:9200/')
        app.config.setdefault('ELASTICSEARCH_INDEX', 'eve')
        self.es = ElasticSearch(app.config['ELASTICSEARCH_URL'])
        self.index = app.config['ELASTICSEARCH_INDEX']

    def _get_field_mapping(self, schema):
        """Get mapping for given field schema."""
        if schema['type'] == 'datetime':
            return {'type': 'date'}
        elif schema['type'] == 'string' and schema.get('unique'):
            return {'type': 'string', 'index': 'not_analyzed'}
        elif schema['type'] == 'string':
            return {'type': 'string'}

    def put_mapping(self, app):
        """Put mapping for elasticsearch for current schema.

        It's not called automatically now, but rather left for the user to
        call whenever it makes sense.
        """
        for resource, resource_config in app.config['DOMAIN'].items():
            properties = {}
            properties[config.DATE_CREATED] = self._get_field_mapping(
                {'type': 'datetime'})
            properties[config.LAST_UPDATED] = self._get_field_mapping(
                {'type': 'datetime'})

            for field, schema in resource_config['schema'].items():
                field_mapping = self._get_field_mapping(schema)
                if field_mapping:
                    properties[field] = field_mapping

            # TODO: config.SOURCES not available yet
            # (self._datasource_ex(resource))
            datasource = (resource, )
            mapping = {}
            mapping[datasource[0]] = {'properties': properties}
            self.es.put_mapping(self.index, datasource[0], mapping)

    def find(self, resource, req, sub_resource_lookup):
        """
        TODO: implement sub_resource_lookup
        """
        query = {
            'query': {
                'query_string': {
                    'query': request.args.get('q', '*'),
                    'default_field': request.args.get('df', '_all'),
                    'default_operator': 'AND'
                }
            }
        }

        if not req.sort and self._default_sort(resource):
            req.sort = self._default_sort(resource)

        # skip sorting when there is a query to use score
        if req.sort and 'q' not in request.args:
            query['sort'] = []
            sort = ast.literal_eval(req.sort)
            for (key, sortdir) in sort:
                sort_dict = dict([(key, 'asc' if sortdir > 0 else 'desc')])
                query['sort'].append(sort_dict)

        if req.where:
            where = json.loads(req.where)
            if where:
                query['filter'] = {
                    'term': where
                }

        if req.max_results:
            query['size'] = req.max_results

        if req.page > 1:
            query['from'] = (req.page - 1) * req.max_results

        source_config = config.SOURCES[resource]
        if 'facets' in source_config:
            query['facets'] = source_config['facets']

        try:
            args = self._es_args(resource)
            args['es_fields'] = self._fields(resource)
            return self._parse_hits(self.es.search(query, **args), resource)
        except es_exceptions.ElasticHttpError:
            return ElasticCursor()

    def find_one(self, resource, **lookup):
        args = self._es_args(resource)
        args['es_fields'] = self._fields(resource)

        if config.ID_FIELD in lookup:
            try:
                hit = self.es.get(id=lookup[config.ID_FIELD], **args)
            except es_exceptions.ElasticHttpNotFoundError:
                return

            if not hit['exists']:
                return

            doc = hit.get('fields', hit.get('_source', {}))
            doc['_id'] = hit.get('_id')
            convert_dates(doc, self._dates(resource))
            return doc
        else:
            query = {
                'query': {
                    'constant_score': {
                        'filter': {
                            'term': lookup
                        }
                    }
                }
            }

            try:
                args['size'] = 1
                docs = self._parse_hits(self.es.search(query, **args),
                                        resource)
                return docs.first()
            except es_exceptions.ElasticHttpNotFoundError:
                return None

    def find_list_of_ids(self, resource, ids, client_projection=None):
        args = self._es_args(resource)
        args['es_fields'] = self._fields(resource)
        return self._parse_hits(self.es.multi_get(ids, **args), resource)

    def insert(self, resource, doc_or_docs, **kwargs):
        ids = []
        kwargs.update(self._es_args(resource))
        for doc in doc_or_docs:
            doc.update(self.es.index(doc=doc, id=doc.get('_id'), **kwargs))
            ids.append(doc['_id'])
        self.es.refresh(self.index)
        return ids

    def update(self, resource, id_, updates):
        args = self._es_args(resource, refresh=True)
        return self.es.update(id=id_, doc=updates, **args)

    def replace(self, resource, id_, document):
        args = self._es_args(resource, refresh=True)
        args['overwrite_existing'] = True
        return self.es.index(document=document, id=id_, **args)

    def remove(self, resource, id_=None):
        args = self._es_args(resource, refresh=True)
        if id_:
            return self.es.delete(id=id_, **args)
        else:
            try:
                return self.es.delete_all(**args)
            except es_exceptions.ElasticHttpNotFoundError:
                return

    def _parse_hits(self, hits, resource):
        """Parse hits response into documents."""
        return ElasticCursor(hits, self._dates(resource))

    def _es_args(self, resource, refresh=None):
        """Get index and doctype args."""
        datasource = self._datasource(resource)
        args = {
            'index': self.index,
            'doc_type': datasource[0],
        }
        if refresh:
            args['refresh'] = refresh
        return args

    def _fields(self, resource):
        """Get projection fields for given resource."""
        datasource = self._datasource(resource)
        keys = datasource[2].keys()
        return ','.join(keys)

    def _default_sort(self, resource):
        datasource = self._datasource(resource)
        return datasource[3]

    def _dates(self, resource):
        dates = [config.LAST_UPDATED, config.DATE_CREATED]
        datasource = self._datasource(resource)
        schema = config.DOMAIN[datasource[0]]['schema']
        for field, field_schema in schema.items():
            if field_schema['type'] == 'datetime':
                dates.append(field)
        return dates

class GetGrowing():

    def __init__(self):
        self.count_fake = 0
        self.es = ElasticSearch(config.DATABASE_URL)
        self.overflow = 1000000

    def fake_suffix(self):
        self.count_fake += 1
        return 'X'*(5-min(len(str(self.count_fake)), 4))+str(self.count_fake)

    def create_fake_request(self, original_request, member_of_campaign,
                            status='upcoming', total=None):
        fake_request = {}
        fake_request['status'] = status
        fake_request['member_of_campaign'] = member_of_campaign
        for f in ['pwg', 'priority', 'total_events', 'time_event']:
            fake_request[f] = original_request[f]
        if total is not None:
            fake_request['total_events'] = total
        else:
            fake_request['total_events'] = 0
        fake_request['prepid'] = '-'.join([original_request['pwg'],
                                           member_of_campaign,
                                           self.fake_suffix()])
        return fake_request

    def get(self, campaign):
        arg_list = campaign.split(',')
        # get all chained campaigns which contain the selected campaign
        # reduction to only chained campaigns
        while True:
            again = False
            for arg in arg_list:
                if not arg.startswith('chain'):
                    # this is a flow, or a campaign: does not matter for the query
                    ccs = [s['_source'] for s in
                           self.es.search(('campaigns:%s' % arg),
                                          index='chained_campaigns',
                                          size=self.overflow)['hits']['hits']]
                    arg_list.extend(map(lambda cc: cc['prepid'], ccs))
                    # arg is going to be duplicated
                    arg_list.remove(arg)
                    again = True
                    break
            if not again:
                break
        # arg_list contains only chained campaigns
        steps = []   # what are the successive campaigns
        all_cr = []  # what are the chained requests to look at
        all_cc = {}
        # unique it
        arg_list = list(set(arg_list))
        # collect all cr
        for a_cc in arg_list:
            try:
                mcm_cc = self.es.get('chained_campaigns', 'chained_campaign',
                                     a_cc)['_source']
            except Exception:
                # try to see if that's a flow
                # TODO: patch for this exception
                return '%s does not exists' % (a_cc)
            all_cc[a_cc] = mcm_cc  # keep it in mind
            all_cr.extend([s['_source'] for s in
                           self.es.search(('member_of_campaign:%s' % a_cc),
                                          index='chained_requests',
                                          size=self.overflow)['hits']['hits']])
            these_steps = map(lambda s: s[0], mcm_cc['campaigns'])
            if len(steps) == 0:
                steps = these_steps
            else:
                # concatenate to existing steps
                # add possible steps at the beginning
                connection = 0
                while not steps[connection] in these_steps:
                    connection += 1
                new_start = these_steps.index(steps[connection])
                if new_start != 0:
                    # they do not start at the same campaign
                    for where in range(new_start):
                        steps.insert(where, these_steps[where])
                # verify strict overlapping
                # ==> does not function properly and limits the flexibility
                for check in range(new_start, len(these_steps)):
                    if these_steps[check] not in steps:
                        steps.append(these_steps[check])
        # preload all requests !!!
        all_requests = {}
        for step in steps:
            for r in [s['_source'] for s in
                      self.es.search(('member_of_campaign:%s' % step),
                                     index='requests',
                                     size=self.overflow)['hits']['hits']]:
                all_requests[r['prepid']] = r
        req_copy = dict(all_requests)
        # avoid double counting
        already_counted = set()
        # the list of requests to be emitted to d3js
        list_of_request_for_ramunas = []
        for cr in all_cr:
            upcoming = 0
            if len(cr['chain']) == 0:
                # crap data
                continue
            stop_at = cr['step']
            stop_at = len(cr['chain'])-1
            for (r_i, r) in enumerate(cr['chain']):
                if r_i > stop_at:
                    # this is a reserved request, will count as upcoming later
                    continue
                mcm_r = all_requests[r]
                try:
                    del req_copy[r]
                except KeyError:
                    pass
                upcoming = int(mcm_r['total_events']*abs(mcm_r['efficiency']))
                if r in already_counted:
                    continue
                else:
                    already_counted.add(r)

                # add it to emit
                def pop(mcm_r):
                    for member in mcm_r.keys():
                        if member not in ['prepid', 'pwg', 'efficiency',
                                          'total_events', 'status', 'priority',
                                          'member_of_campaign', 'time_event']:
                            mcm_r.pop(member)
                    return mcm_r

                if mcm_r['status'] == 'done':
                    if (not len(mcm_r['output_dataset']) or
                            mcm_r['total_events'] == -1):
                        mcm_r['total_events'] = 0
                    else:
                        mcm_r['total_events'] = mcm_r['completed_events']
                if mcm_r['status'] == 'submitted':
                    try:
                        if not len(mcm_r['reqmgr_name']):
                            mcm_r['total_events'] = 0
                    except KeyError:
                        pass
                if mcm_r['status'] == 'submitted':
                    mcm_r_fake_done = copy.deepcopy(mcm_r)
                    mcm_r_fake_done['status'] = 'done'
                    mcm_r_fake_done['total_events'] = mcm_r['completed_events']
                    mcm_r_fake_subm = copy.deepcopy(mcm_r)
                    mcm_r_fake_subm['total_events'] = max(
                        [0, mcm_r['total_events'] - mcm_r['completed_events']])
                    list_of_request_for_ramunas.append(pop(mcm_r_fake_subm))
                    list_of_request_for_ramunas.append(pop(mcm_r_fake_done))
                else:
                    if mcm_r['total_events'] == -1:
                        mcm_r['total_events'] = 0
                    list_of_request_for_ramunas.append(pop(mcm_r))
            for noyet in all_cc[cr[
                    'member_of_campaign']]['campaigns'][stop_at+1:]:
                # create a fake request with the proper member of campaign
                processing_r = all_requests[cr['chain'][stop_at]]
                fake_one = self.create_fake_request(processing_r, noyet[0],
                                                    total=upcoming)
                list_of_request_for_ramunas.append(fake_one)
        # add requests that do not belong to a chain (from the original campaign)
        for r in req_copy:
            r = req_copy[r]
            if r['member_of_campaign'] == campaign:
                if r['status'] == 'done':
                    if (not len(r['output_dataset']) or
                            r['total_events'] == -1):
                        r['total_events'] = 0
                    else:
                        r['total_events'] = r['completed_events']
                if r['total_events'] == -1:
                    r['total_events'] = 0
                list_of_request_for_ramunas.append(r)
        return json.dumps({"results": list_of_request_for_ramunas})

    'UBERON:0001911': 'mammary gland',
    'UBERON:0001630': 'muscle organ',
    'UBERON:0000007': 'pituitary gland',
    'UBERON:0002370': 'thymus',
    'UBERON:0000478': 'extraembryonic structure'
}

s = connection.search(query, index=index, size=20000)
results = s['hits']['hits']
terms = []

# Loop compares the terms from the different indexes
for result in results:
    try:
        s1 = connection.get(index_compared, 'basic', result['_id'])
        for k in result['_source'].viewkeys() & s1['_source'].viewkeys():
            # Checking for differences between documents in different indexes
            if result['_source'][k] != s1['_source'][k]:
                if k == 'organs' or k == 'systems':
                    # I am worried only about CL terms.
                    if 'CL' in result['_id']:
                        # I don't want any duplicates
                        if result['_source'] not in terms:
                            terms.append(result['_source'])
    except:
        print result['_id']

# Initializes the graph structure and appends edges for each term
G = nx.DiGraph()
for term in results:

class Datahub():
    context = None
    _es = None
    index_name = None
    user_id = None

    def __init__(self, context, index=None, user_id=None):
        self.context = context
        self._es = ElasticSearch(self.context['datahub-store'])
        self.index_name = index if index is not None \
            else self.context['datahub-index']
        self.user_id = user_id

    def _get_user_id(self):
        if self.user_id is None:
            return "public"
        else:
            return "user_%s" % (self.user_id)

    def query(self, name, type_name="_all", es_from=0):
        if self.user_id is None:
            q = '%s AND owner:public' % name
        else:
            q = '%s AND (owner:public OR owner:"%s")' % (
                name, self._get_user_id())
        results = self._es.search(q, index=self.index_name,
                                  doc_type=type_name, es_from=es_from)
        if results['hits']['total'] > 0:
            return (map(lambda s: (s['_id'], s['_source']),
                        results['hits']['hits']),
                    results['hits']['total'],
                    results['took'])
        else:
            return None, results['hits']['total'], results['took']

    def get(self, key, type_name="_all"):
        result = {}
        try:
            result = self._es.get(self.index_name, type_name, id=key)
            if (result['_source']['owner'] == 'public' or
                    result['_source']['owner'] == self._get_user_id()):
                return result['_source']
            else:
                return None
        except ElasticHttpNotFoundError:
            return None

    def index(self, name, display_type, data, category="private",
              source="private", zone="private", description=""):
        serie = {
            "name": name,
            "owner": self._get_user_id(),
            "display": display_type,
            "zone": zone,
            "category": category,
            "source": source,
            "description": description,
            "data": data
        }
        id = self._es.index(self.index_name, display_type, serie)
        return id['_id']

    def get_user_series(self):
        series = []
        q = 'owner:%s' % (self._get_user_id())
        results = self._es.search(q, index=self.index_name, doc_type='_all',
                                  es_from=0, size=999)
        series = map(lambda serie: '%s;%s;%s' % (
            serie['_id'],
            serie['_source']['name'],
            cjson.encode(serie['_source']['data']['series'][0]['data'])),
            results['hits']['hits'])
        return series

class ElasticSearchDataStore(datastore.DataStore):
    """Implements the API."""

    def __init__(self, index_list):
        # Connect to the Elasticsearch server.
        self.client = ElasticSearch('http://%s:%s/' % (
            ELASTICSEARCH_SERVER_IP, ELASTICSEARCH_PORT))
        # TODO Refactor this to not need the index list at this stage.
        self.index_list = index_list

    def search(self, sketch, query, filters):
        """Search ElasticSearch.

        This will take a query string from the UI together with a filter
        definition. Based on this it will send the search request to
        elasticsearch and get the result back.

        Args:
            sketch -- string, sketch ID
            query -- string, query string
            filters -- dict, Dictionary containing filters to apply

        Returns:
            Set of event documents in JSON format
        """
        if filters.get("time_start", None):
            query = {
                "query": {
                    "query_string": {
                        "query": query
                    }
                },
                "filter": {
                    "range": {
                        "datetime": {
                            "gte": filters['time_start'],
                            "lte": filters['time_end']
                        }
                    }
                },
                "sort": {
                    "datetime": "asc"
                }
            }
        elif filters.get("star", None):
            query = {
                "query": {
                    "match_all": {}
                },
                "filter": {
                    "nested": {
                        "path": "timesketch_label",
                        "filter": {
                            "bool": {
                                "must": [
                                    {
                                        "term": {
                                            "timesketch_label.name":
                                                "__ts_star"
                                        }
                                    },
                                    {
                                        "term": {
                                            "timesketch_label.sketch":
                                                str(sketch)
                                        }
                                    }
                                ]
                            }
                        }
                    }
                },
                "sort": {
                    "datetime": "asc"
                }
            }
        else:
            query = {
                "query": {
                    "query_string": {
                        "query": query
                    }
                },
                "sort": {
                    "datetime": "asc"
                }
            }

        return self.client.search(query, index=self.index_list,
                                  doc_type="plaso_event", size=500)

    def get_single_event(self, event_id):
        """Get a single event document from elasticsearch.

        Args:
            event_id -- string, event ID

        Returns:
            Event document as JSON
        """
        return self.client.get(index=self.index_list[0],
                               doc_type="plaso_event", id=event_id)

    def add_label_to_event(self, event, sketch, user, label, toggle=False):
        """Add a label to an event document in ElasticSearch.

        Args:
            event -- string, event ID
            sketch -- string, sketch ID
            user -- string, user ID
            label -- string, the label to apply
            toggle -- Bool, Toggle label or create a new one

        Returns:
            HTTP status code

        In order for this to work, we need to add a mapping for this nested
        document. This needs to be done when the index is first created.
        mapping = {
            "plaso_event": {
                "properties": {
                    "timesketch_label": {
                        "type": "nested"
                    }
                }
            }
        }
        """
        doc = self.client.get(self.index_list, "plaso_event", event)
        try:
            doc['_source']['timesketch_label']
        except KeyError:
            doc = {"timesketch_label": []}
            self.client.update(self.index_list, "plaso_event", event, doc=doc)

        if toggle:
            script_string = "if(ctx._source.timesketch_label.contains"\
                            "(timesketch_label)) {ctx._source.timesketch_label"\
                            ".remove(timesketch_label)} else {ctx._source."\
                            "timesketch_label += timesketch_label}"
        else:
            script_string = "if( ! ctx._source.timesketch_label.contains"\
                            "(timesketch_label)) {ctx._source.timesketch_label"\
                            "+= timesketch_label}"

        script = {
            "script": script_string,
            "params": {
                "timesketch_label": {
                    "name": label,
                    "user": user,
                    "sketch": sketch
                }
            }
        }

        self.client.update(self.index_list, "plaso_event", event, script)

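A hypothetical call into the datastore above; the index name, sketch id, query string, and timestamps are made up, and only the class as defined here (plus the module-level ELASTICSEARCH_SERVER_IP/PORT constants it references) is assumed.

datastore = ElasticSearchDataStore(['timesketch-index'])
# Supplying "time_start"/"time_end" selects the range-filtered branch of
# search(); the result is the raw Elasticsearch response dict.
hits = datastore.search(
    sketch='1',
    query='evil.exe',
    filters={'time_start': '2015-01-01T00:00:00',
             'time_end': '2015-01-02T00:00:00'})
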
#!/usr/bin/env python
# coding:utf-8
from pyelasticsearch import ElasticSearch
from pyelasticsearch import bulk_chunks

es = ElasticSearch('http://localhost:9200/')


def create_index(es):
    # _index, _type, _source, _id
    es.index('contacts', 'person',
             {'name': 'Joe Tester', 'age': 25, 'title': 'QA Master'}, id=1)
    docs = [{'id': 2, 'name': 'Jessica Coder', 'age': 32,
             'title': 'Programmer'},
            {'id': 3, 'name': 'Freddy Tester', 'age': 29,
             'title': 'Office Assistant'}]
    es.bulk((es.index_op(doc, id=doc.pop('id')) for doc in docs),
            index='contacts',
            doc_type='person')
    es.refresh('contacts')


def delete_index(es):
    es.delete_index('contacts')


# create_index(es)
print es.get('contacts', 'person', 2)
print es.search('name:joe OR name:freddy', index='contacts')

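The script imports bulk_chunks but never uses it. For larger loads, pyelasticsearch's documented pattern is to feed index ops through bulk_chunks so each bulk request stays bounded; a sketch under that assumption (the documents() generator and its field values are made up):

def documents():
    # Yield one index operation per generated document.
    for i in range(1000):
        yield es.index_op({'name': 'Tester %d' % i, 'age': i % 80})


# Send the operations in chunks of at most 500 docs / ~10 KB each.
for chunk in bulk_chunks(documents(), docs_per_chunk=500,
                         bytes_per_chunk=10000):
    es.bulk(chunk, index='contacts', doc_type='person')
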
from pyelasticsearch import ElasticSearch
from pprint import pprint

es = ElasticSearch('http://localhost:9200/')

pprint(es.get('pin', 'bm', 1))
pprint(es.search('tags:blog', size=2, index='pin'))
pprint(es.search('tags:blog AND genomics', index='pin'))

class GetHistorical():
    '''
    Used to return a list of points for historical plots
    '''

    def __init__(self):
        self.es = ElasticSearch(config.DATABASE_URL)
        # normally es will crop results to 20
        # and a million rows is more than we have in db
        self.overflow = 1000000

    def select_dataset(self, ds1, ds2):
        '''
        This selection is from statsMonitoring.py
        '''
        t1 = ds1.split('/')[1:]
        t2 = ds2.split('/')[1:]
        if len(t1[1]) > len(t2[1]):
            return 1
        else:
            def tierP(t):
                tierPriority = [
                    '/RECO',
                    'SIM-RECO',
                    'DIGI-RECO',
                    'AOD',
                    'SIM-RAW-RECO',
                    'DQM',
                    'GEN-SIM',
                    'RAW-RECO',
                    'USER',
                    'ALCARECO']
                for (p, tier) in enumerate(tierPriority):
                    if tier in t:
                        return p
                return t

            p1 = tierP(t1[2])
            p2 = tierP(t2[2])
            decision = (p1 > p2)
            if t1[2] == 'AODSIM' and t2[2] == 'MINIAODSIM':
                decision = True
            return decision * 2 - 1

    def db_query(self, input):
        '''
        Query DB and return array of raw documents
        '''
        iterable = []
        # try to query for campaign and get list of requests
        req_arr = [s['_source'] for s in
                   self.es.search(('member_of_campaign:%s' % input),
                                  index='requests',
                                  size=self.overflow)['hits']['hits']]
        # if empty, assume input is a request
        if not len(req_arr):
            try:
                req_arr = [self.es.get('requests', 'request',
                                       input)['_source']]
            except:
                # if an exception is thrown this may be a workflow
                iterable = [input]

        # iterate over array and collect details
        for req in req_arr:
            try:
                dataset_list = req['output_dataset']
                if len(dataset_list):
                    dataset_list.sort(cmp=self.select_dataset)
                    ds = dataset_list[0]
                else:
                    ds = None
                for reqmgr in req['reqmgr_name']:
                    i = {}
                    i['expected'] = req['total_events']
                    i['name'] = reqmgr
                    i['output_dataset'] = ds
                    i['priority'] = req['priority']
                    i['pwg'] = req['pwg']
                    i['request'] = True
                    i['status'] = req['status']
                    iterable.append(i)
            except:
                pass

        # iterate over workflows and yield documents
        for i in iterable:
            if 'request' in i:
                try:
                    yield [i['request'],
                           self.es.get('stats', 'stats',
                                       i['name'])['_source'],
                           i]
                except:
                    yield [True, None, i]
            else:
                try:
                    yield [False,
                           self.es.get('stats', 'stats', i)['_source'],
                           None]
                except:
                    yield [False, None, None]

    def rm_useless(self, arr):
        '''
        Compressing data: remove first probe of resubmissions and points
        that are equal to the previous measurement
        '''
        r = []
        prev = {'e': -1, 'x': -1}
        for (x, a) in enumerate(arr):
            if ((a['e'] != prev['e'] or a['x'] != prev['x'])
                    and (a['e'] != 0 or x == 0)):
                r.append(a)
                prev = a
        return r

    def prepare_response(self, query, probe, p_min, p_max, status_i, pwg_i):
        stop = False
        r = []
        status = {}
        pwg = {}
        for q in query:
            # Process the db documents
            for (is_request, document, details) in self.db_query(q):
                # skip empty documents
                if document is None:
                    continue
                # filter out requests
                if is_request:
                    def get_filter_dict(doc, arr, inp):
                        if doc not in arr:
                            arr[doc] = False
                            if inp is None:
                                arr[doc] = True
                            else:
                                for i in inp:
                                    if i == doc:
                                        arr[doc] = True
                                        break
                        return arr

                    # generate status dict
                    status = get_filter_dict(details['status'], status,
                                             status_i)
                    # generate pwg dict
                    pwg = get_filter_dict(details['pwg'], pwg, pwg_i)

                    # pwg filtering
                    if not (pwg_i is None or details['pwg'] in pwg_i):
                        continue
                    # status filtering
                    if not (status_i is None or
                            details['status'] in status_i):
                        continue
                    # priority filtering
                    if (details['priority'] < p_min or
                            (details['priority'] > p_max and p_max != -1)):
                        continue
                    # skip requests with an undesired output dataset
                    if (document['pdmv_dataset_name'] !=
                            details['output_dataset']):
                        if (details['output_dataset'] is not None and
                                document['pdmv_dataset_name'] != 'None Yet'):
                            continue

                # create an array of requests to be processed
                response = {}
                response['data'] = []
                response['request'] = document['pdmv_prep_id']

                # taskchain handling
                if not is_request and (document['pdmv_type'] == 'TaskChain'):
                    # load taskchain instead of normal request
                    for t in document['pdmv_monitor_taskchain']:
                        res = {}
                        res['request'] = t['dataset']
                        res['data'] = []
                        for record in t['monitor']:
                            if len(record['pdmv_monitor_time']):
                                data = {}
                                data['e'] = record['pdmv_evts_in_DAS']
                                data['t'] = time.mktime(time.strptime(
                                    record['pdmv_monitor_time']))*1000
                                data['x'] = document['pdmv_expected_events']
                                res['data'].append(data)
                        r.append(res)
                    re = {}
                    re['data'] = r
                    re['status'] = {}
                    re['pwg'] = {}
                    re['taskchain'] = True
                    stop = True
                else:
                    if 'pdmv_monitor_history' in document:
                        for record in document['pdmv_monitor_history']:
                            if len(record['pdmv_monitor_time']):
                                data = {}
                                if (details is None or
                                        details['output_dataset'] is not None):
                                    # e is events in das
                                    data['e'] = record['pdmv_evts_in_DAS']
                                else:
                                    # if the output in mcm is not specified yet,
                                    # treat as this has not produced anything
                                    # ensures present=historical
                                    data['e'] = 0
                                data['t'] = time.mktime(time.strptime(
                                    record['pdmv_monitor_time']))*1000
                                # x is expected events
                                if is_request:
                                    data['x'] = details['expected']
                                else:
                                    data['x'] = document[
                                        'pdmv_expected_events']
                                response['data'].append(data)
                    r.append(response)
        if stop:
            return re

        # Step 1: Get accumulated requests
        tmp = {}
        for x in r:
            s = x['request']
            if s not in tmp:
                tmp[s] = {}
                tmp[s]['data'] = []
            tmp[s]['data'] += x['data']
            tmp[s]['data'] = sorted(tmp[s]['data'], key=lambda e: e['t'])
            tmp[s]['data'] = self.rm_useless(tmp[s]['data'])

        # Step 2: Get and sort timestamps
        times = []
        for t in tmp:
            times += (x['t'] for x in tmp[t]['data'])
        times = sorted(set(times))
        if len(times) > (probe-1):
            skiper = len(times) / (probe-1)
        else:
            skiper = -1
        filter_times = []
        i = 0
        for (x, t) in enumerate(times):
            if i < skiper and x < len(times) - 1 and x != 0:
                i += 1
            else:
                filter_times.append(t)
                i = 0

        # Step 3 & 4: Cycle through requests and add data points
        data = []
        for ft in filter_times:
            d = {'e': 0, 't': ft, 'x': 0}
            for t in tmp:
                prevx = {'e': 0, 'x': 0}
                for (i, x) in enumerate(tmp[t]['data']):
                    if x['t'] > ft:
                        d['e'] += prevx['e']
                        d['x'] += prevx['x']
                        break
                    elif x['t'] == ft or i == len(tmp[t]['data'])-1:
                        d['e'] += x['e']
                        d['x'] += x['x']
                        break
                    else:
                        prevx = x
            data.append(d)

        re = {}
        re['data'] = data
        re['status'] = status
        re['pwg'] = pwg
        re['taskchain'] = False
        return re

    def get(self, query, probe=100, priority_min=0, priority_max=-1,
            status=None, pwg=None):
        return json.dumps({"results": self.prepare_response(
            query.split(','), probe, priority_min, priority_max, status,
            pwg)})

class GetLifetime():

    def __init__(self):
        self.es = ElasticSearch(config.DATABASE_URL)
        # normally es will crop results
        # and a million rows is more than we have in db
        self.overflow = 1000000

    def db_query(self, input):
        """
        Query DB and return array of raw documents
        """
        iterable = []
        try:
            # check if the input is a campaign
            req_arr = [s['_source'] for s in
                       self.es.search(('member_of_campaign:%s' % input),
                                      index='requests',
                                      size=self.overflow)['hits']['hits']]
            for r in req_arr:
                res = ([s['name'] for s in
                        self.es.get('requests', 'request',
                                    r['prepid'])['_source']['reqmgr_name']])
                for e in res:
                    iterable.append(e)
        except:
            pass

        if not len(iterable):
            try:
                # check if the input is a request
                iterable = [s['name'] for s in
                            self.es.get('requests', 'request',
                                        input)['_source']['reqmgr_name']]
            except:
                # input can be a reqmgr_name
                iterable = [input]

        for i in iterable:
            try:
                yield self.es.get('stats', 'stats', i)['_source']
            except:
                yield None

    def rm_useless(self, arr):
        r = []
        prev = {'a': -1, 'e': -1, 'x': -1}
        for a in arr:
            if (a['a'] != prev['a'] or a['e'] != prev['e'] or
                    a['x'] != prev['x']):
                r.append(a)
                prev = a
        return r

    def prepare_response(self, query):
        #print "Start"
        #prev = int(round(time.time() * 1000))
        #print prev
        r = []

        # Process the db documents
        for d in self.db_query(query):
            if d is None:
                continue
            response = {}
            response['campaign'] = d['pdmv_campaign']
            response['data'] = []
            response['input'] = query
            response['priority'] = d['pdmv_priority']
            response['pwg'] = '#HaveToQueryRequest'
            response['request'] = d['pdmv_prep_id']
            response['status'] = '#HaveToQueryRequest'
            response['title'] = d['pdmv_prep_id'] + d['pdmv_dataset_name']
            if 'pdmv_monitor_history' in d:
                for record in d['pdmv_monitor_history']:
                    if len(record['pdmv_monitor_time']):
                        data = {}
                        data['a'] = (record['pdmv_evts_in_DAS'] +
                                     record['pdmv_open_evts_in_DAS'])
                        data['e'] = record['pdmv_evts_in_DAS']
                        data['t'] = time.mktime(time.strptime(
                            record['pdmv_monitor_time']))*1000
                        data['x'] = d['pdmv_expected_events']
                        response['data'].append(data)
            r.append(response)

        #print "Data prepared"
        #print int(round(time.time() * 1000)) - prev
        #prev = int(round(time.time() * 1000))

        # Step 1: Get accumulated requests
        tmp = {}
        for x in r:
            s = x['request']
            try:
                tmp[s] += x['data']
            except KeyError:
                tmp[s] = x['data']
            tmp[s] = self.rm_useless(tmp[s])

        #for name in tmp:
        #    tmp[name] = sorted(tmp[name], key=lambda e: e['t'])

        #print "Accum request"
        #print int(round(time.time() * 1000)) - prev
        #prev = int(round(time.time() * 1000))

        # Step 2: Get and sort timestamps
        times = []
        for t in tmp:
            times += (x['t'] for x in tmp[t])
        times = sorted(set(times))

        #print "Sorted times"
        #print int(round(time.time() * 1000)) - prev
        #prev = int(round(time.time() * 1000))

        '''
        Step 3 & Step 4
        data = []
        for t in times:
            dummy = {'a': 0, 'e': 0, 'x': 0, 't': t}
            for name in tmp:
                pre = {'a': 0, 'e': 0, 'x': 0}
                for i in xrange(len(tmp[name])):
                    if tmp[name][i]['t'] == t:
                        dummy['a'] += tmp[name][i]['a']
                        dummy['e'] += tmp[name][i]['e']
                        dummy['x'] += tmp[name][i]['x']
                        break
                    elif tmp[name][i]['t'] > t:
                        dummy['a'] += pre['a']
                        dummy['e'] += pre['e']
                        dummy['x'] += pre['x']
                        break
                    elif tmp[name][i]['t'] < t:
                        pre = tmp[name][i]
            data.append(dummy)
        '''

        # Step 3: Create dummy points for each request
        tmp2 = {}
        for t in tmp:
            nxw = []
            cur_index = 0
            dummy = {'a': 0, 'e': 0, 'x': 0}
            listed = sorted(tmp[t], key=lambda e: e['t'])
            for a in times:
                if cur_index < len(listed) and a == listed[cur_index]['t']:
                    dummy = listed[cur_index]
                    cur_index += 1
                dummy['t'] = a
                nxw.append(dummy)
            tmp2[t] = nxw

        #print "Dummy points"
        #print int(round(time.time() * 1000)) - prev
        #prev = int(round(time.time() * 1000))

        # get only 1000 points
        skiper = len(times) / 20

        # Step 4: Generating data points
        data = []
        i = 0
        for (x, t) in enumerate(times):
            if i < skiper and x < len(times) - 1 and x != 0:
                i += 1
            else:
                i = 0
                d = {'a': 0, 'e': 0, 't': t, 'x': 0}
                for m in tmp2:
                    d['a'] += tmp2[m][x]['a']
                    d['e'] += tmp2[m][x]['e']
                    d['x'] += tmp2[m][x]['x']
                data.append(d)

        #print "Data points"
        #print int(round(time.time() * 1000)) - prev

        print len(data)
        return data

    def get(self, query):
        return json.dumps({"results": self.prepare_response(query)})

class GetChain():

    def __init__(self):
        self.countDummy = 0
        self.es = ElasticSearch(config.DATABASE_URL)
        self.overflow = 1000000

    def fakeId(self):
        self.countDummy += 1
        return 'X'*(5-len('%d' % (self.countDummy)))+'%d' % (self.countDummy)

    def __createDummyRequest(self, req, memberOfCampaign, status='upcoming',
                             total=None):
        fake_r = {}
        fake_r['status'] = status
        fake_r['member_of_campaign'] = memberOfCampaign
        for member in ['pwg', 'priority', 'total_events', 'time_event']:
            fake_r[member] = req[member]
        if total is not None:
            fake_r['total_events'] = total
        fake_r['prepid'] = '-'.join([req['pwg'], memberOfCampaign,
                                     self.fakeId()])
        fake_r['cloned_from'] = req['prepid']
        return fake_r

    def get(self, campaign):
        arg_list = campaign.split(',')
        # Get all chained campaigns which contain the selected campaign
        # reduction to only chained campaigns
        while True:
            again = False
            for arg in arg_list:
                if not arg.startswith('chain'):
                    # this is a flow, or a campaign: does not matter for the query
                    ccs = [s['_source'] for s in
                           self.es.search(('campaigns:%s' % arg),
                                          index='chained_campaigns',
                                          size=self.overflow)['hits']['hits']]
                    arg_list.extend(map(lambda cc: cc['prepid'], ccs))
                    arg_list.remove(arg)
                    again = True
                    break
            if not again:
                break
        # arg_list contains only chained campaigns
        steps = []   # what are the successive campaigns
        all_cr = []  # what are the chained requests to look at
        all_cc = {}
        # unique it
        arg_list = list(set(arg_list))
        # collect all crs
        for a_cc in arg_list:
            try:
                mcm_cc = self.es.get('chained_campaigns', 'chain_campaign',
                                     a_cc)['_source']
            except Exception:
                # try to see if that's a flow
                return '%s does not exists' % (a_cc)
            all_cc[a_cc] = mcm_cc  # keep it in mind
            all_cr.extend([s['_source'] for s in
                           self.es.search(('member_of_campaign:%s' % a_cc),
                                          index='chained_requests',
                                          size=self.overflow)['hits']['hits']])
            these_steps = map(lambda s: s[0], mcm_cc['campaigns'])
            if len(steps) == 0:
                steps = these_steps
            else:
                # concatenate to existing steps
                # add possible steps at the beginning
                connection = 0
                while not steps[connection] in these_steps:
                    connection += 1
                new_start = these_steps.index(steps[connection])
                if new_start != 0:
                    # they do not start at the same campaign
                    for where in range(new_start):
                        steps.insert(where, these_steps[where])
                # verify strict overlapping
                # ==> does not function properly and limits the flexibility
                for check in range(new_start, len(these_steps)):
                    if these_steps[check] not in steps:
                        steps.append(these_steps[check])
        # preload all requests !!!
        all_requests = {}
        for step in steps:
            for r in [s['_source'] for s in
                      self.es.search(('member_of_campaign:%s' % step),
                                     index='requests',
                                     size=self.overflow)['hits']['hits']]:
                all_requests[r['prepid']] = r
        # avoid double counting
        already_counted = set()
        # the list of requests to be emitted to d3js
        list_of_request_for_ramunas = []
        for cr in all_cr:
            upcoming = 0
            if len(cr['chain']) == 0:
                # crap data
                continue
            stop_at = cr['step']
            stop_at = len(cr['chain'])-1
            for (r_i, r) in enumerate(cr['chain']):
                if r_i > stop_at:
                    # this is a reserved request, will count as upcoming later
                    continue
                mcm_r = all_requests[r]
                upcoming = mcm_r['total_events']
                if r in already_counted:
                    continue
                else:
                    already_counted.add(r)

                # add it to emit
                def pop(mcm_r):
                    for member in mcm_r.keys():
                        if member not in ['prepid', 'pwg', 'priority',
                                          'total_events', 'status',
                                          'member_of_campaign', 'time_event']:
                            mcm_r.pop(member)
                    return mcm_r

                if mcm_r['status'] == 'submitted':
                    mcm_r_fake_done = copy.deepcopy(mcm_r)
                    mcm_r_fake_done['status'] = 'done'
                    mcm_r_fake_done['total_events'] = mcm_r['completed_events']
                    mcm_r_fake_subm = copy.deepcopy(mcm_r)
                    mcm_r_fake_subm['total_events'] = max(
                        [0, mcm_r['total_events'] - mcm_r['completed_events']])
                    list_of_request_for_ramunas.append(pop(mcm_r_fake_subm))
                    list_of_request_for_ramunas.append(pop(mcm_r_fake_done))
                else:
                    list_of_request_for_ramunas.append(pop(mcm_r))
            for noyet in all_cc[cr[
                    'member_of_campaign']]['campaigns'][stop_at+1:]:
                # create a fake request with the proper member of campaign
                processing_r = all_requests[cr['chain'][stop_at]]
                fake_one = self.__createDummyRequest(processing_r, noyet[0],
                                                     total=upcoming)
                list_of_request_for_ramunas.append(fake_one)
        return json.dumps({"results": list_of_request_for_ramunas})
