def missing_field(doctype=None, field='_source', stats_only=True):
    """Find documents that lack a given field.

    Parameters
    ----
    doctype : string (default=None)
        restrict the search to this doctype; search all doctypes when None
    field : string (default='_source')
        the field whose absence is searched for
    stats_only : bool (default=True)
        when True, return summary statistics instead of the raw hits

    Returns
    ----
    list of the matching hits when stats_only is False, otherwise a dict
    with 'doctype', 'field', 'missing', 'total' and 'percentage_missing'
    """
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not get documents missing a field: No database instance available"
        )
        return []
    query = {'query': {'bool': {'must_not': {'exists': {'field': field}}}}}
    if not doctype:
        result = _client.search(_elastic_index, body=query)
    else:
        result = _client.search(_elastic_index, doctype, body=query)
    if not stats_only:
        return result['hits']['hits']
    # BUG FIX: the original `doctype and search(doctype) or search()` idiom
    # fell through to the unrestricted search whenever the per-doctype total
    # was 0; an explicit if/else keeps the correct total even when it is zero.
    if doctype:
        total = _client.search(_elastic_index, doctype)['hits']['total']
    else:
        total = _client.search(_elastic_index)['hits']['total']
    missing = result['hits']['total']
    stats = {
        'doctype': doctype if doctype else '*',
        'field': field,
        'missing': missing,
        'total': total,
        # guard against ZeroDivisionError on an empty index
        'percentage_missing': (missing / float(total)) * 100 if total else 0.0
    }
    return stats
def list_apps(service_name=None):
    """Lists the API apps registered in this INCA instance

    Parameters
    ----
    service_name : string (default=None)
        The name of the service, such as 'twitter', 'facebook' or
        'google'. If no name is provided, apps for all services are
        returned

    Returns
    ----
    Dictionary or list
        When a service_name is provided, a list of apps registered for
        this service are returned. Otherwise, a dictionary with the
        structure `{service_name : [app1, app2, app3]}` are returned.
    """
    res = _client.search(index='.apps', doc_type=service_name, size=10000)

    # IDIOM FIX: named helper instead of a lambda assigned to a name (PEP 8)
    def _get_num_credentials(app):
        # credentials live under the doctype "<service>_<appname>"
        return _client.search(
            index='.credentials',
            doc_type=app['_type'] + "_" + app['_id'],
            size=0)['hits']['total']

    apps = {}
    for app in res['hits']['hits']:
        if service_name and service_name != app['_type']:
            continue
        app_ob = {'name': app['_id'],
                  'credentials': _get_num_credentials(app)}
        # setdefault replaces the original get()/append branching
        apps.setdefault(app['_type'], []).append(app_ob)
    if service_name:
        return apps.get(service_name, [])
    return apps
def list_assets():
    """Return a short summary record for every document in the 'assets' index."""
    summaries = []
    for asset in client.search('assets')['hits']['hits']:
        summaries.append({
            'id': asset.get('_id'),
            'name': dotkeys(asset, '_source.name'),
            'project': dotkeys(asset, '_source.project'),
            'added': dotkeys(asset, '_source.added_at'),
            'length': len(dotkeys(asset, '_source.content')),
        })
    return summaries
def document_generator(query="*"):
    """A generator to get results for a query

    Parameters
    ----
    query : string (default="*") or dict
        A string query specifying the documents to return or a dict
        that is a elasticsearch query

    Yields
    ----
    dict representing a document
    """
    if not _DATABASE_AVAILABLE:
        _logger.warning("Unable to generate documents, no database available!")
        return  # early exit instead of wrapping everything in an else
    if query == "*":
        _logger.info("No query specified, returning all documents")
    # IDIOM FIX: isinstance() instead of type(x) == str/dict comparisons
    if isinstance(query, str):
        _logger.info(
            "String input: searching for {query}".format(query=query))
        es_query = {
            "query": {
                "bool": {
                    "must": {
                        "query_string": {
                            "query": query
                        }
                    }
                }
            }
        }
        _logger.debug("query: {es_query}".format(es_query=es_query))
    elif isinstance(query, dict):
        _logger.info("Dict input: using input as ES query")
        _logger.debug(
            "query: {query}".format(query=_json.dumps(query, indent=2)))
        es_query = query
    else:
        _logger.warning("Unknown input")
        es_query = False
    if es_query:
        total = _client.search(_elastic_index, body=es_query,
                               size=0)['hits']['total']
        for num, doc in enumerate(_scroll_query(es_query)):
            if not num % 10:
                _logger.info("returning {num} of {total}".format(
                    num=num, total=total))
            yield doc
def get_asset(id=None, name=None):
    """Retrieve a single asset by id or by name.

    Parameters
    ----
    id : string (default=None)
        the document id of the asset; takes precedence over `name`
    name : string (default=None)
        the asset name to match; must match exactly one asset

    Returns
    ----
    the asset document, or an empty dict when not found / ambiguous,
    or False when neither id nor name is given
    """
    if not id and not name:
        # BUG FIX: was `info.warning(...)` — `info` is undefined; the rest
        # of this function logs through `logger`
        logger.warning("requires either id or name! None given...")
        return False
    if id:
        return client.get('assets', id=id)
    units = client.search(
        'assets',
        body={'filter': {'match': {'name': name}}})['hits']['hits']
    if len(units) == 1:
        return units[0]
    if len(units) < 1:
        # BUG FIX: the "no asset found" and "ambiguous" messages were
        # swapped between the zero-hit and multi-hit branches
        logger.info("no asset found matching this {name}[{id}]".format(**locals()))
        return {}
    logger.warning("ambiguous designation! Use ID?")
    return {}
def list_doctypes():
    """Return a mapping of each known doctype to its total document count."""
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not list documents: No database instance available")
        return []
    mappings = _client.indices.get_mapping(_elastic_index).get(
        _elastic_index, {}).get('mappings', {})
    overview = {}
    for doctype in mappings.keys():
        # internal mapping entries are not real doctypes
        if doctype in ('_default_', 'core.document'):
            continue
        hits = _client.search(index=_elastic_index, doc_type=doctype)
        overview[doctype] = hits.get('hits', {}).get('total', "NA")
    return overview
def list_credentials(service_name, app_name):
    """Lists the credentials associated with a registered app

    Parameters
    ----
    service_name : string
        a string specifying the name of the service the app targets,
        such as 'twitter','facebook' or 'google'
    app_name : string
        a string with the internal application name

    Returns
    ----
    A list of credentials belonging to the application
    """
    # credentials are stored under the doctype "<service>_<appname>"
    doc_type = "{}_{}".format(service_name, app_name)
    response = _client.search(index=".credentials", doc_type=doc_type)
    return response['hits']['hits']
def doctype_inspect(doctype):
    '''Show some information about documents of a specified type

    Parameters
    ----------
    doctype : string
        string specifying the doctype to examine (see list_doctypes
        for available documents)

    Returns
    -------
    dictionary
        summary of documents of the specified type:
        total collected : integer
            the amount of documents of this type (approximation)
        first_collected : _datetime
            the minimal 'META.ADDED' field of these documents, which
            specifies the oldest documents
        last_collected : _datetime
            the maximum 'META.ADDED' field of these documents which
            specifies when the last document of this type was collected
        keys : dictionary
            <keyname> : dictionary
                coverage : float
                    the proportion of documents that have this key
                type : string
                    the elasticsearch index type of this field
    '''
    firstdocs = doctype_first(doctype, by_field="META.ADDED")
    lastdocs = doctype_last(doctype, by_field="META.ADDED")

    def _added_at(docs):
        # META.ADDED of the first hit, or None when there are no hits.
        # BUG FIX: last_collected previously used `docs and ...` without
        # `or None`, so an empty result produced [] instead of None and the
        # and/or idiom also broke on falsy timestamps.
        if not docs:
            return None
        return docs[0].get('_source', {}).get('META', {}).get("ADDED", None)

    summary = dict(
        total_collected=_client.search(
            index=_elastic_index, doc_type=doctype)['hits']['total'],
        first_collected=_added_at(firstdocs),
        last_collected=_added_at(lastdocs),
        keys=doctype_fields(doctype))
    return summary
def doctype_examples(doctype, field=None, seed=42, num=10):
    """Return a reproducible random sample of documents of a doctype.

    Parameters
    ----
    doctype : string
        the doctype to sample from
    field : string or iterable of strings (default=None)
        when given, return only these (dotted) fields of each document
    seed : int (default=42)
        seed for the random scoring, making samples reproducible
    num : int (default=10)
        number of documents to return

    Returns
    ----
    list of raw hits, of single field values, or of {field: value} dicts
    """
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not get example documents: No database instance available")
        return []
    # hoist check_mapping(): it was called once per branch before
    mapping_kind = check_mapping(doctype)
    if mapping_kind == "mixed_mapping":
        field2 = "doctype.keyword"
    elif mapping_kind == "new_mapping":
        field2 = "doctype"
    else:
        _logger.warning(
            "Could not find mapping of doctype, please check whether you are using the correct doctype"
        )
        # BUG FIX: previously returned the None result of _logger.warning;
        # return an empty list like the sibling helpers do
        return []
    docs = _client.search(
        index=_elastic_index,
        body={
            'size': num,
            "query": {
                "function_score": {
                    "query": {
                        "term": {
                            field2: doctype
                        }
                    },
                    "functions": [{
                        "random_score": {
                            "seed": seed
                        }
                    }]
                }
            }
        })
    if not field:
        return docs['hits']['hits']
    if isinstance(field, str):
        return [_dotkeys(doc, field) for doc in docs['hits']['hits']]
    return [{fi: _dotkeys(doc, fi) for fi in field}
            for doc in docs['hits']['hits']]
def doctype_fields(doctype):
    ''' returns a summary of fields for documents of `doctype`:

    field : type - count (coverage)

    note:
        As elasticsearch does not natively support an 'all fields'
        query, this function runs a 1000 document sample and takes the
        union of found keys as a proxy of fields shared by all documents.
    '''
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not get document information: No database instance available"
        )
        return []
    mapping_kind = check_mapping(doctype)
    if mapping_kind == "mixed_mapping":
        field = "doctype.keyword"
    elif mapping_kind == "new_mapping":
        field = "doctype"
    else:
        _logger.warning(
            "Could not find mapping of doctype, please check whether you are using the correct doctype"
        )
        return []
    # NOTE: removed an unused `from collections import Counter` /
    # `key_count = Counter()` pair that was dead code
    doc_num = _client.search(
        index=_elastic_index,
        body={'query': {"term": {field: doctype}}})['hits']['total']
    mappings = _client.indices.get_mapping(_elastic_index).get(
        _elastic_index, {}).get('mappings', {}).get(doctype, {}).get(
            'properties', {})
    # per-field count of documents that actually contain the field
    coverage = {
        key: _client.search(
            _elastic_index,
            body={
                'query': {
                    'bool': {
                        'filter': [{
                            'exists': {
                                'field': key
                            }
                        }, {
                            'term': {
                                field: doctype
                            }
                        }]
                    }
                }
            }).get('hits', {}).get('total', 0)
        for key in mappings.keys() if key != "META"
    }
    summary = {
        k: {
            # BUG FIX: guard against ZeroDivisionError when doc_num == 0;
            # also default missing coverage to 0 instead of the string
            # 'unknown', which would have crashed the division
            'coverage': coverage.get(k, 0) / float(doc_num) if doc_num else 0.0,
            'type': mappings[k].get('type', 'unknown')
        }
        for k in mappings.keys() if k != "META"
    }
    return summary
def doctype_last(doctype, num=1, by_field="META.ADDED", query=None):
    '''Returns the last document of a given doctype

    Input
    ---
    doctype: string
        The document type you whish to retrieved
    num: int
        The number of documents to retrieve
    by_field: string
        The _datetime field by which to determine the last document
    query : string (default None)
        An Elasticsearch string query to filter results.
        Example: query="user.screen_name:google"
    '''
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not get last documents: No database instance available")
        return []
    # mapping paths nest each dotted level under 'properties'
    exotic_by_field = by_field.replace('.', '.properties.')
    _logger.debug("looking for {exotic_by_field}".format(
        exotic_by_field=exotic_by_field))
    mapping = _client.indices.get_mapping()
    _logger.debug("Got mapping {mapping}".format(**locals()))
    target_key = "{_elastic_index}.mappings.{doctype}.properties.{exotic_by_field}".format(
        _elastic_index=_elastic_index, **locals())
    _logger.debug("Target key: {target_key}".format(**locals()))
    found_mapping = _dotkeys(mapping, target_key)
    _logger.debug("found mapping: {found_mapping}".format(**locals()))
    if not found_mapping:
        # cannot sort on a field elasticsearch has not mapped yet
        _logger.debug("Mapping not seen yet")
        return []
    mapping_kind = check_mapping(doctype)
    if mapping_kind == "mixed_mapping":
        field = "doctype.keyword"
    elif mapping_kind == "new_mapping":
        field = "doctype"
    else:
        _logger.warning(
            "Could not find mapping of doctype, please check whether you are using the correct doctype"
        )
        return []
    body = {
        "sort": [{
            by_field: {
                "order": "desc"
            }
        }],
        "size": num,
        "query": {
            "term": {
                field: doctype
            }
        }
    }
    if query:
        _logger.debug("adding string query: {query}".format(**locals()))
        body['query'] = {'query_string': {'query': query}}
    # BUG FIX: the constructed `body` (including the optional string-query
    # override) was previously ignored — the search re-built the literal
    # dict, so the `query` parameter silently had no effect
    docs = _client.search(index=_elastic_index, body=body).get(
        'hits', {}).get('hits', [""])
    return docs
def load_credentials(self, app='default', id=None, update_last_loaded=True, retries=3):
    """Load a credential from the specified app

    Retrieves credentials from a specified app. Choices are based on the
    `sort_field` and `preference` class properties that should indicate
    which field indicates how suited a credential is. If the class
    properties `sort_field` and `preference` are not set, it defaults to
    `last_loaded.<self.__name__>` & `lowest`, i.e. the credential that
    has not been used this function the longest time.

    Parameters
    ----------
    app : string (default='default')
        the appname from which the credentials should be drawn. Will be
        prepended with service name, i.e. "{service_name}_{app}" and
        retrieved from the database
    id : string (default=None)
        a specific credential ID to retrieve, for instance related to
        user-specific content (e.g. direct messages). Otherwise the
        `self.sort_field` and `self.preference` are used to select
        credentials to apss to the .get method. NOTE: overrides app
    update_last_loaded : bool (default=True)
        whether to stamp the credential's last_loaded field on retrieval
    retries : int (default=3)
        how many times to retry after a connection timeout

    Returns
    -------
    dictionary
        the credentials record (empty if not found)

    Notes
    -----
    This function updates the last_loaded.<classname> field with the
    current time.
    """
    ordering = {'lowest': 'asc', 'highest': 'desc'}
    doctype = "{self.service_name}_{app}".format(**locals())
    try:
        if id:
            credentials = client.get(index=CREDENTIALS_INDEX,
                                     doc_type=doctype,
                                     id=id)
        else:
            docs = client.search(index=CREDENTIALS_INDEX,
                                 body={
                                     "sort": [{
                                         self.sort_field: {
                                             "order": ordering[self.preference]
                                         }
                                     }],
                                     "size": 1,
                                     "query": {
                                         "match": {
                                             "_type": doctype
                                         }
                                     }
                                 }).get('hits', {}).get('hits', [])
            if not docs:
                logger.warning(
                    "No credentials found for {app}".format(**locals()))
                return {}
            credentials = docs[0]
        if update_last_loaded:
            logger.debug("Updating last-loaded field")
            self.store_credentials(
                id=credentials['_id'],
                doc_type=doctype,
                app=app,
                credentials=credentials['_source']['credentials'],
                last_loaded=datetime.datetime.now().isoformat(),
                content=credentials['_source']['content'])
    except ConnectionTimeout:
        retries -= 1
        # BUG FIX: the message was never formatted, so the literal text
        # "{retries}" was logged instead of the remaining retry count
        logger.info(
            "Connection timeout when trying to retrieve, retrying {retries} more times".format(
                **locals()))
        if retries:
            # BUG FIX: propagate update_last_loaded on retry instead of
            # silently resetting it to the default
            return self.load_credentials(app=app,
                                         id=id,
                                         update_last_loaded=update_last_loaded,
                                         retries=retries)
        else:
            return {}
    except ConnectionError:
        logger.warning("Unable to contact Elasticsearch, is it up?")
        return {}
    except NotFoundError:
        logger.warning("No credentials found")
        return {}
    except RequestError:
        logger.warning("You specified a sort field that does not exist!")
        return {}
    return credentials
def analyse(self, queries, timefield, granularity="week", querytype="count",
            field=None, from_time=None, to_time=None, filter=None):
    '''returns a pandas dataframe

    Builds one timeseries column per query by running a date_histogram
    aggregation, then outer-merges the per-query dataframes on timestamp.

    Parameters
    ----
    queries : string or list of strings
        Elasticsearch query-string queries, one per timeseries
    timefield : string
        the date field used for the date_histogram aggregation
    granularity : string (default="week")
        the date_histogram interval
    querytype : string or list of strings (default="count")
        "count" or an ES metric aggregation (e.g. "avg"), one per query
    field : string or list of strings (default=None)
        the field a metric aggregation is applied to, one per query
    from_time, to_time : (default=None)
        optional bounds applied to `timefield` as a range filter
    filter : string or dict (default=None)
        an extra query_string filter (str) or match clause (dict)
        applied to every query
    '''
    if type(queries) == str:
        queries = [queries]
    if type(querytype) == str:
        querytype = [querytype] * len(queries)
    if field and type(field) == str or type(field) == type(None):
        field = [field] * len(queries)
    assert len(queries) == len(querytype), "there should be one querytype for each query"
    if field:
        assert len(queries) == len(field), "if specified, there should be a field for each query"
    target_dataframe = False
    # names of columns whose query returned no buckets before the first
    # non-empty result; re-inserted (as NaN) once a dataframe exists
    prepend = []
    for num, q, qt, f in zip(range(len(queries)), queries, querytype, field):
        # BUG FIX: was logger.debug(num, q, qt, f) — an int is not a valid
        # log message; use a lazy %-style format string
        logger.debug("%s %s %s %s", num, q, qt, f)
        # BUG FIX: checked the whole `field` list (always truthy here)
        # instead of the per-query field `f`, so the warning never fired
        if qt != 'count' and not f:
            logger.info("metrics require a field to which the metric should be applied!,"
                        "which field should be {qt}-ed".format(**locals()))
        # basic elastic query to select documents for each timeseries
        elastic_query = {'query': {"bool": {
                             'must': [
                                 {'query_string': {'query': q}}]}},
                         'aggs': {'timeline': {"date_histogram": {
                             "field": timefield,
                             "interval": granularity
                         }}}}
        if qt != "count":
            elastic_query['aggs']['timeline'].update(
                {"aggs": {
                    "metric": {
                        qt: {
                            "field": f
                        }
                    }
                }})
        # add time range if from or to time is specified
        time_range = {timefield: {}}
        if from_time:
            time_range[timefield].update({'gte': from_time})
        if to_time:
            time_range[timefield].update({'lte': to_time})
        # apply filter if specified
        if type(filter) == str:
            elastic_query['query']['bool']['must'].append(
                {'query_string': {'query': filter}})
        elif type(filter) == dict:
            elastic_query['query']['bool']['must'].append({"match": filter})
        if from_time or to_time:
            elastic_query['query']['bool']['must'].append({'range': time_range})
        logger.debug("elastic query = {elastic_query}".format(**locals()))
        res = client.search(elastic_index, body=elastic_query, size=0)
        logger.debug("found {res[hits][total]} results in total".format(**locals()))
        if qt == 'count':
            df = pandas.DataFrame(res['aggregations']['timeline']['buckets'])
        else:
            # normalize metric buckets to the doc_count/key_as_string shape
            df = pandas.DataFrame(
                [{'doc_count': b['metric']['value'],
                  'key_as_string': b['key_as_string']}
                 for b in res['aggregations']['timeline']['buckets']])
        logger.debug("dataframe: {df}".format(**locals()))
        num += 1  # 1-based numbering in the column label
        longer = len(q) > 10 and '...' or ' '
        new_name = "{num}. {q:.10}{longer}".format(**locals())
        df = df.rename(columns={"doc_count": new_name,
                                "key_as_string": "timestamp"})
        if type(target_dataframe) == bool and target_dataframe == False:
            if df.empty:
                prepend.append(new_name)
                continue
            else:
                target_dataframe = df[["timestamp", new_name]]
        elif df.empty:
            target_dataframe[new_name] = numpy.nan
        else:
            target_dataframe = target_dataframe.merge(
                df[["timestamp", new_name]], on='timestamp', how='outer')
        if prepend:
            colnames = [name for name in target_dataframe.columns
                        if name != "timestamp"]
            for empty_column in prepend:
                target_dataframe[empty_column] = numpy.nan
            ordered_cols = ["timestamp"] + prepend + list(colnames)
            target_dataframe = target_dataframe[ordered_cols]
            prepend = []
    if not (type(target_dataframe) == bool and target_dataframe == False):
        target_dataframe = target_dataframe.replace(numpy.nan, 0)
        return target_dataframe
    else:
        logger.info("Empty result")
        return pandas.DataFrame()