Ejemplo n.º 1
0
def missing_field(doctype=None, field='_source', stats_only=True):
    """Find documents that lack a given field.

    Parameters
    ----
    doctype : string (default=None)
        Restrict the search to this document type. When omitted, the search
        is run without a doctype restriction.
    field : string (default='_source')
        Name of the field whose absence is tested (`must_not exists` query).
    stats_only : bool (default=True)
        When True, return summary statistics; otherwise return the hits
        of the search itself.

    Returns
    ----
    list or dict
        An empty list when no database is available, the list of hits when
        `stats_only` is False, and otherwise a dictionary with the keys
        `doctype`, `field`, `missing`, `total` and `percentage_missing`.
    """
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not get documents missing a field: No database instance available"
        )
        return []
    query = {'query': {'bool': {'must_not': {'exists': {'field': field}}}}}
    # Same positional targets for both the "missing" and the "total" search
    target = (_elastic_index, doctype) if doctype else (_elastic_index,)
    result = _client.search(*target, body=query)
    if not stats_only:
        return result['hits']['hits']

    missing = result['hits']['total']
    # A conditional target (rather than `a and b or c`) keeps a doctype with
    # zero documents from silently falling back to the whole-index total.
    total = _client.search(*target)['hits']['total']
    # Avoid ZeroDivisionError when there are no documents at all
    percentage_missing = (missing / float(total)) * 100 if total else 0.0
    return {
        'doctype': doctype or '*',
        'field': field,
        'missing': missing,
        'total': total,
        'percentage_missing': percentage_missing,
    }
Ejemplo n.º 2
0
def list_apps(service_name=None):
    """List the API apps registered in this INCA instance.

    Parameters
    ----
    service_name : string (default=None)
        Name of the service, such as 'twitter', 'facebook' or 'google'.
        Without a name, apps of all services are returned.

    Returns
    ----
    Dictionary or list
        With a `service_name`: the list of apps registered for that service
        (empty when there are none). Without: a dictionary shaped like
        `{service_name : [app1, app2, app3]}`. Each app is a dictionary
        with the keys `name` and `credentials` (number of credentials).

    """
    registered = _client.search(index='.apps', doc_type=service_name, size=10000)

    def count_credentials(hit):
        # Credentials are looked up under the doc type "<service>_<app id>"
        found = _client.search(index='.credentials',
                               doc_type=hit['_type'] + "_" + hit['_id'],
                               size=0)
        return found['hits']['total']

    by_service = {}
    for hit in registered['hits']['hits']:
        if not service_name or service_name == hit['_type']:
            entry = {'name': hit['_id'], 'credentials': count_credentials(hit)}
            by_service.setdefault(hit['_type'], []).append(entry)
    if service_name:
        return by_service.get(service_name, [])
    return by_service
Ejemplo n.º 3
0
Archivo: assets.py Proyecto: uless/inca
def list_assets():
    """Return one summary dictionary per hit of a search on 'assets'.

    Each summary holds the hit's `_id` plus the name, project, time added
    and content length read from its `_source`.
    """
    def summarise(hit):
        return {
            'id': hit.get('_id'),
            'name': dotkeys(hit, '_source.name'),
            'project': dotkeys(hit, '_source.project'),
            'added': dotkeys(hit, '_source.added_at'),
            'length': len(dotkeys(hit, '_source.content')),
        }

    hits = client.search('assets')['hits']['hits']
    return [summarise(hit) for hit in hits]
Ejemplo n.º 4
0
def document_generator(query="*"):
    """Generate the documents that match a query

    Parameters
    ----
    query : string (default="*") or dict
        Either a query string selecting the documents to return, or a
        dict that is used as the elasticsearch query as-is

    Yields
    ----
    dict representing a document
    """
    if not _DATABASE_AVAILABLE:
        _logger.warning("Unable to generate documents, no database available!")
        return

    if query == "*":
        _logger.info("No query specified, returning all documents")

    kind = type(query)
    if kind == str:
        _logger.info(
            "String input: searching for {query}".format(query=query))
        query_string = {"query_string": {"query": query}}
        es_query = {"query": {"bool": {"must": query_string}}}
        _logger.debug("query: {es_query}".format(es_query=es_query))
    elif kind == dict:
        _logger.info("Dict input: using input as ES query")
        pretty = _json.dumps(query, indent=2)
        _logger.debug("query: {query}".format(query=pretty))
        es_query = query
    else:
        _logger.warning("Unknown input")
        return

    # An empty dict query yields nothing
    if not es_query:
        return

    response = _client.search(_elastic_index, body=es_query, size=0)
    total = response['hits']['total']
    for position, document in enumerate(_scroll_query(es_query)):
        # Report progress on every tenth document
        if position % 10 == 0:
            _logger.info("returning {num} of {total}".format(
                num=position, total=total))
        yield document
Ejemplo n.º 5
0
Archivo: assets.py Proyecto: uless/inca
def get_asset(id=None, name=None):
    """Retrieve a single asset by ID or by name.

    Parameters
    ----
    id : string (default=None)
        ID of the asset; takes precedence over `name` when both are given.
    name : string (default=None)
        Name of the asset, matched against the `name` field.

    Returns
    ----
    False when neither `id` nor `name` is given, the result of the `get`
    call when an ID is given, the single matching hit when searching by
    name, or an empty dictionary when the name matches no asset or more
    than one asset.
    """
    if not id and not name:
        logger.warning("requires either id or name! None given...")
        return False
    if id:
        return client.get('assets', id=id)

    units = client.search(
        'assets', body={'filter': {'match': {'name': name}}})['hits']['hits']
    if len(units) == 1:
        return units[0]
    if not units:
        logger.info("no asset found matching this {name}[{id}]".format(
            name=name, id=id))
    else:
        # More than one hit: the name alone does not identify an asset
        logger.warning("ambiguous designation! Use ID?")
    return {}
Ejemplo n.º 6
0
def list_doctypes():
    """Return a dictionary mapping each doctype in the index mapping to
    the `hits.total` of a search on that doctype ("NA" when absent).

    The mapping entries '_default_' and 'core.document' are left out.
    An empty list is returned when no database is available.
    """
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not list documents: No database instance available")
        return []

    index_info = _client.indices.get_mapping(_elastic_index).get(
        _elastic_index, {})
    skipped = ('_default_', 'core.document')
    existing_doctypes = []
    for key in index_info.get('mappings', {}).keys():
        if key not in skipped:
            existing_doctypes.append(key)

    overview = {}
    for doctype in existing_doctypes:
        response = _client.search(index=_elastic_index, doc_type=doctype)
        overview[doctype] = response.get('hits', {}).get('total', "NA")
    return overview
Ejemplo n.º 7
0
def list_credentials(service_name, app_name):
    """List the credentials stored for a registered app

    Parameters
    ----
    service_name : string
        name of the service the app targets, such as 'twitter',
        'facebook' or 'google'
    app_name : string
        the internal application name

    Returns
    ----
    The list of hits found in the ".credentials" index under the doc type
    "<service_name>_<app_name>"
    """
    response = _client.search(index=".credentials",
                              doc_type=service_name + "_" + app_name)
    hits = response['hits']
    return hits['hits']
Ejemplo n.º 8
0
def doctype_inspect(doctype):
    '''Show some information about documents of a specified type

    Parameters
    ----------
    doctype : string
        string specifying the doctype to examine (see list_doctypes for available documents)

    Returns
    -------
    dictionary
        summary of documents of the specified type:
            total_collected : integer
                the `hits.total` of a search on this doctype
            first_collected : _datetime or None
                the 'META.ADDED' field of the document returned by
                `doctype_first`, None when there is no such document or field
            last_collected : _datetime or None
                the 'META.ADDED' field of the document returned by
                `doctype_last`, None when there is no such document or field
            keys : dictionary
                the result of `doctype_fields` for this doctype

    '''

    def added_at(docs):
        # Both lookups share this helper so that a missing document or a
        # missing META.ADDED value consistently yields None.
        if not docs:
            return None
        return docs[0].get('_source', {}).get('META', {}).get("ADDED", None) or None

    firstdocs = doctype_first(doctype, by_field="META.ADDED")
    lastdocs = doctype_last(doctype, by_field="META.ADDED")

    total = _client.search(index=_elastic_index,
                           doc_type=doctype)['hits']['total']
    summary = dict(
        total_collected=total,
        first_collected=added_at(firstdocs),
        last_collected=added_at(lastdocs),
        keys=doctype_fields(doctype))

    return summary
Ejemplo n.º 9
0
def doctype_examples(doctype, field=None, seed=42, num=10):
    """Return a seeded random selection of documents of a doctype.

    Parameters
    ----
    doctype : string
        The document type to draw examples from.
    field : string or iterable of strings (default=None)
        When None, whole hits are returned. A single string returns only
        that (dotted) key of each hit; an iterable of strings returns a
        dictionary `{key: value}` per hit.
    seed : int (default=42)
        Seed passed to the `random_score` function of the query.
    num : int (default=10)
        Number of documents to request.

    Returns
    ----
    list
        The selected hits, values or dictionaries. An empty list when no
        database is available or the mapping of the doctype is not recognised.
    """
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not get example documents: No database instance available")
        return []

    # Resolve the mapping once; the term field depends on the mapping flavour
    term_fields = {"mixed_mapping": "doctype.keyword", "new_mapping": "doctype"}
    term_field = term_fields.get(check_mapping(doctype))
    if term_field is None:
        _logger.warning(
            "Could not find mapping of doctype, please check whether you are using the correct doctype"
        )
        return []

    body = {
        'size': num,
        "query": {
            "function_score": {
                "query": {"term": {term_field: doctype}},
                "functions": [{"random_score": {"seed": seed}}],
            }
        },
    }
    hits = _client.search(index=_elastic_index, body=body)['hits']['hits']

    if not field:
        return hits
    if isinstance(field, str):
        return [_dotkeys(doc, field) for doc in hits]
    return [{fi: _dotkeys(doc, fi) for fi in field} for doc in hits]
Ejemplo n.º 10
0
def doctype_fields(doctype):
    '''Summarise the mapped fields of documents of `doctype`.

    For every top-level property in the index mapping of the doctype
    (except "META") an `exists` query counts the documents of this doctype
    that have the field.

    Parameters
    ----------
    doctype : string
        the document type to summarise

    Returns
    -------
    dictionary
        <keyname> : dictionary
            coverage : float
                number of documents having the key divided by the number
                of documents of this doctype (0.0 when there are none)
            type     : string
                the `type` of the field in the index mapping, 'unknown'
                when the mapping does not state one
        An empty list is returned when no database is available or the
        mapping of the doctype is not recognised.
    '''
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not get document information: No database instance available"
        )
        return []

    # Resolve the mapping once; the term field depends on the mapping flavour
    term_fields = {"mixed_mapping": "doctype.keyword", "new_mapping": "doctype"}
    field = term_fields.get(check_mapping(doctype))
    if field is None:
        _logger.warning(
            "Could not find mapping of doctype, please check whether you are using the correct doctype"
        )
        return []

    of_doctype = {'term': {field: doctype}}
    doc_num = _client.search(index=_elastic_index,
                             body={'query': of_doctype})['hits']['total']
    mappings = _client.indices.get_mapping(_elastic_index).get(
        _elastic_index, {}).get('mappings', {}).get(doctype,
                                                    {}).get('properties', {})

    summary = {}
    for key, properties in mappings.items():
        if key == "META":
            continue
        has_key = {
            'query': {
                'bool': {
                    'filter': [{'exists': {'field': key}}, of_doctype]
                }
            }
        }
        found = _client.search(_elastic_index,
                               body=has_key).get('hits', {}).get('total', 0)
        summary[key] = {
            # Guard against division by zero for a doctype without documents
            'coverage': found / float(doc_num) if doc_num else 0.0,
            'type': properties.get('type', 'unknown')
        }
    return summary
Ejemplo n.º 11
0
def doctype_last(doctype, num=1, by_field="META.ADDED", query=None):
    '''Returns the last documents of a given doctype

    Input
    ---
    doctype: string
        The document type you wish to retrieve
    num: int
        The number of documents to retrieve
    by_field: string
        The _datetime field by which to determine the
        last document (results are sorted on it, descending)
    query : string (default None)
        An Elasticsearch string query to filter results.
        Example: query="user.screen_name:google"

    Returns
    ---
    list
        The hits found, newest first. An empty list when no database is
        available, when `by_field` is not in the mapping of the doctype,
        or when the mapping of the doctype is not recognised.
    '''
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not get last documents: No database instance available")
        return []

    # The mapping is searched with `.properties.` between the parts of the
    # dotted field name, e.g. "META.ADDED" -> "META.properties.ADDED"
    exotic_by_field = by_field.replace('.', '.properties.')
    _logger.debug("looking for {exotic_by_field}".format(
        exotic_by_field=exotic_by_field))
    mapping = _client.indices.get_mapping()
    _logger.debug("Got mapping {mapping}".format(mapping=mapping))
    target_key = "{_elastic_index}.mappings.{doctype}.properties.{exotic_by_field}".format(
        _elastic_index=_elastic_index,
        doctype=doctype,
        exotic_by_field=exotic_by_field)
    _logger.debug("Target key: {target_key}".format(target_key=target_key))
    found_mapping = _dotkeys(mapping, target_key)
    _logger.debug("found mapping: {found_mapping}".format(
        found_mapping=found_mapping))
    if not found_mapping:
        _logger.debug("Mapping not seen yet")
        return []

    # Resolve the mapping once; the term field depends on the mapping flavour
    term_fields = {"mixed_mapping": "doctype.keyword", "new_mapping": "doctype"}
    field = term_fields.get(check_mapping(doctype))
    if field is None:
        _logger.warning(
            "Could not find mapping of doctype, please check whether you are using the correct doctype"
        )
        return []

    of_doctype = {"term": {field: doctype}}
    body = {
        "sort": [{
            by_field: {
                "order": "desc"
            }
        }],
        "size": num,
        "query": of_doctype
    }

    if query:
        _logger.debug("adding string query: {query}".format(query=query))
        # Keep the doctype restriction and add the string query on top of it
        body['query'] = {
            'bool': {
                'must': [{'query_string': {'query': query}}],
                'filter': [of_doctype]
            }
        }

    # Send the body built above so that `query` actually takes effect
    docs = _client.search(index=_elastic_index,
                          body=body).get('hits', {}).get('hits', [])

    return docs
Ejemplo n.º 12
0
    def load_credentials(self,
                         app='default',
                         id=None,
                         update_last_loaded=True,
                         retries=3):
        """Load a credential from the specified app

        Retrieves credentials from a specified app. Choices are based
        on the `sort_field` and `preference` class properties that should
        indicate which field indicates how suited a credential is. If the
        class properties `sort_field` and `preference` are not set, it defaults
        to `last_loaded.<self.__name__>` & `lowest`, i.e. the credential that
        has not been used by this function for the longest time.

        Parameters
        ----------
        app : string (default='default')
            the appname from which the credentials should be drawn. Will be
            prepended with service name, i.e. "{service_name}_{app}" and
            retrieved from the database

        id   : string (default=None)
            a specific credential ID to retrieve, for instance related to
            user-specific content (e.g. direct messages). Otherwise the
            `self.sort_field` and `self.preference` are used to select
            credentials to pass to the .get method.

        update_last_loaded : bool (default=True)
            whether to store the credentials again with the current time as
            `last_loaded` after retrieving them

        retries : int (default=3)
            number of attempts left when the connection times out

        Returns
        -------
        dictionary
            the credentials record (empty if not found, if Elasticsearch
            cannot be reached, or if the sort field does not exist)

        Notes
        -----
        Unless `update_last_loaded` is False, this function stores the
        credentials again with the current time as `last_loaded`.

        """
        ordering = {'lowest': 'asc', 'highest': 'desc'}
        doctype = "{0}_{1}".format(self.service_name, app)
        try:
            if id:
                credentials = client.get(index=CREDENTIALS_INDEX,
                                         doc_type=doctype,
                                         id=id)

            else:
                body = {
                    "sort": [{
                        self.sort_field: {
                            "order": ordering[self.preference]
                        }
                    }],
                    "size": 1,
                    "query": {
                        "match": {
                            "_type": doctype
                        }
                    }
                }
                docs = client.search(index=CREDENTIALS_INDEX,
                                     body=body).get('hits', {}).get('hits', [])
                if not docs:
                    logger.warning(
                        "No credentials found for {app}".format(app=app))
                    return {}
                credentials = docs[0]
            if update_last_loaded:
                logger.debug("Updating last-loaded field")
                self.store_credentials(
                    id=credentials['_id'],
                    doc_type=doctype,
                    app=app,
                    credentials=credentials['_source']['credentials'],
                    last_loaded=datetime.datetime.now().isoformat(),
                    content=credentials['_source']['content'])

        except ConnectionTimeout:
            retries -= 1
            logger.info(
                "Connection timeout when trying to retrieve, retrying {retries} more times"
                .format(retries=max(retries, 0)))
            # `> 0` (not mere truthiness) so that a non-positive `retries`
            # cannot lead to unbounded recursion
            if retries > 0:
                # Pass `update_last_loaded` along so a retry honours the
                # caller's choice instead of falling back to the default
                return self.load_credentials(
                    app=app,
                    id=id,
                    update_last_loaded=update_last_loaded,
                    retries=retries)
            return {}
        except ConnectionError:
            logger.warning("Unable to contact Elasticsearch, is it up?")
            return {}
        except NotFoundError:
            logger.warning("No credentials found")
            return {}
        except RequestError:
            logger.warning("You specified a sort field that does not exist!")
            return {}

        return credentials
Ejemplo n.º 13
0
    def analyse(self, queries, timefield, granularity="week", querytype="count", field=None,
                from_time=None, to_time=None, filter=None):
        '''Build a timeline per query and return them as one pandas dataframe

        Parameters
        ----------
        queries : string or list of strings
            query string(s); each one becomes a column of the result
        timefield : string
            the date field used for the date histogram (and the time range)
        granularity : string (default="week")
            the interval of the date histogram
        querytype : string or list of strings (default="count")
            "count" for document counts, or the name of a metric aggregation
            to apply to `field`; a single string applies to all queries
        field : string, list of strings or None (default=None)
            the field a metric is applied to; a single value applies to all
            queries
        from_time, to_time : (default=None)
            optional lower (`gte`) and upper (`lte`) bound on `timefield`
        filter : string or dict (default=None)
            an additional query string, or a dict used as a `match` clause,
            that every document must satisfy

        Returns
        -------
        pandas.DataFrame
            a `timestamp` column plus one column per query, named
            "<number>. <first 10 characters of the query>"; missing values
            are replaced by 0. An empty dataframe when no query returned
            any bucket.
        '''
        if isinstance(queries, str):
            queries = [queries]
        if isinstance(querytype, str):
            querytype = [querytype] * len(queries)
        # A single field (or none at all) applies to every query
        if field is None or isinstance(field, str):
            field = [field] * len(queries)

        assert len(queries) == len(querytype), "there should be one querytype for each query"
        if field:
            assert len(queries) == len(field), "if specified, there should be a field for each query"

        target_dataframe = None
        # Names of leading queries without results; their columns are added
        # once the first non-empty result provides the timestamps
        prepend = []

        for num, (q, qt, f) in enumerate(zip(queries, querytype, field), start=1):
            logger.debug("query %s: %s (querytype=%s, field=%s)", num, q, qt, f)
            # Check this query's own field: the `field` list itself is always truthy
            if qt != 'count' and not f:
                logger.info("metrics require a field to which the metric should be applied!,"
                            "which field should be {qt}-ed".format(qt=qt))

            # basic elastic query to select documents for each timeseries
            must = [{'query_string': {'query': q}}]
            timeline = {"date_histogram": {"field": timefield,
                                           "interval": granularity}}
            if qt != "count":
                timeline["aggs"] = {"metric": {qt: {"field": f}}}

            # apply filter if specified
            if isinstance(filter, str):
                must.append({'query_string': {'query': filter}})
            elif isinstance(filter, dict):
                must.append({"match": filter})

            # add time range if from or to time is specified
            if from_time or to_time:
                bounds = {}
                if from_time:
                    bounds['gte'] = from_time
                if to_time:
                    bounds['lte'] = to_time
                must.append({'range': {timefield: bounds}})

            elastic_query = {'query': {"bool": {'must': must}},
                             'aggs': {'timeline': timeline}}

            logger.debug("elastic query = %s", elastic_query)
            res = client.search(elastic_index, body=elastic_query, size=0)
            logger.debug("found %s results in total", res['hits']['total'])

            buckets = res['aggregations']['timeline']['buckets']
            if qt == 'count':
                df = pandas.DataFrame(buckets)
            else:
                # Store the metric value under `doc_count` so that both
                # query types are renamed and merged the same way below
                df = pandas.DataFrame([{'doc_count': b['metric']['value'],
                                        'key_as_string': b['key_as_string']}
                                       for b in buckets])
            logger.debug("dataframe: %s", df)

            longer = '...' if len(q) > 10 else '   '
            new_name = "{num}. {q:.10}{longer}".format(num=num, q=q, longer=longer)

            df = df.rename(columns={"doc_count": new_name, "key_as_string": "timestamp"})

            if target_dataframe is None:
                if df.empty:
                    prepend.append(new_name)
                    continue
                target_dataframe = df[["timestamp", new_name]]
            elif df.empty:
                target_dataframe[new_name] = numpy.nan
            else:
                target_dataframe = target_dataframe.merge(
                    df[["timestamp", new_name]], on='timestamp', how='outer')

            if prepend:
                colnames = [name for name in target_dataframe.columns if name != "timestamp"]
                for empty_column in prepend:
                    target_dataframe[empty_column] = numpy.nan
                ordered_cols = ["timestamp"] + prepend + colnames
                target_dataframe = target_dataframe[ordered_cols]
                prepend = []

        if target_dataframe is None:
            logger.info("Empty result")
            return pandas.DataFrame()
        return target_dataframe.replace(numpy.nan, 0)