Beispiel #1
0
def doctype_generator(doctype):
    '''Lazily yield every document stored under `doctype`.

    Input
    ---
    doctype: string
        The document type whose documents should be yielded.

    Yields
    ---
    The documents produced by `_scroll_query` for a term query on the
    doctype field, in the order `_scroll_query` returns them.

    Note
    ---
    This is a generator function: nothing (including the warnings below)
    happens until the caller starts iterating. When the database is not
    available or the mapping cannot be resolved, a warning is logged and the
    generator simply ends without yielding anything.
    '''
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not get documents: No database instance available")
        return

    # Resolve the mapping once instead of re-querying it for every branch.
    mapping_kind = check_mapping(doctype)
    field = {
        "mixed_mapping": "doctype.keyword",
        "new_mapping": "doctype",
    }.get(mapping_kind)
    if field is None:
        # Covers both a missing mapping (None) and any unrecognised result,
        # which previously left `field` unbound and crashed below.
        _logger.warning(
            "Could not find mapping of doctype, please check whether you are using the correct doctype"
        )
        return

    query = {'query': {'term': {field: doctype}}}
    for num, doc in enumerate(_scroll_query(query)):
        # Re-checked per document so a running scroll stops as soon as the
        # availability flag is cleared.
        if not _DATABASE_AVAILABLE:
            _logger.warning(
                "Could not get documents: No database instance available")
            break
        _logger.info("returning {num}".format(num=num))
        yield doc
Beispiel #2
0
def doctype_examples(doctype, field=None, seed=42, num=10):
    '''Return a random sample of documents of a given doctype.

    Input
    ---
    doctype: string
        The document type to sample from.
    field: string, iterable of strings or None (default None)
        When None (or otherwise falsy), the raw search hits are returned.
        When a string, a list with the value found at that (dotted) key in
        each hit is returned. Otherwise `field` is iterated and a list of
        {key: value} dicts is returned, one dict per hit.
    seed: int (default 42)
        Seed passed to the Elasticsearch `random_score` function.
    num: int (default 10)
        The number of documents requested (`size` of the search).

    Returns
    ---
    A list as described under `field`; an empty list when no database is
    available or when the mapping of the doctype cannot be resolved.
    '''
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not get example documents: No database instance available")
        return []

    # Resolve the mapping once instead of re-querying it for every branch.
    mapping_kind = check_mapping(doctype)
    doctype_field = {
        "mixed_mapping": "doctype.keyword",
        "new_mapping": "doctype",
    }.get(mapping_kind)
    if doctype_field is None:
        # Return an empty list (not the None that logger.warning returns) so
        # every failure path hands callers the same, iterable, result.
        _logger.warning(
            "Could not find mapping of doctype, please check whether you are using the correct doctype"
        )
        return []

    docs = _client.search(index=_elastic_index,
                          body={
                              'size': num,
                              "query": {
                                  "function_score": {
                                      "query": {
                                          "term": {
                                              doctype_field: doctype
                                          }
                                      },
                                      "functions": [{
                                          "random_score": {
                                              "seed": seed
                                          }
                                      }]
                                  }
                              }
                          })
    hits = docs['hits']['hits']
    if not field:
        return hits
    if isinstance(field, str):
        return [_dotkeys(doc, field) for doc in hits]
    return [{fi: _dotkeys(doc, fi) for fi in field} for doc in hits]
Beispiel #3
0
def doctype_fields(doctype):
    '''
    returns a summary of fields for documents of `doctype`:
    field : {'type': ..., 'coverage': ...}

    The field names and types are read from the index mapping of the
    doctype (the "META" field is skipped). For each field, coverage is the
    number of documents of this doctype in which the field exists, divided
    by the total number of documents of this doctype.

    note:
        One `exists` search is issued per mapped field, so the number of
        requests grows with the number of fields in the mapping.

    Returns an empty list when no database is available or when the mapping
    of the doctype cannot be resolved; otherwise a dict keyed by field name.
    When the doctype has no documents, every coverage is reported as 0.0.
    '''
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not get document information: No database instance available"
        )
        return []

    # Resolve the mapping once instead of re-querying it for every branch.
    mapping_kind = check_mapping(doctype)
    field = {
        "mixed_mapping": "doctype.keyword",
        "new_mapping": "doctype",
    }.get(mapping_kind)
    if field is None:
        # Covers both a missing mapping (None) and any unrecognised result,
        # which previously left `field` unbound and crashed below.
        _logger.warning(
            "Could not find mapping of doctype, please check whether you are using the correct doctype"
        )
        return []

    doctype_term = {'term': {field: doctype}}

    doc_num = _client.search(index=_elastic_index,
                             body={'query': doctype_term})['hits']['total']
    mappings = _client.indices.get_mapping(index=_elastic_index).get(
        _elastic_index, {}).get('mappings', {}).get(doctype,
                                                    {}).get('properties', {})
    field_names = [key for key in mappings if key != "META"]

    summary = {}
    for key in field_names:
        with_field = _client.search(index=_elastic_index,
                                    body={
                                        'query': {
                                            'bool': {
                                                'filter': [{
                                                    'exists': {
                                                        'field': key
                                                    }
                                                }, doctype_term]
                                            }
                                        }
                                    }).get('hits', {}).get('total', 0)
        summary[key] = {
            # Guard against an empty doctype: dividing by a zero document
            # count used to raise ZeroDivisionError.
            'coverage': with_field / float(doc_num) if doc_num else 0.0,
            'type': mappings[key].get('type', 'unknown')
        }
    return summary
Beispiel #4
0
def doctype_last(doctype, num=1, by_field="META.ADDED", query=None):
    '''Returns the last document(s) of a given doctype

    Input
    ---
    doctype: string
        The document type you wish to retrieve
    num: int
        The number of documents to retrieve
    by_field: string
        The _datetime field by which to determine the
        last document
    query : string (default None)
        An Elasticsearch string query to filter results.
        Example: query="user.screen_name:google"

    Returns
    ---
    The list of search hits, sorted descending on `by_field`. An empty list
    is returned when no database is available, when `by_field` is not found
    in the mapping of the doctype, or when the mapping of the doctype cannot
    be resolved.
    '''
    if not _DATABASE_AVAILABLE:
        _logger.warning(
            "Could not get last documents: No database instance available")
        return []

    # Nested fields live under `.properties.` in the mapping, so translate
    # the dotted field name into the matching mapping path.
    exotic_by_field = by_field.replace('.', '.properties.')
    _logger.debug("looking for {exotic_by_field}".format(
        exotic_by_field=exotic_by_field))
    mapping = _client.indices.get_mapping()
    _logger.debug("Got mapping {mapping}".format(mapping=mapping))
    target_key = "{_elastic_index}.mappings.{doctype}.properties.{exotic_by_field}".format(
        _elastic_index=_elastic_index,
        doctype=doctype,
        exotic_by_field=exotic_by_field)
    _logger.debug("Target key: {target_key}".format(target_key=target_key))
    found_mapping = _dotkeys(mapping, target_key)
    _logger.debug(
        "found mapping: {found_mapping}".format(found_mapping=found_mapping))
    if not found_mapping:
        # Sorting on a field that is not mapped yet cannot give a result.
        _logger.debug("Mapping not seen yet")
        return []

    # Resolve the mapping once instead of re-querying it for every branch.
    mapping_kind = check_mapping(doctype)
    field = {
        "mixed_mapping": "doctype.keyword",
        "new_mapping": "doctype",
    }.get(mapping_kind)
    if field is None:
        # Covers both a missing mapping (None) and any unrecognised result,
        # which previously left `field` unbound and crashed below.
        _logger.warning(
            "Could not find mapping of doctype, please check whether you are using the correct doctype"
        )
        return []

    doctype_term = {"term": {field: doctype}}
    body = {
        "sort": [{
            by_field: {
                "order": "desc"
            }
        }],
        "size": num,
        "query": doctype_term
    }

    if query:
        _logger.debug("adding string query: {query}".format(query=query))
        # Keep the doctype restriction and apply the string query on top of
        # it, so `query` narrows the documents of this doctype.
        body['query'] = {
            'bool': {
                'must': [{'query_string': {'query': query}}],
                'filter': [doctype_term]
            }
        }

    # Send the body built above; previously a second, query-less dict was
    # sent here, so the `query` argument never had any effect.
    docs = _client.search(index=_elastic_index,
                          body=body).get('hits', {}).get('hits', [""])

    return docs