Example #1
 def authors(num_columns=None):
     """Returns list of published light Author objects.
     
     @returns: list
     """
     KEY = 'encyc-front:authors'
     TIMEOUT = 60*5
     data = cache.get(KEY)
     if not data:
         s = Search(doc_type='authors')[0:MAX_SIZE]
         s = s.sort('title_sort')
         s = s.fields([
             'url_title',
             'title',
             'title_sort',
             'published',
             'modified',
         ])
         response = s.execute()
         data = [
             Author(
                 url_title  = hitvalue(hit, 'url_title'),
                 title      = hitvalue(hit, 'title'),
                 title_sort = hitvalue(hit, 'title_sort'),
                 published  = hitvalue(hit, 'published'),
                 modified   = hitvalue(hit, 'modified'),
             )
             for hit in response
             if hitvalue(hit, 'published')
         ]
         cache.set(KEY, data, TIMEOUT)
     if num_columns:
         return _columnizer(data, num_columns)
     return data
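
Note: these examples rely on a hitvalue() helper that is not shown. With the
legacy Search.fields() API every stored field comes back as a list, even for
single-valued fields, so a minimal sketch of such a helper (an assumption,
not the codebase's actual implementation) might look like this:

def hitvalue(hit, fieldname):
    """Return the first value for fieldname, or None if it is absent.

    Sketch only: assumes hit supports dict-style get(), as in Example #2.
    """
    values = hit.get(fieldname)
    if values:
        return values[0]
    return None
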
Example #2
 def pages():
     """Returns list of published light Page objects.
     
     @returns: list
     """
     KEY = 'encyc-front:pages'
     TIMEOUT = 60*5
     data = cache.get(KEY)
     if not data:
         s = Search(doc_type='articles').filter('term', published_encyc=True)[0:MAX_SIZE]
         s = s.sort('title_sort')
         s = s.fields([
             'url_title',
             'title',
             'title_sort',
             'published',
             'modified',
             'categories',
         ])
         response = s.execute()
         data = [
             Page(
                 url_title  = hitvalue(hit, 'url_title'),
                 title      = hitvalue(hit, 'title'),
                 title_sort = hitvalue(hit, 'title_sort'),
                 published  = hitvalue(hit, 'published'),
                 modified   = hitvalue(hit, 'modified'),
                 categories = hit.get('categories', []),
                )
             for hit in response
             if hitvalue(hit, 'published')
         ]
         cache.set(KEY, data, TIMEOUT)
     return data
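
Examples #1 and #2 (and #9 below) share the same cache-aside pattern: try the
cache, rebuild the data from Elasticsearch on a miss, then store it with a
timeout. A sketch of that pattern as a reusable decorator, assuming the same
Django-style cache object with get()/set() used above:

from functools import wraps

def cached(key, timeout=60 * 5):
    """Cache-aside decorator (sketch; `cache` is assumed, as above)."""
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            data = cache.get(key)              # 1. try the cache
            if not data:
                data = func(*args, **kwargs)   # 2. rebuild on a miss
                cache.set(key, data, timeout)  # 3. store with a timeout
            return data
        return wrapper
    return decorator
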
Example #3
def session_times():
    # {"@fields": {}, "@timestamp": "2015-02-23T17:03:41.738412Z", "@source_host": "newair.brainbot.com", "@message": "scenario.p2p_connect.started"}

    start_message = 'scenario.p2p_connect.starting.clients.sequentially'
    stop_message = 'scenario.p2p_connect.stopping.clients'
    s = Search(client)
    s = s.filter('bool',
                 should=[F('term', message=start_message),
                         F('term', message=stop_message)])
    s = s.fields(['message', '@timestamp'])
    s = s[0:100000]
    s = s.sort('-@timestamp')  # descending: we want the latest events
    response = s.execute()

    events = []  # youngest to oldest; the newest (first) entry should be a stop
    for h in response:
        msg = 'start' if h['message'][0] == start_message else 'stop'
        ts = h['@timestamp'][0]
        events.append((msg, ts))
    assert not events or events[0][0] == 'stop'
    sessions = []
    while len(events) >= 2:
        stop = events.pop()
        start = events.pop()
        sessions.append(dict([start, stop]))
    return list(reversed(sessions))
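
A worked example of the pairing loop above, with made-up timestamps. The
events list is sorted newest-first, pop() consumes it oldest-first, and the
'start'/'stop' keys come from the event tuples themselves, so each session
dict pairs a start with the stop that follows it:

events = [('stop', 't4'), ('start', 't3'), ('stop', 't2'), ('start', 't1')]
sessions = []
while len(events) >= 2:
    stop = events.pop()   # oldest remaining event
    start = events.pop()  # the event logged just after it
    sessions.append(dict([start, stop]))
assert sessions == [{'start': 't1', 'stop': 't2'}, {'start': 't3', 'stop': 't4'}]
assert list(reversed(sessions))[0] == {'start': 't3', 'stop': 't4'}
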
Example #4
 def locations():
     """Returns list of Location objects.
     
     @returns: list
     """
     s = Search(doc_type='location')[0:MAX_SIZE]
     s = s.sort('id')
     s = s.fields([
         'id',
         'category',
         'title',
         'location_name',
         'description',
         'lat',
         'lng',
         'resource_uri',
         'location_uri',
         'location_url',
     ])
     response = s.execute()
     return [
         Location(
             id = hitvalue(hit, 'id'),
             category = hitvalue(hit, 'category'),
             title = hitvalue(hit, 'title'),
             location_name = hitvalue(hit, 'location_name'),
             description = hitvalue(hit, 'description'),
             lat = hitvalue(hit, 'lat'),
             lng = hitvalue(hit, 'lng'),
             resource_uri = hitvalue(hit, 'resource_uri'),
             location_uri = hitvalue(hit, 'location_uri'),
             location_url = hitvalue(hit, 'location_url'),
         )
         for hit in response
     ]
Example #5
def session_times():
    # {"@fields": {}, "@timestamp": "2015-02-23T17:03:41.738412Z", "@source_host": "newair.brainbot.com", "@message": "scenario.p2p_connect.started"}

    start_message = 'scenario.p2p_connect.starting.clients'
    stop_message = 'scenario.p2p_connect.stopping.clients'
    s = Search(client)
    s = s.filter('bool',
                 should=[
                     F('term', at_message=start_message),
                     F('term', at_message=stop_message)
                 ])
    s = s.fields(['@message', '@timestamp'])
    s = s[0:100000]
    s = s.sort('-@timestamp')  # descending: we want the latest events
    response = s.execute()

    events = []  # youngest to oldest; the newest (first) entry should be a stop
    for h in response:
        msg = 'start' if h['@message'][0] == start_message else 'stop'
        ts = h['@timestamp'][0]
        events.append((msg, ts))
    assert not events or events[0][0] == 'stop'
    sessions = []
    while len(events) >= 2:
        stop = events.pop()
        start = events.pop()
        sessions.append(dict([start, stop]))
    return list(reversed(sessions))
Example #6
def cmd_list_crashids(ctx, field):
    """
    List crash ids for crash reports that contain a specified field.
    """

    es_conn = get_es_conn()
    indices = es_conn.get_indices()

    click.echo("# %s indexes." % len(indices))
    click.echo("# %r" % indices)
    total = 0
    for index in indices:
        click.echo("# working on %s..." % index)
        with es_conn() as conn:
            search = Search(using=conn,
                            index=index,
                            doc_type=es_conn.get_doctype())
            search = search.filter("exists", field=field)
            search = search.fields(["processed_crash.uuid"])
            results = search.scan()
            for hit in results:
                print(
                    json.dumps({
                        "crashid": hit["processed_crash.uuid"][0],
                        "index": index
                    }))
                total += 1

    click.echo("# total found %d" % total)
Example #7
def cmd_list_crashids(field):
    es_conn = get_es_conn()
    indices = es_conn.get_indices()

    print("# %s indexes." % len(indices))
    print("# %r" % indices)
    total = 0
    for index in indices:
        print("# working on %s..." % index)
        with es_conn() as conn:
            search = Search(using=conn,
                            index=index,
                            doc_type=es_conn.get_doctype())
            search = search.filter("exists", field=field)
            search = search.fields(["processed_crash.uuid"])
            results = search.scan()
            for hit in results:
                print(
                    json.dumps({
                        "crashid": hit["processed_crash.uuid"][0],
                        "index": index
                    }))
                total += 1

    print("# total found %d" % total)
Example #8
 def events():
     """Returns list of Event objects.
     
     @returns: list
     """
     s = Search(doc_type='events')[0:MAX_SIZE]
     s = s.sort('start_date')
     s = s.fields([
         'id',
         'published',
         'title',
         'description',
         'start_date',
         'end_date',
         'article_title',
         'resource_uri',
     ])
     response = s.execute()
     data = [
         Event(
             id = hit.meta.id,
             published = hitvalue(hit, 'published'),
             title = hitvalue(hit, 'title'),
             description = hitvalue(hit, 'description'),
             start_date = hitvalue(hit, 'start_date'),
             end_date = hitvalue(hit, 'end_date'),
             article_title = hitvalue(hit, 'article_title'),
             resource_uri = hitvalue(hit, 'resource_uri'),
         )
         for hit in response
     ]
     return data
Example #9
 def sources():
     """Returns list of published light Source objects.
     
     @returns: list
     """
     KEY = 'encyc-front:sources'
     TIMEOUT = 60*5
     data = cache.get(KEY)
     if not data:
         s = Search(doc_type='sources')[0:MAX_SIZE]
         s = s.sort('encyclopedia_id')
         s = s.fields([
             'encyclopedia_id',
             'published',
             'modified',
             'headword',
             'media_format',
             'img_path',
         ])
         response = s.execute()
         data = [
             Source(
                 encyclopedia_id = hitvalue(hit, 'encyclopedia_id'),
                 published = hitvalue(hit, 'published'),
                 modified = hitvalue(hit, 'modified'),
                 headword = hitvalue(hit, 'headword'),
                 media_format = hitvalue(hit, 'media_format'),
                 img_path = hitvalue(hit, 'img_path'),
                )
             for hit in response
             if hitvalue(hit, 'published')
         ]
         cache.set(KEY, data, TIMEOUT)
     return data
Example #10
    def get_update_list_single_process(self):
        """ Find units that needs updating and their sidstopdateret (last updated)
        the sidstopdateret may be inaccurate and thus way to far back in time therefore we cannot use take the largest
        of sidstopdateret from the database. Seems we download like 600 dicts a second with match_all.
        Should take around 2 hours and 30 minuttes then. This takes 30 so i need to save half an hour on downloads.

        :return datetime (min sidstopdateret), list (enhedsnumer, sidstopdateret)
        """
        enh_samtid_map = self.make_samtid_dict()
        oldest_sidstopdateret = datetime.datetime.utcnow().replace(
            tzinfo=pytz.utc) + datetime.timedelta(days=1)
        update_dicts = {
            x: {
                'units': [],
                'sidstopdateret': oldest_sidstopdateret
            }
            for x in self.source_keymap.values()
        }
        if len(enh_samtid_map) == 0:
            return update_dicts
        dummy = CvrConnection.update_info(samtid=-1,
                                          sidstopdateret=self.dummy_date)
        print('Get update time for all data')

        for _type in self.source_keymap.values():
            search = Search(using=self.elastic_client, index=self.index)
            search = search.query('match_all')
            sidst_key = '{0}.sidstOpdateret'.format(_type)
            samt_key = '{0}.samtId'.format(_type)
            field_list = ['_id', sidst_key, samt_key]
            # field_list = ['_id'] + ['{0}.sidstOpdateret'.format(key) for key in self.source_keymap.values()] + \
            #          ['{0}.samtId'.format(key) for key in self.source_keymap.values()]
            search = search.fields(fields=field_list)
            params = {'scroll': self.elastic_search_scroll_time, 'size': 2**12}
            search = search.params(**params)
            print('ElasticSearch Query: ', search.to_dict())
            generator = search.scan()
            for cvr_update in tqdm.tqdm(generator):
                enhedsnummer = int(cvr_update.meta.id)
                raw_dat = cvr_update.to_dict()
                samtid = raw_dat[samt_key][0] if samt_key in raw_dat else None
                sidstopdateret = raw_dat[sidst_key][
                    0] if sidst_key in raw_dat else None
                if sidstopdateret is None or samtid is None:
                    continue
                current_update = enh_samtid_map[
                    enhedsnummer] if enhedsnummer in enh_samtid_map else dummy
                if samtid > current_update.samtid:
                    utc_sidstopdateret = utc_transform(sidstopdateret)
                    update_dicts[_type]['sidstopdateret'] = min(
                        utc_sidstopdateret,
                        update_dicts[_type]['sidstopdateret'])
                    update_dicts[_type]['units'].append(
                        (enhedsnummer, utc_sidstopdateret))
                    # break
        print('Update Info: ')
        print([(k, v['sidstopdateret'], len(v['units']))
               for k, v in update_dicts.items()])
        return update_dicts
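
The scan above tunes the scroll through params(): 'scroll' sets how long each
scroll context stays alive between requests, and 'size' sets how many
documents each scroll batch asks for (2**12 = 4096 here). A stripped-down
sketch, with the connection and index name assumed:

search = Search(using=elastic_client, index='cvr-permanent')  # index name is illustrative
search = search.query('match_all')
search = search.fields(fields=['_id'])
search = search.params(scroll='10m', size=4096)
for doc in search.scan():
    pass  # each document's id is available as doc.meta.id
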
Example #11
def fetch(session):
    s = Search(client)
    s = s.filter('bool',
                 should=[F('term', message='p2p.disconnected'),
                         F('term', message='p2p.connected')])
    s = s.filter('range', **{'@timestamp': dict(gte=session['start'], lte=session['stop'])})
    s = s.fields(['json_message.p2p.connected.remote_id', 'guid', 'message', '@timestamp'])
    s = s[0:100000]
    # s = s[0:10]
    s = s.sort('@timestamp')
    response = s.execute()
    return response
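
The **{'@timestamp': ...} unpacking in fetch() is needed because '@timestamp'
is not a valid Python identifier and so cannot be passed as a plain keyword
argument. The same trick works for any awkward field name (dates here are
illustrative):

from elasticsearch_dsl import Search

s = Search()
s = s.filter('range', **{'@timestamp': {'gte': '2015-02-23', 'lte': '2015-02-24'}})
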
Example #12
def cmd_list_crashids(ctx, index):
    """List crashids for index."""
    es_conn = get_conn()
    with es_conn() as conn:
        search = Search(
            using=conn,
            index=index,
            doc_type=es_conn.get_doctype(),
        )
        search = search.fields("processed_crash.uuid")
        results = search.execute()
        click.echo("Crashids in %s:" % index)
        for hit in results:
            click.echo(hit["processed_crash.uuid"][0])
Example #13
 def search(self):
     """
     Construct the Search object.
     """
     s = Search(doc_type=self.doc_types, using=es.client,
                index=es.index_name)
     # don't return any fields, just the metadata
     s = s.fields([])
     # Sort from parameters
     s = s.sort(*self.sorts)
     # Paginate from parameters
     s = s[self.page_start:self.page_end]
     # Same construction as parent class
     # Allows to give the same signature as simple search
     # ie. Response(data) instead of Response(search, data)
     return s.response_class(partial(SearchResult, self))
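
Passing an empty list to fields(), as search() does above, asks Elasticsearch
to return no stored fields at all; each hit then carries only its metadata.
A minimal illustration, with the index name and a configured default
connection assumed:

from elasticsearch_dsl import Search

s = Search(index='documents').fields([])  # index name is illustrative
for hit in s[0:10].execute():
    print(hit.meta.id, hit.meta.score)    # metadata is still available
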
Example #14
def fetch(session):
    s = Search(client)
    s = s.filter('bool',
                 should=[
                     F('term', at_message='p2p.disconnected'),
                     F('term', at_message='p2p.connected')
                 ])
    s = s.filter(
        'range',
        **{'@timestamp': dict(gte=session['start'], lte=session['stop'])})
    s = s.fields(['@fields.remote_id', 'guid', '@message', '@timestamp'])
    s = s[0:100000]
    # s = s[0:10]
    s = s.sort('@timestamp')
    response = s.execute()
    return response
Example #15
 def mapcategories(num_columns=None):
     """Returns list of MapCategory objects.
     
     @returns: list
     """
     s = Search(doc_type='mapcategory')[0:MAX_SIZE]
     s = s.sort('id')
     s = s.fields([
         'id',
         'title',
     ])
     response = s.execute()
     return [
         MapCategory(
             id = hitvalue(hit, 'id'),
             title = hitvalue(hit, 'title'),
         )
         for hit in response
     ]
Example #16
def search(hosts, index, query):
    logging.info('query: "%s"' % query)
    
    s = Search().doc_type(models.Record)
    s = s.fields(definitions.FIELDS_MASTER)
    s = s.sort('m_pseudoid')
    s = s.query(
        'multi_match', query=query, fields=definitions.FIELDS_MASTER
    )[0:10000]
    response = s.execute()
    records = [models.Record.from_hit(hit) for hit in response]

    logging.info('%s records' % len(records))
    for record in records:
        logging.info(record)
    # if there is only a single result, display all of its fields
    if len(records) == 1:
        record = records[0]
        for field in definitions.FIELDS_MASTER:
            logging.info('%s: %s' % (field, getattr(record, field)))
Example #17
    def find_missing(self):
        """
        Check if we are missing anything

        :return:
        """
        search = Search(using=self.elastic_client, index=self.index)
        search = search.query('match_all')
        field_list = ['_id']
        search = search.fields(fields=field_list)
        params = {
            'scroll': self.elastic_search_scroll_time,
            'size': 2 * self.elastic_search_scan_size
        }
        search = search.params(**params)
        print('ElasticSearch Query: ', search.to_dict())
        generator = search.scan()
        ids = [x.meta.id for x in generator]
        return ids
Example #18
def GetGeneratorQuery(case, endpoint_id, start, length, str_query, sort, order):
	s = Search()
	s = s[int(start):int(length)+int(start)]
	s = s.fields([	"Record.Path",
				    "Record.Url",
				    "Record.SourceUrl",
				    "Record.TlnTime",
				    "Record.File.Accessed",
				    "Record.File.Modified",
				    "Record.File.Changed",
				    "AuditType.Generator"
				])

	order_dict = {
		"0": "TlnTime"
	}

	if str_query == "":
		_sort = {
			"Record.{0}".format(order_dict[str(sort)]): {
				"order": order
			}
		}

		t = Q('query_string', default_field="Record.TlnTime", query="*") & Q('match', ComputerName=endpoint_id) & ~Q('match', AuditType__Generator="w32processes-memory") & ~Q('match', AuditType__Generator="w32useraccounts")
		query = s.query(t).filter('term', CaseInfo__case_name=case).sort(_sort)

	else:
		_sort = {
			"Record.{0}".format(order_dict[str(sort)]): {
				"order": order
			}
		}
		
		t = Q('query_string', default_field="Record.TlnTime", query="*") & Q('match', ComputerName=endpoint_id) & ~Q('match', AuditType__Generator="w32processes-memory") & ~Q('match', AuditType__Generator="w32useraccounts") & Q('query_string', fields=[
					"Record.Path",
				    "Record.Url",
				    "Record.SourceUrl",
				    "AuditType.Generator"], query="{0}*".format(str_query))
		query = s.query(t).filter('term', CaseInfo__case_name=case).sort(_sort)

	return query.to_dict()
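
GetGeneratorQuery leans on elasticsearch_dsl's Q combinators: & combines
queries into a bool query, ~ wraps a query in must_not, and double
underscores in keyword arguments map to dotted field names, so
AuditType__Generator targets the AuditType.Generator field. A small sketch
(the values are made up):

from elasticsearch_dsl import Q

q = Q('match', ComputerName='endpoint-1') & ~Q('match', AuditType__Generator='w32useraccounts')
print(q.to_dict())  # a bool query with must and must_not clauses
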
Example #19
def search(
        hosts, index, query_type='multi_match', query='', filters={},
        sort='m_pseudoid', start=0, pagesize=10
):
    """Constructs Search object
    
    Note: allows any combination of filters, even illogical ones
    
    @param hosts: list settings.DOCSTORE_HOSTS
    @param index: elasticsearch_dsl.Index
    @param query_type: str Name of query type.
    @param query: str Query string.
    @param filters: dict Filters and their arguments.
    @param sort: str Name of field on which to sort.
    @param start: int Start of result set.
    @param pagesize: int Number of records to return.
    @returns: elasticsearch_dsl.Search
    """
    ## remove empty filter args
    #filter_args = {key:val for key,val in filters.items() if val}
    #if not (query or filter_args):
    #    return None,[]
    s = Search(using=ES, index=index)
    s = s.doc_type(Record)
    if filters:
        for field,values in filters.items():
            if values:
                # multiple terms for a field are OR-ed
                s = s.filter('terms', **{field: values})
    if query:
        s = s.query(
            query_type, query=query, fields=definitions.FIELDS_MASTER
        )
    # aggregations
    if filters:
        for field in filters.keys():
            s.aggs.bucket(field, 'terms', field=field, size=1000)
    s = s.fields(definitions.FIELDS_MASTER)
    s = s.sort(sort)
    s = s[start:start+pagesize]
    return s
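
A hypothetical call to the search() function above (field names and values
are illustrative); the returned Search can then be executed and its
aggregations inspected:

s = search(
    hosts=hosts,
    index='records',
    query='tanaka',
    filters={'m_camp': ['9-rohwer']},
    sort='m_pseudoid',
    start=0,
    pagesize=25,
)
response = s.execute()
print(response.hits.total)
for bucket in response.aggregations.m_camp.buckets:
    print(bucket.key, bucket.doc_count)
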
Example #20
def update_time_worker(args):
    _type = args[0]
    url = args[1]
    user = args[2]
    password = args[3]
    index = args[4]
    enh_samtid_map = CvrConnection.make_samtid_dict()
    dummy_date = datetime.datetime(year=1001, month=1, day=1, tzinfo=pytz.utc)
    dummy = CvrConnection.update_info(samtid=-1, sidstopdateret=dummy_date)
    oldest_sidstopdateret = datetime.datetime.utcnow().replace(
        tzinfo=pytz.utc) + datetime.timedelta(days=1)
    type_dict = {'units': [], 'sidstopdateret': oldest_sidstopdateret}
    if len(enh_samtid_map) == 0:
        return type_dict
    elastic_client = create_elastic_connection(url, (user, password))
    search = Search(using=elastic_client, index=index).query('match_all')
    sidst_key = '{0}.sidstOpdateret'.format(_type)
    samt_key = '{0}.samtId'.format(_type)
    field_list = ['_id', sidst_key, samt_key]
    search = search.fields(fields=field_list)
    params = {'scroll': '10m', 'size': 2**12}
    search = search.params(**params)
    print('ElasticSearch Query: ', search.to_dict())
    generator = search.scan()
    for cvr_update in generator:
        enhedsnummer = int(cvr_update.meta.id)
        raw_dat = cvr_update.to_dict()
        samtid = raw_dat[samt_key][0] if samt_key in raw_dat else None
        sidstopdateret = raw_dat[sidst_key][0] if sidst_key in raw_dat else None
        if sidstopdateret is None or samtid is None:
            continue
        current_update = enh_samtid_map[
            enhedsnummer] if enhedsnummer in enh_samtid_map else dummy
        if samtid > current_update.samtid:
            utc_sidstopdateret = utc_transform(sidstopdateret)
            type_dict['sidstopdateret'] = min(utc_sidstopdateret,
                                              type_dict['sidstopdateret'])
            type_dict['units'].append((enhedsnummer, utc_sidstopdateret))
    return _type, type_dict
Example #21
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Require that the list of fields be passed.
        if not kwargs.get('_fields'):
            raise MissingArgumentError('_fields')
        self.all_fields = kwargs['_fields']
        self._build_fields()

        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = []
        histogram_intervals = {}

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:
                if param.name.startswith('_'):
                    # By default, all param values are turned into lists,
                    # even when they have and can have only one value.
                    # For those we know there can only be one value,
                    # so we just extract it from the made-up list.
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                        if results_number > 1000:
                            raise BadArgumentError('_results_number too large')
                    elif param.name == '_facets_size':
                        facets_size = param.value[0]

                    for f in self.histogram_fields:
                        if param.name == '_histogram_interval.%s' % f:
                            histogram_intervals[f] = param.value[0]

                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]

                name = '%s.%s' % (
                    field_data['namespace'],
                    field_data['in_database_name']
                )

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                # Operators needing wildcards, and the associated value
                # transformation with said wildcards.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '$': '%s*',  # starts with
                    '^': '*%s'  # ends with
                }
                # Operators needing ranges, and the associated Elasticsearch
                # comparison operator.
                operator_range = {
                    '>': 'gt',
                    '<': 'lt',
                    '>=': 'gte',
                    '<=': 'lte',
                }

                args = {}
                filter_type = 'term'
                filter_value = None

                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]

                        if not isinstance(val, basestring) or ' ' not in val:
                            # There's only one term and no white space, this
                            # is a simple term filter.
                            filter_value = val
                        else:
                            # If the term contains white spaces, we want to
                            # perform a phrase query.
                            filter_type = 'query'
                            args = Q(
                                'simple_query_string',
                                query=param.value[0],
                                fields=[name],
                                default_operator='and',
                            ).to_dict()
                    else:
                        # There are several terms, this is a terms filter.
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_range:
                    filter_type = 'range'
                    filter_value = {
                        operator_range[param.operator]: param.value
                    }
                elif param.operator == '__null__':
                    filter_type = 'missing'
                    args['field'] = name
                elif param.operator in operator_wildcards:
                    filter_type = 'query'

                    # Wildcard operations are better applied to a non-analyzed
                    # field (called "full") if there is one.
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    q_args = {}
                    q_args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                    query = Q('wildcard', **q_args)
                    args = query.to_dict()

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    new_filter = F(filter_type, **args)
                    if param.operator_not:
                        new_filter = ~new_filter

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif filter_type == 'range':
                        sub_filters &= new_filter
                    else:
                        sub_filters |= new_filter

                    continue

            if sub_filters is not None:
                filters.append(sub_filters)

        search = search.filter(F('bool', must=filters))

        # Restricting returned fields.
        fields = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product and descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value, full=False)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        for param in params['_facets']:
            for value in param.value:
                if not value:
                    continue

                field_name = self.get_field_name(value)
                search.aggs.bucket(
                    value,
                    'terms',
                    field=field_name,
                    size=facets_size,
                )

        # Create signature aggregations.
        if params.get('_aggs.signature'):
            sig_bucket = A(
                'terms',
                field=self.get_field_name('signature'),
                size=facets_size,
            )
            for param in params['_aggs.signature']:
                for value in param.value:
                    if not value:
                        continue

                    if value.startswith('_histogram.'):
                        # This is a histogram aggregation we want to run,
                        # not a terms aggregation.
                        field_name = value[len('_histogram.'):]
                        if field_name not in self.histogram_fields:
                            continue

                        histogram_type = (
                            self.all_fields[field_name]['query_type'] == 'date'
                            and 'date_histogram' or 'histogram'
                        )
                        sig_bucket.bucket(
                            'histogram_%s' % field_name,
                            histogram_type,
                            field=self.get_field_name(field_name),
                            interval=histogram_intervals[field_name],
                        )
                    else:
                        sig_bucket.bucket(
                            value,
                            'terms',
                            field=self.get_field_name(value),
                            size=facets_size,
                        )

            search.aggs.bucket('signature', sig_bucket)

        # Create histograms.
        for f in self.histogram_fields:
            if params.get('_histogram.%s' % f):
                histogram_type = (
                    self.all_fields[f]['query_type'] == 'date'
                    and 'date_histogram' or 'histogram'
                )
                date_bucket = A(
                    histogram_type,
                    field=self.get_field_name(f),
                    interval=histogram_intervals[f],
                )
                for param in params['_histogram.%s' % f]:
                    for value in param.value:
                        if not value:
                            continue

                        field_name = self.get_field_name(value)
                        val_bucket = A(
                            'terms',
                            field=field_name,
                            size=facets_size,
                        )
                        date_bucket.bucket(value, val_bucket)

                search.aggs.bucket('histogram_%s' % f, date_bucket)

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
                aggregations = self.format_aggregations(results.aggregations)
                break  # Yay! Results!
            except NotFoundError, e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    break
Example #22
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Require that the list of fields be passed.
        if not kwargs.get("_fields"):
            raise MissingArgumentError("_fields")
        self.all_fields = kwargs["_fields"]

        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params["date"])

        if "%" in self.context.get_index_template():
            # If the index template is date-centric, remove indices before the retention
            # policy because they're not valid to search through and probably don't
            # exist
            policy = datetime.timedelta(
                weeks=self.context.get_retention_policy())
            template = self.context.get_index_template()
            indices = prune_invalid_indices(indices, policy, template)

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.context.get_doctype(),
        )

        # Create filters.
        filters = []
        histogram_intervals = {}

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:
                if param.name.startswith("_"):
                    # By default, all param values are turned into lists,
                    # even when they have and can have only one value.
                    # For those we know there can only be one value,
                    # so we just extract it from the made-up list.
                    if param.name == "_results_offset":
                        results_from = param.value[0]
                    elif param.name == "_results_number":
                        results_number = param.value[0]
                        if results_number > 1000:
                            raise BadArgumentError(
                                "_results_number",
                                msg=("_results_number cannot be greater "
                                     "than 1,000"),
                            )
                        if results_number < 0:
                            raise BadArgumentError(
                                "_results_number",
                                msg="_results_number cannot be negative",
                            )
                    elif param.name == "_facets_size":
                        facets_size = param.value[0]
                        # Why cap it?
                        # Because if the query is covering a lot of different
                        # things you can get a really really large query
                        # which can hog resources excessively.
                        # Downloading, as an example, 100k facets (and 0 hits)
                        # when there is plenty of data yields an 11MB JSON
                        # file.
                        if facets_size > 10000:
                            raise BadArgumentError(
                                "_facets_size greater than 10,000")

                    for f in self.histogram_fields:
                        if param.name == "_histogram_interval.%s" % f:
                            histogram_intervals[f] = param.value[0]

                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]
                name = self.get_full_field_name(field_data)

                if param.data_type in ("date", "datetime"):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == "enum":
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == "str" and not param.operator:
                    param.value = [x.lower() for x in param.value]

                # Operators needing wildcards, and the associated value
                # transformation with said wildcards.
                operator_wildcards = {
                    "~": "*%s*",  # contains
                    "^": "%s*",  # starts with
                    "$": "*%s",  # ends with
                }
                # Operators needing ranges, and the associated Elasticsearch
                # comparison operator.
                operator_range = {
                    ">": "gt",
                    "<": "lt",
                    ">=": "gte",
                    "<=": "lte"
                }

                args = {}
                filter_type = "term"
                filter_value = None

                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]

                        if not isinstance(val, str) or " " not in val:
                            # There's only one term and no white space, this
                            # is a simple term filter.
                            filter_value = val
                        else:
                            # If the term contains white spaces, we want to
                            # perform a phrase query.
                            filter_type = "query"
                            args = Q(
                                "simple_query_string",
                                query=param.value[0],
                                fields=[name],
                                default_operator="and",
                            ).to_dict()
                    else:
                        # There are several terms, this is a terms filter.
                        filter_type = "terms"
                        filter_value = param.value
                elif param.operator == "=":
                    # is exactly
                    if field_data["has_full_version"]:
                        name = "%s.full" % name
                    filter_value = param.value
                elif param.operator in operator_range:
                    filter_type = "range"
                    filter_value = {
                        operator_range[param.operator]: param.value
                    }
                elif param.operator == "__null__":
                    filter_type = "missing"
                    args["field"] = name
                elif param.operator == "__true__":
                    filter_type = "term"
                    filter_value = True
                elif param.operator == "@":
                    filter_type = "regexp"
                    if field_data["has_full_version"]:
                        name = "%s.full" % name
                    filter_value = param.value
                elif param.operator in operator_wildcards:
                    filter_type = "query"

                    # Wildcard operations are better applied to a non-analyzed
                    # field (called "full") if there is one.
                    if field_data["has_full_version"]:
                        name = "%s.full" % name

                    q_args = {}
                    q_args[name] = operator_wildcards[
                        param.operator] % param.value
                    query = Q("wildcard", **q_args)
                    args = query.to_dict()

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    new_filter = F(filter_type, **args)
                    if param.operator_not:
                        new_filter = ~new_filter

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif filter_type == "range":
                        sub_filters &= new_filter
                    else:
                        sub_filters |= new_filter

                    continue

            if sub_filters is not None:
                filters.append(sub_filters)

        search = search.filter(F("bool", must=filters))

        # Restricting returned fields.
        fields = []

        # We keep track of the requested columns in order to make sure we
        # return those column names and not aliases for example.
        self.request_columns = []
        for param in params["_columns"]:
            for value in param.value:
                if not value:
                    continue

                self.request_columns.append(value)
                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params["_sort"]:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product then descending version.
                desc = False
                if value.startswith("-"):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = "-" + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        if facets_size:
            self._create_aggregations(params, search, facets_size,
                                      histogram_intervals)

        # Query and compute results.
        hits = []

        if params["_return_query"][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {"query": search.to_dict(), "indices": indices}

        errors = []

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()

                aggregations = getattr(results, "aggregations", {})
                if aggregations:
                    aggregations = self.format_aggregations(aggregations)

                shards = getattr(results, "_shards", {})

                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                errors.append({
                    "type": "missing_index",
                    "index": missing_index
                })

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    shards = None
                    break
            except RequestError as exception:
                # Try to handle it gracefully if we can find out what
                # input was bad and caused the exception.
                try:
                    bad_input = ELASTICSEARCH_PARSE_EXCEPTION_REGEX.findall(
                        exception.error)[-1]
                    # Loop over the original parameters to try to figure
                    # out which *key* had the bad input.
                    for key, value in kwargs.items():
                        if value == bad_input:
                            raise BadArgumentError(key)
                except IndexError:
                    # Not an ElasticsearchParseException exception
                    pass

                # Re-raise the original exception
                raise

        if shards and shards.failed:
            # Some shards failed. We want to explain what happened in the
            # results, so the client can decide what to do.
            failed_indices = defaultdict(int)
            for failure in shards.failures:
                failed_indices[failure.index] += 1

            for index, shards_count in failed_indices.items():
                errors.append({
                    "type": "shards",
                    "index": index,
                    "shards_count": shards_count
                })

        return {
            "hits": hits,
            "total": total,
            "facets": aggregations,
            "errors": errors
        }
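
A note on F(): in elasticsearch_dsl before 2.0, filters (F) and queries (Q)
were separate objects, which is why these examples build filters with F and
negate them with ~. From 2.0 onward F was removed and Q serves both roles, so
a rough modern equivalent of the bool filter above would be (field and value
are illustrative):

from elasticsearch_dsl import Q, Search

search = Search()
search = search.filter(Q('bool', must=[Q('term', product='firefox')]))
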
Example #23
import elasticsearch
from elasticsearch_dsl import Search, Q, A

def GetGeneratorQuery(case, endpoint_id, start, length, str_query, sort, order):
	s = Search()
	s = s[int(start):int(length)+int(start)]
	s = s.fields([	"Record.Path",
				    "Record.Url",
				    "Record.SourceUrl",
				    "Record.TlnTime",
				    "Record.File.Accessed",
				    "Record.File.Modified",
				    "Record.File.Changed",
				    "AuditType.Generator"
				])

	order_dict = {
		"0": "TlnTime"
	}

	if str_query == "":
		_sort = {
			"Record.{0}".format(order_dict[str(sort)]): {
				"order": order
			}
		}

		t = Q('query_string', default_field="Record.TlnTime", query="*") & Q('match', ComputerName=endpoint_id) & ~Q('match', AuditType__Generator="w32processes-memory") & ~Q('match', AuditType__Generator="w32useraccounts")
		query = s.query(t).filter('term', CaseInfo__case_name=case).sort(_sort)

	else:
Example #24
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """

        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = None

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:

                if param.name.startswith('_'):
                    # By default, all param values are turned into lists,
                    # even when they have and can have only one value.
                    # For those we know there can only be one value,
                    # so we just extract it from the made-up list.
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                    elif param.name == '_facets_size':
                        facets_size = param.value[0]
                    elif param.name == '_histogram_interval.date':
                        histogram_interval_date = param.value[0]
                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]

                name = '%s.%s' % (
                    field_data['namespace'],
                    field_data['in_database_name']
                )

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                args = {}
                filter_type = 'term'
                filter_value = None
                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]
                        if not isinstance(val, basestring) or (
                            isinstance(val, basestring) and ' ' not in val
                        ):
                            filter_value = val

                        # If the term contains white spaces, we want to perform
                        # a phrase query. Thus we do nothing here and let this
                        # value be handled later.
                    else:
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator == '>':
                    # greater than
                    filter_type = 'range'
                    filter_value = {
                        'gt': param.value
                    }
                elif param.operator == '<':
                    # lower than
                    filter_type = 'range'
                    filter_value = {
                        'lt': param.value
                    }
                elif param.operator == '>=':
                    # greater than or equal to
                    filter_type = 'range'
                    filter_value = {
                        'gte': param.value
                    }
                elif param.operator == '<=':
                    # lower than or equal to
                    filter_type = 'range'
                    filter_value = {
                        'lte': param.value
                    }
                elif param.operator == '__null__':
                    # is null
                    filter_type = 'missing'
                    args['field'] = name

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    if param.operator_not:
                        new_filter = ~F(filter_type, **args)
                    else:
                        new_filter = F(filter_type, **args)

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif param.data_type == 'enum':
                        sub_filters |= new_filter
                    else:
                        sub_filters &= new_filter

                    continue

                # These use a wildcard and thus need to be in a query
                # instead of a filter.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '$': '%s*',  # starts with
                    '^': '*%s'  # ends with
                }
                if param.operator in operator_wildcards:
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    query_type = 'wildcard'
                    args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                elif not param.operator:
                    # This is a phrase that was passed down.
                    query_type = 'simple_query_string'
                    args['query'] = param.value[0]
                    args['fields'] = [name]
                    args['default_operator'] = 'and'

                if args:
                    query = Q(query_type, **args)
                    if param.operator_not:
                        query = ~query
                    search = search.query(query)
                else:
                    # If we reach this point, that means the operator is
                    # not supported, and we should raise an error about that.
                    raise NotImplementedError(
                        'Operator %s is not supported' % param.operator
                    )

            if filters is None:
                filters = sub_filters
            elif sub_filters is not None:
                filters &= sub_filters

        search = search.filter(filters)

        # Restricting returned fields.
        fields = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product and descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value, full=False)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        for param in params['_facets']:
            for value in param.value:
                if not value:
                    continue

                field_name = self.get_field_name(value)
                search.aggs.bucket(
                    value,
                    'terms',
                    field=field_name,
                    size=facets_size,
                )

        # Create signature aggregations.
        if params.get('_aggs.signature'):
            sig_bucket = A(
                'terms',
                field=self.get_field_name('signature'),
                size=facets_size,
            )
            for param in params['_aggs.signature']:
                for value in param.value:
                    if not value:
                        continue

                    field_name = self.get_field_name(value)
                    sig_bucket.bucket(
                        value,
                        'terms',
                        field=field_name,
                        size=facets_size,
                    )

            search.aggs.bucket('signature', sig_bucket)

        # Create date histograms.
        if params.get('_histogram.date'):
            date_bucket = A(
                'date_histogram',
                field=self.get_field_name('date'),
                interval=histogram_interval_date,
            )
            for param in params['_histogram.date']:
                for value in param.value:
                    if not value:
                        continue

                    field_name = self.get_field_name(value)
                    val_bucket = A(
                        'terms',
                        field=field_name,
                        size=facets_size,
                    )
                    date_bucket.bucket(value, val_bucket)

            search.aggs.bucket('histogram_date', date_bucket)

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
                aggregations = self.format_aggregations(results.aggregations)
                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    break
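A note on the filter combination logic used throughout these examples: values
of a single enum field are alternatives and are OR-ed together, while
constraints on different fields are AND-ed. A minimal sketch, assuming the
pre-2.x elasticsearch-dsl `F` helper these examples use (the field names here
are hypothetical):

    from elasticsearch_dsl import F

    # two values of one enum field: either may match
    enum_filter = F('term', product='firefox') | F('term', product='thunderbird')
    # a constraint on another field: must match as well
    combined = enum_filter & F('range', version={'gte': '42.0'})
    print(combined.to_dict())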
Example #25
0
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = None

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:

                if param.name.startswith('_'):
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]

                name = '%s.%s' % (field_data['namespace'],
                                  field_data['in_database_name'])

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                args = {}
                filter_type = 'term'
                filter_value = None
                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]
                        if not isinstance(val, basestring) or ' ' not in val:
                            filter_value = val

                        # If the term contains white spaces, we want to perform
                        # a phrase query. Thus we do nothing here and let this
                        # value be handled later.
                    else:
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator == '>':
                    # greater than
                    filter_type = 'range'
                    filter_value = {'gt': param.value}
                elif param.operator == '<':
                    # less than
                    filter_type = 'range'
                    filter_value = {'lt': param.value}
                elif param.operator == '>=':
                    # greater than or equal to
                    filter_type = 'range'
                    filter_value = {'gte': param.value}
                elif param.operator == '<=':
                    # less than or equal to
                    filter_type = 'range'
                    filter_value = {'lte': param.value}
                elif param.operator == '__null__':
                    # is null
                    filter_type = 'missing'
                    args['field'] = name

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    if param.operator_not:
                        new_filter = ~F(filter_type, **args)
                    else:
                        new_filter = F(filter_type, **args)

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif param.data_type == 'enum':
                        sub_filters |= new_filter
                    else:
                        sub_filters &= new_filter

                    continue

                # These use a wildcard and thus need to be in a query
                # instead of a filter.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '$': '%s*',  # starts with
                    '^': '*%s'  # ends with
                }
                if param.operator in operator_wildcards:
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    query_type = 'wildcard'
                    args[name] = (operator_wildcards[param.operator] %
                                  param.value)
                elif not param.operator:
                    # This is a phrase that was passed down.
                    query_type = 'simple_query_string'
                    args['query'] = param.value[0]
                    args['fields'] = [name]
                    args['default_operator'] = 'and'

                if args:
                    query = Q(query_type, **args)
                    if param.operator_not:
                        query = ~query
                    search = search.query(query)
                else:
                    # If we reach this point, that means the operator is
                    # not supported, and we should raise an error about that.
                    raise NotImplementedError('Operator %s is not supported' %
                                              param.operator)

            if filters is None:
                filters = sub_filters
            elif sub_filters is not None:
                filters &= sub_filters

        search = search.filter(filters)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        for param in params['_facets']:
            for value in param.value:
                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't facet on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot facet on it' % value)

                field_name = '%s.%s' % (field_['namespace'],
                                        field_['in_database_name'])

                if field_['has_full_version']:
                    # If the param has a full version, that means what matters
                    # is the full string, and not its individual terms.
                    field_name += '.full'

                search.aggs.bucket(value,
                                   'terms',
                                   field=field_name,
                                   size=self.config.facets_max_number)

        # Query and compute results.
        hits = []
        fields = [
            '%s.%s' % (x['namespace'], x['in_database_name'])
            for x in self.all_fields.values() if x['is_returned']
        ]
        search = search.fields(*fields)

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
                aggregations = self.format_aggregations(results.aggregations)
                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    break
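The phrase branch above ends up as a query rather than a filter. A minimal
sketch of the query it builds, with a hypothetical field name:

    from elasticsearch_dsl import Q

    q = Q(
        'simple_query_string',
        query='EMPTY: no crashing thread',
        fields=['processed_crash.signature'],
        default_operator='and',
    )
    # q.to_dict() is roughly:
    # {'simple_query_string': {'query': 'EMPTY: no crashing thread',
    #   'fields': ['processed_crash.signature'], 'default_operator': 'and'}}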
Example #26
0
File: utils.py Project: olabi/lore
def search_index(tokens=None, repo_slug=None, sort_by=None, terms=None):
    """
    Perform a search in Elasticsearch.

    Args:
        tokens (unicode): string of one or more words
        repo_slug (unicode): repository slug
        sort_by (string): field to sort by
        terms (dict): {"vocabulary name": ["term1" [, "term2"]]}
    Returns:
        results (SearchResults)
    """
    if terms is None:
        terms = {}

    search = Search(index=INDEX_NAME, doc_type=DOC_TYPE)

    # Limit returned fields since content_xml can be huge and is unnecessary.
    search = search.fields(_get_field_names())

    if tokens is not None:
        # Search on title, description, and content_xml (minus markup).
        multi = query.MultiMatch(
            query=tokens, fields=["title", "description", "content_stripped"])
        search = search.query(multi)

    # Filter further on taxonomy terms.
    for key, value in terms.items():
        if value is None:
            search = search.query(
                "query_string",
                query="_missing_:({key})".format(key=key)
            )
        else:
            search = search.query("match", **{key: value})

    if repo_slug is not None:
        # Filter further on repository.
        search = search.query("match", repository=repo_slug)
    if sort_by is None:
        # Always sort by ID to preserve ordering.
        search = search.sort("id")
    else:
        # Temporary workaround; the values in sorting.py should be updated,
        # but for now Haystack is still using them. Also, the hyphen is
        # required because we sort the numeric values high to low.
        if "title" not in sort_by:
            reverse = sort_by.startswith("-")
            if reverse:
                sort_by = sort_by[1:]
            if "xa" not in sort_by:
                sort_by = "xa_{0}".format(sort_by)
            if reverse:
                sort_by = "-{0}".format(sort_by)
        # Always sort by ID to preserve ordering.
        search = search.sort(sort_by, "id")

    vocab_ids = set(get_vocab_ids(repo_slug=repo_slug))
    for vocab_id in vocab_ids:
        vocab_key = make_vocab_key(vocab_id)
        search.aggs.bucket(
            "{key}_missing".format(key=vocab_key),
            "missing", field=vocab_key
        )
        search.aggs.bucket(
            "{key}_buckets".format(key=vocab_key),
            "terms", field=vocab_key
        )
    for key in ('run', 'course', 'resource_type'):
        search.aggs.bucket(
            '{key}_builtins'.format(key=key), "terms", field=key
        )

    return SearchResults(search)
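A hedged sketch of reading back the paired "missing"/"terms" aggregations
built above, assuming the pre-2.x DSL response objects (`vocab_key` mirrors
the code above):

    response = search.execute()
    aggs = response.aggregations
    missing_count = aggs['%s_missing' % vocab_key]['doc_count']
    for bucket in aggs['%s_buckets' % vocab_key]['buckets']:
        print(bucket['key'], bucket['doc_count'])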
Example #27
0
def search_index(tokens=None, repo_slug=None, sort_by=None, terms=None):
    """
    Perform a search in Elasticsearch.

    Args:
        tokens (unicode): string of one or more words
        repo_slug (unicode): repository slug
        sort_by (string): field to sort by
        terms (dict): {"vocabulary name": ["term1" [, "term2"]]}
    Returns:
        results (SearchResults)
    """
    if terms is None:
        terms = {}

    search = Search(index=INDEX_NAME, doc_type=DOC_TYPE)

    # Limit returned fields since content_xml can be huge and is unnecessary.
    search = search.fields(_get_field_names())

    if tokens is not None:
        # Search on title, description, and content_xml (minus markup).
        multi = query.MultiMatch(
            query=tokens, fields=["title", "description", "content_stripped"])
        search = search.query(multi)

    # Filter further on taxonomy terms.
    for key, value in terms.items():
        if value is None:
            search = search.query("query_string",
                                  query="_missing_:({key})".format(key=key))
        else:
            search = search.query("match", **{key: value})

    if repo_slug is not None:
        # Filter further on repository.
        search = search.query("match", repository=repo_slug)
    if sort_by is None:
        # Always sort by ID to preserve ordering.
        search = search.sort("id")
    else:
        # Temporary workaround; the values in sorting.py should be updated,
        # but for now Haystack is still using them. Also, the hyphen is
        # required because we sort the numeric values high to low.
        if "title" not in sort_by:
            reverse = sort_by.startswith("-")
            if reverse:
                sort_by = sort_by[1:]
            if "xa" not in sort_by:
                sort_by = "xa_{0}".format(sort_by)
            if reverse:
                sort_by = "-{0}".format(sort_by)
        # Always sort by ID to preserve ordering.
        search = search.sort(sort_by, "id")

    vocab_ids = set(get_vocab_ids(repo_slug=repo_slug))
    for vocab_id in vocab_ids:
        vocab_key = make_vocab_key(vocab_id)
        search.aggs.bucket("{key}_missing".format(key=vocab_key),
                           "missing",
                           field=vocab_key)
        search.aggs.bucket("{key}_buckets".format(key=vocab_key),
                           "terms",
                           field=vocab_key)
    for key in ('run', 'course', 'resource_type'):
        search.aggs.bucket('{key}_builtins'.format(key=key),
                           "terms",
                           field=key)

    return SearchResults(search)
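The sort_by rewriting above is subtle; restated as a standalone helper that
mirrors its logic exactly (illustration only):

    def _rewrite_sort(sort_by):
        """Prefix numeric sort fields with 'xa_', preserving a leading '-'."""
        if "title" in sort_by:
            return sort_by
        reverse = sort_by.startswith("-")
        if reverse:
            sort_by = sort_by[1:]
        if "xa" not in sort_by:
            sort_by = "xa_{0}".format(sort_by)
        return "-{0}".format(sort_by) if reverse else sort_by

    assert _rewrite_sort("-views") == "-xa_views"
    assert _rewrite_sort("xa_avg_grade") == "xa_avg_grade"
    assert _rewrite_sort("title") == "title"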
Example #28
0
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = None

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:

                if param.name.startswith('_'):
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]

                name = '%s.%s' % (
                    field_data['namespace'],
                    field_data['in_database_name']
                )

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                args = {}
                filter_type = 'term'
                filter_value = None
                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]
                        if not isinstance(val, basestring) or ' ' not in val:
                            filter_value = val

                        # If the term contains white spaces, we want to perform
                        # a phrase query. Thus we do nothing here and let this
                        # value be handled later.
                    else:
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator == '>':
                    # greater than
                    filter_type = 'range'
                    filter_value = {
                        'gt': param.value
                    }
                elif param.operator == '<':
                    # less than
                    filter_type = 'range'
                    filter_value = {
                        'lt': param.value
                    }
                elif param.operator == '>=':
                    # greater than or equal to
                    filter_type = 'range'
                    filter_value = {
                        'gte': param.value
                    }
                elif param.operator == '<=':
                    # less than or equal to
                    filter_type = 'range'
                    filter_value = {
                        'lte': param.value
                    }
                elif param.operator == '__null__':
                    # is null
                    filter_type = 'missing'
                    args['field'] = name

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    if param.operator_not:
                        new_filter = ~F(filter_type, **args)
                    else:
                        new_filter = F(filter_type, **args)

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif param.data_type == 'enum':
                        sub_filters |= new_filter
                    else:
                        sub_filters &= new_filter

                    continue

                # These use a wildcard and thus need to be in a query
                # instead of a filter.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '$': '%s*',  # starts with
                    '^': '*%s'  # ends with
                }
                if param.operator in operator_wildcards:
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    query_type = 'wildcard'
                    args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                elif not param.operator:
                    # This is a phrase that was passed down.
                    query_type = 'simple_query_string'
                    args['query'] = param.value[0]
                    args['fields'] = [name]
                    args['default_operator'] = 'and'

                if args:
                    query = Q(query_type, **args)
                    if param.operator_not:
                        query = ~query
                    search = search.query(query)
                else:
                    # If we reach this point, that means the operator is
                    # not supported, and we should raise an error about that.
                    raise NotImplementedError(
                        'Operator %s is not supported' % param.operator
                    )

            if filters is None:
                filters = sub_filters
            elif sub_filters is not None:
                filters &= sub_filters

        search = search.filter(filters)

        # Restricting returned fields.
        fields = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't restrict on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot return it' % value
                    )

                if not field_['is_returned']:
                    # Returning this field is not allowed.
                    raise BadArgumentError(
                        value,
                        msg='Field "%s" is not allowed to be returned' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product and descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't sort on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot sort on it' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        for param in params['_facets']:
            for value in param.value:
                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't facet on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot facet on it' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                if field_['has_full_version']:
                    # If the param has a full version, that means what matters
                    # is the full string, and not its individual terms.
                    field_name += '.full'

                search.aggs.bucket(
                    value,
                    'terms',
                    field=field_name,
                    size=self.config.facets_max_number
                )

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
                aggregations = self.format_aggregations(results.aggregations)
                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    break
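The missing-index retry loop recurs in several of these examples. Distilled
into a standalone skeleton (BAD_INDEX_REGEX is the same regex the originals
use; the shape of the loop is the point here):

    import re
    from elasticsearch import NotFoundError

    while True:
        try:
            results = search.execute()
            break  # success
        except NotFoundError as e:
            missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
            indices.remove(missing_index)
            if not indices:
                results = None  # nothing left to query
                break
            # .index() with no argument clears the index list first;
            # without that, the removed index would never be dropped.
            search = search.index().index(*indices)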
Example #29
0
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Require that the list of fields be passed.
        if not kwargs.get('_fields'):
            raise MissingArgumentError('_fields')
        self.all_fields = kwargs['_fields']

        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = []
        histogram_intervals = {}

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:
                if param.name.startswith('_'):
                    # By default, all param values are turned into lists,
                    # even when they can only ever have a single value.
                    # For those, we simply extract the value from the
                    # made-up list.
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                        if results_number > 1000:
                            raise BadArgumentError(
                                '_results_number',
                                msg=(
                                    '_results_number cannot be greater '
                                    'than 1,000'
                                )
                            )
                        if results_number < 0:
                            raise BadArgumentError(
                                '_results_number',
                                msg='_results_number cannot be negative'
                            )
                    elif param.name == '_facets_size':
                        facets_size = param.value[0]
                        # Why cap it?
                        # Because a query that covers a lot of different
                        # things can get extremely large and hog resources
                        # excessively. Downloading, for example, 100k facets
                        # (and 0 hits) when there is plenty of data yields
                        # an 11MB JSON file.
                        if facets_size > 10000:
                            raise BadArgumentError(
                                '_facets_size greater than 10,000'
                            )

                    for f in self.histogram_fields:
                        if param.name == '_histogram_interval.%s' % f:
                            histogram_intervals[f] = param.value[0]

                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]
                name = self.get_full_field_name(field_data)

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                # Operators needing wildcards, and the associated value
                # transformation with said wildcards.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '^': '%s*',  # starts with
                    '$': '*%s'  # ends with
                }
                # Operators needing ranges, and the associated Elasticsearch
                # comparison operator.
                operator_range = {
                    '>': 'gt',
                    '<': 'lt',
                    '>=': 'gte',
                    '<=': 'lte',
                }

                args = {}
                filter_type = 'term'
                filter_value = None

                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]

                        if not isinstance(val, basestring) or ' ' not in val:
                            # There's only one term and no white space, this
                            # is a simple term filter.
                            filter_value = val
                        else:
                            # If the term contains white spaces, we want to
                            # perform a phrase query.
                            filter_type = 'query'
                            args = Q(
                                'simple_query_string',
                                query=param.value[0],
                                fields=[name],
                                default_operator='and',
                            ).to_dict()
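                            # `args` is now a raw query dict; combined with
                            # filter_type == 'query' it becomes a query
                            # filter via F('query', **args) below.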
                    else:
                        # There are several terms, this is a terms filter.
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_range:
                    filter_type = 'range'
                    filter_value = {
                        operator_range[param.operator]: param.value
                    }
                elif param.operator == '__null__':
                    filter_type = 'missing'
                    args['field'] = name
                elif param.operator == '__true__':
                    filter_type = 'term'
                    filter_value = True
                elif param.operator == '@':
                    filter_type = 'regexp'
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_wildcards:
                    filter_type = 'query'

                    # Wildcard operations are better applied to a non-analyzed
                    # field (called "full") if there is one.
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    q_args = {}
                    q_args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                    query = Q('wildcard', **q_args)
                    args = query.to_dict()

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    new_filter = F(filter_type, **args)
                    if param.operator_not:
                        new_filter = ~new_filter

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif filter_type == 'range':
                        sub_filters &= new_filter
                    else:
                        sub_filters |= new_filter

                    continue

            if sub_filters is not None:
                filters.append(sub_filters)

        search = search.filter(F('bool', must=filters))

        # Restricting returned fields.
        fields = []

        # We keep track of the requested columns in order to make sure we
        # return those column names and not, for example, aliases.
        self.request_columns = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                self.request_columns.append(value)
                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product then descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        if facets_size:
            self._create_aggregations(
                params,
                search,
                facets_size,
                histogram_intervals
            )

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        errors = []

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()

                aggregations = getattr(results, 'aggregations', {})
                if aggregations:
                    aggregations = self.format_aggregations(aggregations)

                shards = getattr(results, '_shards', {})

                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                errors.append({
                    'type': 'missing_index',
                    'index': missing_index,
                })

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    shards = None
                    break
            except RequestError as exception:
                # Try to handle it gracefully if we can find out what
                # input was bad and caused the exception.
                try:
                    bad_input = ELASTICSEARCH_PARSE_EXCEPTION_REGEX.findall(
                        exception.error
                    )[-1]
                    # Loop over the original parameters to try to figure
                    # out which *key* had the bad input.
                    for key, value in kwargs.items():
                        if value == bad_input:
                            raise BadArgumentError(key)
                except IndexError:
                    # Not an ElasticsearchParseException exception
                    pass
                raise
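For clarity, the operator_range table above turns comparison operators into
Elasticsearch range filters. A minimal sketch with a hypothetical field and
value, using the same pre-2.x `F` helper:

    from elasticsearch_dsl import F

    operator_range = {'>': 'gt', '<': 'lt', '>=': 'gte', '<=': 'lte'}
    name, operator, value = 'processed_crash.uptime', '>=', 3600
    range_filter = F('range', **{name: {operator_range[operator]: value}})
    # range_filter.to_dict() -> {'range': {'processed_crash.uptime': {'gte': 3600}}}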
Example #30
0
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Require that the list of fields be passed.
        if not kwargs.get('_fields'):
            raise MissingArgumentError('_fields')
        self.all_fields = kwargs['_fields']
        self._build_fields()

        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = []
        histogram_intervals = {}

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:
                if param.name.startswith('_'):
                    # By default, all param values are turned into lists,
                    # even when they can only ever have a single value.
                    # For those, we simply extract the value from the
                    # made-up list.
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                        if results_number > 1000:
                            raise BadArgumentError('_results_number too large')
                    elif param.name == '_facets_size':
                        facets_size = param.value[0]

                    for f in self.histogram_fields:
                        if param.name == '_histogram_interval.%s' % f:
                            histogram_intervals[f] = param.value[0]

                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]

                name = '%s.%s' % (field_data['namespace'],
                                  field_data['in_database_name'])

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                # Operators needing wildcards, and the associated value
                # transformation with said wildcards.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '$': '%s*',  # starts with
                    '^': '*%s'  # ends with
                }
                # Operators needing ranges, and the associated Elasticsearch
                # comparison operator.
                operator_range = {
                    '>': 'gt',
                    '<': 'lt',
                    '>=': 'gte',
                    '<=': 'lte',
                }

                args = {}
                filter_type = 'term'
                filter_value = None

                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]

                        if not isinstance(val, basestring) or ' ' not in val:
                            # There's only one term and no white space, this
                            # is a simple term filter.
                            filter_value = val
                        else:
                            # If the term contains white spaces, we want to
                            # perform a phrase query.
                            filter_type = 'query'
                            args = Q(
                                'simple_query_string',
                                query=param.value[0],
                                fields=[name],
                                default_operator='and',
                            ).to_dict()
                    else:
                        # There are several terms, this is a terms filter.
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_range:
                    filter_type = 'range'
                    filter_value = {
                        operator_range[param.operator]: param.value
                    }
                elif param.operator == '__null__':
                    filter_type = 'missing'
                    args['field'] = name
                elif param.operator in operator_wildcards:
                    filter_type = 'query'

                    # Wildcard operations are better applied to a non-analyzed
                    # field (called "full") if there is one.
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    q_args = {}
                    q_args[name] = (operator_wildcards[param.operator] %
                                    param.value)
                    query = Q('wildcard', **q_args)
                    args = query.to_dict()

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    new_filter = F(filter_type, **args)
                    if param.operator_not:
                        new_filter = ~new_filter

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif filter_type == 'range':
                        sub_filters &= new_filter
                    else:
                        sub_filters |= new_filter

                    continue

            if sub_filters is not None:
                filters.append(sub_filters)

        search = search.filter(F('bool', must=filters))

        # Restricting returned fields.
        fields = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product and descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value, full=False)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        for param in params['_facets']:
            self._add_second_level_aggs(
                param,
                search.aggs,
                facets_size,
                histogram_intervals,
            )

        # Create sub-aggregations.
        for key in params:
            if not key.startswith('_aggs.'):
                continue

            fields = key.split('.')[1:]

            if fields[0] not in self.all_fields:
                continue

            base_bucket = self._get_fields_agg(fields[0], facets_size)
            sub_bucket = base_bucket

            for field in fields[1:]:
                # For each field, make a bucket, then include that bucket in
                # the latest one, and then make that new bucket the latest.
                if field in self.all_fields:
                    tmp_bucket = self._get_fields_agg(field, facets_size)
                    sub_bucket.bucket(field, tmp_bucket)
                    sub_bucket = tmp_bucket

            for value in params[key]:
                self._add_second_level_aggs(
                    value,
                    sub_bucket,
                    facets_size,
                    histogram_intervals,
                )

            search.aggs.bucket(fields[0], base_bucket)

        # Create histograms.
        for f in self.histogram_fields:
            key = '_histogram.%s' % f
            if params.get(key):
                histogram_bucket = self._get_histogram_agg(
                    f, histogram_intervals)

                for param in params[key]:
                    self._add_second_level_aggs(
                        param,
                        histogram_bucket,
                        facets_size,
                        histogram_intervals,
                    )

                search.aggs.bucket('histogram_%s' % f, histogram_bucket)

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
                aggregations = self.format_aggregations(results.aggregations)
                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    break
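The sub-aggregation handling above chains `bucket()` calls so that each
field's terms aggregation is nested inside the previous one. A hedged sketch
with hypothetical field names, matching the document's own `A`/`bucket` usage:

    from elasticsearch_dsl import A

    base_bucket = A('terms', field='processed_crash.product', size=100)
    sub_bucket = A('terms', field='processed_crash.version', size=100)
    base_bucket.bucket('version', sub_bucket)   # nest version under product
    search.aggs.bucket('product', base_bucket)  # attach the chain to the search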
Example #31
0
def text_similarity_clustering(index, doc_type, searchsize, cutv, **kwargs):

    _build_default(kwargs)
    es = Elasticsearch()

    # --- retrieve the document _ids
    from elasticsearch_dsl import Search
    s = Search(using=es, index=index, doc_type=doc_type)
    # pass an empty list to `fields` so that only document ids come back
    s = s.fields([])
    ids = [h.meta.id for h in s.scan()]

    n = s.count()

    # clamp searchsize to the range [1, 10]
    searchsize = max(1, min(searchsize, 10))

    # clamp cutv so that it is never negative
    cutv = max(cutv, 0)
    # store the sparse adjacency matrix entries as COO triplets

    I = []
    J = []
    V = []
    RV = []
    for i in ids:
        rv = es.mlt(index, doc_type, id=i, **kwargs)
        results = rv['hits']['hits']
        if results:
            # Guard against getting fewer hits back than searchsize.
            for loop in xrange(min(searchsize, len(results))):
                j = int(results[loop]['_id'])
                score = results[loop]['_score']
                if score >= cutv:
                    # NOTE: assumes document _ids are integers in [0, n).
                    I.append(ids.index(i))
                    J.append(j)
                    V.append(score)
                    RV.append(score)

    # construct the adjacency matrices from the sparse COO triplets
    A = coo_matrix((V, (I, J)), shape=(n, n))
    RA = coo_matrix((RV, (I, J)), shape=(n, n))

    # construct an undirected weighted graph from the adjacency matrix
    G = nx.from_scipy_sparse_matrix(A)

    # obtain the degree values for all the nodes; G.degree() returns a
    # dict keyed by node id, e.g. {0: 0, 1: 1, 2: 1, 3: 2, 4: 3, ...}

    D = G.degree().values()

    # partition the graph by modularity; community.best_partition()
    # returns a dict mapping document id -> partition id,
    # e.g. {0: 0, 1: 1, 2: 1, 3: 2, 4: 3, ...}

    partition = community.best_partition(G)
    RAC = RA.tocsr()

    return get_map_document(partition, ids, D, RAC)
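
A hypothetical invocation of the function above (the index, doc_type and mlt_fields values are invented; extra keyword arguments are forwarded to es.mlt):

doc_map = text_similarity_clustering(
    index='articles',      # hypothetical index name
    doc_type='article',    # hypothetical mapping type
    searchsize=5,          # keep at most 5 similar documents per node
    cutv=0.2,              # drop similarity edges scoring below 0.2
    mlt_fields=['body'],   # hypothetical, forwarded to es.mlt()
)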
Example #32
0
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Require that the list of fields be passed.
        if not kwargs.get('_fields'):
            raise MissingArgumentError('_fields')
        self.all_fields = kwargs['_fields']

        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = []
        histogram_intervals = {}

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:
                if param.name.startswith('_'):
                    # All param values are turned into lists by default,
                    # even for parameters that can only ever hold a single
                    # value; for those we simply extract the value from
                    # the made-up list.
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                        if results_number > 1000:
                            raise BadArgumentError(
                                '_results_number',
                                msg=('_results_number cannot be greater '
                                     'than 1,000'))
                        if results_number < 0:
                            raise BadArgumentError(
                                '_results_number',
                                msg='_results_number cannot be negative')
                    elif param.name == '_facets_size':
                        facets_size = param.value[0]
                        # Why cap it? Because a query covering many
                        # different things can become extremely large and
                        # hog resources excessively. For example, fetching
                        # 100k facets (and 0 hits) over a large dataset
                        # yields an 11 MB JSON file.
                        if facets_size > 10000:
                            raise BadArgumentError(
                                '_facets_size greater than 10,000')

                    for f in self.histogram_fields:
                        if param.name == '_histogram_interval.%s' % f:
                            histogram_intervals[f] = param.value[0]

                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]
                name = self.get_full_field_name(field_data)

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                # Operators needing wildcards, and the associated value
                # transformation with said wildcards.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '^': '%s*',  # starts with
                    '$': '*%s'  # ends with
                }
                # Operators needing ranges, and the associated Elasticsearch
                # comparison operator.
                operator_range = {
                    '>': 'gt',
                    '<': 'lt',
                    '>=': 'gte',
                    '<=': 'lte',
                }
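                # For example (hypothetical values): operator '~' with the
                # value 'foo' yields the wildcard pattern '*foo*', and
                # operator '>=' with the value 5 yields {'gte': 5}.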

                args = {}
                filter_type = 'term'
                filter_value = None

                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]

                        if not isinstance(val, basestring) or ' ' not in val:
                            # There's only one term and no whitespace, so
                            # this is a simple term filter.
                            filter_value = val
                        else:
                            # If the term contains whitespace, we want to
                            # perform a phrase query.
                            filter_type = 'query'
                            args = Q(
                                'simple_query_string',
                                query=param.value[0],
                                fields=[name],
                                default_operator='and',
                            ).to_dict()
                    else:
                        # There are several terms, this is a terms filter.
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_range:
                    filter_type = 'range'
                    filter_value = {
                        operator_range[param.operator]: param.value
                    }
                elif param.operator == '__null__':
                    filter_type = 'missing'
                    args['field'] = name
                elif param.operator == '__true__':
                    filter_type = 'term'
                    filter_value = True
                elif param.operator == '@':
                    filter_type = 'regexp'
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_wildcards:
                    filter_type = 'query'

                    # Wildcard operations are better applied to a non-analyzed
                    # field (called "full") if there is one.
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    q_args = {}
                    q_args[name] = (operator_wildcards[param.operator] %
                                    param.value)
                    query = Q('wildcard', **q_args)
                    args = query.to_dict()

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    new_filter = F(filter_type, **args)
                    if param.operator_not:
                        new_filter = ~new_filter

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif filter_type == 'range':
                        sub_filters &= new_filter
                    else:
                        sub_filters |= new_filter

                    continue

            if sub_filters is not None:
                filters.append(sub_filters)

        search = search.filter(F('bool', must=filters))
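        # Note: sub-filters for a single field were OR'd together above
        # (ranges AND'd); the per-field filters are combined here with AND.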

        # Restricting returned fields.
        fields = []

        # We keep track of the requested columns in order to make sure we
        # return those exact column names and not, for example, aliases.
        self.request_columns = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                self.request_columns.append(value)
                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product then descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]
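        # e.g. _results_offset=100 with _results_number=50 (hypothetical
        # values) slices the search to return hits 100 through 149.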

        # Create facets.
        if facets_size:
            self._create_aggregations(params, search, facets_size,
                                      histogram_intervals)

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        errors = []

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()

                aggregations = getattr(results, 'aggregations', {})
                if aggregations:
                    aggregations = self.format_aggregations(aggregations)

                shards = getattr(results, '_shards', {})

                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait, what? An error caused by an index that was not
                    # in the request? That should never happen, but if it
                    # does, we want to know about it.
                    raise

                errors.append({
                    'type': 'missing_index',
                    'index': missing_index,
                })

                if indices:
                    # Update the list of indices and try again.
                    # Note: we must first clear the search's index list
                    # before setting the new one, otherwise the removed
                    # indices are never actually dropped.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    shards = None
                    break
            except RequestError as exception:
                # Try to handle it gracefully if we can find out what
                # input was bad and caused the exception.
                try:
                    bad_input = ELASTICSEARCH_PARSE_EXCEPTION_REGEX.findall(
                        exception.error)[-1]
                    # Loop over the original parameters to try to figure
                    # out which *key* had the bad input.
                    for key, value in kwargs.items():
                        if value == bad_input:
                            raise BadArgumentError(key)
                except IndexError:
                    # Not an Elasticsearch parse exception after all.
                    pass
                raise
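
A minimal standalone sketch of how the operator tables above turn a single (field, operator, value) triple into a filter; F and Q come from the pre-1.0 elasticsearch-dsl API this example targets, and the field names in the usage comment are hypothetical:

from elasticsearch_dsl import F, Q

OPERATOR_WILDCARDS = {'~': '*%s*', '^': '%s*', '$': '*%s'}
OPERATOR_RANGE = {'>': 'gt', '<': 'lt', '>=': 'gte', '<=': 'lte'}

def build_filter(name, operator, value):
    """Build an F filter for one (field, operator, value) triple."""
    if operator in OPERATOR_WILDCARDS:
        # Wildcard operators become a query filter wrapping a wildcard
        # query, mirroring the filter_type = 'query' branch above.
        pattern = OPERATOR_WILDCARDS[operator] % value
        return F('query', **Q('wildcard', **{name: pattern}).to_dict())
    if operator in OPERATOR_RANGE:
        return F('range', **{name: {OPERATOR_RANGE[operator]: value}})
    return F('term', **{name: value})

# e.g. build_filter('product', '^', 'Fire') filters on the wildcard
# 'Fire*', and build_filter('uptime', '>=', 60) on {'gte': 60}.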