Example #1
class SolrSearchBackend(BaseSearchBackend):
    # Words reserved by Solr for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':', '/',
    )

    def __init__(self, connection_alias, **connection_options):
        super(SolrSearchBackend, self).__init__(connection_alias, **connection_options)

        if 'URL' not in connection_options:
            raise ImproperlyConfigured("You must specify a 'URL' in your settings for connection '%s'." % connection_alias)

        self.conn = Solr(connection_options['URL'], timeout=self.timeout, **connection_options.get('KWARGS', {}))
        self.log = logging.getLogger('haystack')

    def get_schema_admin(self):
        '''
        Lazily create and return the SolrSchemaAdmin singleton for this
        backend's connection.
        '''
        if not hasattr(self, '_schema_admin'):
            self._schema_admin = SolrSchemaAdmin(self.conn.url, self.conn.session)
        return self._schema_admin

    def update(self, index, iterable, commit=True):
        docs = []

        for obj in iterable:
            try:
                docs.append(index.full_prepare(obj))
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
            except UnicodeDecodeError:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(u"UnicodeDecodeError while preparing object for update", exc_info=True, extra={
                    "data": {
                        "index": index,
                        "object": get_identifier(obj)
                    }
                })

        if len(docs) > 0:
            try:
                self.conn.add(docs, commit=commit, boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s", e)

    def remove(self, obj_or_string, commit=True):
        solr_id = get_identifier(obj_or_string)

        try:
            kwargs = {
                'commit': commit,
                'id': solr_id
            }
            self.conn.delete(**kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Solr: %s", solr_id, e)

    def clear(self, models=None, commit=True):
        try:
            if not models:
                # *:* matches all docs in Solr
                self.conn.delete(q='*:*', commit=commit)
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append("%s:%s" % (DJANGO_CT, get_model_ct(model)))

                self.conn.delete(q=" OR ".join(models_to_delete), commit=commit)

            if commit:
                # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
                self.conn.optimize()
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            if models:
                self.log.error("Failed to clear Solr index of models '%s': %s", ','.join(models_to_delete), e)
            else:
                self.log.error("Failed to clear Solr index: %s", e)

    @log_query
    def search(self, query_string, **kwargs):
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        search_kwargs = self.build_search_kwargs(query_string, **kwargs)

        try:
            raw_results = self.conn.search(query_string, **search_kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to query Solr using '%s': %s", query_string, e)
            raw_results = EmptyResults()

        return self._process_results(raw_results, highlight=kwargs.get('highlight'), result_class=kwargs.get('result_class', SearchResult), distance_point=kwargs.get('distance_point'))

    def build_search_kwargs(self, query_string, sort_by=None, start_offset=0, end_offset=None,
                            fields='', highlight=False, facets=None,
                            date_facets=None, query_facets=None,
                            narrow_queries=None, spelling_query=None,
                            within=None, dwithin=None, distance_point=None,
                            models=None, limit_to_registered_models=None,
                            result_class=None, stats=None):
        kwargs = {'fl': '* score'}

        if fields:
            if isinstance(fields, (list, set)):
                fields = " ".join(fields)

            kwargs['fl'] = fields

        if sort_by is not None:
            if sort_by in ['distance asc', 'distance desc'] and distance_point:
                # Do the geo-enabled sort.
                lng, lat = distance_point['point'].get_coords()
                kwargs['sfield'] = distance_point['field']
                kwargs['pt'] = '%s,%s' % (lat, lng)

                if sort_by == 'distance asc':
                    kwargs['sort'] = 'geodist() asc'
                else:
                    kwargs['sort'] = 'geodist() desc'
            else:
                if sort_by.startswith('distance '):
                    warnings.warn("In order to sort by distance, you must call the '.distance(...)' method.")

                # Regular sorting.
                kwargs['sort'] = sort_by

        if start_offset is not None:
            kwargs['start'] = start_offset

        if end_offset is not None:
            kwargs['rows'] = end_offset - start_offset

        if highlight is True:
            kwargs['hl'] = 'true'
            kwargs['hl.fragsize'] = '200'

        if self.include_spelling is True:
            kwargs['spellcheck'] = 'true'
            kwargs['spellcheck.collate'] = 'true'
            kwargs['spellcheck.count'] = 1

            if spelling_query:
                kwargs['spellcheck.q'] = spelling_query

        if facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.field'] = facets.keys()

            for facet_field, options in facets.items():
                for key, value in options.items():
                    kwargs['f.%s.facet.%s' % (facet_field, key)] = self.conn._from_python(value)

        if date_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.date'] = date_facets.keys()
            kwargs['facet.date.other'] = 'none'

            for key, value in date_facets.items():
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(value.get('start_date'))
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(value.get('end_date'))
                gap_by_string = value.get('gap_by').upper()
                gap_string = "%d%s" % (value.get('gap_amount'), gap_by_string)

                if value.get('gap_amount') != 1:
                    gap_string += "S"

                kwargs["f.%s.facet.date.gap" % key] = '+%s/%s' % (gap_string, gap_by_string)

        if query_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.query'] = ["%s:%s" % (field, value) for field, value in query_facets]

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices)))
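            # e.g. 'django_ct:(auth.user OR blog.entry)' for assumed models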

        if narrow_queries is not None:
            kwargs['fq'] = list(narrow_queries)

        if stats:
            kwargs['stats'] = "true"

            for k in stats.keys():
                kwargs['stats.field'] = k
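                # NOTE: each pass overwrites 'stats.field', so only the last
                # requested stats field actually reaches Solr.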

                for facet in stats[k]:
                    kwargs['f.%s.stats.facet' % k] = facet

        if within is not None:
            from haystack.utils.geo import generate_bounding_box

            kwargs.setdefault('fq', [])
            ((min_lat, min_lng), (max_lat, max_lng)) = generate_bounding_box(within['point_1'], within['point_2'])
            # Bounding boxes are min, min TO max, max. Solr's wiki was *NOT*
            # very clear on this.
            bbox = '%s:[%s,%s TO %s,%s]' % (within['field'], min_lat, min_lng, max_lat, max_lng)
            kwargs['fq'].append(bbox)
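            # e.g. 'location:[38.9,-77.1 TO 39.0,-77.0]' for an assumed
            # 'location' field and illustrative corner coordinates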

        if dwithin is not None:
            kwargs.setdefault('fq', [])
            lng, lat = dwithin['point'].get_coords()
            geofilt = '{!geofilt pt=%s,%s sfield=%s d=%s}' % (lat, lng, dwithin['field'], dwithin['distance'].km)
            kwargs['fq'].append(geofilt)
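            # e.g. '{!geofilt pt=38.9,-77.0 sfield=location d=5.0}' for an
            # assumed 5 km radius around (38.9, -77.0)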

        # Check to see if the backend should try to include distances
        # (Solr 4.X+) in the results.
        if self.distance_available and distance_point:
            # In early testing, you can't just hand Solr 4.X a proper bounding box
            # & request distances. To enable native distance would take calculating
            # a center point & a radius off the user-provided box, which kinda
            # sucks. We'll avoid it for now, since Solr 4.x's release will be some
            # time yet.
            # kwargs['fl'] += ' _dist_:geodist()'
            pass

        return kwargs

    def more_like_this(self, model_instance, additional_query_string=None,
                       start_offset=0, end_offset=None, models=None,
                       limit_to_registered_models=None, result_class=None, **kwargs):
        from haystack import connections

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        index = connections[self.connection_alias].get_unified_index().get_index(model_klass)
        field_name = index.get_content_field()
        params = {
            'fl': '*,score',
        }

        if start_offset is not None:
            params['start'] = start_offset

        if end_offset is not None:
            params['rows'] = end_offset

        narrow_queries = set()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add('%s:(%s)' % (DJANGO_CT, ' OR '.join(model_choices)))

        if additional_query_string:
            narrow_queries.add(additional_query_string)

        if narrow_queries:
            params['fq'] = list(narrow_queries)

        query = "%s:%s" % (ID, get_identifier(model_instance))

        try:
            raw_results = self.conn.more_like_this(query, field_name, **params)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to fetch More Like This from Solr for document '%s': %s", query, e)
            raw_results = EmptyResults()

        return self._process_results(raw_results, result_class=result_class)

    def _process_results(self, raw_results, highlight=False, result_class=None, distance_point=None):
        from haystack import connections
        results = []
        hits = raw_results.hits
        facets = {}
        stats = {}
        spelling_suggestion = None

        if result_class is None:
            result_class = SearchResult

        if hasattr(raw_results, 'stats'):
            stats = raw_results.stats.get('stats_fields', {})

        if hasattr(raw_results, 'facets'):
            facets = {
                'fields': raw_results.facets.get('facet_fields', {}),
                'dates': raw_results.facets.get('facet_dates', {}),
                'queries': raw_results.facets.get('facet_queries', {}),
            }

            for key in ['fields']:
                for facet_field in facets[key]:
                    # Convert to a two-tuple, as Solr's json format returns a list of
                    # pairs.
                    facets[key][facet_field] = list(zip(facets[key][facet_field][::2], facets[key][facet_field][1::2]))

        if self.include_spelling is True:
            if hasattr(raw_results, 'spellcheck'):
                if len(raw_results.spellcheck.get('suggestions', [])):
                    # For some reason, it's an array of pairs. Pull off the
                    # collated result from the end.
                    spelling_suggestion = raw_results.spellcheck.get('suggestions')[-1]

        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for raw_result in raw_results.docs:
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                index = unified_index.get_index(model)
                index_field_map = index.field_map
                for key, value in raw_result.items():
                    string_key = str(key)
                    # re-map key if alternate name used
                    if string_key in index_field_map:
                        string_key = index_field_map[key]

                    if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = self.conn._to_python(value)

                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]
                del additional_fields['score']

                if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
                    additional_fields['highlighted'] = raw_results.highlighting[raw_result[ID]]

                if distance_point:
                    additional_fields['_point_of_origin'] = distance_point

                    if raw_result.get('__dist__'):
                        from haystack.utils.geo import Distance
                        additional_fields['_distance'] = Distance(km=float(raw_result['__dist__']))
                    else:
                        additional_fields['_distance'] = None

                result = result_class(app_label, model_name, raw_result[DJANGO_ID], raw_result['score'], **additional_fields)
                results.append(result)
            else:
                hits -= 1

        return {
            'results': results,
            'hits': hits,
            'stats': stats,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def build_schema(self, fields):
        content_field_name = ''
        schema_fields = []

        for field_name, field_class in fields.items():
            field_data = {
                'name': field_class.index_fieldname,
                'type': 'text_en',
                'indexed': 'true',
                'stored': 'true',
                'multiValued': 'false',
            }

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

            # DRL_FIXME: Perhaps move to something where, if none of these
            #            checks succeed, call a custom method on the form that
            #            returns, per-backend, the right type of storage?
            if field_class.field_type in ['date', 'datetime']:
                field_data['type'] = 'date'
            elif field_class.field_type == 'integer':
                field_data['type'] = 'long'
            elif field_class.field_type == 'float':
                field_data['type'] = 'float'
            elif field_class.field_type == 'boolean':
                field_data['type'] = 'boolean'
            elif field_class.field_type == 'ngram':
                field_data['type'] = 'ngram'
            elif field_class.field_type == 'edge_ngram':
                field_data['type'] = 'edge_ngram'
            elif field_class.field_type == 'location':
                field_data['type'] = 'location'

            if field_class.is_multivalued:
                field_data['multiValued'] = 'true'

            if field_class.stored is False:
                field_data['stored'] = 'false'

            # Do this last to override `text` fields.
            if field_class.indexed is False:
                field_data['indexed'] = 'false'

                # If it's text and not being indexed, we probably don't want
                # to do the normal lowercase/tokenize/stemming/etc. dance.
                if field_data['type'] == 'text_en':
                    field_data['type'] = 'string'

            # If it's a ``FacetField``, make sure we don't postprocess it.
            if hasattr(field_class, 'facet_for'):
                # If it's text, it ought to be a string.
                if field_data['type'] == 'text_en':
                    field_data['type'] = 'string'

            schema_fields.append(field_data)

        return (content_field_name, schema_fields)

    def extract_file_contents(self, file_obj):
        """Extract text and metadata from a structured file (PDF, MS Word, etc.)

        Uses the Solr ExtractingRequestHandler, which is based on Apache Tika.
        See the Solr wiki for details:

            http://wiki.apache.org/solr/ExtractingRequestHandler

        Due to the way the ExtractingRequestHandler is implemented it completely
        replaces the normal Haystack indexing process with several unfortunate
        restrictions: only one file per request, the extracted data is added to
        the index with no ability to modify it, etc. To simplify the process and
        allow for more advanced use we'll run using the extract-only mode to
        return the extracted data without adding it to the index so we can then
        use it within Haystack's normal templating process.

        Returns None if metadata cannot be extracted; otherwise returns a
        dictionary containing at least two keys:

            :contents:
                        Extracted full-text content, if applicable
            :metadata:
                        key:value pairs of text strings
        """

        try:
            return self.conn.extract(file_obj)
        except Exception as e:
            self.log.warning(u"Unable to extract file contents: %s", e,
                             exc_info=True, extra={"data": {"file": file_obj}})
            return None
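
A minimal usage sketch for the backend above. The connection alias, core URL,
and query below are illustrative assumptions, not part of the snippet:

from haystack.backends.solr_backend import SolrSearchBackend

# 'default' and the core URL are assumed values; URL is the only required
# connection option.
backend = SolrSearchBackend('default', URL='http://127.0.0.1:8983/solr/mycore')
response = backend.search('django')
print(response['hits'], [r.pk for r in response['results']])

In a real project the backend is usually constructed indirectly via the
HAYSTACK_CONNECTIONS setting (with ENGINE pointing at
'haystack.backends.solr_backend.SolrEngine') rather than instantiated by hand.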
Example #2
class SolrSearchBackend(BaseSearchBackend):
    # Words reserved by Solr for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\',
        '+',
        '-',
        '&&',
        '||',
        '!',
        '(',
        ')',
        '{',
        '}',
        '[',
        ']',
        '^',
        '"',
        '~',
        '*',
        '?',
        ':',
        '/',
    )

    def __init__(self, connection_alias, **connection_options):
        super(SolrSearchBackend, self).__init__(connection_alias,
                                                **connection_options)

        if 'URL' not in connection_options:
            raise ImproperlyConfigured(
                "You must specify a 'URL' in your settings for connection '%s'."
                % connection_alias)

        self.conn = Solr(connection_options['URL'],
                         timeout=self.timeout,
                         **connection_options.get('KWARGS', {}))
        self.log = logging.getLogger('haystack')

    def update(self, index, iterable, commit=True):
        docs = []

        for obj in iterable:
            try:
                docs.append(index.full_prepare(obj))
            except SkipDocument:
                self.log.debug(u"Indexing for object `%s` skipped", obj)
            except UnicodeDecodeError:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(
                    u"UnicodeDecodeError while preparing object for update",
                    exc_info=True,
                    extra={
                        "data": {
                            "index": index,
                            "object": get_identifier(obj)
                        }
                    })

        if len(docs) > 0:
            try:
                self.conn.add(docs,
                              commit=commit,
                              boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s",
                               e,
                               exc_info=True)

    def remove(self, obj_or_string, commit=True):
        solr_id = get_identifier(obj_or_string)

        try:
            kwargs = {'commit': commit, 'id': solr_id}
            self.conn.delete(**kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to remove document '%s' from Solr: %s",
                           solr_id,
                           e,
                           exc_info=True)

    def clear(self, models=None, commit=True):
        if models is not None:
            assert isinstance(models, (list, tuple))

        try:
            if models is None:
                # *:* matches all docs in Solr
                self.conn.delete(q='*:*', commit=commit)
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append("%s:%s" %
                                            (DJANGO_CT, get_model_ct(model)))

                self.conn.delete(q=" OR ".join(models_to_delete),
                                 commit=commit)

            if commit:
                # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
                self.conn.optimize()
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            if models is not None:
                self.log.error("Failed to clear Solr index of models '%s': %s",
                               ','.join(models_to_delete),
                               e,
                               exc_info=True)
            else:
                self.log.error("Failed to clear Solr index: %s",
                               e,
                               exc_info=True)

    @log_query
    def search(self, query_string, **kwargs):
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        search_kwargs = self.build_search_kwargs(query_string, **kwargs)

        try:
            raw_results = self.conn.search(query_string, **search_kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to query Solr using '%s': %s",
                           query_string,
                           e,
                           exc_info=True)
            raw_results = EmptyResults()

        return self._process_results(
            raw_results,
            highlight=kwargs.get('highlight'),
            result_class=kwargs.get('result_class', SearchResult),
            distance_point=kwargs.get('distance_point'))

    def build_search_kwargs(self,
                            query_string,
                            sort_by=None,
                            start_offset=0,
                            end_offset=None,
                            fields='',
                            highlight=False,
                            facets=None,
                            date_facets=None,
                            query_facets=None,
                            narrow_queries=None,
                            spelling_query=None,
                            within=None,
                            dwithin=None,
                            distance_point=None,
                            models=None,
                            limit_to_registered_models=None,
                            result_class=None,
                            stats=None,
                            **extra_kwargs):
        kwargs = {'fl': '* score'}

        if fields:
            if isinstance(fields, (list, set)):
                fields = " ".join(fields)

            kwargs['fl'] = fields

        if sort_by is not None:
            if sort_by in ['distance asc', 'distance desc'] and distance_point:
                # Do the geo-enabled sort.
                lng, lat = distance_point['point'].get_coords()
                kwargs['sfield'] = distance_point['field']
                kwargs['pt'] = '%s,%s' % (lat, lng)

                if sort_by == 'distance asc':
                    kwargs['sort'] = 'geodist() asc'
                else:
                    kwargs['sort'] = 'geodist() desc'
            else:
                if sort_by.startswith('distance '):
                    warnings.warn(
                        "In order to sort by distance, you must call the '.distance(...)' method."
                    )

                # Regular sorting.
                kwargs['sort'] = sort_by

        if start_offset is not None:
            kwargs['start'] = start_offset

        if end_offset is not None:
            kwargs['rows'] = end_offset - start_offset

        if highlight:
            # `highlight` can either be True or a dictionary containing custom parameters
            # which will be passed to the backend and may override our default settings:

            kwargs['hl'] = 'true'
            kwargs['hl.fragsize'] = '200'

            if isinstance(highlight, dict):
                kwargs.update(highlight)
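                # e.g. (assumed) highlight={'hl.snippets': 3} merges that
                # parameter into the Solr request verbatim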

        if self.include_spelling is True:
            kwargs['spellcheck'] = 'true'
            kwargs['spellcheck.collate'] = 'true'
            kwargs['spellcheck.count'] = 1

            if spelling_query:
                kwargs['spellcheck.q'] = spelling_query

        if facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.field'] = facets.keys()

            for facet_field, options in facets.items():
                for key, value in options.items():
                    kwargs['f.%s.facet.%s' %
                           (facet_field, key)] = self.conn._from_python(value)

        if date_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.date'] = date_facets.keys()
            kwargs['facet.date.other'] = 'none'

            for key, value in date_facets.items():
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(
                    value.get('start_date'))
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(
                    value.get('end_date'))
                gap_by_string = value.get('gap_by').upper()
                gap_string = "%d%s" % (value.get('gap_amount'), gap_by_string)

                if value.get('gap_amount') != 1:
                    gap_string += "S"

                kwargs["f.%s.facet.date.gap" %
                       key] = '+%s/%s' % (gap_string, gap_by_string)

        if query_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.query'] = [
                "%s:%s" % (field, value) for field, value in query_facets
            ]

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add('%s:(%s)' %
                               (DJANGO_CT, ' OR '.join(model_choices)))

        if narrow_queries is not None:
            kwargs['fq'] = list(narrow_queries)

        if stats:
            kwargs['stats'] = "true"

            for k in stats.keys():
                kwargs['stats.field'] = k

                for facet in stats[k]:
                    kwargs['f.%s.stats.facet' % k] = facet

        if within is not None:
            from haystack.utils.geo import generate_bounding_box

            kwargs.setdefault('fq', [])
            ((min_lat, min_lng), (max_lat, max_lng)) = generate_bounding_box(
                within['point_1'], within['point_2'])
            # Bounding boxes are min, min TO max, max. Solr's wiki was *NOT*
            # very clear on this.
            bbox = '%s:[%s,%s TO %s,%s]' % (within['field'], min_lat, min_lng,
                                            max_lat, max_lng)
            kwargs['fq'].append(bbox)

        if dwithin is not None:
            kwargs.setdefault('fq', [])
            lng, lat = dwithin['point'].get_coords()
            geofilt = '{!geofilt pt=%s,%s sfield=%s d=%s}' % (
                lat, lng, dwithin['field'], dwithin['distance'].km)
            kwargs['fq'].append(geofilt)

        # Check to see if the backend should try to include distances
        # (Solr 4.X+) in the results.
        if self.distance_available and distance_point:
            # In early testing, you can't just hand Solr 4.X a proper bounding box
            # & request distances. To enable native distance would take calculating
            # a center point & a radius off the user-provided box, which kinda
            # sucks. We'll avoid it for now, since Solr 4.x's release will be some
            # time yet.
            # kwargs['fl'] += ' _dist_:geodist()'
            pass

        if extra_kwargs:
            kwargs.update(extra_kwargs)

        return kwargs

    def more_like_this(self,
                       model_instance,
                       additional_query_string=None,
                       start_offset=0,
                       end_offset=None,
                       models=None,
                       limit_to_registered_models=None,
                       result_class=None,
                       **kwargs):
        from haystack import connections

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        index = connections[
            self.connection_alias].get_unified_index().get_index(model_klass)
        field_name = index.get_content_field()
        params = {
            'fl': '*,score',
        }

        if start_offset is not None:
            params['start'] = start_offset

        if end_offset is not None:
            params['rows'] = end_offset

        narrow_queries = set()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, 'HAYSTACK_LIMIT_TO_REGISTERED_MODELS', True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add('%s:(%s)' %
                               (DJANGO_CT, ' OR '.join(model_choices)))

        if additional_query_string:
            narrow_queries.add(additional_query_string)

        if narrow_queries:
            params['fq'] = list(narrow_queries)

        query = "%s:%s" % (ID, get_identifier(model_instance))

        try:
            raw_results = self.conn.more_like_this(query, field_name, **params)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to fetch More Like This from Solr for document '%s': %s",
                query,
                e,
                exc_info=True)
            raw_results = EmptyResults()

        return self._process_results(raw_results, result_class=result_class)

    def _process_results(self,
                         raw_results,
                         highlight=False,
                         result_class=None,
                         distance_point=None):
        from haystack import connections
        results = []
        hits = raw_results.hits
        facets = {}
        stats = {}
        spelling_suggestion = None

        if result_class is None:
            result_class = SearchResult

        if hasattr(raw_results, 'stats'):
            stats = raw_results.stats.get('stats_fields', {})

        if hasattr(raw_results, 'facets'):
            facets = {
                'fields': raw_results.facets.get('facet_fields', {}),
                'dates': raw_results.facets.get('facet_dates', {}),
                'queries': raw_results.facets.get('facet_queries', {}),
            }

            for key in ['fields']:
                for facet_field in facets[key]:
                    # Convert to a two-tuple, as Solr's json format returns a list of
                    # pairs.
                    facets[key][facet_field] = list(
                        zip(facets[key][facet_field][::2],
                            facets[key][facet_field][1::2]))

        if self.include_spelling and hasattr(raw_results, 'spellcheck'):
            # Solr 5+ changed the JSON response format so the suggestions will be key-value mapped rather
            # than simply paired elements in a list, which is a nice improvement but incompatible with
            # Solr 4: https://issues.apache.org/jira/browse/SOLR-3029
            if len(raw_results.spellcheck.get('collations', [])):
                spelling_suggestion = raw_results.spellcheck['collations'][-1]
            elif len(raw_results.spellcheck.get('suggestions', [])):
                spelling_suggestion = raw_results.spellcheck['suggestions'][-1]

            assert spelling_suggestion is None or isinstance(
                spelling_suggestion, six.string_types)
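            # e.g. Solr 4 ends 'suggestions' with the collated query string,
            # hence taking the last element; Solr 5+ exposes it under
            # 'collations' instead.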

        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for raw_result in raw_results.docs:
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                index = unified_index.get_index(model)
                index_field_map = index.field_map
                for key, value in raw_result.items():
                    string_key = str(key)
                    # re-map key if alternate name used
                    if string_key in index_field_map:
                        string_key = index_field_map[key]

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], 'convert'):
                        additional_fields[string_key] = index.fields[
                            string_key].convert(value)
                    else:
                        additional_fields[string_key] = self.conn._to_python(
                            value)

                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]
                del additional_fields['score']

                if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
                    additional_fields[
                        'highlighted'] = raw_results.highlighting[
                            raw_result[ID]]

                if distance_point:
                    additional_fields['_point_of_origin'] = distance_point

                    if raw_result.get('__dist__'):
                        from haystack.utils.geo import Distance
                        additional_fields['_distance'] = Distance(
                            km=float(raw_result['__dist__']))
                    else:
                        additional_fields['_distance'] = None

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID],
                                      raw_result['score'], **additional_fields)
                results.append(result)
            else:
                hits -= 1

        return {
            'results': results,
            'hits': hits,
            'stats': stats,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }

    def build_schema(self, fields):
        content_field_name = ''
        schema_fields = []

        for field_name, field_class in fields.items():
            field_data = {
                'field_name': field_class.index_fieldname,
                'type': 'text_en',
                'indexed': 'true',
                'stored': 'true',
                'multi_valued': 'false',
            }

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

            # DRL_FIXME: Perhaps move to something where, if none of these
            #            checks succeed, call a custom method on the form that
            #            returns, per-backend, the right type of storage?
            if field_class.field_type in ['date', 'datetime']:
                field_data['type'] = 'date'
            elif field_class.field_type == 'integer':
                field_data['type'] = 'long'
            elif field_class.field_type == 'float':
                field_data['type'] = 'float'
            elif field_class.field_type == 'boolean':
                field_data['type'] = 'boolean'
            elif field_class.field_type == 'ngram':
                field_data['type'] = 'ngram'
            elif field_class.field_type == 'edge_ngram':
                field_data['type'] = 'edge_ngram'
            elif field_class.field_type == 'location':
                field_data['type'] = 'location'

            if field_class.is_multivalued:
                field_data['multi_valued'] = 'true'

            if field_class.stored is False:
                field_data['stored'] = 'false'

            # Do this last to override `text` fields.
            if field_class.indexed is False:
                field_data['indexed'] = 'false'

                # If it's text and not being indexed, we probably don't want
                # to do the normal lowercase/tokenize/stemming/etc. dance.
                if field_data['type'] == 'text_en':
                    field_data['type'] = 'string'

            # If it's a ``FacetField``, make sure we don't postprocess it.
            if hasattr(field_class, 'facet_for'):
                # If it's text, it ought to be a string.
                if field_data['type'] == 'text_en':
                    field_data['type'] = 'string'

            schema_fields.append(field_data)

        return (content_field_name, schema_fields)

    def extract_file_contents(self, file_obj):
        """Extract text and metadata from a structured file (PDF, MS Word, etc.)

        Uses the Solr ExtractingRequestHandler, which is based on Apache Tika.
        See the Solr wiki for details:

            http://wiki.apache.org/solr/ExtractingRequestHandler

        Due to the way the ExtractingRequestHandler is implemented it completely
        replaces the normal Haystack indexing process with several unfortunate
        restrictions: only one file per request, the extracted data is added to
        the index with no ability to modify it, etc. To simplify the process and
        allow for more advanced use we'll run using the extract-only mode to
        return the extracted data without adding it to the index so we can then
        use it within Haystack's normal templating process.

        Returns None if metadata cannot be extracted; otherwise returns a
        dictionary containing at least two keys:

            :contents:
                        Extracted full-text content, if applicable
            :metadata:
                        key:value pairs of text strings
        """

        try:
            return self.conn.extract(file_obj)
        except Exception as e:
            self.log.warning(u"Unable to extract file contents: %s",
                             e,
                             exc_info=True,
                             extra={"data": {
                                 "file": file_obj
                             }})
            return None
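
Relative to Example #1, this revision accepts a dict for highlight and passes
unrecognized keyword arguments straight through to Solr via extra_kwargs. A
sketch of the kwargs translation (the field names and values are assumed, not
taken from the snippet):

# Assumes `backend` is an instance of the SolrSearchBackend above; 'author'
# and 'price' are illustrative index fields.
params = backend.build_search_kwargs(
    'title:django',
    facets={'author': {'limit': 10}},  # emits f.author.facet.limit
    stats={'price': ['author']},       # emits stats.field and f.price.stats.facet
    mm='2<75%',                        # unknown kwargs pass through untouched
)
# params now contains 'facet': 'on', 'stats': 'true', the per-field options
# noted above, and the passthrough 'mm' parameter, alongside the defaults.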
Example #3
class SearchBackend(BaseSearchBackend):
    # Words reserved by Solr for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )
    
    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':',
    )
    
    def __init__(self, site=None):
        super(SearchBackend, self).__init__(site)
        
        if not hasattr(settings, 'HAYSTACK_SOLR_URL'):
            raise ImproperlyConfigured('You must specify a HAYSTACK_SOLR_URL in your settings.')
        
        timeout = getattr(settings, 'HAYSTACK_SOLR_TIMEOUT', 10)
        self.conn = Solr(settings.HAYSTACK_SOLR_URL, timeout=timeout)
    
    def update(self, index, iterable, commit=True):
        docs = []
        
        try:
            for obj in iterable:
                doc = {}
                doc['id'] = self.get_identifier(obj)
                doc['django_ct'] = "%s.%s" % (obj._meta.app_label, obj._meta.module_name)
                doc['django_id'] = force_unicode(obj.pk)
                doc.update(index.prepare(obj))
                docs.append(doc)
        except UnicodeDecodeError:
            sys.stderr.write("Chunk failed.\n")
        
        self.conn.add(docs, commit=commit)

    def remove(self, obj_or_string, commit=True):
        solr_id = self.get_identifier(obj_or_string)
        self.conn.delete(id=solr_id, commit=commit)

    def clear(self, models=None, commit=True):
        if not models:
            # *:* matches all docs in Solr
            self.conn.delete(q='*:*', commit=commit)
        else:
            models_to_delete = []
            
            for model in models:
                models_to_delete.append("django_ct:%s.%s" % (model._meta.app_label, model._meta.module_name))
            
            self.conn.delete(q=" OR ".join(models_to_delete), commit=commit)
        
        # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
        self.conn.optimize()

    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
               narrow_queries=None, **kwargs):
        if len(query_string) == 0:
            return []
        
        kwargs = {
            'fl': '* score',
        }
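        # NOTE: this rebinding quietly discards anything the caller passed in
        # via **kwargs.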
        
        if fields:
            kwargs['fl'] = fields
        
        if sort_by is not None:
            kwargs['sort'] = sort_by
        
        if start_offset is not None:
            kwargs['start'] = start_offset
        
        if end_offset is not None:
            kwargs['rows'] = end_offset
        
        if highlight is True:
            kwargs['hl'] = 'true'
            kwargs['hl.fragsize'] = '200'
        
        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            kwargs['spellcheck'] = 'true'
            kwargs['spellcheck.collate'] = 'true'
            kwargs['spellcheck.count'] = 1
        
        if facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.field'] = facets
        
        if date_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.date'] = date_facets.keys()
            
            for key, value in date_facets.items():
                # Date-based facets in Solr kinda suck.
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(value.get('start_date'))
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(value.get('end_date'))
                kwargs["f.%s.facet.date.gap" % key] = value.get('gap')
        
        if query_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.query'] = ["%s:%s" % (field, value) for field, value in query_facets.items()]
        
        if narrow_queries is not None:
            kwargs['fq'] = list(narrow_queries)
        
        raw_results = self.conn.search(query_string, **kwargs)
        return self._process_results(raw_results, highlight=highlight)
    
    def more_like_this(self, model_instance, additional_query_string=None, start_offset=0, end_offset=None, **kwargs):
        index = self.site.get_index(model_instance.__class__)
        field_name = index.get_content_field()
        params = {
            'fl': '*,score',
        }
        
        if start_offset is not None:
            params['start'] = start_offset
        
        if end_offset is not None:
            params['rows'] = end_offset
        
        if additional_query_string:
            params['fq'] = additional_query_string
        
        raw_results = self.conn.more_like_this("id:%s" % self.get_identifier(model_instance), field_name, **params)
        return self._process_results(raw_results)
    
    def _process_results(self, raw_results, highlight=False):
        from haystack import site
        results = []
        hits = raw_results.hits
        facets = {}
        spelling_suggestion = None
        
        if hasattr(raw_results, 'facets'):
            facets = {
                'fields': raw_results.facets.get('facet_fields', {}),
                'dates': raw_results.facets.get('facet_dates', {}),
                'queries': raw_results.facets.get('facet_queries', {}),
            }
            
            for key in ['fields']:
                for facet_field in facets[key]:
                    # Convert to a two-tuple, as Solr's json format returns a list of
                    # pairs.
                    facets[key][facet_field] = list(zip(facets[key][facet_field][::2], facets[key][facet_field][1::2]))
        
        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            if hasattr(raw_results, 'spellcheck'):
                if len(raw_results.spellcheck.get('suggestions', [])):
                    # For some reason, it's an array of pairs. Pull off the
                    # collated result from the end.
                    spelling_suggestion = raw_results.spellcheck.get('suggestions')[-1]
        
        indexed_models = site.get_indexed_models()
        
        for raw_result in raw_results.docs:
            app_label, model_name = raw_result['django_ct'].split('.')
            additional_fields = {}
            
            for key, value in raw_result.items():
                additional_fields[str(key)] = self.conn._to_python(value)
            
            del(additional_fields['django_ct'])
            del(additional_fields['django_id'])
            del(additional_fields['score'])
            
            if raw_result['id'] in getattr(raw_results, 'highlighting', {}):
                additional_fields['highlighted'] = raw_results.highlighting[raw_result['id']]
            
            model = get_model(app_label, model_name)
            
            if model:
                if model in indexed_models:
                    result = SearchResult(app_label, model_name, raw_result['django_id'], raw_result['score'], **additional_fields)
                    results.append(result)
                else:
                    hits -= 1
            else:
                hits -= 1
        
        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }
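
The legacy backend above is configured through module-level Django settings
rather than per-connection options. A configuration sketch (the values are
assumed; only HAYSTACK_SOLR_URL is required):

# settings.py
HAYSTACK_SOLR_URL = 'http://127.0.0.1:8983/solr'
HAYSTACK_SOLR_TIMEOUT = 60        # seconds; falls back to 10 when unset
HAYSTACK_INCLUDE_SPELLING = True  # enables the spellcheck params in search()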
Example #4
class SolrSearchBackend(BaseSearchBackend):
    # Words reserved by Solr for special use.
    RESERVED_WORDS = ("AND", "NOT", "OR", "TO")

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        "\\",
        "+",
        "-",
        "&&",
        "||",
        "!",
        "(",
        ")",
        "{",
        "}",
        "[",
        "]",
        "^",
        '"',
        "~",
        "*",
        "?",
        ":",
        "/",
    )

    def __init__(self, connection_alias, **connection_options):
        super(SolrSearchBackend, self).__init__(connection_alias,
                                                **connection_options)

        if "URL" not in connection_options:
            raise ImproperlyConfigured(
                "You must specify a 'URL' in your settings for connection '%s'."
                % connection_alias)

        self.collate = connection_options.get("COLLATE_SPELLING", True)

        self.conn = Solr(connection_options["URL"],
                         timeout=self.timeout,
                         **connection_options.get("KWARGS", {}))
        self.log = logging.getLogger("haystack")

    def update(self, index, iterable, commit=True):
        docs = []

        for obj in iterable:
            try:
                docs.append(index.full_prepare(obj))
            except SkipDocument:
                self.log.debug("Indexing for object `%s` skipped", obj)
            except UnicodeDecodeError:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(
                    "UnicodeDecodeError while preparing object for update",
                    exc_info=True,
                    extra={
                        "data": {
                            "index": index,
                            "object": get_identifier(obj)
                        }
                    },
                )

        if len(docs) > 0:
            try:
                self.conn.add(docs,
                              commit=commit,
                              boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s",
                               e,
                               exc_info=True)

    def remove(self, obj_or_string, commit=True):
        solr_id = get_identifier(obj_or_string)

        try:
            kwargs = {"commit": commit, "id": solr_id}
            self.conn.delete(**kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to remove document '%s' from Solr: %s",
                solr_id,
                e,
                exc_info=True,
            )

    def clear(self, models=None, commit=True):
        if models is not None:
            assert isinstance(models, (list, tuple))

        try:
            if models is None:
                # *:* matches all docs in Solr
                self.conn.delete(q="*:*", commit=commit)
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append("%s:%s" %
                                            (DJANGO_CT, get_model_ct(model)))

                self.conn.delete(q=" OR ".join(models_to_delete),
                                 commit=commit)

            if commit:
                # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
                self.conn.optimize()
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            if models is not None:
                self.log.error(
                    "Failed to clear Solr index of models '%s': %s",
                    ",".join(models_to_delete),
                    e,
                    exc_info=True,
                )
            else:
                self.log.error("Failed to clear Solr index: %s",
                               e,
                               exc_info=True)

    @log_query
    def search(self, query_string, **kwargs):
        if len(query_string) == 0:
            return {"results": [], "hits": 0}

        search_kwargs = self.build_search_kwargs(query_string, **kwargs)

        try:
            raw_results = self.conn.search(query_string, **search_kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to query Solr using '%s': %s",
                           query_string,
                           e,
                           exc_info=True)
            raw_results = EmptyResults()

        return self._process_results(
            raw_results,
            highlight=kwargs.get("highlight"),
            result_class=kwargs.get("result_class", SearchResult),
            distance_point=kwargs.get("distance_point"),
        )

    def build_search_kwargs(self,
                            query_string,
                            sort_by=None,
                            start_offset=0,
                            end_offset=None,
                            fields="",
                            highlight=False,
                            facets=None,
                            date_facets=None,
                            query_facets=None,
                            narrow_queries=None,
                            spelling_query=None,
                            within=None,
                            dwithin=None,
                            distance_point=None,
                            models=None,
                            limit_to_registered_models=None,
                            result_class=None,
                            stats=None,
                            collate=None,
                            **extra_kwargs):

        index = haystack.connections[self.connection_alias].get_unified_index()

        kwargs = {"fl": "* score", "df": index.document_field}

        if fields:
            if isinstance(fields, (list, set)):
                fields = " ".join(fields)

            kwargs["fl"] = fields

        if sort_by is not None:
            if sort_by in ["distance asc", "distance desc"] and distance_point:
                # Do the geo-enabled sort.
                lng, lat = distance_point["point"].coords
                kwargs["sfield"] = distance_point["field"]
                kwargs["pt"] = "%s,%s" % (lat, lng)

                if sort_by == "distance asc":
                    kwargs["sort"] = "geodist() asc"
                else:
                    kwargs["sort"] = "geodist() desc"
            else:
                if sort_by.startswith("distance "):
                    warnings.warn(
                        "In order to sort by distance, you must call the '.distance(...)' method."
                    )

                # Regular sorting.
                kwargs["sort"] = sort_by

        if start_offset is not None:
            kwargs["start"] = start_offset

        if end_offset is not None:
            kwargs["rows"] = end_offset - start_offset

        if highlight:
            # `highlight` can either be True or a dictionary containing custom parameters
            # which will be passed to the backend and may override our default settings:

            kwargs["hl"] = "true"
            kwargs["hl.fragsize"] = "200"

            if isinstance(highlight, dict):
                # autoprefix highlighter options with 'hl.', all of them start with it anyway
                # this makes option dicts shorter: {'maxAnalyzedChars': 42}
                # and lets some of options be used as keyword arguments: `.highlight(preserveMulti=False)`
                kwargs.update({
                    key if key.startswith("hl.") else ("hl." + key):
                    highlight[key]
                    for key in highlight.keys()
                })
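                # e.g. (hypothetical) {'maxAnalyzedChars': 42} becomes
                # {'hl.maxAnalyzedChars': 42}; keys already prefixed with
                # 'hl.' pass through unchanged.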

        if collate is None:
            collate = self.collate
        if self.include_spelling is True:
            kwargs["spellcheck"] = "true"
            kwargs["spellcheck.collate"] = str(collate).lower()
            kwargs["spellcheck.count"] = 1

            if spelling_query:
                kwargs["spellcheck.q"] = spelling_query

        if facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.field"] = facets.keys()

            for facet_field, options in facets.items():
                for key, value in options.items():
                    kwargs["f.%s.facet.%s" %
                           (facet_field, key)] = self.conn._from_python(value)

        if date_facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.date"] = date_facets.keys()
            kwargs["facet.date.other"] = "none"

            for key, value in date_facets.items():
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(
                    value.get("start_date"))
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(
                    value.get("end_date"))
                gap_by_string = value.get("gap_by").upper()
                gap_string = "%d%s" % (value.get("gap_amount"), gap_by_string)

                if value.get("gap_amount") != 1:
                    gap_string += "S"

                kwargs["f.%s.facet.date.gap" % key] = "+%s/%s" % (
                    gap_string,
                    gap_by_string,
                )
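                # e.g. (hypothetical) gap_by='day' with gap_amount=7 yields
                # '+7DAYS/DAY', while gap_amount=1 yields '+1DAY/DAY'
                # (no plural 'S').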

        if query_facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.query"] = [
                "%s:%s" % (field, value) for field, value in query_facets
            ]

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add("%s:(%s)" %
                               (DJANGO_CT, " OR ".join(model_choices)))

        if narrow_queries is not None:
            kwargs["fq"] = list(narrow_queries)

        if stats:
            kwargs["stats"] = "true"

            # Pass all requested stats fields and facets as lists; assigning
            # one value per loop iteration would overwrite the key, keeping
            # only the last field/facet. pysolr encodes list values as
            # repeated request parameters.
            kwargs["stats.field"] = list(stats.keys())

            for k, stat_facets in stats.items():
                kwargs["f.%s.stats.facet" % k] = list(stat_facets)

        if within is not None:
            from haystack.utils.geo import generate_bounding_box

            kwargs.setdefault("fq", [])
            ((min_lat, min_lng), (max_lat, max_lng)) = generate_bounding_box(
                within["point_1"], within["point_2"])
            # Bounding boxes are min, min TO max, max. Solr's wiki was *NOT*
            # very clear on this.
            bbox = "%s:[%s,%s TO %s,%s]" % (
                within["field"],
                min_lat,
                min_lng,
                max_lat,
                max_lng,
            )
            kwargs["fq"].append(bbox)

        if dwithin is not None:
            kwargs.setdefault("fq", [])
            lng, lat = dwithin["point"].coords
            geofilt = "{!geofilt pt=%s,%s sfield=%s d=%s}" % (
                lat,
                lng,
                dwithin["field"],
                dwithin["distance"].km,
            )
            kwargs["fq"].append(geofilt)

        # Check to see if the backend should try to include distances
        # (Solr 4.X+) in the results.
        if self.distance_available and distance_point:
            # In early testing, you can't just hand Solr 4.X a proper bounding box
            # & request distances. To enable native distance would take calculating
            # a center point & a radius off the user-provided box, which kinda
            # sucks. We'll avoid it for now, since Solr 4.x's release will be some
            # time yet.
            # kwargs['fl'] += ' _dist_:geodist()'
            pass

        if extra_kwargs:
            kwargs.update(extra_kwargs)

        return kwargs
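
    # Sketch of the kwargs this method produces for a plain query (values
    # hypothetical): build_search_kwargs('hello', end_offset=10, highlight=True)
    # -> {'fl': '* score', 'df': 'text', 'start': 0, 'rows': 10,
    #     'hl': 'true', 'hl.fragsize': '200',
    #     'fq': ['django_ct:(auth.user OR blog.post)']}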

    def more_like_this(self,
                       model_instance,
                       additional_query_string=None,
                       start_offset=0,
                       end_offset=None,
                       models=None,
                       limit_to_registered_models=None,
                       result_class=None,
                       **kwargs):
        from haystack import connections

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        index = (connections[self.connection_alias]
                 .get_unified_index()
                 .get_index(model_klass))
        field_name = index.get_content_field()
        params = {"fl": "*,score"}

        if start_offset is not None:
            params["start"] = start_offset

        if end_offset is not None:
            params["rows"] = end_offset

        narrow_queries = set()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True)

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            # narrow_queries is always a set here (initialized above), so no
            # None check is needed.
            narrow_queries.add("%s:(%s)" %
                               (DJANGO_CT, " OR ".join(model_choices)))

        if additional_query_string:
            narrow_queries.add(additional_query_string)

        if narrow_queries:
            params["fq"] = list(narrow_queries)

        query = "%s:%s" % (ID, get_identifier(model_instance))

        try:
            raw_results = self.conn.more_like_this(query, field_name, **params)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to fetch More Like This from Solr for document '%s': %s",
                query,
                e,
                exc_info=True,
            )
            raw_results = EmptyResults()

        return self._process_results(raw_results, result_class=result_class)

    def _process_results(self,
                         raw_results,
                         highlight=False,
                         result_class=None,
                         distance_point=None):
        from haystack import connections

        results = []
        hits = raw_results.hits
        facets = {}
        stats = {}
        spelling_suggestion = spelling_suggestions = None

        if result_class is None:
            result_class = SearchResult

        if hasattr(raw_results, "stats"):
            stats = raw_results.stats.get("stats_fields", {})

        if hasattr(raw_results, "facets"):
            facets = {
                "fields": raw_results.facets.get("facet_fields", {}),
                "dates": raw_results.facets.get("facet_dates", {}),
                "queries": raw_results.facets.get("facet_queries", {}),
            }

            for key in ["fields"]:
                for facet_field in facets[key]:
                    # Convert to a two-tuple, as Solr's json format returns a list of
                    # pairs.
                    facets[key][facet_field] = list(
                        zip(
                            facets[key][facet_field][::2],
                            facets[key][facet_field][1::2],
                        ))
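                    # e.g. Solr's flat list ['10', 2, '7', 1] becomes
                    # [('10', 2), ('7', 1)].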

        if self.include_spelling and hasattr(raw_results, "spellcheck"):
            try:
                spelling_suggestions = self.extract_spelling_suggestions(
                    raw_results)
            except Exception as exc:
                self.log.error(
                    "Error extracting spelling suggestions: %s",
                    exc,
                    exc_info=True,
                    extra={"data": {
                        "spellcheck": raw_results.spellcheck
                    }},
                )

                if not self.silently_fail:
                    raise

                spelling_suggestions = None

            if spelling_suggestions:
                # Maintain compatibility with older versions of Haystack which returned a single suggestion:
                spelling_suggestion = spelling_suggestions[-1]
                assert isinstance(spelling_suggestion, six.string_types)
            else:
                spelling_suggestion = None

        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for raw_result in raw_results.docs:
            app_label, model_name = raw_result[DJANGO_CT].split(".")
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                index = unified_index.get_index(model)
                index_field_map = index.field_map
                for key, value in raw_result.items():
                    string_key = str(key)
                    # re-map key if alternate name used
                    if string_key in index_field_map:
                        string_key = index_field_map[key]

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], "convert"):
                        additional_fields[string_key] = index.fields[
                            string_key].convert(value)
                    else:
                        additional_fields[string_key] = self.conn._to_python(
                            value)

                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]
                del additional_fields["score"]

                if raw_result[ID] in getattr(raw_results, "highlighting", {}):
                    additional_fields["highlighted"] = \
                        raw_results.highlighting[raw_result[ID]]

                if distance_point:
                    additional_fields["_point_of_origin"] = distance_point

                    if raw_result.get("__dist__"):
                        from haystack.utils.geo import Distance

                        additional_fields["_distance"] = Distance(
                            km=float(raw_result["__dist__"]))
                    else:
                        additional_fields["_distance"] = None

                result = result_class(app_label, model_name,
                                      raw_result[DJANGO_ID],
                                      raw_result["score"], **additional_fields)
                results.append(result)
            else:
                hits -= 1

        return {
            "results": results,
            "hits": hits,
            "stats": stats,
            "facets": facets,
            "spelling_suggestion": spelling_suggestion,
            "spelling_suggestions": spelling_suggestions,
        }

    def extract_spelling_suggestions(self, raw_results):
        # There are many different formats for Legacy, 6.4, and 6.5 e.g.
        # https://issues.apache.org/jira/browse/SOLR-3029 and depending on the
        # version and configuration the response format may be a dict of dicts,
        # a list of dicts, or a list of strings.

        collations = raw_results.spellcheck.get("collations", None)
        suggestions = raw_results.spellcheck.get("suggestions", None)
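        # Roughly (hypothetical payloads; shapes vary by version/config):
        #   legacy: suggestions = [..., 'collation', 'corrected query']
        #   6.4:    collations  = ['collation', {'collationQuery': ...}, ...]
        #   6.5:    collations  = {'collation': 'corrected query'}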

        # We'll collect multiple suggestions here. For backwards
        # compatibility with older versions of Haystack we'll still return
        # only a single suggestion but in the future we can expose all of
        # them.

        spelling_suggestions = []

        if collations:
            if isinstance(collations, dict):
                # Solr 6.5
                collation_values = collations["collation"]
                if isinstance(collation_values, six.string_types):
                    collation_values = [collation_values]
                elif isinstance(collation_values, dict):
                    # spellcheck.collateExtendedResults changes the format to a dictionary:
                    collation_values = [collation_values["collationQuery"]]
            elif isinstance(collations[1], dict):
                # Solr 6.4
                collation_values = collations
            else:
                # Older versions of Solr
                collation_values = collations[-1:]

            for i in collation_values:
                # Depending on the options the values are either simple strings or dictionaries:
                spelling_suggestions.append(
                    i["collationQuery"] if isinstance(i, dict) else i)
        elif suggestions:
            if isinstance(suggestions, dict):
                for i in suggestions.values():
                    for j in i["suggestion"]:
                        if isinstance(j, dict):
                            spelling_suggestions.append(j["word"])
                        else:
                            spelling_suggestions.append(j)
            elif isinstance(suggestions[0], six.string_types) and isinstance(
                    suggestions[1], dict):
                # Solr 6.4 returns a flat list of alternating (word, dictionary) pairs:
                for suggestion in suggestions:
                    if isinstance(suggestion, dict):
                        for i in suggestion["suggestion"]:
                            if isinstance(i, dict):
                                spelling_suggestions.append(i["word"])
                            else:
                                spelling_suggestions.append(i)
            else:
                # Legacy Solr
                spelling_suggestions.append(suggestions[-1])

        return spelling_suggestions

    def build_schema(self, fields):
        content_field_name = ""
        schema_fields = []

        for field_name, field_class in fields.items():
            field_data = {
                "field_name": field_class.index_fieldname,
                "type": "text_en",
                "indexed": "true",
                "stored": "true",
                "multi_valued": "false",
            }

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

            # DRL_FIXME: Perhaps move to something where, if none of these
            #            checks succeed, call a custom method on the form that
            #            returns, per-backend, the right type of storage?
            if field_class.field_type in ["date", "datetime"]:
                field_data["type"] = "date"
            elif field_class.field_type == "integer":
                field_data["type"] = "long"
            elif field_class.field_type == "float":
                field_data["type"] = "float"
            elif field_class.field_type == "boolean":
                field_data["type"] = "boolean"
            elif field_class.field_type == "ngram":
                field_data["type"] = "ngram"
            elif field_class.field_type == "edge_ngram":
                field_data["type"] = "edge_ngram"
            elif field_class.field_type == "location":
                field_data["type"] = "location"

            if field_class.is_multivalued:
                field_data["multi_valued"] = "true"

            if field_class.stored is False:
                field_data["stored"] = "false"

            # Do this last to override `text` fields.
            if field_class.indexed is False:
                field_data["indexed"] = "false"

                # If it's text and not being indexed, we probably don't want
                # to do the normal lowercase/tokenize/stemming/etc. dance.
                if field_data["type"] == "text_en":
                    field_data["type"] = "string"

            # If it's a ``FacetField``, make sure we don't postprocess it.
            if hasattr(field_class, "facet_for"):
                # If it's text, it ought to be a string.
                if field_data["type"] == "text_en":
                    field_data["type"] = "string"

            schema_fields.append(field_data)

        return (content_field_name, schema_fields)
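
    # Sketch of build_schema's output for a single document CharField
    # (field names hypothetical):
    #   ('text', [{'field_name': 'text', 'type': 'text_en', 'indexed': 'true',
    #              'stored': 'true', 'multi_valued': 'false'}])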

    def extract_file_contents(self, file_obj, **kwargs):
        """Extract text and metadata from a structured file (PDF, MS Word, etc.)

        Uses the Solr ExtractingRequestHandler, which is based on Apache Tika.
        See the Solr wiki for details:

            http://wiki.apache.org/solr/ExtractingRequestHandler

        Due to the way the ExtractingRequestHandler is implemented it completely
        replaces the normal Haystack indexing process with several unfortunate
        restrictions: only one file per request, the extracted data is added to
        the index with no ability to modify it, etc. To simplify the process and
        allow for more advanced use we'll run using the extract-only mode to
        return the extracted data without adding it to the index so we can then
        use it within Haystack's normal templating process.

        Returns None if metadata cannot be extracted; otherwise returns a
        dictionary containing at least two keys:

            :contents:
                        Extracted full-text content, if applicable
            :metadata:
                        key:value pairs of text strings
        """

        try:
            return self.conn.extract(file_obj, **kwargs)
        except Exception as e:
            self.log.warning(
                "Unable to extract file contents: %s",
                e,
                exc_info=True,
                extra={"data": {
                    "file": file_obj
                }},
            )
            return None
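
A minimal, hedged usage sketch for the backend above. It assumes a Haystack
setup with a 'default' Solr connection; the query string, facet field, and
result data are hypothetical:

import haystack

# Haystack resolves the configured SolrSearchBackend for us.
backend = haystack.connections['default'].get_backend()

# Full-text query with highlighting and a simple field facet (names made up).
response = backend.search('example', highlight=True, facets={'author': {}})

for result in response['results']:
    # 'highlighted' is only set when Solr returned highlighting data.
    print(result.pk, getattr(result, 'highlighted', None))

print(response['hits'], response['facets'].get('fields', {}).get('author'))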
Example #5
class SolrTestCase(unittest.TestCase):
    def setUp(self):
        super(SolrTestCase, self).setUp()
        self.default_solr = Solr('http://*****:*****')

    @unittest.skipUnless(HAS_LXML, "Cannot test Tomcat error extraction without lxml")
    def test__scrape_response_tomcat(self):
        """Tests for Tomcat error responses, which currently require lxml.html to parse"""

        # Tomcat.
        resp_1 = self.solr._scrape_response({'server': 'coyote'}, '<html><body><p><span>Error message</span><span>messed up.</span></p></body></html>')
        self.assertEqual(resp_1, ('messed up.', ''))

        # Broken Tomcat.
        resp_2 = self.solr._scrape_response({'server': 'coyote'}, '<html><body><p>Really broken. Scraping Java-generated HTML sucks.</pre></body></html>')
        self.assertEqual(resp_2, (None, u'<div><body><p>Really broken. Scraping Java-generated HTML sucks.</p></body></div>'))

    def test__from_python(self):
        self.assertEqual(self.solr._from_python(datetime.date(2013, 1, 18)), '2013-01-18T00:00:00Z')
        self.assertEqual(self.solr._from_python(datetime.datetime(2013, 1, 18, 0, 30, 28)), '2013-01-18T00:30:28Z')
        self.assertEqual(self.solr._from_python(True), 'true')
        self.assertEqual(self.solr._from_python(False), 'false')
        self.assertEqual(self.solr._from_python(1), '1')
        self.assertEqual(self.solr._from_python(1.2), '1.2')
        self.assertEqual(self.solr._from_python(b'hello'), 'hello')
        self.assertEqual(self.solr._from_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._from_python('\x01test\x02'), 'test')

    def test__to_python(self):
        self.assertEqual(self.solr._to_python('2013-01-18T00:00:00Z'), datetime.datetime(2013, 1, 18))
        self.assertEqual(self.solr._to_python('2013-01-18T00:30:28Z'), datetime.datetime(2013, 1, 18, 0, 30, 28))
        self.assertEqual(self.solr._to_python('true'), True)
        self.assertEqual(self.solr._to_python('false'), False)
        self.assertEqual(self.solr._to_python(1), 1)
        self.assertEqual(self.solr._to_python(1.2), 1.2)
        self.assertEqual(self.solr._to_python(b'hello'), 'hello')
        self.assertEqual(self.solr._to_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._to_python(['foo', 'bar']), 'foo')
        self.assertEqual(self.solr._to_python(('foo', 'bar')), 'foo')
        self.assertEqual(self.solr._to_python('tuple("foo", "bar")'), 'tuple("foo", "bar")')

    def test__is_null_value(self):
        self.assertTrue(self.solr._is_null_value(None))
        self.assertTrue(self.solr._is_null_value(''))

        self.assertFalse(self.solr._is_null_value('Hello'))
        self.assertFalse(self.solr._is_null_value(1))

    def test_search(self):
        results = self.solr.search('doc')
        self.assertEqual(len(results), 3)

        results = self.solr.search('example')
        self.assertEqual(len(results), 2)

        results = self.solr.search('nothing')
        self.assertEqual(len(results), 0)

        # Advanced options.
        results = self.solr.search('doc', **{
            'debug': 'true',
            'hl': 'true',
            'hl.fragsize': 8,
            'facet': 'on',
            'facet.field': 'popularity',
            'spellcheck': 'true',
            'spellcheck.collate': 'true',
            'spellcheck.count': 1,
            # TODO: Can't get these working in my test setup.
            # 'group': 'true',
            # 'group.field': 'id',
        })
        self.assertEqual(len(results), 3)
        self.assertTrue('explain' in results.debug)
        self.assertEqual(results.highlighting, {u'doc_4': {}, u'doc_2': {}, u'doc_1': {}})
        self.assertEqual(results.spellcheck, {})
        self.assertEqual(results.facets['facet_fields']['popularity'], ['10', 2, '7', 1, '2', 0, '8', 0])
        self.assertTrue(results.qtime is not None)
        # TODO: Can't get these working in my test setup.
        # self.assertEqual(results.grouped, '')

    def test_more_like_this(self):
        results = self.solr.more_like_this('id:doc_1', 'text')
        self.assertEqual(len(results), 0)

    def test_suggest_terms(self):
        results = self.solr.suggest_terms('title', '')
        self.assertEqual(len(results), 1)
        self.assertEqual(results, {'title': [('doc', 3), ('another', 2), ('example', 2), ('1', 1), ('2', 1), ('boring', 1), ('rock', 1), ('thing', 1)]})

    def test__build_doc(self):
        doc = {
            'id': 'doc_1',
            'title': 'Example doc ☃ 1',
            'price': 12.59,
            'popularity': 10,
        }
        doc_xml = force_unicode(ET.tostring(self.solr._build_doc(doc), encoding='utf-8'))
        self.assertTrue('<field name="title">Example doc ☃ 1</field>' in doc_xml)
        self.assertTrue('<field name="id">doc_1</field>' in doc_xml)
        self.assertEqual(len(doc_xml), 152)

    def test_add(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.assertEqual(len(self.solr.search('example')), 2)

        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            },
            {
                'id': 'doc_7',
                'title': 'Another example doc',
            },
        ])

        self.assertEqual(len(self.solr.search('doc')), 5)
        self.assertEqual(len(self.solr.search('example')), 3)

    def test_add_with_boost(self):
        self.assertEqual(len(self.solr.search('doc')), 3)

        self.solr.add([{'id': 'doc_6', 'title': 'Important doc'}],
                      boost={'title': 10.0})

        self.solr.add([{'id': 'doc_7', 'title': 'Spam doc doc'}],
                      boost={'title': 0})

        res = self.solr.search('doc')
        self.assertEqual(len(res), 5)
        self.assertEqual('doc_6', res.docs[0]['id'])

    def test_field_update(self):
        originalDocs = self.solr.search('doc')
        self.assertEqual(len(originalDocs), 3)
        updateList = []
        for i, doc in enumerate(originalDocs):
            updateList.append({'id': doc['id'], 'popularity': 5})
        self.solr.add(updateList, fieldUpdates={'popularity': 'inc'})

        updatedDocs = self.solr.search('doc')
        self.assertEqual(len(updatedDocs), 3)
        for i, (originalDoc, updatedDoc) in enumerate(zip(originalDocs, updatedDocs)):
            self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys()))
            self.assertEqual(updatedDoc['popularity'], originalDoc['popularity'] + 5)
            self.assertEqual(True, all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys() if k not in ['_version_', 'popularity']))

        self.solr.add([
            {
                'id': 'multivalued_1',
                'title': 'Multivalued doc 1',
                'word_ss': ['alpha', 'beta'],
            },
            {
                'id': 'multivalued_2',
                'title': 'Multivalued doc 2',
                'word_ss': ['charlie', 'delta'],
            },
        ])

        originalDocs = self.solr.search('multivalued')
        self.assertEqual(len(originalDocs), 2)
        updateList = []
        for i, doc in enumerate(originalDocs):
            updateList.append({'id': doc['id'], 'word_ss': ['epsilon', 'gamma']})
        self.solr.add(updateList, fieldUpdates={'word_ss': 'add'})

        updatedDocs = self.solr.search('multivalued')
        self.assertEqual(len(updatedDocs), 2)
        for i, (originalDoc, updatedDoc) in enumerate(zip(originalDocs, updatedDocs)):
            self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys()))
            self.assertEqual(updatedDoc['word_ss'], originalDoc['word_ss'] + ['epsilon', 'gamma'])
            self.assertEqual(True, all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys() if k not in ['_version_', 'word_ss']))

    def test_delete(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.delete(id='doc_1')
        self.assertEqual(len(self.solr.search('doc')), 2)
        self.solr.delete(q='price:[0 TO 15]')
        self.assertEqual(len(self.solr.search('doc')), 1)

        self.assertEqual(len(self.solr.search('*:*')), 1)
        self.solr.delete(q='*:*')
        self.assertEqual(len(self.solr.search('*:*')), 0)

        # Need at least one.
        self.assertRaises(ValueError, self.solr.delete)
        # Can't have both.
        self.assertRaises(ValueError, self.solr.delete, id='foo', q='bar')

    def test_commit(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            }
        ], commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.commit()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_optimize(self):
        # Make sure it doesn't blow up. Side effects are hard to measure. :/
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            }
        ], commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.optimize()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_extract(self):
        fake_f = StringIO("""
            <html>
                <head>
                    <meta charset="utf-8">
                    <meta name="haystack-test" content="test 1234">
                    <title>Test Title ☃&#x2603;</title>
                </head>
                    <body>foobar</body>
            </html>
        """)
        fake_f.name = "test.html"
        extracted = self.solr.extract(fake_f)

        # Verify documented response structure:
        self.assertIn('contents', extracted)
        self.assertIn('metadata', extracted)

        self.assertIn('foobar', extracted['contents'])

        m = extracted['metadata']

        self.assertEqual([fake_f.name], m['stream_name'])

        self.assertIn('haystack-test', m, "HTML metadata should have been extracted!")
        self.assertEqual(['test 1234'], m['haystack-test'])

        # Note the underhanded use of a double snowman to verify both that Tika
        # correctly decoded entities and that our UTF-8 characters survived the
        # round-trip:
        self.assertEqual(['Test Title ☃☃'], m['title'])

    def test_full_url(self):
        self.solr.url = 'http://localhost:8983/solr/core0'
        full_url = self.solr._create_full_url(path='/update')

        # Make sure trailing and leading slashes do not collide:
        self.assertEqual(full_url, 'http://localhost:8983/solr/core0/update')
Example #6
class SolrTestCase(unittest.TestCase):
    def setUp(self):
        super(SolrTestCase, self).setUp()
        self.default_solr = Solr('http://localhost:8983/solr/core0')
        # Short timeouts.
        self.solr = Solr('http://localhost:8983/solr/core0', timeout=2)
        self.docs = [
            {
                'id': 'doc_1',
                'title': 'Example doc 1',
                'price': 12.59,
                'popularity': 10,
            },
            {
                'id': 'doc_2',
                'title': 'Another example ☃ doc 2',
                'price': 13.69,
                'popularity': 7,
            },
            {
                'id': 'doc_3',
                'title': 'Another thing',
                'price': 2.35,
                'popularity': 8,
            },
            {
                'id': 'doc_4',
                'title': 'doc rock',
                'price': 99.99,
                'popularity': 10,
            },
            {
                'id': 'doc_5',
                'title': 'Boring',
                'price': 1.12,
                'popularity': 2,
            },
        ]

        # Clear it.
        self.solr.delete(q='*:*')

        # Index our docs. Yes, this leans on functionality we're going to test
        # later & if it's broken, everything will catastrophically fail.
        # Such is life.
        self.solr.add(self.docs)

    def tearDown(self):
        self.solr.delete(q='*:*')
        super(SolrTestCase, self).tearDown()

    def test_init(self):
        self.assertEqual(self.default_solr.url,
                         'http://localhost:8983/solr/core0')
        self.assertTrue(isinstance(self.default_solr.decoder,
                                   json.JSONDecoder))
        self.assertEqual(self.default_solr.timeout, 60)

        self.assertEqual(self.solr.url, 'http://localhost:8983/solr/core0')
        self.assertTrue(isinstance(self.solr.decoder, json.JSONDecoder))
        self.assertEqual(self.solr.timeout, 2)

    def test__create_full_url(self):
        # Nada.
        self.assertEqual(self.solr._create_full_url(path=''),
                         'http://localhost:8983/solr/core0')
        # Basic path.
        self.assertEqual(self.solr._create_full_url(path='pysolr_tests'),
                         'http://localhost:8983/solr/core0/pysolr_tests')
        # Leading slash (& making sure we don't touch the trailing slash).
        self.assertEqual(
            self.solr._create_full_url(
                path='/pysolr_tests/select/?whatever=/'),
            'http://localhost:8983/solr/core0/pysolr_tests/select/?whatever=/')

    def test__send_request(self):
        # Test a valid request.
        resp_body = self.solr._send_request('GET', 'select/?q=doc&wt=json')
        self.assertTrue('"numFound":3' in resp_body)

        # Test a lowercase method & a body.
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee! ☃</field></doc></add>'
        resp_body = self.solr._send_request('POST',
                                            'update/?commit=true',
                                            body=xml_body,
                                            headers={
                                                'Content-type':
                                                'text/xml; charset=utf-8',
                                            })
        self.assertTrue('<int name="status">0</int>' in resp_body)

        # Test a non-existent URL.
        old_url = self.solr.url
        self.solr.url = 'http://127.0.0.1:567898/wahtever'
        self.assertRaises(SolrError, self.solr._send_request, 'get',
                          'select/?q=doc&wt=json')
        self.solr.url = old_url

    def test__select(self):
        # Short params.
        resp_body = self.solr._select({'q': 'doc'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 3)

        # Long params.
        resp_body = self.solr._select({'q': 'doc' * 1024})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)
        self.assertEqual(len(resp_data['responseHeader']['params']['q']),
                         3 * 1024)

    def test__mlt(self):
        resp_body = self.solr._mlt({'q': 'id:doc_1', 'mlt.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__suggest_terms(self):
        resp_body = self.solr._select({'terms.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__update(self):
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._update(xml_body)
        self.assertTrue('<int name="status">0</int>' in resp_body)

    def test__soft_commit(self):
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._update(xml_body, softCommit=True)
        self.assertTrue('<int name="status">0</int>' in resp_body)

    def test__extract_error(self):
        class RubbishResponse(object):
            def __init__(self, content, headers=None):
                if isinstance(content, bytes):
                    content = content.decode('utf-8')
                self.content = content
                self.headers = headers

                if self.headers is None:
                    self.headers = {}

            def json(self):
                return json.loads(self.content)

        # Just the reason.
        resp_1 = RubbishResponse("We don't care.",
                                 {'reason': 'Something went wrong.'})
        self.assertEqual(self.solr._extract_error(resp_1),
                         "[Reason: Something went wrong.]")

        # Empty reason.
        resp_2 = RubbishResponse("We don't care.", {'reason': None})
        self.assertEqual(self.solr._extract_error(resp_2),
                         "[Reason: None]\nWe don't care.")

        # No reason. Time to scrape.
        resp_3 = RubbishResponse(
            '<html><body><pre>Something is broke.</pre></body></html>',
            {'server': 'jetty'})
        self.assertEqual(self.solr._extract_error(resp_3),
                         "[Reason: Something is broke.]")

        # No reason. JSON response.
        resp_4 = RubbishResponse(b'\n {"error": {"msg": "It happens"}}',
                                 {'server': 'tomcat'})
        self.assertEqual(self.solr._extract_error(resp_4),
                         "[Reason: It happens]")

        # No reason. Weird JSON response.
        resp_5 = RubbishResponse(b'{"kinda": "weird"}', {'server': 'jetty'})
        self.assertEqual(self.solr._extract_error(resp_5),
                         '[Reason: None]\n{"kinda": "weird"}')

    def test__scrape_response(self):
        # Jetty.
        resp_1 = self.solr._scrape_response(
            {'server': 'jetty'},
            '<html><body><pre>Something is broke.</pre></body></html>')
        self.assertEqual(resp_1, ('Something is broke.', u''))

        # Other.
        resp_2 = self.solr._scrape_response(
            {'server': 'crapzilla'},
            '<html><head><title>Wow. Seriously weird.</title></head><body><pre>Something is broke.</pre></body></html>')
        self.assertEqual(resp_2, ('Wow. Seriously weird.', u''))

    @unittest.skipIf(
        sys.version_info < (2, 7),
        reason=u'Python 2.6 lacks the ElementTree 1.3 interface required for Solr XML error message parsing')
    def test__scrape_response_coyote_xml(self):
        resp_3 = self.solr._scrape_response(
            {'server': 'coyote'},
            '<?xml version="1.0"?>\n<response>\n<lst name="responseHeader"><int name="status">400</int><int name="QTime">0</int></lst><lst name="error"><str name="msg">Invalid Date String:\'2015-03-23 10:43:33\'</str><int name="code">400</int></lst>\n</response>\n')
        self.assertEqual(resp_3, ("Invalid Date String:'2015-03-23 10:43:33'",
                                  "Invalid Date String:'2015-03-23 10:43:33'"))

        # Valid XML with a traceback
        resp_4 = self.solr._scrape_response({'server': 'coyote'},
                                            """<?xml version="1.0"?>
<response>
<lst name="responseHeader"><int name="status">500</int><int name="QTime">138</int></lst><lst name="error"><str name="msg">Internal Server Error</str><str name="trace">org.apache.solr.common.SolrException: Internal Server Error at java.lang.Thread.run(Thread.java:745)</str><int name="code">500</int></lst>
</response>""")
        self.assertEqual(resp_4, (
            u"Internal Server Error",
            u"org.apache.solr.common.SolrException: Internal Server Error at java.lang.Thread.run(Thread.java:745)"
        ))

    def test__scrape_response_tomcat(self):
        """Tests for Tomcat error responses"""

        resp_0 = self.solr._scrape_response(
            {'server': 'coyote'},
            '<html><body><h1>Something broke!</h1><pre>gigantic stack trace</pre></body></html>')
        self.assertEqual(resp_0, ('Something broke!', ''))

        # Invalid XML
        bogus_xml = '<?xml version="1.0"?>\n<response>\n<lst name="responseHeader"><int name="status">400</int><int name="QTime">0</int></lst><lst name="error"><str name="msg">Invalid Date String:\'2015-03-23 10:43:33\'</str><int name="code">400</int></lst>'
        reason, full_html = self.solr._scrape_response({'server': 'coyote'},
                                                       bogus_xml)
        self.assertEqual(reason, None)
        self.assertEqual(full_html, bogus_xml.replace("\n", ""))

    def test__from_python(self):
        self.assertEqual(self.solr._from_python(datetime.date(2013, 1, 18)),
                         '2013-01-18T00:00:00Z')
        self.assertEqual(
            self.solr._from_python(datetime.datetime(2013, 1, 18, 0, 30, 28)),
            '2013-01-18T00:30:28Z')
        self.assertEqual(self.solr._from_python(True), 'true')
        self.assertEqual(self.solr._from_python(False), 'false')
        self.assertEqual(self.solr._from_python(1), '1')
        self.assertEqual(self.solr._from_python(1.2), '1.2')
        self.assertEqual(self.solr._from_python(b'hello'), 'hello')
        self.assertEqual(self.solr._from_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._from_python('\x01test\x02'), 'test')

    def test__to_python(self):
        self.assertEqual(self.solr._to_python('2013-01-18T00:00:00Z'),
                         datetime.datetime(2013, 1, 18))
        self.assertEqual(self.solr._to_python('2013-01-18T00:30:28Z'),
                         datetime.datetime(2013, 1, 18, 0, 30, 28))
        self.assertEqual(self.solr._to_python('true'), True)
        self.assertEqual(self.solr._to_python('false'), False)
        self.assertEqual(self.solr._to_python(1), 1)
        self.assertEqual(self.solr._to_python(1.2), 1.2)
        self.assertEqual(self.solr._to_python(b'hello'), 'hello')
        self.assertEqual(self.solr._to_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._to_python(['foo', 'bar']), 'foo')
        self.assertEqual(self.solr._to_python(('foo', 'bar')), 'foo')
        self.assertEqual(self.solr._to_python('tuple("foo", "bar")'),
                         'tuple("foo", "bar")')

    def test__is_null_value(self):
        self.assertTrue(self.solr._is_null_value(None))
        self.assertTrue(self.solr._is_null_value(''))

        self.assertFalse(self.solr._is_null_value('Hello'))
        self.assertFalse(self.solr._is_null_value(1))

    def test_search(self):
        results = self.solr.search('doc')
        self.assertEqual(len(results), 3)

        results = self.solr.search('example')
        self.assertEqual(len(results), 2)

        results = self.solr.search('nothing')
        self.assertEqual(len(results), 0)

        # Advanced options.
        results = self.solr.search(
            'doc',
            **{
                'debug': 'true',
                'hl': 'true',
                'hl.fragsize': 8,
                'facet': 'on',
                'facet.field': 'popularity',
                'spellcheck': 'true',
                'spellcheck.collate': 'true',
                'spellcheck.count': 1,
                # TODO: Can't get these working in my test setup.
                # 'group': 'true',
                # 'group.field': 'id',
            })
        self.assertEqual(len(results), 3)
        self.assertTrue('explain' in results.debug)
        self.assertEqual(results.highlighting, {
            u'doc_4': {},
            u'doc_2': {},
            u'doc_1': {}
        })
        self.assertEqual(results.spellcheck, {})
        self.assertEqual(results.facets['facet_fields']['popularity'],
                         ['10', 2, '7', 1, '2', 0, '8', 0])
        self.assertTrue(results.qtime is not None)
        # TODO: Can't get these working in my test setup.
        # self.assertEqual(results.grouped, '')

    def test_more_like_this(self):
        results = self.solr.more_like_this('id:doc_1', 'text')
        self.assertEqual(len(results), 0)

    def test_suggest_terms(self):
        results = self.solr.suggest_terms('title', '')
        self.assertEqual(len(results), 1)
        self.assertEqual(
            results, {
                'title': [('doc', 3), ('another', 2), ('example', 2), ('1', 1),
                          ('2', 1), ('boring', 1), ('rock', 1), ('thing', 1)]
            })

    def test__build_doc(self):
        doc = {
            'id': 'doc_1',
            'title': 'Example doc ☃ 1',
            'price': 12.59,
            'popularity': 10,
        }
        doc_xml = force_unicode(
            ET.tostring(self.solr._build_doc(doc), encoding='utf-8'))
        self.assertTrue(
            '<field name="title">Example doc ☃ 1</field>' in doc_xml)
        self.assertTrue('<field name="id">doc_1</field>' in doc_xml)
        self.assertEqual(len(doc_xml), 152)

    def test_add(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.assertEqual(len(self.solr.search('example')), 2)

        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            },
            {
                'id': 'doc_7',
                'title': 'Another example doc',
            },
        ])

        self.assertEqual(len(self.solr.search('doc')), 5)
        self.assertEqual(len(self.solr.search('example')), 3)

    def test_add_with_boost(self):
        self.assertEqual(len(self.solr.search('doc')), 3)

        self.solr.add([{
            'id': 'doc_6',
            'title': 'Important doc'
        }],
                      boost={'title': 10.0})

        self.solr.add([{
            'id': 'doc_7',
            'title': 'Spam doc doc'
        }],
                      boost={'title': 0})

        res = self.solr.search('doc')
        self.assertEqual(len(res), 5)
        self.assertEqual('doc_6', res.docs[0]['id'])

    def test_field_update(self):
        originalDocs = self.solr.search('doc')
        self.assertEqual(len(originalDocs), 3)
        updateList = []
        for i, doc in enumerate(originalDocs):
            updateList.append({'id': doc['id'], 'popularity': 5})
        self.solr.add(updateList, fieldUpdates={'popularity': 'inc'})

        updatedDocs = self.solr.search('doc')
        self.assertEqual(len(updatedDocs), 3)
        for i, (originalDoc,
                updatedDoc) in enumerate(zip(originalDocs, updatedDocs)):
            self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys()))
            self.assertEqual(updatedDoc['popularity'],
                             originalDoc['popularity'] + 5)
            self.assertEqual(
                True,
                all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys()
                    if k not in ['_version_', 'popularity']))

        self.solr.add([
            {
                'id': 'multivalued_1',
                'title': 'Multivalued doc 1',
                'word_ss': ['alpha', 'beta'],
            },
            {
                'id': 'multivalued_2',
                'title': 'Multivalued doc 2',
                'word_ss': ['charlie', 'delta'],
            },
        ])

        originalDocs = self.solr.search('multivalued')
        self.assertEqual(len(originalDocs), 2)
        updateList = []
        for i, doc in enumerate(originalDocs):
            updateList.append({
                'id': doc['id'],
                'word_ss': ['epsilon', 'gamma']
            })
        self.solr.add(updateList, fieldUpdates={'word_ss': 'add'})

        updatedDocs = self.solr.search('multivalued')
        self.assertEqual(len(updatedDocs), 2)
        for i, (originalDoc,
                updatedDoc) in enumerate(zip(originalDocs, updatedDocs)):
            self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys()))
            self.assertEqual(updatedDoc['word_ss'],
                             originalDoc['word_ss'] + ['epsilon', 'gamma'])
            self.assertEqual(
                True,
                all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys()
                    if k not in ['_version_', 'word_ss']))

    def test_delete(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.delete(id='doc_1')
        self.assertEqual(len(self.solr.search('doc')), 2)
        self.solr.delete(q='price:[0 TO 15]')
        self.assertEqual(len(self.solr.search('doc')), 1)

        self.assertEqual(len(self.solr.search('*:*')), 1)
        self.solr.delete(q='*:*')
        self.assertEqual(len(self.solr.search('*:*')), 0)

        # Need at least one.
        self.assertRaises(ValueError, self.solr.delete)
        # Can't have both.
        self.assertRaises(ValueError, self.solr.delete, id='foo', q='bar')

    def test_commit(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([{
            'id': 'doc_6',
            'title': 'Newly added doc',
        }],
                      commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.commit()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_optimize(self):
        # Make sure it doesn't blow up. Side effects are hard to measure. :/
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([{
            'id': 'doc_6',
            'title': 'Newly added doc',
        }],
                      commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.optimize()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_extract(self):
        fake_f = StringIO("""
            <html>
                <head>
                    <meta charset="utf-8">
                    <meta name="haystack-test" content="test 1234">
                    <title>Test Title ☃&#x2603;</title>
                </head>
                    <body>foobar</body>
            </html>
        """)
        fake_f.name = "test.html"
        extracted = self.solr.extract(fake_f)

        # Verify documented response structure:
        self.assertIn('contents', extracted)
        self.assertIn('metadata', extracted)

        self.assertIn('foobar', extracted['contents'])

        m = extracted['metadata']

        self.assertEqual([fake_f.name], m['stream_name'])

        self.assertIn('haystack-test', m,
                      "HTML metadata should have been extracted!")
        self.assertEqual(['test 1234'], m['haystack-test'])

        # Note the underhanded use of a double snowman to verify both that Tika
        # correctly decoded entities and that our UTF-8 characters survived the
        # round-trip:
        self.assertEqual(['Test Title ☃☃'], m['title'])

    def test_full_url(self):
        self.solr.url = 'http://localhost:8983/solr/core0'
        full_url = self.solr._create_full_url(path='/update')

        # Make sure trailing and leading slashes do not collide:
        self.assertEqual(full_url, 'http://localhost:8983/solr/core0/update')
Example #7
class SolrTestCase(unittest.TestCase):
    def setUp(self):
        super(SolrTestCase, self).setUp()
        self.default_solr = Solr('http://localhost:8983/solr/core0')
        # Short timeouts.
        self.solr = Solr('http://localhost:8983/solr/core0', timeout=2)
        self.docs = [
            {
                'id': 'doc_1',
                'title': 'Example doc 1',
                'price': 12.59,
                'popularity': 10,
            },
            {
                'id': 'doc_2',
                'title': 'Another example ☃ doc 2',
                'price': 13.69,
                'popularity': 7,
            },
            {
                'id': 'doc_3',
                'title': 'Another thing',
                'price': 2.35,
                'popularity': 8,
            },
            {
                'id': 'doc_4',
                'title': 'doc rock',
                'price': 99.99,
                'popularity': 10,
            },
            {
                'id': 'doc_5',
                'title': 'Boring',
                'price': 1.12,
                'popularity': 2,
            },
        ]

        # Clear it.
        self.solr.delete(q='*:*')

        # Index our docs. Yes, this leans on functionality we're going to test
        # later & if it's broken, everything will catastrophically fail.
        # Such is life.
        self.solr.add(self.docs)

    def tearDown(self):
        self.solr.delete(q='*:*')
        super(SolrTestCase, self).tearDown()

    def test_init(self):
        self.assertEqual(self.default_solr.url,
                         'http://localhost:8983/solr/core0')
        self.assertTrue(isinstance(self.default_solr.decoder,
                                   json.JSONDecoder))
        self.assertEqual(self.default_solr.timeout, 60)

        self.assertEqual(self.solr.url, 'http://localhost:8983/solr/core0')
        self.assertTrue(isinstance(self.solr.decoder, json.JSONDecoder))
        self.assertEqual(self.solr.timeout, 2)

    def test__create_full_url(self):
        # Nada.
        self.assertEqual(self.solr._create_full_url(path=''),
                         'http://localhost:8983/solr/core0')
        # Basic path.
        self.assertEqual(self.solr._create_full_url(path='pysolr_tests'),
                         'http://localhost:8983/solr/core0/pysolr_tests')
        # Leading slash (& making sure we don't touch the trailing slash).
        self.assertEqual(
            self.solr._create_full_url(
                path='/pysolr_tests/select/?whatever=/'),
            'http://localhost:8983/solr/core0/pysolr_tests/select/?whatever=/')

    def test__send_request(self):
        # Test a valid request.
        resp_body = self.solr._send_request('GET', 'select/?q=doc&wt=json')
        self.assertTrue('"numFound":3' in resp_body)

        # Test a lowercase method & a body.
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee! ☃</field></doc></add>'
        resp_body = self.solr._send_request('POST',
                                            'update/?commit=true',
                                            body=xml_body,
                                            headers={
                                                'Content-type':
                                                'text/xml; charset=utf-8',
                                            })
        self.assertTrue('<int name="status">0</int>' in resp_body)

        # Test a non-existent URL.
        old_url = self.solr.url
        self.solr.url = 'http://127.0.0.1:567898/wahtever'
        self.assertRaises(SolrError, self.solr._send_request, 'get',
                          'select/?q=doc&wt=json')
        self.solr.url = old_url

    def test__select(self):
        # Short params.
        resp_body = self.solr._select({'q': 'doc'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 3)

        # Long params.
        resp_body = self.solr._select({'q': 'doc' * 1024})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)
        self.assertEqual(len(resp_data['responseHeader']['params']['q']),
                         3 * 1024)

    def test__mlt(self):
        resp_body = self.solr._mlt({'q': 'id:doc_1', 'mlt.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__suggest_terms(self):
        resp_body = self.solr._select({'terms.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__update(self):
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._update(xml_body)
        self.assertTrue('<int name="status">0</int>' in resp_body)

    def test__extract_error(self):
        class RubbishResponse(object):
            def __init__(self, content, headers=None):
                if isinstance(content, bytes):
                    content = content.decode('utf-8')
                self.content = content
                self.headers = headers

                if self.headers is None:
                    self.headers = {}

            def json(self):
                return json.loads(self.content)

        # Just the reason.
        resp_1 = RubbishResponse("We don't care.",
                                 {'reason': 'Something went wrong.'})
        self.assertEqual(self.solr._extract_error(resp_1),
                         "[Reason: Something went wrong.]")

        # Empty reason.
        resp_2 = RubbishResponse("We don't care.", {'reason': None})
        self.assertEqual(self.solr._extract_error(resp_2),
                         "[Reason: None]\nWe don't care.")

        # No reason. Time to scrape.
        resp_3 = RubbishResponse(
            '<html><body><pre>Something is broke.</pre></body></html>',
            {'server': 'jetty'})
        self.assertEqual(self.solr._extract_error(resp_3),
                         "[Reason: Something is broke.]")

        # No reason. JSON response.
        resp_4 = RubbishResponse(b'\n {"error": {"msg": "It happens"}}',
                                 {'server': 'tomcat'})
        self.assertEqual(self.solr._extract_error(resp_4),
                         "[Reason: It happens]")

        # No reason. Weird JSON response.
        resp_5 = RubbishResponse(b'{"kinda": "weird"}', {'server': 'jetty'})
        self.assertEqual(self.solr._extract_error(resp_5),
                         '[Reason: None]\n{"kinda": "weird"}')
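
        # Taken together, these cases suggest _extract_error's fallback order:
        # a 'reason' header wins outright; with no reason the HTML body is
        # scraped; failing that a JSON 'error'/'msg' payload is used; and as a
        # last resort the raw body is appended after '[Reason: None]'.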

    def test__scrape_response(self):
        # Jetty.
        resp_1 = self.solr._scrape_response(
            {'server': 'jetty'},
            '<html><body><pre>Something is broke.</pre></body></html>')
        self.assertEqual(resp_1, ('Something is broke.', u''))

        # Other.
        resp_2 = self.solr._scrape_response(
            {'server': 'crapzilla'},
            '<html><head><title>Wow. Seriously weird.</title></head><body><pre>Something is broke.</pre></body></html>')
        self.assertEqual(resp_2, ('Wow. Seriously weird.', u''))

    @unittest.skipUnless(HAS_LXML,
                         "Cannot test Tomcat error extraction without lxml")
    def test__scrape_response_tomcat(self):
        """Tests for Tomcat error responses, which currently require lxml.html to parse"""

        # Tomcat.
        resp_1 = self.solr._scrape_response(
            {'server': 'coyote'},
            '<html><body><p><span>Error message</span><span>messed up.</span></p></body></html>')
        self.assertEqual(resp_1, ('messed up.', ''))

        # Broken Tomcat.
        resp_2 = self.solr._scrape_response(
            {'server': 'coyote'},
            '<html><body><p>Really broken. Scraping Java-generated HTML sucks.</pre></body></html>')
        self.assertEqual(resp_2, (
            None,
            u'<div><body><p>Really broken. Scraping Java-generated HTML sucks.</p></body></div>'
        ))

    def test__from_python(self):
        self.assertEqual(self.solr._from_python(datetime.date(2013, 1, 18)),
                         '2013-01-18T00:00:00Z')
        self.assertEqual(
            self.solr._from_python(datetime.datetime(2013, 1, 18, 0, 30, 28)),
            '2013-01-18T00:30:28Z')
        self.assertEqual(self.solr._from_python(True), 'true')
        self.assertEqual(self.solr._from_python(False), 'false')
        self.assertEqual(self.solr._from_python(1), '1')
        self.assertEqual(self.solr._from_python(1.2), '1.2')
        self.assertEqual(self.solr._from_python(b'hello'), 'hello')
        self.assertEqual(self.solr._from_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._from_python('\x01test\x02'), 'test')

    def test__to_python(self):
        self.assertEqual(self.solr._to_python('2013-01-18T00:00:00Z'),
                         datetime.datetime(2013, 1, 18))
        self.assertEqual(self.solr._to_python('2013-01-18T00:30:28Z'),
                         datetime.datetime(2013, 1, 18, 0, 30, 28))
        self.assertEqual(self.solr._to_python('true'), True)
        self.assertEqual(self.solr._to_python('false'), False)
        self.assertEqual(self.solr._to_python(1), 1)
        self.assertEqual(self.solr._to_python(1.2), 1.2)
        self.assertEqual(self.solr._to_python(b'hello'), 'hello')
        self.assertEqual(self.solr._to_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._to_python(['foo', 'bar']), 'foo')
        self.assertEqual(self.solr._to_python(('foo', 'bar')), 'foo')
        self.assertEqual(self.solr._to_python('tuple("foo", "bar")'),
                         'tuple("foo", "bar")')

    def test__is_null_value(self):
        self.assertTrue(self.solr._is_null_value(None))
        self.assertTrue(self.solr._is_null_value(''))

        self.assertFalse(self.solr._is_null_value('Hello'))
        self.assertFalse(self.solr._is_null_value(1))

    def test_search(self):
        results = self.solr.search('doc')
        self.assertEqual(len(results), 3)

        results = self.solr.search('example')
        self.assertEqual(len(results), 2)

        results = self.solr.search('nothing')
        self.assertEqual(len(results), 0)

        # Advanced options.
        results = self.solr.search(
            'doc',
            **{
                'debug': 'true',
                'hl': 'true',
                'hl.fragsize': 8,
                'facet': 'on',
                'facet.field': 'popularity',
                'spellcheck': 'true',
                'spellcheck.collate': 'true',
                'spellcheck.count': 1,
                # TODO: Can't get these working in my test setup.
                # 'group': 'true',
                # 'group.field': 'id',
            })
        self.assertEqual(len(results), 3)
        self.assertTrue('explain' in results.debug)
        self.assertEqual(results.highlighting, {
            u'doc_4': {},
            u'doc_2': {},
            u'doc_1': {}
        })
        self.assertEqual(results.spellcheck, {})
        self.assertEqual(results.facets['facet_fields']['popularity'],
                         ['10', 2, '7', 1, '2', 0, '8', 0])
        self.assertTrue(results.qtime is not None)
        # TODO: Can't get these working in my test setup.
        # self.assertEqual(results.grouped, '')

    def test_more_like_this(self):
        results = self.solr.more_like_this('id:doc_1', 'text')
        self.assertEqual(len(results), 0)

    def test_suggest_terms(self):
        results = self.solr.suggest_terms('title', '')
        self.assertEqual(len(results), 1)
        self.assertEqual(
            results, {
                'title': [('doc', 3), ('another', 2), ('example', 2), ('1', 1),
                          ('2', 1), ('boring', 1), ('rock', 1), ('thing', 1)]
            })

    def test__build_doc(self):
        doc = {
            'id': 'doc_1',
            'title': 'Example doc ☃ 1',
            'price': 12.59,
            'popularity': 10,
        }
        doc_xml = force_unicode(
            ET.tostring(self.solr._build_doc(doc), encoding='utf-8'))
        self.assertTrue(
            '<field name="title">Example doc ☃ 1</field>' in doc_xml)
        self.assertTrue('<field name="id">doc_1</field>' in doc_xml)
        self.assertEqual(len(doc_xml), 152)

    def test_add(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.assertEqual(len(self.solr.search('example')), 2)

        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            },
            {
                'id': 'doc_7',
                'title': 'Another example doc',
            },
        ])

        self.assertEqual(len(self.solr.search('doc')), 5)
        self.assertEqual(len(self.solr.search('example')), 3)

    def test_add_with_boost(self):
        self.assertEqual(len(self.solr.search('doc')), 3)

        self.solr.add([{
            'id': 'doc_6',
            'title': 'Important doc'
        }],
                      boost={'title': 10.0})

        self.solr.add([{
            'id': 'doc_7',
            'title': 'Spam doc doc'
        }],
                      boost={'title': 0})

        res = self.solr.search('doc')
        self.assertEqual(len(res), 5)
        self.assertEqual('doc_6', res.docs[0]['id'])

    def test_delete(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.delete(id='doc_1')
        self.assertEqual(len(self.solr.search('doc')), 2)
        self.solr.delete(q='price:[0 TO 15]')
        self.assertEqual(len(self.solr.search('doc')), 1)

        self.assertEqual(len(self.solr.search('*:*')), 1)
        self.solr.delete(q='*:*')
        self.assertEqual(len(self.solr.search('*:*')), 0)

        # Need at least one.
        self.assertRaises(ValueError, self.solr.delete)
        # Can't have both.
        self.assertRaises(ValueError, self.solr.delete, id='foo', q='bar')

    def test_commit(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([{
            'id': 'doc_6',
            'title': 'Newly added doc',
        }],
                      commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.commit()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_optimize(self):
        # Make sure it doesn't blow up. Side effects are hard to measure. :/
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([{
            'id': 'doc_6',
            'title': 'Newly added doc',
        }],
                      commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.optimize()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_extract(self):
        fake_f = StringIO("""
            <html>
                <head>
                    <meta charset="utf-8">
                    <meta name="haystack-test" content="test 1234">
                    <title>Test Title ☃&#x2603;</title>
                </head>
                    <body>foobar</body>
            </html>
        """)
        fake_f.name = "test.html"
        extracted = self.solr.extract(fake_f)

        # Verify documented response structure:
        self.assertIn('contents', extracted)
        self.assertIn('metadata', extracted)

        self.assertIn('foobar', extracted['contents'])

        m = extracted['metadata']

        self.assertEqual([fake_f.name], m['stream_name'])

        self.assertIn('haystack-test', m,
                      "HTML metadata should have been extracted!")
        self.assertEqual(['test 1234'], m['haystack-test'])

        # Note the underhanded use of a double snowman to verify both that Tika
        # correctly decoded entities and that our UTF-8 characters survived the
        # round-trip:
        self.assertEqual(['Test Title ☃☃'], m['title'])

    def test_full_url(self):
        self.solr.url = 'http://localhost:8983/solr/core0'
        full_url = self.solr._create_full_url(path='/update')

        # Make sure trailing and leading slashes do not collide:
        self.assertEqual(full_url, 'http://localhost:8983/solr/core0/update')
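
Before the next backend variant, here is a minimal end-to-end sketch of the
pysolr calls the tests above exercise, assuming a reachable core at the same
local URL (ids and titles are illustrative):

from pysolr import Solr

# Mirrors the test setup: local core, explicit timeout.
solr = Solr('http://localhost:8983/solr/core0', timeout=10)

# Index two throwaway documents (plain dicts keyed by schema fields).
solr.add([
    {'id': 'doc_a', 'title': 'A throwaway doc'},
    {'id': 'doc_b', 'title': 'Another throwaway doc'},
])

# search() returns an iterable result set; len() gives the match count.
results = solr.search('throwaway')
print([result['id'] for result in results])

# Clean up, as tearDown does above.
solr.delete(q='*:*')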
Example No. 8
class SearchBackend(BaseSearchBackend):
    # Word reserved by Solr for special use.
    RESERVED_WORDS = (
        'AND',
        'NOT',
        'OR',
        'TO',
    )
    
    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        '\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
        '[', ']', '^', '"', '~', '*', '?', ':',
    )
    
    def __init__(self, site=None):
        super(SearchBackend, self).__init__(site)
        
        if not hasattr(settings, 'HAYSTACK_SOLR_URL'):
            raise ImproperlyConfigured('You must specify a HAYSTACK_SOLR_URL in your settings.')
        
        timeout = getattr(settings, 'HAYSTACK_SOLR_TIMEOUT', 10)
        self.conn = Solr(settings.HAYSTACK_SOLR_URL, timeout=timeout)
    
    def update(self, index, iterable, commit=True):
        docs = []
        
        try:
            for obj in iterable:
                doc = {}
                doc['id'] = self.get_identifier(obj)
                doc['django_ct'] = "%s.%s" % (obj._meta.app_label, obj._meta.module_name)
                doc['django_id'] = force_unicode(obj.pk)
                doc.update(index.prepare(obj))
                docs.append(doc)
        except UnicodeDecodeError:
            sys.stderr.write("Chunk failed.\n")
        
        self.conn.add(docs, commit=commit)

    def remove(self, obj_or_string, commit=True):
        solr_id = self.get_identifier(obj_or_string)
        self.conn.delete(id=solr_id, commit=commit)

    def clear(self, models=[], commit=True):
        if not models:
            # *:* matches all docs in Solr
            self.conn.delete(q='*:*', commit=commit)
        else:
            models_to_delete = []
            
            for model in models:
                models_to_delete.append("django_ct:%s.%s" % (model._meta.app_label, model._meta.module_name))
            
            self.conn.delete(q=" OR ".join(models_to_delete), commit=commit)
        
        # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
        self.conn.optimize()

    def search(self, query_string, sort_by=None, start_offset=0, end_offset=None,
               fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
               narrow_queries=None, collapse_field=None, collapse_max=None, collapse_type=None, **kwargs):
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }
        
        kwargs = {
            'fl': '* score',
        }
        
        if fields:
            kwargs['fl'] = fields
        
        if sort_by is not None:
            kwargs['sort'] = sort_by
        
        if start_offset is not None:
            kwargs['start'] = start_offset
        
        if end_offset is not None:
            kwargs['rows'] = end_offset
        
        if highlight is True:
            kwargs['hl'] = 'true'
            kwargs['hl.fragsize'] = '70'
        
        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            kwargs['spellcheck'] = 'true'
            kwargs['spellcheck.collate'] = 'true'
            kwargs['spellcheck.count'] = 1
        
        if facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.field'] = facets
        
        if date_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.date'] = date_facets.keys()
            
            for key, value in date_facets.items():
                # Date-based facets in Solr kinda suck.
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(value.get('start_date'))
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(value.get('end_date'))
                gap_by_string = value.get('gap_by').upper()
                gap_string = "%d%s" % (value.get('gap_amount'), gap_by_string)

                if value.get('gap_amount') != 1:
                    gap_string += "S"

                kwargs["f.%s.facet.date.gap" % key] = "+%s/%s" % (gap_string, gap_by_string)
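                # e.g. gap_by='day', gap_amount=7 -> '+7DAYS/DAY' (illustrative values).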
        
        if query_facets is not None:
            kwargs['facet'] = 'on'
            kwargs['facet.query'] = ["%s:%s" % (field, value) for field, value in query_facets.items()]
        
        if narrow_queries is not None:
            kwargs['fq'] = list(narrow_queries)
        
        if collapse_field is not None:
            #http://wiki.apache.org/solr/FieldCollapsing
            #&collapse.field=url&collapse.max=1&collapse.type=normal
            kwargs['collapse'] = 'true'
            kwargs['collapse.type'] = collapse_type
            kwargs['collapse.max'] = collapse_max  # assumed to be an int
            kwargs['collapse.field'] = collapse_field
        
        raw_results = self.conn.search(query_string, **kwargs)
        return self._process_results(raw_results, highlight=highlight)
    
    def more_like_this(self, model_instance, additional_query_string=None, start_offset=0, end_offset=None, **kwargs):
        index = self.site.get_index(model_instance.__class__)
        field_name = index.get_content_field()
        params = {
            'fl': '*,score',
        }
        
        if start_offset is not None:
            params['start'] = start_offset
        
        if end_offset is not None:
            params['rows'] = end_offset
        
        if additional_query_string:
            params['fq'] = additional_query_string
        
        raw_results = self.conn.more_like_this("id:%s" % self.get_identifier(model_instance), field_name, **params)
        return self._process_results(raw_results)
    
    def _process_results(self, raw_results, highlight=False):
        from haystack import site
        results = []
        hits = raw_results.hits
        facets = {}
        spelling_suggestion = None
        
        if hasattr(raw_results, 'facets'):
            facets = {
                'fields': raw_results.facets.get('facet_fields', {}),
                'dates': raw_results.facets.get('facet_dates', {}),
                'queries': raw_results.facets.get('facet_queries', {}),
            }
            
            for key in ['fields']:
                for facet_field in facets[key]:
                    # Convert to a two-tuple, as Solr's json format returns a list of
                    # pairs.
                    facets[key][facet_field] = list(zip(facets[key][facet_field][::2], facets[key][facet_field][1::2]))
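                    # e.g. ['10', 2, '7', 1] -> [('10', 2), ('7', 1)]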
        
        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            if hasattr(raw_results, 'spellcheck'):
                if len(raw_results.spellcheck.get('suggestions', [])):
                    # For some reason, it's an array of pairs. Pull off the
                    # collated result from the end.
                    spelling_suggestion = raw_results.spellcheck.get('suggestions')[-1]
        
        indexed_models = site.get_indexed_models()
        
        for raw_result in raw_results.docs:
            app_label, model_name = raw_result['django_ct'].split('.')
            additional_fields = {}
            model = get_model(app_label, model_name)
            
            if model and model in indexed_models:
                for key, value in raw_result.items():
                    index = site.get_index(model)
                    string_key = str(key)
                    
                    if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                        additional_fields[string_key] = index.fields[string_key].convert(value)
                    else:
                        additional_fields[string_key] = self.conn._to_python(value)
                
                del(additional_fields['django_ct'])
                del(additional_fields['django_id'])
                del(additional_fields['score'])
                
                if raw_result['id'] in getattr(raw_results, 'highlighting', {}):
                    additional_fields['highlighted'] = raw_results.highlighting[raw_result['id']]
                
                result = SearchResult(app_label, model_name, raw_result['django_id'], raw_result['score'], **additional_fields)
                results.append(result)
            else:
                hits -= 1
        
        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }
    
    def build_schema(self, fields):
        content_field_name = ''
        schema_fields = []
        
        for field_name, field_class in fields.items():
            field_data = {
                'field_name': field_name,
                'type': 'text',
                'indexed': 'true',
                'multi_valued': 'false',
            }
            
            if field_class.document is True:
                content_field_name = field_name
            
            if field_class.indexed is False:
                field_data['indexed'] = 'false'
            
            # DRL_FIXME: Perhaps move to something where, if none of these
            #            checks succeed, call a custom method on the form that
            #            returns, per-backend, the right type of storage?
            # DRL_FIXME: Also think about removing `isinstance` and replacing
            #            it with a method call/string returned (like 'text' or
            #            'date').
            if isinstance(field_class, (DateField, DateTimeField)):
                field_data['type'] = 'date'
            elif isinstance(field_class, IntegerField):
                field_data['type'] = 'slong'
            elif isinstance(field_class, FloatField):
                field_data['type'] = 'sfloat'
            elif isinstance(field_class, BooleanField):
                field_data['type'] = 'boolean'
            elif isinstance(field_class, MultiValueField):
                field_data['multi_valued'] = 'true'
            
            schema_fields.append(field_data)
        
        return (content_field_name, schema_fields)
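
To make the type mapping concrete, a hedged sketch of what this build_schema
would return for a hypothetical pair of fields (names and classes invented for
illustration):

# Given fields = {'text': CharField(document=True), 'pub_date': DateField()},
# the loop above would produce:
#
#   ('text', [
#       {'field_name': 'text', 'type': 'text', 'indexed': 'true', 'multi_valued': 'false'},
#       {'field_name': 'pub_date', 'type': 'date', 'indexed': 'true', 'multi_valued': 'false'},
#   ])
#
# i.e. the document field's name plus one dict per field, ready to feed a
# Solr schema.xml template.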
Example No. 9
class SearchBackend(BaseSearchBackend):
    def __init__(self):
        if not hasattr(settings, "HAYSTACK_SOLR_URL"):
            raise ImproperlyConfigured("You must specify a HAYSTACK_SOLR_URL in your settings.")

        # DRL_TODO: This should handle the connection more gracefully, especially
        #           if the backend is down.
        self.conn = Solr(settings.HAYSTACK_SOLR_URL)

    def update(self, index, iterable, commit=True):
        docs = []

        try:
            for obj in iterable:
                doc = {}
                doc["id"] = self.get_identifier(obj)
                doc["django_ct_s"] = "%s.%s" % (obj._meta.app_label, obj._meta.module_name)
                doc["django_id_s"] = force_unicode(obj.pk)
                doc.update(index.prepare(obj))
                docs.append(doc)
        except UnicodeDecodeError:
            sys.stderr.write("Chunk failed.\n")

        self.conn.add(docs, commit=commit)

    def remove(self, obj, commit=True):
        solr_id = self.get_identifier(obj)
        self.conn.delete(id=solr_id, commit=commit)

    def clear(self, models=[], commit=True):
        if not models:
            # *:* matches all docs in Solr
            self.conn.delete(q="*:*", commit=commit)
        else:
            models_to_delete = []

            for model in models:
                models_to_delete.append("django_ct_s:%s.%s" % (model._meta.app_label, model._meta.module_name))

            self.conn.delete(q=" OR ".join(models_to_delete), commit=commit)

        # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
        self.conn.optimize()

    def search(
        self,
        query_string,
        sort_by=None,
        start_offset=0,
        end_offset=None,
        fields="",
        highlight=False,
        facets=None,
        date_facets=None,
        query_facets=None,
        narrow_queries=None,
    ):
        if len(query_string) == 0:
            return []

        kwargs = {"fl": "* score"}

        if fields:
            kwargs["fl"] = fields

        if sort_by is not None:
            kwargs["sort"] = sort_by

        if start_offset is not None:
            kwargs["start"] = start_offset

        if end_offset is not None:
            kwargs["rows"] = end_offset

        if highlight is True:
            kwargs["hl"] = "true"
            kwargs["hl.fragsize"] = "200"

        if facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.field"] = facets

        if date_facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.date"] = date_facets.keys()

            for key, value in date_facets.items():
                # Date-based facets in Solr kinda suck.
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(value.get("start_date"))
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(value.get("end_date"))
                kwargs["f.%s.facet.date.gap" % key] = value.get("gap")

        if query_facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.query"] = ["%s:%s" % (field, value) for field, value in query_facets.items()]

        if narrow_queries is not None:
            kwargs["fq"] = list(narrow_queries)

        raw_results = self.conn.search(query_string, **kwargs)
        return self._process_results(raw_results, highlight=highlight)

    def more_like_this(self, model_instance):
        from haystack.sites import site, NotRegistered

        index = site.get_index(model_instance.__class__)
        field_name = index.get_content_field()
        raw_results = self.conn.more_like_this("id:%s" % self.get_identifier(model_instance), field_name, fl="*,score")
        return self._process_results(raw_results)

    def _process_results(self, raw_results, highlight=False):
        results = []
        facets = {}

        if hasattr(raw_results, "facets"):
            facets = {
                "fields": raw_results.facets.get("facet_fields", {}),
                "dates": raw_results.facets.get("facet_dates", {}),
                "queries": raw_results.facets.get("facet_queries", {}),
            }

            for key in ["fields"]:
                for facet_field in facets[key]:
                    # Convert to a two-tuple, as Solr's json format returns a list of
                    # pairs.
                    facets[key][facet_field] = list(zip(facets[key][facet_field][::2], facets[key][facet_field][1::2]))
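                    # e.g. ['10', 2, '7', 1] -> [('10', 2), ('7', 1)]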

        for raw_result in raw_results.docs:
            app_label, module_name = raw_result["django_ct_s"].split(".")
            additional_fields = {}

            for key, value in raw_result.items():
                additional_fields[str(key)] = self.conn._to_python(value)

            del (additional_fields["django_ct_s"])
            del (additional_fields["django_id_s"])
            del (additional_fields["score"])

            if raw_result["id"] in getattr(raw_results, "highlighting", {}):
                additional_fields["highlighted"] = raw_results.highlighting[raw_result["id"]]

            result = SearchResult(
                app_label, module_name, raw_result["django_id_s"], raw_result["score"], **additional_fields
            )
            results.append(result)

        return {"results": results, "hits": raw_results.hits, "facets": facets}
Example No. 10
class SolrTestCase(unittest.TestCase):
    def setUp(self):
        super(SolrTestCase, self).setUp()
        self.default_solr = Solr('http://localhost:8983/solr/core0')
        # Short timeouts.
        self.solr = Solr('http://localhost:8983/solr/core0', timeout=2)
        self.docs = [
            {
                'id': 'doc_1',
                'title': 'Example doc 1',
                'price': 12.59,
                'popularity': 10,
            },
            {
                'id': 'doc_2',
                'title': 'Another example ☃ doc 2',
                'price': 13.69,
                'popularity': 7,
            },
            {
                'id': 'doc_3',
                'title': 'Another thing',
                'price': 2.35,
                'popularity': 8,
            },
            {
                'id': 'doc_4',
                'title': 'doc rock',
                'price': 99.99,
                'popularity': 10,
            },
            {
                'id': 'doc_5',
                'title': 'Boring',
                'price': 1.12,
                'popularity': 2,
            },
            {
                "id": "sn1",
                "cat": "pony",
                "comments": "blue",
                "description": "black",
                "store": "50.03131,10.12135"
            },
            {
                "id": "sn2",
                "cat": "pony",
                "name": "fake unicorn",
                "comments": "yellow",
                "description": "blue",
                "store": "54.23131,10.12135"
            },
            {
                "id": "sn3",
                "cat": "pony",
                "comments": "yellow",
                "description": "red",
                "store": "54.33131,10.12135"
            },
            {
                "id": "sn4",
                "cat": "unicorn",
                "comments": "yellow",
                "description": "blue"
            },
            {
                "id": "sn5",
                "cat": "unicorn",
                "comments": "steel",
                "description": "steel",
                "store": "54.43131,10.12135"
            },
            {
                "id": "sn6",
                "name": "blue pony",
                "cat": "unicorn",
                "comments": "blue",
                "description": "blue",
                "store": "54.33131,10.22135"
            },
        ]

        # Clear it.
        self.solr.delete(q='*:*')

        # Index our docs. Yes, this leans on functionality we're going to test
        # later & if it's broken, everything will catastrophically fail.
        # Such is life.
        self.solr.add(self.docs)

    def tearDown(self):
        self.solr.delete(q='*:*')
        super(SolrTestCase, self).tearDown()

    def test_init(self):
        self.assertEqual(self.default_solr.url, 'http://localhost:8983/solr/core0')
        self.assertTrue(isinstance(self.default_solr.decoder, json.JSONDecoder))
        self.assertEqual(self.default_solr.timeout, 60)

        self.assertEqual(self.solr.url, 'http://localhost:8983/solr/core0')
        self.assertTrue(isinstance(self.solr.decoder, json.JSONDecoder))
        self.assertEqual(self.solr.timeout, 2)

    def assertSameIDs(self, docs, expected_ids):
        doc_ids = frozenset([doc['id'] for doc in docs])
        ids_set = frozenset(expected_ids)
        self.assertEqual(doc_ids, ids_set)

    def test__create_full_url(self):
        # Nada.
        self.assertEqual(self.solr._create_full_url(path=''), 'http://localhost:8983/solr/core0')
        # Basic path.
        self.assertEqual(self.solr._create_full_url(path='pysolr_tests'), 'http://localhost:8983/solr/core0/pysolr_tests')
        # Leading slash (& making sure we don't touch the trailing slash).
        self.assertEqual(self.solr._create_full_url(path='/pysolr_tests/select/?whatever=/'), 'http://localhost:8983/solr/core0/pysolr_tests/select/?whatever=/')

    def test__send_request(self):
        # Test a valid request.
        resp_body = self.solr._send_request('GET', 'select/?q=doc&wt=json')
        self.assertTrue('"numFound":3' in resp_body)

        # Test a lowercase method & a body.
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._send_request('POST', 'update/?commit=true', body=xml_body, headers={
            'Content-type': 'text/xml; charset=utf-8',
        })
        self.assertTrue('<int name="status">0</int>' in resp_body)

        # Test a non-existent URL.
        old_url = self.solr.url
        self.solr.url = 'http://127.0.0.1:567898/wahtever'
        self.assertRaises(SolrError, self.solr._send_request, 'get', 'select/?q=doc&wt=json')
        self.solr.url = old_url

    def test__select(self):
        # Short params.
        resp_body = self.solr._select({'q': 'doc'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 3)

        # Long params.
        resp_body = self.solr._select({'q': 'doc' * 1024})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)
        self.assertEqual(len(resp_data['responseHeader']['params']['q']), 3 * 1024)

    def test__mlt(self):
        resp_body = self.solr._mlt({'q': 'id:doc_1', 'mlt.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__suggest_terms(self):
        resp_body = self.solr._select({'terms.fl': 'title'})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data['response']['numFound'], 0)

    def test__update(self):
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._update(xml_body)
        self.assertTrue('<int name="status">0</int>' in resp_body)

    def test__extract_error(self):
        class RubbishResponse(object):
            def __init__(self, content, headers=None):
                self.content = content
                self.headers = headers

                if self.headers is None:
                    self.headers = {}

        # Just the reason.
        resp_1 = RubbishResponse("We don't care.", {'reason': 'Something went wrong.'})
        self.assertEqual(self.solr._extract_error(resp_1), "[Reason: Something went wrong.]")

        # Empty reason.
        resp_2 = RubbishResponse("We don't care.", {'reason': None})
        self.assertEqual(self.solr._extract_error(resp_2), "[Reason: None]\nWe don't care.")

        # No reason. Time to scrape.
        resp_3 = RubbishResponse('<html><body><pre>Something is broke.</pre></body></html>', {'server': 'jetty'})
        self.assertEqual(self.solr._extract_error(resp_3), "[Reason: Something is broke.]")

    def test__scrape_response(self):
        # Tomcat.
        resp_1 = self.solr._scrape_response({'server': 'coyote'}, '<html><body><p><span>Error message</span><span>messed up.</span></p></body></html>')
        self.assertEqual(resp_1, ('messed up.', ''))

        # Jetty.
        resp_2 = self.solr._scrape_response({'server': 'jetty'}, '<html><body><pre>Something is broke.</pre></body></html>')
        self.assertEqual(resp_2, ('Something is broke.', u''))

        # Broken Tomcat.
        resp_3 = self.solr._scrape_response({'server': 'coyote'}, '<html><body><p>Really broken. Scraping Java-generated HTML sucks.</pre></body></html>')
        self.assertEqual(resp_3, (None, u'<div><body><p>Really broken. Scraping Java-generated HTML sucks.</p></body></div>'))

        # Other.
        resp_4 = self.solr._scrape_response({'server': 'crapzilla'}, '<html><head><title>Wow. Seriously weird.</title></head><body><pre>Something is broke.</pre></body></html>')
        self.assertEqual(resp_4, ('Wow. Seriously weird.', u''))

    def test__from_python(self):
        self.assertEqual(self.solr._from_python(datetime.date(2013, 1, 18)), '2013-01-18T00:00:00Z')
        self.assertEqual(self.solr._from_python(datetime.datetime(2013, 1, 18, 0, 30, 28)), '2013-01-18T00:30:28Z')
        self.assertEqual(self.solr._from_python(True), 'true')
        self.assertEqual(self.solr._from_python(False), 'false')
        self.assertEqual(self.solr._from_python(1), '1')
        self.assertEqual(self.solr._from_python(1.2), '1.2')
        self.assertEqual(self.solr._from_python(b'hello'), 'hello')
        self.assertEqual(self.solr._from_python('hello ☃'), 'hello ☃')

    def test__to_python(self):
        self.assertEqual(self.solr._to_python('2013-01-18T00:00:00Z'), datetime.datetime(2013, 1, 18))
        self.assertEqual(self.solr._to_python('2013-01-18T00:30:28Z'), datetime.datetime(2013, 1, 18, 0, 30, 28))
        self.assertEqual(self.solr._to_python('true'), True)
        self.assertEqual(self.solr._to_python('false'), False)
        self.assertEqual(self.solr._to_python(1), 1)
        self.assertEqual(self.solr._to_python(1.2), 1.2)
        self.assertEqual(self.solr._to_python(b'hello'), 'hello')
        self.assertEqual(self.solr._to_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._to_python(['foo', 'bar']), 'foo')
        self.assertEqual(self.solr._to_python(('foo', 'bar')), 'foo')

    def test__is_null_value(self):
        self.assertTrue(self.solr._is_null_value(None))
        self.assertTrue(self.solr._is_null_value(''))

        self.assertFalse(self.solr._is_null_value('Hello'))
        self.assertFalse(self.solr._is_null_value(1))

    def test_create_nested_q(self):
        query = self.solr.create_nested_q("dismax", "how now brown cow", **{
            'pf': 'myfield',
            'qf': 'myfield2',
        })
        self.assertEqual(query,
            '_query_:"{!dismax pf=\'myfield\' qf=\'myfield2\'}how now brown cow"')

    def test_search(self):
        results = self.solr.search('doc')
        self.assertEqual(len(results), 3)

        results = self.solr.search('example')
        self.assertEqual(len(results), 2)

        results = self.solr.search('nothing')
        self.assertEqual(len(results), 0)

        # Advanced options.
        results = self.solr.search('doc', **{
            'debug': 'true',
            'hl': 'true',
            'hl.fragsize': 8,
            'facet': 'on',
            'facet.field': 'popularity',
            'spellcheck': 'true',
            'spellcheck.collate': 'true',
            'spellcheck.count': 1,
            # TODO: Can't get these working in my test setup.
            # 'group': 'true',
            # 'group.field': 'id',
        })
        self.assertEqual(len(results), 3)
        self.assertTrue('explain' in results.debug)
        self.assertEqual(results.highlighting, {u'doc_4': {}, u'doc_2': {}, u'doc_1': {}})
        self.assertEqual(results.spellcheck, {})
        self.assertEqual(results.facets['facet_fields']['popularity'], ['10', 2, '7', 1, '2', 0, '8', 0])
        self.assertTrue(results.qtime is not None)
        # TODO: Can't get these working in my test setup.
        # self.assertEqual(results.grouped, '')

    def test_search_with_nested_q(self):
        nested_q = self.solr.create_nested_q('edismax', 'blue', **{
                'qf': 'description comments'
        })
        results = self.solr.search('pony AND {}'.format(nested_q))
        
        self.assertSameIDs(results, ['sn6', 'sn2', 'sn1'])

    def test_disjunction_max(self):
        results = self.solr.disjunction_max('blue', 'description comments')
        
        self.assertSameIDs(results, ['sn6', 'sn4', 'sn2', 'sn1'])

    def test_disjunction_max_with_nested_q(self):
        nested_q = self.solr.create_nested_q('edismax', 'blue', **{
                'qf': 'description comments'
        })
        results = self.solr.disjunction_max('unicorn AND {}'.format(nested_q), 'cat name')
        
        self.assertSameIDs(results, ['sn6', 'sn4', 'sn2'])

    def test_spatial_search(self):
        results = self.solr.spatial_search('pony', 'store', '54.33131,10.12135', '100')
        
        self.assertSameIDs(results, ['sn6', 'sn3', 'sn2'])

    def test_more_like_this(self):
        results = self.solr.more_like_this('id:doc_1', 'text')
        self.assertEqual(len(results), 0)

    def test_suggest_terms(self):
        results = self.solr.suggest_terms('title', '')
        self.assertEqual(len(results), 1)
        self.assertEqual(results, {'title': [('doc', 3), ('another', 2), ('example', 2), ('1', 1), ('2', 1), ('boring', 1), ('rock', 1), ('thing', 1)]})

    def test__build_doc(self):
        doc = {
            'id': 'doc_1',
            'title': 'Example doc ☃ 1',
            'price': 12.59,
            'popularity': 10,
        }
        doc_xml = force_unicode(ET.tostring(self.solr._build_doc(doc), encoding='utf-8'))
        self.assertTrue('<field name="title">Example doc ☃ 1</field>' in doc_xml)
        self.assertTrue('<field name="id">doc_1</field>' in doc_xml)
        self.assertEqual(len(doc_xml), 152)

    def test_add(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.assertEqual(len(self.solr.search('example')), 2)

        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            },
            {
                'id': 'doc_7',
                'title': 'Another example doc',
            },
        ])

        self.assertEqual(len(self.solr.search('doc')), 5)
        self.assertEqual(len(self.solr.search('example')), 3)

    def test_add_with_boost(self):
        self.assertEqual(len(self.solr.search('doc')), 3)

        self.solr.add([{'id': 'doc_6', 'title': 'Important doc'}],
                      boost={'title': 10.0})

        self.solr.add([{'id': 'doc_7', 'title': 'Spam doc doc'}],
                      boost={'title': 0})

        res = self.solr.search('doc')
        self.assertEqual(len(res), 5)
        self.assertEqual('doc_6', res.docs[0]['id'])

    def test_delete(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.delete(id='doc_1')
        self.assertEqual(len(self.solr.search('doc')), 2)
        self.solr.delete(q='price:[0 TO 15]')
        self.assertEqual(len(self.solr.search('doc')), 1)

        self.assertEqual(len(self.solr.search('*:*')), 7)
        self.solr.delete(q='*:*')
        self.assertEqual(len(self.solr.search('*:*')), 0)

        # Need at least one.
        self.assertRaises(ValueError, self.solr.delete)
        # Can't have both.
        self.assertRaises(ValueError, self.solr.delete, id='foo', q='bar')

    def test_commit(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            }
        ], commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.commit()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_optimize(self):
        # Make sure it doesn't blow up. Side effects are hard to measure. :/
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            }
        ], commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.optimize()
        self.assertEqual(len(self.solr.search('doc')), 4)

    def test_extract(self):
        fake_f = StringIO("""
            <html>
                <head>
                    <meta charset="utf-8">
                    <meta name="haystack-test" content="test 1234">
                    <title>Test Title ☃&#x2603;</title>
                </head>
                    <body>foobar</body>
            </html>
        """)
        fake_f.name = "test.html"
        extracted = self.solr.extract(fake_f)

        # Verify documented response structure:
        self.assertIn('contents', extracted)
        self.assertIn('metadata', extracted)

        self.assertIn('foobar', extracted['contents'])

        m = extracted['metadata']

        self.assertEqual([fake_f.name], m['stream_name'])

        self.assertIn('haystack-test', m, "HTML metadata should have been extracted!")
        self.assertEqual(['test 1234'], m['haystack-test'])

        # Note the underhanded use of a double snowman to verify both that Tika
        # correctly decoded entities and that our UTF-8 characters survived the
        # round-trip:
        self.assertEqual(['Test Title ☃☃'], m['title'])

    def test_full_url(self):
        self.solr.url = 'http://localhost:8983/solr/'
        full_url = self.solr._create_full_url(path='/update')

        # Make sure trailing and leading slashes do not collide:
        self.assertEqual(full_url, 'http://localhost:8983/solr/update')
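
The nested-query helpers exercised above (create_nested_q, disjunction_max,
spatial_search) sit on top of Solr's local-params syntax, and the same nested
clause can be assembled by hand. A minimal sketch, with the helper name
invented for illustration:

def nested_q(parser, query, **params):
    # Build a _query_ clause in Solr local-params syntax, e.g.
    # _query_:"{!dismax pf='myfield' qf='myfield2'}how now brown cow"
    opts = ' '.join("%s='%s'" % (k, v) for k, v in sorted(params.items()))
    return '_query_:"{!%s %s}%s"' % (parser, opts, query)

# Reproduces the string expected by test_create_nested_q above:
print(nested_q('dismax', 'how now brown cow', pf='myfield', qf='myfield2'))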
Example No. 11
class SolrSearchBackend(BaseSearchBackend):
    # Word reserved by Solr for special use.
    RESERVED_WORDS = ("AND", "NOT", "OR", "TO")

    # Characters reserved by Solr for special use.
    # The '\\' must come first, so as not to overwrite the other slash replacements.
    RESERVED_CHARACTERS = (
        "\\",
        "+",
        "-",
        "&&",
        "||",
        "!",
        "(",
        ")",
        "{",
        "}",
        "[",
        "]",
        "^",
        '"',
        "~",
        "*",
        "?",
        ":",
        "/",
    )

    def __init__(self, connection_alias, **connection_options):
        super(SolrSearchBackend, self).__init__(connection_alias, **connection_options)

        if "URL" not in connection_options:
            raise ImproperlyConfigured(
                "You must specify a 'URL' in your settings for connection '%s'."
                % connection_alias
            )

        self.collate = connection_options.get("COLLATE_SPELLING", True)

        self.conn = Solr(
            connection_options["URL"],
            timeout=self.timeout,
            **connection_options.get("KWARGS", {})
        )
        self.log = logging.getLogger("haystack")

    def update(self, index, iterable, commit=True):
        docs = []

        for obj in iterable:
            try:
                docs.append(index.full_prepare(obj))
            except SkipDocument:
                self.log.debug("Indexing for object `%s` skipped", obj)
            except UnicodeDecodeError:
                if not self.silently_fail:
                    raise

                # We'll log the object identifier but won't include the actual object
                # to avoid the possibility of that generating encoding errors while
                # processing the log message:
                self.log.error(
                    "UnicodeDecodeError while preparing object for update",
                    exc_info=True,
                    extra={"data": {"index": index, "object": get_identifier(obj)}},
                )

        if len(docs) > 0:
            try:
                self.conn.add(docs, commit=commit, boost=index.get_field_weights())
            except (IOError, SolrError) as e:
                if not self.silently_fail:
                    raise

                self.log.error("Failed to add documents to Solr: %s", e, exc_info=True)

    def remove(self, obj_or_string, commit=True):
        solr_id = get_identifier(obj_or_string)

        try:
            kwargs = {"commit": commit, "id": solr_id}
            self.conn.delete(**kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to remove document '%s' from Solr: %s",
                solr_id,
                e,
                exc_info=True,
            )

    def clear(self, models=None, commit=True):
        if models is not None:
            assert isinstance(models, (list, tuple))

        try:
            if models is None:
                # *:* matches all docs in Solr
                self.conn.delete(q="*:*", commit=commit)
            else:
                models_to_delete = []

                for model in models:
                    models_to_delete.append("%s:%s" % (DJANGO_CT, get_model_ct(model)))

                self.conn.delete(q=" OR ".join(models_to_delete), commit=commit)

            if commit:
                # Run an optimize post-clear. http://wiki.apache.org/solr/FAQ#head-9aafb5d8dff5308e8ea4fcf4b71f19f029c4bb99
                self.conn.optimize()
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            if models is not None:
                self.log.error(
                    "Failed to clear Solr index of models '%s': %s",
                    ",".join(models_to_delete),
                    e,
                    exc_info=True,
                )
            else:
                self.log.error("Failed to clear Solr index: %s", e, exc_info=True)

    @log_query
    def search(self, query_string, **kwargs):
        if len(query_string) == 0:
            return {"results": [], "hits": 0}

        search_kwargs = self.build_search_kwargs(query_string, **kwargs)

        try:
            raw_results = self.conn.search(query_string, **search_kwargs)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to query Solr using '%s': %s", query_string, e, exc_info=True
            )
            raw_results = EmptyResults()

        return self._process_results(
            raw_results,
            highlight=kwargs.get("highlight"),
            result_class=kwargs.get("result_class", SearchResult),
            distance_point=kwargs.get("distance_point"),
        )

    def build_search_kwargs(
        self,
        query_string,
        sort_by=None,
        start_offset=0,
        end_offset=None,
        fields="",
        highlight=False,
        facets=None,
        date_facets=None,
        query_facets=None,
        narrow_queries=None,
        spelling_query=None,
        within=None,
        dwithin=None,
        distance_point=None,
        models=None,
        limit_to_registered_models=None,
        result_class=None,
        stats=None,
        collate=None,
        **extra_kwargs
    ):

        index = haystack.connections[self.connection_alias].get_unified_index()

        kwargs = {"fl": "* score", "df": index.document_field}

        if fields:
            if isinstance(fields, (list, set)):
                fields = " ".join(fields)

            kwargs["fl"] = fields

        if sort_by is not None:
            if sort_by in ["distance asc", "distance desc"] and distance_point:
                # Do the geo-enabled sort.
                lng, lat = distance_point["point"].coords
                kwargs["sfield"] = distance_point["field"]
                kwargs["pt"] = "%s,%s" % (lat, lng)

                if sort_by == "distance asc":
                    kwargs["sort"] = "geodist() asc"
                else:
                    kwargs["sort"] = "geodist() desc"
            else:
                if sort_by.startswith("distance "):
                    warnings.warn(
                        "In order to sort by distance, you must call the '.distance(...)' method."
                    )

                # Regular sorting.
                kwargs["sort"] = sort_by

        if start_offset is not None:
            kwargs["start"] = start_offset

        if end_offset is not None:
            kwargs["rows"] = end_offset - start_offset

        if highlight:
            # `highlight` can either be True or a dictionary containing custom parameters
            # which will be passed to the backend and may override our default settings:

            kwargs["hl"] = "true"
            kwargs["hl.fragsize"] = "200"

            if isinstance(highlight, dict):
                # autoprefix highlighter options with 'hl.', all of them start with it anyway
                # this makes option dicts shorter: {'maxAnalyzedChars': 42}
                # and lets some of options be used as keyword arguments: `.highlight(preserveMulti=False)`
                kwargs.update(
                    {
                        key if key.startswith("hl.") else ("hl." + key): highlight[key]
                        for key in highlight.keys()
                    }
                )
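                # Illustrative (assumed input): highlight={'snippets': 3, 'hl.fl': 'title'}
                # adds {'hl.snippets': 3, 'hl.fl': 'title'} to kwargs here.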

        if collate is None:
            collate = self.collate
        if self.include_spelling is True:
            kwargs["spellcheck"] = "true"
            kwargs["spellcheck.collate"] = str(collate).lower()
            kwargs["spellcheck.count"] = 1

            if spelling_query:
                kwargs["spellcheck.q"] = spelling_query

        if facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.field"] = facets.keys()

            for facet_field, options in facets.items():
                for key, value in options.items():
                    kwargs[
                        "f.%s.facet.%s" % (facet_field, key)
                    ] = self.conn._from_python(value)

        if date_facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.date"] = date_facets.keys()
            kwargs["facet.date.other"] = "none"

            for key, value in date_facets.items():
                kwargs["f.%s.facet.date.start" % key] = self.conn._from_python(
                    value.get("start_date")
                )
                kwargs["f.%s.facet.date.end" % key] = self.conn._from_python(
                    value.get("end_date")
                )
                gap_by_string = value.get("gap_by").upper()
                gap_string = "%d%s" % (value.get("gap_amount"), gap_by_string)

                if value.get("gap_amount") != 1:
                    gap_string += "S"

                kwargs["f.%s.facet.date.gap" % key] = "+%s/%s" % (
                    gap_string,
                    gap_by_string,
                )
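                # e.g. gap_by='day', gap_amount=7 yields '+7DAYS/DAY' (illustrative values).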

        if query_facets is not None:
            kwargs["facet"] = "on"
            kwargs["facet.query"] = [
                "%s:%s" % (field, value) for field, value in query_facets
            ]

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True
            )

        if models and len(models):
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only models handled
            # with the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if len(model_choices) > 0:
            if narrow_queries is None:
                narrow_queries = set()

            narrow_queries.add("%s:(%s)" % (DJANGO_CT, " OR ".join(model_choices)))
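            # e.g. 'django_ct:(auth.user OR blog.post)', assuming DJANGO_CT is
            # the usual 'django_ct' field name (model names illustrative).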

        if narrow_queries is not None:
            kwargs["fq"] = list(narrow_queries)

        if stats:
            kwargs["stats"] = "true"

            for k in stats.keys():
                kwargs["stats.field"] = k

                for facet in stats[k]:
                    kwargs["f.%s.stats.facet" % k] = facet

        if within is not None:
            from haystack.utils.geo import generate_bounding_box

            kwargs.setdefault("fq", [])
            ((min_lat, min_lng), (max_lat, max_lng)) = generate_bounding_box(
                within["point_1"], within["point_2"]
            )
            # Bounding boxes are min, min TO max, max. Solr's wiki was *NOT*
            # very clear on this.
            bbox = "%s:[%s,%s TO %s,%s]" % (
                within["field"],
                min_lat,
                min_lng,
                max_lat,
                max_lng,
            )
            kwargs["fq"].append(bbox)
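            # e.g. 'store:[53.9,9.9 TO 54.5,10.5]' for a box with south-west
            # corner (53.9, 9.9) and north-east corner (54.5, 10.5)
            # (illustrative values).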

        if dwithin is not None:
            kwargs.setdefault("fq", [])
            lng, lat = dwithin["point"].coords
            geofilt = "{!geofilt pt=%s,%s sfield=%s d=%s}" % (
                lat,
                lng,
                dwithin["field"],
                dwithin["distance"].km,
            )
            kwargs["fq"].append(geofilt)
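            # e.g. '{!geofilt pt=54.33,10.12 sfield=store d=5.0}' for a 5 km
            # radius around lat 54.33, lng 10.12 (illustrative values).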

        # Check to see if the backend should try to include distances
        # (Solr 4.X+) in the results.
        if self.distance_available and distance_point:
            # In early testing, you can't just hand Solr 4.X a proper bounding box
            # & request distances. To enable native distance would take calculating
            # a center point & a radius off the user-provided box, which kinda
            # sucks. We'll avoid it for now, since Solr 4.x's release will be some
            # time yet.
            # kwargs['fl'] += ' _dist_:geodist()'
            pass

        if extra_kwargs:
            kwargs.update(extra_kwargs)

        return kwargs

    def more_like_this(
        self,
        model_instance,
        additional_query_string=None,
        start_offset=0,
        end_offset=None,
        models=None,
        limit_to_registered_models=None,
        result_class=None,
        **kwargs
    ):
        from haystack import connections

        # Deferred models will have a different class ("RealClass_Deferred_fieldname")
        # which won't be in our registry:
        model_klass = model_instance._meta.concrete_model

        index = (
            connections[self.connection_alias]
            .get_unified_index()
            .get_index(model_klass)
        )
        field_name = index.get_content_field()
        params = {"fl": "*,score"}

        if start_offset is not None:
            params["start"] = start_offset

        if end_offset is not None:
            params["rows"] = end_offset

        narrow_queries = set()

        if limit_to_registered_models is None:
            limit_to_registered_models = getattr(
                settings, "HAYSTACK_LIMIT_TO_REGISTERED_MODELS", True
            )

        if models:
            model_choices = sorted(get_model_ct(model) for model in models)
        elif limit_to_registered_models:
            # Using narrow queries, limit the results to only the models
            # handled by the current routers.
            model_choices = self.build_models_list()
        else:
            model_choices = []

        if model_choices:
            narrow_queries.add("%s:(%s)" % (DJANGO_CT, " OR ".join(model_choices)))

        if additional_query_string:
            narrow_queries.add(additional_query_string)

        if narrow_queries:
            params["fq"] = list(narrow_queries)

        query = "%s:%s" % (ID, get_identifier(model_instance))

        try:
            raw_results = self.conn.more_like_this(query, field_name, **params)
        except (IOError, SolrError) as e:
            if not self.silently_fail:
                raise

            self.log.error(
                "Failed to fetch More Like This from Solr for document '%s': %s",
                query,
                e,
                exc_info=True,
            )
            raw_results = EmptyResults()

        return self._process_results(raw_results, result_class=result_class)
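
    # For reference, the MLT query string built above has the form
    # "<ID>:<identifier>"; with Haystack's defaults, get_identifier returns
    # "app_label.model_name.pk", so a hypothetical auth.User with pk 123
    # yields "id:auth.user.123".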

    def _process_results(
        self, raw_results, highlight=False, result_class=None, distance_point=None
    ):
        from haystack import connections

        results = []
        hits = raw_results.hits
        facets = {}
        stats = {}
        spelling_suggestion = spelling_suggestions = None

        if result_class is None:
            result_class = SearchResult

        if hasattr(raw_results, "stats"):
            stats = raw_results.stats.get("stats_fields", {})

        if hasattr(raw_results, "facets"):
            facets = {
                "fields": raw_results.facets.get("facet_fields", {}),
                "dates": raw_results.facets.get("facet_dates", {}),
                "queries": raw_results.facets.get("facet_queries", {}),
            }

            for key in ["fields"]:
                for facet_field in facets[key]:
                    # Solr's JSON format flattens each facet field into a list
                    # of alternating values and counts; rebuild it as a list
                    # of (value, count) two-tuples.
                    facets[key][facet_field] = list(
                        zip(
                            facets[key][facet_field][::2],
                            facets[key][facet_field][1::2],
                        )
                    )

        if self.include_spelling and hasattr(raw_results, "spellcheck"):
            try:
                spelling_suggestions = self.extract_spelling_suggestions(raw_results)
            except Exception as exc:
                self.log.error(
                    "Error extracting spelling suggestions: %s",
                    exc,
                    exc_info=True,
                    extra={"data": {"spellcheck": raw_results.spellcheck}},
                )

                if not self.silently_fail:
                    raise

                spelling_suggestions = None

            if spelling_suggestions:
                # Maintain compatibility with older versions of Haystack which returned a single suggestion:
                spelling_suggestion = spelling_suggestions[-1]
                assert isinstance(spelling_suggestion, six.string_types)
            else:
                spelling_suggestion = None

        unified_index = connections[self.connection_alias].get_unified_index()
        indexed_models = unified_index.get_indexed_models()

        for raw_result in raw_results.docs:
            app_label, model_name = raw_result[DJANGO_CT].split(".")
            additional_fields = {}
            model = haystack_get_model(app_label, model_name)

            if model and model in indexed_models:
                index = unified_index.get_index(model)
                index_field_map = index.field_map
                for key, value in raw_result.items():
                    string_key = str(key)
                    # Re-map the key if an alternate index fieldname was used.
                    if string_key in index_field_map:
                        string_key = index_field_map[string_key]

                    if string_key in index.fields and hasattr(
                        index.fields[string_key], "convert"
                    ):
                        additional_fields[string_key] = index.fields[
                            string_key
                        ].convert(value)
                    else:
                        additional_fields[string_key] = self.conn._to_python(value)

                del additional_fields[DJANGO_CT]
                del additional_fields[DJANGO_ID]
                del additional_fields["score"]

                if raw_result[ID] in getattr(raw_results, "highlighting", {}):
                    additional_fields["highlighted"] = raw_results.highlighting[
                        raw_result[ID]
                    ]

                if distance_point:
                    additional_fields["_point_of_origin"] = distance_point

                    if raw_result.get("__dist__"):
                        from django.contrib.gis.measure import Distance

                        additional_fields["_distance"] = Distance(
                            km=float(raw_result["__dist__"])
                        )
                    else:
                        additional_fields["_distance"] = None

                result = result_class(
                    app_label,
                    model_name,
                    raw_result[DJANGO_ID],
                    raw_result["score"],
                    **additional_fields
                )
                results.append(result)
            else:
                hits -= 1

        return {
            "results": results,
            "hits": hits,
            "stats": stats,
            "facets": facets,
            "spelling_suggestion": spelling_suggestion,
            "spelling_suggestions": spelling_suggestions,
        }
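
    # The dict above is the backend's processed-results contract: "results"
    # holds SearchResult objects, "hits" the (possibly reduced) match count,
    # and the remaining keys carry the optional stats/facet/spellcheck
    # payloads.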

    def extract_spelling_suggestions(self, raw_results):
        # Legacy Solr, 6.4, and 6.5 return spellcheck data in several
        # different formats (see
        # https://issues.apache.org/jira/browse/SOLR-3029): depending on the
        # version and configuration, the response may be a dict of dicts, a
        # list of dicts, or a list of strings.

        collations = raw_results.spellcheck.get("collations", None)
        suggestions = raw_results.spellcheck.get("suggestions", None)

        # We'll collect multiple suggestions here. For backwards
        # compatibility with older versions of Haystack we'll still return
        # only a single suggestion but in the future we can expose all of
        # them.

        spelling_suggestions = []

        if collations:
            if isinstance(collations, dict):
                # Solr 6.5
                collation_values = collations["collation"]
                if isinstance(collation_values, six.string_types):
                    collation_values = [collation_values]
                elif isinstance(collation_values, dict):
                    # spellcheck.collateExtendedResults changes the format to a dictionary:
                    collation_values = [collation_values["collationQuery"]]
            elif isinstance(collations[1], dict):
                # Solr 6.4
                collation_values = collations
            else:
                # Older versions of Solr
                collation_values = collations[-1:]

            for i in collation_values:
                # Depending on the options the values are either simple strings or dictionaries:
                spelling_suggestions.append(
                    i["collationQuery"] if isinstance(i, dict) else i
                )
        elif suggestions:
            if isinstance(suggestions, dict):
                for i in suggestions.values():
                    for j in i["suggestion"]:
                        if isinstance(j, dict):
                            spelling_suggestions.append(j["word"])
                        else:
                            spelling_suggestions.append(j)
            elif isinstance(suggestions[0], six.string_types) and isinstance(
                suggestions[1], dict
            ):
                # Solr 6.4 returns a flat list of alternating (word, dictionary) entries:
                for suggestion in suggestions:
                    if isinstance(suggestion, dict):
                        for i in suggestion["suggestion"]:
                            if isinstance(i, dict):
                                spelling_suggestions.append(i["word"])
                            else:
                                spelling_suggestions.append(i)
            else:
                # Legacy Solr
                spelling_suggestions.append(suggestions[-1])

        return spelling_suggestions
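
    # A sketch of one of the shapes handled above (illustrative value): with
    # a Solr 6.5 style response where
    #   raw_results.spellcheck == {"collations": {"collation": "pysolr"}}
    # this method returns ["pysolr"].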

    def build_schema(self, fields):
        content_field_name = ""
        schema_fields = []

        for field_name, field_class in fields.items():
            field_data = {
                "field_name": field_class.index_fieldname,
                "type": "text_en",
                "indexed": "true",
                "stored": "true",
                "multi_valued": "false",
            }

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

            # DRL_FIXME: Perhaps move to something where, if none of these
            #            checks succeed, call a custom method on the form that
            #            returns, per-backend, the right type of storage?
            if field_class.field_type in ["date", "datetime"]:
                field_data["type"] = "date"
            elif field_class.field_type == "integer":
                field_data["type"] = "long"
            elif field_class.field_type == "float":
                field_data["type"] = "float"
            elif field_class.field_type == "boolean":
                field_data["type"] = "boolean"
            elif field_class.field_type == "ngram":
                field_data["type"] = "ngram"
            elif field_class.field_type == "edge_ngram":
                field_data["type"] = "edge_ngram"
            elif field_class.field_type == "location":
                field_data["type"] = "location"

            if field_class.is_multivalued:
                field_data["multi_valued"] = "true"

            if field_class.stored is False:
                field_data["stored"] = "false"

            # Do this last to override `text` fields.
            if field_class.indexed is False:
                field_data["indexed"] = "false"

                # If it's text and not being indexed, we probably don't want
                # to do the normal lowercase/tokenize/stemming/etc. dance.
                if field_data["type"] == "text_en":
                    field_data["type"] = "string"

            # If it's a ``FacetField``, make sure we don't postprocess it.
            if hasattr(field_class, "facet_for"):
                # If it's text, it ought to be a string.
                if field_data["type"] == "text_en":
                    field_data["type"] = "string"

            schema_fields.append(field_data)

        return (content_field_name, schema_fields)
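
    # A sketch of the per-field output above (hypothetical field name): an
    # indexed, stored datetime field named "pub_date" yields
    #   {"field_name": "pub_date", "type": "date", "indexed": "true",
    #    "stored": "true", "multi_valued": "false"}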

    def extract_file_contents(self, file_obj, **kwargs):
        """Extract text and metadata from a structured file (PDF, MS Word, etc.)

        Uses the Solr ExtractingRequestHandler, which is based on Apache Tika.
        See the Solr wiki for details:

            http://wiki.apache.org/solr/ExtractingRequestHandler

        Due to the way the ExtractingRequestHandler is implemented it completely
        replaces the normal Haystack indexing process with several unfortunate
        restrictions: only one file per request, the extracted data is added to
        the index with no ability to modify it, etc. To simplify the process and
        allow for more advanced use we'll run using the extract-only mode to
        return the extracted data without adding it to the index so we can then
        use it within Haystack's normal templating process.

        Returns None if metadata cannot be extracted; otherwise returns a
        dictionary containing at least two keys:

            :contents:
                        Extracted full-text content, if applicable
            :metadata:
                        key:value pairs of text strings
        """

        try:
            return self.conn.extract(file_obj, **kwargs)
        except Exception as e:
            self.log.warning(
                "Unable to extract file contents: %s",
                e,
                exc_info=True,
                extra={"data": {"file": file_obj}},
            )
            return None
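
A minimal usage sketch for extract_file_contents (not from the original source; the connection alias and file name are assumptions):

from haystack import connections

backend = connections["default"].get_backend()
with open("resume.pdf", "rb") as file_obj:
    data = backend.extract_file_contents(file_obj)

if data is not None:
    print(data["metadata"])        # key:value pairs of text strings
    print(data["contents"][:200])  # extracted full-text content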
Example No. 12
0
class SolrTestCase(unittest.TestCase):
    def setUp(self):
        super(SolrTestCase, self).setUp()
        self.default_solr = Solr("http://localhost:8983/solr/core0")
        # Short timeouts.
        self.solr = Solr("http://localhost:8983/solr/core0", timeout=2)
        self.docs = [
            {"id": "doc_1", "title": "Example doc 1", "price": 12.59, "popularity": 10},
            {"id": "doc_2", "title": "Another example ☃ doc 2", "price": 13.69, "popularity": 7},
            {"id": "doc_3", "title": "Another thing", "price": 2.35, "popularity": 8},
            {"id": "doc_4", "title": "doc rock", "price": 99.99, "popularity": 10},
            {"id": "doc_5", "title": "Boring", "price": 1.12, "popularity": 2},
        ]

        # Clear it.
        self.solr.delete(q="*:*")

        # Index our docs. Yes, this leans on functionality we're going to test
        # later & if it's broken, everything will catastrophically fail.
        # Such is life.
        self.solr.add(self.docs)

    def tearDown(self):
        self.solr.delete(q="*:*")
        super(SolrTestCase, self).tearDown()

    def test_init(self):
        self.assertEqual(self.default_solr.url, "http://localhost:8983/solr/core0")
        self.assertTrue(isinstance(self.default_solr.decoder, json.JSONDecoder))
        self.assertEqual(self.default_solr.timeout, 60)

        self.assertEqual(self.solr.url, "http://localhost:8983/solr/core0")
        self.assertTrue(isinstance(self.solr.decoder, json.JSONDecoder))
        self.assertEqual(self.solr.timeout, 2)

    def test__create_full_url(self):
        # Nada.
        self.assertEqual(self.solr._create_full_url(path=""), "http://localhost:8983/solr/core0")
        # Basic path.
        self.assertEqual(
            self.solr._create_full_url(path="pysolr_tests"), "http://localhost:8983/solr/core0/pysolr_tests"
        )
        # Leading slash (& making sure we don't touch the trailing slash).
        self.assertEqual(
            self.solr._create_full_url(path="/pysolr_tests/select/?whatever=/"),
            "http://localhost:8983/solr/core0/pysolr_tests/select/?whatever=/",
        )

    def test__send_request(self):
        # Test a valid request.
        resp_body = self.solr._send_request("GET", "select/?q=doc&wt=json")
        self.assertTrue('"numFound":3' in resp_body)

        # Test a lowercase method & a body.
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee! ☃</field></doc></add>'
        resp_body = self.solr._send_request(
            "post", "update/?commit=true", body=xml_body, headers={"Content-type": "text/xml; charset=utf-8"}
        )
        self.assertTrue('<int name="status">0</int>' in resp_body)

        # Test a non-existent URL.
        old_url = self.solr.url
        self.solr.url = "http://127.0.0.1:567898/wahtever"
        self.assertRaises(SolrError, self.solr._send_request, "get", "select/?q=doc&wt=json")
        self.solr.url = old_url

        # Test bad core as well
        self.solr.url = "http://localhost:8983/solr/bad_core"
        try:
            self.assertRaises(SolrError, self.solr._send_request, "get", "select/?q=doc&wt=json")
        finally:
            self.solr.url = old_url

    def test__select(self):
        # Short params.
        resp_body = self.solr._select({"q": "doc"})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data["response"]["numFound"], 3)

        # Long params.
        resp_body = self.solr._select({"q": "doc" * 1024})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data["response"]["numFound"], 0)
        self.assertEqual(len(resp_data["responseHeader"]["params"]["q"]), 3 * 1024)

        # Test Deep Pagination CursorMark
        resp_body = self.solr._select({"q": "*", "cursorMark": "*", "sort": "id desc", "start": 0, "rows": 2})
        resp_data = json.loads(resp_body)
        self.assertEqual(len(resp_data["response"]["docs"]), 2)
        self.assertIn("nextCursorMark", resp_data)

    def test__mlt(self):
        resp_body = self.solr._mlt({"q": "id:doc_1", "mlt.fl": "title"})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data["response"]["numFound"], 0)

    def test__suggest_terms(self):
        resp_body = self.solr._select({"terms.fl": "title"})
        resp_data = json.loads(resp_body)
        self.assertEqual(resp_data["response"]["numFound"], 0)

    def test__update(self):
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._update(xml_body)
        self.assertTrue('<int name="status">0</int>' in resp_body)

    def test__soft_commit(self):
        xml_body = '<add><doc><field name="id">doc_12</field><field name="title">Whee!</field></doc></add>'
        resp_body = self.solr._update(xml_body, softCommit=True)
        self.assertTrue('<int name="status">0</int>' in resp_body)

    def test__extract_error(self):
        class RubbishResponse(object):
            def __init__(self, content, headers=None):
                if isinstance(content, bytes):
                    content = content.decode("utf-8")
                self.content = content
                self.headers = headers

                if self.headers is None:
                    self.headers = {}

            def json(self):
                return json.loads(self.content)

        # Just the reason.
        resp_1 = RubbishResponse("We don't care.", {"reason": "Something went wrong."})
        self.assertEqual(self.solr._extract_error(resp_1), "[Reason: Something went wrong.]")

        # Empty reason.
        resp_2 = RubbishResponse("We don't care.", {"reason": None})
        self.assertEqual(self.solr._extract_error(resp_2), "[Reason: None]\nWe don't care.")

        # No reason. Time to scrape.
        resp_3 = RubbishResponse("<html><body><pre>Something is broke.</pre></body></html>", {"server": "jetty"})
        self.assertEqual(self.solr._extract_error(resp_3), "[Reason: Something is broke.]")

        # No reason. JSON response.
        resp_4 = RubbishResponse(b'\n {"error": {"msg": "It happens"}}', {"server": "tomcat"})
        self.assertEqual(self.solr._extract_error(resp_4), "[Reason: It happens]")

        # No reason. Weird JSON response.
        resp_5 = RubbishResponse(b'{"kinda": "weird"}', {"server": "jetty"})
        self.assertEqual(self.solr._extract_error(resp_5), '[Reason: None]\n{"kinda": "weird"}')

    def test__scrape_response(self):
        # Jetty.
        resp_1 = self.solr._scrape_response(
            {"server": "jetty"}, "<html><body><pre>Something is broke.</pre></body></html>"
        )
        self.assertEqual(resp_1, ("Something is broke.", ""))

        # Other.
        resp_2 = self.solr._scrape_response(
            {"server": "crapzilla"},
            "<html><head><title>Wow. Seriously weird.</title></head><body><pre>Something is broke.</pre></body></html>",
        )
        self.assertEqual(resp_2, ("Wow. Seriously weird.", ""))

    @unittest.skipIf(
        sys.version_info < (2, 7),
        reason="Python 2.6 lacks the ElementTree 1.3 interface required for Solr XML error message parsing",
    )
    def test__scrape_response_coyote_xml(self):
        resp_3 = self.solr._scrape_response(
            {"server": "coyote"},
            '<?xml version="1.0"?>\n<response>\n<lst name="responseHeader"><int name="status">400</int><int name="QTime">0</int></lst><lst name="error"><str name="msg">Invalid Date String:\'2015-03-23 10:43:33\'</str><int name="code">400</int></lst>\n</response>\n',
        )
        self.assertEqual(
            resp_3, ("Invalid Date String:'2015-03-23 10:43:33'", "Invalid Date String:'2015-03-23 10:43:33'")
        )

        # Valid XML with a traceback
        resp_4 = self.solr._scrape_response(
            {"server": "coyote"},
            """<?xml version="1.0"?>
<response>
<lst name="responseHeader"><int name="status">500</int><int name="QTime">138</int></lst><lst name="error"><str name="msg">Internal Server Error</str><str name="trace">org.apache.solr.common.SolrException: Internal Server Error at java.lang.Thread.run(Thread.java:745)</str><int name="code">500</int></lst>
</response>""",
        )
        self.assertEqual(
            resp_4,
            (
                "Internal Server Error",
                "org.apache.solr.common.SolrException: Internal Server Error at java.lang.Thread.run(Thread.java:745)",
            ),
        )

    def test__scrape_response_tomcat(self):
        """Tests for Tomcat error responses"""

        resp_0 = self.solr._scrape_response(
            {"server": "coyote"}, "<html><body><h1>Something broke!</h1><pre>gigantic stack trace</pre></body></html>"
        )
        self.assertEqual(resp_0, ("Something broke!", ""))

        # Invalid XML
        bogus_xml = '<?xml version="1.0"?>\n<response>\n<lst name="responseHeader"><int name="status">400</int><int name="QTime">0</int></lst><lst name="error"><str name="msg">Invalid Date String:\'2015-03-23 10:43:33\'</str><int name="code">400</int></lst>'
        reason, full_html = self.solr._scrape_response({"server": "coyote"}, bogus_xml)
        self.assertEqual(reason, None)
        self.assertEqual(full_html, bogus_xml.replace("\n", ""))

    def test__from_python(self):
        self.assertEqual(self.solr._from_python(datetime.date(2013, 1, 18)), "2013-01-18T00:00:00Z")
        self.assertEqual(self.solr._from_python(datetime.datetime(2013, 1, 18, 0, 30, 28)), "2013-01-18T00:30:28Z")
        self.assertEqual(self.solr._from_python(True), "true")
        self.assertEqual(self.solr._from_python(False), "false")
        self.assertEqual(self.solr._from_python(1), "1")
        self.assertEqual(self.solr._from_python(1.2), "1.2")
        self.assertEqual(self.solr._from_python(b"hello"), "hello")
        self.assertEqual(self.solr._from_python("hello ☃"), "hello ☃")
        self.assertEqual(self.solr._from_python("\x01test\x02"), "test")

    def test__to_python(self):
        self.assertEqual(self.solr._to_python("2013-01-18T00:00:00Z"), datetime.datetime(2013, 1, 18))
        self.assertEqual(self.solr._to_python("2013-01-18T00:30:28Z"), datetime.datetime(2013, 1, 18, 0, 30, 28))
        self.assertEqual(self.solr._to_python("true"), True)
        self.assertEqual(self.solr._to_python("false"), False)
        self.assertEqual(self.solr._to_python(1), 1)
        self.assertEqual(self.solr._to_python(1.2), 1.2)
        self.assertEqual(self.solr._to_python(b"hello"), "hello")
        self.assertEqual(self.solr._to_python("hello ☃"), "hello ☃")
        self.assertEqual(self.solr._to_python(["foo", "bar"]), "foo")
        self.assertEqual(self.solr._to_python(("foo", "bar")), "foo")
        self.assertEqual(self.solr._to_python('tuple("foo", "bar")'), 'tuple("foo", "bar")')

    def test__is_null_value(self):
        self.assertTrue(self.solr._is_null_value(None))
        self.assertTrue(self.solr._is_null_value(""))

        self.assertFalse(self.solr._is_null_value("Hello"))
        self.assertFalse(self.solr._is_null_value(1))

    def test_search(self):
        results = self.solr.search("doc")
        self.assertEqual(len(results), 3)

        results = self.solr.search("example")
        self.assertEqual(len(results), 2)

        results = self.solr.search("nothing")
        self.assertEqual(len(results), 0)

        # Advanced options.
        results = self.solr.search(
            "doc",
            **{
                "debug": "true",
                "hl": "true",
                "hl.fragsize": 8,
                "facet": "on",
                "facet.field": "popularity",
                "spellcheck": "true",
                "spellcheck.collate": "true",
                "spellcheck.count": 1,
                # TODO: Can't get these working in my test setup.
                # 'group': 'true',
                # 'group.field': 'id',
            }
        )
        self.assertEqual(len(results), 3)
        self.assertTrue("explain" in results.debug)
        self.assertEqual(results.highlighting, {"doc_4": {}, "doc_2": {}, "doc_1": {}})
        self.assertEqual(results.spellcheck, {})
        self.assertEqual(results.facets["facet_fields"]["popularity"], ["10", 2, "7", 1, "2", 0, "8", 0])
        self.assertTrue(results.qtime is not None)
        # TODO: Can't get these working in my test setup.
        # self.assertEqual(results.grouped, '')

    def test_more_like_this(self):
        results = self.solr.more_like_this("id:doc_1", "text")
        self.assertEqual(len(results), 0)

    def test_suggest_terms(self):
        results = self.solr.suggest_terms("title", "")
        self.assertEqual(len(results), 1)
        self.assertEqual(
            results,
            {
                "title": [
                    ("doc", 3),
                    ("another", 2),
                    ("example", 2),
                    ("1", 1),
                    ("2", 1),
                    ("boring", 1),
                    ("rock", 1),
                    ("thing", 1),
                ]
            },
        )

    def test__build_doc(self):
        doc = {"id": "doc_1", "title": "Example doc ☃ 1", "price": 12.59, "popularity": 10}
        doc_xml = force_unicode(ET.tostring(self.solr._build_doc(doc), encoding="utf-8"))
        self.assertTrue('<field name="title">Example doc ☃ 1</field>' in doc_xml)
        self.assertTrue('<field name="id">doc_1</field>' in doc_xml)
        self.assertEqual(len(doc_xml), 152)

    def test_add(self):
        self.assertEqual(len(self.solr.search("doc")), 3)
        self.assertEqual(len(self.solr.search("example")), 2)

        self.solr.add([{"id": "doc_6", "title": "Newly added doc"}, {"id": "doc_7", "title": "Another example doc"}])

        self.assertEqual(len(self.solr.search("doc")), 5)
        self.assertEqual(len(self.solr.search("example")), 3)

    def test_add_with_boost(self):
        self.assertEqual(len(self.solr.search("doc")), 3)

        self.solr.add([{"id": "doc_6", "title": "Important doc"}], boost={"title": 10.0})

        self.solr.add([{"id": "doc_7", "title": "Spam doc doc"}], boost={"title": 0})

        res = self.solr.search("doc")
        self.assertEqual(len(res), 5)
        self.assertEqual("doc_6", res.docs[0]["id"])

    def test_field_update(self):
        originalDocs = self.solr.search("doc")
        self.assertEqual(len(originalDocs), 3)
        updateList = []
        for i, doc in enumerate(originalDocs):
            updateList.append({"id": doc["id"], "popularity": 5})
        self.solr.add(updateList, fieldUpdates={"popularity": "inc"})

        updatedDocs = self.solr.search("doc")
        self.assertEqual(len(updatedDocs), 3)
        for i, (originalDoc, updatedDoc) in enumerate(zip(originalDocs, updatedDocs)):
            self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys()))
            self.assertEqual(updatedDoc["popularity"], originalDoc["popularity"] + 5)
            self.assertEqual(
                True,
                all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys() if not k in ["_version_", "popularity"]),
            )

        self.solr.add(
            [
                {"id": "multivalued_1", "title": "Multivalued doc 1", "word_ss": ["alpha", "beta"]},
                {"id": "multivalued_2", "title": "Multivalued doc 2", "word_ss": ["charlie", "delta"]},
            ]
        )

        originalDocs = self.solr.search("multivalued")
        self.assertEqual(len(originalDocs), 2)
        updateList = []
        for i, doc in enumerate(originalDocs):
            updateList.append({"id": doc["id"], "word_ss": ["epsilon", "gamma"]})
        self.solr.add(updateList, fieldUpdates={"word_ss": "add"})

        updatedDocs = self.solr.search("multivalued")
        self.assertEqual(len(updatedDocs), 2)
        for i, (originalDoc, updatedDoc) in enumerate(zip(originalDocs, updatedDocs)):
            self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys()))
            self.assertEqual(updatedDoc["word_ss"], originalDoc["word_ss"] + ["epsilon", "gamma"])
            self.assertEqual(
                True,
                all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys() if not k in ["_version_", "word_ss"]),
            )

    def test_delete(self):
        self.assertEqual(len(self.solr.search("doc")), 3)
        self.solr.delete(id="doc_1")
        self.assertEqual(len(self.solr.search("doc")), 2)
        self.solr.delete(q="price:[0 TO 15]")
        self.assertEqual(len(self.solr.search("doc")), 1)

        self.assertEqual(len(self.solr.search("*:*")), 1)
        self.solr.delete(q="*:*")
        self.assertEqual(len(self.solr.search("*:*")), 0)

        # Need at least one.
        self.assertRaises(ValueError, self.solr.delete)
        # Can't have both.
        self.assertRaises(ValueError, self.solr.delete, id="foo", q="bar")

    def test_commit(self):
        self.assertEqual(len(self.solr.search("doc")), 3)
        self.solr.add([{"id": "doc_6", "title": "Newly added doc"}], commit=False)
        self.assertEqual(len(self.solr.search("doc")), 3)
        self.solr.commit()
        self.assertEqual(len(self.solr.search("doc")), 4)

    def test_optimize(self):
        # Make sure it doesn't blow up. Side effects are hard to measure. :/
        self.assertEqual(len(self.solr.search("doc")), 3)
        self.solr.add([{"id": "doc_6", "title": "Newly added doc"}], commit=False)
        self.assertEqual(len(self.solr.search("doc")), 3)
        self.solr.optimize()
        self.assertEqual(len(self.solr.search("doc")), 4)

    def test_extract(self):
        fake_f = StringIO(
            """
            <html>
                <head>
                    <meta charset="utf-8">
                    <meta name="haystack-test" content="test 1234">
                    <title>Test Title ☃&#x2603;</title>
                </head>
                    <body>foobar</body>
            </html>
        """
        )
        fake_f.name = "test.html"
        extracted = self.solr.extract(fake_f)

        # Verify documented response structure:
        self.assertIn("contents", extracted)
        self.assertIn("metadata", extracted)

        self.assertIn("foobar", extracted["contents"])

        m = extracted["metadata"]

        self.assertEqual([fake_f.name], m["stream_name"])

        self.assertIn("haystack-test", m, "HTML metadata should have been extracted!")
        self.assertEqual(["test 1234"], m["haystack-test"])

        # Note the underhanded use of a double snowman to verify both that Tika
        # correctly decoded entities and that our UTF-8 characters survived the
        # round-trip:
        self.assertEqual(["Test Title ☃☃"], m["title"])

    def test_full_url(self):
        self.solr.url = "http://localhost:8983/solr/core0"
        full_url = self.solr._create_full_url(path="/update")

        # Make sure trailing and leading slashes do not collide:
        self.assertEqual(full_url, "http://localhost:8983/solr/core0/update")
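
A side note on the facet assertion in test_search above: Solr's JSON response flattens each facet field into one list of alternating values and counts, which is why the expected value reads ["10", 2, "7", 1, "2", 0, "8", 0]. A short sketch of the pairing idiom used to rebuild (value, count) tuples:

flat = ["10", 2, "7", 1, "2", 0, "8", 0]
pairs = list(zip(flat[::2], flat[1::2]))
assert pairs == [("10", 2), ("7", 1), ("2", 0), ("8", 0)]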
Example No. 13
-1
class SolrTestCase(unittest.TestCase):
    def setUp(self):
        super(SolrTestCase, self).setUp()
        self.default_solr = Solr('http://*****:*****')

    @unittest.skipIf(sys.version_info < (2, 7), reason=u'Python 2.6 lacks the ElementTree 1.3 interface required for Solr XML error message parsing')
    def test__scrape_response_coyote_xml(self):
        resp_3 = self.solr._scrape_response({'server': 'coyote'}, '<?xml version="1.0"?>\n<response>\n<lst name="responseHeader"><int name="status">400</int><int name="QTime">0</int></lst><lst name="error"><str name="msg">Invalid Date String:\'2015-03-23 10:43:33\'</str><int name="code">400</int></lst>\n</response>\n')
        self.assertEqual(resp_3, ("Invalid Date String:'2015-03-23 10:43:33'", "Invalid Date String:'2015-03-23 10:43:33'"))

        # Valid XML with a traceback
        resp_4 = self.solr._scrape_response({'server': 'coyote'}, """<?xml version="1.0"?>
<response>
<lst name="responseHeader"><int name="status">500</int><int name="QTime">138</int></lst><lst name="error"><str name="msg">Internal Server Error</str><str name="trace">org.apache.solr.common.SolrException: Internal Server Error at java.lang.Thread.run(Thread.java:745)</str><int name="code">500</int></lst>
</response>""")
        self.assertEqual(resp_4, (u"Internal Server Error", u"org.apache.solr.common.SolrException: Internal Server Error at java.lang.Thread.run(Thread.java:745)"))

    def test__scrape_response_tomcat(self):
        """Tests for Tomcat error responses"""

        resp_0 = self.solr._scrape_response({'server': 'coyote'}, '<html><body><h1>Something broke!</h1><pre>gigantic stack trace</pre></body></html>')
        self.assertEqual(resp_0, ('Something broke!', ''))

        # Invalid XML
        bogus_xml = '<?xml version="1.0"?>\n<response>\n<lst name="responseHeader"><int name="status">400</int><int name="QTime">0</int></lst><lst name="error"><str name="msg">Invalid Date String:\'2015-03-23 10:43:33\'</str><int name="code">400</int></lst>'
        reason, full_html = self.solr._scrape_response({'server': 'coyote'}, bogus_xml)
        self.assertEqual(reason, None)
        self.assertEqual(full_html, bogus_xml.replace("\n", ""))


    def test__from_python(self):
        self.assertEqual(self.solr._from_python(datetime.date(2013, 1, 18)), '2013-01-18T00:00:00Z')
        self.assertEqual(self.solr._from_python(datetime.datetime(2013, 1, 18, 0, 30, 28)), '2013-01-18T00:30:28Z')
        self.assertEqual(self.solr._from_python(True), 'true')
        self.assertEqual(self.solr._from_python(False), 'false')
        self.assertEqual(self.solr._from_python(1), '1')
        self.assertEqual(self.solr._from_python(1.2), '1.2')
        self.assertEqual(self.solr._from_python(b'hello'), 'hello')
        self.assertEqual(self.solr._from_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._from_python('\x01test\x02'), 'test')

    def test__to_python(self):
        self.assertEqual(self.solr._to_python('2013-01-18T00:00:00Z'), datetime.datetime(2013, 1, 18))
        self.assertEqual(self.solr._to_python('2013-01-18T00:30:28Z'), datetime.datetime(2013, 1, 18, 0, 30, 28))
        self.assertEqual(self.solr._to_python('true'), True)
        self.assertEqual(self.solr._to_python('false'), False)
        self.assertEqual(self.solr._to_python(1), 1)
        self.assertEqual(self.solr._to_python(1.2), 1.2)
        self.assertEqual(self.solr._to_python(b'hello'), 'hello')
        self.assertEqual(self.solr._to_python('hello ☃'), 'hello ☃')
        self.assertEqual(self.solr._to_python(['foo', 'bar']), 'foo')
        self.assertEqual(self.solr._to_python(('foo', 'bar')), 'foo')
        self.assertEqual(self.solr._to_python('tuple("foo", "bar")'), 'tuple("foo", "bar")')

    def test__is_null_value(self):
        self.assertTrue(self.solr._is_null_value(None))
        self.assertTrue(self.solr._is_null_value(''))

        self.assertFalse(self.solr._is_null_value('Hello'))
        self.assertFalse(self.solr._is_null_value(1))

    def test_search(self):
        results = self.solr.search('doc')
        self.assertEqual(len(results), 3)
        # search should default to 'select' handler
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('select/?'))

        results = self.solr.search('example')
        self.assertEqual(len(results), 2)

        results = self.solr.search('nothing')
        self.assertEqual(len(results), 0)

        # Advanced options.
        results = self.solr.search('doc', **{
            'debug': 'true',
            'hl': 'true',
            'hl.fragsize': 8,
            'facet': 'on',
            'facet.field': 'popularity',
            'spellcheck': 'true',
            'spellcheck.collate': 'true',
            'spellcheck.count': 1,
            # TODO: Can't get these working in my test setup.
            # 'group': 'true',
            # 'group.field': 'id',
        })
        self.assertEqual(len(results), 3)
        self.assertTrue('explain' in results.debug)
        self.assertEqual(results.highlighting, {u'doc_4': {}, u'doc_2': {}, u'doc_1': {}})
        self.assertEqual(results.spellcheck, {})
        self.assertEqual(results.facets['facet_fields']['popularity'], ['10', 2, '7', 1, '2', 0, '8', 0])
        self.assertTrue(results.qtime is not None)
        # TODO: Can't get these working in my test setup.
        # self.assertEqual(results.grouped, '')

        # search should support custom handlers
        with self.assertRaises(SolrError):
            self.solr.search('doc', handler='fakehandler')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('fakehandler'))

    def test_more_like_this(self):
        results = self.solr.more_like_this('id:doc_1', 'text')
        self.assertEqual(len(results), 0)
        # more_like_this should default to 'mlt' handler
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('mlt/?'))

        # more_like_this should support custom handlers
        with self.assertRaises(SolrError):
            self.solr.more_like_this('id:doc_1', 'text', handler='fakehandler')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('fakehandler'))

    def test_suggest_terms(self):
        results = self.solr.suggest_terms('title', '')
        self.assertEqual(len(results), 1)
        self.assertEqual(results, {'title': [('doc', 3), ('another', 2), ('example', 2), ('1', 1), ('2', 1), ('boring', 1), ('rock', 1), ('thing', 1)]})
        # suggest_terms should default to 'terms' handler
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('terms/?'))

        # suggest_terms should support custom handlers
        with self.assertRaises(SolrError):
            self.solr.suggest_terms('title', '', handler='fakehandler')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('fakehandler'))

    def test__build_doc(self):
        doc = {
            'id': 'doc_1',
            'title': 'Example doc ☃ 1',
            'price': 12.59,
            'popularity': 10,
        }
        doc_xml = force_unicode(ET.tostring(self.solr._build_doc(doc), encoding='utf-8'))
        self.assertTrue('<field name="title">Example doc ☃ 1</field>' in doc_xml)
        self.assertTrue('<field name="id">doc_1</field>' in doc_xml)
        self.assertEqual(len(doc_xml), 152)

    def test_add(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.assertEqual(len(self.solr.search('example')), 2)

        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            },
            {
                'id': 'doc_7',
                'title': 'Another example doc',
            },
        ])
        # add should default to 'update' handler
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('update/?'))

        self.assertEqual(len(self.solr.search('doc')), 5)
        self.assertEqual(len(self.solr.search('example')), 3)

        # add should support custom handlers
        with self.assertRaises(SolrError):
            self.solr.add([], handler='fakehandler')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('fakehandler'))

    def test_add_with_boost(self):
        self.assertEqual(len(self.solr.search('doc')), 3)

        self.solr.add([{'id': 'doc_6', 'title': 'Important doc'}],
                      boost={'title': 10.0})

        self.solr.add([{'id': 'doc_7', 'title': 'Spam doc doc'}],
                      boost={'title': 0})

        res = self.solr.search('doc')
        self.assertEqual(len(res), 5)
        self.assertEqual('doc_6', res.docs[0]['id'])

    def test_field_update(self):
        originalDocs = self.solr.search('doc')
        self.assertEqual(len(originalDocs), 3)
        updateList = []
        for i, doc in enumerate(originalDocs):
            updateList.append( {'id': doc['id'], 'popularity': 5} )
        self.solr.add(updateList, fieldUpdates={'popularity': 'inc'})

        updatedDocs = self.solr.search('doc')
        self.assertEqual(len(updatedDocs), 3)
        for i, (originalDoc, updatedDoc) in enumerate(zip(originalDocs, updatedDocs)):
            self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys()))
            self.assertEqual(updatedDoc['popularity'], originalDoc['popularity'] + 5)
            self.assertEqual(True, all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys() if not k in ['_version_', 'popularity']))

        self.solr.add([
            {
                'id': 'multivalued_1',
                'title': 'Multivalued doc 1',
                'word_ss': ['alpha', 'beta'],
            },
            {
                'id': 'multivalued_2',
                'title': 'Multivalued doc 2',
                'word_ss': ['charlie', 'delta'],
            },
        ])

        originalDocs = self.solr.search('multivalued')
        self.assertEqual(len(originalDocs), 2)
        updateList = []
        for i, doc in enumerate(originalDocs):
            updateList.append( {'id': doc['id'], 'word_ss': ['epsilon', 'gamma']} )
        self.solr.add(updateList, fieldUpdates={'word_ss': 'add'})

        updatedDocs = self.solr.search('multivalued')
        self.assertEqual(len(updatedDocs), 2)
        for i, (originalDoc, updatedDoc) in enumerate(zip(originalDocs, updatedDocs)):
            self.assertEqual(len(updatedDoc.keys()), len(originalDoc.keys()))
            self.assertEqual(updatedDoc['word_ss'], originalDoc['word_ss'] + ['epsilon', 'gamma'])
            self.assertEqual(True, all(updatedDoc[k] == originalDoc[k] for k in updatedDoc.keys() if not k in ['_version_', 'word_ss']))

    def test_delete(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.delete(id='doc_1')
        # delete should default to 'update' handler
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('update/?'))

        self.assertEqual(len(self.solr.search('doc')), 2)
        self.solr.delete(q='price:[0 TO 15]')
        self.assertEqual(len(self.solr.search('doc')), 1)

        self.assertEqual(len(self.solr.search('*:*')), 1)
        self.solr.delete(q='*:*')
        self.assertEqual(len(self.solr.search('*:*')), 0)

        # Need at least one.
        self.assertRaises(ValueError, self.solr.delete)
        # Can't have both.
        self.assertRaises(ValueError, self.solr.delete, id='foo', q='bar')

        # delete should support custom handlers
        with self.assertRaises(SolrError):
            self.solr.delete(id='doc_1', handler='fakehandler')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('fakehandler'))

    def test_commit(self):
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            }
        ], commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.commit()
        # commit should default to 'update' handler
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('update/?'))
        self.assertEqual(len(self.solr.search('doc')), 4)

        # commit should support custom handlers
        with self.assertRaises(SolrError):
            self.solr.commit(handler='fakehandler')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('fakehandler'))

    def test_optimize(self):
        # Make sure it doesn't blow up. Side effects are hard to measure. :/
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.add([
            {
                'id': 'doc_6',
                'title': 'Newly added doc',
            }
        ], commit=False)
        self.assertEqual(len(self.solr.search('doc')), 3)
        self.solr.optimize()
        # optimize should default to 'update' handler
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('update/?'))
        self.assertEqual(len(self.solr.search('doc')), 4)

        # optimize should support custom handlers
        with self.assertRaises(SolrError):
            self.solr.optimize(handler='fakehandler')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('fakehandler'))

    def test_extract(self):
        fake_f = StringIO("""
            <html>
                <head>
                    <meta charset="utf-8">
                    <meta name="haystack-test" content="test 1234">
                    <title>Test Title ☃&#x2603;</title>
                </head>
                    <body>foobar</body>
            </html>
        """)
        fake_f.name = "test.html"
        extracted = self.solr.extract(fake_f)
        # extract should default to 'update/extract' handler
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('update/extract'))

        # extract should support custom handlers
        with self.assertRaises(SolrError):
            self.solr.extract(fake_f, handler='fakehandler')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('fakehandler'))

        # Verify documented response structure:
        self.assertIn('contents', extracted)
        self.assertIn('metadata', extracted)

        self.assertIn('foobar', extracted['contents'])

        m = extracted['metadata']

        self.assertEqual([fake_f.name], m['stream_name'])

        self.assertIn('haystack-test', m, "HTML metadata should have been extracted!")
        self.assertEqual(['test 1234'], m['haystack-test'])

        # Note the underhanded use of a double snowman to verify both that Tika
        # correctly decoded entities and that our UTF-8 characters survived the
        # round-trip:
        self.assertEqual(['Test Title ☃☃'], m['title'])

    def test_full_url(self):
        self.solr.url = 'http://localhost:8983/solr/core0'
        full_url = self.solr._create_full_url(path='/update')

        # Make sure trailing and leading slashes do not collide:
        self.assertEqual(full_url, 'http://localhost:8983/solr/core0/update')

    def test_request_handler(self):
        before_test_use_qt_param = self.solr.use_qt_param
        before_test_search_handler = self.solr.search_handler

        self.solr.use_qt_param = True

        response = self.solr.search('my query')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('select'))

        response = self.solr.search('my', handler='/autocomplete')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('select'))
        self.assertTrue(args[1].find("qt=%2Fautocomplete") > -1)

        self.solr.search_handler = '/autocomplete'

        response = self.solr.search('my')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('select'))
        self.assertTrue(args[1].find("qt=%2Fautocomplete") > -1)

        self.solr.use_qt_param = False
        # will change the path, so expect a 404
        with self.assertRaises(SolrError):
            response = self.solr.search('my')
        args, kwargs = self.solr._send_request.call_args
        self.assertTrue(args[1].startswith('/autocomplete'))
        self.assertTrue(args[1].find("qt=%2Fautocomplete") < 0)

        # reset the values to what they were before the test
        self.solr.use_qt_param = before_test_use_qt_param
        self.solr.search_handler = before_test_search_handler
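
The call_args assertions throughout this example imply that its setUp (masked in the source) wraps Solr._send_request in a mock. A minimal sketch of that arrangement, assuming unittest.mock and a core at a hypothetical local URL:

from unittest import mock

from pysolr import Solr

solr = Solr("http://localhost:8983/solr/core0", timeout=2)
solr._send_request = mock.Mock(wraps=solr._send_request)

solr.search("doc")
args, kwargs = solr._send_request.call_args
assert args[1].startswith("select/?")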