Ejemplo n.º 1
0
def _update_search_index(video):
    """
    Refresh the TeamVideo search-index entry for ``video``.

    Does nothing unless the video is currently under moderation.
    """
    if not video.moderated_by:
        return
    team_video = TeamVideo.objects.get(video=video, team=video.moderated_by)
    site.get_index(TeamVideo).update_object(team_video)
Ejemplo n.º 2
0
    def handle(self, *args, **options):
        """Reindex every video belonging to one team.

        Expects exactly one positional argument: the team slug. Updates
        both the Video and TeamVideo search indexes for each team video,
        printing a progress dot per video.
        """
        if len(args) != 1:
            raise CommandError('Usage index_team_videos <team-slug>')
        try:
            team = Team.objects.get(slug=args[0])
        except Team.DoesNotExist:
            raise CommandError('Team with slug %r not found' % (args[0], ))

        video_index = site.get_index(Video)
        team_video_index = site.get_index(TeamVideo)
        self.stdout.write("Fetching videos\n")
        # Materialize the queryset up front so the timing below measures
        # indexing only, not the initial database fetch.
        video_list = list(
            TeamVideo.objects.filter(team=team).select_related('video'))
        start_time = time.time()
        self.stdout.write("Indexing")
        self.stdout.flush()
        with transaction.commit_manually():
            for team_video in video_list:
                video_index.update_object(team_video.video)
                team_video_index.update_object(team_video)
                self.stdout.write(".")
                self.stdout.flush()
                # commit after each pass to make sure that we aren't keeping
                # open any database locks
                transaction.commit()
        end_time = time.time()
        self.stdout.write("\ndone indexed %s videos in %0.1f seconds\n" %
                          (len(video_list), end_time - start_time))
Ejemplo n.º 3
0
    def handle(self, **options):

        
        if options['index_type'] == 'feature':
                    feature_index = site.get_index(Feature)
                    features = Feature.objects.filter(published=True)
                    back.update(feature_index, features)
        else:
            recipient_index = site.get_index(Recipient)
            location_index = site.get_index(Location)
            
            if options['country']:
                index_data = Recipient.objects.select_related().filter(countrypayment=options['country'], total__gt=1000).only('name', 'geo1', 'geo2', 'geo3', 'geo4', 'zipcode', 'countrypayment')
                locations = Location.objects.filter(country=options['country'])
            else:
                raise ValueError('Country is required')
            
            settings.HAYSTACK_XAPIAN_PATH = "%s-%s" % (settings.HAYSTACK_XAPIAN_PATH, options['country'])
            back = backend.SearchBackend()
            print "now indexing Recipients"
            back.update(recipient_index, index_data)

            print "now indexing Location"
            back.update(location_index, locations)
        connection.close()
Ejemplo n.º 4
0
def _update_search_index(video):
    """
    Keep the TeamVideo search index in sync for a moderated video.

    Videos with no moderating team are left untouched.
    """
    team = video.moderated_by
    if team:
        record = TeamVideo.objects.get(video=video, team=team)
        index = site.get_index(TeamVideo)
        index.update_object(record)
Ejemplo n.º 5
0
def add_to_index(obj_identifier, **kwargs):
    """Index the single object named by ``obj_identifier``.

    The identifier is split into a model path and a primary key; the
    instance is fetched and pushed to its registered index's backend.
    """
    object_path, pk = utils.split_obj_identifier(obj_identifier)
    model_class = utils.get_model_class(object_path)
    obj = model_class.objects.get(pk=pk)
    index = site.get_index(model_class)
    index.backend.update(index, [obj])
Ejemplo n.º 6
0
 def handle(self, *args, **options):
     """Profile a single search-index update for one video.

     Positional args: <video-pk> [sort] [restrictions]. ``sort`` defaults
     to 'cumulative'; ``restrictions`` defaults to 10 and is parsed as a
     float when it contains a dot, otherwise as an int.
     """
     if not (1 <= len(args) <= 3):
         raise CommandError(
             'Usage profile_index <video-pk> [sort] [restrictions]')
     try:
         video = Video.objects.get(pk=args[0])
     except Video.DoesNotExist:
         raise CommandError('Video not found: %s' % (args[0],))
     sort = args[1] if len(args) > 1 else 'cumulative'
     if len(args) > 2:
         raw = args[2]
         restrictions = float(raw) if '.' in raw else int(raw)
     else:
         restrictions = 10
     profiler = cProfile.Profile()
     profiler.enable()
     site.get_index(Video).update_object(video)
     profiler.disable()
     stats = pstats.Stats(profiler, stream=self.stdout)
     stats.strip_dirs().sort_stats(sort).print_stats(restrictions)
Ejemplo n.º 7
0
    def handle_app(self, app, **options):
        """Unindex search results whose database rows no longer exist.

        For every model in ``app`` that uses a ScheduledSearchIndex,
        compares the pks stored in the search index against the live
        database and removes stale entries from the backend.
        """
        # Cause the default site to load.
        from haystack import site
        from django.db.models import get_models
        from haystack.exceptions import NotRegistered
        from haystack.query import SearchQuerySet
        from haystack_scheduled.indexes import ScheduledSearchIndex

        for model in get_models(app):
            try:
                index = site.get_index(model)
            except NotRegistered:
                if self.verbosity >= 2:
                    print "Skipping '%s' - no index." % model
                continue

            if not isinstance(index, ScheduledSearchIndex):
                if self.verbosity >= 2:
                    print "Skipping '%s' - only ScheduledSearchIndex is supported." % model
                continue

            print "'%s' - unindexing removed objects." % model

            # Compare as strings: search results always carry string pks,
            # while the database may return ints.
            existings_pks = set(map(smart_str, model.objects.values_list("pk", flat=True)))
            for result in SearchQuerySet().models(model):
                if smart_str(result.pk) not in existings_pks:
                    if self.verbosity >= 2:
                        print "Unindexing pk %s" % result.pk
                    # Backend identifier format: "<app_label>.<model_name>.<pk>".
                    index.backend.remove(".".join([result.app_label, result.model_name, str(result.pk)]))
Ejemplo n.º 8
0
Archivo: query.py Proyecto: rob-b/Grab
 def _fill_cache(self):
     """Fetch the next page of search results into ``self._result_cache``.

     Pages by ITERATOR_LOAD_PER_QUERY. When ``self._load_all`` is set,
     the matching model instances are bulk-loaded from the database and
     attached to each result; results whose objects were deleted since
     indexing are skipped and tallied in ``self._ignored_result_count``.
     """
     from haystack import site
     
     if self._result_cache is None:
         self._result_cache = []
     
     # Tell the query where to start from and how many we'd like.
     cache_length = len(self._result_cache)
     self.query._reset()
     self.query.set_limits(cache_length, cache_length + ITERATOR_LOAD_PER_QUERY)
     results = self.query.get_results()
     
     # Check if we wish to load all objects.
     if self._load_all:
         original_results = []
         models_pks = {}
         loaded_objects = {}
         
         # Remember the search position for each result so we don't have to resort later.
         for result in results:
             original_results.append(result)
             models_pks.setdefault(result.model, []).append(result.pk)
         
         # Load the objects for each model in turn.
         for model in models_pks:
             if model in self._load_all_querysets:
                 # Use the overriding queryset.
                 loaded_objects[model] = self._load_all_querysets[model].in_bulk(models_pks[model])
             else:
                 # Check the SearchIndex for the model for an override.
                 try:
                     index = site.get_index(model)
                     qs = index.load_all_queryset()
                     loaded_objects[model] = qs.in_bulk(models_pks[model])
                 except NotRegistered:
                     # The model returned doesn't seem to be registered with
                     # the current site. We should silently fail and populate
                     # nothing for those objects.
                     loaded_objects[model] = []
     
     # A short page means the backend is exhausted; count the shortfall so
     # length bookkeeping elsewhere stays consistent.
     if len(results) < ITERATOR_LOAD_PER_QUERY:
         self._ignored_result_count += ITERATOR_LOAD_PER_QUERY - len(results)
     
     for result in results:
         if self._load_all:
             # We have to deal with integer keys being cast from strings; if this
             # fails we've got a character pk.
             try:
                 result.pk = int(result.pk)
             except ValueError:
                 pass
             try:
                 result._object = loaded_objects[result.model][result.pk]
             except (KeyError, IndexError):
                 # The object was either deleted since we indexed or should
                 # be ignored; fail silently.
                 self._ignored_result_count += 1
                 continue
         
         self._result_cache.append(result)
Ejemplo n.º 9
0
 def _rebuild_index(self):
     """Rebuild the Video search index from scratch."""
     from haystack import site
     site.get_index(models.Video).reindex()
Ejemplo n.º 10
0
 def __init__(self, obj, admin_site=None):
     """Wrap ``obj``; attach its SearchIndex when missing (Haystack < 1.2)."""
     self.admin = admin_site
     self.object = obj
     if getattr(self.object, 'searchindex', None) is not None:
         return
     # < Haystack 1.2
     from haystack import site
     self.object.searchindex = site.get_index(self.object.model)
Ejemplo n.º 11
0
 def get_stored_fields(self):
     """
     Return a dict of the SearchIndex's stored fields for this result.

     Useful for serializing results; only fields the index marks as
     'stored' are included. Returns {} when the model has no registered
     index. The computed dict is cached on the instance.
     """
     if self._stored_fields is not None:
         return self._stored_fields

     from haystack import site
     from haystack.exceptions import NotRegistered

     try:
         index = site.get_index(self.model)
     except NotRegistered:
         # Not found? Return nothing.
         return {}

     # Collect only the fields flagged as stored.
     self._stored_fields = dict(
         (fieldname, getattr(self, fieldname, u''))
         for fieldname, field in index.fields.items()
         if field.stored is True)

     return self._stored_fields
Ejemplo n.º 12
0
 def get_index(self, model_class):
     """Fetch the model's registered ``SearchIndex`` in a standarized way."""
     try:
         index = site.get_index(model_class)
     except NotRegistered:
         self.log.error("Couldn't find a registered SearchIndex for %s." % model_class)
         return None
     return index
Ejemplo n.º 13
0
 def _rebuild_index(self):
     """Drop and regenerate the search index for all videos."""
     from haystack import site
     video_index = site.get_index(models.Video)
     video_index.reindex()
Ejemplo n.º 14
0
def remove_index(app_name, model_name, identifier):
    """Remove ``identifier`` from the search index of the given model."""
    from haystack import site
    import openPLM.plmapp.search_indexes

    klass = get_model(app_name, model_name)
    site.get_index(klass).remove_object(identifier)
Ejemplo n.º 15
0
 def __init__(self, obj, admin_site=None):
     """Store the wrapped object and backfill a SearchIndex if absent."""
     self.admin = admin_site
     self.object = obj
     existing = getattr(self.object, 'searchindex', None)
     if existing is None:
         # < Haystack 1.2
         from haystack import site
         self.object.searchindex = site.get_index(self.object.model)
    def handle(self, **options):
        """Drain queued IndexingTasks, pushing updates/deletes to Solr."""

        from parliament.search.models import IndexingTask

        delete_tasks = list(
            IndexingTask.objects.filter(action='delete')
        )

        update_tasks = list(
            IndexingTask.objects.filter(action='update').prefetch_related('content_object')
        )

        solr = pysolr.Solr(settings.HAYSTACK_SOLR_URL)

        if update_tasks:
            # Skip tasks whose target object no longer exists.
            update_objs = [t.content_object for t in update_tasks if t.content_object]

            # groupby requires its input sorted by the same key.
            update_objs.sort(key=lambda o: o.__class__.__name__)
            for cls, objs in itertools.groupby(update_objs, lambda o: o.__class__):
                print "Indexing %s" % cls
                index = site.get_index(cls)
                prepared_objs = [index.prepare(o) for o in objs]
                solr.add(prepared_objs)

            # Tasks are only cleared after a successful push.
            IndexingTask.objects.filter(id__in=[t.id for t in update_tasks]).delete()

        if delete_tasks:
            for dt in delete_tasks:
                print "Deleting %s" % dt.identifier
                solr.delete(id=dt.identifier, commit=False)
            # Single commit after batching the individual deletes.
            solr.commit()

            IndexingTask.objects.filter(id__in=[t.id for t in delete_tasks]).delete()
Ejemplo n.º 17
0
    def get_stored_fields(self):
        """
        Return a dictionary of the SearchIndex's stored fields for this
        result, computing and caching it on first access.

        Only fields the index flags as 'stored' appear; an unregistered
        model yields an empty dict.
        """
        if self._stored_fields is not None:
            return self._stored_fields

        from haystack import site
        from haystack.exceptions import NotRegistered

        try:
            index = site.get_index(self.model)
        except NotRegistered:
            # Not found? Return nothing.
            return {}

        stored = {}
        # Iterate through the index's fields, keeping only stored ones.
        for fieldname, field in index.fields.items():
            if field.stored is True:
                stored[fieldname] = getattr(self, fieldname, u'')
        self._stored_fields = stored

        return self._stored_fields
 def get_index(self, model_class):
     """Fetch the model's registered ``SearchIndex`` in a standarized way."""
     try:
         return site.get_index(model_class)
     except NotRegistered:
         pass
     self.log.error("Couldn't find a registered SearchIndex for %s." % model_class)
     return None
Ejemplo n.º 19
0
def detail(request, idea_id):
    """
    Detail view; idea_id must be a string containing an int.
    """
    idea = get_object_or_404(Idea, pk=int(idea_id))
    if request.method == 'POST':
        # POST here only handles adding tags to the idea.
        tag_form = IdeaTagForm(request.POST)
        if tag_form.is_valid():
            data = tag_form.clean()['tags']
            # Comma-separated tag names; blanks are dropped.
            tags = [tag.strip() for tag in data.split(',') 
                    if tag.strip() != '']
            idea.tags.add(*tags)
            #   Make sure the search index included the tags
            site.get_index(Idea).update_object(idea)
            return HttpResponseRedirect(
                    reverse('idea_detail', args=(idea.id,)))
    else:
        tag_form = IdeaTagForm()

    voters = User.objects.filter(vote__idea=idea, vote__vote=UP_VOTE)

    for v in voters:
        try:
            v.profile =  v.get_profile()
        except (ObjectDoesNotExist, SiteProfileNotAvailable):
            v.profile = None

            
    idea_type = ContentType.objects.get(app_label="idea", model="idea")

    # Annotate each tag with its usage count restricted to Idea objects
    # via raw SQL against taggit's tables.
    tags = idea.tags.extra(select={
        'tag_count': """
            SELECT COUNT(*) from taggit_taggeditem tt WHERE tt.tag_id = taggit_tag.id 
            AND content_type_id = %s 
        """
    }, select_params=[idea_type.id]).order_by('name')

    for tag in tags:
        tag.tag_url = "%s?tags=%s"  %  (reverse('idea_list'), tag.slug)

    # NOTE(review): a POST with an invalid tag form falls through here and
    # re-renders the detail page with the bound (error-carrying) form.
    return _render(request, 'idea/detail.html', {
        'idea': idea,   #   title, body, user name, user photo, time
        'support': request.user in voters,
        'tags': tags, 
        'voters': voters,
        'tag_form': tag_form
        })
Ejemplo n.º 20
0
def update_index(app_name, model_name, pk, **kwargs):
    """Re-index a single object identified by app label, model name and pk."""
    from haystack import site
    import openPLM.plmapp.search_indexes

    klass = get_model(app_name, model_name)
    obj = klass.objects.select_related(depth=1).get(pk=pk)
    site.get_index(klass).update_object(obj)
Ejemplo n.º 21
0
def remove_search_index(model_class, obj_identifier):
    """Drop ``obj_identifier`` from ``model_class``'s search index, if any."""
    try:
        index = site.get_index(model_class)
    except NotRegistered:
        log(u'Search index is not registered for %s' % model_class)
        return None
    index.remove_object(obj_identifier)
Ejemplo n.º 22
0
def remove_search_index(model_class, obj_identifier):
    """
    Remove ``obj_identifier`` from the search index registered for
    ``model_class``; logs and returns None when no index is registered.
    """
    try:
        search_index = site.get_index(model_class)
    except NotRegistered:
        # Fixed typo in the log message ("Seacrh" -> "Search"), matching
        # the correctly-spelled variant of this helper elsewhere.
        log(u'Search index is not registered for %s' % model_class)
        return None
    
    search_index.remove_object(obj_identifier)
Ejemplo n.º 23
0
    def handle_app(self, app, **options):
        """Batch-index every registered model in ``app``.

        Optionally restricts to objects updated within ``self.age`` hours
        and indexes in batches of ``self.batchsize`` to bound memory use.
        """
        # Cause the default site to load.
        from haystack import site
        from django.db.models import get_models
        from haystack.exceptions import NotRegistered

        if self.site:
            # Resolve a dotted path to an alternate SearchSite, rebinding the
            # local name `site`. NOTE(review): import errors are swallowed,
            # silently falling back to the default site -- confirm intended.
            path_bits = self.site.split(".")
            module_name = ".".join(path_bits[:-1])
            site_name = path_bits[-1]

            try:
                module = importlib.import_module(module_name)
                site = getattr(module, site_name)
            except (ImportError, NameError):
                pass

        for model in get_models(app):
            try:
                index = site.get_index(model)
            except NotRegistered:
                if self.verbosity >= 2:
                    print "Skipping '%s' - no index." % model
                continue

            extra_lookup_kwargs = {}
            updated_field = index.get_updated_field()

            if self.age:
                # Only restrict by age if the index exposes an update-time field.
                if updated_field:
                    extra_lookup_kwargs["%s__gte" % updated_field] = datetime.datetime.now() - datetime.timedelta(
                        hours=self.age
                    )
                else:
                    if self.verbosity >= 2:
                        print "No updated date field found for '%s' - not restricting by age." % model.__name__

            # `.select_related()` seems like a good idea here but can fail on
            # nullable `ForeignKey` as well as what seems like other cases.
            qs = index.get_queryset().filter(**extra_lookup_kwargs).order_by(model._meta.pk.name)
            total = qs.count()

            if self.verbosity >= 1:
                print "Indexing %d %s." % (total, smart_str(model._meta.verbose_name_plural))

            for start in range(0, total, self.batchsize):
                end = min(start + self.batchsize, total)

                if self.verbosity >= 2:
                    print "  indexing %s - %d of %d." % (start + 1, end, total)

                # Get a clone of the QuerySet so that the cache doesn't bloat up
                # in memory. Useful when reindexing large amounts of data.
                small_cache_qs = qs.all()
                index.backend.update(index, small_cache_qs[start:end])

                # Clear out the DB connections queries because it bloats up RAM.
                reset_queries()
def search_index_delete(app_name, model_name, obj_identifier, **kwargs):
    logger = search_index_delete.get_logger(**kwargs)
    try:
        model_class = get_model(app_name, model_name)
        search_index = site.get_index(model_class)
        search_index.remove_object(obj_identifier)
    except Exception, exc:
        logger.error(exc)
        search_index_delete.retry(exc=exc)
Ejemplo n.º 25
0
def update_one_team_video(team_video_id):
    """Push a single TeamVideo into the search backend, if it still exists."""
    from teams.models import TeamVideo, TeamVideoLanguage
    try:
        tv = TeamVideo.objects.get(id=team_video_id)
    except TeamVideo.DoesNotExist:
        return
    index = site.get_index(TeamVideo)
    index.backend.update(index, [tv])
Ejemplo n.º 26
0
 def run(self, app_name, model_name, pk, **kwargs):
     logger = self.get_logger(**kwargs)
     try:
         model_class = get_model(app_name, model_name)
         instance = model_class.objects.get(pk=pk)
         search_index = site.get_index(model_class)
         search_index.update_object(instance)
     except ObjectDoesNotExist, exc:
         logger.warn(exc)
def search_index_delete(app_name, model_name, obj_identifier, **kwargs):
    logger = search_index_delete.get_logger(**kwargs)
    try:
        model_class = get_model(app_name, model_name)
        search_index = site.get_index(model_class)
        search_index.remove_object(obj_identifier)
    except Exception, exc:
        logger.error(exc)
        search_index_delete.retry(exc=exc)
Ejemplo n.º 28
0
def update_one_team_video(team_video_id):
    """Update the Solr index for the given team video."""
    from teams.models import TeamVideo
    try:
        team_video = TeamVideo.objects.get(id=team_video_id)
    except TeamVideo.DoesNotExist:
        # Deleted between scheduling and execution; nothing to index.
        return
    index = site.get_index(TeamVideo)
    index.backend.update(index, [team_video])
Ejemplo n.º 29
0
def update_one_team_video(team_video_id):
    """Re-index one TeamVideo; silently skip if it no longer exists."""
    from teams.models import TeamVideo, TeamVideoLanguage
    try:
        record = TeamVideo.objects.get(id=team_video_id)
    except TeamVideo.DoesNotExist:
        return
    search_index = site.get_index(TeamVideo)
    search_index.backend.update(search_index, [record])
Ejemplo n.º 30
0
 def get_index(self, model_class, **kwargs):
     """
     Fetch the model's registered ``SearchIndex`` in a standarized way.
     """
     logger = self.get_logger(**kwargs)
     try:
         index = index_holder.get_index(model_class)
     except IndexNotFoundException:
         logger.error("Couldn't find a SearchIndex for %s." % model_class)
         return None
     return index
Ejemplo n.º 31
0
 def run(self, app_name, model_name, pk, **kwargs):
     logger = self.get_logger(**kwargs)
     try:
         model_class = get_model(app_name, model_name)
         instance = model_class.objects.get(pk=pk)
         search_index = site.get_index(model_class)
         search_index.update_object(instance)
     except Exception, exc:
         logger.error(exc)
         self.retry([app_name, model_name, pk], kwargs, exc=exc)
Ejemplo n.º 32
0
 def run(self, app_name, model_name, pk, **kwargs):
     logger = self.get_logger(**kwargs)
     try:
         model_class = get_model(app_name, model_name)
         instance = model_class.objects.get(pk=pk)
         search_index = site.get_index(model_class)
         search_index.update_object(instance)
     except Exception, exc:
         logger.error(exc)
         self.retry([app_name, model_name, pk], kwargs, exc=exc)
 def get_index(self, model_class, **kwargs):
     """
     Fetch the model's registered ``SearchIndex`` in a standarized way.
     """
     logger = self.get_logger(**kwargs)
     try:
         return index_holder.get_index(model_class)
     except IndexNotFoundException:
         logger.error("Couldn't find a SearchIndex for %s." % model_class)
         return None
Ejemplo n.º 34
0
def update_index(app_name, model_name, pk, fast_reindex=False, **kwargs):
    """Re-index one object; optionally mark it for a fast (partial) reindex."""
    from haystack import site
    import openPLM.plmapp.search_indexes

    klass = get_model(app_name, model_name)
    obj = _get_manager(klass).get(pk=pk)
    if fast_reindex:
        # Flag consumed by the index preparation step.
        obj.fast_reindex = True
    site.get_index(klass).update_object(obj)
Ejemplo n.º 35
0
def haystack_update_index(app_label, model_name, pk, is_removal, using="default"):
    """
    Updates a haystack index for the given model (specified by ``app_label``
    and ``model_name``). If ``is_removal`` is ``True``, a fake instance is
    constructed with the given ``pk`` and passed to the index's
    :meth:`remove_object` method. Otherwise, the latest version of the instance
    is fetched from the database and passed to the index's
    :meth:`update_object` method.

    If an import_app_label, import_model, and import_pk are provided, this task
    will spawn ``mark_import_complete``.

    """
    model_class = get_model(app_label, model_name)
    search_index = site.get_index(model_class)
    try:
        if is_removal:
            # Unsaved stand-in instance: only the pk is needed for removal.
            instance = model_class(pk=pk)
            search_index.remove_object(instance)
        else:
            # NOTE(review): fetches Video.objects but catches
            # model_class.DoesNotExist -- looks Video-specific despite the
            # generic signature; confirm model_class is always Video here.
            try:
                instance = Video.objects.using(using).get(pk=pk)
            except model_class.DoesNotExist:
                logging.debug(
                    ("haystack_update_index(%r, %r, %r, %r, using=%r)" " could not find video with pk %i"),
                    app_label,
                    model_name,
                    pk,
                    is_removal,
                    using,
                    pk,
                )
            else:
                # Only active videos stay in the index; anything else is removed.
                if instance.status == Video.ACTIVE:
                    search_index.update_object(instance)
                else:
                    search_index.remove_object(instance)
    except (DatabaseLockError, LockError), e:
        # maximum wait is ~30s
        # Randomized exponential backoff, exponent capped at 4.
        exp = min(haystack_update_index.request.retries, 4)
        countdown = random.random() * (2 ** exp)
        logging.debug(
            ("haystack_update_index(%r, %r, %r, %r, using=%r) " "retrying due to %s with countdown %r"),
            app_label,
            model_name,
            pk,
            is_removal,
            using,
            e.__class__.__name__,
            countdown,
        )
        haystack_update_index.retry(countdown=countdown)
Ejemplo n.º 36
0
def update_search_index_for_qs(model_class, pks):
    """
    Bulk-update the search index for the ``model_class`` rows whose pks
    are in ``pks``; saves a timing LogEntry. No-op (returns None) when no
    index is registered for the model.
    """
    start = time.time()
    
    qs = model_class._default_manager.filter(pk__in=pks)

    try:
        search_index = site.get_index(model_class)
    except NotRegistered:
        # Fixed typo in the log message ("Seacrh" -> "Search").
        log(u'Search index is not registered for %s' % model_class)
        return None
    
    search_index.backend.update(search_index, qs)
    
    LogEntry(num=len(pks), time=time.time()-start).save()
Ejemplo n.º 37
0
def update_search_index_for_qs(model_class, pks):
    """
    Re-index all ``model_class`` objects whose primary keys appear in
    ``pks``, recording the elapsed time in a LogEntry. Returns None when
    the model has no registered index.
    """
    start = time.time()

    qs = model_class._default_manager.filter(pk__in=pks)

    try:
        search_index = site.get_index(model_class)
    except NotRegistered:
        # Fixed typo in the log message ("Seacrh" -> "Search").
        log(u'Search index is not registered for %s' % model_class)
        return None

    search_index.backend.update(search_index, qs)

    LogEntry(num=len(pks), time=time.time() - start).save()
Ejemplo n.º 38
0
def update_search_index(model_class, pk):
    """
    Update the search-index entry for one object; logs and bails out if
    the object no longer exists or the model has no registered index.
    """
    try:
        obj = model_class.objects.get(pk=pk)
    except model_class.DoesNotExist:
        log(u'Object does not exist for %s %s' % (model_class, pk))
        return

    try:
        search_index = site.get_index(model_class)
    except NotRegistered:
        # Fixed typo in the log message ("Seacrh" -> "Search").
        log(u'Search index is not registered for %s' % model_class)
        return None

    search_index.update_object(obj)
Ejemplo n.º 39
0
def update_search_index(model_class, pk):
    """
    Refresh the index entry for a single ``model_class`` object by pk.
    Missing objects and unregistered models are logged, not raised.
    """
    try:
        obj = model_class.objects.get(pk=pk)
    except model_class.DoesNotExist:
        log(u'Object does not exist for %s %s' % (model_class, pk))
        return

    try:
        search_index = site.get_index(model_class)
    except NotRegistered:
        # Fixed typo in the log message ("Seacrh" -> "Search").
        log(u'Search index is not registered for %s' % model_class)
        return None

    search_index.update_object(obj)
Ejemplo n.º 40
0
def add_idea(request):
    """Create a new idea (POST) or render the add-idea form (GET)."""
    banner = get_banner()
    if request.method == 'POST':
        idea = Idea(creator=request.user, state=state_helper.get_first_state())
        if idea.state.name == 'Active':
            form = IdeaForm(request.POST, instance=idea)
            if form.is_valid():
                new_idea = form.save()
                # The creator automatically up-votes their own idea.
                vote_up(new_idea, request.user)
                #   Make sure the search index included the tags
                site.get_index(Idea).update_object(new_idea)
                return HttpResponseRedirect(reverse('idea_detail', args=(idea.id,)))
            # NOTE(review): an invalid form falls through with no return,
            # so this view returns None on that path -- confirm intended.
        else:
            return HttpResponse('Idea is archived', status=403)
    else:
        idea_title = request.GET.get('idea_title', '')
        form = IdeaForm(initial={'title':idea_title})
        # Show similar existing ideas alongside the blank form.
        return _render(request, 'idea/add.html', {
            'form':form,
            'banner':banner,
            'similar': [r.object for r in more_like_text(idea_title,
                Idea)]
            })
Ejemplo n.º 41
0
    def handle_app(self, app, **options):
        """Batch-index every registered model in ``app``.

        Optionally restricts to objects updated within ``self.age`` hours
        and processes the queryset in ``self.batchsize`` chunks.
        """
        # Cause the default site to load.
        from haystack import handle_registrations
        handle_registrations()
        
        from django.db.models import get_models
        from haystack import site
        from haystack.exceptions import NotRegistered

        for model in get_models(app):
            try:
                index = site.get_index(model)
            except NotRegistered:
                if self.verbosity >= 2:
                    print "Skipping '%s' - no index." % model
                continue

            extra_lookup_kwargs = {}
            updated_field = index.get_updated_field()
            
            if self.age:
                # Only restrict by age when the index exposes an update-time field.
                if updated_field:
                    extra_lookup_kwargs['%s__gte' % updated_field] = datetime.datetime.now() - datetime.timedelta(hours=self.age)
                else:
                    if self.verbosity >= 2:
                        print "No updated date field found for '%s' - not restricting by age." % model.__name__
            
            # DRL_TODO: .select_related() seems like a good idea here but
            #           can cause empty QuerySets. Why?
            qs = index.get_query_set().filter(**extra_lookup_kwargs).order_by(model._meta.pk.name)
            total = qs.count()

            if self.verbosity >= 1:
                print "Indexing %d %s." % (total, smart_str(model._meta.verbose_name_plural))

            for start in range(0, total, self.batchsize):
                end = min(start + self.batchsize, total)
                
                if self.verbosity >= 2:
                    print "  indexing %s - %d of %d." % (start+1, end, total)
                
                # Get a clone of the QuerySet so that the cache doesn't bloat up
                # in memory. Useful when reindexing large amounts of data.
                small_cache_qs = qs.all()
                index.backend.update(index, small_cache_qs[start:end])
                
                # Clear out the DB connections queries because it bloats up RAM.
                reset_queries()
Ejemplo n.º 42
0
 def update_external(self, print_delta=0, start=0, select_related=None):
     """
     Update search index and cached_templates for all objects
     """
     qs = self.all()
     if select_related:
         qs = qs.select_related(*select_related)
     if start:
         qs = qs.filter(pk__gte=start)
     qs = queryset_iterator(qs)
     #context = Context(dict(STATIC_URL=settings.STATIC_URL))
     search_index = site.get_index(self.model)
     for obj in qs:
         obj.update_search_index(search_index)
         #obj.update_cached_template(context)
         if print_delta and not obj.id % print_delta:
             print obj.id
Ejemplo n.º 43
0
 def update_external(self, print_delta=0, start=0, select_related=None):
     """
     Update search index and cached_templates for all objects
     """
     qs = self.all()
     if select_related:
         qs = qs.select_related(*select_related)
     if start:
         qs = qs.filter(pk__gte=start)
     qs = queryset_iterator(qs)
     #context = Context(dict(STATIC_URL=settings.STATIC_URL))
     search_index = site.get_index(self.model)
     for obj in qs:
         obj.update_search_index(search_index)
         #obj.update_cached_template(context)
         if print_delta and not obj.id % print_delta:
             print obj.id
Ejemplo n.º 44
0
def video_changed_tasks(video_pk, new_version_id=None):
    """React to a changed video: refresh metadata, notify, and reindex."""
    from videos import metadata_manager
    from videos.models import Video
    from teams.models import TeamVideo

    metadata_manager.update_metadata(video_pk)
    # A new subtitle version triggers the notification pipeline.
    if new_version_id is not None:
        _send_notification(new_version_id)
        _check_alarm(new_version_id)
        _detect_language(new_version_id)

    video = Video.objects.get(pk=video_pk)
    if video.teamvideo_set.count() > 0:
        index = site.get_index(TeamVideo)
        index.backend.update(index, list(video.teamvideo_set.all()))

    video.update_search_index()
Ejemplo n.º 45
0
def extract_and_index_pdf_text(document_pk=None, **kwargs):
    """
    Extract the text of every page of a PDF Document, create Page rows
    for them, and push those rows into the search index.

    Returns True on success; False when the document is missing, has no
    page count, or is not a PDF.
    """
    logger = extract_and_index_pdf_text.get_logger(**kwargs)
    logger.debug("indexing doc with pk %s" % document_pk)
    try:
        doc = Document.objects.get(pk=document_pk)
    except Document.DoesNotExist:
        logger.warning("Warning, Document with pk %s does not exist" % str(document_pk))
        return False
    if not doc.pages or doc.mimetype != 'application/pdf':
        logger.info("Warning, doc.pages (%s) not set or doc.mimetype (%s) != 'application/pdf'" % (str(doc.pages), str(doc.mimetype)))
        return False
    #logger.debug("filename path %s %s" % (str(doc.file.path), str(doc.file.name)))
    page_num = 1
    while page_num <= doc.pages:
        page_text = pdf2text(doc.file.path, page_num)
        doc.page_set.create(num=page_num, text=page_text)
        page_num += 1
    page_index = site.get_index(Page)
    page_index.backend.update(page_index, doc.page_set.all())
    return True
Ejemplo n.º 46
0
def video_changed_tasks(video_pk, new_version_id=None):
    """
    React to a changed video: recompute metadata, handle a possible new
    subtitle version, and refresh the search indexes.
    """
    from videos import metadata_manager
    from videos.models import Video
    from teams.models import TeamVideo

    metadata_manager.update_metadata(video_pk)
    if new_version_id is not None:
        # A new subtitle version exists; notify and analyze it.
        _send_notification(new_version_id)
        _check_alarm(new_version_id)
        _detect_language(new_version_id)

    video = Video.objects.get(pk=video_pk)
    tv_qs = video.teamvideo_set
    if tv_qs.count() > 0:
        team_video_index = site.get_index(TeamVideo)
        team_video_index.backend.update(team_video_index, list(tv_qs.all()))

    video.update_search_index()
Ejemplo n.º 47
0
def _get_team_video_from_search_record(search_record):
    """
    Resolve the TeamVideo behind a search record, preferring a
    pre-fetched object; stale records are removed from solr and None is
    returned.
    """
    prefetched = getattr(search_record, '_team_video', None)
    if prefetched:
        # This is ugly, but allows us to pre-fetch the teamvideos for the
        # search records all at once to avoid multiple DB queries.
        return prefetched

    try:
        return TeamVideo.objects.get(pk=search_record.team_video_pk)
    except TeamVideo.DoesNotExist:
        from raven.contrib.django.models import client
        client.create_from_exception()

    # ok, for some reason, this search record got stale.
    # no idea why.
    # so let's delete it so this can't happen again
    stale_index = site.get_index(TeamVideo)
    stale_index.backend.remove(search_record.id)
    logger.error("Removing %s from solr since it's stale" %
                 search_record.id)

    return None
Ejemplo n.º 48
0
def _get_team_video_from_search_record(search_record):
    """
    Look up the TeamVideo a search record points at; on a stale record,
    log it, scrub it from solr, and return None.
    """
    cached = getattr(search_record, '_team_video', None)
    if cached:
        # This is ugly, but allows us to pre-fetch the teamvideos for the
        # search records all at once to avoid multiple DB queries.
        return cached

    try:
        return TeamVideo.objects.get(pk=search_record.team_video_pk)
    except TeamVideo.DoesNotExist:
        logger.warn('DoesNotExist error when looking up search record',
                    exc_info=True)

    # ok, for some reason, this search record got stale.
    # no idea why.
    # so let's delete it so this can't happen again
    search_index = site.get_index(TeamVideo)
    search_index.backend.remove(search_record.id)
    logger.error("Removing %s from solr since it's stale" %
                 search_record.id)

    return None
Ejemplo n.º 49
0
    def handle(self, **options):

        from parliament.search.models import IndexingTask

        delete_tasks = list(IndexingTask.objects.filter(action='delete'))

        update_tasks = list(
            IndexingTask.objects.filter(
                action='update').prefetch_related('content_object'))

        solr = pysolr.Solr(settings.HAYSTACK_SOLR_URL, timeout=600)

        if update_tasks:
            update_objs = [
                t.content_object for t in update_tasks if t.content_object
            ]

            update_objs.sort(key=lambda o: o.__class__.__name__)
            for cls, objs in itertools.groupby(update_objs,
                                               lambda o: o.__class__):
                logger.debug("Indexing %s" % cls)
                index = site.get_index(cls)
                if hasattr(index, 'should_obj_be_indexed'):
                    objs = filter(index.should_obj_be_indexed, objs)
                prepared_objs = [index.prepare(o) for o in objs]
                solr.add(prepared_objs)

            IndexingTask.objects.filter(
                id__in=[t.id for t in update_tasks]).delete()

        if delete_tasks:
            for dt in delete_tasks:
                print "Deleting %s" % dt.identifier
                solr.delete(id=dt.identifier, commit=False)
            solr.commit()

            IndexingTask.objects.filter(
                id__in=[t.id for t in delete_tasks]).delete()
Ejemplo n.º 50
0
    def handle(self, *args, **options):
        """
        Run forever, indexing one subtitle version per cycle while
        throttling to the configured per-version rate.
        """
        self.queued_versions = []
        self.last_index_time = {}
        self.video_index = site.get_index(Video)
        self.last_fetch_all_videos_time = 0
        self.last_fetch_popular_videos_time = 0
        self.all_video_queue = []
        self.popular_video_queue = []
        # Target seconds per version, derived from the requested rate.
        seconds_per_version = 1.0 / float(options.get('rate', 1))

        while True:
            cycle_start = time.time()
            if not self.queued_versions:
                self.queue_up_versions()
                elapsed = time.time() - cycle_start
                self.stdout.write("queue_up_versions() took %0.3fs seconds\n" %
                                  elapsed)
                cycle_start = time.time()
            video_id = self.index_one_version()
            elapsed = time.time() - cycle_start
            self.stdout.write("indexing %s took %0.3f seconds\n" % (
                video_id, elapsed))
            if elapsed < seconds_per_version:
                # Sleep off the remainder of this cycle's time budget.
                time.sleep(seconds_per_version - elapsed)
Ejemplo n.º 51
0
 def _process_results(self, raw_results, highlight=False, result_class=None):
     """
     Convert a raw solr response into haystack's standard result dict.

     Returns a dict with keys 'results' (result_class instances),
     'hits', 'facets' and 'spelling_suggestion'. Documents for models
     that are not registered with the search site are dropped and
     subtracted from the hit count.
     """
     if not self.site:
         from haystack import site
     else:
         site = self.site
     
     results = []
     hits = raw_results.hits
     facets = {}
     spelling_suggestion = None
     
     if result_class is None:
         result_class = SearchResult
     
     if hasattr(raw_results, 'facets'):
         facets = {
             'fields': raw_results.facets.get('facet_fields', {}),
             'dates': raw_results.facets.get('facet_dates', {}),
             'queries': raw_results.facets.get('facet_queries', {}),
         }
         
         for key in ['fields']:
             for facet_field in facets[key]:
                 # Convert to a two-tuple, as Solr's json format returns a list of
                 # pairs.
                 facets[key][facet_field] = zip(facets[key][facet_field][::2], facets[key][facet_field][1::2])
     
     if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
         if hasattr(raw_results, 'spellcheck'):
             if len(raw_results.spellcheck.get('suggestions', [])):
                 # For some reason, it's an array of pairs. Pull off the
                 # collated result from the end.
                 spelling_suggestion = raw_results.spellcheck.get('suggestions')[-1]
     
     indexed_models = site.get_indexed_models()
     
     for raw_result in raw_results.docs:
         app_label, model_name = raw_result[DJANGO_CT].split('.')
         additional_fields = {}
         model = get_model(app_label, model_name)
         
         if model and model in indexed_models:
             # The index depends only on the model, so look it up once
             # per document instead of once per field (it was previously
             # fetched inside the key/value loop).
             index = site.get_index(model)
             for key, value in raw_result.items():
                 string_key = str(key)
                 
                 if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                     additional_fields[string_key] = index.fields[string_key].convert(value)
                 else:
                     additional_fields[string_key] = self.conn._to_python(value)
             
             # Internal bookkeeping fields must not leak into results.
             del(additional_fields[DJANGO_CT])
             del(additional_fields[DJANGO_ID])
             del(additional_fields['score'])
             
             if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
                 additional_fields['highlighted'] = raw_results.highlighting[raw_result[ID]]
             
             result = result_class(app_label, model_name, raw_result[DJANGO_ID], raw_result['score'], searchsite=self.site, **additional_fields)
             results.append(result)
         else:
             hits -= 1
     
     return {
         'results': results,
         'hits': hits,
         'facets': facets,
         'spelling_suggestion': spelling_suggestion,
     }
Ejemplo n.º 52
0
 def _process_results(self, raw_page, highlight=False, query_string='', spelling_query=None):
     """
     Convert a page of raw whoosh results into haystack's standard
     result dict ('results', 'hits', 'facets', 'spelling_suggestion').
     """
     from haystack import site
     results = []
     
     # It's important to grab the hits first before slicing. Otherwise, this
     # can cause pagination failures.
     hits = len(raw_page)
     
     facets = {}
     spelling_suggestion = None
     indexed_models = site.get_indexed_models()
     
     for doc_offset, raw_result in enumerate(raw_page):
         score = raw_page.score(doc_offset) or 0
         app_label, model_name = raw_result['django_ct'].split('.')
         additional_fields = {}
         model = get_model(app_label, model_name)
         
         if model and model in indexed_models:
             # The index depends only on the model, so look it up once
             # per document instead of once per field (it was previously
             # fetched inside the key/value loop).
             index = site.get_index(model)
             for key, value in raw_result.items():
                 string_key = str(key)
                 
                 if string_key in index.fields and hasattr(index.fields[string_key], 'convert'):
                     # Special-cased due to the nature of KEYWORD fields.
                     if isinstance(index.fields[string_key], MultiValueField):
                         # BUG FIX: use ==, not 'is', when comparing to an
                         # int literal; identity checks on ints are
                         # implementation-dependent.
                         if value is None or len(value) == 0:
                             additional_fields[string_key] = []
                         else:
                             additional_fields[string_key] = value.split(',')
                     else:
                         additional_fields[string_key] = index.fields[string_key].convert(value)
                 else:
                     additional_fields[string_key] = self._to_python(value)
             
             # Internal bookkeeping fields must not leak into results.
             del(additional_fields['django_ct'])
             del(additional_fields['django_id'])
             
             if highlight:
                 from whoosh import analysis
                 from whoosh.highlight import highlight, ContextFragmenter, UppercaseFormatter
                 sa = analysis.StemmingAnalyzer()
                 terms = [term.replace('*', '') for term in query_string.split()]
                 
                 additional_fields['highlighted'] = {
                     self.content_field_name: [highlight(additional_fields.get(self.content_field_name), terms, sa, ContextFragmenter(terms), UppercaseFormatter())],
                 }
             
             result = SearchResult(app_label, model_name, raw_result['django_id'], score, **additional_fields)
             results.append(result)
         else:
             hits -= 1
     
     if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False):
         if spelling_query:
             spelling_suggestion = self.create_spelling_suggestion(spelling_query)
         else:
             spelling_suggestion = self.create_spelling_suggestion(query_string)
     
     return {
         'results': results,
         'hits': hits,
         'facets': facets,
         'spelling_suggestion': spelling_suggestion,
     }
Ejemplo n.º 53
0
 def reindex_team_videos(self):
     """Rebuild the entire TeamVideo search index."""
     team_video_index = site.get_index(TeamVideo)
     team_video_index.reindex()
Ejemplo n.º 54
0
 def handle_app(self, app, **options):
     """
     Re-index every registered model in `app`, in batches of
     self.batchsize, optionally fanning batches out to a
     multiprocessing pool (self.workers) and optionally removing
     index entries whose pks no longer exist (self.remove).
     """
     from django.db.models import get_models
     from haystack.exceptions import NotRegistered
     
     site = get_site(self.site)
     
     if self.workers > 0:
         import multiprocessing
     
     for model in get_models(app):
         try:
             index = site.get_index(model)
         except NotRegistered:
             # Model has no search index registered; nothing to do.
             if self.verbosity >= 2:
                 print "Skipping '%s' - no index." % model
             continue
             
         qs = build_queryset(index, model, age=self.age, verbosity=self.verbosity)
         total = qs.count()
         
         if self.verbosity >= 1:
             print "Indexing %d %s." % (total, smart_str(model._meta.verbose_name_plural))
         
         # Track every pk we index so stale entries can be removed below.
         pks_seen = set([smart_str(pk) for pk in qs.values_list('pk', flat=True)])
         
         if self.workers > 0:
             # Work items for the pool; each tuple carries everything a
             # worker needs to rebuild its own queryset slice.
             ghetto_queue = []
         
         for start in range(0, total, self.batchsize):
             end = min(start + self.batchsize, total)
             
             if self.workers == 0:
                 # Single-process path: update the slice inline.
                 do_update(index, qs, start, end, total, self.verbosity)
             else:
                 ghetto_queue.append(('do_update', model, start, end, total, self.site, self.age, self.verbosity))
         
         if self.workers > 0:
             pool = multiprocessing.Pool(self.workers)
             pool.map(worker, ghetto_queue)
         
         if self.remove:
             if self.age or total <= 0:
                 # They're using a reduced set, which may not incorporate
                 # all pks. Rebuild the list with everything.
                 qs = index.index_queryset().values_list('pk', flat=True)
                 pks_seen = set([smart_str(pk) for pk in qs])
                 total = len(pks_seen)
             
             if self.workers > 0:
                 # Fresh queue for the removal pass.
                 ghetto_queue = []
             
             for start in range(0, total, self.batchsize):
                 upper_bound = start + self.batchsize
                 
                 if self.workers == 0:
                     do_remove(index, model, pks_seen, start, upper_bound)
                 else:
                     ghetto_queue.append(('do_remove', model, pks_seen, start, upper_bound, self.site, self.verbosity))
             
             if self.workers > 0:
                 pool = multiprocessing.Pool(self.workers)
                 pool.map(worker, ghetto_queue)
Ejemplo n.º 55
0
    def _process_results(self,
                         raw_results,
                         highlight=False,
                         result_class=None):
        """
        Convert a raw solr response into haystack's standard result
        dict ('results', 'hits', 'facets', 'spelling_suggestion').
        Documents for unregistered models are dropped and subtracted
        from the hit count.
        """
        if not self.site:
            from haystack import site
        else:
            site = self.site

        results = []
        hits = raw_results.hits
        facets = {}
        spelling_suggestion = None

        if result_class is None:
            result_class = SearchResult

        if hasattr(raw_results, 'facets'):
            facets = {
                'fields': raw_results.facets.get('facet_fields', {}),
                'dates': raw_results.facets.get('facet_dates', {}),
                'queries': raw_results.facets.get('facet_queries', {}),
            }

            for key in ['fields']:
                for facet_field in facets[key]:
                    # Convert to a two-tuple, as Solr's json format returns a list of
                    # pairs.
                    facets[key][facet_field] = zip(
                        facets[key][facet_field][::2],
                        facets[key][facet_field][1::2])

        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            if hasattr(raw_results, 'spellcheck'):
                if len(raw_results.spellcheck.get('suggestions', [])):
                    # For some reason, it's an array of pairs. Pull off the
                    # collated result from the end.
                    spelling_suggestion = raw_results.spellcheck.get(
                        'suggestions')[-1]

        indexed_models = site.get_indexed_models()

        for raw_result in raw_results.docs:
            app_label, model_name = raw_result[DJANGO_CT].split('.')
            additional_fields = {}
            model = get_model(app_label, model_name)

            if model and model in indexed_models:
                # The index depends only on the model, so look it up
                # once per document instead of once per field (it was
                # previously fetched inside the key/value loop).
                index = site.get_index(model)
                for key, value in raw_result.items():
                    string_key = str(key)

                    if string_key in index.fields and hasattr(
                            index.fields[string_key], 'convert'):
                        additional_fields[string_key] = index.fields[
                            string_key].convert(value)
                    else:
                        additional_fields[string_key] = self.conn._to_python(
                            value)

                # Internal bookkeeping fields must not leak into results.
                del (additional_fields[DJANGO_CT])
                del (additional_fields[DJANGO_ID])
                del (additional_fields['score'])

                if raw_result[ID] in getattr(raw_results, 'highlighting', {}):
                    additional_fields[
                        'highlighted'] = raw_results.highlighting[
                            raw_result[ID]]

                result = result_class(app_label,
                                      model_name,
                                      raw_result[DJANGO_ID],
                                      raw_result['score'],
                                      searchsite=self.site,
                                      **additional_fields)
                results.append(result)
            else:
                hits -= 1

        return {
            'results': results,
            'hits': hits,
            'facets': facets,
            'spelling_suggestion': spelling_suggestion,
        }
Ejemplo n.º 56
0
 def handle_app(self, app, **options):
     # Cause the default site to load.
     from haystack import site
     from django.db.models import get_models
     from haystack.exceptions import NotRegistered
     
     if self.site:
         path_bits = self.site.split('.')
         module_name = '.'.join(path_bits[:-1])
         site_name = path_bits[-1]
         
         try:
             module = importlib.import_module(module_name)
             site = getattr(module, site_name)
         except (ImportError, NameError):
             pass
     
     for model in get_models(app):
         try:
             index = site.get_index(model)
         except NotRegistered:
             if self.verbosity >= 2:
                 print "Skipping '%s' - no index." % model
             continue
             
         extra_lookup_kwargs = {}
         updated_field = index.get_updated_field()
         
         if self.age:
             if updated_field:
                 extra_lookup_kwargs['%s__gte' % updated_field] = datetime.datetime.now() - datetime.timedelta(hours=self.age)
             else:
                 if self.verbosity >= 2:
                     print "No updated date field found for '%s' - not restricting by age." % model.__name__
         
         # `.select_related()` seems like a good idea here but can fail on
         # nullable `ForeignKey` as well as what seems like other cases.
         qs = index.get_queryset().filter(**extra_lookup_kwargs).order_by(model._meta.pk.name)
         total = qs.count()
         
         if self.verbosity >= 1:
             print "Indexing %d %s." % (total, smart_str(model._meta.verbose_name_plural))
         
         pks_seen = set()
         
         for start in range(0, total, self.batchsize):
             end = min(start + self.batchsize, total)
             
             # Get a clone of the QuerySet so that the cache doesn't bloat up
             # in memory. Useful when reindexing large amounts of data.
             small_cache_qs = qs.all()
             current_qs = small_cache_qs[start:end]
             
             for obj in current_qs:
                 pks_seen.add(smart_str(obj.pk))
             
             if self.verbosity >= 2:
                 print "  indexing %s - %d of %d." % (start+1, end, total)
             
             index.backend.update(index, current_qs)
             
             # Clear out the DB connections queries because it bloats up RAM.
             reset_queries()
         
         if self.remove:
             if self.age or total <= 0:
                 # They're using a reduced set, which may not incorporate
                 # all pks. Rebuild the list with everything.
                 pks_seen = set()
                 qs = index.get_queryset().values_list('pk', flat=True)
                 total = qs.count()
                 
                 for pk in qs:
                     pks_seen.add(smart_str(pk))
             
             for start in range(0, total, self.batchsize):
                 upper_bound = start + self.batchsize
                 
                 # Fetch a list of results.
                 # Can't do pk range, because id's are strings (thanks comments
                 # & UUIDs!).
                 stuff_in_the_index = SearchQuerySet().models(model)[start:upper_bound]
                 
                 # Iterate over those results.
                 for result in stuff_in_the_index:
                     # Be careful not to hit the DB.
                     if not smart_str(result.pk) in pks_seen:
                         # The id is NOT in the small_cache_qs, issue a delete.
                         if self.verbosity >= 2:
                             print "  removing %s." % result.pk
                         
                         index.backend.remove(".".join([result.app_label, result.model_name, result.pk]))