Ejemplo n.º 1
0
    def segment_units(self):
        self.segunits = []
        
        self.init_variant_patterns()

        t0 = datetime.now()

        # TODO: derive the info from the faceted_search settings.py or from a new
        # settings variable.

        # arguments
        args = self.options
        # unpack hilite
        args['hilite'] = args.get('hilite', '').split(',')
        args['hilite_groups'] = [self.get_group_key_from_pattern_key(self.get_pattern_from_id(pid)['key']) for pid in args['hilite'] if pid and self.get_pattern_from_id(pid)]
        args['ignore'] = args.get('ignore', '')
        args['exclude'] = args.get('exclude', '')
        
        args['ulimit'] = dputils.get_int(args.get('ulimit', 10), 10)
        args['urange'] = args.get('urange', '')

        # Get the text units
        hand_filters.chrono('units:')
        self.stats = stats = {'duration_segmentation': 0, 'range_size': 0, 'patterns': {}, 'groups': {}}
        for pattern in self.get_patterns():
            group = re.sub(ur'-\d+$', '', pattern['key'])
            self.stats['groups'][group] = 0

        for unit in self.get_unit_model().objects.filter(content_xml__id=4).iterator():
            # only fief
            types = unit.get_entry_type()
            if not types or 'F' not in types: continue

            # only selected range
            if not dputils.is_unit_in_range(unit.unitid, args['urange']): continue

            stats['range_size'] += 1

            # segment the unit
            self.segment_unit(unit, args)

            if unit.match_conditions:
                self.segunits.append(unit)

        hand_filters.chrono(':units')

        self.variants = [{'text': variant, 'hits': self.variants[variant]} for variant in sorted(self.variants.keys())]

        # stats
        stats['result_size'] = len(self.segunits)
        stats['result_size_pc'] = int(100.0 * stats['result_size'] / stats['range_size']) if stats['range_size'] else 'N/A'

        # limit size of returned result
        if args['ulimit'] > 0:
            self.segunits = self.segunits[0:args['ulimit']]

        stats['duration_segmentation'] = (datetime.now() - t0).total_seconds()
Ejemplo n.º 2
0
    def input(self, **kwargs):
        """Run the parent filter's input(), then CssAbsoluteFilter on its output.

        The keyword arguments are passed to the parent class unchanged; for
        the CssAbsoluteFilter step 'filename' is replaced by
        ``self.init_filename``.  Returns whatever CssAbsoluteFilter.input()
        returns.  Timing marks are emitted through hand_filters.chrono().
        """
        hand_filters.chrono('input:')

        # LESSC
        compiled = super(LessAndCssAbsoluteFilter, self).input(**kwargs)

        self.validate_input()

        # CssAbsoluteFilter
        # (kwargs are logged before 'filename' is overridden)
        hand_filters.chrono('\t %s' % repr(kwargs))
        kwargs['filename'] = self.init_filename
        absolute_filter = CssAbsoluteFilter(compiled)
        result = absolute_filter.input(**kwargs)

        hand_filters.chrono(':input')

        return result
Ejemplo n.º 3
0
    def input(self, **kwargs):
        """Chain two filters: the parent class' input() and CssAbsoluteFilter.

        1. the parent implementation is called with the received kwargs;
        2. self.validate_input() is called;
        3. the output of step 1 is given to a CssAbsoluteFilter, whose
           input() is called with the same kwargs except 'filename', which
           is set to self.init_filename.

        The result of step 3 is returned.
        """
        hand_filters.chrono('input:')

        # LESSC
        less_output = super(LessAndCssAbsoluteFilter, self).input(**kwargs)

        self.validate_input()

        # CssAbsoluteFilter
        # log the arguments as received, i.e. before changing 'filename'
        received_kwargs = repr(kwargs)
        hand_filters.chrono('\t %s' % received_kwargs)
        kwargs['filename'] = self.init_filename
        output = CssAbsoluteFilter(less_output).input(**kwargs)

        hand_filters.chrono(':input')

        return output
Ejemplo n.º 4
0
    def segment_units(self):
        self.segunits = []

        self.init_variant_patterns()

        t0 = datetime.now()

        # TODO: derive the info from the faceted_search settings.py or from a new
        # settings variable.

        # arguments
        args = self.options
        # unpack hilite
        args['hilite'] = args.get('hilite', '').split(',')
        args['hilite_groups'] = [
            self.get_group_key_from_pattern_key(
                self.get_pattern_from_id(pid)['key']) for pid in args['hilite']
            if pid and self.get_pattern_from_id(pid)
        ]
        args['ignore'] = args.get('ignore', '')
        args['exclude'] = args.get('exclude', '')

        args['ulimit'] = dputils.get_int(args.get('ulimit', 10), 10)
        args['urange'] = args.get('urange', '')

        # Get the text units
        hand_filters.chrono('units:')
        self.stats = stats = {
            'duration_segmentation': 0,
            'range_size': 0,
            'patterns': {},
            'groups': {}
        }
        for pattern in self.get_patterns():
            group = re.sub(ur'-\d+$', '', pattern['key'])
            self.stats['groups'][group] = 0

        for unit in self.get_unit_model().objects.filter(
                content_xml__id=4).iterator():
            # only fief
            types = unit.get_entry_type()
            if not types or 'F' not in types:
                continue

            # only selected range
            if not dputils.is_unit_in_range(unit.unitid, args['urange']):
                continue

            stats['range_size'] += 1

            # segment the unit
            self.segment_unit(unit, args)

            if unit.match_conditions:
                self.segunits.append(unit)

        hand_filters.chrono(':units')

        self.variants = [{
            'text': variant,
            'hits': self.variants[variant]
        } for variant in sorted(self.variants.keys())]

        # stats
        stats['result_size'] = len(self.segunits)
        stats['result_size_pc'] = int(
            100.0 * stats['result_size'] /
            stats['range_size']) if stats['range_size'] else 'N/A'

        # limit size of returned result
        if args['ulimit'] > 0:
            self.segunits = self.segunits[0:args['ulimit']]

        stats['duration_segmentation'] = (datetime.now() - t0).total_seconds()
Ejemplo n.º 5
0
def set_search_results_to_context(request, context=None, allowed_type=None, show_advanced_search_form=False):
    ''' Read the information posted through the search form and create the queryset
        for each relevant type of content (e.g. MS, Hand) => context['results']

        If the form was not valid or submitted, context['results'] is left undefined.

        Other context variables used by the search template are also set.

        Parameters:
            request: the current request; the search parameters are read
                from request.GET.
            context: dictionary which is updated in place. When omitted a new
                dictionary is created for this call.
            allowed_type: this variable is used to restrict the search to one
                content type only (its key). This is useful when we display a
                specific record page and we only have to search for the
                related content type to show the previous/next links.
                None means that all the content types are searched.
            show_advanced_search_form: if True, the search form is added to
                the context as context['advanced_search_form'].
    '''

    # A `{}` default would be created once and shared by every call which
    # omits the argument; as this function writes into the dictionary, values
    # from one call would leak into the next one.
    if context is None:
        context = {}

    context['terms'] = ''

    # pagination sizes
    set_page_sizes_to_context(request, context)

    # list of query parameter/form fields which can be changed without
    # triggering a search
    non_search_params = ['basic_search_type', 'from_link', 'result_type']
    # the search is considered submitted as soon as any other parameter
    # has a non-empty value
    context['submitted'] = any(
        param not in non_search_params and request.GET.get(param)
        for param in request.GET
    )

    context['can_edit'] = has_edit_permission(request, Hand)
    context['types'] = get_search_types(request)

    context['annotation_mode'] = request.GET.get('am', '0')

    context['view'] = request.GET.get('view', '')
    for content_type in context['types']:
        content_type.set_desired_view(context['view'])
        content_type.set_page_size(context['page_size'])

    context['search_types_display'] = get_search_types_display(
        context['types'])
    context['is_empty'] = True

    advanced_search_form = SearchPageForm(request.GET)

    advanced_search_form.fields['basic_search_type'].choices = [
        (content_type.key, content_type.label)
        for content_type in context['types']
    ]

    if show_advanced_search_form:
        context['advanced_search_form'] = advanced_search_form

    if advanced_search_form.is_valid():
        # Read the inputs
        # - term
        term = advanced_search_form.cleaned_data['terms']
        context['terms'] = term or ' '
        type_forms = [content_type.get_form(request)
                      for content_type in context['types']]
        context['query_summary'], context['query_summary_interactive'] = get_query_summary(
            request, term, context['submitted'], type_forms)

        # - search type
        context['search_type'] = advanced_search_form.cleaned_data['basic_search_type']

        # same as search_type but falls back to the first type (or '?')
        # when the form did not specify one
        context['search_type_defaulted'] = context['search_type']
        if not context['search_type_defaulted']:
            if context['types']:
                context['search_type_defaulted'] = context['types'][0].key
            else:
                context['search_type_defaulted'] = '?'

        has_result = False

        if context['submitted']:
            # Create the queryset for each allowed content type.
            # If allowed_type is None, search for each supported content type.
            # Note that context['results'] is overwritten at each iteration,
            # so it ends up holding the value returned for the last type.
            for content_type in context['types']:
                if allowed_type in [None, content_type.key]:
                    hand_filters.chrono('Search %s:' % content_type.key)
                    context['results'] = content_type.build_queryset(
                        request, term, not has_result)
                    # explicit comparison kept on purpose: an undefined
                    # (e.g. None) is_empty does not count as a result
                    if content_type.is_empty == False:
                        has_result = True
                    hand_filters.chrono(':Search %s' % content_type.key)
Ejemplo n.º 6
0
def search_record_view(request):
    '''View for the search page.

    In order:
    * returns the response of reroute_to_static_search() if there is one;
    * redirects old-style record URLs (?id=...&result_type=...) to the
      record page;
    * maps old query string parameter names to the current ones;
    * runs the searches (set_search_results_to_context) and selects the
      result tab to display;
    * logs the request and renders 'search/search_record.html'.
    '''
    ret = reroute_to_static_search(request)
    if ret:
        return ret

    hand_filters.chrono('SEARCH VIEW:')
    hand_filters.chrono('SEARCH LOGIC:')

    # Backward compatibility.
    # Previously all the record pages would go through this search URL and view
    # and their URL was:
    #     /digipal/search/?id=1&result_type=scribes&basic_search_type=hands&terms=Wulfstan
    # Now we redirect those requests to the record page
    #     /digipal/scribes/1/?basic_search_type=hands&terms=Wulfstan+&result_type=scribes
    qs_id = request.GET.get('id', '')
    qs_result_type = request.GET.get('result_type', '')
    if qs_id and qs_result_type:
        from django.shortcuts import redirect
        # TODO: get digipal from current project name or current URL
        redirect_url = '/%s/%s/%s/?%s' % (
            'digipal', qs_result_type, qs_id, request.META['QUERY_STRING'])
        return redirect(redirect_url)

    # backward compatibility:
    # query string param 'name' and 'scribes' have been renamed to 'scribe'
    # (request.GET is copied first so that it can be modified)
    request.GET = request.GET.copy()
    request.GET['scribe'] = request.GET.get('scribe', '') or request.GET.get(
        'scribes', '') or request.GET.get('name', '')

    # 'date' is used as a fallback for all the specific date parameters
    request.GET['ms_date'] = request.GET.get(
        'ms_date', '') or request.GET.get('date', '')
    request.GET['hand_date'] = request.GET.get(
        'hand_date', '') or request.GET.get('date', '')
    request.GET['scribe_date'] = request.GET.get(
        'scribe_date', '') or request.GET.get('date', '')

    # 'place' is used as a fallback for all the specific place parameters
    request.GET['hand_place'] = request.GET.get(
        'hand_place', '') or request.GET.get('place', '')
    request.GET['scriptorium'] = request.GET.get(
        'scriptorium', '') or request.GET.get('place', '')

    # Actually run the searches
    context = {}

    context['nofollow'] = True

    set_search_results_to_context(
        request, context=context, show_advanced_search_form=True)

    # check if the search was executed or not (e.g. form not submitted or
    # invalid form)
    if 'results' in context:
        # Tab Selection Logic =
        #     we pick the tab the user has selected even if it is empty. END
        #     if none, we select the filter/advanced search content type
        #     if none or its result is empty we select the first non empty type
        #     if none we select the first type. END
        result_type = request.GET.get('result_type', '')

        # requested result type does not exist => ignore it
        known_keys = [content_type.key for content_type in context['types']]
        if result_type not in known_keys:
            result_type = ''

        if not result_type:
            first_non_empty_type = None
            for content_type in context['types']:
                if content_type.key == context['search_type'] and not content_type.is_empty:
                    result_type = context['search_type']
                    break
                if not first_non_empty_type and not content_type.is_empty:
                    first_non_empty_type = content_type.key
            if not result_type:
                result_type = first_non_empty_type

        result_type = result_type or context['types'][0].key
        context['result_type'] = result_type

        # No result at all?
        if any(not content_type.is_empty for content_type in context['types']):
            context['is_empty'] = False

    from digipal import utils
    context['search_help_url'] = utils.get_cms_url_from_slug(
        getattr(settings, 'SEARCH_HELP_PAGE_SLUG', 'search_help'))

    # Initialise the advanced search forms
    # context['drilldownform'] = GraphSearchForm({'terms': context['terms'] or ''})

    page_options = get_search_page_js_data(
        context['types'], request.GET.get('from_link') in ('true', '1'), request)
    context['expanded_custom_filters'] = page_options['advanced_search_expanded']
    page_options['linked_fields'] = []

    for content_type in context['types']:
        content_type.add_field_links(page_options['linked_fields'])

    context['search_page_options_json'] = json.dumps(page_options)
    # pick the filter form which matches the (defaulted) search type
    for custom_filter in page_options['filters']:
        if custom_filter['key'] == context['search_type_defaulted']:
            context['filters_form'] = custom_filter

    # log the request with the total number of hits across all types
    from digipal.models import RequestLog
    RequestLog.save_request(request, sum(
        content_type.count for content_type in context['types']))

    hand_filters.chrono(':SEARCH LOGIC')
    hand_filters.chrono('SEARCH TEMPLATE:')

    ret = render_to_response('search/search_record.html',
                             context, context_instance=RequestContext(request))

    hand_filters.chrono(':SEARCH TEMPLATE')

    hand_filters.chrono(':SEARCH VIEW')

    return ret
Ejemplo n.º 7
0
def search_ms_image_view(request):
    '''View for the Browse Image page.

    The images are filtered by the optional 'town_or_city', 'repository' and
    'date' query string parameters, restricted to those attached to an item
    part and to those allowed by Image.filter_permissions_from_request(),
    then annotated with their number of hands and sorted by locus.
    Renders 'search/search_ms_image.html'.
    '''

    hand_filters.chrono('BROWSE:')

    from digipal.utils import request_invisible_model
    request_invisible_model('image', request, 'Image')

    hand_filters.chrono('search:')
    hand_filters.chrono('all():')
    images = Image.objects.all()
    hand_filters.chrono(':all()')

    from digipal.forms import FilterManuscriptsImages

    # Get Buttons
    context = {}

    context['view'] = request.GET.get('view', 'images')

    town_or_city = request.GET.get('town_or_city', '')
    repository = request.GET.get('repository', '')
    date = request.GET.get('date', '')

    set_page_sizes_to_context(request, context, [12, 20, 40, 100])

    # Applying filters
    if town_or_city:
        images = images.filter(
            item_part__current_item__repository__place__name=town_or_city)
    if repository:
        # repo is in two parts: repo place, repo name (e.g. cambridge, corpus christi college)
        # but we also support old style URL which have only the name of the repo
        # if we don't, crawlers like Googlebot could receive a 500 error (see
        # JIRA DIGIPAL-483)
        # Note: split() always returns at least one part, so the last part
        # (the repo name) is always available.
        repo_parts = [p.strip() for p in repository.split(',')]
        images = images.filter(
            item_part__current_item__repository__name=repo_parts[-1])
        if len(repo_parts) > 1:
            images = images.filter(
                item_part__current_item__repository__place__name=repo_parts[0])
    if date:
        images = images.filter(hands__assigned_date__date=date)

    images = images.filter(item_part_id__gt=0)
    # not sufficient, see JIRA #552
    # images = Image.sort_query_set_by_locus(images)

    # permission filter
    # OPT: on DigiPal prefetch_related of annotation_set takes 20s and it retrieves all the fields even
    # related to linked tables (allograph, character, etc.)
    # Same with hands which takes around 2/3 s.
    # images = Image.filter_permissions_from_request(images.prefetch_related('hands', 'annotation_set'), request)
    images = Image.filter_permissions_from_request(images, request)

    hand_filters.chrono(':search')

    # count hands
    hand_filters.chrono('hands:')
    from django.db.models import Count
    context['images'] = Image.sort_query_set_by_locus(images.select_related(
        'item_part__current_item__repository__place').annotate(hand_count=Count('hands')))
    # close the 'hands' section: like the other chrono() pairs in this view
    # ('search:' / ':search', 'template:' / ':template'), the closing label
    # starts with the colon
    hand_filters.chrono(':hands')

    image_search_form = FilterManuscriptsImages(request.GET)
    context['image_search_form'] = image_search_form
    context['query_summary'], context['query_summary_interactive'] = get_query_summary(
        request, '', True, [image_search_form])

    hand_filters.chrono('template:')
    ret = render_to_response('search/search_ms_image.html',
                             context, context_instance=RequestContext(request))
    hand_filters.chrono(':template')

    hand_filters.chrono(':BROWSE')

    return ret
Ejemplo n.º 8
0
def set_search_results_to_context(request, context=None, allowed_type=None, show_advanced_search_form=False):
    """ Read the information posted through the search form and create the queryset
        for each relevant type of content (e.g. MS, Hand) => context['results']

        If the form was not valid or submitted, context['results'] is left undefined.

        Other context variables used by the search template are also set.

        Parameters:
            request: the current request; the search parameters are read
                from request.GET.
            context: dictionary which is updated in place. When omitted a new
                dictionary is created for this call.
            allowed_type: this variable is used to restrict the search to one
                content type only (its key). This is useful when we display a
                specific record page and we only have to search for the
                related content type to show the previous/next links.
                None means that all the content types are searched.
            show_advanced_search_form: if True, the search form is added to
                the context as context['advanced_search_form'].
    """

    # A `{}` default would be created once and shared by every call which
    # omits the argument; as this function writes into the dictionary, values
    # from one call would leak into the next one.
    if context is None:
        context = {}

    context["terms"] = ""

    # pagination sizes
    set_page_sizes_to_context(request, context)

    # list of query parameter/form fields which can be changed without triggering a search
    non_search_params = ["basic_search_type", "from_link", "result_type"]
    # the search is considered submitted as soon as any other parameter
    # has a non-empty value
    context["submitted"] = any(
        param not in non_search_params and request.GET.get(param) for param in request.GET
    )

    context["can_edit"] = has_edit_permission(request, Hand)
    context["types"] = get_search_types(request)

    context["annotation_mode"] = request.GET.get("am", "0")

    context["view"] = request.GET.get("view", "")
    for content_type in context["types"]:
        content_type.set_desired_view(context["view"])
        content_type.set_page_size(context["page_size"])

    context["search_types_display"] = get_search_types_display(context["types"])
    context["is_empty"] = True

    advanced_search_form = SearchPageForm(request.GET)

    advanced_search_form.fields["basic_search_type"].choices = [
        (content_type.key, content_type.label) for content_type in context["types"]
    ]

    if show_advanced_search_form:
        context["advanced_search_form"] = advanced_search_form

    if advanced_search_form.is_valid():
        # Read the inputs
        # - term
        term = advanced_search_form.cleaned_data["terms"]
        context["terms"] = term or " "
        type_forms = [content_type.get_form(request) for content_type in context["types"]]
        context["query_summary"], context["query_summary_interactive"] = get_query_summary(
            request, term, context["submitted"], type_forms
        )

        # - search type
        context["search_type"] = advanced_search_form.cleaned_data["basic_search_type"]

        # same as search_type but falls back to the first type (or '?')
        # when the form did not specify one
        context["search_type_defaulted"] = context["search_type"]
        if not context["search_type_defaulted"]:
            if context["types"]:
                context["search_type_defaulted"] = context["types"][0].key
            else:
                context["search_type_defaulted"] = "?"

        has_result = False

        if context["submitted"]:
            # Create the queryset for each allowed content type.
            # If allowed_type is None, search for each supported content type.
            # Note that context['results'] is overwritten at each iteration,
            # so it ends up holding the value returned for the last type.
            for content_type in context["types"]:
                if allowed_type in [None, content_type.key]:
                    hand_filters.chrono("Search %s:" % content_type.key)
                    context["results"] = content_type.build_queryset(request, term, not has_result)
                    # explicit comparison kept on purpose: an undefined
                    # (e.g. None) is_empty does not count as a result
                    if content_type.is_empty == False:
                        has_result = True
                    hand_filters.chrono(":Search %s" % content_type.key)
Ejemplo n.º 9
0
def search_record_view(request):
    """View for the search page.

    In order:
    * returns the response of reroute_to_static_search() if there is one;
    * redirects old-style record URLs (?id=...&result_type=...) to the
      record page;
    * maps old request parameter names to the current ones;
    * runs the searches (set_search_results_to_context) and selects the
      result tab to display;
    * logs the request and renders 'search/search_record.html'.
    """
    ret = reroute_to_static_search(request)
    if ret:
        return ret

    hand_filters.chrono("SEARCH VIEW:")
    hand_filters.chrono("SEARCH LOGIC:")

    # Backward compatibility.
    # Previously all the record pages would go through this search URL and view
    # and their URL was:
    #     /digipal/search/?id=1&result_type=scribes&basic_search_type=hands&terms=Wulfstan
    # Now we redirect those requests to the record page
    #     /digipal/scribes/1/?basic_search_type=hands&terms=Wulfstan+&result_type=scribes
    qs_id = request.GET.get("id", "")
    qs_result_type = request.GET.get("result_type", "")
    if qs_id and qs_result_type:
        from django.shortcuts import redirect

        # TODO: get digipal from current project name or current URL
        redirect_url = "/%s/%s/%s/?%s" % ("digipal", qs_result_type, qs_id, request.META["QUERY_STRING"])
        return redirect(redirect_url)

    # backward compatibility:
    # query string param 'name' and 'scribes' have been renamed to 'scribe'
    # (request.GET is copied first so that it can be modified)
    #
    # NOTE(review): request.REQUEST was deprecated in Django 1.7 and removed
    # in 1.9. It is kept here to preserve the current behaviour; switching to
    # request.GET would be needed before upgrading Django - confirm that no
    # caller relies on POST data for these parameters.
    request.GET = request.GET.copy()
    request.GET["scribe"] = (
        request.REQUEST.get("scribe", "") or request.REQUEST.get("scribes", "") or request.REQUEST.get("name", "")
    )

    # 'date' is used as a fallback for all the specific date parameters
    request.GET["ms_date"] = request.REQUEST.get("ms_date", "") or request.REQUEST.get("date", "")
    request.GET["hand_date"] = request.REQUEST.get("hand_date", "") or request.REQUEST.get("date", "")
    request.GET["scribe_date"] = request.REQUEST.get("scribe_date", "") or request.REQUEST.get("date", "")

    # 'place' is used as a fallback for all the specific place parameters
    request.GET["hand_place"] = request.REQUEST.get("hand_place", "") or request.REQUEST.get("place", "")
    request.GET["scriptorium"] = request.REQUEST.get("scriptorium", "") or request.REQUEST.get("place", "")

    # Actually run the searches
    context = {}

    context["nofollow"] = True

    set_search_results_to_context(request, context=context, show_advanced_search_form=True)

    # check if the search was executed or not (e.g. form not submitted or invalid form)
    if "results" in context:
        # Tab Selection Logic =
        #     we pick the tab the user has selected even if it is empty. END
        #     if none, we select the filter/advanced search content type
        #     if none or its result is empty we select the first non empty type
        #     if none we select the first type. END
        result_type = request.GET.get("result_type", "")

        # requested result type does not exist => ignore it
        known_keys = [content_type.key for content_type in context["types"]]
        if result_type not in known_keys:
            result_type = ""

        if not result_type:
            first_non_empty_type = None
            for content_type in context["types"]:
                if content_type.key == context["search_type"] and not content_type.is_empty:
                    result_type = context["search_type"]
                    break
                if not first_non_empty_type and not content_type.is_empty:
                    first_non_empty_type = content_type.key
            if not result_type:
                result_type = first_non_empty_type

        result_type = result_type or context["types"][0].key
        context["result_type"] = result_type

        # No result at all?
        if any(not content_type.is_empty for content_type in context["types"]):
            context["is_empty"] = False

    from digipal import utils

    context["search_help_url"] = utils.get_cms_url_from_slug(getattr(settings, "SEARCH_HELP_PAGE_SLUG", "search_help"))

    # Initialise the advanced search forms
    # context['drilldownform'] = GraphSearchForm({'terms': context['terms'] or ''})

    page_options = get_search_page_js_data(context["types"], request.GET.get("from_link") in ("true", "1"), request)
    context["expanded_custom_filters"] = page_options["advanced_search_expanded"]
    page_options["linked_fields"] = []

    for content_type in context["types"]:
        content_type.add_field_links(page_options["linked_fields"])

    context["search_page_options_json"] = json.dumps(page_options)
    # pick the filter form which matches the (defaulted) search type
    for custom_filter in page_options["filters"]:
        if custom_filter["key"] == context["search_type_defaulted"]:
            context["filters_form"] = custom_filter

    # log the request with the total number of hits across all types
    from digipal.models import RequestLog

    RequestLog.save_request(request, sum(content_type.count for content_type in context["types"]))

    hand_filters.chrono(":SEARCH LOGIC")
    hand_filters.chrono("SEARCH TEMPLATE:")

    ret = render_to_response("search/search_record.html", context, context_instance=RequestContext(request))

    hand_filters.chrono(":SEARCH TEMPLATE")

    hand_filters.chrono(":SEARCH VIEW")

    return ret
Ejemplo n.º 10
0
    def populate_index(self, ct, index=None):
        """Add all the records of a content type to its Whoosh index.

        Parameters:
            ct: the content type object. It supplies the value rankings, the
                records (get_all_records), the optional 'condition' option,
                the Whoosh index (get_whoosh_index) and the conversion of a
                record into a document (get_document_from_record).
            index: the value received is never read by this method; the
                variable is reassigned from ct.get_whoosh_index() before
                each batch of records.

        Progress is reported with self.write_state_update(): up to 1/3 while
        the rankings are prepared, then from 1/3 to 2/3 while the records are
        indexed. Messages are also printed on the standard output.
        """
        chrono("POPULATE_INDEX:")

        # Add documents to the index
        print "\tgenerate sort rankings"

        chrono("RANK_VALUES:")
        # the ranking progress is scaled to the first third of the whole
        # task, with a minimum of 0.001
        ct.prepare_value_rankings(
            callback=lambda progress: self.write_state_update(ct, max(0.001, 1.0 / 3.0 * progress))
        )
        chrono(":RANK_VALUES")

        chrono("INDEXING QUERY:")
        print "\tretrieve all records"
        dputils.gc_collect()

        # NOTE(review): BufferedWriter is imported but not used in this
        # method (see the comment about it in the loop below)
        from whoosh.writing import BufferedWriter

        rcs = ct.get_all_records(True)
        record_count = rcs.count()

        # created lazily, on the first record of each batch
        writer = None

        chrono(":INDEXING QUERY")

        print "\tadd records to index"

        # i: number of records read so far (including the skipped ones)
        i = 0
        # number of records between two commits
        commit_size = 500
        # number of records between two progress updates
        progress_size = 200

        # settings.DEV_SERVER = True
        chrono("INDEXING:")
        chrono("First record:")

        # optional callable; a record is indexed only if it returns a true value
        record_condition = ct.get_option("condition", None)

        pbar = dputils.ProgressBar(record_count)

        # Indexing can use n x 100 MB
        # Which can be excessive for small VMs
        # One technique is to create small, independent index segments
        # Then optimise them outside this fct on a separate index
        for record in rcs.iterator():
            if i == 0:
                chrono(":First record")
            pbar.update(i + 1)

            # start of a new batch (this is also true for the very first record)
            if (i % commit_size) == 0:
                # we have to commit every x document otherwise the memory saturates on the VM
                # BufferedWriter is buggy and will crash after a few 100x docs
                if writer:
                    writer.commit(merge=False)

                # we have to recreate after commit because commit unlock index
                # (the references are dropped before the garbage collection)
                writer = None
                index = None
                dputils.gc_collect()

                index = ct.get_whoosh_index()
                writer = index.writer()

            # incremented before the condition is tested, so the skipped
            # records are counted as well
            i += 1

            # skipped records are not added to the index; the progress
            # update below is not reached for them either
            if record_condition and not record_condition(record):
                continue

            writer.add_document(**ct.get_document_from_record(record))

            if (i % progress_size) == 0:
                # indexing progress is mapped to the range 1/3 .. 2/3
                self.write_state_update(ct, (1 + 1.0 * i / record_count) * 1.0 / 3)

        # commit the last batch (writer is None if there was no record)
        if writer:
            writer.commit(merge=False)
        # rcs = None
        # ct.clear_value_rankings()

        pbar.complete()
        chrono(":INDEXING")

        print "\n"

        chrono(":POPULATE_INDEX")

        print "\tdone (%s records)" % record_count
Ejemplo n.º 11
0
    def _build_queryset(self, request, term):
        """ View for Hand record drill-down """
        context = {}
        self.graphs_count = 0
        
        undefined = u''
        
        scribe = request.GET.get('scribe', undefined)
        # alternative names are for backward compatibility with old-style graph search page  
        script = request.GET.get('script', undefined)
        chartype = request.GET.get('chartype', undefined)
        character = request.GET.get('character', undefined)
        allograph = request.GET.get('allograph', undefined)
        component = request.GET.get('component', undefined)
        feature = request.GET.get('feature', undefined)
        repository = request.GET.get('repository', undefined)
        index = request.GET.get('index', undefined)
        
        
        excluded_images = None
        from digipal.utils import is_staff
        if not is_staff(request):
            excluded_images = Image.filter_permissions(Image.objects.all(), [MediaPermission.PERM_PRIVATE])
        
        none = u'-1'
        one_or_more = u'-2'
        
        from datetime import datetime
        
        t0 = datetime.now()
        t4 = datetime.now()
        
        wheres = []

        if self.search_hands:
            graphs = Graph.objects.filter(hand__id__in=self.search_hands.queryset)
        else:
        
            # .order_by('item_part__current_item__repository__name', 'item_part__current_item__shelfmark', 'descriptions__description','id')
            # Although we are listing hands on the front-end, we search for graphs and not for hand.
            # Two reasons: 
            #    searching for character and allograh at the same time through a Hand model would generate two separate joins to graph
            #        this would bring potentially invalid results and it is also much slower
            #    it is faster than excluding all the hands without a graph (yet another expensive join)
            #
            if term:
                term = term.replace('"', '')
                graphs = Graph.objects.filter(
                        Q(hand__descriptions__description__icontains=term) | \
                        Q(hand__scribe__name__icontains=term) | \
                        Q(hand__assigned_place__name__icontains=term) | \
                        Q(hand__assigned_date__date__icontains=term) | \
                        Q(hand__item_part__current_item__shelfmark__icontains=term) | \
                        Q(hand__item_part__current_item__repository__name__icontains=term) | \
                        Q(hand__item_part__current_item__repository__place__name__icontains=term) | \
                        Q(hand__item_part__historical_items__catalogue_number__icontains=term) | \
                        # JIRA 423
                        Q(hand__item_part__historical_items__name__icontains=term) | \
                        Q(hand__item_part__group__historical_items__name__icontains=term) | \
                        Q(hand__item_part__display_label__icontains=term) | \
                        Q(hand__item_part__group__display_label__icontains=term)
                        )
            else:
                graphs = Graph.objects.all()
                
            t1 = datetime.now()
            
            if index:
                graphs = graphs.filter(hand__item_part__historical_items__catalogue_number__iexact=index)
            if repository:
                matches = re.match(ur'^([^,]+?),([^,]+)$', repository)
                if matches:
                    graphs = graphs.filter(Q(hand__item_part__current_item__repository__place__name__iexact=matches.group(1).strip()) & Q(hand__item_part__current_item__repository__name__iexact=matches.group(2).strip()))
            if scribe:
                graphs = graphs.filter(hand__scribe__name__icontains=scribe)
            if script:
                graphs = graphs.filter(hand__script__name=script)
        
        if chartype:
            graphs = graphs.filter(idiograph__allograph__character__ontograph__ontograph_type__name=chartype)
        if character:
            graphs = graphs.filter(idiograph__allograph__character__name=character)
        if allograph:
            graphs = graphs.filter(idiograph__allograph__name=allograph)

        # we discard freak graph records (i.e. without annotation) to prevent errors further down the line. 
        graphs = graphs.filter(annotation__isnull=False)
        
        # if the user is not logged in we exclude graphs where the allograph is hidden
        from digipal.models import has_edit_permission
        if not has_edit_permission(request, self.get_model()):
            graphs = graphs.exclude(idiograph__allograph__hidden=True)
            
        # exclude private images
        if excluded_images and excluded_images.count():
            graphs = graphs.exclude(annotation__image__in=excluded_images)
        
        # condition on component
        if component:
            component_where = Q(graph_components__component__name=component)
            if feature in [undefined, none]:
                # If no feature is specified we find all the graph which are supposed to have a component
                # according to their idiograph
                component_where = component_where | Q(idiograph__allograph__allograph_components__component__name=component)
            wheres.append(component_where)

        # condition on feature
        if feature not in [undefined, none, one_or_more]:
            wheres.append(Q(graph_components__features__name=feature))
        if feature in [one_or_more]:
            wheres.append(Q(graph_components__features__id__isnull=False))

        # ANDs all the Q() where clauses together
        if wheres:
            where_and = wheres.pop(0)
            for where in wheres:
                where_and = where_and & where    
            
            graphs = graphs.filter(where_and)
        
        # Treat the feature=none case 
        if feature == none:
            excluded_q = Q(graph_components__features__id__isnull=False)
            if component:
                excluded_q = excluded_q & Q(graph_components__component__name=component)
            excluded_graphs = Graph.objects.filter(excluded_q)
            graphs = graphs.exclude(id__in=excluded_graphs.values_list('id', flat=True))
        
        from digipal.utils import set_left_joins_in_queryset, get_str_from_queryset
        set_left_joins_in_queryset(graphs)
        #print get_str_from_queryset(graphs)
        
        t2 = datetime.now()
    
        # Get the graphs then id of all the related Hands
        # We use values_list because it is much faster, we don't need to fetch all the Hands at this stage
        # That will be done after pagination in the template
        # Distinct is needed here.
        #graphs = graphs.distinct().order_by('hand__scribe__name', 'hand__id', 'idiograph__allograph__character__ontograph__sort_order')
        chrono('graph filter:')
        graphs = graphs.distinct().order_by('hand__scribe__name', 'hand__id')
        chrono(':graph filter')

        #print graphs.query
        chrono('graph values_list:')
        graph_ids = graphs.values_list('id', 'hand_id')
        chrono(':graph values_list')
        
#        chrono('len:')
#        l = len(graph_ids)
#        print graph_ids.query
#        chrono(':len')
        
        # Build a structure that groups all the graph ids by hand id
        # context['hand_ids'] = [[1, 101, 102], [2, 103, 104]]
        # In the above we have two hands: 1 and 2. For hand 1 we have Graph 101 and 102.
        chrono('hand_ids:')
        context['hand_ids'] = [[0]]
        last = 0
        for g in graph_ids:
            if g[1] != context['hand_ids'][-1][0]:
                context['hand_ids'].append([g[1]])
            context['hand_ids'][-1].append(g[0])
        del(context['hand_ids'][0])
        chrono(':hand_ids')

        t3 = datetime.now()

        self.graphs_count = len(graph_ids)
        
        t4 = datetime.now()
        
        #print 'search %s; hands query: %s + graph count: %s' % (t4 - t0, t3 - t2, t4 - t3)
            
        t5 = datetime.now()
        self._queryset = context['hand_ids']
        
        return self._queryset
Ejemplo n.º 12
0
    def _build_queryset(self, request, term):
        """Build the result of the Hand drill-down search.

        Although hands are listed on the front-end, the search is run on
        Graph records: filtering on character and allograph through the Hand
        model would need two separate joins to graph, which could return
        invalid results and is slower.

        The filters are read from the query string of ``request`` (scribe,
        script, chartype, character, allograph, component, feature,
        repository, index) and combined with the free-text ``term``.

        Side effects: sets ``self.graphs_count`` to the number of matching
        graphs and ``self._queryset`` to the returned value.

        Returns a list of lists grouping the graph ids by hand id, e.g.
        ``[[1, 101, 102], [2, 103, 104]]`` means hand 1 owns graphs 101 and
        102 and hand 2 owns graphs 103 and 104.
        """
        self.graphs_count = 0

        undefined = u''
        # Special values of the 'feature' parameter.
        none = u'-1'
        one_or_more = u'-2'

        params = request.GET
        scribe = params.get('scribe', undefined)
        script = params.get('script', undefined)
        chartype = params.get('chartype', undefined)
        character = params.get('character', undefined)
        allograph = params.get('allograph', undefined)
        component = params.get('component', undefined)
        feature = params.get('feature', undefined)
        repository = params.get('repository', undefined)
        index = params.get('index', undefined)

        # Non-staff users must not see graphs drawn on private images.
        excluded_images = None
        from digipal.utils import is_staff
        if not is_staff(request):
            excluded_images = Image.filter_permissions(
                Image.objects.all(), [MediaPermission.PERM_PRIVATE])

        wheres = []

        if self.search_hands:
            # Reuse the hands already found by the hand search; the term and
            # the hand-level filters below are not applied in this case.
            graphs = Graph.objects.filter(
                hand__id__in=self.search_hands.queryset)
        else:
            if term:
                term = term.replace('"', '')
                graphs = Graph.objects.filter(
                    Q(hand__descriptions__description__icontains=term) |
                    Q(hand__scribe__name__icontains=term) |
                    Q(hand__assigned_place__name__icontains=term) |
                    Q(hand__assigned_date__date__icontains=term) |
                    Q(hand__item_part__current_item__shelfmark__icontains=term) |
                    Q(hand__item_part__current_item__repository__name__icontains=term) |
                    Q(hand__item_part__current_item__repository__place__name__icontains=term) |
                    Q(hand__item_part__historical_items__catalogue_number__icontains=term) |
                    # JIRA 423
                    Q(hand__item_part__historical_items__name__icontains=term) |
                    Q(hand__item_part__group__historical_items__name__icontains=term) |
                    Q(hand__item_part__display_label__icontains=term) |
                    Q(hand__item_part__group__display_label__icontains=term)
                )
            else:
                graphs = Graph.objects.all()

            if index:
                graphs = graphs.filter(
                    hand__item_part__historical_items__catalogue_number__iexact=index)
            if repository:
                # Expected format: "<place>,<repository name>"; any other
                # format is ignored.
                matches = re.match(u'^([^,]+?),([^,]+)$', repository)
                if matches:
                    place_name = matches.group(1).strip()
                    repository_name = matches.group(2).strip()
                    graphs = graphs.filter(
                        Q(hand__item_part__current_item__repository__place__name__iexact=place_name) &
                        Q(hand__item_part__current_item__repository__name__iexact=repository_name))
            if scribe:
                graphs = graphs.filter(hand__scribe__name__icontains=scribe)
            if script:
                graphs = graphs.filter(hand__script__name=script)

        if chartype:
            graphs = graphs.filter(
                idiograph__allograph__character__ontograph__ontograph_type__name=chartype)
        if character:
            graphs = graphs.filter(
                idiograph__allograph__character__name=character)
        if allograph:
            graphs = graphs.filter(idiograph__allograph__name=allograph)

        # Discard freak graph records (i.e. without annotation) to prevent
        # errors further down the line.
        graphs = graphs.filter(annotation__isnull=False)

        # Users without edit permission on the model do not see graphs whose
        # allograph is hidden.
        from digipal.models import has_edit_permission
        if not has_edit_permission(request, self.get_model()):
            graphs = graphs.exclude(idiograph__allograph__hidden=True)

        # Exclude private images.
        if excluded_images and excluded_images.count():
            graphs = graphs.exclude(annotation__image__in=excluded_images)

        # Condition on component.
        if component:
            component_where = Q(graph_components__component__name=component)
            if feature in [undefined, none]:
                # If no feature is specified we find all the graphs which are
                # supposed to have the component according to their idiograph.
                component_where = component_where | Q(
                    idiograph__allograph__allograph_components__component__name=component)
            wheres.append(component_where)

        # Condition on feature.
        if feature not in [undefined, none, one_or_more]:
            wheres.append(Q(graph_components__features__name=feature))
        if feature in [one_or_more]:
            wheres.append(Q(graph_components__features__id__isnull=False))

        # AND all the Q() where clauses together and apply them in a single
        # filter() call.
        if wheres:
            where_and = wheres[0]
            for where in wheres[1:]:
                where_and = where_and & where
            graphs = graphs.filter(where_and)

        # feature == none: drop every graph which has at least one feature
        # (on the requested component, if one was given).
        if feature == none:
            excluded_q = Q(graph_components__features__id__isnull=False)
            if component:
                excluded_q = excluded_q & Q(
                    graph_components__component__name=component)
            excluded_graphs = Graph.objects.filter(excluded_q)
            graphs = graphs.exclude(
                id__in=excluded_graphs.values_list('id', flat=True))

        from digipal.utils import set_left_joins_in_queryset
        set_left_joins_in_queryset(graphs)

        # Get the graph ids and the id of their Hand.
        # We use values_list because it is much faster; we don't need to fetch
        # the Hands at this stage, that is done after pagination in the
        # template. Distinct is needed here.
        chrono('graph filter:')
        graphs = graphs.distinct().order_by('hand__scribe__name', 'hand__id')
        chrono(':graph filter')

        chrono('graph values_list:')
        graph_ids = graphs.values_list('id', 'hand_id')
        chrono(':graph values_list')

        # Group the graph ids by hand id. The rows are ordered by hand, so a
        # new group starts each time the hand id changes.
        chrono('hand_ids:')
        hand_ids = []
        for graph_id, hand_id in graph_ids:
            if not hand_ids or hand_ids[-1][0] != hand_id:
                hand_ids.append([hand_id])
            hand_ids[-1].append(graph_id)
        chrono(':hand_ids')

        self.graphs_count = len(graph_ids)

        self._queryset = hand_ids

        return self._queryset
Ejemplo n.º 13
0
def set_search_results_to_context(request,
                                  context=None,
                                  allowed_type=None,
                                  show_advanced_search_form=False):
    ''' Read the information posted through the search form and create the queryset
        for each relevant type of content (e.g. MS, Hand) => context['results']

        If the form was not valid or submitted, context['results'] is left undefined.

        Other context variables used by the search template are also set.

        request: the current request; the search parameters are read from
            request.GET.
        context: the dictionary to fill in. It is modified in place and
            nothing is returned, so callers who need the results must pass
            their own dictionary.
        allowed_type: key of the only content type to search, or None to
            search every supported content type. This is useful when we
            display a specific record page and we only have to search for the
            related content type to show the previous/next links.
        show_advanced_search_form: if True the search form is added to the
            context as 'advanced_search_form'.
    '''

    # A fresh dictionary per call: a mutable default would be shared by all
    # the calls and leak entries (e.g. 'results') from one call to the next.
    if context is None:
        context = {}

    context['terms'] = ''

    # pagination sizes
    set_page_sizes_to_context(request, context)

    # The search is considered submitted as soon as one non-empty query
    # parameter is not in the list of parameters/form fields which can be
    # changed without triggering a search.
    non_search_params = ['basic_search_type', 'from_link', 'result_type']
    context['submitted'] = any(
        param not in non_search_params and request.GET.get(param)
        for param in request.GET)

    context['can_edit'] = has_edit_permission(request, Hand)
    context['types'] = get_search_types(request)

    context['annotation_mode'] = request.GET.get('am', '0')

    context['view'] = request.GET.get('view', '')
    for content_type in context['types']:
        content_type.set_desired_view(context['view'])
        content_type.set_page_size(context['page_size'])

    context['search_types_display'] = get_search_types_display(
        context['types'])
    context['is_empty'] = True

    advanced_search_form = SearchPageForm(request.GET)

    advanced_search_form.fields['basic_search_type'].choices = [
        (content_type.key, content_type.label)
        for content_type in context['types']
    ]

    if show_advanced_search_form:
        context['advanced_search_form'] = advanced_search_form

    if advanced_search_form.is_valid():
        # Read the inputs
        # - term
        term = advanced_search_form.cleaned_data['terms']
        context['terms'] = term or ' '
        context['query_summary'], context[
            'query_summary_interactive'] = get_query_summary(
                request, term, context['submitted'],
                [content_type.get_form(request)
                 for content_type in context['types']])

        # - search type
        context['search_type'] = advanced_search_form.cleaned_data[
            'basic_search_type']

        # Same as search_type but falls back to the first available type
        # (or '?' if there is none) when no type was selected.
        context['search_type_defaulted'] = context['search_type']
        if not context['search_type_defaulted']:
            if context['types']:
                context['search_type_defaulted'] = context['types'][0].key
            else:
                context['search_type_defaulted'] = '?'

        has_result = False

        if context['submitted']:
            # Create the queryset for each allowed content type.
            # If allowed_type is None, search for each supported content type.
            for content_type in context['types']:
                if allowed_type in [None, content_type.key]:
                    hand_filters.chrono('Search %s:' % content_type.key)
                    # The third argument is True until one type has
                    # returned a result.
                    context['results'] = content_type.build_queryset(
                        request, term, not has_result)
                    # Explicit comparison kept on purpose: a value such as
                    # None does not count as "has results".
                    if content_type.is_empty == False:
                        has_result = True
                    hand_filters.chrono(':Search %s' % content_type.key)
Ejemplo n.º 14
0
def search_record_view(request):
    '''View for the main search page.

        Runs the search for every content type, selects the result tab to
        display and renders the 'search/search_record.html' template.

        Returns the response of reroute_to_static_search() if there is one,
        a redirect to the record page for old-style record URLs, or the
        rendered search page otherwise.
    '''
    ret = reroute_to_static_search(request)
    if ret:
        return ret

    hand_filters.chrono('SEARCH VIEW:')
    hand_filters.chrono('SEARCH LOGIC:')

    # Backward compatibility.
    # Previously all the record pages would go through this search URL and view
    # and their URL was:
    #     /digipal/search/?id=1&result_type=scribes&basic_search_type=hands&terms=Wulfstan
    # Now we redirect those requests to the record page
    #     /digipal/scribes/1/?basic_search_type=hands&terms=Wulfstan+&result_type=scribes
    qs_id = request.GET.get('id', '')
    qs_result_type = request.GET.get('result_type', '')
    if qs_id and qs_result_type:
        from django.shortcuts import redirect
        # TODO: get digipal from current project name or current URL
        redirect_url = '/%s/%s/%s/?%s' % ('digipal', qs_result_type, qs_id,
                                          request.META['QUERY_STRING'])
        return redirect(redirect_url)

    # Backward compatibility: some query string parameters have been renamed.
    # Each current parameter falls back, in order, on its old names
    # (e.g. 'name' and 'scribes' have been renamed to 'scribe').
    legacy_aliases = (
        ('scribe', ('scribes', 'name')),
        ('ms_date', ('date',)),
        ('hand_date', ('date',)),
        ('scribe_date', ('date',)),
        ('hand_place', ('place',)),
        ('scriptorium', ('place',)),
    )
    # request.GET is replaced by a copy so it can be modified.
    request.GET = request.GET.copy()
    for param, aliases in legacy_aliases:
        value = request.GET.get(param, '')
        for alias in aliases:
            value = value or request.GET.get(alias, '')
        request.GET[param] = value

    # Actually run the searches
    context = {}

    context['nofollow'] = True

    set_search_results_to_context(request,
                                  context=context,
                                  show_advanced_search_form=True)

    # check if the search was executed or not (e.g. form not submitted or
    # invalid form)
    if 'results' in context:
        # Tab Selection Logic =
        #     we pick the tab the user has selected even if it is empty. END
        #     if none, we select the filter/advanced search content type
        #     if none or its result is empty we select the first non empty type
        #     if none we select the first type. END
        result_type = request.GET.get('result_type', '')

        # requested result type does not exist => ignore it
        if result_type not in [search_type.key
                               for search_type in context['types']]:
            result_type = ''

        if not result_type:
            first_non_empty_type = None
            for search_type in context['types']:
                if (search_type.key == context['search_type']
                        and not search_type.is_empty):
                    result_type = context['search_type']
                    break
                if not first_non_empty_type and not search_type.is_empty:
                    first_non_empty_type = search_type.key
            if not result_type:
                result_type = first_non_empty_type

        result_type = result_type or context['types'][0].key
        context['result_type'] = result_type

        # No result at all?
        for search_type in context['types']:
            if not search_type.is_empty:
                context['is_empty'] = False

    from digipal import utils
    context['search_help_url'] = utils.get_cms_url_from_slug(
        getattr(settings, 'SEARCH_HELP_PAGE_SLUG', 'search_help'))

    # Options passed to the javascript of the search page.
    page_options = get_search_page_js_data(
        context['types'],
        request.GET.get('from_link') in ('true', '1'), request)
    context['expanded_custom_filters'] = page_options[
        'advanced_search_expanded']
    page_options['linked_fields'] = []

    for search_type in context['types']:
        search_type.add_field_links(page_options['linked_fields'])

    context['search_page_options_json'] = json.dumps(page_options)
    # The filter form is the one whose key matches the (defaulted) search
    # type.
    for custom_filter in page_options['filters']:
        if custom_filter['key'] == context['search_type_defaulted']:
            context['filters_form'] = custom_filter

    # Log the request along with the total count over all the types.
    from digipal.models import RequestLog
    RequestLog.save_request(
        request,
        sum(search_type.count for search_type in context['types']))

    hand_filters.chrono(':SEARCH LOGIC')
    hand_filters.chrono('SEARCH TEMPLATE:')

    ret = render_to_response('search/search_record.html',
                             context,
                             context_instance=RequestContext(request))

    hand_filters.chrono(':SEARCH TEMPLATE')

    hand_filters.chrono(':SEARCH VIEW')

    return ret
Ejemplo n.º 15
0
def search_ms_image_view(request):
    '''View for the Browse Image page.

        Filters the images with the 'town_or_city', 'repository' and 'date'
        query string parameters, applies the permissions of the request and
        renders the 'search/search_ms_image.html' template.
    '''

    hand_filters.chrono('BROWSE:')

    from digipal.utils import request_invisible_model
    request_invisible_model('image', request, 'Image')

    hand_filters.chrono('search:')
    hand_filters.chrono('all():')
    images = Image.objects.all()
    hand_filters.chrono(':all()')

    from digipal.forms import FilterManuscriptsImages

    context = {}

    context['view'] = request.GET.get('view', 'images')

    town_or_city = request.GET.get('town_or_city', '')
    repository = request.GET.get('repository', '')
    date = request.GET.get('date', '')

    set_page_sizes_to_context(request, context, [12, 20, 40, 100])

    # Applying filters
    if town_or_city:
        images = images.filter(
            item_part__current_item__repository__place__name=town_or_city)
    if repository:
        # repo is in two parts: repo place, repo name (e.g. cambridge, corpus christi college)
        # but we also support old style URL which have only the name of the repo
        # if we don't, crawlers like Googlebot could receive a 500 error (see
        # JIRA DIGIPAL-483)
        # Note that split() always returns at least one part.
        repo_parts = [part.strip() for part in repository.split(',')]
        images = images.filter(
            item_part__current_item__repository__name=repo_parts[-1])
        if len(repo_parts) > 1:
            images = images.filter(
                item_part__current_item__repository__place__name=repo_parts[0])
    if date:
        images = images.filter(hands__assigned_date__date=date)

    images = images.filter(item_part_id__gt=0)

    # permission filter
    # OPT: on DigiPal prefetch_related of annotation_set takes 20s and it retrieves all the fields even
    # related to linked tables (allograph, character, etc.)
    # Same with hands which takes around 2/3 s.
    # This is why nothing is prefetched here.
    images = Image.filter_permissions_from_request(images, request)

    hand_filters.chrono(':search')

    # count hands
    hand_filters.chrono('hands:')
    from django.db.models import Count
    context['images'] = Image.sort_query_set_by_locus(
        images.select_related('item_part__current_item__repository__place').
        annotate(hand_count=Count('hands')))
    # Close the section opened above (the original code opened it a second
    # time instead of closing it).
    hand_filters.chrono(':hands')

    image_search_form = FilterManuscriptsImages(request.GET)
    context['image_search_form'] = image_search_form
    context['query_summary'], context[
        'query_summary_interactive'] = get_query_summary(
            request, '', True, [image_search_form])

    hand_filters.chrono('template:')
    ret = render_to_response('search/search_ms_image.html',
                             context,
                             context_instance=RequestContext(request))
    hand_filters.chrono(':template')

    hand_filters.chrono(':BROWSE')

    return ret
Ejemplo n.º 16
0
    def populate_index(self, ct, index=None):
        """Add all the records of the content type ``ct`` to its index.

        The value rankings of ``ct`` are prepared first, then every record
        returned by ``ct.get_all_records(True)`` is converted into a document
        and added to the index, unless the optional 'condition' option of
        ``ct`` rejects it.

        Progress is reported with ``self.write_state_update()``: the ranking
        step covers the first third and the indexing step the second third.

        ct: the content type to index.
        index: not read by this method; the name is rebound to the index
            returned by ``ct.get_whoosh_index()`` at each commit cycle.
        """
        chrono('POPULATE_INDEX:')

        # Single-argument print calls: same output with the Python 2 print
        # statement and the Python 3 print function.
        print('\tgenerate sort rankings')

        chrono('RANK_VALUES:')
        ct.prepare_value_rankings(
            callback=lambda progress: self.write_state_update(
                ct, max(0.001, 1.0 / 3.0 * progress)))
        chrono(':RANK_VALUES')

        chrono('INDEXING QUERY:')
        print('\tretrieve all records')
        dputils.gc_collect()

        rcs = ct.get_all_records(True)
        record_count = rcs.count()

        writer = None

        chrono(':INDEXING QUERY')

        print('\tadd records to index')

        # Number of records between two commits / two progress updates.
        commit_size = 500
        progress_size = 200

        chrono('INDEXING:')
        chrono('First record:')

        record_condition = ct.get_option('condition', None)

        pbar = dputils.ProgressBar(record_count)

        # Indexing can use n x 100 MB
        # Which can be excessive for small VMs
        # One technique is to create small, independent index segments
        # Then optimise them outside this fct on a separate index
        for position, record in enumerate(rcs.iterator()):
            if position == 0:
                chrono(':First record')
            pbar.update(position + 1)

            if (position % commit_size) == 0:
                # we have to commit every x document otherwise the memory saturates on the VM
                # BufferedWriter is buggy and will crash after a few 100x docs
                if writer:
                    writer.commit(merge=False)

                # we have to recreate after commit because commit unlocks the
                # index; the references are dropped before the garbage
                # collection so the old objects can be freed.
                writer = None
                index = None
                dputils.gc_collect()

                index = ct.get_whoosh_index()
                writer = index.writer()

            # Number of records read so far, including this one.
            processed = position + 1

            if record_condition and not record_condition(record):
                continue

            writer.add_document(**ct.get_document_from_record(record))

            if (processed % progress_size) == 0:
                self.write_state_update(
                    ct, (1 + 1.0 * processed / record_count) * 1.0 / 3)

        # Commit the documents added since the last commit.
        if writer:
            writer.commit(merge=False)

        pbar.complete()
        chrono(':INDEXING')

        print('\n')

        chrono(':POPULATE_INDEX')

        print('\tdone (%s records)' % record_count)