def segment_units(self):
    """Segment the in-range fief units and keep the ones that match.

    The search arguments are taken from ``self.options`` and normalised in
    place ('hilite', 'hilite_groups', 'ignore', 'exclude', 'ulimit',
    'urange'). Every unit of the unit model with ``content_xml__id=4``
    whose entry types contain 'F' and which lies within 'urange' is passed
    to ``self.segment_unit()``; units with a truthy ``match_conditions``
    are collected.

    Nothing is returned, the results are stored on the instance:
      * ``self.segunits``: matching units, cut down to 'ulimit' items when
        'ulimit' is positive;
      * ``self.variants``: list of ``{'text': ..., 'hits': ...}`` sorted by
        variant text (assumes ``self.variants`` is a dictionary at that
        point -- TODO confirm where it is filled);
      * ``self.stats``: 'range_size', 'result_size', 'result_size_pc'
        ('N/A' when the range is empty), 'duration_segmentation' in
        seconds, plus the 'patterns' and 'groups' dictionaries.
    """
    self.segunits = []
    self.init_variant_patterns()

    t0 = datetime.now()

    # TODO: derive the info from the faceted_search settings.py or from a new
    # settings variable.

    # arguments
    args = self.options

    # Unpack hilite. ``args`` is ``self.options`` itself, so after a first
    # call 'hilite' already holds the list written below; only split it
    # while it is still the raw comma separated string.
    hilite = args.get('hilite', '')
    if not isinstance(hilite, (list, tuple)):
        hilite = hilite.split(',')
    args['hilite'] = hilite

    # One pattern lookup per highlighted id (it used to be done twice).
    hilite_groups = []
    for pid in hilite:
        pattern = self.get_pattern_from_id(pid) if pid else None
        if pattern:
            hilite_groups.append(
                self.get_group_key_from_pattern_key(pattern['key']))
    args['hilite_groups'] = hilite_groups

    args['ignore'] = args.get('ignore', '')
    args['exclude'] = args.get('exclude', '')
    args['ulimit'] = dputils.get_int(args.get('ulimit', 10), 10)
    args['urange'] = args.get('urange', '')

    # Get the text units
    hand_filters.chrono('units:')

    self.stats = stats = {
        'duration_segmentation': 0,
        'range_size': 0,
        'patterns': {},
        'groups': {},
    }

    # A group key is a pattern key without its trailing '-<number>'.
    # Plain raw string: same matches as the former ur'' literal for this
    # ASCII-only pattern, and it stays valid syntax on Python 3.
    for pattern in self.get_patterns():
        group = re.sub(r'-\d+$', '', pattern['key'])
        stats['groups'][group] = 0

    units = self.get_unit_model().objects.filter(content_xml__id=4)
    for unit in units.iterator():
        # only fief
        types = unit.get_entry_type()
        if not types or 'F' not in types:
            continue

        # only selected range
        if not dputils.is_unit_in_range(unit.unitid, args['urange']):
            continue

        stats['range_size'] += 1

        # segment the unit
        self.segment_unit(unit, args)

        if unit.match_conditions:
            self.segunits.append(unit)

    hand_filters.chrono(':units')

    self.variants = [
        {'text': variant, 'hits': self.variants[variant]}
        for variant in sorted(self.variants)
    ]

    # stats
    stats['result_size'] = len(self.segunits)
    if stats['range_size']:
        stats['result_size_pc'] = int(
            100.0 * stats['result_size'] / stats['range_size'])
    else:
        stats['result_size_pc'] = 'N/A'

    # limit size of returned result
    if args['ulimit'] > 0:
        self.segunits = self.segunits[0:args['ulimit']]

    stats['duration_segmentation'] = (datetime.now() - t0).total_seconds()
def input(self, **kwargs):
    """Run the parent (LESSC) filter, then CssAbsoluteFilter on its output.

    The keyword arguments are forwarded to both filters; for the second
    one 'filename' is replaced by ``self.init_filename``.

    Returns the value returned by ``CssAbsoluteFilter.input()``.
    """
    hand_filters.chrono('input:')

    # LESSC
    compiled = super(LessAndCssAbsoluteFilter, self).input(**kwargs)
    self.validate_input()

    # CssAbsoluteFilter
    # (the arguments are logged as received, before 'filename' is replaced)
    hand_filters.chrono('\t %s' % repr(kwargs))
    kwargs['filename'] = self.init_filename
    absolute_filter = CssAbsoluteFilter(compiled)
    output = absolute_filter.input(**kwargs)

    hand_filters.chrono(':input')

    return output
def input(self, **kwargs):
    """Chain the two filters: parent (LESSC) first, CssAbsoluteFilter next.

    ``kwargs`` is passed unchanged to the parent filter. Before the second
    filter is called its 'filename' entry is set to ``self.init_filename``.

    Returns the result of ``CssAbsoluteFilter(...).input(**kwargs)``.
    """
    hand_filters.chrono('input:')

    # Step 1: LESSC
    less_output = super(LessAndCssAbsoluteFilter, self).input(**kwargs)
    self.validate_input()

    # Step 2: CssAbsoluteFilter
    hand_filters.chrono('\t %s' % repr(kwargs))
    kwargs['filename'] = self.init_filename
    css_output = CssAbsoluteFilter(less_output).input(**kwargs)

    hand_filters.chrono(':input')
    return css_output
def segment_units(self):
    """Segment the in-range fief units and keep the ones that match.

    Arguments come from ``self.options`` and are normalised in place
    ('hilite', 'hilite_groups', 'ignore', 'exclude', 'ulimit', 'urange').
    Units of the unit model with ``content_xml__id=4`` are kept when their
    entry types contain 'F' and they lie within 'urange'; each of them is
    passed to ``self.segment_unit()`` and collected when its
    ``match_conditions`` is truthy.

    Nothing is returned, the results are stored on the instance:
      * ``self.segunits``: matching units, limited to 'ulimit' items when
        'ulimit' is positive;
      * ``self.variants``: list of ``{'text': ..., 'hits': ...}`` sorted by
        variant text (assumes ``self.variants`` is a dictionary at that
        point -- TODO confirm where it is filled);
      * ``self.stats``: 'range_size', 'result_size', 'result_size_pc'
        ('N/A' when the range is empty), 'duration_segmentation' in
        seconds, plus the 'patterns' and 'groups' dictionaries.
    """
    self.segunits = []
    self.init_variant_patterns()

    t0 = datetime.now()

    # TODO: derive the info from the faceted_search settings.py or from a new
    # settings variable.

    # arguments
    args = self.options

    # unpack hilite
    # ``args`` is the very same object as ``self.options``: once this
    # method has run, 'hilite' is a list, which has no split(). Accept both
    # forms so that the method can be called more than once.
    hilite = args.get('hilite', '')
    if not isinstance(hilite, (list, tuple)):
        hilite = hilite.split(',')
    args['hilite'] = hilite

    # Resolve each highlighted pattern once instead of twice per id.
    patterns = [self.get_pattern_from_id(pid) for pid in hilite if pid]
    args['hilite_groups'] = [
        self.get_group_key_from_pattern_key(pattern['key'])
        for pattern in patterns
        if pattern
    ]

    args['ignore'] = args.get('ignore', '')
    args['exclude'] = args.get('exclude', '')
    args['ulimit'] = dputils.get_int(args.get('ulimit', 10), 10)
    args['urange'] = args.get('urange', '')

    # Get the text units
    hand_filters.chrono('units:')

    self.stats = stats = {
        'duration_segmentation': 0,
        'range_size': 0,
        'patterns': {},
        'groups': {}
    }

    # The group key is the pattern key minus its trailing '-<number>'.
    # r'' instead of ur'': identical matches for this ASCII pattern and
    # valid syntax on Python 3 as well.
    for pattern in self.get_patterns():
        group = re.sub(r'-\d+$', '', pattern['key'])
        stats['groups'][group] = 0

    for unit in self.get_unit_model().objects.filter(
            content_xml__id=4).iterator():
        # only fief
        types = unit.get_entry_type()
        if not types or 'F' not in types:
            continue

        # only selected range
        if not dputils.is_unit_in_range(unit.unitid, args['urange']):
            continue

        stats['range_size'] += 1

        # segment the unit
        self.segment_unit(unit, args)

        if unit.match_conditions:
            self.segunits.append(unit)

    hand_filters.chrono(':units')

    self.variants = [{
        'text': variant,
        'hits': self.variants[variant]
    } for variant in sorted(self.variants)]

    # stats
    stats['result_size'] = len(self.segunits)
    stats['result_size_pc'] = int(
        100.0 * stats['result_size'] /
        stats['range_size']) if stats['range_size'] else 'N/A'

    # limit size of returned result
    if args['ulimit'] > 0:
        self.segunits = self.segunits[0:args['ulimit']]

    stats['duration_segmentation'] = (datetime.now() - t0).total_seconds()
def set_search_results_to_context(request, context=None, allowed_type=None,
                                  show_advanced_search_form=False):
    '''
    Read the information posted through the search form and create the
    queryset for each relevant type of content (e.g. MS, Hand)
    => context['results']

    If the form was not valid or submitted, context['results'] is left
    undefined.

    Other context variables used by the search template are also set.

    request: the current request; the form values are read from request.GET.
    context: the dictionary to fill in. When omitted a new dictionary is
        used for the call; since nothing is returned, callers who need the
        values must pass their own dictionary.
    allowed_type: used to restrict the search to one content type only.
        This is useful when we display a specific record page and we only
        have to search for the related content type to show the
        previous/next links. None means all the supported content types.
    show_advanced_search_form: if True the search form is added to the
        context as context['advanced_search_form'].
    '''
    # A new dictionary for each call: a ``{}`` default value is created only
    # once and would be shared, with all its keys, by every call that does
    # not pass ``context``.
    if context is None:
        context = {}

    context['terms'] = ''

    # pagination sizes
    set_page_sizes_to_context(request, context)

    # list of query parameter/form fields which can be changed without
    # triggering a search
    non_search_params = ['basic_search_type', 'from_link', 'result_type']
    context['submitted'] = any(
        param not in non_search_params and request.GET.get(param)
        for param in request.GET
    )

    context['can_edit'] = has_edit_permission(request, Hand)
    context['types'] = get_search_types(request)

    context['annotation_mode'] = request.GET.get('am', '0')

    context['view'] = request.GET.get('view', '')
    # ``content_type`` rather than ``type``: do not shadow the builtin.
    for content_type in context['types']:
        content_type.set_desired_view(context['view'])
        content_type.set_page_size(context['page_size'])

    context['search_types_display'] = get_search_types_display(
        context['types'])
    context['is_empty'] = True

    advanced_search_form = SearchPageForm(request.GET)
    advanced_search_form.fields['basic_search_type'].choices = [
        (content_type.key, content_type.label)
        for content_type in context['types']
    ]
    if show_advanced_search_form:
        context['advanced_search_form'] = advanced_search_form

    if advanced_search_form.is_valid():
        # Read the inputs
        # - term
        term = advanced_search_form.cleaned_data['terms']
        context['terms'] = term or ' '
        context['query_summary'], context['query_summary_interactive'] = \
            get_query_summary(
                request, term, context['submitted'],
                [content_type.get_form(request)
                 for content_type in context['types']])

        # - search type
        context['search_type'] = \
            advanced_search_form.cleaned_data['basic_search_type']
        # same as search_type, or the first available type if none is set
        context['search_type_defaulted'] = context['search_type']
        if not context['search_type_defaulted']:
            if context['types']:
                context['search_type_defaulted'] = context['types'][0].key
            else:
                context['search_type_defaulted'] = '?'

        has_result = False

        if context['submitted']:
            # Create the queryset for each allowed content type.
            # If allowed_types is None, search for each supported content
            # type.
            for content_type in context['types']:
                if allowed_type in [None, content_type.key]:
                    hand_filters.chrono('Search %s:' % content_type.key)
                    # The third argument is True until one type has
                    # returned results.
                    context['results'] = content_type.build_queryset(
                        request, term, not has_result)
                    # Explicit comparison kept on purpose: a None value
                    # does not count as "has results".
                    if content_type.is_empty == False:  # noqa: E712
                        has_result = True
                    hand_filters.chrono(':Search %s' % content_type.key)
def search_record_view(request):
    '''
    View for the main search page.

    - returns the response of reroute_to_static_search() if there is one;
    - redirects old-style record URLs (?id=...&result_type=...) to the
      record page;
    - copies the old names of the query string parameters to the new ones;
    - runs the searches with set_search_results_to_context();
    - selects the result tab and renders 'search/search_record.html'.
    '''
    ret = reroute_to_static_search(request)
    if ret:
        return ret

    hand_filters.chrono('SEARCH VIEW:')
    hand_filters.chrono('SEARCH LOGIC:')

    # Backward compatibility.
    # Previously all the record pages would go through this search URL and view
    # and their URL was:
    #     /digipal/search/?id=1&result_type=scribes&basic_search_type=hands&terms=Wulfstan
    # Now we redirect those requests to the record page
    #     /digipal/scribes/1/?basic_search_type=hands&terms=Wulfstan+&result_type=scribes
    qs_id = request.GET.get('id', '')
    qs_result_type = request.GET.get('result_type', '')
    if qs_id and qs_result_type:
        from django.shortcuts import redirect
        # TODO: get digipal from current project name or current URL
        # .get(): do not fail if QUERY_STRING is missing from META
        redirect_url = '/%s/%s/%s/?%s' % (
            'digipal', qs_result_type, qs_id,
            request.META.get('QUERY_STRING', ''))
        return redirect(redirect_url)

    # backward compatibility:
    # query string param 'name' and 'scribes' have been renamed to 'scribe'
    # (request.GET is copied first so that it can be modified)
    request.GET = request.GET.copy()
    request.GET['scribe'] = request.GET.get('scribe', '') or request.GET.get(
        'scribes', '') or request.GET.get('name', '')
    request.GET['ms_date'] = request.GET.get(
        'ms_date', '') or request.GET.get('date', '')
    request.GET['hand_date'] = request.GET.get(
        'hand_date', '') or request.GET.get('date', '')
    request.GET['scribe_date'] = request.GET.get(
        'scribe_date', '') or request.GET.get('date', '')
    request.GET['hand_place'] = request.GET.get(
        'hand_place', '') or request.GET.get('place', '')
    request.GET['scriptorium'] = request.GET.get(
        'scriptorium', '') or request.GET.get('place', '')

    # Actually run the searches
    context = {}

    context['nofollow'] = True

    set_search_results_to_context(
        request, context=context, show_advanced_search_form=True)

    # check if the search was executed or not (e.g. form not submitted or
    # invalid form)
    # NOTE(review): the extent of this block was rebuilt from a flattened
    # copy of the file; please confirm it against the previous revision.
    if 'results' in context:
        # Tab Selection Logic =
        #     we pick the tab the user has selected even if it is empty. END
        #     if none, we select the filter/advanced search content type
        #     if none or its result is empty we select the first non empty type
        #     if none we select the first type. END
        result_type = request.GET.get('result_type', '')

        # requested result type does not exist => ignore it
        type_keys = [content_type.key for content_type in context['types']]
        if result_type not in type_keys:
            result_type = ''

        if not result_type:
            first_non_empty_type = None
            for content_type in context['types']:
                if content_type.key == context['search_type'] and \
                        not content_type.is_empty:
                    result_type = context['search_type']
                    break
                if not first_non_empty_type and not content_type.is_empty:
                    first_non_empty_type = content_type.key
            if not result_type:
                result_type = first_non_empty_type

        result_type = result_type or context['types'][0].key
        context['result_type'] = result_type

        # No result at all?
        for content_type in context['types']:
            if not content_type.is_empty:
                context['is_empty'] = False

    from digipal import utils
    context['search_help_url'] = utils.get_cms_url_from_slug(
        getattr(settings, 'SEARCH_HELP_PAGE_SLUG', 'search_help'))

    # Initialise the advanced search forms
    # context['drilldownform'] = GraphSearchForm({'terms': context['terms'] or ''})

    page_options = get_search_page_js_data(
        context['types'],
        request.GET.get('from_link') in ('true', '1'),
        request)
    context['expanded_custom_filters'] = \
        page_options['advanced_search_expanded']
    page_options['linked_fields'] = []
    for content_type in context['types']:
        content_type.add_field_links(page_options['linked_fields'])
    context['search_page_options_json'] = json.dumps(page_options)

    # 'search_type_defaulted' is only set when the search form is valid;
    # .get() avoids a KeyError (and an error page) for an invalid form.
    search_type_defaulted = context.get('search_type_defaulted')
    for custom_filter in page_options['filters']:
        if custom_filter['key'] == search_type_defaulted:
            context['filters_form'] = custom_filter

    from digipal.models import RequestLog
    RequestLog.save_request(request, sum(
        [content_type.count for content_type in context['types']]))

    hand_filters.chrono(':SEARCH LOGIC')
    hand_filters.chrono('SEARCH TEMPLATE:')

    ret = render_to_response('search/search_record.html', context,
                             context_instance=RequestContext(request))

    hand_filters.chrono(':SEARCH TEMPLATE')

    hand_filters.chrono(':SEARCH VIEW')

    return ret
def search_ms_image_view(request):
    '''View for the Browse Image page

    The images are filtered with the 'town_or_city', 'repository' and 'date'
    query string parameters, then with the permissions of the request.
    They are passed to the 'search/search_ms_image.html' template as
    context['images'], each with a 'hand_count' annotation.

    Returns the rendered response.
    '''
    hand_filters.chrono('BROWSE:')

    from digipal.utils import request_invisible_model
    request_invisible_model('image', request, 'Image')

    hand_filters.chrono('search:')

    hand_filters.chrono('all():')
    images = Image.objects.all()
    hand_filters.chrono(':all()')

    from digipal.forms import FilterManuscriptsImages

    # Get Buttons
    context = {}

    context['view'] = request.GET.get('view', 'images')

    town_or_city = request.GET.get('town_or_city', '')
    repository = request.GET.get('repository', '')
    date = request.GET.get('date', '')

    set_page_sizes_to_context(request, context, [12, 20, 40, 100])

    # Applying filters
    if town_or_city:
        images = images.filter(
            item_part__current_item__repository__place__name=town_or_city)
    if repository:
        # repo is in two parts: repo place, repo name (e.g. cambridge, corpus christi college)
        # but we also support old style URL which have only the name of the repo
        # if we don't, crawlers like Googlebot could receive a 500 error (see
        # JIRA DIGIPAL-483)
        repo_parts = [p.strip() for p in repository.split(',')]
        if repo_parts:
            images = images.filter(
                item_part__current_item__repository__name=repo_parts[-1])
        if len(repo_parts) > 1:
            images = images.filter(
                item_part__current_item__repository__place__name=repo_parts[0])
    if date:
        images = images.filter(hands__assigned_date__date=date)

    images = images.filter(item_part_id__gt=0)
    # not sufficient, see JIRA #552
    # images = Image.sort_query_set_by_locus(images)

    # images = list(images.order_by('id'))
    # from digipal.utils import natural_sort_key
    # images = sorted(images, key=lambda im: natural_sort_key(im.display_label, True))
    # context['images'] = Image.sort_query_set_by_locus(images.prefetch_related('hands', 'annotation_set'))

    # permission filter
    # OPT: on DigiPal prefetch_related of annotation_set takes 20s and it retrieves all the fields even
    # related to linked tables (allograph, character, etc.)
    # Same with hands which takes around 2/3 s.
    # images = Image.filter_permissions_from_request(images.prefetch_related('hands', 'annotation_set'), request)
    images = Image.filter_permissions_from_request(images, request)

    hand_filters.chrono(':search')

    # count hands
    hand_filters.chrono('hands:')
    from django.db.models import Count
    context['images'] = Image.sort_query_set_by_locus(
        images.select_related(
            'item_part__current_item__repository__place'
        ).annotate(hand_count=Count('hands')))
    # Close the section: every other chrono() pair in this view is
    # 'label:' ... ':label', but this one used to repeat 'hands:'.
    hand_filters.chrono(':hands')

    image_search_form = FilterManuscriptsImages(request.GET)
    context['image_search_form'] = image_search_form
    context['query_summary'], context['query_summary_interactive'] = \
        get_query_summary(request, '', True, [image_search_form])

    hand_filters.chrono('template:')
    ret = render_to_response('search/search_ms_image.html', context,
                             context_instance=RequestContext(request))
    hand_filters.chrono(':template')

    hand_filters.chrono(':BROWSE')

    return ret
def set_search_results_to_context(request, context=None, allowed_type=None, show_advanced_search_form=False):
    """
    Read the information posted through the search form and create the
    queryset for each relevant type of content (e.g. MS, Hand)
    => context['results']

    If the form was not valid or submitted, context['results'] is left
    undefined.

    Other context variables used by the search template are also set.

    request: the current request; the form values are read from request.GET.
    context: the dictionary to fill in. When omitted a new dictionary is
        used for the call; since nothing is returned, callers who need the
        values must pass their own dictionary.
    allowed_type: used to restrict the search to one content type only.
        This is useful when we display a specific record page and we only
        have to search for the related content type to show the
        previous/next links. None means all the supported content types.
    show_advanced_search_form: if True the search form is added to the
        context as context['advanced_search_form'].
    """
    # The former ``context={}`` default was a single dictionary created at
    # definition time and mutated by every call that relied on it.
    if context is None:
        context = {}

    context["terms"] = ""

    # pagination sizes
    set_page_sizes_to_context(request, context)

    # list of query parameter/form fields which can be changed without triggering a search
    non_search_params = ["basic_search_type", "from_link", "result_type"]
    context["submitted"] = any(
        param not in non_search_params and request.GET.get(param) for param in request.GET
    )

    context["can_edit"] = has_edit_permission(request, Hand)
    context["types"] = get_search_types(request)

    context["annotation_mode"] = request.GET.get("am", "0")

    context["view"] = request.GET.get("view", "")
    # ``content_type`` rather than ``type``: do not shadow the builtin.
    for content_type in context["types"]:
        content_type.set_desired_view(context["view"])
        content_type.set_page_size(context["page_size"])

    context["search_types_display"] = get_search_types_display(context["types"])
    context["is_empty"] = True

    advanced_search_form = SearchPageForm(request.GET)
    advanced_search_form.fields["basic_search_type"].choices = [
        (content_type.key, content_type.label) for content_type in context["types"]
    ]
    if show_advanced_search_form:
        context["advanced_search_form"] = advanced_search_form

    if advanced_search_form.is_valid():
        # Read the inputs
        # - term
        term = advanced_search_form.cleaned_data["terms"]
        context["terms"] = term or " "
        context["query_summary"], context["query_summary_interactive"] = get_query_summary(
            request,
            term,
            context["submitted"],
            [content_type.get_form(request) for content_type in context["types"]],
        )

        # - search type
        context["search_type"] = advanced_search_form.cleaned_data["basic_search_type"]
        # same as search_type, or the first available type if none is set
        context["search_type_defaulted"] = context["search_type"]
        if not context["search_type_defaulted"]:
            if context["types"]:
                context["search_type_defaulted"] = context["types"][0].key
            else:
                context["search_type_defaulted"] = "?"

        has_result = False

        if context["submitted"]:
            # Create the queryset for each allowed content type.
            # If allowed_types is None, search for each supported content type.
            for content_type in context["types"]:
                if allowed_type in [None, content_type.key]:
                    hand_filters.chrono("Search %s:" % content_type.key)
                    # The third argument is True until one type has returned results.
                    context["results"] = content_type.build_queryset(request, term, not has_result)
                    # Explicit comparison kept on purpose: a None value does
                    # not count as "has results".
                    if content_type.is_empty == False:  # noqa: E712
                        has_result = True
                    hand_filters.chrono(":Search %s" % content_type.key)
def search_record_view(request):
    """
    View for the main search page.

    - returns the response of reroute_to_static_search() if there is one;
    - redirects old-style record URLs (?id=...&result_type=...) to the
      record page;
    - copies the old names of the query string parameters to the new ones
      (only the query string is read, see the note in the code);
    - runs the searches with set_search_results_to_context();
    - selects the result tab and renders 'search/search_record.html'.
    """
    ret = reroute_to_static_search(request)
    if ret:
        return ret

    hand_filters.chrono("SEARCH VIEW:")
    hand_filters.chrono("SEARCH LOGIC:")

    # Backward compatibility.
    # Previously all the record pages would go through this search URL and view
    # and their URL was:
    #     /digipal/search/?id=1&result_type=scribes&basic_search_type=hands&terms=Wulfstan
    # Now we redirect those requests to the record page
    #     /digipal/scribes/1/?basic_search_type=hands&terms=Wulfstan+&result_type=scribes
    qs_id = request.GET.get("id", "")
    qs_result_type = request.GET.get("result_type", "")
    if qs_id and qs_result_type:
        from django.shortcuts import redirect

        # TODO: get digipal from current project name or current URL
        # .get(): do not fail if QUERY_STRING is missing from META
        redirect_url = "/%s/%s/%s/?%s" % ("digipal", qs_result_type, qs_id, request.META.get("QUERY_STRING", ""))
        return redirect(redirect_url)

    # backward compatibility:
    # query string param 'name' and 'scribes' have been renamed to 'scribe'
    #
    # The values are now read from request.GET instead of request.REQUEST:
    # request.REQUEST is deprecated in Django and the rest of the search
    # code only reads request.GET anyway.
    # NOTE(review): legacy parameters sent in a POST body are no longer
    # picked up; confirm that no client relies on that.
    request.GET = request.GET.copy()
    request.GET["scribe"] = (
        request.GET.get("scribe", "") or request.GET.get("scribes", "") or request.GET.get("name", "")
    )
    request.GET["ms_date"] = request.GET.get("ms_date", "") or request.GET.get("date", "")
    request.GET["hand_date"] = request.GET.get("hand_date", "") or request.GET.get("date", "")
    request.GET["scribe_date"] = request.GET.get("scribe_date", "") or request.GET.get("date", "")
    request.GET["hand_place"] = request.GET.get("hand_place", "") or request.GET.get("place", "")
    request.GET["scriptorium"] = request.GET.get("scriptorium", "") or request.GET.get("place", "")

    # Actually run the searches
    context = {}

    context["nofollow"] = True

    set_search_results_to_context(request, context=context, show_advanced_search_form=True)

    # check if the search was executed or not (e.g. form not submitted or invalid form)
    # NOTE(review): the extent of this block was rebuilt from a flattened
    # copy of the file; please confirm it against the previous revision.
    if "results" in context:
        # Tab Selection Logic =
        #     we pick the tab the user has selected even if it is empty. END
        #     if none, we select the filter/advanced search content type
        #     if none or its result is empty we select the first non empty type
        #     if none we select the first type. END
        result_type = request.GET.get("result_type", "")

        # requested result type does not exist => ignore it
        if result_type not in [content_type.key for content_type in context["types"]]:
            result_type = ""

        if not result_type:
            first_non_empty_type = None
            for content_type in context["types"]:
                if content_type.key == context["search_type"] and not content_type.is_empty:
                    result_type = context["search_type"]
                    break
                if not first_non_empty_type and not content_type.is_empty:
                    first_non_empty_type = content_type.key
            if not result_type:
                result_type = first_non_empty_type

        result_type = result_type or context["types"][0].key
        context["result_type"] = result_type

        # No result at all?
        for content_type in context["types"]:
            if not content_type.is_empty:
                context["is_empty"] = False

    from digipal import utils

    context["search_help_url"] = utils.get_cms_url_from_slug(getattr(settings, "SEARCH_HELP_PAGE_SLUG", "search_help"))

    # Initialise the advanced search forms
    # context['drilldownform'] = GraphSearchForm({'terms': context['terms'] or ''})

    page_options = get_search_page_js_data(context["types"], request.GET.get("from_link") in ("true", "1"), request)
    context["expanded_custom_filters"] = page_options["advanced_search_expanded"]
    page_options["linked_fields"] = []
    for content_type in context["types"]:
        content_type.add_field_links(page_options["linked_fields"])
    context["search_page_options_json"] = json.dumps(page_options)

    # 'search_type_defaulted' is only set when the search form is valid;
    # .get() avoids a KeyError (and an error page) for an invalid form.
    search_type_defaulted = context.get("search_type_defaulted")
    for custom_filter in page_options["filters"]:
        if custom_filter["key"] == search_type_defaulted:
            context["filters_form"] = custom_filter

    from digipal.models import RequestLog

    RequestLog.save_request(request, sum([content_type.count for content_type in context["types"]]))

    hand_filters.chrono(":SEARCH LOGIC")
    hand_filters.chrono("SEARCH TEMPLATE:")

    ret = render_to_response("search/search_record.html", context, context_instance=RequestContext(request))

    hand_filters.chrono(":SEARCH TEMPLATE")

    hand_filters.chrono(":SEARCH VIEW")

    return ret
def populate_index(self, ct, index=None):
    """Add all the records of the content type ``ct`` to its whoosh index.

    Two steps:
      1. ``ct.prepare_value_rankings()`` is called; its progress is
         reported through ``self.write_state_update()``;
      2. each record returned by ``ct.get_all_records(True)`` is converted
         with ``ct.get_document_from_record()`` and added to the index,
         except those rejected by the optional 'condition' option of
         ``ct``. The writer is committed and re-opened every 500 records
         and the state is updated every 200 records.

    The value passed as ``index`` is never read: the index is always
    obtained from ``ct.get_whoosh_index()``.
    """
    chrono("POPULATE_INDEX:")

    # Add documents to the index
    print("\tgenerate sort rankings")

    def report_ranking_progress(progress):
        # ranking progress is mapped to the first third, never below 0.001
        return self.write_state_update(ct, max(0.001, 1.0 / 3.0 * progress))

    chrono("RANK_VALUES:")
    ct.prepare_value_rankings(callback=report_ranking_progress)
    chrono(":RANK_VALUES")

    chrono("INDEXING QUERY:")
    print("\tretrieve all records")
    dputils.gc_collect()

    # not used below; the import itself is left in place
    from whoosh.writing import BufferedWriter

    rcs = ct.get_all_records(True)
    record_count = rcs.count()

    writer = None

    chrono(":INDEXING QUERY")

    print("\tadd records to index")

    commit_size = 500
    progress_size = 200

    # settings.DEV_SERVER = True
    chrono("INDEXING:")
    chrono("First record:")

    record_condition = ct.get_option("condition", None)

    pbar = dputils.ProgressBar(record_count)

    # Indexing can use n x 100 MB
    # Which can be excessive for small VMs
    # One technique is to create small, independent index segments
    # Then optimise them outside this fct on a separate index
    for position, record in enumerate(rcs.iterator()):
        if position == 0:
            chrono(":First record")
        pbar.update(position + 1)

        if position % commit_size == 0:
            # we have to commit every x document otherwise the memory saturates on the VM
            # BufferedWriter is buggy and will crash after a few 100x docs
            if writer:
                writer.commit(merge=False)

            # we have to recreate after commit because commit unlock index
            # (both references are dropped before the collection)
            writer = None
            index = None
            dputils.gc_collect()

            index = ct.get_whoosh_index()
            writer = index.writer()

        # number of records seen so far, skipped ones included
        seen = position + 1

        if record_condition and not record_condition(record):
            continue

        document = ct.get_document_from_record(record)
        writer.add_document(**document)

        if seen % progress_size == 0:
            self.write_state_update(ct, (1 + 1.0 * seen / record_count) * 1.0 / 3)

    # commit the documents added since the last intermediate commit
    if writer:
        writer.commit(merge=False)

    # rcs = None
    # ct.clear_value_rankings()

    pbar.complete()
    chrono(":INDEXING")

    print("\n")

    chrono(":POPULATE_INDEX")

    print("\tdone (%s records)" % record_count)
# _build_queryset(self, request, term)
#
# Builds the result of the Hand drill-down search: the graphs matching the
# request, grouped by hand.
#
# Inputs
#   request: the filters are read from request.GET: 'scribe', 'script',
#       'chartype', 'character', 'allograph', 'component', 'feature',
#       'repository' and 'index'. A missing parameter is u''.
#   term: free text. When self.search_hands is not set, it is matched
#       (icontains, double quotes removed) against the hand and item part
#       fields listed in the Q() expression; without a term all the graphs
#       are used. When self.search_hands is set, the graphs of the hands in
#       self.search_hands.queryset are used instead.
#
# Special values of 'feature'
#   u'-1' (none): graphs with at least one feature (of the selected
#       component, if any) are excluded.
#   u'-2' (one_or_more): only graphs with at least one feature are kept.
#
# Other conditions applied to the graphs
#   - graphs without annotation are discarded;
#   - graphs with a hidden allograph are excluded if has_edit_permission()
#     is false for the request;
#   - if is_staff(request) is false, the graphs annotated on an image
#     returned by Image.filter_permissions(..., [PERM_PRIVATE]) are excluded.
#
# Output
#   A list with one list per hand: [hand_id, graph_id, graph_id, ...], in
#   the order of the query (hand scribe name, then hand id). It is stored in
#   self._queryset and returned. self.graphs_count is set to the number of
#   (graph id, hand id) rows.
#
# NOTE(review): t0 ... t5 and 'last' are assigned but never read in this
# function; get_str_from_queryset is only used by a commented-out print.
# NOTE(review): the statements below are stored on a few very long lines, so
# the nesting cannot be read from the text. In particular, please confirm
# whether the 'index', 'repository', 'scribe' and 'script' filters belong to
# the 'else' branch of 'if self.search_hands' before re-indenting this code.
def _build_queryset(self, request, term): """ View for Hand record drill-down """ context = {} self.graphs_count = 0 undefined = u'' scribe = request.GET.get('scribe', undefined) # alternative names are for backward compatibility with old-style graph search page script = request.GET.get('script', undefined) chartype = request.GET.get('chartype', undefined) character = request.GET.get('character', undefined) allograph = request.GET.get('allograph', undefined) component = request.GET.get('component', undefined) feature = request.GET.get('feature', undefined) repository = request.GET.get('repository', undefined) index = request.GET.get('index', undefined) excluded_images = None from digipal.utils import is_staff if not is_staff(request): excluded_images = Image.filter_permissions(Image.objects.all(), [MediaPermission.PERM_PRIVATE]) none = u'-1' one_or_more = u'-2' from datetime import datetime t0 = datetime.now() t4 = datetime.now() wheres = [] if self.search_hands: graphs = Graph.objects.filter(hand__id__in=self.search_hands.queryset) else: # .order_by('item_part__current_item__repository__name', 'item_part__current_item__shelfmark', 'descriptions__description','id') # Although we are listing hands on the front-end, we search for graphs and not for hand. 
# Two reasons: # searching for character and allograh at the same time through a Hand model would generate two separate joins to graph # this would bring potentially invalid results and it is also much slower # it is faster than excluding all the hands without a graph (yet another expensive join) # if term: term = term.replace('"', '') graphs = Graph.objects.filter( Q(hand__descriptions__description__icontains=term) | \ Q(hand__scribe__name__icontains=term) | \ Q(hand__assigned_place__name__icontains=term) | \ Q(hand__assigned_date__date__icontains=term) | \ Q(hand__item_part__current_item__shelfmark__icontains=term) | \ Q(hand__item_part__current_item__repository__name__icontains=term) | \ Q(hand__item_part__current_item__repository__place__name__icontains=term) | \ Q(hand__item_part__historical_items__catalogue_number__icontains=term) | \ # JIRA 423 Q(hand__item_part__historical_items__name__icontains=term) | \ Q(hand__item_part__group__historical_items__name__icontains=term) | \ Q(hand__item_part__display_label__icontains=term) | \ Q(hand__item_part__group__display_label__icontains=term) ) else: graphs = Graph.objects.all() t1 = datetime.now() if index: graphs = graphs.filter(hand__item_part__historical_items__catalogue_number__iexact=index) if repository: matches = re.match(ur'^([^,]+?),([^,]+)$', repository) if matches: graphs = graphs.filter(Q(hand__item_part__current_item__repository__place__name__iexact=matches.group(1).strip()) & Q(hand__item_part__current_item__repository__name__iexact=matches.group(2).strip())) if scribe: graphs = graphs.filter(hand__scribe__name__icontains=scribe) if script: graphs = graphs.filter(hand__script__name=script) if chartype: graphs = graphs.filter(idiograph__allograph__character__ontograph__ontograph_type__name=chartype) if character: graphs = graphs.filter(idiograph__allograph__character__name=character) if allograph: graphs = graphs.filter(idiograph__allograph__name=allograph) # we discard freak graph records (i.e. 
without annotation) to prevent errors further down the line. graphs = graphs.filter(annotation__isnull=False) # if the user is not logged in we exclude graphs where the allograph is hidden from digipal.models import has_edit_permission if not has_edit_permission(request, self.get_model()): graphs = graphs.exclude(idiograph__allograph__hidden=True) # exclude private images if excluded_images and excluded_images.count(): graphs = graphs.exclude(annotation__image__in=excluded_images) # condition on component if component: component_where = Q(graph_components__component__name=component) if feature in [undefined, none]: # If no feature is specified we find all the graph which are supposed to have a component # according to their idiograph component_where = component_where | Q(idiograph__allograph__allograph_components__component__name=component) wheres.append(component_where) # condition on feature if feature not in [undefined, none, one_or_more]: wheres.append(Q(graph_components__features__name=feature)) if feature in [one_or_more]: wheres.append(Q(graph_components__features__id__isnull=False)) # ANDs all the Q() where clauses together if wheres: where_and = wheres.pop(0) for where in wheres: where_and = where_and & where graphs = graphs.filter(where_and) # Treat the feature=none case if feature == none: excluded_q = Q(graph_components__features__id__isnull=False) if component: excluded_q = excluded_q & Q(graph_components__component__name=component) excluded_graphs = Graph.objects.filter(excluded_q) graphs = graphs.exclude(id__in=excluded_graphs.values_list('id', flat=True)) from digipal.utils import set_left_joins_in_queryset, get_str_from_queryset set_left_joins_in_queryset(graphs) #print get_str_from_queryset(graphs) t2 = datetime.now() # Get the graphs then id of all the related Hands # We use values_list because it is much faster, we don't need to fetch all the Hands at this stage # That will be done after pagination in the template # Distinct is needed here. 
#graphs = graphs.distinct().order_by('hand__scribe__name', 'hand__id', 'idiograph__allograph__character__ontograph__sort_order') chrono('graph filter:') graphs = graphs.distinct().order_by('hand__scribe__name', 'hand__id') chrono(':graph filter') #print graphs.query chrono('graph values_list:') graph_ids = graphs.values_list('id', 'hand_id') chrono(':graph values_list') # chrono('len:') # l = len(graph_ids) # print graph_ids.query # chrono(':len') # Build a structure that groups all the graph ids by hand id # context['hand_ids'] = [[1, 101, 102], [2, 103, 104]] # In the above we have two hands: 1 and 2. For hand 1 we have Graph 101 and 102. chrono('hand_ids:') context['hand_ids'] = [[0]] last = 0 for g in graph_ids: if g[1] != context['hand_ids'][-1][0]: context['hand_ids'].append([g[1]]) context['hand_ids'][-1].append(g[0]) del(context['hand_ids'][0]) chrono(':hand_ids') t3 = datetime.now() self.graphs_count = len(graph_ids) t4 = datetime.now() #print 'search %s; hands query: %s + graph count: %s' % (t4 - t0, t3 - t2, t4 - t3) t5 = datetime.now() self._queryset = context['hand_ids'] return self._queryset
def _build_queryset(self, request, term):
    """
    View for Hand record drill-down.

    Searches Graph records using the free-text ``term`` and the filters
    found in the query string of ``request`` (scribe, script, chartype,
    character, allograph, component, feature, repository, index), then
    groups the ids of the matching graphs by hand id.

    Side effects:
        * ``self.graphs_count`` is set to the number of (graph, hand) rows
          returned by the search;
        * ``self._queryset`` is set to the returned value.

    Returns a list of lists. Each inner list starts with a hand id and is
    followed by the ids of the graphs found for that hand, e.g.
    ``[[1, 101, 102], [2, 103, 104]]`` means two hands (1 and 2) and
    graphs 101 and 102 belong to hand 1.
    """
    self.graphs_count = 0

    undefined = u''

    scribe = request.GET.get('scribe', undefined)
    # alternative names are for backward compatibility with old-style graph
    # search page
    script = request.GET.get('script', undefined)
    chartype = request.GET.get('chartype', undefined)
    character = request.GET.get('character', undefined)
    allograph = request.GET.get('allograph', undefined)
    component = request.GET.get('component', undefined)
    feature = request.GET.get('feature', undefined)
    repository = request.GET.get('repository', undefined)
    index = request.GET.get('index', undefined)

    # Non staff users must not see graphs drawn on private images
    excluded_images = None
    from digipal.utils import is_staff
    if not is_staff(request):
        excluded_images = Image.filter_permissions(
            Image.objects.all(), [MediaPermission.PERM_PRIVATE])

    # Special values of the 'feature' parameter
    none = u'-1'
    one_or_more = u'-2'

    wheres = []

    if self.search_hands:
        graphs = Graph.objects.filter(
            hand__id__in=self.search_hands.queryset)
    else:
        # Although we are listing hands on the front-end, we search for
        # graphs and not for hands. Two reasons:
        #   * searching for character and allograph at the same time through
        #     a Hand model would generate two separate joins to graph;
        #     this would bring potentially invalid results and it is also
        #     much slower
        #   * it is faster than excluding all the hands without a graph
        #     (yet another expensive join)
        if term:
            term = term.replace('"', '')
            graphs = Graph.objects.filter(
                Q(hand__descriptions__description__icontains=term) |
                Q(hand__scribe__name__icontains=term) |
                Q(hand__assigned_place__name__icontains=term) |
                Q(hand__assigned_date__date__icontains=term) |
                Q(hand__item_part__current_item__shelfmark__icontains=term) |
                Q(hand__item_part__current_item__repository__name__icontains=term) |
                Q(hand__item_part__current_item__repository__place__name__icontains=term) |
                Q(hand__item_part__historical_items__catalogue_number__icontains=term) |
                # JIRA 423
                Q(hand__item_part__historical_items__name__icontains=term) |
                Q(hand__item_part__group__historical_items__name__icontains=term) |
                Q(hand__item_part__display_label__icontains=term) |
                Q(hand__item_part__group__display_label__icontains=term)
            )
        else:
            graphs = Graph.objects.all()

    if index:
        graphs = graphs.filter(
            hand__item_part__historical_items__catalogue_number__iexact=index)
    if repository:
        # expected format: "<place>,<repository name>"; any other format is
        # ignored
        matches = re.match(u'^([^,]+?),([^,]+)$', repository)
        if matches:
            graphs = graphs.filter(
                Q(hand__item_part__current_item__repository__place__name__iexact=matches.group(1).strip()) &
                Q(hand__item_part__current_item__repository__name__iexact=matches.group(2).strip()))
    if scribe:
        graphs = graphs.filter(hand__scribe__name__icontains=scribe)
    if script:
        graphs = graphs.filter(hand__script__name=script)
    if chartype:
        graphs = graphs.filter(
            idiograph__allograph__character__ontograph__ontograph_type__name=chartype)
    if character:
        graphs = graphs.filter(
            idiograph__allograph__character__name=character)
    if allograph:
        graphs = graphs.filter(idiograph__allograph__name=allograph)

    # we discard freak graph records (i.e. without annotation) to prevent
    # errors further down the line.
    graphs = graphs.filter(annotation__isnull=False)

    # if the user has no edit permission we exclude graphs where the
    # allograph is hidden
    from digipal.models import has_edit_permission
    if not has_edit_permission(request, self.get_model()):
        graphs = graphs.exclude(idiograph__allograph__hidden=True)

    # exclude private images
    if excluded_images and excluded_images.count():
        graphs = graphs.exclude(annotation__image__in=excluded_images)

    # condition on component
    if component:
        component_where = Q(graph_components__component__name=component)
        if feature in [undefined, none]:
            # If no feature is specified we find all the graphs which are
            # supposed to have a component according to their idiograph
            component_where = component_where | Q(
                idiograph__allograph__allograph_components__component__name=component)
        wheres.append(component_where)

    # condition on feature
    if feature not in [undefined, none, one_or_more]:
        wheres.append(Q(graph_components__features__name=feature))
    if feature in [one_or_more]:
        wheres.append(Q(graph_components__features__id__isnull=False))

    # ANDs all the Q() where clauses together
    if wheres:
        where_and = wheres[0]
        for where in wheres[1:]:
            where_and = where_and & where
        graphs = graphs.filter(where_and)

    # Treat the feature=none case
    if feature == none:
        excluded_q = Q(graph_components__features__id__isnull=False)
        if component:
            excluded_q = excluded_q & Q(
                graph_components__component__name=component)
        excluded_graphs = Graph.objects.filter(excluded_q)
        graphs = graphs.exclude(
            id__in=excluded_graphs.values_list('id', flat=True))

    from digipal.utils import set_left_joins_in_queryset
    set_left_joins_in_queryset(graphs)

    # Get the graphs then the id of all the related Hands.
    # We use values_list because it is much faster, we don't need to fetch
    # all the Hands at this stage. That will be done after pagination in the
    # template.
    # Distinct is needed here.
    chrono('graph filter:')
    graphs = graphs.distinct().order_by('hand__scribe__name', 'hand__id')
    chrono(':graph filter')

    chrono('graph values_list:')
    graph_ids = graphs.values_list('id', 'hand_id')
    chrono(':graph values_list')

    # Build a structure that groups all the graph ids by hand id.
    # A new group is started each time the hand id differs from the hand id
    # of the previous row.
    chrono('hand_ids:')
    hand_ids = []
    for graph_id, hand_id in graph_ids:
        if not hand_ids or hand_ids[-1][0] != hand_id:
            hand_ids.append([hand_id])
        hand_ids[-1].append(graph_id)
    chrono(':hand_ids')

    self.graphs_count = len(graph_ids)

    self._queryset = hand_ids
    return self._queryset
def set_search_results_to_context(request, context=None, allowed_type=None, show_advanced_search_form=False):
    '''
        Read the information posted through the search form and create the
        queryset for each relevant type of content (e.g. MS, Hand)
        => context['results']

        If the form was not valid or submitted, context['results'] is left
        undefined.

        Other context variables used by the search template are also set.

        request: the current request; the search parameters are read from
            request.GET
        context: the dictionary to fill in; it is modified in place.
            If None, a new dictionary is used (and discarded on return,
            as this function returns nothing).
        allowed_type: key of the only content type to search for.
            This is useful when we display a specific record page and we
            only have to search for the related content type to show the
            previous/next links. If None, all content types are searched.
        show_advanced_search_form: if True the search form is added to the
            context as context['advanced_search_form']
    '''
    # Never use a dict as the default value of the argument: it would be
    # shared and modified by all the calls which rely on the default.
    if context is None:
        context = {}

    context['terms'] = ''

    # pagination sizes
    set_page_sizes_to_context(request, context)

    # list of query parameter/form fields which can be changed without
    # triggering a search
    non_search_params = ['basic_search_type', 'from_link', 'result_type']
    context['submitted'] = any(
        param not in non_search_params and request.GET.get(param)
        for param in request.GET
    )

    context['can_edit'] = has_edit_permission(request, Hand)
    context['types'] = get_search_types(request)

    context['annotation_mode'] = request.GET.get('am', '0')

    context['view'] = request.GET.get('view', '')
    for content_type in context['types']:
        content_type.set_desired_view(context['view'])
        content_type.set_page_size(context['page_size'])

    context['search_types_display'] = get_search_types_display(
        context['types'])
    context['is_empty'] = True

    advanced_search_form = SearchPageForm(request.GET)
    advanced_search_form.fields['basic_search_type'].choices = [
        (content_type.key, content_type.label)
        for content_type in context['types']
    ]
    if show_advanced_search_form:
        context['advanced_search_form'] = advanced_search_form

    if advanced_search_form.is_valid():
        # Read the inputs
        # - term
        term = advanced_search_form.cleaned_data['terms']
        context['terms'] = term or ' '
        type_forms = [
            content_type.get_form(request)
            for content_type in context['types']
        ]
        context['query_summary'], context['query_summary_interactive'] = \
            get_query_summary(request, term, context['submitted'], type_forms)

        # - search type
        context['search_type'] = advanced_search_form.cleaned_data[
            'basic_search_type']
        # same as search_type but falls back to the first available type
        context['search_type_defaulted'] = context['search_type']
        if not context['search_type_defaulted']:
            if context['types']:
                context['search_type_defaulted'] = context['types'][0].key
            else:
                context['search_type_defaulted'] = '?'

        has_result = False

        if context['submitted']:
            # Create the queryset for each allowed content type.
            # If allowed_type is None, search for each supported content
            # type.
            for content_type in context['types']:
                if allowed_type in [None, content_type.key]:
                    hand_filters.chrono('Search %s:' % content_type.key)
                    context['results'] = content_type.build_queryset(
                        request, term, not has_result)
                    # explicit comparison kept on purpose: an unset (None)
                    # value must not count as a result
                    if content_type.is_empty == False:
                        has_result = True
                    hand_filters.chrono(':Search %s' % content_type.key)
def search_record_view(request):
    '''
        View for the search page.

        Runs the searches described by the query string of the request and
        renders the 'search/search_record.html' template.

        Requests rerouted by reroute_to_static_search() and old-style record
        URLs (with both 'id' and 'result_type' in the query string) are
        answered without running any search.
    '''
    ret = reroute_to_static_search(request)
    if ret:
        return ret

    hand_filters.chrono('SEARCH VIEW:')
    hand_filters.chrono('SEARCH LOGIC:')

    # Backward compatibility.
    # Previously all the record pages would go through this search URL and
    # view and their URL was:
    #  /digipal/search/?id=1&result_type=scribes&basic_search_type=hands&terms=Wulfstan
    # Now we redirect those requests to the record page
    #  /digipal/scribes/1/?basic_search_type=hands&terms=Wulfstan+&result_type=scribes
    qs_id = request.GET.get('id', '')
    qs_result_type = request.GET.get('result_type', '')
    if qs_id and qs_result_type:
        from django.shortcuts import redirect
        # TODO: get digipal from current project name or current URL
        redirect_url = '/%s/%s/%s/?%s' % ('digipal', qs_result_type,
                                          qs_id, request.META['QUERY_STRING'])
        return redirect(redirect_url)

    # backward compatibility:
    # query string params 'name' and 'scribes' have been renamed to 'scribe';
    # 'date' and 'place' are copied to the more specific parameters.
    # We work on a copy of request.GET so new values can be assigned.
    request.GET = request.GET.copy()
    request.GET['scribe'] = request.GET.get('scribe', '') or request.GET.get(
        'scribes', '') or request.GET.get('name', '')
    request.GET['ms_date'] = request.GET.get('ms_date', '') or request.GET.get(
        'date', '')
    request.GET['hand_date'] = request.GET.get(
        'hand_date', '') or request.GET.get('date', '')
    request.GET['scribe_date'] = request.GET.get(
        'scribe_date', '') or request.GET.get('date', '')
    request.GET['hand_place'] = request.GET.get(
        'hand_place', '') or request.GET.get('place', '')
    request.GET['scriptorium'] = request.GET.get(
        'scriptorium', '') or request.GET.get('place', '')

    # Actually run the searches
    context = {}
    context['nofollow'] = True

    set_search_results_to_context(
        request, context=context, show_advanced_search_form=True)

    # check if the search was executed or not (e.g. form not submitted or
    # invalid form)
    if 'results' in context:
        # Tab Selection Logic =
        #   we pick the tab the user has selected even if it is empty. END
        #   if none, we select the filter/advanced search content type
        #   if none or its result is empty we select the first non empty type
        #   if none we select the first type. END
        result_type = request.GET.get('result_type', '')

        # requested result type does not exist => ignore it
        known_keys = [content_type.key for content_type in context['types']]
        if result_type not in known_keys:
            result_type = ''

        if not result_type:
            first_non_empty_type = None
            for content_type in context['types']:
                if content_type.key == context['search_type'] and not content_type.is_empty:
                    result_type = context['search_type']
                    break
                if not first_non_empty_type and not content_type.is_empty:
                    first_non_empty_type = content_type.key
            if not result_type:
                result_type = first_non_empty_type

        result_type = result_type or context['types'][0].key
        context['result_type'] = result_type

        # No result at all?
        for content_type in context['types']:
            if not content_type.is_empty:
                context['is_empty'] = False

    from digipal import utils
    context['search_help_url'] = utils.get_cms_url_from_slug(
        getattr(settings, 'SEARCH_HELP_PAGE_SLUG', 'search_help'))

    # Initialise the advanced search forms
    page_options = get_search_page_js_data(
        context['types'],
        request.GET.get('from_link') in ('true', '1'),
        request)
    context['expanded_custom_filters'] = page_options[
        'advanced_search_expanded']
    page_options['linked_fields'] = []
    for content_type in context['types']:
        content_type.add_field_links(page_options['linked_fields'])
    context['search_page_options_json'] = json.dumps(page_options)
    for custom_filter in page_options['filters']:
        if custom_filter['key'] == context['search_type_defaulted']:
            context['filters_form'] = custom_filter

    # log the request along with the total number of results
    from digipal.models import RequestLog
    RequestLog.save_request(
        request,
        sum(content_type.count for content_type in context['types']))

    hand_filters.chrono(':SEARCH LOGIC')
    hand_filters.chrono('SEARCH TEMPLATE:')

    ret = render_to_response('search/search_record.html', context,
                             context_instance=RequestContext(request))

    hand_filters.chrono(':SEARCH TEMPLATE')
    hand_filters.chrono(':SEARCH VIEW')

    return ret
def search_ms_image_view(request):
    '''
        View for the Browse Image page.

        Filters the images with the optional 'town_or_city', 'repository'
        and 'date' parameters of the query string, removes the images the
        request is not allowed to see and renders the
        'search/search_ms_image.html' template.
    '''
    hand_filters.chrono('BROWSE:')

    from digipal.utils import request_invisible_model
    request_invisible_model('image', request, 'Image')

    hand_filters.chrono('search:')

    hand_filters.chrono('all():')
    images = Image.objects.all()
    hand_filters.chrono(':all()')

    from digipal.forms import FilterManuscriptsImages

    # Get Buttons
    context = {}
    context['view'] = request.GET.get('view', 'images')

    town_or_city = request.GET.get('town_or_city', '')
    repository = request.GET.get('repository', '')
    date = request.GET.get('date', '')

    set_page_sizes_to_context(request, context, [12, 20, 40, 100])

    # Applying filters
    if town_or_city:
        images = images.filter(
            item_part__current_item__repository__place__name=town_or_city)
    if repository:
        # repo is in two parts: repo place, repo name
        # (e.g. cambridge, corpus christi college)
        # but we also support old style URLs which have only the name of the
        # repo. If we don't, crawlers like Googlebot could receive a 500
        # error (see JIRA DIGIPAL-483).
        # Note that split() always returns at least one part.
        repo_parts = [part.strip() for part in repository.split(',')]
        images = images.filter(
            item_part__current_item__repository__name=repo_parts[-1])
        if len(repo_parts) > 1:
            images = images.filter(
                item_part__current_item__repository__place__name=repo_parts[0])
    if date:
        images = images.filter(hands__assigned_date__date=date)

    images = images.filter(item_part_id__gt=0)

    # permission filter
    # OPT: on DigiPal prefetch_related of annotation_set takes 20s and it
    # retrieves all the fields even related to linked tables (allograph,
    # character, etc.). Same with hands which takes around 2/3 s.
    # This is why prefetch_related('hands', 'annotation_set') is not used
    # here.
    images = Image.filter_permissions_from_request(images, request)

    hand_filters.chrono(':search')

    # count hands
    hand_filters.chrono('hands:')
    from django.db.models import Count
    context['images'] = Image.sort_query_set_by_locus(
        images.select_related('item_part__current_item__repository__place')
        .annotate(hand_count=Count('hands')))
    # closing marker: ':hands' (and not 'hands:' which would open the same
    # timer a second time)
    hand_filters.chrono(':hands')

    image_search_form = FilterManuscriptsImages(request.GET)
    context['image_search_form'] = image_search_form
    context['query_summary'], context['query_summary_interactive'] = \
        get_query_summary(request, '', True, [image_search_form])

    hand_filters.chrono('template:')
    ret = render_to_response('search/search_ms_image.html', context,
                             context_instance=RequestContext(request))
    hand_filters.chrono(':template')

    hand_filters.chrono(':BROWSE')

    return ret
def populate_index(self, ct, index=None):
    """
    Add a document to the Whoosh index of ``ct`` for each of its records.

    Steps:
        1. ``ct.prepare_value_rankings()`` generates the sort rankings;
        2. all the records returned by ``ct.get_all_records(True)`` are
           converted with ``ct.get_document_from_record()`` and added to
           the index, except those rejected by the optional 'condition'
           option of ``ct``.

    Progress is reported with ``self.write_state_update()``: up to 1/3
    during step 1, from 1/3 to 2/3 during step 2.

    ``index``: any value passed by the caller is replaced by the result of
    ``ct.get_whoosh_index()`` as soon as the first record is processed.
    """
    chrono('POPULATE_INDEX:')

    # Add documents to the index
    print('\tgenerate sort rankings')

    chrono('RANK_VALUES:')

    def on_ranking_progress(progress):
        # never report less than 0.001
        return self.write_state_update(
            ct, max(0.001, 1.0 / 3.0 * progress))

    ct.prepare_value_rankings(callback=on_ranking_progress)
    chrono(':RANK_VALUES')

    chrono('INDEXING QUERY:')
    print('\tretrieve all records')
    dputils.gc_collect()

    # NOTE: imported but not used, see the comment about BufferedWriter below
    from whoosh.writing import BufferedWriter
    all_records = ct.get_all_records(True)
    record_count = all_records.count()
    writer = None
    chrono(':INDEXING QUERY')

    print('\tadd records to index')

    commit_size = 500
    progress_size = 200

    chrono('INDEXING:')
    chrono('First record:')

    record_condition = ct.get_option('condition', None)
    pbar = dputils.ProgressBar(record_count)

    # Indexing can use n x 100 MB
    # Which can be excessive for small VMs
    # One technique is to create small, independent index segments
    # Then optimise them outside this fct on a separate index
    for position, record in enumerate(all_records.iterator(), 1):
        if position == 1:
            chrono(':First record')
        pbar.update(position)

        # True for the 1st, 501st, 1001st, ... record
        if (position - 1) % commit_size == 0:
            # we have to commit every x documents otherwise the memory
            # saturates on the VM.
            # BufferedWriter is buggy and will crash after a few 100x docs
            if writer:
                writer.commit(merge=False)

            # we have to recreate after commit because commit unlocks the
            # index. References are dropped before the garbage collection.
            writer = None
            index = None
            dputils.gc_collect()

            index = ct.get_whoosh_index()
            writer = index.writer()

        if record_condition and not record_condition(record):
            continue

        document = ct.get_document_from_record(record)
        writer.add_document(**document)

        if position % progress_size == 0:
            self.write_state_update(
                ct, (1 + 1.0 * position / record_count) * 1.0 / 3)

    # commit the documents added since the last intermediate commit
    if writer:
        writer.commit(merge=False)

    pbar.complete()
    chrono(':INDEXING')

    print('\n')

    chrono(':POPULATE_INDEX')

    print('\tdone (%s records)' % record_count)