def dump_oai_registry(self, dumpRecords=True, dumpSets=True, dumpMetadataFormats=True):
    """Restore the OAI registry fixture and, optionally, its dependent fixtures.

    The registry collection is expected to be empty before the restore and
    non-empty afterwards. The identify dump is always restored; the other
    dumps are restored according to the flags.

    :param dumpRecords: also restore the OAI record dump.
    :param dumpSets: also restore the OAI set dump.
    :param dumpMetadataFormats: also restore the OAI metadata format dump.
    """
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(len(OaiRegistry.objects()), 0)
    self.restoreDump(join(DUMP_OAI_PMH_TEST_PATH, 'oai_registry.bson'), 'oai_registry')
    # assertGreater reports the actual count when the restore produced nothing.
    self.assertGreater(len(OaiRegistry.objects()), 0)
    self.dump_oai_identify()
    if dumpMetadataFormats:
        self.dump_oai_metadata_format()
    if dumpSets:
        self.dump_oai_set()
    if dumpRecords:
        self.dump_oai_record()
def get_metadata_formats_detail(request):
    """Render the detail view for a selection of OAI-PMH metadata formats.

    The ``metadataFormats`` GET parameter is expected to hold a JSON object
    with an ``'oai-pmh'`` entry (metadata format primary keys) and an optional
    ``'local'`` entry. A missing or malformed parameter is treated as an
    empty selection.

    :param request: the incoming HTTP request.
    :return: an ``HttpResponse`` whose body is the JSON-encoded rendered
        template, served as ``application/javascript``.
    """
    template = loader.get_template('oai_pmh/explore/explore_metadata_formats_detail.html')
    try:
        # Get metadata formats
        infos = json.loads(request.GET['metadataFormats'])
        metadataFormats = infos['oai-pmh']
        localTemplate = infos['local'] if 'local' in infos else None
    except (KeyError, ValueError, TypeError):
        # KeyError: parameter or 'oai-pmh' entry missing; ValueError: invalid
        # JSON; TypeError: the decoded payload is not a mapping.
        metadataFormats = []
        localTemplate = None

    list_metadata_formats = OaiMetadataFormat.objects(pk__in=metadataFormats).all()
    list_metadata_formats_info = []
    for metadataFormat in list_metadata_formats:
        # NOTE(review): .get() raises when the owning registry is deactivated
        # or missing - confirm that this is the intended behavior.
        registry = OaiRegistry.objects(isDeactivated=False).only('name').get(pk=metadataFormat.registry)
        list_metadata_formats_info.append({
            'registry': registry.name,
            'metadataPrefix': metadataFormat.metadataPrefix,
            'schema': metadataFormat.schema,
        })

    context = RequestContext(request, {
        'list_metadata_formats_info': list_metadata_formats_info,
        'local': localTemplate,
    })
    return HttpResponse(json.dumps(template.render(context)), content_type='application/javascript')
def _get_metadata_formats_id(schemas, user_schemas, registries):
    """Return the distinct IDs of the metadata formats matching the selection.

    :param schemas: template titles resolved through their template versions.
    :param user_schemas: titles of user-defined templates, matched directly.
    :param registries: registry IDs to restrict the search to; when empty,
        every registry that is not deactivated is considered.
    :return: the distinct ``id`` values of the matching ``OaiMetadataFormat``.
    """
    # User-defined templates: their own IDs, as strings.
    user_template_ids = [
        str(template_id)
        for template_id in Template.objects(title__in=user_schemas).distinct(field="id")
    ]

    # Other templates: go through the version managers.
    version_manager_ids = Template.objects(title__in=schemas).distinct(field="templateVersion")
    known_versions = TemplateVersion.objects(
        pk__in=version_manager_ids, isDeleted=False).distinct(field="versions")
    removed_versions = TemplateVersion.objects(
        pk__in=version_manager_ids, isDeleted=False).distinct(field="deletedVersions")
    # Keep every version that has not been deleted.
    live_version_ids = list(set(known_versions).difference(removed_versions))

    templates_id = user_template_ids + live_version_ids

    if len(registries) != 0:
        # Registries come from the refinement.
        return OaiMetadataFormat.objects(
            template__in=templates_id,
            registry__in=registries).distinct(field="id")

    # No refinement: exclude the metadata formats of deactivated registries.
    deactivated_ids = [
        str(registry.id)
        for registry in OaiRegistry.objects(isDeactivated=True).order_by('id')
    ]
    return OaiMetadataFormat.objects(
        template__in=templates_id,
        registry__not__in=deactivated_ids).distinct(field="id")
def get_metadata_formats_detail(request):
    """Render the detail view for a selection of OAI-PMH metadata formats.

    The ``metadataFormats`` GET parameter is expected to hold a JSON object
    with an ``"oai-pmh"`` entry (metadata format primary keys) and an optional
    ``"local"`` entry. A missing or malformed parameter is treated as an
    empty selection.

    :param request: the incoming HTTP request.
    :return: an ``HttpResponse`` whose body is the JSON-encoded rendered
        template, served as ``application/javascript``.
    """
    template = loader.get_template("oai_pmh/explore/explore_metadata_formats_detail.html")
    try:
        # Get metadata formats
        infos = json.loads(request.GET["metadataFormats"])
        metadataFormats = infos["oai-pmh"]
        localTemplate = infos["local"] if "local" in infos else None
    except (KeyError, ValueError, TypeError):
        # KeyError: parameter or "oai-pmh" entry missing; ValueError: invalid
        # JSON; TypeError: the decoded payload is not a mapping.
        metadataFormats = []
        localTemplate = None

    list_metadata_formats = OaiMetadataFormat.objects(pk__in=metadataFormats).all()
    list_metadata_formats_info = []
    for metadataFormat in list_metadata_formats:
        # NOTE(review): .get() raises when the owning registry is deactivated
        # or missing - confirm that this is the intended behavior.
        registry = OaiRegistry.objects(isDeactivated=False).only("name").get(pk=metadataFormat.registry)
        list_metadata_formats_info.append(
            {
                "registry": registry.name,
                "metadataPrefix": metadataFormat.metadataPrefix,
                "schema": metadataFormat.schema,
            }
        )

    context = RequestContext(
        request, {"list_metadata_formats_info": list_metadata_formats_info, "local": localTemplate}
    )
    return HttpResponse(json.dumps(template.render(context)), content_type="application/javascript")
def test_oai_pmh_admin_with_data(self):
    """The OAI-PMH admin page lists every registry restored from the dump."""
    self.dump_oai_registry()
    url = '/oai_pmh/admin/oai-pmh'
    r = self.doRequestGetAdminClientLogged(url=url)
    self.isStatusOK(r.status_code)
    self.assertIsNotNone(r.content)
    # Context layer the assertions read the view variables from.
    view_context = r.context[0].dicts[1]
    self.assertIsNotNone(view_context.get('registry_form'))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(len(view_context.get('registries')), len(OaiRegistry.objects()))
def __init__(self):
    """Build the form and fill in the data provider choices.

    ``metadataprefix`` and ``set`` are assigned the choice list while it only
    holds the 'Pick one' placeholder; ``dataProvider`` is assigned it after
    one entry per active registry has been added.

    NOTE(review): whether the first two fields later see the registry entries
    depends on whether the field copies the list it is given - confirm.
    """
    super(RequestForm, self).__init__()
    self.dataproviders = [('0', 'Pick one')]
    for placeholder_only_field in ('metadataprefix', 'set'):
        self.fields[placeholder_only_field].choices = self.dataproviders
    # The choice value packs "<registry id>|<registry url>".
    self.dataproviders.extend(
        (str(registry.id) + '|' + registry.url, str(registry.name))
        for registry in OaiRegistry.objects(isDeactivated=False).all()
    )
    self.fields['dataProvider'].choices = self.dataproviders
def __init__(self):
    """Initialise the request form choices.

    The 'Pick one' placeholder list is handed to the ``metadataprefix`` and
    ``set`` fields first; every active registry is then appended to that same
    list before it is handed to the ``dataProvider`` field.

    NOTE(review): the first two fields receive the list object before it is
    extended; what they end up showing depends on the field copying its
    choices - confirm.
    """
    super(RequestForm, self).__init__()
    form_fields = self.fields
    self.dataproviders = [('0', 'Pick one')]
    form_fields['metadataprefix'].choices = self.dataproviders
    form_fields['set'].choices = self.dataproviders
    active_registries = OaiRegistry.objects(isDeactivated=False).all()
    for provider in active_registries:
        # Value is "<id>|<url>", label is the registry name.
        choice_value = str(provider.id) + '|' + provider.url
        choice_label = str(provider.name)
        self.dataproviders.append((choice_value, choice_label))
    form_fields['dataProvider'].choices = self.dataproviders
def init_harvest():
    """Reset the harvesting state and start the harvest watcher.

    Purges every task, clears the ``isQueued`` flag of each registry that is
    not deactivated (it may be stale after a server reboot following an
    issue), then schedules ``watch_harvest_task``.
    """
    # Kill all tasks
    purge_all_tasks()
    # Reset isQueued on every active registry
    for active_registry in OaiRegistry.objects(isDeactivated=False).all():
        active_registry.isQueued = False
        active_registry.save()
    # Start the task that checks whether a registry needs to be harvested
    watch_harvest_task.apply_async()
def __init__(self, userId=""):
    """Build the keyword form with one choice per active registry.

    :param userId: accepted for the caller's convenience; not used here.
    """
    self.SCHEMAS_OPTIONS = []
    # All registries (data providers) that are not deactivated, by name
    active_registries = OaiRegistry.objects(isDeactivated=False).order_by('name')
    self.REGISTRIES_OPTIONS = [(entry.id, entry.name) for entry in active_registries]

    super(KeywordForm, self).__init__()

    registries_field = self.fields['my_registries']
    registries_field.choices = []
    registries_field.choices = self.REGISTRIES_OPTIONS
    self.my_registries_nb = len(self.REGISTRIES_OPTIONS)
def watch_harvest_task():
    """Queue a harvest for every active registry that needs one, then re-arm.

    A registry is queued when its ``harvest`` flag is set and it is not
    already marked ``isQueued``. The task reschedules itself with a
    10-second countdown.

    :return: one sentence per queued registry, or a message saying that
        nothing had to be harvested.
    """
    registries = OaiRegistry.objects(isDeactivated=False).all()
    queued_messages = []
    # Launch the background task for each registry
    for registry in registries:
        # Only when a harvest is needed and no task exists yet for this registry
        if registry.harvest and not registry.isQueued:
            queued_messages.append(
                "Registry {!s} need to be updated and harvested.".format(
                    registry.name.encode("utf-8")))
            harvest_task.apply_async((str(registry.id),))
            registry.isQueued = True
            registry.save()
    # Periodic call every 10 seconds
    watch_harvest_task.apply_async(countdown=10)
    # The "nothing to do" message is only returned when nothing was queued;
    # it used to be prepended unconditionally, contradicting the rest.
    if not queued_messages:
        return "No new registries need to be updated and harvested"
    return " ".join(queued_messages)
def __init__(self, listRegistriesId=None):
    """Build the metadata format choices for the given registries.

    Metadata formats of the listed registries are grouped by ``hash``; each
    group becomes one choice whose value is a JSON document listing the
    grouped metadata format IDs (plus the local template title when linked).

    :param listRegistriesId: IDs of the registries to consider; ``None`` or
        omitted means no registry.
    """
    # A None default avoids sharing one mutable list across calls.
    if listRegistriesId is None:
        listRegistriesId = []
    self.SCHEMAS_OPTIONS = []
    # Retrieve registries name
    # NOTE(review): the mapping is never read below, but the lookup raises
    # for an unknown registry ID, so it is kept - confirm whether it is needed.
    registriesName = {}
    for registryId in listRegistriesId:
        obj = OaiRegistry.objects(pk=registryId).get()
        registriesName[str(registryId)] = obj.name
    # We retrieve all common schemas
    schemas = OaiMetadataFormat.objects(registry__in=listRegistriesId).order_by('metadataPrefix')
    # NOTE(review): groupby only merges adjacent items, and the query is
    # ordered by metadataPrefix while the key is hash - confirm that equal
    # hashes are always adjacent.
    groups = [list(g) for k, g in groupby(schemas, lambda x: x.hash)]
    # For each group
    for group in groups:
        # Get metadata prefix
        name = group[0].metadataPrefix
        # Get template name
        template = group[0].template
        listValues = [str(elt.id) for elt in group]
        # Provide information about the number of registries using this MF.
        # The prefix is passed as an argument so that format_html escapes it
        # and braces in it cannot break the formatting.
        if len(listValues) == 1:
            name = format_html("{}<br> (in 1 Registry)", name)
        else:
            name = format_html("{}<br> (in {} Registries)", name, len(listValues))
        # If it's linked to a template
        if template is not None:
            name += format_html(" <text class='local'> + Local </text>")
            template = Template.objects.only('id', 'title').get(pk=template.id)
            t = json.dumps({'oai-pmh': listValues, 'local': template.title})
        else:
            t = json.dumps({'oai-pmh': listValues})
        self.SCHEMAS_OPTIONS.append((t, name))

    super(MetadataFormatsForm, self).__init__()
    self.fields['my_schemas'].choices = []
    self.fields['my_schemas'].choices = self.SCHEMAS_OPTIONS
    self.my_schemas_nb = len(self.SCHEMAS_OPTIONS)
def get_results_by_instance_keyword(request):
    """Run a keyword search over OAI-PMH records for the explore view.

    Reads ``keyword``, ``schemas[]``, ``userSchemas[]``, ``refinements[]`` and
    the optional ``onlySuggestions`` flag from ``request.GET``, resolves the
    matching metadata formats (skipping deactivated registries) and calls
    ``OaiRecord.executeFullTextQuery``. With ``onlySuggestions`` false, every
    hit is transformed with XSLT and rendered into ``resultString``;
    otherwise keyword suggestions from the first 20 hits are collected in
    ``resultsByKeyword``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source and the visible part ends without a
    ``return`` - confirm both against the original file.
    """
    print 'BEGIN def getResultsKeyword(request)'
    resultsByKeyword = []
    results = []
    resultString = ""

    # Describe the local instance and store it in the session
    json_instances = []
    if 'HTTPS' in request.META['SERVER_PROTOCOL']:
        protocol = "https"
    else:
        protocol = "http"
    instance = Instance(name="Local", protocol=protocol, address=request.META['REMOTE_ADDR'], port=request.META['SERVER_PORT'], access_token="token", refresh_token="token")
    json_instances.append(instance.to_json())
    request.session['instancesExplore'] = json_instances
    # Not read again in the visible part of the function
    sessionName = "resultsExploreOaiPMh" + instance['name']

    try:
        keyword = request.GET['keyword']
        schemas = request.GET.getlist('schemas[]')
        userSchemas = request.GET.getlist('userSchemas[]')
        refinements = refinements_to_mongo(request.GET.getlist('refinements[]'))
        if 'onlySuggestions' in request.GET:
            onlySuggestions = json.loads(request.GET['onlySuggestions'])
        else:
            onlySuggestions = False
    except:
        # Any failure above falls back to an empty, suggestions-only search.
        # NOTE(review): the bare except also hides unrelated errors.
        keyword = ''
        schemas = []
        userSchemas = []
        refinements = {}
        onlySuggestions = True

    # We get all template versions for the given schemas
    # First, we take care of user defined schemas
    templatesIDUser = Template.objects(title__in=userSchemas).distinct(field="id")
    templatesIDUser = [str(x) for x in templatesIDUser]
    # Take care of the rest, with versions
    templatesVersions = Template.objects(title__in=schemas).distinct(field="templateVersion")
    # We get all templates ID, for all versions
    allTemplatesIDCommon = TemplateVersion.objects(pk__in=templatesVersions, isDeleted=False).distinct(field="versions")
    # We remove the deleted versions
    allTemplatesIDCommonRemoved = TemplateVersion.objects(pk__in=templatesVersions, isDeleted=False).distinct(field="deletedVersions")
    templatesIDCommon = list(set(allTemplatesIDCommon) - set(allTemplatesIDCommonRemoved))
    templatesID = templatesIDUser + templatesIDCommon
    # We retrieve deactivated registries so as not to get their metadata formats
    deactivatedRegistries = [str(x.id) for x in OaiRegistry.objects(isDeactivated=True).order_by('id')]
    metadataFormatsID = OaiMetadataFormat.objects(template__in=templatesID, registry__not__in=deactivatedRegistries).distinct(field="id")
    instanceResults = OaiRecord.executeFullTextQuery(keyword, metadataFormatsID, refinements)
    if len(instanceResults) > 0:
        if not onlySuggestions:
            # Default XSLT and result template, only needed when rendering
            xsltPath = os.path.join(settings.SITE_ROOT, 'static/resources/xsl/xml2html.xsl')
            xslt = etree.parse(xsltPath)
            transform = etree.XSLT(xslt)
            template = loader.get_template('oai_pmh/explore/explore_result_keyword.html')

        # Retrieve schemas and registries once, instead of once per result
        registriesName = {}
        objMetadataFormats = {}
        listRegistriesID = set([x['registry'] for x in instanceResults])
        for registryId in listRegistriesID:
            obj = OaiRegistry.objects(pk=registryId).get()
            registriesName[str(registryId)] = obj.name
        listSchemaId = set([x['metadataformat'] for x in instanceResults])
        for schemaId in listSchemaId:
            obj = OaiMetadataFormat.objects(pk=schemaId).get()
            objMetadataFormats[str(schemaId)] = obj

        # listItems is not used in the visible part of the function
        listItems = []
        # Local aliases for names used inside the result loop
        xmltodictunparse = xmltodict.unparse
        appendResult = results.append
        toXML = etree.XML
        parse = etree.parse
        XSLT = etree.XSLT
        if not onlySuggestions:
            for instanceResult in instanceResults:
                custom_xslt = False
                appendResult({'title':instanceResult['identifier'], 'content':xmltodictunparse(instanceResult['metadata']),'id':str(instanceResult['_id'])})
                dom = toXML(str(xmltodictunparse(instanceResult['metadata']).encode('utf-8')))
                # Check if a custom list result XSLT has to be used
                try:
                    metadataFormat = objMetadataFormats[str(instanceResult['metadataformat'])]
                    if metadataFormat.template.ResultXsltList:
                        listXslt = parse(BytesIO(metadataFormat.template.ResultXsltList.content.encode('utf-8')))
                        listTransform = XSLT(listXslt)
                        newdom = listTransform(dom)
                        custom_xslt = True
                    else:
                        newdom = transform(dom)
                except Exception, e:
                    # We use the default one
                    newdom = transform(dom)
                    custom_xslt = False

                # NOTE(review): if the lookup at the top of the try block
                # raised, metadataFormat is unbound or left over from the
                # previous iteration here.
                context = RequestContext(request, {'id':str(instanceResult['_id']),
                                                   'xml': str(newdom),
                                                   'title': instanceResult['identifier'],
                                                   'custom_xslt': custom_xslt,
                                                   'schema_name': metadataFormat.metadataPrefix,
                                                   'registry_name': registriesName[instanceResult['registry']],
                                                   'oai_pmh': True})
                resultString+= template.render(context)
        else:
            # Suggestions only: look at the first 20 results
            for instanceResult in instanceResults[:20]:
                # Each keyword word matches itself or itself followed by more word characters
                wordList = re.sub("[^\w]", " ", keyword).split()
                wordList = [x + "|" + x +"\w+" for x in wordList]
                wordList = '|'.join(wordList)
                listWholeKeywords = re.findall("\\b("+ wordList +")\\b", xmltodict.unparse(instanceResult['metadata']).encode('utf-8'), flags=re.IGNORECASE)
                labels = list(set(listWholeKeywords))

                for label in labels:
                    label = label.lower()
                    result_json = {}
                    result_json['label'] = label
                    result_json['value'] = label
                    # Keep each suggestion only once
                    if not result_json in resultsByKeyword:
                        resultsByKeyword.append(result_json)
def get_results_by_instance_keyword(request):
    """Run a keyword search over OAI-PMH records for the explore view.

    Reads ``keyword``, ``schemas[]``, ``userSchemas[]``, ``refinements[]`` and
    the optional ``onlySuggestions`` flag from ``request.GET``, resolves the
    matching metadata formats (skipping deactivated registries) and calls
    ``OaiRecord.executeFullTextQuery``. With ``onlySuggestions`` false, every
    hit is transformed with XSLT and rendered into ``resultString``;
    otherwise keyword suggestions from the first 20 hits are collected in
    ``resultsByKeyword``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source and the visible part ends without a
    ``return`` - confirm both against the original file.
    """
    print 'BEGIN def getResultsKeyword(request)'
    resultsByKeyword = []
    results = []
    resultString = ""

    # Describe the local instance and store it in the session
    json_instances = []
    if 'HTTPS' in request.META['SERVER_PROTOCOL']:
        protocol = "https"
    else:
        protocol = "http"
    instance = Instance(name="Local",
                        protocol=protocol,
                        address=request.META['REMOTE_ADDR'],
                        port=request.META['SERVER_PORT'],
                        access_token="token",
                        refresh_token="token")
    json_instances.append(instance.to_json())
    request.session['instancesExplore'] = json_instances
    # Not read again in the visible part of the function
    sessionName = "resultsExploreOaiPMh" + instance['name']

    try:
        keyword = request.GET['keyword']
        schemas = request.GET.getlist('schemas[]')
        userSchemas = request.GET.getlist('userSchemas[]')
        refinements = refinements_to_mongo(
            request.GET.getlist('refinements[]'))
        if 'onlySuggestions' in request.GET:
            onlySuggestions = json.loads(request.GET['onlySuggestions'])
        else:
            onlySuggestions = False
    except:
        # Any failure above falls back to an empty, suggestions-only search.
        # NOTE(review): the bare except also hides unrelated errors.
        keyword = ''
        schemas = []
        userSchemas = []
        refinements = {}
        onlySuggestions = True

    # We get all template versions for the given schemas
    # First, we take care of user defined schemas
    templatesIDUser = Template.objects(title__in=userSchemas).distinct(
        field="id")
    templatesIDUser = [str(x) for x in templatesIDUser]
    # Take care of the rest, with versions
    templatesVersions = Template.objects(title__in=schemas).distinct(
        field="templateVersion")
    # We get all templates ID, for all versions
    allTemplatesIDCommon = TemplateVersion.objects(
        pk__in=templatesVersions, isDeleted=False).distinct(field="versions")
    # We remove the deleted versions
    allTemplatesIDCommonRemoved = TemplateVersion.objects(
        pk__in=templatesVersions,
        isDeleted=False).distinct(field="deletedVersions")
    templatesIDCommon = list(
        set(allTemplatesIDCommon) - set(allTemplatesIDCommonRemoved))
    templatesID = templatesIDUser + templatesIDCommon
    # We retrieve deactivated registries so as not to get their metadata formats
    deactivatedRegistries = [
        str(x.id)
        for x in OaiRegistry.objects(isDeactivated=True).order_by('id')
    ]
    metadataFormatsID = OaiMetadataFormat.objects(
        template__in=templatesID,
        registry__not__in=deactivatedRegistries).distinct(field="id")
    instanceResults = OaiRecord.executeFullTextQuery(keyword,
                                                     metadataFormatsID,
                                                     refinements)
    if len(instanceResults) > 0:
        if not onlySuggestions:
            # Default XSLT and result template, only needed when rendering
            xsltPath = os.path.join(settings.SITE_ROOT,
                                    'static/resources/xsl/xml2html.xsl')
            xslt = etree.parse(xsltPath)
            transform = etree.XSLT(xslt)
            template = loader.get_template(
                'oai_pmh/explore/explore_result_keyword.html')

        # Retrieve schemas and registries once, instead of once per result
        registriesName = {}
        objMetadataFormats = {}
        listRegistriesID = set([x['registry'] for x in instanceResults])
        for registryId in listRegistriesID:
            obj = OaiRegistry.objects(pk=registryId).get()
            registriesName[str(registryId)] = obj.name
        listSchemaId = set([x['metadataformat'] for x in instanceResults])
        for schemaId in listSchemaId:
            obj = OaiMetadataFormat.objects(pk=schemaId).get()
            objMetadataFormats[str(schemaId)] = obj

        # listItems is not used in the visible part of the function
        listItems = []
        # Local aliases for names used inside the result loop
        xmltodictunparse = xmltodict.unparse
        appendResult = results.append
        toXML = etree.XML
        parse = etree.parse
        XSLT = etree.XSLT
        if not onlySuggestions:
            for instanceResult in instanceResults:
                custom_xslt = False
                appendResult({
                    'title': instanceResult['identifier'],
                    'content': xmltodictunparse(instanceResult['metadata']),
                    'id': str(instanceResult['_id'])
                })
                dom = toXML(
                    str(
                        xmltodictunparse(
                            instanceResult['metadata']).encode('utf-8')))
                # Check if a custom list result XSLT has to be used
                try:
                    metadataFormat = objMetadataFormats[str(
                        instanceResult['metadataformat'])]
                    if metadataFormat.template.ResultXsltList:
                        listXslt = parse(
                            BytesIO(
                                metadataFormat.template.ResultXsltList.content.
                                encode('utf-8')))
                        listTransform = XSLT(listXslt)
                        newdom = listTransform(dom)
                        custom_xslt = True
                    else:
                        newdom = transform(dom)
                except Exception, e:
                    # We use the default one
                    newdom = transform(dom)
                    custom_xslt = False

                # NOTE(review): if the lookup at the top of the try block
                # raised, metadataFormat is unbound or left over from the
                # previous iteration here.
                context = RequestContext(
                    request, {
                        'id': str(instanceResult['_id']),
                        'xml': str(newdom),
                        'title': instanceResult['identifier'],
                        'custom_xslt': custom_xslt,
                        'schema_name': metadataFormat.metadataPrefix,
                        'registry_name':
                        registriesName[instanceResult['registry']],
                        'oai_pmh': True
                    })
                resultString += template.render(context)
        else:
            # Suggestions only: look at the first 20 results
            for instanceResult in instanceResults[:20]:
                # Each keyword word matches itself or itself followed by more word characters
                wordList = re.sub("[^\w]", " ", keyword).split()
                wordList = [x + "|" + x + "\w+" for x in wordList]
                wordList = '|'.join(wordList)
                listWholeKeywords = re.findall(
                    "\\b(" + wordList + ")\\b",
                    xmltodict.unparse(
                        instanceResult['metadata']).encode('utf-8'),
                    flags=re.IGNORECASE)
                labels = list(set(listWholeKeywords))

                for label in labels:
                    label = label.lower()
                    result_json = {}
                    result_json['label'] = label
                    result_json['value'] = label
                    # Keep each suggestion only once
                    if not result_json in resultsByKeyword:
                        resultsByKeyword.append(result_json)
def get_results_by_instance_keyword(request):
    """Run a keyword search over OAI-PMH records for the explore view.

    Reads ``keyword``, ``schemas[]``, ``userSchemas[]``, ``refinements``
    (JSON), ``registries[]`` and the optional ``onlySuggestions`` flag from
    ``request.POST``, resolves the metadata formats through
    ``_get_metadata_formats_id`` and calls ``OaiRecord.executeFullTextQuery``.
    With ``onlySuggestions`` false, every hit is transformed with XSLT and
    rendered into ``resultString``; otherwise keyword suggestions from the
    first 20 hits are collected in ``resultsByKeyword``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source and the visible part ends without a
    ``return`` - confirm both against the original file.
    """
    print 'BEGIN def getResultsKeyword(request)'
    resultsByKeyword = []
    results = []
    resultString = ""

    # Describe the local instance and store it in the session
    json_instances = []
    if 'HTTPS' in request.META['SERVER_PROTOCOL']:
        protocol = "https"
    else:
        protocol = "http"
    instance = Instance(name="Local",
                        protocol=protocol,
                        address=request.META['REMOTE_ADDR'],
                        port=request.META['SERVER_PORT'],
                        access_token="token",
                        refresh_token="token")
    json_instances.append(instance.to_json())
    request.session['instancesExplore'] = json_instances
    # Not read again in the visible part of the function
    sessionName = "resultsExploreOaiPMh" + instance['name']

    # Search parameters, each with an empty default when absent
    keyword = request.POST.get('keyword', '')
    schemas = request.POST.getlist('schemas[]', [])
    user_schemas = request.POST.getlist('userSchemas[]', [])
    refinements = refinements_to_mongo(
        json.loads(request.POST.get('refinements', '{}')))
    registries = request.POST.getlist('registries[]', [])
    if 'onlySuggestions' in request.POST:
        onlySuggestions = json.loads(request.POST['onlySuggestions'])
    else:
        onlySuggestions = False

    metadata_format_ids = _get_metadata_formats_id(schemas=schemas,
                                                   user_schemas=user_schemas,
                                                   registries=registries)
    instanceResults = OaiRecord.executeFullTextQuery(keyword,
                                                     metadata_format_ids,
                                                     refinements)
    if len(instanceResults) > 0:
        if not onlySuggestions:
            # Default XSLT and result template, only needed when rendering
            xsltPath = os.path.join(settings.SITE_ROOT,
                                    'static/resources/xsl/xml2html.xsl')
            xslt = etree.parse(xsltPath)
            transform = etree.XSLT(xslt)
            template = loader.get_template(
                'oai_pmh/explore/explore_result_keyword.html')

        # Retrieve schemas and registries once, instead of once per result
        registriesName = {}
        objMetadataFormats = {}
        listRegistriesID = set([x['registry'] for x in instanceResults])
        registriesURL = {}
        for registryId in listRegistriesID:
            obj = OaiRegistry.objects(pk=registryId).get()
            registriesName[str(registryId)] = obj.name
            registriesURL[str(registryId)] = obj.url
        listSchemaId = set([x['metadataformat'] for x in instanceResults])
        for schemaId in listSchemaId:
            obj = OaiMetadataFormat.objects(pk=schemaId).get()
            objMetadataFormats[str(schemaId)] = obj

        # listItems is not used in the visible part of the function
        listItems = []
        # Local aliases for names used inside the result loop
        xmltodictunparse = XMLdata.unparse
        appendResult = results.append
        toXML = etree.XML
        parse = etree.parse
        XSLT = etree.XSLT
        if not onlySuggestions:
            for instanceResult in instanceResults:
                custom_xslt = False
                appendResult({
                    'title': instanceResult['identifier'],
                    'content': xmltodictunparse(instanceResult['metadata']),
                    'id': str(instanceResult['_id'])
                })
                dom = toXML(
                    str(
                        xmltodictunparse(
                            instanceResult['metadata']).encode('utf-8')))
                # Check if a custom list result XSLT has to be used
                try:
                    metadataFormat = objMetadataFormats[str(
                        instanceResult['metadataformat'])]
                    if metadataFormat.template.ResultXsltList:
                        listXslt = parse(
                            BytesIO(
                                metadataFormat.template.ResultXsltList.content.
                                encode('utf-8')))
                        listTransform = XSLT(listXslt)
                        newdom = listTransform(dom)
                        custom_xslt = True
                    else:
                        newdom = transform(dom)
                except Exception, e:
                    # We use the default one
                    newdom = transform(dom)
                    custom_xslt = False

                # Registry names longer than 30 characters are cut and
                # suffixed with "..."
                registry_name = registriesName[instanceResult['registry']]
                if len(registry_name) > 30:
                    registry_name = "{0}...".format(registry_name[:30])

                # Only the scheme and network location of the registry URL
                # are passed to the template
                url = urlparse(registriesURL[instanceResult['registry']])
                # NOTE(review): if the lookup at the top of the try block
                # raised, metadataFormat is unbound or left over from the
                # previous iteration here.
                context = RequestContext(
                    request, {
                        'id': str(instanceResult['_id']),
                        'xml': str(newdom),
                        'title': instanceResult['identifier'],
                        'custom_xslt': custom_xslt,
                        'template_name': metadataFormat.template.title,
                        'registry_name': registry_name,
                        'registry_url': "{0}://{1}".format(
                            url.scheme, url.netloc),
                        'oai_pmh': True
                    })
                resultString += template.render(context)
        else:
            # Suggestions only: look at the first 20 results
            for instanceResult in instanceResults[:20]:
                # Each keyword word matches itself or itself followed by more word characters
                wordList = re.sub("[^\w]", " ", keyword).split()
                wordList = [x + "|" + x + "\w+" for x in wordList]
                wordList = '|'.join(wordList)
                listWholeKeywords = re.findall(
                    "\\b(" + wordList + ")\\b",
                    XMLdata.unparse(
                        instanceResult['metadata']).encode('utf-8'),
                    flags=re.IGNORECASE)
                labels = list(set(listWholeKeywords))

                for label in labels:
                    label = label.lower()
                    result_json = {}
                    result_json['label'] = label
                    result_json['value'] = label
                    # Keep each suggestion only once
                    if not result_json in resultsByKeyword:
                        resultsByKeyword.append(result_json)
def get_results_by_instance_keyword(request):
    """Run a keyword search over OAI-PMH records for the explore view.

    Reads ``keyword``, ``schemas[]`` and the optional ``onlySuggestions``
    flag from ``request.GET``. Each ``schemas[]`` entry is a JSON document
    whose ``'oai-pmh'`` list is merged and passed to
    ``OaiRecord.executeFullTextQuery``. With ``onlySuggestions`` false, the
    hits are transformed with XSLT and rendered together into
    ``resultString``; otherwise keyword suggestions from the first 20 hits
    are collected in ``resultsByKeyword``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source and the visible part ends without a
    ``return`` - confirm both against the original file.
    """
    print 'BEGIN def getResultsKeyword(request)'
    resultsByKeyword = []
    results = []
    resultString = ""

    # Describe the local instance and store it in the session
    json_instances = []
    if 'HTTPS' in request.META['SERVER_PROTOCOL']:
        protocol = "https"
    else:
        protocol = "http"
    instance = Instance(name="Local",
                        protocol=protocol,
                        address=request.META['REMOTE_ADDR'],
                        port=request.META['SERVER_PORT'],
                        access_token="token",
                        refresh_token="token")
    json_instances.append(instance.to_json())
    request.session['instancesExplore'] = json_instances
    # Not read again in the visible part of the function
    sessionName = "resultsExploreOaiPMh" + instance['name']

    try:
        keyword = request.GET['keyword']
        schemas = request.GET.getlist('schemas[]')
        # Merge the metadata format IDs of every selected schema entry
        mergedSchemas = []
        for schema in schemas:
            t = json.loads(schema)
            mergedSchemas += t['oai-pmh']
        if 'onlySuggestions' in request.GET:
            onlySuggestions = json.loads(request.GET['onlySuggestions'])
        else:
            onlySuggestions = False
    except:
        # Any failure above falls back to an empty, suggestions-only search.
        # NOTE(review): the bare except also hides unrelated errors.
        keyword = ''
        schemas = []
        onlySuggestions = True
        mergedSchemas = []

    instanceResults = OaiRecord.executeFullTextQuery(keyword, mergedSchemas)
    if len(instanceResults) > 0:
        if not onlySuggestions:
            # Default XSLT, only needed when rendering
            xsltPath = os.path.join(settings.SITE_ROOT,
                                    'static/resources/xsl/xml2html.xsl')
            xslt = etree.parse(xsltPath)
            transform = etree.XSLT(xslt)

        # Retrieve schemas and registries once, instead of once per result
        registriesName = {}
        # Despite its name, schemasName maps an ID to the metadata format object
        schemasName = {}
        listRegistriesID = set([x['registry'] for x in instanceResults])
        for registryId in listRegistriesID:
            obj = OaiRegistry.objects(pk=registryId).get()
            registriesName[str(registryId)] = obj.name
        listSchemaId = set([x['metadataformat'] for x in instanceResults])
        for schemaId in listSchemaId:
            obj = OaiMetadataFormat.objects(pk=schemaId).get()
            schemasName[str(schemaId)] = obj

        listItems = []
        # Local aliases for names used inside the result loop
        xmltodictunparse = unparse
        appendResult = results.append
        toXML = etree.XML
        parse = etree.parse
        XSLT = etree.XSLT
        if not onlySuggestions:
            for instanceResult in instanceResults:
                custom_xslt = False
                appendResult({
                    'title': instanceResult['identifier'],
                    'content': xmltodictunparse(instanceResult['metadata']),
                    'id': str(instanceResult['_id'])
                })
                dom = toXML(
                    str(
                        xmltodictunparse(
                            instanceResult['metadata']).encode('utf-8')))
                # Check if a custom list result XSLT has to be used
                try:
                    schema = schemasName[str(instanceResult['metadataformat'])]
                    if schema.ResultXsltList:
                        listXslt = parse(
                            BytesIO(
                                schema.ResultXsltList.content.encode('utf-8')))
                        listTransform = XSLT(listXslt)
                        newdom = listTransform(dom)
                        custom_xslt = True
                    else:
                        newdom = transform(dom)
                except Exception, e:
                    # We use the default one
                    newdom = transform(dom)
                    custom_xslt = False

                # NOTE(review): if the lookup at the top of the try block
                # raised, schema still holds an earlier value (possibly a
                # string from the schemas[] loop above).
                item = {
                    'id': str(instanceResult['_id']),
                    'xml': str(newdom),
                    'title': instanceResult['identifier'],
                    'custom_xslt': custom_xslt,
                    'schema_name': schema.metadataPrefix,
                    'registry_name': registriesName[instanceResult['registry']]
                }
                listItems.append(item)
            # NOTE(review): placed after the loop; the collapsed source does
            # not show whether this statement belongs inside it.
            context = RequestContext(request, {'list_results': listItems})
        else:
            # Suggestions only: look at the first 20 results
            for instanceResult in instanceResults[:20]:
                # Each keyword word matches itself or itself followed by more word characters
                wordList = re.sub("[^\w]", " ", keyword).split()
                wordList = [x + "|" + x + "\w+" for x in wordList]
                wordList = '|'.join(wordList)
                listWholeKeywords = re.findall(
                    "\\b(" + wordList + ")\\b",
                    unparse(instanceResult['metadata']).encode('utf-8'),
                    flags=re.IGNORECASE)
                labels = list(set(listWholeKeywords))

                for label in labels:
                    label = label.lower()
                    result_json = {}
                    result_json['label'] = label
                    result_json['value'] = label
                    # Keep each suggestion only once
                    if not result_json in resultsByKeyword:
                        resultsByKeyword.append(result_json)

        # NOTE(review): context is only bound on the rendering path with at
        # least one result, so this block is kept inside the
        # "len(instanceResults) > 0" branch - confirm the original nesting.
        if not onlySuggestions:
            template = loader.get_template(
                'oai_pmh/explore/explore_result_keyword.html')
            resultString += template.render(context)