def reindex(self, obj):
    """Index a single object into Elasticsearch.

    Objects whose fedoralink class is not a subclass of
    ``IndexableFedoraObject`` are skipped silently. Fields whose value is
    ``None`` are left out of the indexed document. The document id is the
    base64 encoding of ``str(obj.pk)``.

    Indexing errors are not propagated: they are printed, passed to
    ``mail_admins`` and the traceback is printed.

    :param obj: the object to index; it is read both through attributes
                (one per model field) and through item access with RDF
                predicates.
    """
    # get the fedoralink's original class from the obj.
    clz = fedoralink_classes(obj)[0]
    if not issubclass(clz, IndexableFedoraObject):
        # can not reindex something which does not have a mapping
        return

    doc_type = self._get_elastic_class(clz)

    indexer_data = {}
    for field in clz._meta.fields:
        data = getattr(obj, field.name)
        if data is None:
            continue
        indexer_data[url2id(field.rdf_name)] = convert(data, field)

    encoded_fedora_id = base64.b64encode(str(obj.pk).encode('utf-8')).decode('utf-8')
    indexer_data['_fedora_id'] = obj.pk

    # hasParent may come back as a sequence; only its first item is indexed
    parent = obj[FEDORA.hasParent]
    if parent and isinstance(parent, (list, tuple)):
        parent = parent[0]
    indexer_data['_fedora_parent'] = convert(parent, FEDORA_PARENT_FIELD)

    indexer_data['_fedoralink_model'] = [
        self._get_elastic_class(x) for x in inspect.getmro(clz)
    ]
    indexer_data['_fedora_type'] = [
        convert(x, FEDORA_TYPE_FIELD) for x in obj[RDF.type]
    ]
    indexer_data['_fedora_created'] = [
        convert(x, FEDORA_CREATED_FIELD) for x in obj[FEDORA.created]
    ]
    indexer_data['_fedora_last_modified'] = [
        convert(x, FEDORA_LAST_MODIFIED_FIELD) for x in obj[FEDORA.lastModified]
    ]

    # Best-effort indexing: report the failure instead of raising, but let
    # KeyboardInterrupt / SystemExit through (hence not a bare ``except:``).
    # noinspection PyBroadException
    try:
        self.es.index(index=self.index_name, doc_type=doc_type,
                      body=indexer_data, id=encoded_fedora_id)
    except Exception:
        print("Exception in indexing, data", indexer_data)
        mail_admins('Exception reindexing object %s' % obj.id, traceback.format_exc())
        print("Exception reindexing object %s" % obj.id, traceback.format_exc())
        traceback.print_exc()
def reindex(self, obj):
    """Index a single object into Elasticsearch.

    Objects whose fedoralink class is not a subclass of
    ``IndexableFedoraObject`` are skipped silently. Fields whose value is
    ``None`` are left out of the indexed document. The document id is the
    base64 encoding of ``str(obj.pk)``.

    Indexing errors are not propagated: the document and the traceback are
    printed instead. The success message is printed only when the index
    call did not raise.

    :param obj: the object to index; it is read both through attributes
                (one per model field) and through item access with RDF
                predicates.
    """
    # get the fedoralink's original class from the obj.
    clz = fedoralink_classes(obj)[0]
    if not issubclass(clz, IndexableFedoraObject):
        # can not reindex something which does not have a mapping
        return

    doc_type = self._get_elastic_class(clz)

    indexer_data = {}
    for field in clz._meta.fields:
        data = getattr(obj, field.name)
        if data is None:
            continue
        indexer_data[url2id(field.rdf_name)] = convert(data, field)

    encoded_fedora_id = base64.b64encode(str(obj.pk).encode('utf-8')).decode('utf-8')
    indexer_data['_fedora_id'] = obj.pk
    indexer_data['_fedora_parent'] = convert(obj[FEDORA.hasParent], FEDORA_PARENT_FIELD)
    indexer_data['_fedoralink_model'] = [
        self._get_elastic_class(x) for x in inspect.getmro(clz)
    ]
    indexer_data['_fedora_type'] = [
        convert(x, FEDORA_TYPE_FIELD) for x in obj[RDF.type]
    ]
    indexer_data['_fedora_created'] = [
        convert(x, FEDORA_CREATED_FIELD) for x in obj[FEDORA.created]
    ]
    indexer_data['_fedora_last_modified'] = [
        convert(x, FEDORA_LAST_MODIFIED_FIELD) for x in obj[FEDORA.lastModified]
    ]

    # Best-effort indexing: report the failure instead of raising, but let
    # KeyboardInterrupt / SystemExit through (hence not a bare ``except:``).
    # noinspection PyBroadException
    try:
        self.es.index(index=self.index_name, doc_type=doc_type,
                      body=indexer_data, id=encoded_fedora_id)
    except Exception:
        print("Exception in indexing, data", indexer_data)
        traceback.print_exc()
    else:
        # only claim success when the index call really went through
        print("reindexing single object ok")
def search(self, query, model_class, start, end, facets, ordering, values):
    """Run ``query`` against Elasticsearch, restricted to ``model_class``.

    The query is normalised in place (``_de_morgan``, ``_flatten_query``),
    combined with a ``_fedoralink_model`` condition for ``model_class`` and
    sent together with the ordering, facet (aggregation) and highlight
    clauses.

    :param query:       query tree; a falsy value means "match all"
    :param model_class: model whose ``_meta.fields`` define the searchable
                        fields
    :param start:       offset of the first hit; falsy means 0
    :param end:         offset one past the last hit; ``None`` means a page
                        size of 10000
    :param facets:      passed to ``_generate_facet_clause``
    :param ordering:    passed to ``_generate_ordering_clause``
    :param values:      instances are built from hits only when this is
                        ``None``
    :return: dict with ``count`` (total hits), ``data`` (iterator over the
             built instances) and ``facets`` (list of
             ``(facet id, [(key, doc_count), ...])`` tuples)
    """
    self._de_morgan(query)
    self._flatten_query(query)

    # Lookup tables between model field names and Elasticsearch field ids.
    name_to_es = {}
    es_to_name = {}
    es_to_name_lang = {}
    for model_field in model_class._meta.fields:
        es_id = url2id(model_field.rdf_name)
        if isinstance(model_field, IndexedLanguageField):
            # language fields additionally get one nested id per language
            for language in settings.LANGUAGES:
                code = language[0]
                nested_es_id = es_id + '.' + code
                name_to_es[model_field.name + '.' + code] = nested_es_id
                es_to_name[nested_es_id] = model_field.name
                es_to_name_lang[nested_es_id] = model_field.name + '@' + code
        name_to_es[model_field.name] = es_id
        es_to_name[es_id] = model_field.name
        es_to_name_lang[es_id] = model_field.name

    # internal fields are stored under their own name
    for internal in ('_fedoralink_model', '_fedora_parent'):
        name_to_es[internal] = internal
        es_to_name[internal] = internal
        es_to_name_lang[internal] = internal

    referenced_fields = set()
    self._get_all_fields(query, referenced_fields, name_to_es)

    if query:
        user_tree = self._build_query(query, name_to_es, None)
    else:
        user_tree = {"bool": {"must": {"match_all": {}}}}
    model_condition = self._build_query(
        Q(_fedoralink_model=self._get_elastic_class(model_class)),
        name_to_es, None)
    query_tree = {"bool": {"must": [user_tree, model_condition]}}

    ordering_clause = self._generate_ordering_clause(name_to_es, ordering)
    facets_clause = self._generate_facet_clause(facets, name_to_es)

    offset = start if start else 0
    page_size = (end - offset) if end is not None else 10000

    built_query = {
        "sort": ordering_clause,
        "query": query_tree,
        "aggs": facets_clause,
        "highlight": {
            "fields": {es_field: {} for es_field in referenced_fields},
            "require_field_match": False,
        },
        "from": offset,
        "size": page_size,
    }
    print(json.dumps(built_query, ensure_ascii=False))

    profiling = FedoraProfillingMiddleware.profilling_enabled()
    if profiling:
        started_at = time.time()
    resp = self.es.search(body=built_query)
    if profiling:
        finished_at = time.time()
        FedoraProfillingMiddleware.log_time(
            json.dumps(built_query, ensure_ascii=False),
            finished_at - started_at)

    # hits are turned into instances only when no ``values`` were requested
    hits = resp['hits']['hits']
    instances = [
        self.build_instance(hit, es_to_name) for hit in hits if values is None
    ]

    facet_results = []
    for agg_name, agg in resp.get('aggregations', {}).items():
        # a nested aggregation keeps its buckets under the "value" key
        buckets = agg['buckets'] if 'buckets' in agg else agg['value']['buckets']
        if agg_name.endswith('__exists'):
            facet_id = es_to_name_lang[agg_name[:-8]] + '__exists'
        else:
            facet_id = es_to_name_lang[agg_name]
        counts = [(bucket['key'], bucket['doc_count']) for bucket in buckets]
        facet_results.append((facet_id, counts))

    return {
        'count': resp['hits']['total'],
        'data': iter(instances),
        'facets': facet_results,
    }
def handle(self, *args, **options):
    """Create the missing Elasticsearch mapping entries for the given models.

    Each positional argument is a dotted model name (``module.Class``). For
    every model, the fields not yet present in the existing mapping are
    translated to mapping properties, the result is printed as JSON and
    saved through the indexer of the ``repository`` connection.

    :raises Exception: when a field is of a type with no mapping rule
    """
    FedoraTypeManager.populate()

    for model_name in list(args):
        name_parts = model_name.split('.')
        indexer_model_name = '_'.join(name_parts)
        module_name = '.'.join(name_parts[:-1])
        class_name = name_parts[-1]

        # result unused; presumably called so that the module is imported
        # and the model gets registered - TODO confirm
        class_for_name(module_name, class_name)
        modelclz = FedoraTypeManager.get_model_class(class_name)

        # the first field wins when several fields share an rdf name
        fields = {}
        for model_field in modelclz._meta.fields:
            fields.setdefault(url2id(model_field.rdf_name), model_field)

        indexer = connections['repository'].indexer
        existing_properties = indexer.get_mapping(indexer_model_name).get('properties', {})

        fields['_fedora_id'] = FEDORA_ID_FIELD
        fields['_fedora_parent'] = FEDORA_PARENT_FIELD
        fields['_fedora_type'] = FEDORA_TYPE_FIELD
        fields['_fedoralink_model'] = FEDORALINK_TYPE_FIELD
        fields['_fedora_created'] = FEDORA_CREATED_FIELD
        fields['_fedora_last_modified'] = FEDORA_LAST_MODIFIED_FIELD

        new_properties = {}
        for fldname, field in fields.items():
            if fldname in existing_properties:
                continue

            props = new_properties[fldname] = {}

            if isinstance(field, IndexedLanguageField):
                props['type'] = 'nested'
                props["include_in_root"] = 'true'
                props['properties'] = self.gen_languages_mapping(fldname + ".")
            elif isinstance(field, IndexedTextField):
                # exact value here, plus a copy into a separate fulltext field
                fulltext_name = fldname + "__fulltext"
                props.update({
                    'type': 'string',
                    'index': 'not_analyzed',
                    'copy_to': fulltext_name,
                })
                new_properties[fulltext_name] = {
                    'type': 'string',
                }
            elif isinstance(field, (IndexedDateTimeField, IndexedDateField)):
                props.update({'type': 'date', 'index': 'not_analyzed'})
            elif isinstance(field, IndexedIntegerField):
                props.update({'type': 'long', 'index': 'not_analyzed'})
            elif isinstance(field, (IndexedGPSField, IndexedLinkedField,
                                    IndexedBinaryField)):
                props.update({'type': 'string', 'index': 'not_analyzed'})
            else:
                raise Exception("Mapping type %s not handled yet" % type(field))

        new_mapping = {
            '_all': {"store": True},
            'properties': new_properties,
        }
        print(json.dumps(new_mapping, indent=4))
        indexer.save_mapping(indexer_model_name, new_mapping)
def handle(self, *args, **options):
    """Create the missing Elasticsearch mapping entries for the given models.

    ``options['model_name']`` holds the dotted model names
    (``module.Class``). For every model, the fields not yet present in the
    existing mapping are translated to mapping properties, the result is
    printed as JSON and saved through the indexer of the ``repository``
    connection.

    :raises Exception: when a field is of a type with no mapping rule
    """
    FedoraTypeManager.populate()

    for model_name in options['model_name']:
        name_parts = model_name.split('.')
        indexer_model_name = '_'.join(name_parts)
        module_name = '.'.join(name_parts[:-1])
        class_name = name_parts[-1]

        # result unused; presumably called so that the module is imported
        # and the model gets registered - TODO confirm
        class_for_name(module_name, class_name)
        modelclz = FedoraTypeManager.get_model_class(class_name)

        # the first field wins when several fields share an rdf name
        fields = {}
        for model_field in modelclz._meta.fields:
            fields.setdefault(url2id(model_field.rdf_name), model_field)

        indexer = connections['repository'].indexer
        existing_properties = indexer.get_mapping(indexer_model_name).get('properties', {})

        fields['_fedora_id'] = FEDORA_ID_FIELD
        fields['_fedora_parent'] = FEDORA_PARENT_FIELD
        fields['_fedora_type'] = FEDORA_TYPE_FIELD
        fields['_fedoralink_model'] = FEDORALINK_TYPE_FIELD
        fields['_fedora_created'] = FEDORA_CREATED_FIELD
        fields['_fedora_last_modified'] = FEDORA_LAST_MODIFIED_FIELD
        fields['_collection_child_types'] = CESNET_RDF_TYPES
        print('ADD fields to mapping')

        new_properties = {}
        for fldname, field in fields.items():
            if fldname in existing_properties:
                continue

            props = new_properties[fldname] = {}

            if isinstance(field, IndexedLanguageField):
                props['type'] = 'nested'
                props["include_in_root"] = 'true'
                props['properties'] = self.gen_languages_mapping(fldname + ".")
            elif isinstance(field, IndexedTextField):
                # exact value here, plus a copy into a separate fulltext field
                fulltext_name = fldname + "__fulltext"
                props.update({'type': 'keyword', 'copy_to': fulltext_name})
                new_properties[fulltext_name] = {
                    'type': 'text',
                }
            elif isinstance(field, (IndexedDateTimeField, IndexedDateField)):
                props['type'] = 'date'
            elif isinstance(field, IndexedIntegerField):
                props['type'] = 'long'
            elif isinstance(field, (IndexedGPSField, IndexedLinkedField,
                                    IndexedBinaryField)):
                props['type'] = 'keyword'
            else:
                raise Exception("Mapping type %s not handled yet" % type(field))

        new_mapping = {
            '_all': {"store": True},
            'properties': new_properties,
        }
        print(json.dumps(new_mapping, indent=4))
        indexer.save_mapping(indexer_model_name, new_mapping)
def search(self, query, model_class, start, end, facets, ordering, values):
    """Run ``query`` against Elasticsearch as a ``filtered`` query.

    The query is normalised in place (``_de_morgan``, ``_flatten_query``).
    Its top-level children are split into filter conditions and fulltext
    conditions; a ``_fedoralink_model`` condition for ``model_class`` is
    always added to the filters.

    :param query:       query tree; a falsy value means "only restrict by
                        model"
    :param model_class: model whose ``_meta.fields`` define the searchable
                        fields
    :param start:       offset of the first hit; falsy means 0
    :param end:         offset one past the last hit; ``None`` means a page
                        size of 10000
    :param facets:      passed to ``_generate_facet_clause``
    :param ordering:    passed to ``_generate_ordering_clause``
    :param values:      instances are built from hits only when this is
                        ``None``
    :return: dict with ``count`` (total hits), ``data`` (iterator over the
             built instances) and ``facets`` (list of
             ``(facet id, [(key, doc_count), ...])`` tuples)
    :raises NotImplementedError: when the top-level connector of a
                                 non-empty query is not ``'AND'``
    """
    self._de_morgan(query)
    self._flatten_query(query)

    # Lookup tables between model field names and Elasticsearch field ids.
    name_to_es = {}
    es_to_name = {}
    es_to_name_lang = {}
    for model_field in model_class._meta.fields:
        es_id = url2id(model_field.rdf_name)
        if isinstance(model_field, IndexedLanguageField):
            # language fields additionally get one nested id per language
            for language in settings.LANGUAGES:
                code = language[0]
                nested_es_id = es_id + '.' + code
                name_to_es[model_field.name + '.' + code] = nested_es_id
                es_to_name[nested_es_id] = model_field.name
                es_to_name_lang[nested_es_id] = model_field.name + '@' + code
        name_to_es[model_field.name] = es_id
        es_to_name[es_id] = model_field.name
        es_to_name_lang[es_id] = model_field.name

    # internal fields are stored under their own name
    for internal in ('_fedoralink_model', '_fedora_parent'):
        name_to_es[internal] = internal
        es_to_name[internal] = internal
        es_to_name_lang[internal] = internal

    referenced_fields = set()
    self._get_all_fields(query, referenced_fields, name_to_es)

    def conjunction(children):
        # wrap ``children`` into a fresh AND node
        node = Q()
        node.connector = Q.AND
        node.children = children
        return node

    if query:
        if query.connector != 'AND':
            raise NotImplementedError("Only top-level AND connector is implemented now")

        filter_nodes = []
        fulltext_nodes = []
        for child in query.children:
            if self._is_filter(child):
                filter_nodes.append(child)
            else:
                fulltext_nodes.append(child)
        filter_nodes.append(Q(_fedoralink_model=self._get_elastic_class(model_class)))

        filters = self._build_filter(conjunction(filter_nodes), name_to_es, None)
        fulltext_matches = self._build_fulltext(conjunction(fulltext_nodes), name_to_es, None)
    else:
        model_only = Q(_fedoralink_model=self._get_elastic_class(model_class))
        filters = self._build_filter(model_only, name_to_es, None)
        fulltext_matches = {}

    ordering_clause = self._generate_ordering_clause(name_to_es, ordering)
    facets_clause = self._generate_facet_clause(facets, name_to_es)

    # body of the "filtered" query: optional filter part, optional query part
    filtered_body = {}
    if filters:
        filtered_body['filter'] = {'bool': filters.get('bool', [])}
    if fulltext_matches:
        filtered_body['query'] = {'bool': fulltext_matches.get('bool', [])}

    offset = start if start else 0
    page_size = (end - offset) if end is not None else 10000

    built_query = {
        "sort": ordering_clause,
        "query": {
            "filtered": filtered_body,
        },
        "aggs": facets_clause,
        "highlight": {
            "fields": {es_field: {} for es_field in referenced_fields},
            "require_field_match": False,
        },
        "from": offset,
        "size": page_size,
    }
    print(json.dumps(built_query, indent=4))

    resp = self.es.search(body=built_query)

    # hits are turned into instances only when no ``values`` were requested
    hits = resp['hits']['hits']
    instances = [
        self.build_instance(hit, es_to_name) for hit in hits if values is None
    ]

    facet_results = []
    for agg_name, agg in resp.get('aggregations', {}).items():
        # a nested aggregation keeps its buckets under the "value" key
        buckets = agg['buckets'] if 'buckets' in agg else agg['value']['buckets']
        counts = [(bucket['key'], bucket['doc_count']) for bucket in buckets]
        facet_results.append((es_to_name_lang[agg_name], counts))

    return {
        'count': resp['hits']['total'],
        'data': iter(instances),
        'facets': facet_results,
    }