def post(self, request, resourceid=None):
    try:
        indent = int(request.POST.get('indent', None))
    except Exception:
        indent = None

    try:
        if user_can_edit_resources(user=request.user):
            data = JSONDeserializer().deserialize(request.body)
            reader = JsonLdReader()
            reader.read_resource(data)
            if reader.errors:
                response = []
                for value in reader.errors.values():
                    response.append(value.message)
                return JSONResponse(data, indent=indent, status=400, reason=response)
            else:
                response = []
                for resource in reader.resources:
                    with transaction.atomic():
                        resource.save(request=request)
                    response.append(JSONDeserializer().deserialize(
                        self.get(request, resource.resourceinstanceid).content))
                return JSONResponse(response, indent=indent)
        else:
            return JSONResponse(status=403)
    except Exception as e:
        return JSONResponse(status=500, reason=e)
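# For context, a round trip against this view can be sketched with Django's
# test client. The route, credentials, and payload below are placeholders for
# illustration only (the URLconf is not part of this excerpt), and the session
# must belong to a user for whom user_can_edit_resources() returns True.
from django.test import Client
import json

client = Client()
client.login(username="admin", password="admin")  # assumed credentials
resp = client.post(
    "/resources/",  # hypothetical path for this view
    data=json.dumps({"@type": "http://www.cidoc-crm.org/cidoc-crm/E21_Person"}),
    content_type="application/json",
)
print(resp.status_code, resp.content)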
def test_find_leaf_branch(self):
    """
    Given a list of leaf nodes, find the appropriate node from the given jsonld
    """

    jsonld_graph = {
        "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by": {
            "@id": "http://*****:*****",
            "@type": "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation",
            "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "Will",
        }
    }
    graphtree = self.unique_graph.get_tree()
    for child in graphtree["children"]:
        if child["node"].name == "Name":
            node = child
    reader = JsonLdReader()
    branch = reader.findBranch(
        node["children"],
        "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by",
        jsonld_graph["http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by"],
    )
    self.assertEqual(str(branch["node"].pk), "839b0e4c-95e6-11e8-aada-14109fd34195")
def post(self, request, resourceid=None, slug=None, graphid=None):
    try:
        indent = int(request.POST.get("indent", None))
    except Exception:
        indent = None

    try:
        if user_can_edit_resources(user=request.user):
            data = JSONDeserializer().deserialize(request.body)
            reader = JsonLdReader()
            if slug is not None:
                graphid = models.GraphModel.objects.get(slug=slug).pk
            reader.read_resource(data, graphid=graphid)
            if reader.errors:
                response = []
                for value in reader.errors.values():
                    response.append(value.message)
                return JSONResponse({"error": response}, indent=indent, status=400)
            else:
                response = []
                for resource in reader.resources:
                    with transaction.atomic():
                        resource.save(request=request)
                    response.append(JSONDeserializer().deserialize(
                        self.get(request, resource.resourceinstanceid).content))
                return JSONResponse(response, indent=indent, status=201)
        else:
            return JSONResponse(status=403)
    except Exception as e:
        if settings.DEBUG is True:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
            if len(formatted):
                for message in formatted:
                    print(message)
        return JSONResponse({"error": "resource data could not be saved: %s" % e}, status=500, reason=e)
def put(self, request, resourceid):
    if user_can_edit_resources(user=request.user):
        data = JSONDeserializer().deserialize(request.body)
        reader = JsonLdReader()
        reader.read_resource(data)
    else:
        return JSONResponse(status=403)

    return JSONResponse(self.get(request, resourceid))
def put(self, request, resourceid, slug=None, graphid=None):
    try:
        indent = int(request.PUT.get("indent", None))
    except Exception:
        indent = None

    if not user_can_edit_resources(user=request.user, resourceid=resourceid):
        return JSONResponse(status=403)
    else:
        with transaction.atomic():
            try:
                # DELETE
                resource_instance = Resource.objects.get(pk=resourceid)
                resource_instance.delete()
            except models.ResourceInstance.DoesNotExist:
                pass

            try:
                # POST
                data = JSONDeserializer().deserialize(request.body)
                reader = JsonLdReader()
                if slug is not None:
                    graphid = models.GraphModel.objects.get(slug=slug).pk
                reader.read_resource(data, resourceid=resourceid, graphid=graphid)
                if reader.errors:
                    response = []
                    for value in reader.errors.values():
                        response.append(value.message)
                    return JSONResponse({"error": response}, indent=indent, status=400)
                else:
                    response = []
                    for resource in reader.resources:
                        with transaction.atomic():
                            resource.save(request=request)
                        response.append(JSONDeserializer().deserialize(
                            self.get(request, resource.resourceinstanceid).content))
                    return JSONResponse(response, indent=indent, status=201)
            except models.ResourceInstance.DoesNotExist:
                return JSONResponse(status=404)
            except Exception as e:
                return JSONResponse({"error": "resource data could not be saved"}, status=500, reason=e)
def put(self, request, resourceid):
    try:
        indent = int(request.POST.get('indent', None))
    except Exception:
        indent = None

    try:
        if user_can_edit_resources(user=request.user):
            data = JSONDeserializer().deserialize(request.body)
            reader = JsonLdReader()
            reader.read_resource(data, use_ids=True)
            if reader.errors:
                response = []
                for value in reader.errors.values():
                    response.append(value.message)
                return JSONResponse(data, indent=indent, status=400, reason=response)
            else:
                response = []
                for resource in reader.resources:
                    if resourceid != str(resource.pk):
                        raise Exception(
                            'Resource id in the URI does not match the resource @id supplied in the document')
                    old_resource = Resource.objects.get(pk=resource.pk)
                    old_resource.load_tiles()
                    old_tile_ids = set([str(tile.pk) for tile in old_resource.tiles])
                    new_tile_ids = set([str(tile.pk) for tile in resource.get_flattened_tiles()])
                    tileids_to_delete = old_tile_ids.difference(new_tile_ids)
                    tiles_to_delete = models.TileModel.objects.filter(pk__in=tileids_to_delete)
                    with transaction.atomic():
                        tiles_to_delete.delete()
                        resource.save(request=request)
                    response.append(JSONDeserializer().deserialize(
                        self.get(request, resource.resourceinstanceid).content))
                return JSONResponse(response, indent=indent)
        else:
            return JSONResponse(status=403)
    except Exception as e:
        return JSONResponse(status=500, reason=e)
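# The notable design in the version above is the tile diff: tiles that exist on
# the stored resource but are absent from the incoming JSON-LD are deleted, and
# everything else is (re)saved. The set arithmetic in isolation, using made-up
# tile ids rather than real TileModel instances:
old_tile_ids = {"t1", "t2", "t3"}  # tiles currently stored for the resource
new_tile_ids = {"t2", "t3", "t4"}  # tiles present in the incoming document

tileids_to_delete = old_tile_ids.difference(new_tile_ids)
assert tileids_to_delete == {"t1"}  # only the tile that disappeared is removed
# "t4" is new and gets created by resource.save(); "t2"/"t3" survive the edit.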
def test_find_branch_from_jsonld_2(self):
    """
    The same test as above, except that we now add an additional node to the
    supplied json-ld, which will now match a branch in the graph
    The graph is partially unique (the children of the root are not unique)
    """

    ambiguous_jsonld_graph = {
        "@id": "http://*****:*****",
        "@type": [
            "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation",
            "http://*****:*****",
        ],
        "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by": [{
            "@id": "http://*****:*****",
            "@type": "http://www.cidoc-crm.org/cidoc-crm/E41_Appellation",
            "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by": {
                "@id": "http://*****:*****",
                "@type": "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation",
                "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "Will",
            },
            "http://www.cidoc-crm.org/cidoc-crm/P1i_identifies": {
                "@id": "http://*****:*****",
                "@type": "http://www.cidoc-crm.org/cidoc-crm/E41_Appellation",
                "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "Smith",
            },
        }],
    }
    graphtree = self.ambiguous_graph.get_tree()
    reader = JsonLdReader()
    branch = reader.findBranch(
        graphtree["children"],
        "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by",
        ambiguous_jsonld_graph["http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by"],
    )
    self.assertEqual(str(branch["node"].pk), "3f40c4c0-9693-11e8-8a0f-14109fd34195")
def test_find_unique_branch_from_jsonld(self):
    """
    Test that we can find the correct branch in the graph that matches the supplied json-ld
    The graph is partially unique (the children of the root are not unique)
    """

    jsonld_graph = {
        "@id": "http://*****:*****",
        "@type": [
            "http://*****:*****",
        ],
        "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by": {
            "@id": "http://*****:*****",
            "@type": "http://www.cidoc-crm.org/cidoc-crm/E41_Appellation",
            "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by": {
                "@id": "http://*****:*****",
                "@type": "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation",
                "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "Will",
            },
            "http://www.cidoc-crm.org/cidoc-crm/P1i_identifies": {
                "@id": "http://*****:*****",
                "@type": "http://www.cidoc-crm.org/cidoc-crm/E41_Appellation",
                "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "Smith",
            },
        },
    }
    graphtree = self.unique_graph.get_tree()
    reader = JsonLdReader()
    branch = reader.findBranch(
        graphtree["children"],
        "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by",
        jsonld_graph["http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by"],
    )
    self.assertEqual(str(branch["node"].pk), "3e1e65dc-95e6-11e8-9de9-14109fd34195")
def put(self, request, resourceid):
    try:
        indent = int(request.PUT.get('indent', None))
    except Exception:
        indent = None

    if user_can_edit_resources(user=request.user):
        with transaction.atomic():
            try:
                # DELETE
                resource_instance = Resource.objects.get(pk=resourceid)
                resource_instance.delete()
            except models.ResourceInstance.DoesNotExist:
                pass

            try:
                # POST
                data = JSONDeserializer().deserialize(request.body)
                reader = JsonLdReader()
                reader.read_resource(data, resourceid=resourceid)
                if reader.errors:
                    response = []
                    for value in reader.errors.values():
                        response.append(value.message)
                    return JSONResponse(data, indent=indent, status=400, reason=response)
                else:
                    response = []
                    for resource in reader.resources:
                        with transaction.atomic():
                            resource.save(request=request)
                        response.append(JSONDeserializer().deserialize(
                            self.get(request, resource.resourceinstanceid).content))
                    return JSONResponse(response, indent=indent, status=201)
            except models.ResourceInstance.DoesNotExist:
                return JSONResponse(status=404)
    else:
        return JSONResponse(status=403)
def test_cant_find_branch_from_ambiguous_jsonld(self):
    """
    Test that we raise the appropriate error when we can't find the correct branch in the graph
    given that the supplied json-ld could match more than one branch
    The graph is partially unique (the children of the root are not unique)
    """

    ambiguous_jsonld_graph = {
        "@id": "http://*****:*****",
        "@type": [
            "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation",
            "http://*****:*****",
        ],
        "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by": [{
            "@id": "http://*****:*****",
            "@type": "http://www.cidoc-crm.org/cidoc-crm/E41_Appellation",
            "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by": {
                "@id": "http://*****:*****",
                "@type": "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation",
                "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "Will - Ambiguous",
            },
        }],
    }
    graphtree = self.ambiguous_graph.get_tree()
    reader = JsonLdReader()
    with self.assertRaises(reader.AmbiguousGraphException) as cm:
        branch = reader.findBranch(
            graphtree["children"],
            "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by",
            ambiguous_jsonld_graph["http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by"],
        )
def test_find_other_unique_branch_from_jsonld(self):
    """
    Test that we can find the correct branch in the graph that matches the supplied json-ld
    The graph is partially unique (the children of the root are not unique)
    """

    jsonld_graph = {
        "@id": "http://*****:*****",
        "@type": [
            "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation",
            "http://*****:*****",
        ],
        "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by": [{
            "@id": "http://*****:*****",
            "@type": "http://www.cidoc-crm.org/cidoc-crm/E41_Appellation",
            "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by": {
                "@id": "http://*****:*****",
                "@type": "http://www.ics.forth.gr/isl/CRMdig/D21_Person_Name",
                "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "The Shadow",
            },
        }],
    }
    graphtree = self.unique_graph.get_tree()
    reader = JsonLdReader()
    branch = reader.findBranch(
        graphtree["children"],
        "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by",
        jsonld_graph["http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by"],
    )
    self.assertEqual(str(branch["node"].pk), "91679e1e-95e6-11e8-a166-14109fd34195")
def test_cant_find_branch_from_jsonld(self):
    """
    Test that we raise the appropriate error when we can't find the correct branch in the graph
    that matches the supplied json-ld
    The graph is partially unique (the children of the root are not unique)
    """

    incorrect_jsonld_graph = {
        "@id": "http://*****:*****",
        "@type": [
            "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation",
            "http://*****:*****",
        ],
        "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by": [{
            "@id": "http://*****:*****",
            "@type": "http://www.cidoc-crm.org/cidoc-crm/E41_Appellation",
            "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by": {
                "@id": "http://*****:*****",
                "@type": "---THIS TYPE IS INCORRECT AND SHOULDN'T MATCH---",
                "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "The Shadow",
            },
        }],
    }
    graphtree = self.unique_graph.get_tree()
    reader = JsonLdReader()
    with self.assertRaises(reader.DataDoesNotMatchGraphException) as cm:
        branch = reader.findBranch(
            graphtree["children"],
            "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by",
            incorrect_jsonld_graph["http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by"],
        )
def test_cant_find_branch_from_complex_ambiguous_jsonld(self):
    """
    The same test as above, except that we now supply a jsonld structure that matches
    more than one branch in the graph (it's ambiguous)
    """

    complex_jsonld_graph = {
        "@id": "http://*****:*****",
        "@type": [
            "http://www.cidoc-crm.org/cidoc-crm/E12_Production",
            "http://*****:*****",
        ],
        "http://www.cidoc-crm.org/cidoc-crm/P41i_was_classified_by": [{
            "@id": "http://*****:*****",
            "@type": "http://www.cidoc-crm.org/cidoc-crm/E17_Type_Assignment",
            "http://www.cidoc-crm.org/cidoc-crm/P42_assigned": [
                {
                    "@id": "http://*****:*****",
                    "@type": "http://www.cidoc-crm.org/cidoc-crm/E55_Type",
                    "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "174e9486-0663-4c9d-ab78-c7e441720c26",
                },
                {
                    "@id": "http://*****:*****",
                    "@type": "http://www.cidoc-crm.org/cidoc-crm/E55_Type",
                    "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "None",
                },
            ],
            "http://www.cidoc-crm.org/cidoc-crm/P4_has_time-span": {
                "@id": "http://*****:*****",
                "@type": "http://www.cidoc-crm.org/cidoc-crm/E52_Time-Span",
                "http://www.cidoc-crm.org/cidoc-crm/P78_is_identified_by": [
                    {
                        "@id": "http://*****:*****",
                        "@type": "http://www.cidoc-crm.org/cidoc-crm/E49_Time_Appellation",
                        "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "2018-08-06",
                    },
                    {
                        "@id": "http://*****:*****",
                        "@type": "http://www.cidoc-crm.org/cidoc-crm/E49_Time_Appellation",
                        "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "2018-09-20",
                    },
                ],
            },
        }],
    }
    graphtree = self.phase_type_assignment_graph.get_tree()
    reader = JsonLdReader()
    with self.assertRaises(reader.AmbiguousGraphException) as cm:
        branch = reader.findBranch(
            graphtree["children"],
            "http://www.cidoc-crm.org/cidoc-crm/P41i_was_classified_by",
            complex_jsonld_graph["http://www.cidoc-crm.org/cidoc-crm/P41i_was_classified_by"],
        )
def test_find_branch_from_complex_jsonld(self):
    """
    Given a more complicated json structure, find the branch in the graph
    """

    complex_jsonld_graph = {
        "@id": "http://*****:*****",
        "@type": [
            "http://www.cidoc-crm.org/cidoc-crm/E12_Production",
            "http://*****:*****",
        ],
        "http://www.cidoc-crm.org/cidoc-crm/P41i_was_classified_by": [{
            "@id": "http://*****:*****",
            "@type": "http://www.cidoc-crm.org/cidoc-crm/E17_Type_Assignment",
            "http://www.cidoc-crm.org/cidoc-crm/P42_assigned": {
                "@id": "http://*****:*****",
                "@type": "http://www.cidoc-crm.org/cidoc-crm/E55_Type",
                "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "None",
            },
            "http://www.cidoc-crm.org/cidoc-crm/P2_has_type": {
                "@id": "http://*****:*****",
                "@type": "http://www.cidoc-crm.org/cidoc-crm/E55_Type",
                "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "51cbfba6-34ee-4fbd-8b6e-10ef73fd4083",
            },
            "http://www.cidoc-crm.org/cidoc-crm/P4_has_time-span": {
                "@id": "http://*****:*****",
                "@type": "http://www.cidoc-crm.org/cidoc-crm/E52_Time-Span",
                "http://www.cidoc-crm.org/cidoc-crm/P78_is_identified_by": [
                    {
                        "@id": "http://*****:*****",
                        "@type": "http://www.cidoc-crm.org/cidoc-crm/E49_Time_Appellation",
                        "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "2018-08-05",
                    },
                    {
                        "@id": "http://*****:*****",
                        "@type": "http://www.cidoc-crm.org/cidoc-crm/E49_Time_Appellation",
                        "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "2018-08-06",
                    },
                ],
            },
        }],
    }
    graphtree = self.phase_type_assignment_graph.get_tree()
    reader = JsonLdReader()
    branch = reader.findBranch(
        graphtree["children"],
        "http://www.cidoc-crm.org/cidoc-crm/P41i_was_classified_by",
        complex_jsonld_graph["http://www.cidoc-crm.org/cidoc-crm/P41i_was_classified_by"],
    )
    self.assertEqual(str(branch["node"].pk), "049fc0c9-fa36-11e6-9e3e-026d961c88e6")
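# All of the findBranch tests above exercise the same contract: the supplied
# JSON-LD fragment must line up with exactly one branch of the graph tree,
# with zero matches raising DataDoesNotMatchGraphException and multiple
# matches raising AmbiguousGraphException. The toy sketch below illustrates
# that matching discipline only; it is not Arches' implementation, and every
# name and data shape in it is invented for the illustration.

class DataDoesNotMatchGraphException(Exception):
    pass


class AmbiguousGraphException(Exception):
    pass


def find_branch(branches, jsonld_node):
    """Return the single branch whose RDF class appears in the node's @type."""
    types = jsonld_node["@type"]
    types = set(types) if isinstance(types, list) else {types}
    matches = [b for b in branches if b["class_uri"] in types]
    if not matches:
        raise DataDoesNotMatchGraphException(jsonld_node)
    if len(matches) > 1:
        raise AmbiguousGraphException(jsonld_node)
    return matches[0]


# Toy graph tree: two sibling branches with distinct RDF classes.
branches = [
    {"class_uri": "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation", "name": "Name"},
    {"class_uri": "http://www.ics.forth.gr/isl/CRMdig/D21_Person_Name", "name": "Alias"},
]
node = {"@type": "http://www.ics.forth.gr/isl/CRMdig/D21_Person_Name"}
assert find_branch(branches, node)["name"] == "Alias"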
class Command(BaseCommand):
    """
    Command for importing JSON-LD data into Arches
    """

    def add_arguments(self, parser):
        parser.add_argument(
            "-s", "--source", default="data/", action="store", dest="source",
            help="the directory in which the data files are to be found",
        )
        parser.add_argument(
            "-ow", "--overwrite", default="ignore", action="store", dest="force",
            help="if overwrite, overwrite records that exist; if ignore, then skip; if error, then halt",
        )
        parser.add_argument(
            "--toobig", default=0, type=int, action="store", dest="toobig",
            help="Do not attempt to load records > n kb",
        )
        parser.add_argument(
            "-m", "--model", default="", action="store", dest="model",
            help="the name of the model path to load (eg auction_of_lot)",
        )
        parser.add_argument(
            "-b", "--block", default="", action="store", dest="block",
            help="the name of the block in the model path to load (eg 00), or slice in the form this,total (eg 1,5)",
        )
        parser.add_argument(
            "--max", default=-1, type=int, action="store", dest="max",
            help="Maximum number of records to load per model",
        )
        parser.add_argument(
            "--fast", default=0, action="store", type=int, dest="fast",
            help="Use bulk_save to store n records at a time",
        )
        parser.add_argument(
            "-q", "--quiet", default=False, action="store_true", dest="quiet",
            help="Don't announce every record",
        )
        parser.add_argument(
            "--skip", default=-1, type=int, action="store", dest="skip",
            help="Number of records to skip before starting to load",
        )
        parser.add_argument(
            "--suffix", default="json", action="store", dest="suffix",
            help="file suffix to load if not .json",
        )
        parser.add_argument(
            "--ignore-errors", default=False, action="store_true", dest="ignore_errors",
            help="Log but do not terminate on errors",
        )
        parser.add_argument(
            "--strip-issearchable", default=False, action="store_true", dest="strip_search",
            help="If a node is set to not be exposed to advanced search, then don't even index it",
        )

    def handle(self, *args, **options):
        print("Starting JSON-LD load")
        if options["model"]:
            print(f"Only loading {options['model']}")
        if options["block"]:
            print(f"Only loading {options['block']}")
        if options["force"] == "overwrite":
            print("Overwriting existing records")
        if options["toobig"]:
            print(f"Not loading records > {options['toobig']}kb")
        if options["quiet"]:
            print("Only announcing timing data")
        if options["strip_search"] and not options["fast"]:
            print("ERROR: stripping fields not exposed to advanced search only works in fast mode")
            return
        self.resources = []
        self.load_resources(options)

    def load_resources(self, options):
        self.reader = JsonLdReader()
        self.jss = JSONSerializer()
        source = options["source"]
        if options["model"]:
            models = [options["model"]]
        else:
            models = os.listdir(source)
            models.sort()
            models = [m for m in models if m[0] not in ["_", "."]]
        print(f"Found possible models: {models}")

        # This is boilerplate for any use of get_documents_to_index()
        # Need to add issearchable for strip_search option
        # Only calculate it once per load
        self.datatype_factory = DataTypeFactory()
        dt_instance_hash = {}
        self.node_info = {
            str(nodeid): {
                "datatype": dt_instance_hash.setdefault(datatype, self.datatype_factory.get_instance(datatype)),
                "issearchable": srch,
            }
            for nodeid, datatype, srch in archesmodels.Node.objects.values_list("nodeid", "datatype", "issearchable")
        }
        self.node_datatypes = {
            str(nodeid): datatype for nodeid, datatype in archesmodels.Node.objects.values_list("nodeid", "datatype")
        }

        start = time.time()
        seen = 0
        loaded = 0

        for m in models:
            print(f"Loading {m}")
            graphid = graph_uuid_map.get(m, None)
            if not graphid:
                # Check slug
                try:
                    graphid = archesmodels.GraphModel.objects.get(slug=m).pk
                except archesmodels.GraphModel.DoesNotExist:
                    print(f"Couldn't find a model definition for {m}; skipping")
                    continue
            # We have a good model, so build the pre-processed tree once
            self.reader.graphtree = self.reader.process_graph(graphid)

            block = options["block"]
            if block and "," not in block:
                blocks = [block]
            else:
                blocks = os.listdir(f"{source}/{m}")
                blocks.sort()
                blocks = [b for b in blocks if b[0] not in ["_", "."]]
                if "," in block:
                    # {slice},{max-slices}
                    (cslice, mslice) = block.split(",")
                    cslice = int(cslice) - 1
                    mslice = int(mslice)
                    blocks = blocks[cslice::mslice]

            loaded_model = 0
            try:
                for b in blocks:
                    files = os.listdir(f"{source}/{m}/{b}")
                    files.sort()
                    for f in files:
                        if not f.endswith(options["suffix"]):
                            continue
                        elif f.startswith(".") or f.startswith("_"):
                            continue
                        if options["max"] > 0 and loaded_model >= options["max"]:
                            raise StopIteration()
                        seen += 1
                        if seen <= options["skip"]:
                            # Do it this way to keep the counts correct
                            continue
                        fn = f"{source}/{m}/{b}/{f}"
                        # Check file size of record
                        if not options["quiet"]:
                            print(f"About to import {fn}")
                        if options["toobig"]:
                            sz = os.path.getsize(fn)
                            if sz > options["toobig"]:
                                if not options["quiet"]:
                                    print(f" ... Skipping due to size: {sz} > {options['toobig']}")
                                continue
                        uu = f.replace(f".{options['suffix']}", "")
                        with open(fn) as fh:
                            data = fh.read()
                        # FIXME Timezone / DateTime Workaround
                        # FIXME The following line should be removed when #5669 / #6346 are closed
                        data = data.replace("T00:00:00Z", "")
                        jsdata = json.loads(data)
                        jsdata = fix_js_data(data, jsdata, m)
                        if len(uu) != 36 or uu[8] != "-":
                            # extract uuid from data if filename is not a UUID
                            uu = jsdata["id"][-36:]

                        if jsdata:
                            try:
                                if options["fast"]:
                                    l = self.fast_import_resource(
                                        uu,
                                        graphid,
                                        jsdata,
                                        n=options["fast"],
                                        reload=options["force"],
                                        quiet=options["quiet"],
                                        strip_search=options["strip_search"],
                                    )
                                else:
                                    l = self.import_resource(uu, graphid, jsdata, reload=options["force"], quiet=options["quiet"])
                                loaded += l
                                loaded_model += l
                            except Exception as e:
                                print(f"*** Failed to load {fn}:\n    {e}\n")
                                if not options["ignore_errors"]:
                                    raise
                        else:
                            print(" ... skipped due to bad data :(")
                        if not seen % 100:
                            print(f" ... seen {seen} / loaded {loaded} in {time.time()-start}")
            except StopIteration:
                break

        if options["fast"] and self.resources:
            self.save_resources()
            self.index_resources(options["strip_search"])
            self.resources = []
        print(f"Total Time: seen {seen} / loaded {loaded} in {time.time()-start} seconds")

    def fast_import_resource(self, resourceid, graphid, data, n=1000, reload="ignore", quiet=True, strip_search=False):
        try:
            resource_instance = Resource.objects.get(pk=resourceid)
            if reload == "ignore":
                if not quiet:
                    print(" ... already loaded")
                return 0
            elif reload == "error":
                print(f"*** Record exists for {resourceid}, and -ow is error")
                raise FileExistsError(resourceid)
            else:
                resource_instance.delete()
        except archesmodels.ResourceInstance.DoesNotExist:
            # thrown when resource doesn't exist
            pass
        try:
            self.reader.read_resource(data, resourceid=resourceid, graphid=graphid)
            self.resources.extend(self.reader.resources)
        except Exception:
            print(f"Exception raised while reading {resourceid}...")
            raise
        if len(self.resources) >= n:
            self.save_resources()
            self.index_resources(strip_search)
            self.resources = []
        return 1

    def import_resource(self, resourceid, graphid, data, reload="ignore", quiet=False):
        with transaction.atomic():
            try:
                resource_instance = Resource.objects.get(pk=resourceid)
                if reload == "ignore":
                    if not quiet:
                        print(" ... already loaded")
                    return 0
                elif reload == "error":
                    print(f"*** Record exists for {resourceid}, and -ow is error")
                    raise FileExistsError(resourceid)
                else:
                    resource_instance.delete()
            except archesmodels.ResourceInstance.DoesNotExist:
                # thrown when resource doesn't exist
                pass
            try:
                self.reader.read_resource(data, resourceid=resourceid, graphid=graphid)
                for resource in self.reader.resources:
                    resource.save(request=None)
            except archesmodels.ResourceInstance.DoesNotExist:
                print(f"*** Could not find model: {graphid}")
                return 0
            except Exception:
                raise
        return 1

    def save_resources(self):
        tiles = []
        for resource in self.resources:
            resource.tiles = resource.get_flattened_tiles()
            tiles.extend(resource.tiles)
        Resource.objects.bulk_create(self.resources)
        TileModel.objects.bulk_create(tiles)
        for t in tiles:
            for nodeid in t.data.keys():
                datatype = self.node_info[nodeid]["datatype"]
                datatype.pre_tile_save(t, nodeid)
        for resource in self.resources:
            resource.save_edit(edit_type="create")

    def index_resources(self, strip_search=False):
        se = SearchEngineInstance
        documents = []
        term_list = []
        for resource in self.resources:
            if strip_search:
                document, terms = monkey_get_documents_to_index(resource, node_info=self.node_info)
            else:
                document, terms = resource.get_documents_to_index(
                    fetchTiles=False, datatype_factory=self.datatype_factory, node_datatypes=self.node_datatypes
                )
            documents.append(se.create_bulk_item(index="resources", id=document["resourceinstanceid"], data=document))
            for term in terms:
                term_list.append(se.create_bulk_item(index="terms", id=term["_id"], data=term["_source"]))
        se.bulk_index(documents)
        se.bulk_index(term_list)