def post(self, request, resourceid=None, slug=None, graphid=None):
    try:
        indent = int(request.POST.get("indent", None))
    except Exception:
        indent = None

    try:
        if user_can_edit_resources(user=request.user):
            data = JSONDeserializer().deserialize(request.body)
            reader = JsonLdReader()
            if slug is not None:
                graphid = models.GraphModel.objects.get(slug=slug).pk
            reader.read_resource(data, graphid=graphid)
            if reader.errors:
                response = []
                for value in reader.errors.values():
                    response.append(value.message)
                return JSONResponse({"error": response}, indent=indent, status=400)
            else:
                response = []
                for resource in reader.resources:
                    with transaction.atomic():
                        resource.save(request=request)
                    response.append(JSONDeserializer().deserialize(self.get(request, resource.resourceinstanceid).content))
                return JSONResponse(response, indent=indent, status=201)
        else:
            return JSONResponse(status=403)
    except Exception as e:
        if settings.DEBUG is True:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
            if len(formatted):
                for message in formatted:
                    print(message)
        return JSONResponse({"error": "resource data could not be saved: %s" % e}, status=500, reason=e)
def post(self, request, resourceid=None):
    try:
        indent = int(request.POST.get('indent', None))
    except Exception:
        indent = None

    try:
        if user_can_edit_resources(user=request.user):
            data = JSONDeserializer().deserialize(request.body)
            reader = JsonLdReader()
            reader.read_resource(data)
            if reader.errors:
                response = []
                for value in reader.errors.values():
                    response.append(value.message)
                return JSONResponse(data, indent=indent, status=400, reason=response)
            else:
                response = []
                for resource in reader.resources:
                    with transaction.atomic():
                        resource.save(request=request)
                    response.append(JSONDeserializer().deserialize(
                        self.get(request, resource.resourceinstanceid).content))
                return JSONResponse(response, indent=indent)
        else:
            return JSONResponse(status=403)
    except Exception as e:
        return JSONResponse(status=500, reason=e)
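# --- Usage sketch (not part of the view code above) ----------------------------
# A minimal client-side example of calling the JSON-LD POST endpoint handled by
# the post() methods above. The URL pattern, host, payload, and bearer-token
# header are assumptions for illustration only; adjust them to however the API
# is routed and secured in your deployment.
import requests

jsonld_doc = {
    "@id": "http://localhost:8000/resources/5683f462-107d-4c58-b3c6-9a5a93df0b41",  # hypothetical resource id
    "@type": "http://www.cidoc-crm.org/cidoc-crm/E22_Man-Made_Object",              # hypothetical class
}

resp = requests.post(
    "http://localhost:8000/resources/my-model-slug/",   # hypothetical route keyed by the graph slug
    json=jsonld_doc,
    headers={"Authorization": "Bearer <token>"},         # hypothetical auth scheme
)
print(resp.status_code, resp.json())  # 201 with the saved resource(s), or 400 with reader errors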
def put(self, request, resourceid):
    if user_can_edit_resources(user=request.user):
        data = JSONDeserializer().deserialize(request.body)
        reader = JsonLdReader()
        reader.read_resource(data)
    else:
        return JSONResponse(status=500)

    return JSONResponse(self.get(request, resourceid))
def put(self, request, resourceid, slug=None, graphid=None):
    try:
        # request.PUT is not a standard Django HttpRequest attribute; unless
        # middleware adds it, this raises AttributeError and indent stays None.
        indent = int(request.PUT.get("indent", None))
    except Exception:
        indent = None

    if not user_can_edit_resources(user=request.user, resourceid=resourceid):
        return JSONResponse(status=403)
    else:
        with transaction.atomic():
            try:
                # DELETE
                resource_instance = Resource.objects.get(pk=resourceid)
                resource_instance.delete()
            except models.ResourceInstance.DoesNotExist:
                pass

            try:
                # POST
                data = JSONDeserializer().deserialize(request.body)
                reader = JsonLdReader()
                if slug is not None:
                    graphid = models.GraphModel.objects.get(slug=slug).pk
                reader.read_resource(data, resourceid=resourceid, graphid=graphid)
                if reader.errors:
                    response = []
                    for value in reader.errors.values():
                        response.append(value.message)
                    return JSONResponse({"error": response}, indent=indent, status=400)
                else:
                    response = []
                    for resource in reader.resources:
                        with transaction.atomic():
                            resource.save(request=request)
                        response.append(JSONDeserializer().deserialize(
                            self.get(request, resource.resourceinstanceid).content))
                    return JSONResponse(response, indent=indent, status=201)
            except models.ResourceInstance.DoesNotExist:
                return JSONResponse(status=404)
            except Exception as e:
                return JSONResponse(
                    {"error": "resource data could not be saved"}, status=500, reason=e)
def put(self, request, resourceid):
    try:
        indent = int(request.POST.get('indent', None))
    except Exception:
        indent = None

    try:
        if user_can_edit_resources(user=request.user):
            data = JSONDeserializer().deserialize(request.body)
            reader = JsonLdReader()
            reader.read_resource(data, use_ids=True)
            if reader.errors:
                response = []
                for value in reader.errors.values():
                    response.append(value.message)
                return JSONResponse(data, indent=indent, status=400, reason=response)
            else:
                response = []
                for resource in reader.resources:
                    if resourceid != str(resource.pk):
                        raise Exception(
                            'Resource id in the URI does not match the resource @id supplied in the document'
                        )
                    old_resource = Resource.objects.get(pk=resource.pk)
                    old_resource.load_tiles()
                    old_tile_ids = set([str(tile.pk) for tile in old_resource.tiles])
                    new_tile_ids = set([str(tile.pk) for tile in resource.get_flattened_tiles()])
                    tileids_to_delete = old_tile_ids.difference(new_tile_ids)
                    tiles_to_delete = models.TileModel.objects.filter(pk__in=tileids_to_delete)
                    with transaction.atomic():
                        tiles_to_delete.delete()
                        resource.save(request=request)
                    response.append(JSONDeserializer().deserialize(
                        self.get(request, resource.resourceinstanceid).content))
                return JSONResponse(response, indent=indent)
        else:
            return JSONResponse(status=403)
    except Exception as e:
        return JSONResponse(status=500, reason=e)
def put(self, request, resourceid):
    try:
        indent = int(request.PUT.get('indent', None))
    except Exception:
        indent = None

    if user_can_edit_resources(user=request.user):
        with transaction.atomic():
            try:
                # DELETE
                resource_instance = Resource.objects.get(pk=resourceid)
                resource_instance.delete()
            except models.ResourceInstance.DoesNotExist:
                pass

            try:
                # POST
                data = JSONDeserializer().deserialize(request.body)
                reader = JsonLdReader()
                reader.read_resource(data, resourceid=resourceid)
                if reader.errors:
                    response = []
                    for value in reader.errors.values():
                        response.append(value.message)
                    return JSONResponse(data, indent=indent, status=400, reason=response)
                else:
                    response = []
                    for resource in reader.resources:
                        with transaction.atomic():
                            resource.save(request=request)
                        response.append(JSONDeserializer().deserialize(
                            self.get(request, resource.resourceinstanceid).content))
                    return JSONResponse(response, indent=indent, status=201)
            except models.ResourceInstance.DoesNotExist:
                return JSONResponse(status=404)
    else:
        return JSONResponse(status=500)
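# --- Usage sketch (not part of the view code above) ----------------------------
# The put() variants above replace an existing resource, either by deleting the
# old instance and re-reading the document, or by diffing tiles when use_ids=True.
# A minimal client-side sketch of a replace request; the URL pattern, host, and
# auth header are assumptions for illustration only.
import requests

resourceid = "5683f462-107d-4c58-b3c6-9a5a93df0b41"  # hypothetical; must match the @id inside the document
revised_doc = {
    "@id": f"http://localhost:8000/resources/{resourceid}",
    "@type": "http://www.cidoc-crm.org/cidoc-crm/E22_Man-Made_Object",  # hypothetical class
}

resp = requests.put(
    f"http://localhost:8000/resources/my-model-slug/{resourceid}",  # hypothetical route
    json=revised_doc,
    headers={"Authorization": "Bearer <token>"},  # hypothetical auth scheme
)
print(resp.status_code)  # 2xx on success, 400 with reader errors, 403 without edit permission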
class Command(BaseCommand):
    """
    Command for importing JSON-LD data into Arches
    """

    def add_arguments(self, parser):
        parser.add_argument(
            "-s", "--source", default="data/", action="store", dest="source",
            help="the directory in which the data files are to be found"
        )
        parser.add_argument(
            "-ow", "--overwrite", default="ignore", action="store", dest="force",
            help="if overwrite, overwrite records that exist; if ignore, then skip; if error, then halt",
        )
        parser.add_argument(
            "--toobig", default=0, type=int, action="store", dest="toobig",
            help="Do not attempt to load records > n kb"
        )
        parser.add_argument(
            "-m", "--model", default="", action="store", dest="model",
            help="the name of the model path to load (eg auction_of_lot)",
        )
        parser.add_argument(
            "-b", "--block", default="", action="store", dest="block",
            help="the name of the block in the model path to load (eg 00), or slice in the form this,total (eg 1,5)",
        )
        parser.add_argument(
            "--max", default=-1, type=int, action="store", dest="max",
            help="Maximum number of records to load per model"
        )
        parser.add_argument(
            "--fast", default=0, action="store", type=int, dest="fast",
            help="Use bulk_save to store n records at a time"
        )
        parser.add_argument(
            "-q", "--quiet", default=False, action="store_true", dest="quiet",
            help="Don't announce every record"
        )
        parser.add_argument(
            "--skip", default=-1, type=int, action="store", dest="skip",
            help="Number of records to skip before starting to load"
        )
        parser.add_argument(
            "--suffix", default="json", action="store", dest="suffix",
            help="file suffix to load if not .json"
        )
        parser.add_argument(
            "--ignore-errors", default=False, action="store_true", dest="ignore_errors",
            help="Log but do not terminate on errors"
        )
        parser.add_argument(
            "--strip-issearchable", default=False, action="store_true", dest="strip_search",
            help="If a node is set to not be exposed to advanced search, then don't even index it",
        )

    def handle(self, *args, **options):
        print("Starting JSON-LD load")
        if options["model"]:
            print(f"Only loading {options['model']}")
        if options["block"]:
            print(f"Only loading {options['block']}")
        if options["force"] == "overwrite":
            print("Overwriting existing records")
        if options["toobig"]:
            print(f"Not loading records > {options['toobig']}kb")
        if options["quiet"]:
            print("Only announcing timing data")
        if options["strip_search"] and not options["fast"]:
            print("ERROR: stripping fields not exposed to advanced search only works in fast mode")
            return
        self.resources = []
        self.load_resources(options)

    def load_resources(self, options):
        self.reader = JsonLdReader()
        self.jss = JSONSerializer()
        source = options["source"]
        if options["model"]:
            models = [options["model"]]
        else:
            models = os.listdir(source)
            models.sort()
            models = [m for m in models if m[0] not in ["_", "."]]
        print(f"Found possible models: {models}")

        # This is boilerplate for any use of get_documents_to_index()
        # Need to add issearchable for strip_search option
        # Only calculate it once per load
        self.datatype_factory = DataTypeFactory()
        dt_instance_hash = {}
        self.node_info = {
            str(nodeid): {
                "datatype": dt_instance_hash.setdefault(datatype, self.datatype_factory.get_instance(datatype)),
                "issearchable": srch,
            }
            for nodeid, datatype, srch in archesmodels.Node.objects.values_list("nodeid", "datatype", "issearchable")
        }
        self.node_datatypes = {
            str(nodeid): datatype for nodeid, datatype in archesmodels.Node.objects.values_list("nodeid", "datatype")
        }

        start = time.time()
        seen = 0
        loaded = 0

        for m in models:
            print(f"Loading {m}")
            graphid = graph_uuid_map.get(m, None)
            if not graphid:
                # Check slug
                try:
                    graphid = archesmodels.GraphModel.objects.get(slug=m).pk
                except Exception:
                    print(f"Couldn't find a model definition for {m}; skipping")
                    continue
            # We have a good model, so build the pre-processed tree once
            self.reader.graphtree = self.reader.process_graph(graphid)

            block = options["block"]
            if block and "," not in block:
                blocks = [block]
            else:
                blocks = os.listdir(f"{source}/{m}")
                blocks.sort()
                blocks = [b for b in blocks if b[0] not in ["_", "."]]
                if "," in block:
                    # {slice},{max-slices}
                    (cslice, mslice) = block.split(",")
                    cslice = int(cslice) - 1
                    mslice = int(mslice)
                    blocks = blocks[cslice::mslice]

            loaded_model = 0
            try:
                for b in blocks:
                    files = os.listdir(f"{source}/{m}/{b}")
                    files.sort()
                    for f in files:
                        if not f.endswith(options["suffix"]):
                            continue
                        elif f.startswith(".") or f.startswith("_"):
                            continue
                        if options["max"] > 0 and loaded_model >= options["max"]:
                            raise StopIteration()
                        seen += 1
                        if seen <= options["skip"]:
                            # Do it this way to keep the counts correct
                            continue
                        fn = f"{source}/{m}/{b}/{f}"
                        # Check file size of record
                        if not options["quiet"]:
                            print(f"About to import {fn}")
                        if options["toobig"]:
                            sz = os.path.getsize(fn)
                            if sz > options["toobig"]:
                                if not options["quiet"]:
                                    print(f" ... Skipping due to size: {sz} > {options['toobig']}")
                                continue
                        uu = f.replace(f".{options['suffix']}", "")
                        fh = open(fn)
                        data = fh.read()
                        fh.close()
                        # FIXME Timezone / DateTime Workaround
                        # FIXME The following line should be removed when #5669 / #6346 are closed
                        data = data.replace("T00:00:00Z", "")
                        jsdata = json.loads(data)
                        jsdata = fix_js_data(data, jsdata, m)
                        if len(uu) != 36 or uu[8] != "-":
                            # extract uuid from data if filename is not a UUID
                            uu = jsdata["id"][-36:]
                        if jsdata:
                            try:
                                if options["fast"]:
                                    l = self.fast_import_resource(
                                        uu,
                                        graphid,
                                        jsdata,
                                        n=options["fast"],
                                        reload=options["force"],
                                        quiet=options["quiet"],
                                        strip_search=options["strip_search"],
                                    )
                                else:
                                    l = self.import_resource(uu, graphid, jsdata, reload=options["force"], quiet=options["quiet"])
                                loaded += l
                                loaded_model += l
                            except Exception as e:
                                print(f"*** Failed to load {fn}:\n {e}\n")
                                if not options["ignore_errors"]:
                                    raise
                        else:
                            print(" ... skipped due to bad data :(")
                        if not seen % 100:
                            print(f" ... seen {seen} / loaded {loaded} in {time.time()-start}")
            except StopIteration as e:
                break
            except Exception:
                raise

        if options["fast"] and self.resources:
            self.save_resources()
            self.index_resources(options["strip_search"])
            self.resources = []
        print(f"Total Time: seen {seen} / loaded {loaded} in {time.time()-start} seconds")

    def fast_import_resource(self, resourceid, graphid, data, n=1000, reload="ignore", quiet=True, strip_search=False):
        try:
            resource_instance = Resource.objects.get(pk=resourceid)
            if reload == "ignore":
                if not quiet:
                    print(f" ... already loaded")
                return 0
            elif reload == "error":
                print(f"*** Record exists for {resourceid}, and -ow is error")
                raise FileExistsError(resourceid)
            else:
                resource_instance.delete()
        except archesmodels.ResourceInstance.DoesNotExist:
            # thrown when resource doesn't exist
            pass
        try:
            self.reader.read_resource(data, resourceid=resourceid, graphid=graphid)
            self.resources.extend(self.reader.resources)
        except Exception:
            print(f"Exception raised while reading {resourceid}...")
            raise
        if len(self.resources) >= n:
            self.save_resources()
            self.index_resources(strip_search)
            self.resources = []
        return 1

    def import_resource(self, resourceid, graphid, data, reload="ignore", quiet=False):
        with transaction.atomic():
            try:
                resource_instance = Resource.objects.get(pk=resourceid)
                if reload == "ignore":
                    if not quiet:
already loaded") return 0 elif reload == "error": print(f"*** Record exists for {resourceid}, and -ow is error") raise FileExistsError(resourceid) else: resource_instance.delete() except archesmodels.ResourceInstance.DoesNotExist: # thrown when resource doesn't exist pass try: self.reader.read_resource(data, resourceid=resourceid, graphid=graphid) for resource in self.reader.resources: resource.save(request=None) except archesmodels.ResourceInstance.DoesNotExist: print(f"*** Could not find model: {graphid}") return 0 except Exception as e: raise return 1 def save_resources(self): tiles = [] for resource in self.resources: resource.tiles = resource.get_flattened_tiles() tiles.extend(resource.tiles) Resource.objects.bulk_create(self.resources) TileModel.objects.bulk_create(tiles) for t in tiles: for nodeid in t.data.keys(): datatype = self.node_info[nodeid]["datatype"] datatype.pre_tile_save(t, nodeid) for resource in self.resources: resource.save_edit(edit_type="create") def index_resources(self, strip_search=False): se = SearchEngineInstance documents = [] term_list = [] for resource in self.resources: if strip_search: document, terms = monkey_get_documents_to_index(resource, node_info=self.node_info) else: document, terms = resource.get_documents_to_index( fetchTiles=False, datatype_factory=self.datatype_factory, node_datatypes=self.node_datatypes ) documents.append(se.create_bulk_item(index="resources", id=document["resourceinstanceid"], data=document)) for term in terms: term_list.append(se.create_bulk_item(index="terms", id=term["_id"], data=term["_source"])) se.bulk_index(documents) se.bulk_index(term_list)