Beispiel #1
0
    def post(self, request, resourceid=None, slug=None, graphid=None):
        """Create resource instances from a JSON-LD document in the request body.

        Returns 201 with the serialized resources on success, 400 with the
        reader's error messages when the JSON-LD fails validation, 403 when
        the user may not edit resources, and 500 on any other failure.
        """
        try:
            # "indent" controls JSON pretty-printing; int(None)/int("junk")
            # raise, so a missing or invalid value falls back to compact output.
            indent = int(request.POST.get("indent", None))
        except (TypeError, ValueError):
            indent = None

        try:
            if not user_can_edit_resources(user=request.user):
                return JSONResponse(status=403)

            data = JSONDeserializer().deserialize(request.body)
            reader = JsonLdReader()
            if slug is not None:
                # Resolve the target graph id from its slug when one was supplied.
                graphid = models.GraphModel.objects.get(slug=slug).pk
            reader.read_resource(data, graphid=graphid)
            if reader.errors:
                response = [value.message for value in reader.errors.values()]
                return JSONResponse({"error": response}, indent=indent, status=400)

            response = []
            for resource in reader.resources:
                with transaction.atomic():
                    resource.save(request=request)
                # Echo back the canonical representation produced by GET.
                response.append(JSONDeserializer().deserialize(self.get(request, resource.resourceinstanceid).content))
            return JSONResponse(response, indent=indent, status=201)
        except Exception as e:
            if settings.DEBUG is True:
                # Dump the full traceback of the in-flight exception; replaces
                # the manual sys.exc_info()/format_exception/print loop.
                traceback.print_exc()
            return JSONResponse({"error": "resource data could not be saved: %s" % e}, status=500, reason=e)
Beispiel #2
0
    def post(self, request, resourceid=None):
        """Create resource instances from a JSON-LD document in the request body.

        Returns the serialized resources on success, 400 with the reader's
        error messages when validation fails, 403 when the user may not edit
        resources, and 500 on any other failure.
        """
        try:
            # "indent" controls JSON pretty-printing; int(None)/int("junk")
            # raise, so a missing or invalid value falls back to compact output.
            indent = int(request.POST.get('indent', None))
        except (TypeError, ValueError):
            indent = None

        try:
            if user_can_edit_resources(user=request.user):
                data = JSONDeserializer().deserialize(request.body)
                reader = JsonLdReader()
                reader.read_resource(data)
                if reader.errors:
                    # dict.itervalues() is Python 2 only and raises
                    # AttributeError on Python 3; values() works on both.
                    response = [value.message for value in reader.errors.values()]
                    return JSONResponse(data, indent=indent, status=400, reason=response)
                else:
                    response = []
                    for resource in reader.resources:
                        with transaction.atomic():
                            resource.save(request=request)
                        # Echo back the canonical representation produced by GET.
                        response.append(JSONDeserializer().deserialize(
                            self.get(request, resource.resourceinstanceid).content))
                    return JSONResponse(response, indent=indent)
            else:
                return JSONResponse(status=403)
        except Exception as e:
            return JSONResponse(status=500, reason=e)
Beispiel #3
0
    def put(self, request, resourceid):
        """Validate a JSON-LD document against the resource's graph.

        NOTE(review): the parsed resources are never saved here -- this looks
        like a stub; confirm against callers before relying on it to persist
        anything. On success it returns the GET representation of the
        (unmodified) resource identified by ``resourceid``.
        """
        if user_can_edit_resources(user=request.user):
            data = JSONDeserializer().deserialize(request.body)
            reader = JsonLdReader()
            reader.read_resource(data)
        else:
            # A permission failure is the client's fault, not a server error:
            # return 403 (as the sibling handlers do), not 500.
            return JSONResponse(status=403)

        return JSONResponse(self.get(request, resourceid))
Beispiel #4
0
    def put(self, request, resourceid, slug=None, graphid=None):
        """Replace a resource instance with the JSON-LD document in the body.

        Any existing instance with ``resourceid`` is deleted and re-created
        from the payload (delete + create run inside one transaction).
        Returns 201 with the serialized resources, 400 on validation errors,
        403 on missing permission, 404 when the graph cannot be found, and
        500 on any other failure.
        """
        try:
            # Django's HttpRequest has no PUT attribute, so the original
            # request.PUT lookup always raised AttributeError and indent was
            # always None. Read the pretty-print option from the query string.
            indent = int(request.GET.get("indent", None))
        except (TypeError, ValueError):
            indent = None

        if not user_can_edit_resources(user=request.user,
                                       resourceid=resourceid):
            return JSONResponse(status=403)
        else:
            with transaction.atomic():
                try:
                    # DELETE: drop the current instance so the payload fully
                    # replaces it; a missing instance just means plain create.
                    resource_instance = Resource.objects.get(pk=resourceid)
                    resource_instance.delete()
                except models.ResourceInstance.DoesNotExist:
                    pass

                try:
                    # POST: re-create the resource from the JSON-LD payload.
                    data = JSONDeserializer().deserialize(request.body)
                    reader = JsonLdReader()
                    if slug is not None:
                        # Resolve the target graph id from its slug.
                        graphid = models.GraphModel.objects.get(slug=slug).pk
                    reader.read_resource(data,
                                         resourceid=resourceid,
                                         graphid=graphid)
                    if reader.errors:
                        response = [value.message for value in reader.errors.values()]
                        return JSONResponse({"error": response},
                                            indent=indent,
                                            status=400)
                    else:
                        response = []
                        for resource in reader.resources:
                            with transaction.atomic():
                                resource.save(request=request)
                            # Echo back the canonical GET representation.
                            response.append(JSONDeserializer().deserialize(
                                self.get(request,
                                         resource.resourceinstanceid).content))
                        return JSONResponse(response,
                                            indent=indent,
                                            status=201)
                except models.ResourceInstance.DoesNotExist:
                    return JSONResponse(status=404)
                except Exception as e:
                    return JSONResponse(
                        {"error": "resource data could not be saved"},
                        status=500,
                        reason=e)
Beispiel #5
0
    def put(self, request, resourceid):
        """Update a resource in place from the JSON-LD document in the body.

        The ``@id`` embedded in the document must match ``resourceid``.
        Tiles present on the stored resource but absent from the payload are
        deleted. Returns 400 with the reader's errors on validation failure,
        403 on missing permission, and 500 on any other failure.
        """
        try:
            # "indent" controls JSON pretty-printing; int(None)/int("junk")
            # raise, so a missing or invalid value falls back to compact output.
            indent = int(request.POST.get('indent', None))
        except (TypeError, ValueError):
            indent = None

        try:
            if user_can_edit_resources(user=request.user):
                data = JSONDeserializer().deserialize(request.body)
                reader = JsonLdReader()
                reader.read_resource(data, use_ids=True)
                if reader.errors:
                    # dict.itervalues() is Python 2 only and raises
                    # AttributeError on Python 3; values() works on both.
                    response = [value.message for value in reader.errors.values()]
                    return JSONResponse(data,
                                        indent=indent,
                                        status=400,
                                        reason=response)
                else:
                    response = []
                    for resource in reader.resources:
                        if resourceid != str(resource.pk):
                            raise Exception(
                                'Resource id in the URI does not match the resource @id supplied in the document'
                            )
                        # Diff the stored tiles against the payload's tiles and
                        # remove the ones that no longer appear.
                        old_resource = Resource.objects.get(pk=resource.pk)
                        old_resource.load_tiles()
                        old_tile_ids = {str(tile.pk) for tile in old_resource.tiles}
                        new_tile_ids = {
                            str(tile.pk)
                            for tile in resource.get_flattened_tiles()
                        }
                        tileids_to_delete = old_tile_ids.difference(
                            new_tile_ids)
                        tiles_to_delete = models.TileModel.objects.filter(
                            pk__in=tileids_to_delete)
                        # Delete stale tiles and save the new state atomically.
                        with transaction.atomic():
                            tiles_to_delete.delete()
                            resource.save(request=request)
                        response.append(JSONDeserializer().deserialize(
                            self.get(request,
                                     resource.resourceinstanceid).content))
                    return JSONResponse(response, indent=indent)
            else:
                return JSONResponse(status=403)
        except Exception as e:
            return JSONResponse(status=500, reason=e)
Beispiel #6
0
    def put(self, request, resourceid):
        """Replace a resource instance with the JSON-LD document in the body.

        Any existing instance with ``resourceid`` is deleted and re-created
        from the payload (delete + create run inside one transaction).
        Returns 201 with the serialized resources, 400 on validation errors,
        403 on missing permission, and 404 when the model cannot be found.
        """
        try:
            # Django's HttpRequest has no PUT attribute, so the original
            # request.PUT lookup always raised AttributeError and indent was
            # always None. Read the pretty-print option from the query string.
            indent = int(request.GET.get('indent', None))
        except (TypeError, ValueError):
            indent = None

        if user_can_edit_resources(user=request.user):
            with transaction.atomic():
                try:
                    # DELETE: drop the current instance so the payload fully
                    # replaces it; a missing instance just means plain create.
                    resource_instance = Resource.objects.get(pk=resourceid)
                    resource_instance.delete()
                except models.ResourceInstance.DoesNotExist:
                    pass

                try:
                    # POST: re-create the resource from the JSON-LD payload.
                    data = JSONDeserializer().deserialize(request.body)
                    reader = JsonLdReader()
                    reader.read_resource(data, resourceid=resourceid)
                    if reader.errors:
                        # dict.itervalues() is Python 2 only and raises
                        # AttributeError on Python 3; values() works on both.
                        response = [value.message for value in reader.errors.values()]
                        return JSONResponse(data,
                                            indent=indent,
                                            status=400,
                                            reason=response)
                    else:
                        response = []
                        for resource in reader.resources:
                            with transaction.atomic():
                                resource.save(request=request)
                            # Echo back the canonical GET representation.
                            response.append(JSONDeserializer().deserialize(
                                self.get(request,
                                         resource.resourceinstanceid).content))
                        return JSONResponse(response,
                                            indent=indent,
                                            status=201)
                except models.ResourceInstance.DoesNotExist:
                    return JSONResponse(status=404)
        else:
            # Permission denial is a client error: 403 (as the sibling
            # handlers return), not 500.
            return JSONResponse(status=403)
Beispiel #7
0
class Command(BaseCommand):
    """
    Command for importing JSON-LD data into Arches.

    Walks a directory tree of the form ``<source>/<model>/<block>/<uuid>.json``
    and loads each JSON-LD file as a resource instance, either one at a time
    (``import_resource``) or batched (``fast_import_resource``).
    """

    def add_arguments(self, parser):
        """Register the command-line options for the JSON-LD loader."""

        parser.add_argument(
            "-s", "--source", default="data/", action="store", dest="source", help="the directory in which the data files are to be found"
        )

        parser.add_argument(
            "-ow",
            "--overwrite",
            default="ignore",
            action="store",
            dest="force",
            help="if overwrite, overwrite records that exist; if ignore, then skip; if error, then halt",
        )

        parser.add_argument("--toobig", default=0, type=int, action="store", dest="toobig", help="Do not attempt to load records > n kb")

        parser.add_argument(
            "-m", "--model", default="", action="store", dest="model", help="the name of the model path to load (eg auction_of_lot)",
        )

        parser.add_argument(
            "-b",
            "--block",
            default="",
            action="store",
            dest="block",
            help="the name of the block in the model path to load (eg 00), or slice in the form this,total (eg 1,5)",
        )

        parser.add_argument("--max", default=-1, type=int, action="store", dest="max", help="Maximum number of records to load per model")

        parser.add_argument("--fast", default=0, action="store", type=int, dest="fast", help="Use bulk_save to store n records at a time")

        parser.add_argument("-q", "--quiet", default=False, action="store_true", dest="quiet", help="Don't announce every record")

        parser.add_argument(
            "--skip", default=-1, type=int, action="store", dest="skip", help="Number of records to skip before starting to load"
        )

        parser.add_argument("--suffix", default="json", action="store", dest="suffix", help="file suffix to load if not .json")

        parser.add_argument(
            "--ignore-errors", default=False, action="store_true", dest="ignore_errors", help="Log but do not terminate on errors"
        )

        parser.add_argument(
            "--strip-issearchable",
            default=False,
            action="store_true",
            dest="strip_search",
            help="If a node is set to not be exposed to advanced search, then don't even index it",
        )

    def handle(self, *args, **options):
        """Entry point: announce the chosen options and kick off the load."""

        print("Starting JSON-LD load")
        if options["model"]:
            print(f"Only loading {options['model']}")
        if options["block"]:
            print(f"Only loading {options['block']}")
        if options["force"] == "overwrite":
            print("Overwriting existing records")
        if options["toobig"]:
            print(f"Not loading records > {options['toobig']}kb")
        if options["quiet"]:
            print("Only announcing timing data")

        # strip_search relies on the custom indexing path used by fast mode.
        if options["strip_search"] and not options["fast"]:
            print("ERROR: stripping fields not exposed to advanced search only works in fast mode")
            return

        self.resources = []
        self.load_resources(options)

    def load_resources(self, options):
        """Walk the source tree and import every matching JSON-LD file.

        Honors the model/block/max/skip/toobig/suffix filters from *options*
        and prints progress plus a final timing summary.
        """

        self.reader = JsonLdReader()
        self.jss = JSONSerializer()
        source = options["source"]
        if options["model"]:
            models = [options["model"]]
        else:
            # Each non-hidden subdirectory of the source dir is a model.
            models = os.listdir(source)
            models.sort()
            models = [m for m in models if m[0] not in ["_", "."]]
        print(f"Found possible models: {models}")

        # This is boilerplate for any use of get_documents_to_index()
        # Need to add issearchable for strip_search option
        # Only calculate it once per load
        self.datatype_factory = DataTypeFactory()
        dt_instance_hash = {}
        self.node_info = {
            str(nodeid): {
                "datatype": dt_instance_hash.setdefault(datatype, self.datatype_factory.get_instance(datatype)),
                "issearchable": srch,
            }
            for nodeid, datatype, srch in archesmodels.Node.objects.values_list("nodeid", "datatype", "issearchable")
        }
        self.node_datatypes = {str(nodeid): datatype for nodeid, datatype in archesmodels.Node.objects.values_list("nodeid", "datatype")}

        start = time.time()
        seen = 0
        loaded = 0

        for m in models:
            print(f"Loading {m}")
            graphid = graph_uuid_map.get(m, None)
            if not graphid:
                # Fall back to resolving the directory name as a graph slug.
                try:
                    graphid = archesmodels.GraphModel.objects.get(slug=m).pk
                except Exception:
                    print(f"Couldn't find a model definition for {m}; skipping")
                    continue
            # We have a good model, so build the pre-processed tree once
            self.reader.graphtree = self.reader.process_graph(graphid)
            block = options["block"]
            if block and "," not in block:
                blocks = [block]
            else:
                blocks = os.listdir(f"{source}/{m}")
                blocks.sort()
                blocks = [b for b in blocks if b[0] not in ["_", "."]]
                if "," in block:
                    # {slice},{max-slices}: take every mslice-th block
                    # starting at (cslice - 1), for parallel partial loads.
                    (cslice, mslice) = block.split(",")
                    cslice = int(cslice) - 1
                    mslice = int(mslice)
                    blocks = blocks[cslice::mslice]

            loaded_model = 0

            try:
                for b in blocks:
                    files = os.listdir(f"{source}/{m}/{b}")
                    files.sort()
                    for f in files:
                        if not f.endswith(options["suffix"]):
                            continue
                        elif f.startswith(".") or f.startswith("_"):
                            continue

                        if options["max"] > 0 and loaded_model >= options["max"]:
                            # Per-model limit reached; StopIteration is caught
                            # below to break out of the nested loops.
                            raise StopIteration()
                        seen += 1
                        if seen <= options["skip"]:
                            # Do it this way to keep the counts correct
                            continue
                        fn = f"{source}/{m}/{b}/{f}"
                        # Check file size of record
                        if not options["quiet"]:
                            print(f"About to import {fn}")
                        if options["toobig"]:
                            # Fixed: was os.os.path.getsize (AttributeError).
                            # TODO(review): getsize() returns bytes but the
                            # option is documented in kb -- confirm the
                            # intended units before changing the comparison.
                            sz = os.path.getsize(fn)
                            if sz > options["toobig"]:
                                # Fixed: was bare `quiet` (NameError).
                                if not options["quiet"]:
                                    print(f" ... Skipping due to size:  {sz} > {options['toobig']}")
                                continue
                        uu = f.replace(f".{options['suffix']}", "")
                        with open(fn) as fh:
                            data = fh.read()
                        # FIXME Timezone / DateTime Workaround
                        # FIXME The following line should be removed when #5669 / #6346 are closed
                        data = data.replace("T00:00:00Z", "")
                        jsdata = json.loads(data)
                        jsdata = fix_js_data(data, jsdata, m)
                        if len(uu) != 36 or uu[8] != "-":
                            # extract uuid from data if filename is not a UUID
                            uu = jsdata["id"][-36:]
                        if jsdata:
                            try:
                                if options["fast"]:
                                    count = self.fast_import_resource(
                                        uu,
                                        graphid,
                                        jsdata,
                                        n=options["fast"],
                                        reload=options["force"],
                                        quiet=options["quiet"],
                                        strip_search=options["strip_search"],
                                    )
                                else:
                                    count = self.import_resource(uu, graphid, jsdata, reload=options["force"], quiet=options["quiet"])
                                loaded += count
                                loaded_model += count
                            except Exception as e:
                                print(f"*** Failed to load {fn}:\n     {e}\n")
                                if not options["ignore_errors"]:
                                    raise
                        else:
                            print(" ... skipped due to bad data :(")
                        if not seen % 100:
                            print(f" ... seen {seen} / loaded {loaded} in {time.time()-start}")
            except StopIteration:
                # Raised above when the per-model --max limit is reached.
                break
        # Flush any resources still buffered by fast mode.
        if options["fast"] and self.resources:
            self.save_resources()
            self.index_resources(options["strip_search"])
            self.resources = []
        print(f"Total Time: seen {seen} / loaded {loaded} in {time.time()-start} seconds")

    def fast_import_resource(self, resourceid, graphid, data, n=1000, reload="ignore", quiet=True, strip_search=False):
        """Buffer a resource for bulk saving; flush every *n* resources.

        Returns 1 when the record was accepted, 0 when it was skipped because
        it already exists and *reload* is "ignore". Raises FileExistsError
        when it exists and *reload* is "error".
        """
        try:
            resource_instance = Resource.objects.get(pk=resourceid)
            if reload == "ignore":
                if not quiet:
                    print(" ... already loaded")
                return 0
            elif reload == "error":
                print(f"*** Record exists for {resourceid}, and -ow is error")
                raise FileExistsError(resourceid)
            else:
                resource_instance.delete()
        except archesmodels.ResourceInstance.DoesNotExist:
            # thrown when resource doesn't exist
            pass
        try:
            self.reader.read_resource(data, resourceid=resourceid, graphid=graphid)
            self.resources.extend(self.reader.resources)
        except Exception:
            print(f"Exception raised while reading {resourceid}...")
            raise
        if len(self.resources) >= n:
            # Buffer full: bulk-save and index, then start a fresh batch.
            self.save_resources()
            self.index_resources(strip_search)
            self.resources = []
        return 1

    def import_resource(self, resourceid, graphid, data, reload="ignore", quiet=False):
        """Load a single resource inside its own transaction.

        Returns 1 on success, 0 when skipped (already loaded and *reload* is
        "ignore") or when the model cannot be found. Raises FileExistsError
        when the record exists and *reload* is "error".
        """
        with transaction.atomic():
            try:
                resource_instance = Resource.objects.get(pk=resourceid)
                if reload == "ignore":
                    if not quiet:
                        print(" ... already loaded")
                    return 0
                elif reload == "error":
                    print(f"*** Record exists for {resourceid}, and -ow is error")
                    raise FileExistsError(resourceid)
                else:
                    resource_instance.delete()
            except archesmodels.ResourceInstance.DoesNotExist:
                # thrown when resource doesn't exist
                pass

            try:
                self.reader.read_resource(data, resourceid=resourceid, graphid=graphid)
                for resource in self.reader.resources:
                    resource.save(request=None)
            except archesmodels.ResourceInstance.DoesNotExist:
                print(f"*** Could not find model: {graphid}")
                return 0
        return 1

    def save_resources(self):
        """Bulk-create the buffered resources and their flattened tiles."""
        tiles = []
        for resource in self.resources:
            resource.tiles = resource.get_flattened_tiles()
            tiles.extend(resource.tiles)
        Resource.objects.bulk_create(self.resources)
        TileModel.objects.bulk_create(tiles)
        # bulk_create bypasses save(), so run the datatype pre-save hooks here.
        for t in tiles:
            for nodeid in t.data.keys():
                datatype = self.node_info[nodeid]["datatype"]
                datatype.pre_tile_save(t, nodeid)
        for resource in self.resources:
            resource.save_edit(edit_type="create")

    def index_resources(self, strip_search=False):
        """Index the buffered resources (and their terms) into the search engine.

        When *strip_search* is true, use the patched document builder that
        drops nodes not exposed to advanced search.
        """
        se = SearchEngineInstance
        documents = []
        term_list = []
        for resource in self.resources:
            if strip_search:
                document, terms = monkey_get_documents_to_index(resource, node_info=self.node_info)
            else:
                document, terms = resource.get_documents_to_index(
                    fetchTiles=False, datatype_factory=self.datatype_factory, node_datatypes=self.node_datatypes
                )
            documents.append(se.create_bulk_item(index="resources", id=document["resourceinstanceid"], data=document))
            for term in terms:
                term_list.append(se.create_bulk_item(index="terms", id=term["_id"], data=term["_source"]))
        se.bulk_index(documents)
        se.bulk_index(term_list)