Beispiel #1
0
    def post(self, request, resourceid=None):
        """
        Create resources from the JSON-LD document in the request body.

        Responses:
            403 -- the user is not allowed to edit resources.
            400 -- the reader reported errors; the submitted document is echoed
                   back and the error messages are passed as the reason.
            500 -- any other exception; the exception is passed as the reason.
            otherwise -- the list of saved resources, each re-fetched through
                   ``self.get`` and deserialized.

        ``resourceid`` is not used by this method.
        """
        # "indent" is optional: a missing or non-numeric value means no indent.
        try:
            indent = int(request.POST.get('indent', None))
        except Exception:
            indent = None

        try:
            if not user_can_edit_resources(user=request.user):
                return JSONResponse(status=403)

            data = JSONDeserializer().deserialize(request.body)
            reader = JsonLdReader()
            reader.read_resource(data)

            if reader.errors:
                # values() exists on both Python 2 and 3; itervalues() does not.
                messages = [error.message for error in reader.errors.values()]
                return JSONResponse(data, indent=indent, status=400, reason=messages)

            response = []
            for resource in reader.resources:
                # Every resource is saved in its own transaction.
                with transaction.atomic():
                    resource.save(request=request)
                response.append(JSONDeserializer().deserialize(
                    self.get(request, resource.resourceinstanceid).content))
            return JSONResponse(response, indent=indent)
        except Exception as e:
            return JSONResponse(status=500, reason=e)
Beispiel #2
0
    def test_find_leaf_branch(self):
        """
        Given the children of the "Name" node of the unique graph, check that
        findBranch returns the leaf node matching the supplied jsonld fragment

        """

        # NOTE(review): the "@id" entry below looks mangled -- "http://*****:*****@type"
        # runs the "@id" value into the "@type" key, which leaves a `key: value: value`
        # sequence that is not a valid dict literal. Restore it from the upstream test.
        jsonld_graph = {
            "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by": {
                "@id":
                "http://*****:*****@type":
                "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation",
                "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "Will",
            }
        }

        graphtree = self.unique_graph.get_tree()
        # Pick the top-level child whose node is named "Name"; `node` stays
        # unbound if no child matches.
        for child in graphtree["children"]:
            if child["node"].name == "Name":
                node = child
        reader = JsonLdReader()
        # Search only below the "Name" node, i.e. among the leaf candidates.
        branch = reader.findBranch(
            node["children"],
            "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by",
            jsonld_graph[
                "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by"],
        )
        self.assertEqual(str(branch["node"].pk),
                         "839b0e4c-95e6-11e8-aada-14109fd34195")
Beispiel #3
0
    def post(self, request, resourceid=None, slug=None, graphid=None):
        """
        Create resources from the JSON-LD document in the request body.

        When ``slug`` is given, the graph id is looked up from the graph with
        that slug; otherwise ``graphid`` is passed to the reader as supplied.

        Responses:
            403 -- the user is not allowed to edit resources.
            400 -- the reader reported errors (``{"error": [messages]}``).
            500 -- any other exception (``{"error": "..."}``); with
                   ``settings.DEBUG`` on, the traceback is printed as well.
            201 -- the list of saved resources, each re-fetched via ``self.get``.
        """
        # "indent" is optional: anything that cannot be read as an int means none.
        try:
            indent = int(request.POST.get("indent", None))
        except Exception:
            indent = None

        try:
            if not user_can_edit_resources(user=request.user):
                return JSONResponse(status=403)

            payload = JSONDeserializer().deserialize(request.body)
            jsonld_reader = JsonLdReader()
            if slug is not None:
                graphid = models.GraphModel.objects.get(slug=slug).pk
            jsonld_reader.read_resource(payload, graphid=graphid)

            if jsonld_reader.errors:
                messages = [err.message for err in jsonld_reader.errors.values()]
                return JSONResponse({"error": messages}, indent=indent, status=400)

            saved = []
            for new_resource in jsonld_reader.resources:
                # Each resource gets its own transaction.
                with transaction.atomic():
                    new_resource.save(request=request)
                fetched = self.get(request, new_resource.resourceinstanceid)
                saved.append(JSONDeserializer().deserialize(fetched.content))
            return JSONResponse(saved, indent=indent, status=201)
        except Exception as e:
            if settings.DEBUG is True:
                # Dump the formatted traceback, one chunk per print call.
                for chunk in traceback.format_exception(*sys.exc_info()):
                    print(chunk)
            return JSONResponse({"error": "resource data could not be saved: %s" % e}, status=500, reason=e)
Beispiel #4
0
    def put(self, request, resourceid):
        """
        Read the JSON-LD document in the request body and respond with the
        result of ``self.get`` for ``resourceid``.

        Responds with 403 when the user is not allowed to edit resources.
        """
        if not user_can_edit_resources(user=request.user):
            # A permission failure is the client's problem (403 Forbidden),
            # not a server fault (500).
            return JSONResponse(status=403)

        data = JSONDeserializer().deserialize(request.body)
        reader = JsonLdReader()
        # NOTE(review): nothing in this method saves reader.resources -- confirm
        # whether read_resource is expected to persist anything on its own.
        reader.read_resource(data)

        return JSONResponse(self.get(request, resourceid))
Beispiel #5
0
    def put(self, request, resourceid, slug=None, graphid=None):
        """
        Replace the resource ``resourceid`` with the JSON-LD document in the
        request body: delete the existing instance (if any), then read and save
        the new one, all inside a single transaction.

        When ``slug`` is given, the graph id is looked up from the graph with
        that slug; otherwise ``graphid`` is passed to the reader as supplied.

        Responses:
            403 -- the user is not allowed to edit this resource.
            400 -- the reader reported errors (``{"error": [messages]}``).
            404 -- a ResourceInstance lookup failed while reading or saving.
            500 -- any other exception.
            201 -- the list of saved resources, each re-fetched via ``self.get``.

        Every error response marks the transaction for rollback, so a failed
        PUT no longer commits the delete of the existing resource.
        """
        # NOTE(review): stock Django requests have no ``PUT`` attribute; unless
        # something adds one, this always falls through to ``indent = None``.
        try:
            indent = int(request.PUT.get("indent", None))
        except Exception:
            indent = None

        if not user_can_edit_resources(user=request.user,
                                       resourceid=resourceid):
            return JSONResponse(status=403)

        with transaction.atomic():
            try:
                # DELETE
                resource_instance = Resource.objects.get(pk=resourceid)
                resource_instance.delete()
            except models.ResourceInstance.DoesNotExist:
                # Nothing to replace: the PUT simply creates the resource.
                pass

            try:
                # POST
                data = JSONDeserializer().deserialize(request.body)
                reader = JsonLdReader()
                if slug is not None:
                    graphid = models.GraphModel.objects.get(slug=slug).pk
                reader.read_resource(data,
                                     resourceid=resourceid,
                                     graphid=graphid)
                if reader.errors:
                    # Returning normally from inside atomic() would commit the
                    # delete above; force a rollback instead.
                    transaction.set_rollback(True)
                    messages = [error.message
                                for error in reader.errors.values()]
                    return JSONResponse({"error": messages},
                                        indent=indent,
                                        status=400)

                response = []
                for resource in reader.resources:
                    with transaction.atomic():
                        resource.save(request=request)
                    response.append(JSONDeserializer().deserialize(
                        self.get(request,
                                 resource.resourceinstanceid).content))
                return JSONResponse(response,
                                    indent=indent,
                                    status=201)
            except models.ResourceInstance.DoesNotExist:
                transaction.set_rollback(True)
                return JSONResponse(status=404)
            except Exception as e:
                # The exception is swallowed here, so atomic() would commit
                # unless the rollback is requested explicitly.
                transaction.set_rollback(True)
                return JSONResponse(
                    {"error": "resource data could not be saved"},
                    status=500,
                    reason=e)
Beispiel #6
0
    def put(self, request, resourceid):
        """
        Update the resource ``resourceid`` from the JSON-LD document in the
        request body, reading the document with ``use_ids=True``.

        For every resource the reader produces, tiles that exist on the stored
        resource but are absent from the incoming one are deleted, and the
        resource is saved, in one transaction per resource.

        Responses:
            403 -- the user is not allowed to edit resources.
            400 -- the reader reported errors; the submitted document is echoed
                   back and the error messages are passed as the reason.
            500 -- any other exception, including a mismatch between
                   ``resourceid`` and the id found in the document.
            otherwise -- the list of saved resources, re-fetched via ``self.get``.
        """
        # "indent" is optional: anything that cannot be read as an int means none.
        try:
            indent = int(request.POST.get('indent', None))
        except Exception:
            indent = None

        try:
            if not user_can_edit_resources(user=request.user):
                return JSONResponse(status=403)

            data = JSONDeserializer().deserialize(request.body)
            reader = JsonLdReader()
            reader.read_resource(data, use_ids=True)

            if reader.errors:
                # values() exists on both Python 2 and 3; itervalues() does not.
                messages = [error.message for error in reader.errors.values()]
                return JSONResponse(data,
                                    indent=indent,
                                    status=400,
                                    reason=messages)

            response = []
            for resource in reader.resources:
                if resourceid != str(resource.pk):
                    raise Exception(
                        'Resource id in the URI does not match the resource @id supplied in the document'
                    )
                old_resource = Resource.objects.get(pk=resource.pk)
                old_resource.load_tiles()
                old_tile_ids = {str(tile.pk) for tile in old_resource.tiles}
                new_tile_ids = {
                    str(tile.pk) for tile in resource.get_flattened_tiles()
                }
                # Tiles that are stored but missing from the incoming document.
                tileids_to_delete = old_tile_ids - new_tile_ids
                tiles_to_delete = models.TileModel.objects.filter(
                    pk__in=tileids_to_delete)
                # Delete and save together so a failed save keeps the old tiles.
                with transaction.atomic():
                    tiles_to_delete.delete()
                    resource.save(request=request)
                response.append(JSONDeserializer().deserialize(
                    self.get(request,
                             resource.resourceinstanceid).content))
            return JSONResponse(response, indent=indent)
        except Exception as e:
            return JSONResponse(status=500, reason=e)
Beispiel #7
0
    def test_find_branch_from_jsonld_2(self):
        """
        Variant of the ambiguous-jsonld case: the supplied json carries an additional node (P1i_identifies),
        which is meant to make it match a single branch of the ambiguous graph
        The graph is partially unique (the children of the root are not unique)

        """

        # NOTE(review): this literal looks mangled -- the "http://*****:*****@type" and
        # "http://*****:*****@id" runs fuse values with the following keys, so the
        # structure below is not a valid dict/list literal. Restore it from the upstream test.
        ambiguous_jsonld_graph = {
            "@id":
            "http://*****:*****@type": [
                "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation",
                "http://*****:*****@id":
                "http://*****:*****@type":
                "http://www.cidoc-crm.org/cidoc-crm/E41_Appellation",
                "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by": {
                    "@id":
                    "http://*****:*****@type":
                    "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation",
                    "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "Will",
                },
                "http://www.cidoc-crm.org/cidoc-crm/P1i_identifies": {
                    "@id":
                    "http://*****:*****@type":
                    "http://www.cidoc-crm.org/cidoc-crm/E41_Appellation",
                    "http://www.w3.org/1999/02/22-rdf-syntax-ns#value":
                    "Smith",
                },
            }],
        }

        graphtree = self.ambiguous_graph.get_tree()
        reader = JsonLdReader()
        # Search the root's children for the branch reached via P1_is_identified_by.
        branch = reader.findBranch(
            graphtree["children"],
            "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by",
            ambiguous_jsonld_graph[
                "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by"],
        )
        self.assertEqual(str(branch["node"].pk),
                         "3f40c4c0-9693-11e8-8a0f-14109fd34195")
Beispiel #8
0
    def test_find_unique_branch_from_jsonld(self):
        """
        Test that we can find the correct branch in the unique graph that matches the supplied json-ld
        The graph is partially unique (the children of the root are not unique)

        """

        # NOTE(review): this literal looks mangled -- the "http://*****:*****@type" and
        # "http://*****:*****@id" runs fuse values with the following keys, so the
        # structure below is not a valid dict/list literal. Restore it from the upstream test.
        jsonld_graph = {
            "@id":
            "http://*****:*****@type": [
                "http://*****:*****@id":
                "http://*****:*****@type": "http://www.cidoc-crm.org/cidoc-crm/E41_Appellation",
                "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by": {
                    "@id":
                    "http://*****:*****@type":
                    "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation",
                    "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "Will",
                },
                "http://www.cidoc-crm.org/cidoc-crm/P1i_identifies": {
                    "@id":
                    "http://*****:*****@type":
                    "http://www.cidoc-crm.org/cidoc-crm/E41_Appellation",
                    "http://www.w3.org/1999/02/22-rdf-syntax-ns#value":
                    "Smith",
                },
            },
        }
        graphtree = self.unique_graph.get_tree()
        reader = JsonLdReader()
        # Search the root's children for the branch reached via P1_is_identified_by.
        branch = reader.findBranch(
            graphtree["children"],
            "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by",
            jsonld_graph[
                "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by"],
        )
        self.assertEqual(str(branch["node"].pk),
                         "3e1e65dc-95e6-11e8-9de9-14109fd34195")
Beispiel #9
0
    def put(self, request, resourceid):
        """
        Replace the resource ``resourceid`` with the JSON-LD document in the
        request body: delete the existing instance (if any), then read and save
        the new one, all inside a single transaction.

        Responses:
            403 -- the user is not allowed to edit resources.
            400 -- the reader reported errors; the submitted document is echoed
                   back and the error messages are passed as the reason.
            404 -- a ResourceInstance lookup failed while reading or saving.
            201 -- the list of saved resources, each re-fetched via ``self.get``.

        Any other exception propagates to the caller (and rolls the transaction
        back). The 400 and 404 responses also mark the transaction for rollback,
        so a failed PUT does not commit the delete of the existing resource.
        """
        # NOTE(review): stock Django requests have no ``PUT`` attribute; unless
        # something adds one, this always falls through to ``indent = None``.
        try:
            indent = int(request.PUT.get('indent', None))
        except Exception:
            indent = None

        if not user_can_edit_resources(user=request.user):
            # A permission failure is 403 Forbidden, not a server fault (500).
            return JSONResponse(status=403)

        with transaction.atomic():
            try:
                # DELETE
                resource_instance = Resource.objects.get(pk=resourceid)
                resource_instance.delete()
            except models.ResourceInstance.DoesNotExist:
                # Nothing to replace: the PUT simply creates the resource.
                pass

            try:
                # POST
                data = JSONDeserializer().deserialize(request.body)
                reader = JsonLdReader()
                reader.read_resource(data, resourceid=resourceid)
                if reader.errors:
                    # Returning normally from inside atomic() would commit the
                    # delete above; force a rollback instead.
                    transaction.set_rollback(True)
                    # values() exists on both Python 2 and 3.
                    messages = [error.message
                                for error in reader.errors.values()]
                    return JSONResponse(data,
                                        indent=indent,
                                        status=400,
                                        reason=messages)

                response = []
                for resource in reader.resources:
                    with transaction.atomic():
                        resource.save(request=request)
                    response.append(JSONDeserializer().deserialize(
                        self.get(request,
                                 resource.resourceinstanceid).content))
                return JSONResponse(response,
                                    indent=indent,
                                    status=201)
            except models.ResourceInstance.DoesNotExist:
                transaction.set_rollback(True)
                return JSONResponse(status=404)
Beispiel #10
0
    def test_cant_find_branch_from_ambiguous_jsonld(self):
        """
        Test that findBranch raises AmbiguousGraphException when the supplied
        json-ld could match more than one branch of the ambiguous graph
        The graph is partially unique (the children of the root are not unique)

        """

        # NOTE(review): this literal looks mangled -- the "http://*****:*****@type" and
        # "http://*****:*****@id" runs fuse values with the following keys, so the
        # structure below is not a valid dict/list literal. Restore it from the upstream test.
        ambiguous_jsonld_graph = {
            "@id":
            "http://*****:*****@type": [
                "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation",
                "http://*****:*****@id":
                "http://*****:*****@type":
                "http://www.cidoc-crm.org/cidoc-crm/E41_Appellation",
                "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by": {
                    "@id":
                    "http://*****:*****@type":
                    "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation",
                    "http://www.w3.org/1999/02/22-rdf-syntax-ns#value":
                    "Will - Ambiguous",
                },
            }],
        }

        graphtree = self.ambiguous_graph.get_tree()
        reader = JsonLdReader()
        # Only the raised exception matters; `cm` and `branch` are not used afterwards.
        with self.assertRaises(reader.AmbiguousGraphException) as cm:
            branch = reader.findBranch(
                graphtree["children"],
                "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by",
                ambiguous_jsonld_graph[
                    "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by"],
            )
Beispiel #11
0
    def test_find_other_unique_branch_from_jsonld(self):
        """
        Test that we can find the correct branch in the unique graph that matches the supplied json-ld,
        here a P1_is_identified_by node typed D21_Person_Name
        The graph is partially unique (the children of the root are not unique)

        """

        # NOTE(review): this literal looks mangled -- the "http://*****:*****@type" and
        # "http://*****:*****@id" runs fuse values with the following keys, so the
        # structure below is not a valid dict/list literal. Restore it from the upstream test.
        jsonld_graph = {
            "@id":
            "http://*****:*****@type": [
                "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation",
                "http://*****:*****@id":
                "http://*****:*****@type":
                "http://www.cidoc-crm.org/cidoc-crm/E41_Appellation",
                "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by": {
                    "@id":
                    "http://*****:*****@type":
                    "http://www.ics.forth.gr/isl/CRMdig/D21_Person_Name",
                    "http://www.w3.org/1999/02/22-rdf-syntax-ns#value":
                    "The Shadow",
                },
            }],
        }

        graphtree = self.unique_graph.get_tree()
        reader = JsonLdReader()
        # Search the root's children for the branch reached via P1_is_identified_by.
        branch = reader.findBranch(
            graphtree["children"],
            "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by",
            jsonld_graph[
                "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by"],
        )
        self.assertEqual(str(branch["node"].pk),
                         "91679e1e-95e6-11e8-a166-14109fd34195")
Beispiel #12
0
    def test_cant_find_branch_from_jsonld(self):
        """
        Test that findBranch raises DataDoesNotMatchGraphException when no branch in the
        unique graph matches the supplied json-ld (the nested "@type" is deliberately wrong)
        The graph is partially unique (the children of the root are not unique)

        """

        # NOTE(review): this literal looks mangled -- the "http://*****:*****@type" and
        # "http://*****:*****@id" runs fuse values with the following keys, so the
        # structure below is not a valid dict/list literal. Restore it from the upstream test.
        incorrect_jsonld_graph = {
            "@id":
            "http://*****:*****@type": [
                "http://www.cidoc-crm.org/cidoc-crm/E82_Actor_Appellation",
                "http://*****:*****@id":
                "http://*****:*****@type":
                "http://www.cidoc-crm.org/cidoc-crm/E41_Appellation",
                "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by": {
                    "@id":
                    "http://*****:*****@type":
                    "---THIS TYPE IS INCORRECT AND SHOULN'T MATCH---",
                    "http://www.w3.org/1999/02/22-rdf-syntax-ns#value":
                    "The Shadow",
                },
            }],
        }

        graphtree = self.unique_graph.get_tree()
        reader = JsonLdReader()
        # Only the raised exception matters; `cm` and `branch` are not used afterwards.
        with self.assertRaises(reader.DataDoesNotMatchGraphException) as cm:
            branch = reader.findBranch(
                graphtree["children"],
                "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by",
                incorrect_jsonld_graph[
                    "http://www.cidoc-crm.org/cidoc-crm/P1_is_identified_by"],
            )
Beispiel #13
0
    def test_cant_find_branch_from_complex_ambigious_jsonld(self):
        """
        Supply a complex jsonld structure that matches more than one branch in the
        phase type assignment graph (it's ambiguous) and check that findBranch raises AmbiguousGraphException

        """

        # NOTE(review): this literal looks mangled -- the "http://*****:*****@type" and
        # "http://*****:*****@id" runs fuse values with the following keys, so the
        # structure below is not a valid dict/list literal. Restore it from the upstream test.
        complex_jsonld_graph = {
            "@id":
            "http://*****:*****@type": [
                "http://www.cidoc-crm.org/cidoc-crm/E12_Production",
                "http://*****:*****@id":
                "http://*****:*****@type":
                "http://www.cidoc-crm.org/cidoc-crm/E17_Type_Assignment",
                "http://www.cidoc-crm.org/cidoc-crm/P42_assigned": [
                    {
                        "@id":
                        "http://*****:*****@type":
                        "http://www.cidoc-crm.org/cidoc-crm/E55_Type",
                        "http://www.w3.org/1999/02/22-rdf-syntax-ns#value":
                        "174e9486-0663-4c9d-ab78-c7e441720c26",
                    },
                    {
                        "@id":
                        "http://*****:*****@type": "http://www.cidoc-crm.org/cidoc-crm/E55_Type",
                        "http://www.w3.org/1999/02/22-rdf-syntax-ns#value":
                        "None",
                    },
                ],
                "http://www.cidoc-crm.org/cidoc-crm/P4_has_time-span": {
                    "@id":
                    "http://*****:*****@type":
                    "http://www.cidoc-crm.org/cidoc-crm/E52_Time-Span",
                    "http://www.cidoc-crm.org/cidoc-crm/P78_is_identified_by":
                    [
                        {
                            "@id":
                            "http://*****:*****@type":
                            "http://www.cidoc-crm.org/cidoc-crm/E49_Time_Appellation",
                            "http://www.w3.org/1999/02/22-rdf-syntax-ns#value":
                            "2018-08-06",
                        },
                        {
                            "@id":
                            "http://*****:*****@type":
                            "http://www.cidoc-crm.org/cidoc-crm/E49_Time_Appellation",
                            "http://www.w3.org/1999/02/22-rdf-syntax-ns#value":
                            "2018-09-20",
                        },
                    ],
                },
            }],
        }

        graphtree = self.phase_type_assignment_graph.get_tree()
        reader = JsonLdReader()
        # Only the raised exception matters; `cm` and `branch` are not used afterwards.
        with self.assertRaises(reader.AmbiguousGraphException) as cm:
            branch = reader.findBranch(
                graphtree["children"],
                "http://www.cidoc-crm.org/cidoc-crm/P41i_was_classified_by",
                complex_jsonld_graph[
                    "http://www.cidoc-crm.org/cidoc-crm/P41i_was_classified_by"],
            )
Beispiel #14
0
    def test_find_branch_from_complex_jsonld(self):
        """
        Given a more complicated json structure find the matching branch in the
        phase type assignment graph

        """

        # NOTE(review): this literal looks mangled -- the "http://*****:*****@type" and
        # "http://*****:*****@id" runs fuse values with the following keys, so the
        # structure below is not a valid dict/list literal. Restore it from the upstream test.
        complex_jsonld_graph = {
            "@id":
            "http://*****:*****@type": [
                "http://www.cidoc-crm.org/cidoc-crm/E12_Production",
                "http://*****:*****@id":
                "http://*****:*****@type":
                "http://www.cidoc-crm.org/cidoc-crm/E17_Type_Assignment",
                "http://www.cidoc-crm.org/cidoc-crm/P42_assigned": {
                    "@id":
                    "http://*****:*****@type": "http://www.cidoc-crm.org/cidoc-crm/E55_Type",
                    "http://www.w3.org/1999/02/22-rdf-syntax-ns#value": "None",
                },
                "http://www.cidoc-crm.org/cidoc-crm/P2_has_type": {
                    "@id":
                    "http://*****:*****@type":
                    "http://www.cidoc-crm.org/cidoc-crm/E55_Type",
                    "http://www.w3.org/1999/02/22-rdf-syntax-ns#value":
                    "51cbfba6-34ee-4fbd-8b6e-10ef73fd4083",
                },
                "http://www.cidoc-crm.org/cidoc-crm/P4_has_time-span": {
                    "@id":
                    "http://*****:*****@type":
                    "http://www.cidoc-crm.org/cidoc-crm/E52_Time-Span",
                    "http://www.cidoc-crm.org/cidoc-crm/P78_is_identified_by":
                    [
                        {
                            "@id":
                            "http://*****:*****@type":
                            "http://www.cidoc-crm.org/cidoc-crm/E49_Time_Appellation",
                            "http://www.w3.org/1999/02/22-rdf-syntax-ns#value":
                            "2018-08-05",
                        },
                        {
                            "@id":
                            "http://*****:*****@type":
                            "http://www.cidoc-crm.org/cidoc-crm/E49_Time_Appellation",
                            "http://www.w3.org/1999/02/22-rdf-syntax-ns#value":
                            "2018-08-06",
                        },
                    ],
                },
            }],
        }

        graphtree = self.phase_type_assignment_graph.get_tree()
        reader = JsonLdReader()
        # Search the root's children for the branch reached via P41i_was_classified_by.
        branch = reader.findBranch(
            graphtree["children"],
            "http://www.cidoc-crm.org/cidoc-crm/P41i_was_classified_by",
            complex_jsonld_graph[
                "http://www.cidoc-crm.org/cidoc-crm/P41i_was_classified_by"],
        )
        self.assertEqual(str(branch["node"].pk),
                         "049fc0c9-fa36-11e6-9e3e-026d961c88e6")
Beispiel #15
0
class Command(BaseCommand):
    """
    Command for importing JSON-LD data into Arches
    """

    def add_arguments(self, parser):
        """
        Register the command-line options of the JSON-LD loader on ``parser``.

        The options are declared as a table of (flags, keyword arguments) and
        registered in that order, so the generated help keeps the same layout.
        """
        option_table = (
            (
                ("-s", "--source"),
                {"default": "data/", "action": "store", "dest": "source", "help": "the directory in which the data files are to be found"},
            ),
            (
                ("-ow", "--overwrite"),
                {
                    "default": "ignore",
                    "action": "store",
                    "dest": "force",
                    "help": "if overwrite, overwrite records that exist; if ignore, then skip; if error, then halt",
                },
            ),
            (
                ("--toobig",),
                {"default": 0, "type": int, "action": "store", "dest": "toobig", "help": "Do not attempt to load records > n kb"},
            ),
            (
                ("-m", "--model"),
                {"default": "", "action": "store", "dest": "model", "help": "the name of the model path to load (eg auction_of_lot)"},
            ),
            (
                ("-b", "--block"),
                {
                    "default": "",
                    "action": "store",
                    "dest": "block",
                    "help": "the name of the block in the model path to load (eg 00), or slice in the form this,total (eg 1,5)",
                },
            ),
            (
                ("--max",),
                {"default": -1, "type": int, "action": "store", "dest": "max", "help": "Maximum number of records to load per model"},
            ),
            (
                ("--fast",),
                {"default": 0, "action": "store", "type": int, "dest": "fast", "help": "Use bulk_save to store n records at a time"},
            ),
            (
                ("-q", "--quiet"),
                {"default": False, "action": "store_true", "dest": "quiet", "help": "Don't announce every record"},
            ),
            (
                ("--skip",),
                {"default": -1, "type": int, "action": "store", "dest": "skip", "help": "Number of records to skip before starting to load"},
            ),
            (
                ("--suffix",),
                {"default": "json", "action": "store", "dest": "suffix", "help": "file suffix to load if not .json"},
            ),
            (
                ("--ignore-errors",),
                {"default": False, "action": "store_true", "dest": "ignore_errors", "help": "Log but do not terminate on errors"},
            ),
            (
                ("--strip-issearchable",),
                {
                    "default": False,
                    "action": "store_true",
                    "dest": "strip_search",
                    "help": "If a node is set to not be exposed to advanced search, then don't even index it",
                },
            ),
        )

        for flags, settings_for_option in option_table:
            parser.add_argument(*flags, **settings_for_option)

    def handle(self, *args, **options):
        """
        Announce the effective options, then run the load.

        Refuses to continue (prints an error and returns) when
        ``strip_search`` is requested without ``fast``; otherwise resets
        ``self.resources`` and calls ``self.load_resources(options)``.
        """
        print("Starting JSON-LD load")

        model_name = options["model"]
        if model_name:
            print(f"Only loading {model_name}")

        block_name = options["block"]
        if block_name:
            print(f"Only loading {block_name}")

        if options["force"] == "overwrite":
            print("Overwriting existing records")

        size_limit = options["toobig"]
        if size_limit:
            print(f"Not loading records > {size_limit}kb")

        if options["quiet"]:
            print("Only announcing timing data")

        # "fast" is only consulted when stripping was actually requested.
        if options["strip_search"]:
            if not options["fast"]:
                print("ERROR: stripping fields not exposed to advanced search only works in fast mode")
                return

        self.resources = []
        self.load_resources(options)

    def load_resources(self, options):
        """
        Walk ``<source>/<model>/<block>/`` and import every matching record file.

        Reads these keys from ``options``: ``source``, ``model``, ``block``,
        ``suffix``, ``max``, ``skip``, ``toobig``, ``quiet``, ``fast``,
        ``force``, ``strip_search`` and ``ignore_errors``.

        For each model directory the graph id is taken from ``graph_uuid_map``
        or, failing that, from the graph whose slug equals the directory name;
        models without a graph are skipped. Each record is imported through
        ``fast_import_resource`` (when ``fast`` is set) or ``import_resource``.
        Import failures are printed and re-raised unless ``ignore_errors`` is set.
        Progress is printed every 100 records seen and a total at the end.
        """

        self.reader = JsonLdReader()
        self.jss = JSONSerializer()
        source = options["source"]
        if options["model"]:
            models = [options["model"]]
        else:
            # Directory names starting with "_" or "." are not models.
            models = sorted(m for m in os.listdir(source) if m[0] not in ["_", "."])
        print(f"Found possible models: {models}")

        # This is boilerplate for any use of get_documents_to_index()
        # Need to add issearchable for strip_search option
        # Only calculate it once per load
        self.datatype_factory = DataTypeFactory()
        dt_instance_hash = {}
        self.node_info = {
            str(nodeid): {
                "datatype": dt_instance_hash.setdefault(datatype, self.datatype_factory.get_instance(datatype)),
                "issearchable": srch,
            }
            for nodeid, datatype, srch in archesmodels.Node.objects.values_list("nodeid", "datatype", "issearchable")
        }
        self.node_datatypes = {str(nodeid): datatype for nodeid, datatype in archesmodels.Node.objects.values_list("nodeid", "datatype")}

        start = time.time()
        seen = 0
        loaded = 0

        for m in models:
            print(f"Loading {m}")
            graphid = graph_uuid_map.get(m, None)
            if not graphid:
                # Check slug
                try:
                    graphid = archesmodels.GraphModel.objects.get(slug=m).pk
                except Exception:
                    print(f"Couldn't find a model definition for {m}; skipping")
                    continue
            # We have a good model, so build the pre-processed tree once
            self.reader.graphtree = self.reader.process_graph(graphid)
            block = options["block"]
            if block and "," not in block:
                blocks = [block]
            else:
                blocks = sorted(b for b in os.listdir(f"{source}/{m}") if b[0] not in ["_", "."])
                if "," in block:
                    # {slice},{max-slices}
                    (cslice, mslice) = block.split(",")
                    cslice = int(cslice) - 1
                    mslice = int(mslice)
                    blocks = blocks[cslice::mslice]

            loaded_model = 0

            try:
                for b in blocks:
                    files = sorted(os.listdir(f"{source}/{m}/{b}"))
                    for f in files:
                        if not f.endswith(options["suffix"]):
                            continue
                        elif f.startswith(".") or f.startswith("_"):
                            continue

                        if options["max"] > 0 and loaded_model >= options["max"]:
                            raise StopIteration()
                        seen += 1
                        if seen <= options["skip"]:
                            # Do it this way to keep the counts correct
                            continue
                        fn = f"{source}/{m}/{b}/{f}"
                        # Check file size of record
                        if not options["quiet"]:
                            print(f"About to import {fn}")
                        if options["toobig"]:
                            # NOTE(review): getsize() returns bytes while the option's help
                            # text speaks of kb -- confirm which unit is intended.
                            sz = os.path.getsize(fn)
                            if sz > options["toobig"]:
                                if not options["quiet"]:
                                    print(f" ... Skipping due to size:  {sz} > {options['toobig']}")
                                continue
                        uu = f.replace(f".{options['suffix']}", "")
                        # The context manager closes the file even if read() fails.
                        with open(fn) as fh:
                            data = fh.read()
                        # FIXME Timezone / DateTime Workaround
                        # FIXME The following line should be removed when #5669 / #6346 are closed
                        data = data.replace("T00:00:00Z", "")
                        jsdata = json.loads(data)
                        jsdata = fix_js_data(data, jsdata, m)
                        if jsdata:
                            if len(uu) != 36 or uu[8] != "-":
                                # extract uuid from data if filename is not a UUID
                                # (done only once jsdata is known to be usable)
                                uu = jsdata["id"][-36:]
                            try:
                                if options["fast"]:
                                    count = self.fast_import_resource(
                                        uu,
                                        graphid,
                                        jsdata,
                                        n=options["fast"],
                                        reload=options["force"],
                                        quiet=options["quiet"],
                                        strip_search=options["strip_search"],
                                    )
                                else:
                                    count = self.import_resource(uu, graphid, jsdata, reload=options["force"], quiet=options["quiet"])
                                loaded += count
                                loaded_model += count
                            except Exception as e:
                                print(f"*** Failed to load {fn}:\n     {e}\n")
                                if not options["ignore_errors"]:
                                    raise
                        else:
                            print(" ... skipped due to bad data :(")
                        if not seen % 100:
                            print(f" ... seen {seen} / loaded {loaded} in {time.time()-start}")
            except StopIteration:
                # NOTE(review): this ends the loop over ALL models once one model
                # reaches --max, although the option is described as per model -- confirm.
                break
        if options["fast"] and self.resources:
            # Flush whatever is still buffered from fast mode.
            self.save_resources()
            self.index_resources(options["strip_search"])
            self.resources = []
        print(f"Total Time: seen {seen} / loaded {loaded} in {time.time()-start} seconds")

    def fast_import_resource(self, resourceid, graphid, data, n=1000, reload="ignore", quiet=True, strip_search=False):
        """
        Read one JSON-LD record and queue its resources for a batched save.

        When a resource with ``resourceid`` already exists, ``reload`` decides
        what happens: "ignore" leaves it in place and returns 0, "error" raises
        FileExistsError, and any other value deletes the stored instance so the
        record can be read again.

        The resources produced by the reader are appended to ``self.resources``.
        Once that queue holds at least ``n`` entries it is saved, indexed
        (``strip_search`` is passed through to the indexer) and emptied.

        Returns 1 when the record was read and queued, 0 when it was skipped.
        """
        try:
            existing = Resource.objects.get(pk=resourceid)
            if reload == "ignore":
                if not quiet:
                    print(f" ... already loaded")
                return 0
            if reload == "error":
                print(f"*** Record exists for {resourceid}, and -ow is error")
                raise FileExistsError(resourceid)
            # Every other mode overwrites, so remove the stored instance first
            existing.delete()
        except archesmodels.ResourceInstance.DoesNotExist:
            # Nothing is stored under this id yet; carry on with a fresh import
            pass

        try:
            self.reader.read_resource(data, resourceid=resourceid, graphid=graphid)
            self.resources.extend(self.reader.resources)
        except BaseException:
            # Report which record failed, then let the caller decide what to do
            print(f"Exception raised while reading {resourceid}...")
            raise

        queue_is_full = len(self.resources) >= n
        if queue_is_full:
            self.save_resources()
            self.index_resources(strip_search)
            self.resources = []
        return 1

    def import_resource(self, resourceid, graphid, data, reload="ignore", quiet=False):
        """
        Read one JSON-LD record and save its resources inside a single transaction.

        When a resource with ``resourceid`` already exists, ``reload`` decides
        what happens: "ignore" leaves it in place and returns 0, "error" raises
        FileExistsError, and any other value deletes the stored instance before
        the record is read again.

        Returns 1 when the record was read and saved. Returns 0 when it was
        skipped, or when reading/saving raised ``ResourceInstance.DoesNotExist``
        (reported as a missing model). Any other exception propagates.
        """
        with transaction.atomic():
            try:
                existing = Resource.objects.get(pk=resourceid)
                if reload == "ignore":
                    if not quiet:
                        print(f" ... already loaded")
                    return 0
                if reload == "error":
                    print(f"*** Record exists for {resourceid}, and -ow is error")
                    raise FileExistsError(resourceid)
                # Every other mode overwrites, so remove the stored instance first
                existing.delete()
            except archesmodels.ResourceInstance.DoesNotExist:
                # Nothing is stored under this id yet; carry on with a fresh import
                pass

            try:
                self.reader.read_resource(data, resourceid=resourceid, graphid=graphid)
                for parsed in self.reader.resources:
                    parsed.save(request=None)
            except archesmodels.ResourceInstance.DoesNotExist:
                print(f"*** Could not find model: {graphid}")
                return 0
        return 1

    def save_resources(self):
        """
        Bulk-save every queued resource in ``self.resources`` together with its tiles.

        Each resource gets its flattened tiles attached, resources and tiles are
        written with ``bulk_create``, then every tile's datatype hook
        ``pre_tile_save`` is called once per node id in the tile data, and
        finally a "create" edit is recorded for each resource.
        """
        queued = self.resources
        all_tiles = []
        for res in queued:
            res.tiles = res.get_flattened_tiles()
            all_tiles.extend(res.tiles)

        Resource.objects.bulk_create(queued)
        TileModel.objects.bulk_create(all_tiles)

        for tile in all_tiles:
            for nodeid in tile.data:
                # node_info maps node id -> {"datatype": <datatype instance>, ...}
                self.node_info[nodeid]["datatype"].pre_tile_save(tile, nodeid)

        for res in queued:
            res.save_edit(edit_type="create")

    def index_resources(self, strip_search=False):
        """
        Build search documents for every queued resource and bulk-index them.

        With ``strip_search`` set, documents come from
        ``monkey_get_documents_to_index`` using ``self.node_info``; otherwise
        each resource's own ``get_documents_to_index`` is used without fetching
        tiles. Resource documents go to the "resources" index and their terms to
        the "terms" index, each submitted in one bulk call.
        """
        engine = SearchEngineInstance
        resource_items = []
        term_items = []

        for res in self.resources:
            if strip_search:
                doc, terms = monkey_get_documents_to_index(res, node_info=self.node_info)
            else:
                doc, terms = res.get_documents_to_index(
                    fetchTiles=False,
                    datatype_factory=self.datatype_factory,
                    node_datatypes=self.node_datatypes,
                )
            resource_items.append(engine.create_bulk_item(index="resources", id=doc["resourceinstanceid"], data=doc))
            term_items.extend(engine.create_bulk_item(index="terms", id=entry["_id"], data=entry["_source"]) for entry in terms)

        engine.bulk_index(resource_items)
        engine.bulk_index(term_items)
Beispiel #16
0
    def load_resources(self, options):
        """
        Import JSON-LD records found under ``{source}/{model}/{block}/{file}``.

        ``options`` is a mapping of command options. The keys read here are:
        ``source``, ``model``, ``block``, ``suffix``, ``max``, ``skip``,
        ``quiet``, ``toobig``, ``fast``, ``force``, ``strip_search`` and
        ``ignore_errors``.

        * ``model``: load only this model directory; otherwise every directory
          in ``source`` whose name does not start with "_" or ".".
        * ``block``: either a single block directory name, or
          "{slice},{max-slices}" to take every ``max-slices``-th block starting
          at the 1-based ``slice``; when empty all blocks are loaded.
        * ``max``: when greater than 0, loading stops once that many records
          have been loaded for the current model.
        * ``skip``: the first ``skip`` matching files are counted but not loaded.
        * ``toobig``: when set, files larger than this many bytes are skipped.
        * ``fast``: when set, records are queued via ``fast_import_resource``
          with this value as the batch size; otherwise each record is saved
          immediately via ``import_resource``.
        * ``ignore_errors``: when false, the first failing record aborts the load.

        Progress and totals are printed; nothing is returned.
        """
        self.reader = JsonLdReader()
        self.jss = JSONSerializer()
        source = options["source"]
        quiet = options["quiet"]
        if options["model"]:
            models = [options["model"]]
        else:
            models = sorted(m for m in os.listdir(source) if m[0] not in ("_", "."))
        print(f"Found possible models: {models}")

        # This is boilerplate for any use of get_documents_to_index()
        # Need to add issearchable for strip_search option
        # Only calculate it once per load: one pass over the Node table fills
        # both lookups and builds each datatype instance a single time.
        self.datatype_factory = DataTypeFactory()
        dt_instance_hash = {}
        self.node_info = {}
        self.node_datatypes = {}
        for nodeid, datatype, srch in archesmodels.Node.objects.values_list("nodeid", "datatype", "issearchable"):
            if datatype not in dt_instance_hash:
                dt_instance_hash[datatype] = self.datatype_factory.get_instance(datatype)
            key = str(nodeid)
            self.node_info[key] = {"datatype": dt_instance_hash[datatype], "issearchable": srch}
            self.node_datatypes[key] = datatype

        start = time.time()
        seen = 0
        loaded = 0

        for m in models:
            print(f"Loading {m}")
            graphid = graph_uuid_map.get(m, None)
            if not graphid:
                # Check slug
                try:
                    graphid = archesmodels.GraphModel.objects.get(slug=m).pk
                except Exception:
                    # Best effort: any lookup failure just skips this model, but
                    # KeyboardInterrupt / SystemExit are no longer swallowed.
                    print(f"Couldn't find a model definition for {m}; skipping")
                    continue
            # We have a good model, so build the pre-processed tree once
            self.reader.graphtree = self.reader.process_graph(graphid)
            block = options["block"]
            if block and "," not in block:
                blocks = [block]
            else:
                blocks = sorted(b for b in os.listdir(f"{source}/{m}") if b[0] not in ("_", "."))
                if block:
                    # Reaching here with a non-empty block means it contains a
                    # comma: {slice},{max-slices}. An unset block (None or "")
                    # keeps every block instead of failing on the "in" test.
                    (cslice, mslice) = block.split(",")
                    cslice = int(cslice) - 1
                    mslice = int(mslice)
                    blocks = blocks[cslice::mslice]

            loaded_model = 0

            try:
                for b in blocks:
                    files = sorted(os.listdir(f"{source}/{m}/{b}"))
                    for f in files:
                        if not f.endswith(options["suffix"]):
                            continue
                        if f.startswith((".", "_")):
                            continue

                        if options["max"] > 0 and loaded_model >= options["max"]:
                            # Used only to unwind both file and block loops at once
                            raise StopIteration()
                        seen += 1
                        if seen <= options["skip"]:
                            # Do it this way to keep the counts correct
                            continue
                        fn = f"{source}/{m}/{b}/{f}"
                        # Check file size of record
                        if not quiet:
                            print(f"About to import {fn}")
                        if options["toobig"]:
                            sz = os.path.getsize(fn)
                            if sz > options["toobig"]:
                                if not quiet:
                                    print(f" ... Skipping due to size:  {sz} > {options['toobig']}")
                                continue
                        uu = f.replace(f".{options['suffix']}", "")
                        # Context manager so the handle is closed even if read() fails
                        with open(fn) as fh:
                            data = fh.read()
                        # FIXME Timezone / DateTime Workaround
                        # FIXME The following line should be removed when #5669 / #6346 are closed
                        data = data.replace("T00:00:00Z", "")
                        jsdata = json.loads(data)
                        jsdata = fix_js_data(data, jsdata, m)
                        if jsdata:
                            if len(uu) != 36 or uu[8] != "-":
                                # extract uuid from data if filename is not a UUID
                                # (done only once the data is known to be usable)
                                uu = jsdata["id"][-36:]
                            try:
                                if options["fast"]:
                                    l = self.fast_import_resource(
                                        uu,
                                        graphid,
                                        jsdata,
                                        n=options["fast"],
                                        reload=options["force"],
                                        quiet=quiet,
                                        strip_search=options["strip_search"],
                                    )
                                else:
                                    l = self.import_resource(uu, graphid, jsdata, reload=options["force"], quiet=quiet)
                                loaded += l
                                loaded_model += l
                            except Exception as e:
                                print(f"*** Failed to load {fn}:\n     {e}\n")
                                if not options["ignore_errors"]:
                                    raise
                        else:
                            print(" ... skipped due to bad data :(")
                        if not seen % 100:
                            print(f" ... seen {seen} / loaded {loaded} in {time.time()-start}")
            except StopIteration:
                # NOTE(review): this ends the whole run, not just the current
                # model, although loaded_model is reset per model - confirm intent.
                break
        if options["fast"] and self.resources:
            # Flush whatever is still queued below the batch size
            self.save_resources()
            self.index_resources(options["strip_search"])
            self.resources = []
        print(f"Total Time: seen {seen} / loaded {loaded} in {time.time()-start} seconds")