예제 #1
0
    def add_model(self, desc, name=None):
        """
        Build a node model class from the given description dict and
        register it into self.models under its normalized class name.

        Returns the created model class.
        """
        # Resolve the class name and normalize it to be PEP compliant
        klass_name = to_class_name(gn(desc, "name", name).lower())

        # Base attributes shared by every generated model
        klass_fields = {
            # Additional informations
            "_description": gn(desc, "help_text", gn(desc, "description")),
            "_topic": gn(desc, "scope"),
            # Default fields
            "_author": models.IntArrayProperty(null=True, help_text=u'People that edited this entity.', verbose_name=u'author'),
            "_status": models.IntegerProperty(null=True, help_text=u'', verbose_name=u'status'),
            "_is_composite": gn(desc, "is_composite", False),
        }

        # Meta-class options lifted from the description when present
        klass_options = dict(
            (opt, desc[opt])
            for opt in ("verbose_name", "verbose_name_plural")
            if opt in desc
        )

        # Collect the declared fields, keeping their declaration order
        ordered_fields = []
        for field_desc in gn(desc, 'fields', []):
            field_name, field_instance = self.get_model_field(field_desc, klass_name)
            # Skip fields that could not be resolved
            if field_name is None or field_instance is None:
                continue
            ordered_fields.append(field_name)
            klass_fields[field_name] = field_instance

        # Create the module-bound model class and register it
        model = create_node_model(klass_name, klass_fields,
                                  app_label=self.app_label,
                                  options=klass_options,
                                  module=self.module)
        self.models[klass_name] = model

        model.__fields_order__ = ordered_fields

        # Workaround: select_related breaks with neo4django virtual models
        # when _meta has no _relationships mapping
        if not hasattr(model._meta, '_relationships'):
            model._meta._relationships = {}

        return model
예제 #2
0
파일: jobs.py 프로젝트: ndldd/detective.io
def process_bulk_parsing_and_save_as_model(topic, files, start_time=None):
    """
    Job which parses uploaded content, validates and saves them as model
    """

    start_time               = start_time != None and start_time or time.time()
    entities                 = {}
    relations                = []
    errors                   = []
    id_mapping               = {}
    nb_lines                 = 0
    file_reading_progression = 0
    job                      = get_current_job()

    # Define Exceptions
    class Error (Exception):
        """
        Generic Custom Exception for this endpoint.
        Include the topic.
        """
        def __init__(self, **kwargs):
            """ set the topic and add all the parameters as attributes """
            self.topic = topic.title
            for key, value in kwargs.items():
                setattr(self, key, value)
        def __str__(self):
            return self.__dict__

    class WarningCastingValueFail     (Error): pass
    class WarningValidationError      (Error): pass
    class WarningKeyUnknown           (Error): pass
    class WarningInformationIsMissing (Error): pass
    class AttributeDoesntExist        (Error): pass
    class WrongCSVSyntax              (Error): pass
    class ColumnUnknow                (Error): pass
    class ModelDoesntExist            (Error): pass
    class RelationDoesntExist         (Error): pass

    try:
        assert type(files) in (tuple, list), type(files)
        assert len(files) > 0, "You need to upload at least one file."
        assert type(files[0]) in (tuple, list)
        assert len(files[0]) == 2

        # retrieve all models in current topic
        all_models = dict((model.__name__, model) for model in topic.get_models())
        # iterate over all files and dissociate entities .csv from relations .csv
        for file in files:
            if type(file) is tuple:
                file_name = file[0]
                file      = file[1]
            else:
                raise Exception()
            csv_reader = utils.open_csv(file)
            header     = csv_reader.next()
            assert len(header) > 1, "{file_name} header should have at least 2 columns"
            assert header[0].endswith("_id"), "{file_name} : First column should begin with a header like <model_name>_id. Actually {first_col}".format(file_name=file_name, first_col=header[0])
            if len(header) >=3 and header[0].endswith("_id") and header[2].endswith("_id"):
                # this is a relationship file
                relations.append((file_name, file))
            else:
                # this is an entities file
                model_name = utils.to_class_name(header[0].replace("_id", ""))
                if model_name in all_models.keys():
                    entities[model_name] = (file_name, file)
                else:
                    raise ModelDoesntExist(model=model_name, file=file_name, models_availables=all_models.keys())
            nb_lines += len(file) - 1 # -1 removes headers

        # first iterate over entities
        logger.debug("BulkUpload: creating entities")
        for entity, (file_name, file) in entities.items():
            csv_reader = utils.open_csv(file)
            header     = csv_reader.next()
            # must check that all columns map to an existing model field
            fields       = utils.get_model_fields(all_models[entity])
            fields_types = {}
            for field in fields:
                fields_types[field['name']] = field['type']
            field_names = [field['name'] for field in fields]
            columns        = []
            for column in header[1:]:
                column = utils.to_underscores(column)
                if not column in field_names and not column.endswith("__sources__"):
                    raise ColumnUnknow(file=file_name, column=column, model=entity, attributes_available=field_names)
                    break
                if column.endswith("__sources__"):
                    column_type = "__sources__"
                    column = column[:-len("__sources__")]
                    if not column in field_names:
                        raise ColumnUnknow(file=file_name, column=column, model=entity, attributes_available=field_names)
                        break
                else:
                    column_type = fields_types.get(column, None)
                columns.append((column, column_type))
            else:
                # here, we know that all columns are valid
                for row in csv_reader:
                    data      = {}
                    sources   = {}
                    entity_id = row[0]
                    for i, (column, column_type) in enumerate(columns):
                        value = str(row[i+1]).decode('utf-8')
                        # cast value if needed
                        if value:
                            try:
                                if "Integer" in column_type:
                                    value = int(value)
                                # TODO: cast float
                                if "Date" in column_type:
                                    value = datetime.datetime(*map(int, re.split('[^\d]', value)[:3])).replace(tzinfo=utc)

                            except Exception as e:
                                e = WarningCastingValueFail(
                                    column_name = column,
                                    value       = value,
                                    type        = column_type,
                                    data        = data, model=entity,
                                    file        = file_name,
                                    line        = csv_reader.line_num,
                                    error       = str(e)
                                )
                                errors.append(e)
                                break
                            if column_type == "__sources__":
                                sources[column] = value
                            else:
                                data[column] = value
                    else:
                        # instanciate a model
                        try:
                            item = all_models[entity].objects.create(**data)
                            # map the object with the ID defined in the .csv
                            id_mapping[(entity, entity_id)] = item
                            # create sources
                            for sourced_field, reference in sources.items():
                                for ref in reference.split("||"):
                                    FieldSource.objects.create(individual=item.id, field=sourced_field, reference=ref)
                            # FIXME: job can be accessed somewhere else (i.e detective/topics/common/jobs.py:JobResource)
                            # Concurrent access are not secure here.
                            # For now we refresh the job just before saving it.
                            file_reading_progression += 1
                            if job:
                                job.refresh()
                                job.meta["file_reading_progression"] = (float(file_reading_progression) / float(nb_lines)) * 100
                                job.meta["file_reading"] = file_name
                                job.save()
                        except Exception as e:
                            errors.append(
                                WarningValidationError(
                                    data  = data,
                                    model = entity,
                                    file  = file_name,
                                    line  = csv_reader.line_num,
                                    error = str(e)
                                )
                            )

        inserted_relations = 0
        # then iterate over relations
        logger.debug("BulkUpload: creating relations")
        for file_name, file in relations:
            # create a csv reader
            csv_reader      = utils.open_csv(file)
            csv_header      = csv_reader.next()
            relation_name   = utils.to_underscores(csv_header[1])
            model_from      = utils.to_class_name(csv_header[0].replace("_id", ""))
            model_to        = utils.to_class_name(csv_header[2].replace("_id", ""))
            properties_name = csv_header[3:]
            # retrieve ModelProperties from related model
            ModelProperties = topic.get_rules().model(all_models[model_from]).field(relation_name).get("through")
            # check that the relation actually exists between the two objects
            try:
                getattr(all_models[model_from], relation_name)
            except Exception as e:
                raise RelationDoesntExist(
                    file             = file_name,
                    model_from       = model_from,
                    model_to         = model_to,
                    relation_name    = relation_name,
                    fields_available = [field['name'] for field in utils.iterate_model_fields(all_models[model_from])],
                    error            = str(e))
            for row in csv_reader:
                id_from    = row[0]
                id_to      = row[2]
                properties = [p.decode('utf-8') for p in row[3:]]
                if id_to and id_from:
                    try:
                        instance_from = id_mapping[(model_from, id_from)]
                        instance_to   = id_mapping[(model_to, id_to)]
                        getattr(instance_from, relation_name).add(instance_to)
                        # add properties if needed
                        if ModelProperties and properties_name and properties:
                            # save the relationship to create an id
                            instance_from.save()
                            # retrieve this id
                            relation_id = next(rel.id for rel in instance_from.node.relationships.outgoing() if rel.end.id == instance_to.id)
                            # properties of the relationship
                            relation_args = {
                                "_endnodes"     : [id_mapping[(model_from, id_from)].id, instance_to.id],
                                "_relationship" : relation_id,
                            }
                            # Pairwise the properties with their names 
                            relation_args.update(zip(properties_name, properties))
                            try:
                                ModelProperties.objects.create(**relation_args)
                            except TypeError as e:
                                errors.append(
                                    AttributeDoesntExist(
                                        file             = file_name,
                                        line             = csv_reader.line_num,
                                        model_from       = model_from,
                                        id_from          = id_from,
                                        model_to         = model_to,
                                        id_to            = id_to,
                                        relation_args    = relation_args,
                                        error            = str(e)
                                    )
                        )
                        # update the job
                        inserted_relations += 1
                        file_reading_progression += 1
                        if job:
                            job.refresh()
                            job.meta["file_reading_progression"] = (float(file_reading_progression) / float(nb_lines)) * 100
                            job.meta["file_reading"] = file_name
                            job.save()
                    except KeyError as e:
                        errors.append(
                            WarningKeyUnknown(
                                file             = file_name,
                                line             = csv_reader.line_num,
                                model_from       = model_from,
                                id_from          = id_from,
                                model_to         = model_to,
                                id_to            = id_to,
                                relation_name    = relation_name,
                                error            = str(e)
                            )
                        )
                    except Exception as e:
                        # Error unknown, we break the process to alert the user
                        raise Error(
                            file             = file_name,
                            line             = csv_reader.line_num,
                            model_from       = model_from,
                            id_from          = id_from,
                            model_to         = model_to,
                            id_to            = id_to,
                            relation_name    = relation_name,
                            error            = str(e))
                else:
                    # A key is missing (id_from or id_to) but we don't want to stop the parsing.
                    # Then we store the wrong line to return it to the user.
                    errors.append(
                        WarningInformationIsMissing(
                            file=file_name, row=row, line=csv_reader.line_num, id_to=id_to, id_from=id_from
                        )
                    )

        # Save everything
        saved = 0
        logger.debug("BulkUpload: saving %d objects" % (len(id_mapping)))
        if job:
            job.refresh()
            job.meta["objects_to_save"] = len(id_mapping)
            job.save()
        for item in id_mapping.values():
            item.save()
            saved += 1
            if job:
                job.refresh()
                job.meta["saving_progression"] = saved
                job.save()
        if job: job.refresh()
        if job and "track" in job.meta:
            from django.core.mail import send_mail
            user = User.objects.get(pk=job.meta["user"])
            send_mail("upload finished", "your upload just finished", settings.DEFAULT_FROM_EMAIL, (user.email,))
        return {
            'duration' : (time.time() - start_time),
            'inserted' : {
                'objects' : saved,
                'links'   : inserted_relations
            },
            "errors" : sorted([dict([(e.__class__.__name__, str(e.__dict__))]) for e in errors])
        }

    except Exception as e:
        import traceback
        logger.error(traceback.format_exc())
        if e.__dict__:
            message = str(e.__dict__)
        else:
            message = e.message
        return {
            "errors" : [{e.__class__.__name__ : message}]
        }
예제 #3
0
    def handle(self, *args, **options):
        """
        Parse an OWL ontology file and generate the matching models.py.

        args[0] must be the path to the ontology file. Each owl:Class
        element becomes a model description (name, parent class,
        properties, relationships); descriptions are topologically
        sorted by dependency and written out via self.print_models().

        Raises CommandError when no file is given, when the file cannot
        be parsed, or when a property uses an unsupported data type.
        """
        if not args:
            raise CommandError('Please specify path to ontology file.')

        # Gives the ontology URI. Only needed for documentation purposes
        ontologyURI = "http://www.semanticweb.org/nkb/ontologies/2013/6/impact-investment#"
        # Header lines of the generated models.py file
        headers = [
            "# -*- coding: utf-8 -*-",
            "# The ontology can be found in its entirety at %s" % ontologyURI,
            "from neo4django.db import models",
            "from neo4django.graph_auth.models import User",
            ""
        ]

        # Maps XSD data types to neo4django property class names
        correspondanceTypes = {
            "string" : "StringProperty",
            "anyURI" : "URLProperty",
            "int" : "IntegerProperty",
            "nonNegativeInteger" : "IntegerProperty",
            "nonPositiveInteger" : "IntegerProperty",
            "PositiveInteger" : "IntegerProperty",
            "NegativeInteger" : "IntegerProperty",
            # Looking forward the neo4django float support!
            # See also: https://github.com/scholrly/neo4django/issues/197
            "float" : "StringProperty",
            "integer" : "IntegerProperty",
            "dateTimeStamp" : "DateTimeProperty",
            "dateTime" : "DateTimeProperty",
            "boolean" : "BooleanProperty"
        }

        # The fully-qualified RDF attribute prefix, repeated below
        RDF_NS = "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}"

        try:
            # Parses the file with etree
            tree = etree.parse(args[0])
        # Narrowed from a bare "except:" so SystemExit/KeyboardInterrupt still propagate
        except Exception:
            raise CommandError('Unable to parse the given file.')

        self.root = tree.getroot()
        # Named model_descs (not "models") to avoid shadowing the models module
        model_descs = []

        # Finds all the Classes
        for ontologyClassElement in self.root.findall("owl:Class", namespaces):

            # Finds the URI of the class
            classURI = ontologyClassElement.attrib[RDF_NS + "about"]

            # Finds the name of the class
            className = to_class_name(classURI.split("#")[1])

            # By default, the class has no parent
            parentClass = "models.NodeModel"

            # Relationships and properties collected for this class
            relations = []
            properties = []

            # Class scope, escaped for inclusion in single-quoted strings
            scope = get(ontologyClassElement, "scope").replace("'", "\\'")
            # Class help text
            help_text = get(ontologyClassElement, "help_text").replace("'", "\\'")
            # Verbose names
            verbose_name = get(ontologyClassElement, "verbose_name").replace("'", "\\'")
            verbose_name_plural = get(ontologyClassElement, "verbose_name_plural").replace("'", "\\'")

            # Finds all the subClasses of the Class
            for subClassElement in ontologyClassElement.findall("rdfs:subClassOf", namespaces):

                # If the Class is actually an extension of another Class
                if RDF_NS + "resource" in subClassElement.attrib:

                    parentClassURI = subClassElement.attrib[RDF_NS + "resource"]
                    parentClass = to_class_name(parentClassURI.split("#")[1])

                else:

                    for restriction in subClassElement.findall("owl:Restriction", namespaces):

                        # If there is a relationship defined in the subclass
                        if restriction.find("owl:onClass", namespaces) is not None:

                            # Finds the relationship and its elements
                            # (destination Class and type)
                            relationClass    = restriction.find("owl:onClass", namespaces)
                            relation         = {}
                            relation["URI"]  = relationClass.attrib[RDF_NS + "resource"]
                            relation["name"] = to_class_name(relation["URI"].split("#")[1])

                            # Exception when the relation's destination is
                            # an individual from the same class
                            if relation["name"] == className:
                                relation["name"] = '"self"'
                            else:
                                relation["name"] = '"%s"' % relation["name"]

                            relationType     = restriction.find("owl:onProperty", namespaces)
                            relationTypeURI  = relationType.attrib[RDF_NS + "resource"]
                            relation["type"] = relationTypeURI.split("#")[1]

                            # Guesses the destination of the relation based on the name.
                            # Name should be "has_..."
                            if relation["type"].find('has') == 0:
                                relation["destination"] = pron(relation["type"][3:])

                                # Get the property's options
                                options = self.propOptions(relation["type"])

                                # Help text
                                relation["help_text"]    = get(options, "help_text").replace("'", "\\'")
                                # Verbose name
                                relation["verbose_name"] = get(options, "verbose_name")

                                # Adds the relationship to the array containing all relationships for the class only
                                # if the relation has a destination
                                if "destination" in relation:
                                    relations.append(relation)

                        # If there is a property defined in the subclass
                        elif restriction.find("owl:onDataRange", namespaces) is not None or restriction.find("owl:someValuesFrom", namespaces) is not None:
                            propertyTypeElement = restriction.find("owl:onProperty", namespaces)
                            propertyTypeURI     = propertyTypeElement.attrib[RDF_NS + "resource"]
                            propertyType        = propertyTypeURI.split("#")[1]

                            # The data type lives either in onDataRange or someValuesFrom
                            dataTypeElement = restriction.find("owl:onDataRange", namespaces)
                            if dataTypeElement is None:
                                dataTypeElement = restriction.find("owl:someValuesFrom", namespaces)

                            dataTypeURI = dataTypeElement.attrib[RDF_NS + "resource"]

                            t = dataTypeURI.split("#")[1]

                            if t in correspondanceTypes:
                                dataType = correspondanceTypes[t]
                                # Get the property's options
                                options = self.propOptions(propertyType)

                                prop = {
                                    "name" : propertyType,
                                    "type" : dataType,
                                    # Help text
                                    "help_text": get(options, "help_text").replace("'", "\\'"),
                                    # Verbose name
                                    "verbose_name": get(options, "verbose_name")
                                }

                                properties.append(prop)
                            else:
                                raise CommandError("Property '%s' of '%s' using unknown type: %s" % (propertyType, className, t) )

            model_descs.append({
                "className"          : className,
                "scope"              : scope,
                "help_text"          : help_text,
                "verbose_name"       : verbose_name,
                "verbose_name_plural": verbose_name_plural,
                "parentClass"        : parentClass,
                "properties"         : properties,
                "relations"          : relations,
                "dependencies"       : [parentClass]
            })

        # Topological sort of the model to avoid dependance missings
        model_descs = self.topolgical_sort(model_descs)
        # Output the models file
        self.print_models(model_descs, headers)
예제 #4
0
def process_parsing(topic, files):
    """
    Job which reads the uploaded files, validate and saves them as model
    """

    entities   = {}
    relations  = []
    errors     = []
    id_mapping = {}

    assert type(files) in (tuple, list)
    assert len(files) > 0
    assert type(files[0]) in (tuple, list)
    assert len(files[0]) == 2

    # Define Exceptions
    class Error (Exception):
        """
        Generic Custom Exception for this endpoint.
        Include the topic.
        """
        def __init__(self, **kwargs):
            """ set the topic and add all the parameters as attributes """
            self.topic = topic.title
            for key, value in kwargs.items():
                setattr(self, key, value)
        def __str__(self):
            return self.__dict__

    class WarningCastingValueFail     (Error): pass
    class WarningValidationError      (Error): pass
    class WarningKeyUnknown           (Error): pass
    class WarningInformationIsMissing (Error): pass
    class AttributeDoesntExist        (Error): pass
    class WrongCSVSyntax              (Error): pass
    class ColumnUnknow                (Error): pass
    class ModelDoesntExist            (Error): pass
    class RelationDoesntExist         (Error): pass

    try:
        # retrieve all models in current topic
        all_models = dict((model.__name__, model) for model in topic.get_models())
        # iterate over all files and dissociate entities .csv from relations .csv
        for file in files:
            if type(file) is tuple:
                file_name = file[0]
                file      = file[1]
            elif hasattr(file, "read"):
                file_name = file.name
            else:
                raise Exception("ERROR")
            csv_reader = utils.open_csv(file)
            header = csv_reader.next()
            assert len(header) > 1, "header should have at least 2 columns"
            assert header[0].endswith("_id"), "First column should begin with a header like <model_name>_id"
            if len(header) >=3 and header[0].endswith("_id") and header[2].endswith("_id"):
                # this is a relationship file
                relations.append((file_name, file))
            else:
                # this is an entities file
                model_name = utils.to_class_name(header[0].replace("_id", ""))
                if model_name in all_models.keys():
                    entities[model_name] = (file_name, file)
                else:
                    raise ModelDoesntExist(model=model_name, file=file_name, models_availables=all_models.keys())

        # first iterate over entities
        logger.debug("BulkUpload: creating entities")
        for entity, (file_name, file) in entities.items():
            csv_reader = utils.open_csv(file)
            header     = csv_reader.next()
            # must check that all columns map to an existing model field
            fields      = utils.get_model_fields(all_models[entity])
            fields_types = {}
            for field in fields:
                fields_types[field['name']] = field['type']
            field_names = [field['name'] for field in fields]
            columns = []
            for column in header[1:]:
                column = utils.to_underscores(column)
                if column is not '':
                    if not column in field_names:
                        raise ColumnUnknow(file=file_name, column=column, model=entity, attributes_available=field_names)
                        break
                    column_type = fields_types[column]
                    columns.append((column, column_type))
            else:
                # here, we know that all columns are valid
                for row in csv_reader:
                    data = {}
                    id   = row[0]
                    for i, (column, column_type) in enumerate(columns):
                        value = str(row[i+1]).decode('utf-8')
                        # cast value if needed
                        if value:
                            try:
                                if "Integer" in column_type:
                                    value = int(value)
                                # TODO: cast float
                                if "Date" in column_type:
                                    value = datetime.datetime(*map(int, re.split('[^\d]', value)[:-1])).replace(tzinfo=utc)
                            except Exception as e:
                                e = WarningCastingValueFail(
                                    column_name = column,
                                    value       = value,
                                    type        = column_type,
                                    data        = data, model=entity,
                                    file        = file_name,
                                    line        = csv_reader.line_num,
                                    error       = str(e)
                                )
                                errors.append(e)
                                break
                            data[column] = value
                    else:
                        # instanciate a model
                        try:
                            item = all_models[entity].objects.create(**data)
                            # map the object with the ID defined in the .csv
                            id_mapping[(entity, id)] = item
                        except Exception as e:
                            errors.append(
                                WarningValidationError(
                                    data  = data,
                                    model = entity,
                                    file  = file_name,
                                    line  = csv_reader.line_num,
                                    error = str(e)
                                )
                            )

        inserted_relations = 0
        # then iterate over relations
        logger.debug("BulkUpload: creating relations")
        for file_name, file in relations:
            # create a csv reader
            csv_reader    = utils.open_csv(file)
            csv_header    = csv_reader.next()
            relation_name = utils.to_underscores(csv_header[1])
            model_from    = utils.to_class_name(csv_header[0].replace("_id", ""))
            model_to      = utils.to_class_name(csv_header[2].replace("_id", ""))
            # check that the relation actually exists between the two objects
            try:
                getattr(all_models[model_from], relation_name)
            except Exception as e:
                raise RelationDoesntExist(
                    file             = file_name,
                    model_from       = model_from,
                    model_to         = model_to,
                    relation_name    = relation_name,
                    fields_available = [field['name'] for field in utils.get_model_fields(all_models[model_from])],
                    error            = str(e))
            for row in csv_reader:
                id_from = row[0]
                id_to   = row[2]
                if id_to and id_from:
                    try:
                        getattr(id_mapping[(model_from, id_from)], relation_name).add(id_mapping[(model_to, id_to)])
                        inserted_relations += 1
                    except KeyError as e:
                        errors.append(
                            WarningKeyUnknown(
                                file             = file_name,
                                line             = csv_reader.line_num,
                                model_from       = model_from,
                                id_from          = id_from,
                                model_to         = model_to,
                                id_to            = id_to,
                                relation_name    = relation_name,
                                error            = str(e)
                            )
                        )
                    except Exception as e:
                        # Error unknown, we break the process to alert the user
                        raise Error(
                            file             = file_name,
                            line             = csv_reader.line_num,
                            model_from       = model_from,
                            id_from          = id_from,
                            model_to         = model_to,
                            id_to            = id_to,
                            relation_name    = relation_name,
                            error            = str(e))
                else:
                    # A key is missing (id_from or id_to) but we don't want to stop the parsing.
                    # Then we store the wrong line to return it to the user.
                    errors.append(
                        WarningInformationIsMissing(
                            file=file_name, row=row, line=csv_reader.line_num, id_to=id_to, id_from=id_from
                        )
                    )

        # Save everything
        saved = 0
        logger.debug("BulkUpload: saving %d objects" % (len(id_mapping)))
        for item in id_mapping.values():
            item.save()
            saved += 1

        return {
            'inserted' : {
                'objects' : saved,
                'links'   : inserted_relations
            },
            "errors" : sorted([dict([(e.__class__.__name__, str(e.__dict__))]) for e in errors])
        }
    except Exception as e:
        import traceback
        logger.error(traceback.format_exc())
        return {
            "errors" : [{e.__class__.__name__ : str(e.__dict__)}]
        }
예제 #5
0
    def get_model_field(self, desc, model_name):
        """
        Build a single model field from its dict description `desc`.

        Returns a (field_name, field_instance) tuple, or (None, None) when
        the description is unusable (missing name or unknown type).
        Side effects: registers validation rules through self.add_rule and,
        for relationships that embed properties, creates a companion
        "properties" model via self.add_model.
        """
        # All field's options; every generated field is nullable by default
        field_opts = dict(null=True)
        # Get the name tag
        field_name = gn(desc, 'name')
        # Convert the name to a python readable format
        field_name = to_underscores(field_name)
        # We didn't found a name
        # @TODO handle that with a custom exception
        if field_name is None: return None, None
        # The field can contains rules: forward each one to the rule registry
        for name, value in gn(desc, 'rules', dict()).iteritems():
            self.add_rule(model_name, field_name, name, value)
        # Get field's special properties (specials win over the defaults,
        # since later pairs take precedence in the dict() merge)
        field_opts = dict( field_opts.items() + self.get_field_specials(desc).items() )
        # "name" fields are indexed so entities can be looked up by name
        if field_name == "name":
            field_opts["indexed"] = True
        # It's a relationship!
        if "related_model" in desc and desc["related_model"] is not None:
            field_opts["target"] = to_class_name(desc["related_model"].lower())
            # NOTE(review): to_class_name is applied a second time on an
            # already-converted value — presumably idempotent; confirm.
            field_target = to_class_name(field_opts["target"])
            # Remove "has_" from the begining of the name
            if field_name.startswith("has_"): field_name = field_name[4:]
            # Build rel_type using the name and the class name
            field_opts["rel_type"] = "%s_has_%s+"  % ( to_underscores(model_name), field_name)
            field_type = "relationship"

            # Add a related name
            if "related_name" in field_opts and field_opts["related_name"] is not None:
                # Convert related_name to the same format
                related_name = field_opts["related_name"]
                related_name = to_underscores(related_name)
                field_opts["related_name"] = related_name
            else:
                related_name = field_opts["related_name"] = None

            # This relationship can embed properties.
            # Properties are directly bound to the relationship field.
            if "fields" in desc:
                # Fields related to the new model
                composite_fields = gn(desc, 'fields', [])
                # Create a field to reference the relationship ID
                composite_fields.append(dict(
                    type="int",
                    name="_relationship",
                    help_text="The ID of the relationship to describe.",
                    indexed=True,
                    rules=dict(is_editable=False)
                ))
                # Create a field recording both extremities of the relationship
                composite_fields.append(dict(
                    type="intarray",
                    name="_endnodes",
                    help_text="IDs of the relationship's extremities.",
                    indexed=True,
                    rules=dict(is_editable=False)
                ))
                # Name of the new model
                composite_name = "%s %s %s Properties" % (
                    model_name,
                    field_name,
                    field_target
                )
                # Create a Model with the relation
                composite_model = {
                    "name": composite_name,
                    "fields": composite_fields
                }
                # Create the new model!
                model = self.add_model(composite_model)
                # We have to register (for later) a rule that says
                # explicitely that this field has properties
                self.add_rule(model_name, field_name, "has_properties", True)
                self.add_rule(model_name, field_name, "through", model)
                # This relationship is visible in the target model
                if related_name is not None:
                    # Add another rule for the reverse relationship
                    self.add_rule(field_target, related_name, "has_properties", True)
                    self.add_rule(field_target, related_name, "through", model)
                # Add a rules to make this "special" model
                self.modelrules.model(model).add(is_relationship_properties=True,
                                                 relationship_source=model_name,
                                                 relationship_target=field_target,
                                                 is_searchable=False)
        # It's a literal value
        else:
            # Picks one of the two tags type
            # NOTE(review): raises KeyError when "type" is absent from desc
            field_type = desc["type"].lower()
            # Remove "field" suffix
            if field_type.endswith("field"): field_type = field_type[0:-5]
        # Skip unkown type
        # @TODO raise custom exception
        if not field_type in self.JSONTYPES: return None, None
        # Convert type to neo4django property type
        field_type = self.JSONTYPES[field_type]
        # Add a default value for boolean properties
        if field_type == 'BooleanProperty' and not 'default' in field_opts.keys():
            field_opts['default'] = False
        # Return an instance of the field
        return field_name, getattr(models, field_type)(**field_opts)
예제 #6
0
def process_bulk_parsing_and_save_as_model(topic, files, start_time=None):
    """
    Job which parses uploaded content, validates and saves them as model.

    Arguments:
        topic      -- the topic whose models will receive the parsed data
        files      -- list/tuple of (file_name, file_content) pairs; each
                      file_content is an iterable of CSV lines
        start_time -- optional epoch timestamp used to compute the job
                      duration; defaults to the current time

    Returns a dict: on success {'duration', 'inserted': {'objects', 'links'},
    'errors'} where 'errors' collects the non-fatal warnings; on fatal
    failure only {'errors': [...]}.
    """
    # Explicit `is not None` test: the old `x != None and x or default`
    # idiom silently replaced a falsy-but-valid start_time such as 0.0.
    start_time = start_time if start_time is not None else time.time()
    entities = {}
    relations = []
    errors = []
    # Maps (model_name, csv_id) -> created instance, so relation files can
    # reference entities by the IDs used inside the uploaded CSVs.
    id_mapping = {}
    nb_lines = 0
    file_reading_progression = 0
    job = get_current_job()

    # Define Exceptions
    class Error(Exception):
        """
        Generic Custom Exception for this endpoint.
        Include the topic.
        """
        def __init__(self, **kwargs):
            """ set the topic and add all the parameters as attributes """
            self.topic = topic.title
            for key, value in kwargs.items():
                setattr(self, key, value)

        def __str__(self):
            # __str__ must return a string: returning the raw __dict__
            # raised "TypeError: __str__ returned non-string" whenever
            # str() was applied to one of these exceptions.
            return str(self.__dict__)

    class WarningCastingValueFail(Error):
        pass

    class WarningValidationError(Error):
        pass

    class WarningKeyUnknown(Error):
        pass

    class WarningInformationIsMissing(Error):
        pass

    class AttributeDoesntExist(Error):
        pass

    class WrongCSVSyntax(Error):
        pass

    class ColumnUnknow(Error):
        pass

    class ModelDoesntExist(Error):
        pass

    class RelationDoesntExist(Error):
        pass

    try:
        assert type(files) in (tuple, list), type(files)
        assert len(files) > 0, "You need to upload at least one file."
        assert type(files[0]) in (tuple, list)
        assert len(files[0]) == 2

        # retrieve all models in current topic
        all_models = dict(
            (model.__name__, model) for model in topic.get_models())
        # iterate over all files and dissociate entities .csv from relations .csv
        for file in files:
            # Accept lists as well as tuples, consistent with the
            # `assert type(files[0]) in (tuple, list)` check above.
            if type(file) in (tuple, list):
                file_name = file[0]
                file = file[1]
            else:
                raise Exception(
                    "every file must be a (file_name, content) pair")
            csv_reader = utils.open_csv(file)
            header = csv_reader.next()
            # The message previously lacked .format() and showed a
            # literal "{file_name}" to the user.
            assert len(
                header
            ) > 1, "{file_name} header should have at least 2 columns".format(
                file_name=file_name)
            assert header[0].endswith(
                "_id"
            ), "{file_name} : First column should begin with a header like <model_name>_id. Actually {first_col}".format(
                file_name=file_name, first_col=header[0])
            if len(header) >= 3 and header[0].endswith(
                    "_id") and header[2].endswith("_id"):
                # this is a relationship file
                relations.append((file_name, file))
            else:
                # this is an entities file
                model_name = utils.to_class_name(header[0].replace("_id", ""))
                if model_name in all_models.keys():
                    entities[model_name] = (file_name, file)
                else:
                    raise ModelDoesntExist(model=model_name,
                                           file=file_name,
                                           models_availables=all_models.keys())
            nb_lines += len(file) - 1  # -1 removes headers

        # first iterate over entities
        logger.debug("BulkUpload: creating entities")
        for entity, (file_name, file) in entities.items():
            csv_reader = utils.open_csv(file)
            header = csv_reader.next()
            # must check that all columns map to an existing model field
            fields = utils.get_model_fields(all_models[entity])
            fields_types = {}
            for field in fields:
                fields_types[field['name']] = field['type']
            field_names = [field['name'] for field in fields]
            columns = []
            for column in header[1:]:
                column = utils.to_underscores(column)
                if not column in field_names and not column.endswith(
                        "__sources__"):
                    # (the `break` that followed this raise was unreachable
                    # dead code and has been removed)
                    raise ColumnUnknow(file=file_name,
                                       column=column,
                                       model=entity,
                                       attributes_available=field_names)
                if column.endswith("__sources__"):
                    # A "<field>__sources__" column carries references for
                    # the field it suffixes, not a model attribute itself.
                    column_type = "__sources__"
                    column = column[:-len("__sources__")]
                    if not column in field_names:
                        raise ColumnUnknow(file=file_name,
                                           column=column,
                                           model=entity,
                                           attributes_available=field_names)
                else:
                    column_type = fields_types.get(column, None)
                columns.append((column, column_type))
            else:
                # here, we know that all columns are valid
                for row in csv_reader:
                    data = {}
                    sources = {}
                    entity_id = row[0]
                    for i, (column, column_type) in enumerate(columns):
                        value = str(row[i + 1]).decode('utf-8')
                        # cast value if needed; empty cells are skipped
                        if value:
                            try:
                                if "Integer" in column_type:
                                    value = int(value)
                                # TODO: cast float
                                if "Date" in column_type:
                                    # Split on any non-digit run and feed
                                    # the first 3 parts (Y, M, D) to datetime
                                    value = datetime.datetime(*map(
                                        int,
                                        re.split(r'[^\d]', value)[:3])).replace(
                                            tzinfo=utc)

                            except Exception as e:
                                # Casting failed: record a warning and skip
                                # the rest of this row (the for/else below
                                # will not create the instance).
                                e = WarningCastingValueFail(
                                    column_name=column,
                                    value=value,
                                    type=column_type,
                                    data=data,
                                    model=entity,
                                    file=file_name,
                                    line=csv_reader.line_num,
                                    error=str(e))
                                errors.append(e)
                                break
                            if column_type == "__sources__":
                                sources[column] = value
                            else:
                                data[column] = value
                    else:
                        # instanciate a model
                        try:
                            item = all_models[entity].objects.create(**data)
                            # map the object with the ID defined in the .csv
                            id_mapping[(entity, entity_id)] = item
                            # create sources ("||" separates several refs)
                            for sourced_field, reference in sources.items():
                                for ref in reference.split("||"):
                                    FieldSource.objects.create(
                                        individual=item.id,
                                        field=sourced_field,
                                        reference=ref)
                            # FIXME: job can be accessed somewhere else (i.e detective/topics/common/jobs.py:JobResource)
                            # Concurrent access are not secure here.
                            # For now we refresh the job just before saving it.
                            file_reading_progression += 1
                            if job:
                                job.refresh()
                                job.meta["file_reading_progression"] = (
                                    float(file_reading_progression) /
                                    float(nb_lines)) * 100
                                job.meta["file_reading"] = file_name
                                job.save()
                        except Exception as e:
                            errors.append(
                                WarningValidationError(
                                    data=data,
                                    model=entity,
                                    file=file_name,
                                    line=csv_reader.line_num,
                                    error=str(e)))

        inserted_relations = 0
        # then iterate over relations
        logger.debug("BulkUpload: creating relations")
        for file_name, file in relations:
            # create a csv reader
            csv_reader = utils.open_csv(file)
            csv_header = csv_reader.next()
            relation_name = utils.to_underscores(csv_header[1])
            model_from = utils.to_class_name(csv_header[0].replace("_id", ""))
            model_to = utils.to_class_name(csv_header[2].replace("_id", ""))
            # columns beyond the third carry relationship properties
            properties_name = csv_header[3:]
            # retrieve ModelProperties from related model
            ModelProperties = topic.get_rules().model(
                all_models[model_from]).field(relation_name).get("through")
            # check that the relation actually exists between the two objects
            try:
                getattr(all_models[model_from], relation_name)
            except Exception as e:
                raise RelationDoesntExist(
                    file=file_name,
                    model_from=model_from,
                    model_to=model_to,
                    relation_name=relation_name,
                    fields_available=[
                        field['name'] for field in utils.iterate_model_fields(
                            all_models[model_from])
                    ],
                    error=str(e))
            for row in csv_reader:
                id_from = row[0]
                id_to = row[2]
                properties = [p.decode('utf-8') for p in row[3:]]
                if id_to and id_from:
                    try:
                        instance_from = id_mapping[(model_from, id_from)]
                        instance_to = id_mapping[(model_to, id_to)]
                        getattr(instance_from, relation_name).add(instance_to)
                        # add properties if needed
                        if ModelProperties and properties_name and properties:
                            # save the relationship to create an id
                            instance_from.save()
                            # retrieve this id
                            relation_id = next(
                                rel.id for rel in
                                instance_from.node.relationships.outgoing()
                                if rel.end.id == instance_to.id)
                            # properties of the relationship
                            relation_args = {
                                "_endnodes": [
                                    id_mapping[(model_from, id_from)].id,
                                    instance_to.id
                                ],
                                "_relationship":
                                relation_id,
                            }
                            # Pairwise the properties with their names
                            relation_args.update(
                                zip(properties_name, properties))
                            try:
                                ModelProperties.objects.create(**relation_args)
                            except TypeError as e:
                                errors.append(
                                    AttributeDoesntExist(
                                        file=file_name,
                                        line=csv_reader.line_num,
                                        model_from=model_from,
                                        id_from=id_from,
                                        model_to=model_to,
                                        id_to=id_to,
                                        relation_args=relation_args,
                                        error=str(e)))
                        # update the job
                        inserted_relations += 1
                        file_reading_progression += 1
                        if job:
                            job.refresh()
                            job.meta["file_reading_progression"] = (
                                float(file_reading_progression) /
                                float(nb_lines)) * 100
                            job.meta["file_reading"] = file_name
                            job.save()
                    except KeyError as e:
                        # One of the two CSV ids was never created above
                        errors.append(
                            WarningKeyUnknown(file=file_name,
                                              line=csv_reader.line_num,
                                              model_from=model_from,
                                              id_from=id_from,
                                              model_to=model_to,
                                              id_to=id_to,
                                              relation_name=relation_name,
                                              error=str(e)))
                    except Exception as e:
                        # Error unknown, we break the process to alert the user
                        raise Error(file=file_name,
                                    line=csv_reader.line_num,
                                    model_from=model_from,
                                    id_from=id_from,
                                    model_to=model_to,
                                    id_to=id_to,
                                    relation_name=relation_name,
                                    error=str(e))
                else:
                    # A key is missing (id_from or id_to) but we don't want to stop the parsing.
                    # Then we store the wrong line to return it to the user.
                    errors.append(
                        WarningInformationIsMissing(file=file_name,
                                                    row=row,
                                                    line=csv_reader.line_num,
                                                    id_to=id_to,
                                                    id_from=id_from))

        # Save everything
        saved = 0
        logger.debug("BulkUpload: saving %d objects" % (len(id_mapping)))
        if job:
            job.refresh()
            job.meta["objects_to_save"] = len(id_mapping)
            job.save()
        for item in id_mapping.values():
            item.save()
            saved += 1
            if job:
                job.refresh()
                job.meta["saving_progression"] = saved
                job.save()
        if job: job.refresh()
        if job and "track" in job.meta:
            from django.core.mail import send_mail
            user = User.objects.get(pk=job.meta["user"])
            send_mail("upload finished", "your upload just finished",
                      settings.DEFAULT_FROM_EMAIL, (user.email, ))
        return {
            'duration': (time.time() - start_time),
            'inserted': {
                'objects': saved,
                'links': inserted_relations
            },
            "errors":
            sorted([
                dict([(e.__class__.__name__, str(e.__dict__))]) for e in errors
            ])
        }

    except Exception as e:
        import traceback
        logger.error(traceback.format_exc())
        # Custom Error instances carry their context in __dict__; builtin
        # exceptions have an empty __dict__, so fall back to the message.
        if e.__dict__:
            message = str(e.__dict__)
        else:
            message = e.message
        return {"errors": [{e.__class__.__name__: message}]}
예제 #7
0
def parse(ontology, module='', app_label=None):
    """
    Build node model classes from an OWL ontology.

    `ontology` is either a Django FieldFile (its content is read directly)
    or anything convertible to a file path with str(). Returns a dict
    mapping each generated class name to its model class.
    """
    # Default the app label to the last component of the module path
    if app_label is None:
        app_label = module.split(".")[-1]
    # Obtain the XML root, from an in-memory upload or from a path
    if type(ontology) is FieldFile:
        root = ET.fromstring(ontology.read())
    else:
        root = ET.parse(str(ontology)).getroot()
    # Generated model classes, keyed by class name
    classes = dict()
    for owl_class in root.findall("owl:Class", namespaces=NAMESPACES):
        # PEP-compliant class name taken from the rdf:about URI fragment
        class_name = to_class_name(
            attr(owl_class, "rdf:about", "").split('#')[-1])
        # Special attributes declared on the class
        specials = get_class_specials(owl_class)
        # Base fields shared by every generated class
        class_fields = {
            # Additional informations
            "_description": specials["help_text"],
            "_topic": specials["scope"],
            # Default fields
            "_author": models.IntArrayProperty(
                null=True,
                help_text=u'People that edited this entity.',
                verbose_name=u'author'),
            "_status": models.IntegerProperty(
                null=True, help_text=u'', verbose_name=u'status'),
        }
        # Meta options picked from the class specials
        class_options = {}
        for opt in ["verbose_name", "verbose_name_plural"]:
            if specials[opt] is not None:
                class_options[opt] = specials[opt]
        # Walk every restriction to build the class fields
        for restriction in owl_class.findall(
                "rdfs:subClassOf//owl:Restriction", namespaces=NAMESPACES):
            prop = restriction.find("owl:onProperty", namespaces=NAMESPACES)
            # Restrictions without a property name are skipped
            if prop is None:
                continue
            # Full field name from the rdf:resource attribute
            resource = attr(prop, "rdf:resource")
            # Start from the defaults, then overlay the field specials
            field_opts = dict(null=True)
            field_opts.update(get_field_specials(root, resource))
            # Convert the name to a python readable format
            field_name = to_underscores(resource.split("#")[-1])
            # Normalize related_name the same way when present
            if field_opts.get("related_name") is not None:
                field_opts["related_name"] = to_underscores(
                    field_opts["related_name"])
            on_class = restriction.find("owl:onClass", namespaces=NAMESPACES)
            if on_class is not None:
                # A relationship toward another class
                field_opts["target"] = to_class_name(
                    attr(on_class, "rdf:resource").split("#")[-1])
                # Strip a leading "has_" from the field name
                if field_name.startswith("has_"):
                    field_name = field_name[4:]
                # rel_type combines the class name and the field name
                field_opts["rel_type"] = "%s_has_%s+" % (
                    to_underscores(class_name), field_name)
                field_type = "Relationship"
            else:
                # A literal value: its type lives in one of two tags
                data_range = restriction.find("owl:onDataRange",
                                              namespaces=NAMESPACES)
                values_from = restriction.find("owl:someValuesFrom",
                                               namespaces=NAMESPACES)
                if data_range is not None:
                    type_node = data_range
                else:
                    type_node = values_from
                # Neither tag is present: nothing to build
                if type_node is None:
                    continue
                # Map the OWL type to a neo4django property type
                field_type = OWLTYPES[attr(type_node,
                                           "rdf:resource").split("#")[-1]]
            # Record the field
            class_fields[field_name] = getattr(models,
                                               field_type)(**field_opts)
        # Record the class with this fields
        classes[class_name] = create_node_model(class_name,
                                                class_fields,
                                                app_label=app_label,
                                                options=class_options,
                                                module=module)

        # Prevent a bug with select_related when using neo4django and virtual models
        if not hasattr(classes[class_name]._meta, '_relationships'):
            classes[class_name]._meta._relationships = {}
    return classes
예제 #8
0
def parse(ontology, module='', app_label=None):
    """
    Parse an OWL ontology and create a neo4django node model for every
    ``owl:Class`` it declares.

    :param ontology: path to the ontology XML file, or a Django ``FieldFile``
        whose content is the ontology.
    :param module: dotted path of the module the generated models belong to.
    :param app_label: Django app label for the models; defaults to the last
        component of ``module``.
    :returns: dict mapping each PEP-formatted class name to its model class.
    """
    app_label = app_label if app_label is not None else module.split(".")[-1]
    # Deduce the path to the ontology.
    # isinstance (rather than a strict `type is` check) also accepts
    # FieldFile subclasses/proxies.
    if isinstance(ontology, FieldFile):
        raw = ontology.read()
        # Open the ontology file and returns the root
        root = ET.fromstring(raw)
    else:
        tree = ET.parse(str(ontology))
        # Get the root of the xml
        root = tree.getroot()
    # Where record the new classes
    classes = dict()
    # List classes
    for clss in root.findall("owl:Class", namespaces=NAMESPACES):
        # Extract the class name from the rdf:about attribute
        class_name = attr(clss, "rdf:about", "").split('#')[-1]
        # Format the class name to be PEP compliant
        class_name = to_class_name(class_name)
        # Get all special attributes for this class
        class_specials = get_class_specials(clss)
        # Every class field is recorded into this dict
        class_fields = {
            # Additional informations
            "_description": class_specials["help_text"],
            "_topic"      : class_specials["scope"],
            # Default fields
            "_author": models.IntArrayProperty(null=True, help_text=u'People that edited this entity.', verbose_name=u'author'),
            "_status": models.IntegerProperty(null=True, help_text=u'', verbose_name=u'status')
        }
        # Pick some options (Meta class)
        class_options = {}
        for f in ["verbose_name", "verbose_name_plural"]:
            if class_specials[f] is not None:
                class_options[f] = class_specials[f]
        # List all fields
        for field in clss.findall("rdfs:subClassOf//owl:Restriction", namespaces=NAMESPACES):
            # All field's options
            field_opts = dict(null=True)
            # Get the name tag
            field_name = field.find("owl:onProperty", namespaces=NAMESPACES)
            # We didn't find a name: skip this restriction
            if field_name is None: continue
            # Get the complete field name using the rdf:resource attribute
            field_name = attr(field_name, "rdf:resource")
            # Merge in the field's special properties (specials override the
            # defaults, just like the original dict(a.items() + b.items())
            # did on Python 2 — update() is equivalent and works on Python 3).
            field_opts.update(get_field_specials(root, field_name))
            # Convert the name to a python readable format
            field_name = to_underscores(field_name.split("#")[-1])
            if field_opts.get("related_name") is not None:
                # Convert related_name to the same format
                field_opts["related_name"] = to_underscores(field_opts["related_name"])
            # It might be a relationship
            on_class = field.find("owl:onClass", namespaces=NAMESPACES)
            # It's a relationship!
            if on_class is not None:
                field_opts["target"] = to_class_name(attr(on_class, "rdf:resource").split("#")[-1])
                # Remove "has_" from the begining of the name
                if field_name.startswith("has_"): field_name = field_name[4:]
                # Build rel_type using the name and the class name
                field_opts["rel_type"] = "%s_has_%s+" % (to_underscores(class_name), field_name)
                field_type = "Relationship"
            else:
                # Get the type tag
                data_range = field.find("owl:onDataRange", namespaces=NAMESPACES)
                # It might be another tag
                values_from = field.find("owl:someValuesFrom", namespaces=NAMESPACES)
                # Picks one of the two tags type
                field_type = data_range if data_range is not None else values_from
                # It might be nothing: drop the field entirely
                if field_type is None: continue
                # Convert the type to a python readable format
                field_type = OWLTYPES[attr(field_type, "rdf:resource").split("#")[-1]]
            # Record the field
            class_fields[field_name] = getattr(models, field_type)(**field_opts)
        # Record the class with these fields
        classes[class_name] = create_node_model(class_name, class_fields,
                                                app_label=app_label,
                                                options=class_options,
                                                module=module)

        # Prevent a bug with select_related when using neo4django and virtual models
        if not hasattr(classes[class_name]._meta, '_relationships'):
            classes[class_name]._meta._relationships = {}
    return classes