Exemple #1
0
def process_bulk_parsing_and_save_as_model(topic, files, start_time=None):
    """
    Job which parses uploaded content, validates and saves them as model.

    The uploaded CSV files are first split into entities files and relations
    files: a file whose header has at least 3 columns, with the first and the
    third one ending with "_id", is a relations file. Entities are created
    first, then the relations between them, and finally every created object
    is saved. When a current job exists, the progression is written into its
    `meta` dictionary.

    Parameters:
        topic      -- object providing `title`, `get_models()` and
                      `get_rules()`
        files      -- list or tuple of (file_name, file) pairs, `file` being
                      what `utils.open_csv` (and `len`) is called on
        start_time -- optional timestamp used to compute the duration
                      (defaults to the current time)

    Returns a dict. On success it contains 'duration', 'inserted' (the number
    of saved 'objects' and of inserted 'links') and 'errors' (the non-blocking
    warnings collected while parsing). When a blocking exception occurs, the
    dict only contains 'errors' with one entry describing that exception.
    """

    if start_time is None:
        start_time = time.time()
    entities                 = {}
    relations                = []
    errors                   = []
    id_mapping               = {}
    nb_lines                 = 0
    file_reading_progression = 0
    job                      = get_current_job()

    # Define Exceptions
    class Error (Exception):
        """
        Generic Custom Exception for this endpoint.
        Include the topic.
        """
        def __init__(self, **kwargs):
            """ set the topic and add all the parameters as attributes """
            self.topic = topic.title
            for key, value in kwargs.items():
                setattr(self, key, value)
        def __str__(self):
            # __str__ must return a string: returning the dict itself makes
            # str(error) raise a TypeError
            return str(self.__dict__)

    class WarningCastingValueFail     (Error): pass
    class WarningValidationError      (Error): pass
    class WarningKeyUnknown           (Error): pass
    class WarningInformationIsMissing (Error): pass
    class AttributeDoesntExist        (Error): pass
    class WrongCSVSyntax              (Error): pass
    class ColumnUnknow                (Error): pass
    class ModelDoesntExist            (Error): pass
    class RelationDoesntExist         (Error): pass

    try:
        # These assertions are caught by the generic handler at the bottom and
        # returned to the caller as an "AssertionError" entry.
        assert type(files) in (tuple, list), type(files)
        assert len(files) > 0, "You need to upload at least one file."
        assert type(files[0]) in (tuple, list)
        assert len(files[0]) == 2

        # retrieve all models in current topic
        all_models = dict((model.__name__, model) for model in topic.get_models())
        # iterate over all files and dissociate entities .csv from relations .csv
        for file in files:
            # the assertions above allow lists as well as tuples, so both are
            # accepted here
            if not isinstance(file, (tuple, list)):
                raise Exception("Each uploaded file must be a (file_name, file) pair.")
            file_name, file = file[0], file[1]
            csv_reader = utils.open_csv(file)
            header     = csv_reader.next()
            assert len(header) > 1, "{file_name} header should have at least 2 columns".format(file_name=file_name)
            assert header[0].endswith("_id"), "{file_name} : First column should begin with a header like <model_name>_id. Actually {first_col}".format(file_name=file_name, first_col=header[0])
            if len(header) >= 3 and header[0].endswith("_id") and header[2].endswith("_id"):
                # this is a relationship file
                relations.append((file_name, file))
            else:
                # this is an entities file
                model_name = utils.to_class_name(header[0].replace("_id", ""))
                if model_name in all_models:
                    entities[model_name] = (file_name, file)
                else:
                    raise ModelDoesntExist(model=model_name, file=file_name, models_availables=all_models.keys())
            nb_lines += len(file) - 1 # -1 removes headers

        # first iterate over entities
        logger.debug("BulkUpload: creating entities")
        for entity, (file_name, file) in entities.items():
            csv_reader = utils.open_csv(file)
            header     = csv_reader.next()
            # must check that all columns map to an existing model field
            fields       = utils.get_model_fields(all_models[entity])
            fields_types = dict((field['name'], field['type']) for field in fields)
            field_names  = [field['name'] for field in fields]
            columns      = []
            for column in header[1:]:
                column = utils.to_underscores(column)
                if column.endswith("__sources__"):
                    # "<field>__sources__" columns hold the sources of <field>
                    column_type = "__sources__"
                    column      = column[:-len("__sources__")]
                else:
                    column_type = fields_types.get(column, None)
                if column not in field_names:
                    raise ColumnUnknow(file=file_name, column=column, model=entity, attributes_available=field_names)
                columns.append((column, column_type))

            # here, we know that all columns are valid
            for row in csv_reader:
                data      = {}
                sources   = {}
                entity_id = row[0]
                for i, (column, column_type) in enumerate(columns):
                    value = str(row[i+1]).decode('utf-8')
                    # cast value if needed (empty values are ignored)
                    if value:
                        try:
                            if "Integer" in column_type:
                                value = int(value)
                            # TODO: cast float
                            if "Date" in column_type:
                                # only the first 3 numbers (year, month, day) are kept
                                value = datetime.datetime(*map(int, re.split(r'[^\d]', value)[:3])).replace(tzinfo=utc)
                        except Exception as e:
                            errors.append(
                                WarningCastingValueFail(
                                    column_name = column,
                                    value       = value,
                                    type        = column_type,
                                    data        = data, model=entity,
                                    file        = file_name,
                                    line        = csv_reader.line_num,
                                    error       = str(e)
                                )
                            )
                            # a value can't be casted: skip the whole row
                            break
                        if column_type == "__sources__":
                            sources[column] = value
                        else:
                            data[column] = value
                else:
                    # every value has been read without error: instanciate a model
                    try:
                        item = all_models[entity].objects.create(**data)
                        # map the object with the ID defined in the .csv
                        id_mapping[(entity, entity_id)] = item
                        # create sources
                        for sourced_field, reference in sources.items():
                            for ref in reference.split("||"):
                                FieldSource.objects.create(individual=item.id, field=sourced_field, reference=ref)
                        # FIXME: job can be accessed somewhere else (i.e detective/topics/common/jobs.py:JobResource)
                        # Concurrent access are not secure here.
                        # For now we refresh the job just before saving it.
                        file_reading_progression += 1
                        if job:
                            job.refresh()
                            job.meta["file_reading_progression"] = (float(file_reading_progression) / float(nb_lines)) * 100
                            job.meta["file_reading"] = file_name
                            job.save()
                    except Exception as e:
                        errors.append(
                            WarningValidationError(
                                data  = data,
                                model = entity,
                                file  = file_name,
                                line  = csv_reader.line_num,
                                error = str(e)
                            )
                        )

        inserted_relations = 0
        # then iterate over relations
        logger.debug("BulkUpload: creating relations")
        for file_name, file in relations:
            # create a csv reader
            csv_reader      = utils.open_csv(file)
            csv_header      = csv_reader.next()
            relation_name   = utils.to_underscores(csv_header[1])
            model_from      = utils.to_class_name(csv_header[0].replace("_id", ""))
            model_to        = utils.to_class_name(csv_header[2].replace("_id", ""))
            properties_name = csv_header[3:]
            # retrieve ModelProperties from related model
            ModelProperties = topic.get_rules().model(all_models[model_from]).field(relation_name).get("through")
            # check that the relation actually exists between the two objects
            try:
                getattr(all_models[model_from], relation_name)
            except Exception as e:
                raise RelationDoesntExist(
                    file             = file_name,
                    model_from       = model_from,
                    model_to         = model_to,
                    relation_name    = relation_name,
                    fields_available = [field['name'] for field in utils.iterate_model_fields(all_models[model_from])],
                    error            = str(e))
            for row in csv_reader:
                id_from    = row[0]
                id_to      = row[2]
                properties = [p.decode('utf-8') for p in row[3:]]
                if id_to and id_from:
                    try:
                        instance_from = id_mapping[(model_from, id_from)]
                        instance_to   = id_mapping[(model_to, id_to)]
                        getattr(instance_from, relation_name).add(instance_to)
                        # add properties if needed
                        if ModelProperties and properties_name and properties:
                            # save the relationship to create an id
                            instance_from.save()
                            # retrieve this id
                            relation_id = next(rel.id for rel in instance_from.node.relationships.outgoing() if rel.end.id == instance_to.id)
                            # properties of the relationship
                            relation_args = {
                                "_endnodes"     : [instance_from.id, instance_to.id],
                                "_relationship" : relation_id,
                            }
                            # Pairwise the properties with their names
                            relation_args.update(zip(properties_name, properties))
                            try:
                                ModelProperties.objects.create(**relation_args)
                            except TypeError as e:
                                errors.append(
                                    AttributeDoesntExist(
                                        file             = file_name,
                                        line             = csv_reader.line_num,
                                        model_from       = model_from,
                                        id_from          = id_from,
                                        model_to         = model_to,
                                        id_to            = id_to,
                                        relation_args    = relation_args,
                                        error            = str(e)
                                    )
                                )
                        # update the job
                        inserted_relations += 1
                        file_reading_progression += 1
                        if job:
                            job.refresh()
                            job.meta["file_reading_progression"] = (float(file_reading_progression) / float(nb_lines)) * 100
                            job.meta["file_reading"] = file_name
                            job.save()
                    except KeyError as e:
                        # one of the two ids doesn't match a created entity
                        errors.append(
                            WarningKeyUnknown(
                                file             = file_name,
                                line             = csv_reader.line_num,
                                model_from       = model_from,
                                id_from          = id_from,
                                model_to         = model_to,
                                id_to            = id_to,
                                relation_name    = relation_name,
                                error            = str(e)
                            )
                        )
                    except Exception as e:
                        # Error unknown, we break the process to alert the user
                        raise Error(
                            file             = file_name,
                            line             = csv_reader.line_num,
                            model_from       = model_from,
                            id_from          = id_from,
                            model_to         = model_to,
                            id_to            = id_to,
                            relation_name    = relation_name,
                            error            = str(e))
                else:
                    # A key is missing (id_from or id_to) but we don't want to stop the parsing.
                    # Then we store the wrong line to return it to the user.
                    errors.append(
                        WarningInformationIsMissing(
                            file=file_name, row=row, line=csv_reader.line_num, id_to=id_to, id_from=id_from
                        )
                    )

        # Save everything
        saved = 0
        logger.debug("BulkUpload: saving %d objects" % (len(id_mapping)))
        if job:
            job.refresh()
            job.meta["objects_to_save"] = len(id_mapping)
            job.save()
        for item in id_mapping.values():
            item.save()
            saved += 1
            if job:
                job.refresh()
                job.meta["saving_progression"] = saved
                job.save()
        if job: job.refresh()
        # when the job is tracked, tell the user that the upload is over
        if job and "track" in job.meta:
            from django.core.mail import send_mail
            user = User.objects.get(pk=job.meta["user"])
            send_mail("upload finished", "your upload just finished", settings.DEFAULT_FROM_EMAIL, (user.email,))
        return {
            'duration' : (time.time() - start_time),
            'inserted' : {
                'objects' : saved,
                'links'   : inserted_relations
            },
            "errors" : sorted([{e.__class__.__name__ : str(e.__dict__)} for e in errors])
        }

    except Exception as e:
        import traceback
        logger.error(traceback.format_exc())
        if e.__dict__:
            # custom errors carry their details as attributes
            message = str(e.__dict__)
        else:
            # `message` is a deprecated attribute of exceptions: use it when
            # it exists, fall back on the string form of the exception
            message = getattr(e, "message", str(e))
        return {
            "errors" : [{e.__class__.__name__ : message}]
        }
Exemple #2
0
def process_parsing(topic, files):
    """
    Job which reads the uploaded files, validate and saves them as model.

    The uploaded CSV files are first split into entities files and relations
    files: a file whose header has at least 3 columns, with the first and the
    third one ending with "_id", is a relations file. Entities are created
    first, then the relations between them, and finally every created object
    is saved.

    Parameters:
        topic -- object providing `title` and `get_models()`
        files -- list or tuple whose items are (file_name, file) pairs (or
                 objects with a `read` attribute and a `name`); the first
                 item must be a pair

    Returns a dict. On success it contains 'inserted' (the number of saved
    'objects' and of inserted 'links') and 'errors' (the non-blocking warnings
    collected while parsing). When a blocking exception occurs, the dict only
    contains 'errors' with one entry describing that exception.

    Raises AssertionError when `files` doesn't have the expected shape (these
    checks are done outside of the error handler).
    """

    entities   = {}
    relations  = []
    errors     = []
    id_mapping = {}

    assert type(files) in (tuple, list)
    assert len(files) > 0
    assert type(files[0]) in (tuple, list)
    assert len(files[0]) == 2

    # Define Exceptions
    class Error (Exception):
        """
        Generic Custom Exception for this endpoint.
        Include the topic.
        """
        def __init__(self, **kwargs):
            """ set the topic and add all the parameters as attributes """
            self.topic = topic.title
            for key, value in kwargs.items():
                setattr(self, key, value)
        def __str__(self):
            # __str__ must return a string: returning the dict itself makes
            # str(error) raise a TypeError
            return str(self.__dict__)

    class WarningCastingValueFail     (Error): pass
    class WarningValidationError      (Error): pass
    class WarningKeyUnknown           (Error): pass
    class WarningInformationIsMissing (Error): pass
    class AttributeDoesntExist        (Error): pass
    class WrongCSVSyntax              (Error): pass
    class ColumnUnknow                (Error): pass
    class ModelDoesntExist            (Error): pass
    class RelationDoesntExist         (Error): pass

    try:
        # retrieve all models in current topic
        all_models = dict((model.__name__, model) for model in topic.get_models())
        # iterate over all files and dissociate entities .csv from relations .csv
        for file in files:
            # the assertions above allow lists as well as tuples, so both are
            # accepted here
            if isinstance(file, (tuple, list)):
                file_name = file[0]
                file      = file[1]
            elif hasattr(file, "read"):
                file_name = file.name
            else:
                raise Exception("ERROR")
            csv_reader = utils.open_csv(file)
            header = csv_reader.next()
            assert len(header) > 1, "header should have at least 2 columns"
            assert header[0].endswith("_id"), "First column should begin with a header like <model_name>_id"
            if len(header) >= 3 and header[0].endswith("_id") and header[2].endswith("_id"):
                # this is a relationship file
                relations.append((file_name, file))
            else:
                # this is an entities file
                model_name = utils.to_class_name(header[0].replace("_id", ""))
                if model_name in all_models:
                    entities[model_name] = (file_name, file)
                else:
                    raise ModelDoesntExist(model=model_name, file=file_name, models_availables=all_models.keys())

        # first iterate over entities
        logger.debug("BulkUpload: creating entities")
        for entity, (file_name, file) in entities.items():
            csv_reader = utils.open_csv(file)
            header     = csv_reader.next()
            # must check that all columns map to an existing model field
            fields       = utils.get_model_fields(all_models[entity])
            fields_types = dict((field['name'], field['type']) for field in fields)
            field_names  = [field['name'] for field in fields]
            columns      = []
            for position, column in enumerate(header[1:]):
                column = utils.to_underscores(column)
                # columns without a name are ignored
                if column != '':
                    if column not in field_names:
                        raise ColumnUnknow(file=file_name, column=column, model=entity, attributes_available=field_names)
                    # keep the index of the column in the row: an ignored
                    # column must not shift the values of the following ones
                    columns.append((position + 1, column, fields_types[column]))

            # here, we know that all columns are valid
            for row in csv_reader:
                data      = {}
                entity_id = row[0]
                for index, column, column_type in columns:
                    value = str(row[index]).decode('utf-8')
                    # cast value if needed (empty values are ignored)
                    if value:
                        try:
                            if "Integer" in column_type:
                                value = int(value)
                            # TODO: cast float
                            if "Date" in column_type:
                                # NOTE(review): the last chunk of the split is
                                # dropped, which looks designed for values
                                # ending with a non-digit character - confirm
                                value = datetime.datetime(*map(int, re.split(r'[^\d]', value)[:-1])).replace(tzinfo=utc)
                        except Exception as e:
                            errors.append(
                                WarningCastingValueFail(
                                    column_name = column,
                                    value       = value,
                                    type        = column_type,
                                    data        = data, model=entity,
                                    file        = file_name,
                                    line        = csv_reader.line_num,
                                    error       = str(e)
                                )
                            )
                            # a value can't be casted: skip the whole row
                            break
                        data[column] = value
                else:
                    # every value has been read without error: instanciate a model
                    try:
                        item = all_models[entity].objects.create(**data)
                        # map the object with the ID defined in the .csv
                        id_mapping[(entity, entity_id)] = item
                    except Exception as e:
                        errors.append(
                            WarningValidationError(
                                data  = data,
                                model = entity,
                                file  = file_name,
                                line  = csv_reader.line_num,
                                error = str(e)
                            )
                        )

        inserted_relations = 0
        # then iterate over relations
        logger.debug("BulkUpload: creating relations")
        for file_name, file in relations:
            # create a csv reader
            csv_reader    = utils.open_csv(file)
            csv_header    = csv_reader.next()
            relation_name = utils.to_underscores(csv_header[1])
            model_from    = utils.to_class_name(csv_header[0].replace("_id", ""))
            model_to      = utils.to_class_name(csv_header[2].replace("_id", ""))
            # check that the relation actually exists between the two objects
            try:
                getattr(all_models[model_from], relation_name)
            except Exception as e:
                raise RelationDoesntExist(
                    file             = file_name,
                    model_from       = model_from,
                    model_to         = model_to,
                    relation_name    = relation_name,
                    fields_available = [field['name'] for field in utils.get_model_fields(all_models[model_from])],
                    error            = str(e))
            for row in csv_reader:
                id_from = row[0]
                id_to   = row[2]
                if id_to and id_from:
                    try:
                        getattr(id_mapping[(model_from, id_from)], relation_name).add(id_mapping[(model_to, id_to)])
                        inserted_relations += 1
                    except KeyError as e:
                        # one of the two ids doesn't match a created entity
                        errors.append(
                            WarningKeyUnknown(
                                file             = file_name,
                                line             = csv_reader.line_num,
                                model_from       = model_from,
                                id_from          = id_from,
                                model_to         = model_to,
                                id_to            = id_to,
                                relation_name    = relation_name,
                                error            = str(e)
                            )
                        )
                    except Exception as e:
                        # Error unknown, we break the process to alert the user
                        raise Error(
                            file             = file_name,
                            line             = csv_reader.line_num,
                            model_from       = model_from,
                            id_from          = id_from,
                            model_to         = model_to,
                            id_to            = id_to,
                            relation_name    = relation_name,
                            error            = str(e))
                else:
                    # A key is missing (id_from or id_to) but we don't want to stop the parsing.
                    # Then we store the wrong line to return it to the user.
                    errors.append(
                        WarningInformationIsMissing(
                            file=file_name, row=row, line=csv_reader.line_num, id_to=id_to, id_from=id_from
                        )
                    )

        # Save everything
        saved = 0
        logger.debug("BulkUpload: saving %d objects" % (len(id_mapping)))
        for item in id_mapping.values():
            item.save()
            saved += 1

        return {
            'inserted' : {
                'objects' : saved,
                'links'   : inserted_relations
            },
            "errors" : sorted([{e.__class__.__name__ : str(e.__dict__)} for e in errors])
        }
    except Exception as e:
        import traceback
        logger.error(traceback.format_exc())
        return {
            "errors" : [{e.__class__.__name__ : str(e.__dict__)}]
        }
Exemple #3
0
# -*- coding: utf-8 -*-
from django.core.management.base import BaseCommand, CommandError
from lxml import etree
from app.detective.utils import to_class_name, to_camelcase, to_underscores
import re

# XML namespaces (owl, rdf and rdfs), indexed by their usual prefix
namespaces = dict(
    owl='http://www.w3.org/2002/07/owl#',
    rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    rdfs='http://www.w3.org/2000/01/rdf-schema#',
)


def pron(name):
    """ Transform a property name: camelcase it, then convert it to underscores. """
    camelcased = to_camelcase(name)
    return to_underscores(camelcased)

def get(sets, el):
    """
    Get the text of the first child of `sets` whose tag ends with "#}<el>".

    Returns an empty string when `sets` has no `iterchildren` attribute or
    when no child matches.
    """
    # Not an element we can iterate over
    if not hasattr(sets, "iterchildren"):
        return ""
    tag_pattern = '#}%s$' % el
    matching = []
    for child in sets.iterchildren():
        if re.search(tag_pattern, child.tag):
            matching.append(child)
    if len(matching):
        return matching[0].text
    return ''

def merge(first_list, second_list, field):
    """
    Merge 2 lists using the given field as reference.

    The result is `second_list` followed by the items of `first_list` whose
    `field` value isn't used by any item of `second_list`.
    """
    known_values = [item[field] for item in second_list]
    missing_items = []
    for item in first_list:
        if item[field] in known_values:
            continue
        missing_items.append(item)
    return second_list + missing_items


class Command(BaseCommand):
    # Command built on Django's BaseCommand.
    # NOTE(review): only the `help` text is visible here; the rest of the
    # command (if any) has to be checked in the complete file.
    help = "Parse the given OWL file to generate its neo4django models."
Exemple #4
0
    def get_model_field(self, desc, model_name):
        """
        Build the field described by `desc` for the model `model_name`.

        `desc` is the description of the field (a dict-like object: it is
        read with `gn`, `in` and `desc[...]`). Rules found in the description
        are registered with `self.add_rule`. For a relationship embedding
        properties (a "fields" key), a dedicated model is also created with
        `self.add_model` and the related rules are registered.

        Returns a (field_name, field_instance) tuple, or (None, None) when the
        description has no name or when its type isn't in `self.JSONTYPES`.
        """
        # Get the name tag
        field_name = gn(desc, 'name')
        # We didn't found a name: check it before converting it, the
        # conversion isn't expected to receive None
        # @TODO handle that with a custom exception
        if field_name is None: return None, None
        # Convert the name to a python readable format
        field_name = to_underscores(field_name)
        if field_name is None: return None, None
        # The field can contains rules
        for name, value in gn(desc, 'rules', dict()).items():
            self.add_rule(model_name, field_name, name, value)
        # All field's options: defaults overridden by field's special properties
        field_opts = dict(null=True)
        field_opts.update(self.get_field_specials(desc))
        if field_name == "name":
            field_opts["indexed"] = True
        # It's a relationship!
        if "related_model" in desc and desc["related_model"] is not None:
            field_opts["target"] = to_class_name(desc["related_model"].lower())
            field_target = to_class_name(field_opts["target"])
            # Remove "has_" from the begining of the name
            if field_name.startswith("has_"): field_name = field_name[4:]
            # Build rel_type using the name and the class name
            field_opts["rel_type"] = "%s_has_%s+"  % ( to_underscores(model_name), field_name)
            field_type = "relationship"

            # Add a related name
            if "related_name" in field_opts and field_opts["related_name"] is not None:
                # Convert related_name to the same format
                related_name = to_underscores(field_opts["related_name"])
                field_opts["related_name"] = related_name
            else:
                related_name = field_opts["related_name"] = None

            # This relationship can embed properties.
            # Properties are directly bound to the relationship field.
            if "fields" in desc:
                # Fields related to the new model. Work on a copy: the two
                # technical fields added below must not be appended to the
                # list owned by the description.
                composite_fields = list(gn(desc, 'fields', []))
                # Create a field to reference the relationship ID
                composite_fields.append(dict(
                    type="int",
                    name="_relationship",
                    help_text="The ID of the relationship to describe.",
                    indexed=True,
                    rules=dict(is_editable=False)
                ))
                # Create a field to reference both ends of the relationship
                composite_fields.append(dict(
                    type="intarray",
                    name="_endnodes",
                    help_text="IDs of the relationship's extremities.",
                    indexed=True,
                    rules=dict(is_editable=False)
                ))
                # Name of the new model
                composite_name = "%s %s %s Properties" % (
                    model_name,
                    field_name,
                    field_target
                )
                # Create a Model with the relation
                composite_model = {
                    "name": composite_name,
                    "fields": composite_fields
                }
                # Create the new model!
                model = self.add_model(composite_model)
                # We have to register (for later) a rule that says
                # explicitely that this field has properties
                self.add_rule(model_name, field_name, "has_properties", True)
                self.add_rule(model_name, field_name, "through", model)
                # This relationship is visible in the target model
                if related_name is not None:
                    # Add another rule for the reverse relationship
                    self.add_rule(field_target, related_name, "has_properties", True)
                    self.add_rule(field_target, related_name, "through", model)
                # Add a rules to make this "special" model
                self.modelrules.model(model).add(is_relationship_properties=True,
                                                 relationship_source=model_name,
                                                 relationship_target=field_target,
                                                 is_searchable=False)
        # It's a literal value
        else:
            # Picks one of the two tags type
            field_type = desc["type"].lower()
            # Remove "field" suffix
            if field_type.endswith("field"): field_type = field_type[0:-5]
        # Skip unkown type
        # @TODO raise custom exception
        if field_type not in self.JSONTYPES: return None, None
        # Convert type to neo4django property type
        field_type = self.JSONTYPES[field_type]
        # Add a default value for boolean properties
        if field_type == 'BooleanProperty' and 'default' not in field_opts:
            field_opts['default'] = False
        # Return an instance of the field
        return field_name, getattr(models, field_type)(**field_opts)
Exemple #5
0
def process_bulk_parsing_and_save_as_model(topic, files, start_time=None):
    """
    Job which parses uploaded CSV content, validates it and saves it as models.

    The files are first split in two groups by looking at their header:
    a file whose first and third columns both end with "_id" is a
    relationship file, any other file is an entities file for the model
    named by its first column (without the "_id" suffix). Entities are
    created first, relationships second, and every created object is
    finally saved.

    Arguments:
    topic      -- object providing `title`, `get_models()` and `get_rules()`
    files      -- non-empty tuple/list of (file_name, content) pairs; the
                  content is handed to `utils.open_csv`
    start_time -- timestamp used to compute the reported duration
                  (defaults to the current time)

    Returns a dict. On success it holds 'duration', 'inserted' (counts of
    'objects' and 'links') and 'errors' (the non-fatal warnings collected
    while parsing, as {class name: attributes} dicts). When a fatal error
    stops the process, it only holds 'errors' with a single
    {class name: message} entry.
    """

    # `is not None` so that an explicit falsy timestamp is still honoured
    start_time = start_time if start_time is not None else time.time()
    entities = {}
    relations = []
    errors = []
    id_mapping = {}
    nb_lines = 0
    file_reading_progression = 0
    job = get_current_job()

    # Define Exceptions
    class Error(Exception):
        """
        Generic Custom Exception for this endpoint.
        Include the topic.
        """
        def __init__(self, **kwargs):
            """ set the topic and add all the parameters as attributes """
            self.topic = topic.title
            for key, value in kwargs.items():
                setattr(self, key, value)

        def __str__(self):
            # __str__ must return a string: returning the dict itself makes
            # str(error) raise a TypeError
            return str(self.__dict__)

    class WarningCastingValueFail(Error):
        pass

    class WarningValidationError(Error):
        pass

    class WarningKeyUnknown(Error):
        pass

    class WarningInformationIsMissing(Error):
        pass

    class AttributeDoesntExist(Error):
        pass

    class WrongCSVSyntax(Error):
        pass

    class ColumnUnknow(Error):
        pass

    class ModelDoesntExist(Error):
        pass

    class RelationDoesntExist(Error):
        pass

    try:
        # These failures are reported to the caller through the generic
        # handler at the bottom (as an "AssertionError" entry)
        assert type(files) in (tuple, list), type(files)
        assert len(files) > 0, "You need to upload at least one file."
        assert type(files[0]) in (tuple, list)
        assert len(files[0]) == 2

        # retrieve all models in current topic
        all_models = dict(
            (model.__name__, model) for model in topic.get_models())
        # iterate over all files and dissociate entities .csv from relations .csv
        for uploaded in files:
            # Lists are accepted like tuples, consistently with the checks above
            if isinstance(uploaded, (tuple, list)):
                file_name = uploaded[0]
                file_content = uploaded[1]
            else:
                raise Exception(
                    "Each uploaded file must be a (file_name, content) pair.")
            csv_reader = utils.open_csv(file_content)
            header = csv_reader.next()
            assert len(header) > 1, (
                "{file_name} header should have at least 2 columns".format(
                    file_name=file_name))
            assert header[0].endswith(
                "_id"
            ), "{file_name} : First column should begin with a header like <model_name>_id. Actually {first_col}".format(
                file_name=file_name, first_col=header[0])
            if len(header) >= 3 and header[0].endswith(
                    "_id") and header[2].endswith("_id"):
                # this is a relationship file
                relations.append((file_name, file_content))
            else:
                # this is an entities file
                model_name = utils.to_class_name(header[0].replace("_id", ""))
                if model_name in all_models:
                    entities[model_name] = (file_name, file_content)
                else:
                    raise ModelDoesntExist(model=model_name,
                                           file=file_name,
                                           models_availables=all_models.keys())
            nb_lines += len(file_content) - 1  # -1 removes headers

        # first iterate over entities
        logger.debug("BulkUpload: creating entities")
        for entity, (file_name, file_content) in entities.items():
            csv_reader = utils.open_csv(file_content)
            header = csv_reader.next()
            # must check that all columns map to an existing model field
            fields = utils.get_model_fields(all_models[entity])
            fields_types = {}
            for field in fields:
                fields_types[field['name']] = field['type']
            field_names = [field['name'] for field in fields]
            columns = []
            for column in header[1:]:
                column = utils.to_underscores(column)
                if column.endswith("__sources__"):
                    # "<field>__sources__" columns hold the references of <field>
                    column_type = "__sources__"
                    column = column[:-len("__sources__")]
                else:
                    column_type = fields_types.get(column, None)
                # An unknown column stops the whole process
                if column not in field_names:
                    raise ColumnUnknow(file=file_name,
                                       column=column,
                                       model=entity,
                                       attributes_available=field_names)
                columns.append((column, column_type))
            # here, we know that all columns are valid
            for row in csv_reader:
                data = {}
                sources = {}
                entity_id = row[0]
                for i, (column, column_type) in enumerate(columns):
                    value = str(row[i + 1]).decode('utf-8')
                    # cast value if needed
                    if value:
                        try:
                            if "Integer" in column_type:
                                value = int(value)
                            # TODO: cast float
                            if "Date" in column_type:
                                # only the first three numbers (year, month,
                                # day as written in the cell) are used
                                value = datetime.datetime(*map(
                                    int,
                                    re.split(r'[^\d]', value)[:3])).replace(
                                        tzinfo=utc)

                        except Exception as e:
                            errors.append(
                                WarningCastingValueFail(
                                    column_name=column,
                                    value=value,
                                    type=column_type,
                                    data=data,
                                    model=entity,
                                    file=file_name,
                                    line=csv_reader.line_num,
                                    error=str(e)))
                            # a value of this row is invalid: skip the row
                            break
                        if column_type == "__sources__":
                            sources[column] = value
                        else:
                            data[column] = value
                else:
                    # every value was cast without error: instantiate a model
                    try:
                        item = all_models[entity].objects.create(**data)
                        # map the object with the ID defined in the .csv
                        id_mapping[(entity, entity_id)] = item
                        # create sources
                        for sourced_field, reference in sources.items():
                            for ref in reference.split("||"):
                                FieldSource.objects.create(
                                    individual=item.id,
                                    field=sourced_field,
                                    reference=ref)
                        # FIXME: job can be accessed somewhere else (i.e detective/topics/common/jobs.py:JobResource)
                        # Concurrent access are not secure here.
                        # For now we refresh the job just before saving it.
                        file_reading_progression += 1
                        if job:
                            job.refresh()
                            job.meta["file_reading_progression"] = (
                                float(file_reading_progression) /
                                float(nb_lines)) * 100
                            job.meta["file_reading"] = file_name
                            job.save()
                    except Exception as e:
                        errors.append(
                            WarningValidationError(
                                data=data,
                                model=entity,
                                file=file_name,
                                line=csv_reader.line_num,
                                error=str(e)))

        inserted_relations = 0
        # then iterate over relations
        logger.debug("BulkUpload: creating relations")
        for file_name, file_content in relations:
            # create a csv reader
            csv_reader = utils.open_csv(file_content)
            csv_header = csv_reader.next()
            relation_name = utils.to_underscores(csv_header[1])
            model_from = utils.to_class_name(csv_header[0].replace("_id", ""))
            model_to = utils.to_class_name(csv_header[2].replace("_id", ""))
            # every column after the third one is a property of the relation
            properties_name = csv_header[3:]
            # retrieve ModelProperties from related model
            ModelProperties = topic.get_rules().model(
                all_models[model_from]).field(relation_name).get("through")
            # check that the relation actually exists between the two objects
            try:
                getattr(all_models[model_from], relation_name)
            except Exception as e:
                raise RelationDoesntExist(
                    file=file_name,
                    model_from=model_from,
                    model_to=model_to,
                    relation_name=relation_name,
                    fields_available=[
                        field['name'] for field in utils.iterate_model_fields(
                            all_models[model_from])
                    ],
                    error=str(e))
            for row in csv_reader:
                id_from = row[0]
                id_to = row[2]
                properties = [p.decode('utf-8') for p in row[3:]]
                if id_to and id_from:
                    try:
                        instance_from = id_mapping[(model_from, id_from)]
                        instance_to = id_mapping[(model_to, id_to)]
                        getattr(instance_from, relation_name).add(instance_to)
                        # add properties if needed
                        if ModelProperties and properties_name and properties:
                            # save the relationship to create an id
                            instance_from.save()
                            # retrieve this id
                            relation_id = next(
                                rel.id for rel in
                                instance_from.node.relationships.outgoing()
                                if rel.end.id == instance_to.id)
                            # properties of the relationship
                            relation_args = {
                                "_endnodes": [
                                    instance_from.id,
                                    instance_to.id
                                ],
                                "_relationship": relation_id,
                            }
                            # Pairwise the properties with their names
                            relation_args.update(
                                zip(properties_name, properties))
                            try:
                                ModelProperties.objects.create(**relation_args)
                            except TypeError as e:
                                errors.append(
                                    AttributeDoesntExist(
                                        file=file_name,
                                        line=csv_reader.line_num,
                                        model_from=model_from,
                                        id_from=id_from,
                                        model_to=model_to,
                                        id_to=id_to,
                                        relation_args=relation_args,
                                        error=str(e)))
                        # update the job
                        inserted_relations += 1
                        file_reading_progression += 1
                        if job:
                            job.refresh()
                            job.meta["file_reading_progression"] = (
                                float(file_reading_progression) /
                                float(nb_lines)) * 100
                            job.meta["file_reading"] = file_name
                            job.save()
                    except KeyError as e:
                        # One of the IDs doesn't match any created entity:
                        # keep a warning and go on with the next line
                        errors.append(
                            WarningKeyUnknown(file=file_name,
                                              line=csv_reader.line_num,
                                              model_from=model_from,
                                              id_from=id_from,
                                              model_to=model_to,
                                              id_to=id_to,
                                              relation_name=relation_name,
                                              error=str(e)))
                    except Exception as e:
                        # Error unknown, we break the process to alert the user
                        raise Error(file=file_name,
                                    line=csv_reader.line_num,
                                    model_from=model_from,
                                    id_from=id_from,
                                    model_to=model_to,
                                    id_to=id_to,
                                    relation_name=relation_name,
                                    error=str(e))
                else:
                    # A key is missing (id_from or id_to) but we don't want to stop the parsing.
                    # Then we store the wrong line to return it to the user.
                    errors.append(
                        WarningInformationIsMissing(file=file_name,
                                                    row=row,
                                                    line=csv_reader.line_num,
                                                    id_to=id_to,
                                                    id_from=id_from))

        # Save everything
        saved = 0
        logger.debug("BulkUpload: saving %d objects", len(id_mapping))
        if job:
            job.refresh()
            job.meta["objects_to_save"] = len(id_mapping)
            job.save()
        for item in id_mapping.values():
            item.save()
            saved += 1
            if job:
                job.refresh()
                job.meta["saving_progression"] = saved
                job.save()
        if job:
            job.refresh()
        # Jobs flagged with "track" notify their user by email once done
        if job and "track" in job.meta:
            from django.core.mail import send_mail
            user = User.objects.get(pk=job.meta["user"])
            send_mail("upload finished", "your upload just finished",
                      settings.DEFAULT_FROM_EMAIL, (user.email, ))
        return {
            'duration': (time.time() - start_time),
            'inserted': {
                'objects': saved,
                'links': inserted_relations
            },
            "errors": sorted(
                [{e.__class__.__name__: str(e.__dict__)} for e in errors])
        }

    except Exception as e:
        # Fatal error: log the traceback and report it as the only error
        import traceback
        logger.error(traceback.format_exc())
        if e.__dict__:
            # custom errors carry their details as attributes
            message = str(e.__dict__)
        else:
            message = e.message
        return {"errors": [{e.__class__.__name__: message}]}
        def reduce_origin(rows):
            """
            Reduce raw path rows into a (nodes, links) pair.

            rows -- sequence of mappings holding a 'nodes' list of node IDs
                    and a 'relations' list; the relation at index k is read
                    as linking nodes[k] and nodes[k + 1].

            Returns ([], []) when there is no row. Otherwise returns a tuple
            made of {node_id: node_data} and
            {source_id: {relation_name: [target_ids]}}. For a source holding
            at least `aggregation_threshold` links, only that many targets
            are kept under their relation name and all the other ones are
            gathered under the '_AGGREGATION_' key.
            """
            # No nodes, no links
            if len(rows) == 0: return ([], [],)
            # Initialize structures
            all_nodes = dict()
            # Use defaultdict() to create somewhat of an autovivificating list
            # We want to build a structure of the form:
            # { source_id : { relation_name : [ target_ids ] } }
            # Must use a set() instead of list() to avoid checking duplicates but it screw up json.dumps()
            all_links = defaultdict(lambda: dict(__count=0, __relations=defaultdict(list)))
            # Collect every node ID in a single pass
            IDs = set(node_id for row in rows for node_id in row['nodes'])

            # Get all entities from their IDs
            query = """
                START root = node({0})
                MATCH (root)-[:`<<INSTANCE>>`]-(type)
                WHERE type.app_label = '{1}'
                AND HAS(root.name)
                RETURN ID(root) as ID, root, type
            """.format(','.join([str(ID) for ID in IDs]), get_model_topic(self.get_model()))
            all_raw_nodes = connection.cypher(query).to_dicts()
            for row in all_raw_nodes:
                data = row['root']['data']
                # Twist some data in the entity: drop the private keys.
                # Iterate over a copy of the keys since we delete entries.
                for key in list(data.keys()):
                    if key.startswith('_'): del data[key]
                data['_type'] = row['type']['data']['model_name']
                data['_id'] = row['ID']

                all_nodes[row['ID']] = data

            for row in rows:
                nodes = row['nodes']
                # enumerate() keeps the index aligned with the relation even
                # when an iteration is skipped
                for i, relation in enumerate(row['relations']):
                    try:
                        if all_nodes[nodes[i]] is None or all_nodes[nodes[i + 1]] is None: continue
                        (a, b) = (nodes[i], nodes[i + 1])
                        # The relation name starts with the type of its source:
                        # when it doesn't match the first node, swap the nodes
                        if re.search('^'+to_underscores(all_nodes[nodes[i]]['_type']), relation) is None:
                            (a, b) = (nodes[i + 1], nodes[i])
                        if not b in all_links[a]['__relations'][relation]:
                            all_links[a]['__count'] += 1
                            all_links[a]['__relations'][relation].append(b)
                    # A node wasn't returned by the query above: ignore the link
                    except KeyError: pass

            # Sort and aggregate nodes when we're over the threshold
            for node in all_links.keys():
                shortcut = all_links[node]['__relations']
                if all_links[node]['__count'] >= aggregation_threshold:
                    remaining = all_links[node]['__relations']
                    # Relations with the fewest targets are served first
                    sorted_relations = sorted([(len(remaining[rel]), rel) for rel in remaining],
                                              key=lambda to_sort: to_sort[0])
                    shortcut = defaultdict(list)
                    i = 0
                    # Pick the targets one relation after the other until we
                    # reach the threshold
                    while i < aggregation_threshold:
                        for _, rel_name in sorted_relations:
                            try:
                                node_id = remaining[rel_name].pop()
                                shortcut[rel_name].append(node_id)
                                i += 1
                            except IndexError:
                                # Must except IndexError if we .pop() on an empty list
                                pass
                            if i >= aggregation_threshold: break
                    # Everything that wasn't picked is aggregated
                    shortcut['_AGGREGATION_'] = [left_id
                                                 for targets in remaining.values()
                                                 for left_id in targets]
                all_links[node] = shortcut

            return (all_nodes, all_links)
Exemple #7
0
def parse(ontology, module='', app_label=None):
    """
    Build one node model per owl:Class found in an ontology.

    Arguments:
    ontology  -- either a FieldFile whose content is the XML document, or
                 any object whose str() is a source accepted by ET.parse
    module    -- forwarded to create_node_model; its last dotted component
                 is the default app_label
    app_label -- forwarded to create_node_model (defaults to the last
                 component of `module`)

    Returns a dict mapping each class name to the model returned by
    create_node_model. Raises KeyError when a literal field uses a type
    that is missing from OWLTYPES.
    """
    if app_label is None:
        app_label = module.split(".")[-1]
    # Get the root of the xml; isinstance() also accepts FieldFile subclasses
    if isinstance(ontology, FieldFile):
        root = ET.fromstring(ontology.read())
    else:
        root = ET.parse(str(ontology)).getroot()
    # Where record the new classes
    classes = dict()
    # List classes
    for clss in root.findall("owl:Class", namespaces=NAMESPACES):
        # Extract the class name and format it to be PEP compliant
        class_name = to_class_name(attr(clss, "rdf:about", "").split('#')[-1])
        # Get all special attributes for this class
        class_specials = get_class_specials(clss)
        # Every class fields are recorded into an objects
        class_fields = {
            # Additional informations
            "_description": class_specials["help_text"],
            "_topic": class_specials["scope"],
            # Default fields
            "_author": models.IntArrayProperty(
                null=True,
                help_text=u'People that edited this entity.',
                verbose_name=u'author'),
            "_status": models.IntegerProperty(
                null=True,
                help_text=u'',
                verbose_name=u'status'),
        }
        # Pick some options (Meta class)
        class_options = {}
        for f in ["verbose_name", "verbose_name_plural"]:
            if class_specials[f] is not None:
                class_options[f] = class_specials[f]
        # List all fields
        for field in clss.findall("rdfs:subClassOf//owl:Restriction",
                                  namespaces=NAMESPACES):
            # Get the name tag
            name_tag = field.find("owl:onProperty", namespaces=NAMESPACES)
            # We didn't found a name
            if name_tag is None:
                continue
            # Get the complete field name using the rdf:resource attribute
            field_uri = attr(name_tag, "rdf:resource")
            # Default options, overridden by the field's special properties
            field_opts = dict(null=True)
            field_opts.update(get_field_specials(root, field_uri))
            # Convert the name to a python readable format
            field_name = to_underscores(field_uri.split("#")[-1])
            if field_opts.get("related_name") is not None:
                # Convert related_name to the same format
                field_opts["related_name"] = to_underscores(
                    field_opts["related_name"])
            # It might be a relationship
            on_class = field.find("owl:onClass", namespaces=NAMESPACES)
            # It's a relationship!
            if on_class is not None:
                field_opts["target"] = to_class_name(
                    attr(on_class, "rdf:resource").split("#")[-1])
                # Remove "has_" from the begining of the name
                if field_name.startswith("has_"):
                    field_name = field_name[4:]
                # Build rel_type using the name and the class name
                field_opts["rel_type"] = "%s_has_%s+" % (
                    to_underscores(class_name), field_name)
                field_type = "Relationship"
            else:
                # Get the type tag
                data_range = field.find("owl:onDataRange",
                                        namespaces=NAMESPACES)
                # It might be another tag
                values_from = field.find("owl:someValuesFrom",
                                         namespaces=NAMESPACES)
                # Picks one of the two tags type
                type_tag = data_range if data_range is not None else values_from
                # It might be nothing!
                if type_tag is None:
                    continue
                # Convert the type to a python readable format
                field_type = OWLTYPES[attr(type_tag,
                                           "rdf:resource").split("#")[-1]]
            # Record the field
            class_fields[field_name] = getattr(models,
                                               field_type)(**field_opts)
        # Record the class with this fields
        classes[class_name] = create_node_model(class_name,
                                                class_fields,
                                                app_label=app_label,
                                                options=class_options,
                                                module=module)

        # Prevent a bug with select_related when using neo4django and virtual models
        if not hasattr(classes[class_name]._meta, '_relationships'):
            classes[class_name]._meta._relationships = {}
    return classes
Exemple #8
0
def parse(ontology, module='', app_label=None):
    """
    Build one node model per owl:Class found in an ontology.

    Arguments:
    ontology  -- either a FieldFile whose content is the XML document, or
                 any object whose str() is a source accepted by ET.parse
    module    -- forwarded to create_node_model; its last dotted component
                 is the default app_label
    app_label -- forwarded to create_node_model (defaults to the last
                 component of `module`)

    Returns a dict mapping each class name to the model returned by
    create_node_model. Raises KeyError when a literal field uses a type
    that is missing from OWLTYPES.
    """
    if app_label is None:
        app_label = module.split(".")[-1]
    # Get the root of the xml; isinstance() also accepts FieldFile subclasses
    if isinstance(ontology, FieldFile):
        root = ET.fromstring(ontology.read())
    else:
        root = ET.parse(str(ontology)).getroot()
    # Where record the new classes
    classes = dict()
    # List classes
    for clss in root.findall("owl:Class", namespaces=NAMESPACES):
        # Extract the class name and format it to be PEP compliant
        class_name = to_class_name(attr(clss, "rdf:about", "").split('#')[-1])
        # Get all special attributes for this class
        class_specials = get_class_specials(clss)
        # Every class fields are recorded into an objects
        class_fields = {
            # Additional informations
            "_description": class_specials["help_text"],
            "_topic"      : class_specials["scope"],
            # Default fields
            "_author": models.IntArrayProperty(null=True, help_text=u'People that edited this entity.', verbose_name=u'author'),
            "_status": models.IntegerProperty(null=True, help_text=u'', verbose_name=u'status')
        }
        # Pick some options (Meta class)
        class_options = {}
        for f in ["verbose_name", "verbose_name_plural"]:
            if class_specials[f] is not None:
                class_options[f] = class_specials[f]
        # List all fields
        for field in clss.findall("rdfs:subClassOf//owl:Restriction", namespaces=NAMESPACES):
            # Get the name tag
            name_tag = field.find("owl:onProperty", namespaces=NAMESPACES)
            # We didn't found a name
            if name_tag is None: continue
            # Get the complete field name using the rdf:resource attribute
            field_uri = attr(name_tag, "rdf:resource")
            # Default options, overridden by the field's special properties
            field_opts = dict(null=True)
            field_opts.update(get_field_specials(root, field_uri))
            # Convert the name to a python readable format
            field_name = to_underscores(field_uri.split("#")[-1])
            if field_opts.get("related_name") is not None:
                # Convert related_name to the same format
                field_opts["related_name"] = to_underscores(field_opts["related_name"])
            # It might be a relationship
            on_class = field.find("owl:onClass", namespaces=NAMESPACES)
            # It's a relationship!
            if on_class is not None:
                field_opts["target"] = to_class_name(attr(on_class, "rdf:resource").split("#")[-1])
                # Remove "has_" from the begining of the name
                if field_name.startswith("has_"): field_name = field_name[4:]
                # Build rel_type using the name and the class name
                field_opts["rel_type"] = "%s_has_%s+" % (to_underscores(class_name), field_name)
                field_type = "Relationship"
            else:
                # Get the type tag
                data_range = field.find("owl:onDataRange", namespaces=NAMESPACES)
                # It might be another tag
                values_from = field.find("owl:someValuesFrom", namespaces=NAMESPACES)
                # Picks one of the two tags type
                type_tag = data_range if data_range is not None else values_from
                # It might be nothing!
                if type_tag is None: continue
                # Convert the type to a python readable format
                field_type = OWLTYPES[attr(type_tag, "rdf:resource").split("#")[-1]]
            # Record the field
            class_fields[field_name] = getattr(models, field_type)(**field_opts)
        # Record the class with this fields
        classes[class_name] = create_node_model(class_name, class_fields, app_label=app_label, options=class_options, module=module)

        # Prevent a bug with select_related when using neo4django and virtual models
        if not hasattr(classes[class_name]._meta, '_relationships'):
            classes[class_name]._meta._relationships = {}
    return classes