def process_bulk_parsing_and_save_as_model(topic, files, start_time=None):
    """ Job which parses uploaded content, validates and saves it as models """
    start_time = start_time if start_time is not None else time.time()
    entities = {}
    relations = []
    errors = []
    id_mapping = {}
    nb_lines = 0
    file_reading_progression = 0
    job = get_current_job()

    # Define Exceptions
    class Error(Exception):
        """ Generic custom exception for this endpoint. Includes the topic. """
        def __init__(self, **kwargs):
            """ Set the topic and add all the parameters as attributes """
            self.topic = topic.title
            for key, value in kwargs.items():
                setattr(self, key, value)

        def __str__(self):
            return str(self.__dict__)

    class WarningCastingValueFail(Error): pass
    class WarningValidationError(Error): pass
    class WarningKeyUnknown(Error): pass
    class WarningInformationIsMissing(Error): pass
    class AttributeDoesntExist(Error): pass
    class WrongCSVSyntax(Error): pass
    class ColumnUnknow(Error): pass
    class ModelDoesntExist(Error): pass
    class RelationDoesntExist(Error): pass

    try:
        assert type(files) in (tuple, list), type(files)
        assert len(files) > 0, "You need to upload at least one file."
        assert type(files[0]) in (tuple, list)
        assert len(files[0]) == 2
        # retrieve all models in the current topic
        all_models = dict((model.__name__, model) for model in topic.get_models())
        # iterate over all files and dissociate entity .csv from relation .csv
        for file in files:
            if type(file) is tuple:
                file_name = file[0]
                file = file[1]
            else:
                raise Exception("Files must be passed as (name, content) tuples.")
            csv_reader = utils.open_csv(file)
            header = csv_reader.next()
            assert len(header) > 1, "{file_name} header should have at least 2 columns".format(file_name=file_name)
            assert header[0].endswith("_id"), (
                "{file_name}: first column should have a header like <model_name>_id. Actually {first_col}"
            ).format(file_name=file_name, first_col=header[0])
            if len(header) >= 3 and header[0].endswith("_id") and header[2].endswith("_id"):
                # this is a relationship file
                relations.append((file_name, file))
            else:
                # this is an entities file
                model_name = utils.to_class_name(header[0].replace("_id", ""))
                if model_name in all_models.keys():
                    entities[model_name] = (file_name, file)
                else:
                    raise ModelDoesntExist(model=model_name,
                                           file=file_name,
                                           models_availables=all_models.keys())
            nb_lines += len(file) - 1  # -1 removes the header

        # first iterate over entities
        logger.debug("BulkUpload: creating entities")
        for entity, (file_name, file) in entities.items():
            csv_reader = utils.open_csv(file)
            header = csv_reader.next()
            # must check that all columns map to an existing model field
            fields = utils.get_model_fields(all_models[entity])
            fields_types = {}
            for field in fields:
                fields_types[field['name']] = field['type']
            field_names = [field['name'] for field in fields]
            columns = []
            for column in header[1:]:
                column = utils.to_underscores(column)
                if not column in field_names and not column.endswith("__sources__"):
                    raise ColumnUnknow(file=file_name,
                                       column=column,
                                       model=entity,
                                       attributes_available=field_names)
                if column.endswith("__sources__"):
                    column_type = "__sources__"
                    column = column[:-len("__sources__")]
                    if not column in field_names:
                        raise ColumnUnknow(file=file_name,
                                           column=column,
                                           model=entity,
                                           attributes_available=field_names)
                else:
                    column_type = fields_types.get(column, None)
                columns.append((column, column_type))
            else:
                # here, we know that all columns are valid
                for row in csv_reader:
                    data = {}
                    sources = {}
                    entity_id = row[0]
                    for i, (column, column_type) in enumerate(columns):
                        value = str(row[i + 1]).decode('utf-8')
                        # cast the value if needed
                        if value:
                            try:
                                if "Integer" in column_type:
                                    value = int(value)
                                # TODO: cast float
                                if "Date" in column_type:
                                    value = datetime.datetime(*map(int, re.split(r'[^\d]', value)[:3])).replace(tzinfo=utc)
                            except Exception as e:
                                e = WarningCastingValueFail(column_name=column,
                                                            value=value,
                                                            type=column_type,
                                                            data=data,
                                                            model=entity,
                                                            file=file_name,
                                                            line=csv_reader.line_num,
                                                            error=str(e))
                                errors.append(e)
                                break
                            if column_type == "__sources__":
                                sources[column] = value
                            else:
                                data[column] = value
                    else:
                        # instantiate a model
                        try:
                            item = all_models[entity].objects.create(**data)
                            # map the object to the ID defined in the .csv
                            id_mapping[(entity, entity_id)] = item
                            # create sources
                            for sourced_field, reference in sources.items():
                                for ref in reference.split("||"):
                                    FieldSource.objects.create(individual=item.id,
                                                               field=sourced_field,
                                                               reference=ref)
                            # FIXME: the job can be accessed somewhere else
                            # (i.e. detective/topics/common/jobs.py:JobResource).
                            # Concurrent accesses are not safe here.
                            # For now we refresh the job just before saving it.
                            file_reading_progression += 1
                            if job:
                                job.refresh()
                                job.meta["file_reading_progression"] = (float(file_reading_progression) / float(nb_lines)) * 100
                                job.meta["file_reading"] = file_name
                                job.save()
                        except Exception as e:
                            errors.append(
                                WarningValidationError(data=data,
                                                       model=entity,
                                                       file=file_name,
                                                       line=csv_reader.line_num,
                                                       error=str(e)))

        inserted_relations = 0
        # then iterate over relations
        logger.debug("BulkUpload: creating relations")
        for file_name, file in relations:
            # create a csv reader
            csv_reader = utils.open_csv(file)
            csv_header = csv_reader.next()
            relation_name = utils.to_underscores(csv_header[1])
            model_from = utils.to_class_name(csv_header[0].replace("_id", ""))
            model_to = utils.to_class_name(csv_header[2].replace("_id", ""))
            properties_name = csv_header[3:]
            # retrieve ModelProperties from the related model
            ModelProperties = topic.get_rules().model(all_models[model_from]).field(relation_name).get("through")
            # check that the relation actually exists between the two objects
            try:
                getattr(all_models[model_from], relation_name)
            except Exception as e:
                raise RelationDoesntExist(
                    file=file_name,
                    model_from=model_from,
                    model_to=model_to,
                    relation_name=relation_name,
                    fields_available=[field['name'] for field in utils.iterate_model_fields(all_models[model_from])],
                    error=str(e))
            for row in csv_reader:
                id_from = row[0]
                id_to = row[2]
                properties = [p.decode('utf-8') for p in row[3:]]
                if id_to and id_from:
                    try:
                        instance_from = id_mapping[(model_from, id_from)]
                        instance_to = id_mapping[(model_to, id_to)]
                        getattr(instance_from, relation_name).add(instance_to)
                        # add properties if needed
                        if ModelProperties and properties_name and properties:
                            # save the relationship to create an id
                            instance_from.save()
                            # retrieve this id
                            relation_id = next(rel.id
                                               for rel in instance_from.node.relationships.outgoing()
                                               if rel.end.id == instance_to.id)
                            # properties of the relationship
                            relation_args = {
                                "_endnodes": [id_mapping[(model_from, id_from)].id, instance_to.id],
                                "_relationship": relation_id,
                            }
                            # pair the properties with their names
                            relation_args.update(zip(properties_name, properties))
                            try:
                                ModelProperties.objects.create(**relation_args)
                            except TypeError as e:
                                errors.append(
                                    AttributeDoesntExist(file=file_name,
                                                         line=csv_reader.line_num,
                                                         model_from=model_from,
                                                         id_from=id_from,
                                                         model_to=model_to,
                                                         id_to=id_to,
                                                         relation_args=relation_args,
                                                         error=str(e)))
                        # update the job
                        inserted_relations += 1
                        file_reading_progression += 1
                        if job:
                            job.refresh()
                            job.meta["file_reading_progression"] = (float(file_reading_progression) / float(nb_lines)) * 100
                            job.meta["file_reading"] = file_name
                            job.save()
                    except KeyError as e:
                        errors.append(
                            WarningKeyUnknown(file=file_name,
                                              line=csv_reader.line_num,
                                              model_from=model_from,
                                              id_from=id_from,
                                              model_to=model_to,
                                              id_to=id_to,
                                              relation_name=relation_name,
                                              error=str(e)))
                    except Exception as e:
                        # Unknown error: we stop the process to alert the user
                        raise Error(file=file_name,
                                    line=csv_reader.line_num,
                                    model_from=model_from,
                                    id_from=id_from,
                                    model_to=model_to,
                                    id_to=id_to,
                                    relation_name=relation_name,
                                    error=str(e))
                else:
                    # A key is missing (id_from or id_to) but we don't want to stop the parsing,
                    # so we store the wrong line to return it to the user.
                    errors.append(
                        WarningInformationIsMissing(file=file_name,
                                                    row=row,
                                                    line=csv_reader.line_num,
                                                    id_to=id_to,
                                                    id_from=id_from))

        # Save everything
        saved = 0
        logger.debug("BulkUpload: saving %d objects" % (len(id_mapping)))
        if job:
            job.refresh()
            job.meta["objects_to_save"] = len(id_mapping)
            job.save()
        for item in id_mapping.values():
            item.save()
            saved += 1
            if job:
                job.refresh()
                job.meta["saving_progression"] = saved
                job.save()
        if job:
            job.refresh()
        if job and "track" in job.meta:
            from django.core.mail import send_mail
            user = User.objects.get(pk=job.meta["user"])
            send_mail("upload finished", "your upload just finished",
                      settings.DEFAULT_FROM_EMAIL, (user.email,))
        return {
            'duration': (time.time() - start_time),
            'inserted': {
                'objects': saved,
                'links': inserted_relations
            },
            "errors": sorted([dict([(e.__class__.__name__, str(e.__dict__))]) for e in errors])
        }
    except Exception as e:
        import traceback
        logger.error(traceback.format_exc())
        if e.__dict__:
            message = str(e.__dict__)
        else:
            message = e.message
        return {"errors": [{e.__class__.__name__: message}]}
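
# Note (not part of the original module): get_current_job() above comes from the `rq`
# library, so this function is meant to run inside an RQ worker, and the job.meta updates
# are what a polling endpoint reads to report progress. A minimal sketch of how the job
# could be enqueued and tracked; the queue name and connection are illustrative only:
#
#     import redis
#     import rq
#
#     queue = rq.Queue("default", connection=redis.Redis())
#     job = queue.enqueue(process_bulk_parsing_and_save_as_model, topic, files)
#     # later, from another process:
#     job.refresh()
#     print(job.meta.get("file_reading_progression"))
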
def process_parsing(topic, files):
    """ Job which reads the uploaded files, validates and saves them as models """
    entities = {}
    relations = []
    errors = []
    id_mapping = {}

    assert type(files) in (tuple, list)
    assert len(files) > 0
    assert type(files[0]) in (tuple, list)
    assert len(files[0]) == 2

    # Define Exceptions
    class Error(Exception):
        """ Generic custom exception for this endpoint. Includes the topic. """
        def __init__(self, **kwargs):
            """ Set the topic and add all the parameters as attributes """
            self.topic = topic.title
            for key, value in kwargs.items():
                setattr(self, key, value)

        def __str__(self):
            return str(self.__dict__)

    class WarningCastingValueFail(Error): pass
    class WarningValidationError(Error): pass
    class WarningKeyUnknown(Error): pass
    class WarningInformationIsMissing(Error): pass
    class AttributeDoesntExist(Error): pass
    class WrongCSVSyntax(Error): pass
    class ColumnUnknow(Error): pass
    class ModelDoesntExist(Error): pass
    class RelationDoesntExist(Error): pass

    try:
        # retrieve all models in the current topic
        all_models = dict((model.__name__, model) for model in topic.get_models())
        # iterate over all files and dissociate entity .csv from relation .csv
        for file in files:
            if type(file) is tuple:
                file_name = file[0]
                file = file[1]
            elif hasattr(file, "read"):
                file_name = file.name
            else:
                raise Exception("Unsupported file: expected a (name, content) tuple or a file-like object.")
            csv_reader = utils.open_csv(file)
            header = csv_reader.next()
            assert len(header) > 1, "header should have at least 2 columns"
            assert header[0].endswith("_id"), "First column should have a header like <model_name>_id"
            if len(header) >= 3 and header[0].endswith("_id") and header[2].endswith("_id"):
                # this is a relationship file
                relations.append((file_name, file))
            else:
                # this is an entities file
                model_name = utils.to_class_name(header[0].replace("_id", ""))
                if model_name in all_models.keys():
                    entities[model_name] = (file_name, file)
                else:
                    raise ModelDoesntExist(model=model_name,
                                           file=file_name,
                                           models_availables=all_models.keys())

        # first iterate over entities
        logger.debug("BulkUpload: creating entities")
        for entity, (file_name, file) in entities.items():
            csv_reader = utils.open_csv(file)
            header = csv_reader.next()
            # must check that all columns map to an existing model field
            fields = utils.get_model_fields(all_models[entity])
            fields_types = {}
            for field in fields:
                fields_types[field['name']] = field['type']
            field_names = [field['name'] for field in fields]
            columns = []
            for column in header[1:]:
                column = utils.to_underscores(column)
                if column != '':
                    if not column in field_names:
                        raise ColumnUnknow(file=file_name,
                                           column=column,
                                           model=entity,
                                           attributes_available=field_names)
                    column_type = fields_types[column]
                    columns.append((column, column_type))
            else:
                # here, we know that all columns are valid
                for row in csv_reader:
                    data = {}
                    id = row[0]
                    for i, (column, column_type) in enumerate(columns):
                        value = str(row[i + 1]).decode('utf-8')
                        # cast the value if needed
                        if value:
                            try:
                                if "Integer" in column_type:
                                    value = int(value)
                                # TODO: cast float
                                if "Date" in column_type:
                                    value = datetime.datetime(*map(int, re.split(r'[^\d]', value)[:-1])).replace(tzinfo=utc)
                            except Exception as e:
                                e = WarningCastingValueFail(column_name=column,
                                                            value=value,
                                                            type=column_type,
                                                            data=data,
                                                            model=entity,
                                                            file=file_name,
                                                            line=csv_reader.line_num,
                                                            error=str(e))
                                errors.append(e)
                                break
                            data[column] = value
                    else:
                        # instantiate a model
                        try:
                            item = all_models[entity].objects.create(**data)
                            # map the object to the ID defined in the .csv
                            id_mapping[(entity, id)] = item
                        except Exception as e:
                            errors.append(
                                WarningValidationError(data=data,
                                                       model=entity,
                                                       file=file_name,
                                                       line=csv_reader.line_num,
                                                       error=str(e)))

        inserted_relations = 0
        # then iterate over relations
        logger.debug("BulkUpload: creating relations")
        for file_name, file in relations:
            # create a csv reader
            csv_reader = utils.open_csv(file)
            csv_header = csv_reader.next()
            relation_name = utils.to_underscores(csv_header[1])
            model_from = utils.to_class_name(csv_header[0].replace("_id", ""))
            model_to = utils.to_class_name(csv_header[2].replace("_id", ""))
            # check that the relation actually exists between the two objects
            try:
                getattr(all_models[model_from], relation_name)
            except Exception as e:
                raise RelationDoesntExist(
                    file=file_name,
                    model_from=model_from,
                    model_to=model_to,
                    relation_name=relation_name,
                    fields_available=[field['name'] for field in utils.get_model_fields(all_models[model_from])],
                    error=str(e))
            for row in csv_reader:
                id_from = row[0]
                id_to = row[2]
                if id_to and id_from:
                    try:
                        getattr(id_mapping[(model_from, id_from)], relation_name).add(id_mapping[(model_to, id_to)])
                        inserted_relations += 1
                    except KeyError as e:
                        errors.append(
                            WarningKeyUnknown(file=file_name,
                                              line=csv_reader.line_num,
                                              model_from=model_from,
                                              id_from=id_from,
                                              model_to=model_to,
                                              id_to=id_to,
                                              relation_name=relation_name,
                                              error=str(e)))
                    except Exception as e:
                        # Unknown error: we stop the process to alert the user
                        raise Error(file=file_name,
                                    line=csv_reader.line_num,
                                    model_from=model_from,
                                    id_from=id_from,
                                    model_to=model_to,
                                    id_to=id_to,
                                    relation_name=relation_name,
                                    error=str(e))
                else:
                    # A key is missing (id_from or id_to) but we don't want to stop the parsing,
                    # so we store the wrong line to return it to the user.
                    errors.append(
                        WarningInformationIsMissing(file=file_name,
                                                    row=row,
                                                    line=csv_reader.line_num,
                                                    id_to=id_to,
                                                    id_from=id_from))

        # Save everything
        saved = 0
        logger.debug("BulkUpload: saving %d objects" % (len(id_mapping)))
        for item in id_mapping.values():
            item.save()
            saved += 1
        return {
            'inserted': {
                'objects': saved,
                'links': inserted_relations
            },
            "errors": sorted([dict([(e.__class__.__name__, str(e.__dict__))]) for e in errors])
        }
    except Exception as e:
        import traceback
        logger.error(traceback.format_exc())
        return {"errors": [{e.__class__.__name__: str(e.__dict__)}]}
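
# Illustration of the expected input, inferred from the asserts and header checks above
# (file names and models are hypothetical): `files` is a list of (file_name, content)
# pairs, where content is whatever utils.open_csv() accepts (e.g. the file's rows). An
# entity file starts with a "<model_name>_id" column; a relation file has "<from>_id",
# the relation name, then "<to>_id".
#
#     files = [
#         ("organization.csv", organization_rows),  # header: organization_id, name, ...
#         ("person.csv", person_rows),              # header: person_id, name, birth_date, ...
#         ("person_works_for.csv", relation_rows),  # header: person_id, works_for, organization_id
#     ]
#     report = process_parsing(topic, files)
#     print(report["inserted"], report["errors"])
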
# -*- coding: utf-8 -*-
from django.core.management.base import BaseCommand, CommandError
from lxml import etree
from app.detective.utils import to_class_name, to_camelcase, to_underscores
import re

# Define the owl and rdf namespaces
namespaces = {
    'owl': 'http://www.w3.org/2002/07/owl#',
    'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#'
}

# transform a property name
pron = lambda name: to_underscores(to_camelcase(name))

# get a local tag
def get(sets, el):
    if hasattr(sets, "iterchildren"):
        props = [e for e in sets.iterchildren() if re.search('#}%s$' % el, e.tag)]
        return props[0].text if len(props) else ''
    else:
        return ""

# Merge 2 lists and remove duplicates, using the given field as reference
def merge(first_list, second_list, field):
    refs = [x[field] for x in second_list]
    return second_list + [x for x in first_list if x[field] not in refs]

class Command(BaseCommand):
    help = "Parse the given OWL file to generate its neo4django models."
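
# Quick illustration of merge() above (hypothetical data, not from the original file):
# entries from the second list win, and entries from the first list are kept only when
# their reference field is not already present.
#
#     a = [{"name": "person", "order": 1}, {"name": "place", "order": 2}]
#     b = [{"name": "person", "order": 5}]
#     merge(a, b, "name")
#     # -> [{"name": "person", "order": 5}, {"name": "place", "order": 2}]
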
    def get_model_field(self, desc, model_name):
        # All field's options
        field_opts = dict(null=True)
        # Get the name tag
        field_name = gn(desc, 'name')
        # Convert the name to a python readable format
        field_name = to_underscores(field_name)
        # We didn't find a name
        # @TODO handle that with a custom exception
        if field_name is None:
            return None, None
        # The field can contain rules
        for name, value in gn(desc, 'rules', dict()).iteritems():
            self.add_rule(model_name, field_name, name, value)
        # Get the field's special properties
        field_opts = dict(field_opts.items() + self.get_field_specials(desc).items())
        if field_name == "name":
            field_opts["indexed"] = True
        # It's a relationship!
        if "related_model" in desc and desc["related_model"] is not None:
            field_opts["target"] = to_class_name(desc["related_model"].lower())
            field_target = to_class_name(field_opts["target"])
            # Remove "has_" from the beginning of the name
            if field_name.startswith("has_"):
                field_name = field_name[4:]
            # Build rel_type using the name and the class name
            field_opts["rel_type"] = "%s_has_%s+" % (to_underscores(model_name), field_name)
            field_type = "relationship"
            # Add a related name
            if "related_name" in field_opts and field_opts["related_name"] is not None:
                # Convert related_name to the same format
                related_name = field_opts["related_name"]
                related_name = to_underscores(related_name)
                field_opts["related_name"] = related_name
            else:
                related_name = field_opts["related_name"] = None
            # This relationship can embed properties.
            # Properties are directly bound to the relationship field.
            if "fields" in desc:
                # Fields related to the new model
                composite_fields = gn(desc, 'fields', [])
                # Create a field to reference the relationship ID
                composite_fields.append(dict(type="int",
                                             name="_relationship",
                                             help_text="The ID of the relationship to describe.",
                                             indexed=True,
                                             rules=dict(is_editable=False)))
                composite_fields.append(dict(type="intarray",
                                             name="_endnodes",
                                             help_text="IDs of the relationship's extremities.",
                                             indexed=True,
                                             rules=dict(is_editable=False)))
                # Name of the new model
                composite_name = "%s %s %s Properties" % (model_name, field_name, field_target)
                # Create a Model with the relation
                composite_model = {
                    "name": composite_name,
                    "fields": composite_fields
                }
                # Create the new model!
                model = self.add_model(composite_model)
                # We have to register (for later) a rule that says
                # explicitly that this field has properties
                self.add_rule(model_name, field_name, "has_properties", True)
                self.add_rule(model_name, field_name, "through", model)
                # This relationship is visible in the target model
                if related_name is not None:
                    # Add another rule for the reverse relationship
                    self.add_rule(field_target, related_name, "has_properties", True)
                    self.add_rule(field_target, related_name, "through", model)
                # Add rules to mark this "special" model
                self.modelrules.model(model).add(is_relationship_properties=True,
                                                 relationship_source=model_name,
                                                 relationship_target=field_target,
                                                 is_searchable=False)
        # It's a literal value
        else:
            # Pick one of the two tag types
            field_type = desc["type"].lower()
            # Remove the "field" suffix
            if field_type.endswith("field"):
                field_type = field_type[0:-5]
            # Skip unknown types
            # @TODO raise a custom exception
            if not field_type in self.JSONTYPES:
                return None, None
            # Convert the type to a neo4django property type
            field_type = self.JSONTYPES[field_type]
            # Add a default value for boolean properties
            if field_type == 'BooleanProperty' and not 'default' in field_opts.keys():
                field_opts['default'] = False
        # Return an instance of the field
        return field_name, getattr(models, field_type)(**field_opts)
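
    # Hypothetical field description for the method above, inferred from the keys it reads
    # (name, type, rules, related_model, related_name, fields); the real schema comes from
    # the topic's model loader, and gn() is assumed to behave like dict.get():
    #
    #     desc = {
    #         "name": "has activity in country",
    #         "type": "relationship",
    #         "related_model": "country",
    #         "related_name": "activities",
    #         "rules": {"is_editable": True},
    #     }
    #     field_name, field = self.get_model_field(desc, "Organization")
    #     # field_name -> "activity_in_country"; field is the corresponding neo4django
    #     # relationship (or property, for literal types) instance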
def reduce_origin(rows):
    # No nodes, no links
    if len(rows) == 0:
        return ([], [],)
    # Initialize structures
    all_nodes = dict()
    # Use defaultdict() to create somewhat of an autovivificating list.
    # We want to build a structure of the form:
    #     { source_id : { relation_name : [ target_ids ] } }
    # We would have to use a set() instead of a list() to avoid checking duplicates,
    # but that breaks json.dumps()
    all_links = defaultdict(lambda: dict(__count=0, __relations=defaultdict(list)))
    IDs = set(sum([row['nodes'] for row in rows], []))
    # Get all entities from their IDs
    query = """
        START root = node({0})
        MATCH (root)-[:`<<INSTANCE>>`]-(type)
        WHERE type.app_label = '{1}'
        AND HAS(root.name)
        RETURN ID(root) as ID, root, type
    """.format(','.join([str(ID) for ID in IDs]),
               get_model_topic(self.get_model()))
    all_raw_nodes = connection.cypher(query).to_dicts()
    for row in all_raw_nodes:
        # Twist some data in the entity
        for key in row['root']['data'].keys():
            if key[0] == '_':
                del row['root']['data'][key]
        row['root']['data']['_type'] = row['type']['data']['model_name']
        row['root']['data']['_id'] = row['ID']
        all_nodes[row['ID']] = row['root']['data']
    for row in rows:
        nodes = row['nodes']
        i = 0
        for relation in row['relations']:
            try:
                if all_nodes[nodes[i]] is None or all_nodes[nodes[i + 1]] is None:
                    continue
                (a, b) = (nodes[i], nodes[i + 1])
                if re.search('^' + to_underscores(all_nodes[nodes[i]]['_type']), relation) is None:
                    (a, b) = (nodes[i + 1], nodes[i])
                if not b in all_links[a]['__relations'][relation]:
                    all_links[a]['__count'] += 1
                    all_links[a]['__relations'][relation].append(b)
            except KeyError:
                pass
            i += 1
    # Sort and aggregate nodes when we're over the threshold
    for node in all_links.keys():
        shortcut = all_links[node]['__relations']
        if all_links[node]['__count'] >= aggregation_threshold:
            sorted_relations = sorted([(len(shortcut[rel]), rel) for rel in shortcut],
                                      key=lambda to_sort: to_sort[0])
            shortcut = defaultdict(list)
            i = 0
            while i < aggregation_threshold:
                for rel in sorted_relations:
                    try:
                        node_id = all_links[node]['__relations'][rel[1]].pop()
                        shortcut[rel[1]].append(node_id)
                        i += 1
                    except IndexError:
                        # Must catch IndexError when we .pop() on an empty list
                        pass
                    if i >= aggregation_threshold:
                        break
            shortcut['_AGGREGATION_'] = sum(all_links[node]['__relations'].values(), [])
        all_links[node] = shortcut
    return (all_nodes, all_links)
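
# Sketch of the shapes involved, inferred from how the keys are used above (the values are
# hypothetical): each row carries the node IDs of a traversed path plus the relation names
# between consecutive nodes; the function returns the node data keyed by ID and an
# adjacency mapping of relation name to target IDs.
#
#     rows = [
#         {"nodes": [12, 34], "relations": ["organization_has_adviser+"]},
#         {"nodes": [12, 56], "relations": ["organization_has_partner+"]},
#     ]
#     all_nodes, all_links = reduce_origin(rows)
#     # all_nodes -> {12: {'_id': 12, '_type': 'organization', ...}, 34: {...}, 56: {...}}
#     # all_links -> {12: {'organization_has_adviser+': [34], 'organization_has_partner+': [56]}}
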
def parse(ontology, module='', app_label=None):
    app_label = app_label if app_label is not None else module.split(".")[-1]
    # Deduce the path to the ontology
    if type(ontology) is FieldFile:
        raw = ontology.read()
        # Open the ontology file and return the root
        root = ET.fromstring(raw)
    else:
        tree = ET.parse(str(ontology))
        # Get the root of the xml
        root = tree.getroot()
    # Where the new classes are recorded
    classes = dict()
    # List classes
    for clss in root.findall("owl:Class", namespaces=NAMESPACES):
        # Extract the class name
        class_name = attr(clss, "rdf:about", "").split('#')[-1]
        # Format the class name to be PEP compliant
        class_name = to_class_name(class_name)
        # Get all special attributes for this class
        class_specials = get_class_specials(clss)
        # Every class field is recorded into an object
        class_fields = {
            # Additional information
            "_description": class_specials["help_text"],
            "_topic": class_specials["scope"],
            # Default fields
            "_author": models.IntArrayProperty(null=True,
                                               help_text=u'People that edited this entity.',
                                               verbose_name=u'author'),
            "_status": models.IntegerProperty(null=True, help_text=u'', verbose_name=u'status')
        }
        # Pick some options (Meta class)
        class_options = {}
        for f in ["verbose_name", "verbose_name_plural"]:
            if class_specials[f] is not None:
                class_options[f] = class_specials[f]
        # List all fields
        for field in clss.findall("rdfs:subClassOf//owl:Restriction", namespaces=NAMESPACES):
            # All field's options
            field_opts = dict(null=True)
            # Get the name tag
            field_name = field.find("owl:onProperty", namespaces=NAMESPACES)
            # We didn't find a name
            if field_name is None:
                continue
            # Get the complete field name using the rdf:resource attribute
            field_name = attr(field_name, "rdf:resource")
            # Get the field's special properties
            field_opts = dict(field_opts.items() + get_field_specials(root, field_name).items())
            # Convert the name to a python readable format
            field_name = to_underscores(field_name.split("#")[-1])
            if "related_name" in field_opts and field_opts["related_name"] is not None:
                # Convert related_name to the same format
                field_opts["related_name"] = to_underscores(field_opts["related_name"])
            # It might be a relationship
            on_class = field.find("owl:onClass", namespaces=NAMESPACES)
            # It's a relationship!
            if on_class is not None:
                field_opts["target"] = to_class_name(attr(on_class, "rdf:resource").split("#")[-1])
                # Remove "has_" from the beginning of the name
                if field_name.startswith("has_"):
                    field_name = field_name[4:]
                # Build rel_type using the name and the class name
                field_opts["rel_type"] = "%s_has_%s+" % (to_underscores(class_name), field_name)
                field_type = "Relationship"
            else:
                # Get the type tag
                data_range = field.find("owl:onDataRange", namespaces=NAMESPACES)
                # It might be another tag
                values_from = field.find("owl:someValuesFrom", namespaces=NAMESPACES)
                # Pick one of the two tag types
                field_type = data_range if data_range is not None else values_from
                # It might be nothing!
                if field_type is None:
                    continue
                # Convert the type to a python readable format
                field_type = OWLTYPES[attr(field_type, "rdf:resource").split("#")[-1]]
            # Record the field
            class_fields[field_name] = getattr(models, field_type)(**field_opts)
        # Record the class with these fields
        classes[class_name] = create_node_model(class_name,
                                                class_fields,
                                                app_label=app_label,
                                                options=class_options,
                                                module=module)
        # Prevent a bug with select_related when using neo4django and virtual models
        if not hasattr(classes[class_name]._meta, '_relationships'):
            classes[class_name]._meta._relationships = {}
    return classes