def _create_schema_from_graph(self, g):
    """Derive (or fetch) a Schema DB object from an rdflib graph.

    Looks for a subject typed as an ontology/class/property to use as the
    schema URL, then pulls label/description from common vocabularies.
    Returns the Schema (saved when ``self.save_to_db``) or None when no
    schema subject can be found in the graph.
    """
    # identify schema attributes
    # TODO: identify rdf_url from graph instead
    # some cases where the url is not the same as the uri
    get_schema_keys = [OWL.Ontology, RDFS.Class, RDF.Property, None]
    rdf_url = None
    schema_subject = None
    for key in get_schema_keys:
        try:
            # next() raises StopIteration when no matching triple exists;
            # only that case means "try the next key".
            schema_subject, _, _ = next(g.triples((None, RDF.type, key)))
            rdf_url = str(schema_subject)
            break
        except StopIteration:
            pass

    if rdf_url is None:
        return None

    label_keys = [DC.title, RDFS.label, DCTERMS.title]
    description_keys = [DC.description, DCTERMS.description]
    label = "Not available"
    description = "Not available"

    for key in label_keys:
        try:
            rdf_url, _, label = next(g.triples(
                (schema_subject, key, None)))
            break
        except StopIteration:
            pass

    for key in description_keys:
        try:
            rdf_url, _, description = next(
                g.triples((schema_subject, key, None)))
            break
        except StopIteration:
            pass

    # only save if it does not exist already (url is unique)
    try:
        self.schema = Schema.objects.get(url=str(rdf_url))
    except ObjectDoesNotExist:
        self.schema = Schema(label=self.standardize_string(
            label, remove_version=True),
            url=str(rdf_url),
            description=str(description))
        if self.save_to_db:
            self.schema.save()
    return self.schema
def test_identify_from_json_data_strava_test(self):
    """Identify a strava activities JSON payload into native instances and
    compare the generated "<label> - <InstanceType>" strings against the
    expected fixture (order-insensitive: both sides are sorted)."""
    from MetaDataApi.metadata.services import (
        RdfSchemaService, DataCleaningService,
        SchemaIdentificationV2, RdfInstanceService)
    from MetaDataApi.metadata.models import Schema, Object, Attribute, ObjectRelation
    from django.contrib.auth.models import User

    LoadTestData.init_foaf()
    user = LoadTestData.init_user()

    # fresh empty schema; identification will populate it
    schema = Schema(label="strava")
    schema.save()

    service = SchemaIdentificationV2()
    data = UtilsForTesting.loadStravaActivities()
    objects = service.identify_from_json_data(
        data, schema, user, parrent_label="activities")

    labels = ["%s - %s" % (obj.base.label, type(obj).__name__)
              for obj in objects]

    # exporting must still succeed on the identified schema/instances
    RdfSchemaService().export_schema_from_db(schema)
    RdfInstanceService().export_instances_to_rdf_file(schema, objects)

    # The expected fixture is highly repetitive: a shared prefix of
    # top-level objects/relations followed by six near-identical activity
    # blocks.  Build it from a template instead of hard-coding ~330
    # duplicated entries.
    prefix = [
        'activities - ObjectInstance',
        'person__to__activities - ObjectRelationInstance',
        'athlete - ObjectInstance',
        'activities__to__athlete - ObjectRelationInstance',
        'start_latlng - ObjectInstance',
        'activities__to__start_latlng - ObjectRelationInstance',
        'start_latlng - FloatAttributeInstance',
        'end_latlng - ObjectInstance',
        'activities__to__end_latlng - ObjectRelationInstance',
        'end_latlng - FloatAttributeInstance',
        'location_city - ObjectInstance',
        'activities__to__location_city - ObjectRelationInstance',
        'location_state - ObjectInstance',
        'activities__to__location_state - ObjectRelationInstance',
        'map - ObjectInstance',
        'activities__to__map - ObjectRelationInstance',
        'gear_id - ObjectInstance',
        'activities__to__gear_id - ObjectRelationInstance',
    ]
    activity = [
        'activities - ObjectInstance',
        'resource_state - IntAttributeInstance',
        'athlete - ObjectInstance',
        'id - IntAttributeInstance',
        'resource_state - IntAttributeInstance',
        'name - StringAttributeInstance',
        'distance - FloatAttributeInstance',
        'moving_time - IntAttributeInstance',
        'elapsed_time - IntAttributeInstance',
        'total_elevation_gain - FloatAttributeInstance',
        'type - StringAttributeInstance',
        'workout_type - IntAttributeInstance',
        'id - IntAttributeInstance',
        'external_id - StringAttributeInstance',
        'upload_id - IntAttributeInstance',
        'start_date - DateTimeAttributeInstance',
        'start_date_local - DateTimeAttributeInstance',
        'timezone - StringAttributeInstance',
        'utc_offset - FloatAttributeInstance',
        'start_latlng - FloatAttributeInstance',
        'start_latlng - FloatAttributeInstance',
        'end_latlng - FloatAttributeInstance',
        'end_latlng - FloatAttributeInstance',
        'location_country - StringAttributeInstance',
        'start_latitude - FloatAttributeInstance',
        'start_longitude - FloatAttributeInstance',
        'achievement_count - IntAttributeInstance',
        'kudos_count - IntAttributeInstance',
        'comment_count - IntAttributeInstance',
        'athlete_count - IntAttributeInstance',
        'photo_count - IntAttributeInstance',
        'map - ObjectInstance',
        'id - StringAttributeInstance',
        'summary_polyline - StringAttributeInstance',
        'resource_state - IntAttributeInstance',
        'trainer - BoolAttributeInstance',
        'commute - BoolAttributeInstance',
        'manual - BoolAttributeInstance',
        'private - BoolAttributeInstance',
        'visibility - StringAttributeInstance',
        'flagged - BoolAttributeInstance',
        'from_accepted_tag - BoolAttributeInstance',
        'average_speed - FloatAttributeInstance',
        'max_speed - FloatAttributeInstance',
        'has_heartrate - BoolAttributeInstance',
        'heartrate_opt_out - BoolAttributeInstance',
        'display_hide_heartrate_option - BoolAttributeInstance',
        'elev_high - FloatAttributeInstance',
        'elev_low - FloatAttributeInstance',
        'pr_count - IntAttributeInstance',
        'total_photo_count - IntAttributeInstance',
        'has_kudoed - BoolAttributeInstance',
    ]
    # the fifth activity in the sample data has no polyline on its map
    activity_no_polyline = [
        label for label in activity
        if label != 'summary_polyline - StringAttributeInstance']
    expected = (prefix + activity * 4
                + activity_no_polyline + activity)

    labels.sort()
    expected.sort()
    self.assertEqual(labels, expected)
def create_new_empty_schema(self, schema_label):
    """Create, persist and return a fresh empty Schema.

    The schema first gets a placeholder url so it can coexist with the
    unique-url constraint, then a dummy .ttl file is attached and its file
    url becomes the real schema url.
    """
    self.schema = Schema()
    new_schema = self.schema
    new_schema.label = self.standardize_string(schema_label)
    new_schema.description = ""
    # quick fix for saving without conflicting with unique url
    new_schema.url = "temp"
    # attach an empty dummy file so a file url exists
    new_schema.rdfs_file.save(schema_label + ".ttl", ContentFile(""))
    new_schema.url = new_schema.rdfs_file.url
    new_schema.save()
    self.touched_meta_items.append(new_schema)
    return self.schema
def test_identify_json_data_sample(self):
    """Map a body-temperature JSON sample (fetched over HTTP) onto the
    open_m_health schema and expect exactly 4 native instances back."""
    from MetaDataApi.metadata.services import (
        SchemaIdentificationV2)
    from MetaDataApi.metadata.models import Schema, Object
    # populate the DB with foaf plus the open-m-health sample schemas
    LoadTestData.init_foaf()
    LoadTestData.init_open_m_health_sample(extras=[
        "body-temperature-2.0.json",
        "body-temperature-2.x.json",
    ])
    # sample payload lives in the project's GitHub repo
    url = "https://raw.githubusercontent.com/Grusinator/MetaDataApi" + \
        "/master/schemas/json/omh/test_data/body-temperature/2.0/" + \
        "shouldPass/valid-temperature.json"
    obj_count = Object.objects.all().count()
    # make sure that the number of objects is larger than a sane minimum,
    # i.e. the schema fixtures actually loaded
    if obj_count < 10:
        raise AssertionError("database not populated")
    # NOTE(review): network-dependent test; fails offline — consider a
    # local fixture
    with request.urlopen(url) as resp:
        text = resp.read().decode()
    service = SchemaIdentificationV2()
    schema = service._try_get_item(Schema(label="open_m_health"))
    input_data = {
        "body-temperature": json.loads(text)
    }
    objs = service.map_data_to_native_instances(input_data, schema)
    self.assertEqual(len(objs), 4)
def init_strava_data_from_file():
    """Load the bundled strava_activities.json test file and map its
    content to native instances under a schema labelled "strava".

    Returns the list of created instance objects.
    """
    from MetaDataApi.metadata.services import (
        RdfInstanceService,
        RdfSchemaService,
        DataCleaningService,
        SchemaIdentificationV2
    )
    from MetaDataApi.metadata.models import Schema, Object, Attribute

    owner = LoadTestData.init_user()
    identifier = SchemaIdentificationV2()

    # read the bundled JSON test fixture
    json_path = os.path.join(
        settings.BASE_DIR,
        "MetaDataApi/metadata/tests/data/json/strava_activities.json")
    with open(json_path) as json_file:
        activity_data = json.load(json_file)

    strava_schema = identifier._try_get_item(Schema(label="strava"))
    return identifier.map_data_to_native_instances(
        activity_data, strava_schema,
        parrent_label="activities", owner=owner)
def test_(self):
    """Build a tiny foaf instance graph (two persons that know each
    other, each with a first name) and export it to an RDF file."""
    from MetaDataApi.metadata.services import (RdfSchemaService,
                                               RdfInstanceService)
    from MetaDataApi.metadata.models import (Object, Schema, Attribute,
                                             ObjectRelation)
    from MetaDataApi.metadata.models import (RawData, CategoryTypes,
                                             ObjectInstance,
                                             ObjectRelationInstance,
                                             FloatAttributeInstance,
                                             StringAttributeInstance)
    # load the friend-of-a-friend schema into the DB
    LoadTestData.init_foaf()
    service = RdfInstanceService()
    schema_label = "friend_of_a_friend"
    schema = service._try_get_item(Schema(label=schema_label))
    foaf_atts = Attribute.objects.filter(object__schema=schema)
    # NOTE(review): `s` is only inspected, never asserted on
    s = list(filter(lambda x: x.label, foaf_atts))
    foaf_person = service.get_foaf_person()
    foaf_name = Attribute.objects.get(label="first_name",
                                      object__schema=schema)
    foaf_knows = ObjectRelation.objects.get(label="knows", schema=schema)
    # two person instances ...
    b1 = ObjectInstance(base=foaf_person)
    b2 = ObjectInstance(base=foaf_person)
    b1.save()
    b2.save()
    # ... each with a first_name attribute ...
    name1 = StringAttributeInstance(base=foaf_name, object=b1, value="B1")
    name2 = StringAttributeInstance(base=foaf_name, object=b2, value="B2")
    name1.save()
    name2.save()
    # ... and a knows-relation between them
    rel1 = ObjectRelationInstance(base=foaf_knows,
                                  from_object=b1,
                                  to_object=b2)
    rel1.save()
    objects = [b1, b2, name1, name2, rel1]
    # export should produce a file with a usable url
    rdf_file = service.export_instances_to_rdf_file(schema, objects)
    self.assertIsNotNone(rdf_file.url)
def test_identify_json_data_sample(self):
    """Smoke test: relate_root_classes_to_foaf must run without raising
    on the open_m_health schema."""
    from MetaDataApi.metadata.services.data_cleaning_service import (
        DataCleaningService)
    from MetaDataApi.metadata.models import Schema, Object
    cleaner = DataCleaningService()
    schema_label = "open_m_health"
    target_schema = cleaner._try_get_item(Schema(label=schema_label))
    cleaner.relate_root_classes_to_foaf(target_schema)
    # reaching this point without an exception is the success criterion
    self.assertEqual(1 + 1, 2)
def write_to_db(self, input_url, schema_label):
    """Read a JSON schema from ``input_url`` and persist its structure.

    Fetches or creates the Schema named ``schema_label``, walks the JSON
    document, and returns the list of meta items touched along the way.
    """
    self.baseurl, filename = self._infer_info_split_url(input_url)
    label = filename.replace(".json", "")
    data = self._read_json_from_url(input_url)
    schema_label = self.standardize_string(schema_label,
                                           remove_version=False)
    self.schema = self._try_get_item(Schema(label=schema_label))
    if not self.schema:
        self.schema = self.create_new_empty_schema(schema_label)
    # called for its side effects: every created/fetched meta item is
    # collected in self.touched_meta_items (return value is unused)
    self._iterate_schema(data, label, filename=filename)
    # reset so a later call cannot accidentally reuse this schema
    self.schema = None
    return self.touched_meta_items
class BaseMetaDataService():
    """Shared base for the metadata services.

    Holds the bookkeeping state (current schema, touched/added items,
    error list) and the generic helpers for standardizing labels,
    type-sniffing raw values, and get-or-create logic against the
    Django metadata models.
    """

    def __init__(self):
        # super(BaseMetaDataService, self).__init__()
        self.schema = None
        self.baseurl = None
        self.foaf_person = None          # cached foaf "person" Object
        self.added_meta_items = []       # items newly saved this run
        self.touched_meta_items = []     # items fetched or saved this run
        self.added_instance_items = []
        self._error_list = []            # (item, exception) pairs
        # one switch to force overwrite of existing db objects
        self.overwrite_db_objects = False
        # -- same -- but to disable saving to db
        self.save_to_db = True
        # dont use the same if it allready exists
        self.allways_create_new = False
        # instance model -> python type it stores
        self.att_inst_to_type_map = {
            # StringAttributeInstance: str,
            StringAttributeInstance: str,
            DateTimeAttributeInstance: datetime,
            FloatAttributeInstance: float,
            IntAttributeInstance: int,
            BoolAttributeInstance: bool
        }
        self.att_types = tuple(typ if isinstance(typ, type) else type(typ)
                               for typ in Attribute.data_type_map.keys())
        self.att_instances = tuple(self.att_inst_to_type_map.keys())
        self.instances = self.att_instances + \
            (ObjectInstance, ObjectRelationInstance)

    def inverse_dict(self, dicti, value):
        """Return the first key in ``dicti`` whose value equals ``value``,
        or None when not present (or on any lookup error)."""
        try:
            keys = list(dicti.keys())
            values = list(dicti.values())
            index = values.index(value)
            return keys[index]
        except Exception as e:
            return None

    def standardize_string(self, string, remove_version=False):
        """Normalize a label: snake_case, strip .json, spaces,
        optional version suffixes, and parenthesised content."""
        string = inflection.underscore(str(string))
        string = string.replace(".json", "")
        string = string.replace(" ", "_")
        # remove any version numbers
        if remove_version:
            string = re.sub(r"(|_version|_v|_v.)(|_)\d+\.(\d+|x)(|_)",
                            '', string)
            string = re.sub("(|_)vocabulary(|_)", '', string)
        # remove parenthesis with content
        string = re.sub(r'(|_)\([^)]*\)', '', string)
        # remove trailing and leading whitespace/underscore
        # string = re.sub('/^[\W_]+|[\W_]+$/', '', string)
        return string

    def rest_endpoint_to_label(self, endpoint):
        """Derive a standardized label from the last path segment of a
        REST endpoint (query string stripped)."""
        # TODO the last might not be the most relevant
        endpoint_without_args = endpoint.split("?")[0]
        last_elm = endpoint_without_args.split("/")[-1]
        return self.standardize_string(last_elm)

    def create_new_empty_schema(self, schema_label):
        """Create, save and return a fresh empty Schema with a dummy
        .ttl file attached (its file url becomes the schema url)."""
        self.schema = Schema()
        self.schema.label = self.standardize_string(schema_label)
        self.schema.description = ""
        self.schema.url = "temp"
        # quick fix for saving without conflicting with unique url
        # create a dummy file
        content = ContentFile("")
        self.schema.rdfs_file.save(schema_label + ".ttl", content)
        self.schema.url = self.schema.rdfs_file.url
        self.schema.save()
        self.touched_meta_items.append(self.schema)
        return self.schema

    def is_meta_item_in_created_list(self, item, item_list=None):
        """Return the matching item from ``item_list`` (default: the
        touched list) or None.

        NOTE(review): the early return makes everything after it dead
        code — the "old rubbish" branches below are never executed.
        """
        item_list = item_list or self.touched_meta_items
        # new __eq__implementation
        return next(filter(item.__eq__, item_list), None)
        # old rubbish
        same_labels = filter(lambda x: item.label == x.label, item_list)
        if isinstance(item, Object):
            for same_label in same_labels:
                if same_label.to_relations.all().count() == 0:
                    return True
                for relation in same_label.to_relations.all():
                    item_to_relation_objects = [
                        rel.to_object for rel in item.from_relations.all()
                    ]
                    if relation.to_object in item_to_relation_objects:
                        return True
        elif isinstance(item, Attribute):
            for same_label in same_labels:
                if same_label.object == item.object:
                    return True
        elif isinstance(item, ObjectRelation):
            for in_list in item_list:
                if in_list.label == item.label:
                    return True
        else:
            raise Exception()
        return False

    def dict_contains_only_attr(self, data):
        """True when ``data`` is a non-empty dict whose keys are only
        "value"/"unit", i.e. it represents a bare attribute."""
        # if its not a dict, then its not an
        # attribute
        if not isinstance(data, dict):
            return False
        data = data.copy()
        if len(data) == 0:
            return False
        attr_names = ["value", "unit"]
        attrs = [data.pop(name, None) for name in attr_names]
        return len(data) == 0

    # def compare_rel(
    #     x): return x.from_relations in item.from_relations.all()
    # def func(x): return any(filter(compare_rel, x.from_relations.all()))
    # any([func(x) for x in same_labels])
    # else:

    def identify_data_type(self, element):
        """Best-effort type sniffing for a raw JSON value.

        Strings are tried as float, int, datetime, bool (in that order)
        and fall back to str; float/int/bool pass through unchanged.
        NOTE(review): any other type (dict, list, ...) falls off the end
        and implicitly returns None.
        """
        if element is None:
            return None

        def test_float(elm):
            # require a decimal separator so ints are not parsed as float
            assert ("." in elm), "does not contain decimal separator"
            return float(elm)

        def test_bool(elm):
            trues = ("true", "True")
            falses = ("false", "False")
            if elm in trues:
                return True
            elif elm in falses:
                return False
            else:
                raise ValueError("is not either true or false")

        def test_datetime(text):
            try:
                return dateutil.parser.parse(text)
            except:
                datetime_formats = (
                    '%Y-%m-%dT%H: %M: %SZ',  # strava
                )
                for fmt in datetime_formats:
                    try:
                        return datetime.strptime(text, fmt)
                    except ValueError as e:
                        pass
                raise ValueError('no valid date format found')

        # even though it is a string,
        # it might really be a int or float
        # so if string verify!!
        if isinstance(element, str):
            conv_functions = {
                float: test_float,
                int: lambda elm: int(elm),
                datetime: test_datetime,
                str: lambda elm: str(elm),
                bool: test_bool
            }
            order = [float, int, datetime, bool, str]
            for typ in order:
                try:
                    # try the converting function of that type
                    # if it doesnt fail, thats our type
                    return conv_functions[typ](element)
                except (ValueError, AssertionError) as e:
                    pass
            # if nothing else works, return as string
            return str(element)
        elif isinstance(element, (float, int, bool)):
            # otherwise just return the type of
            return element

    def _try_get_item(self, item, parrent_label=None):
        """Look up the persisted counterpart of an unsaved meta/instance
        item; return it, or None when absent.

        On MultipleObjectsReturned the first match is returned and the
        conflict is recorded in self._error_list.
        """
        # standardize label string
        if hasattr(item, "label"):
            item.label = self.standardize_string(item.label)
        search_args = {}
        item_type = type(item)
        # meta vs instances
        if isinstance(item, (Attribute, Object, ObjectRelation, Schema)):
            search_args["label"] = item.label
        # instance only look for primary key
        elif isinstance(item, self.instances):
            # search_args["base__label"] = item.base.label
            search_args["pk"] = item.pk
        # individual metaobjects
        if isinstance(item, Attribute):
            search_args["object__label"] = item.object.label
        elif isinstance(item, Object):
            search_args["schema"] = item.schema
            # adds the option to search for objects dependent
            # on from relations
            if parrent_label and parrent_label == "None":
                search_args["from_relations"] = None
            elif parrent_label:
                search_args["from_relations__from_object__label"]\
                    = parrent_label
                # search_args["from_relations"] = item.from_relations
        elif isinstance(item, ObjectRelation):
            search_args["from_object"] = item.from_object
            search_args["to_object"] = item.to_object
        # # individual instances
        # elif isinstance(item, self.att_instances):
        #     search_args["object__base__label"] = item.object.base.label
        # elif isinstance(item, ObjectInstance):
        #     search_args["base__schema"] = item.base.schema
        #     # adds the option to search for objects dependent
        #     # on from relations
        #     if parrent_label and parrent_label == "None":
        #         search_args["from_relations"] = None
        #     elif parrent_label:
        #         search_args["from_relations__from_object__base__label"]\
        #             = parrent_label
        #         # search_args["from_relations"] = item.from_relations
        # elif isinstance(item, ObjectRelation):
        #     search_args["from_object"] = item.from_object
        #     search_args["to_object"] = item.to_object
        try:
            # this "with transaction.atomic():"
            # is used to make tests run due to some
            # random error, atomic something.
            # it works fine when runs normally
            with transaction.atomic():
                return item_type.objects.get(**search_args)
        except ObjectDoesNotExist as e:
            return None
        except MultipleObjectsReturned as e:
            self._error_list.append((item, e))
            if hasattr(item, "schema"):
                schema_label = item.schema.label
            else:
                schema_label = item.object.schema.label
            print("""Warning, this is most likely wrong wrong,
            the object found: %s objects, but the first was chosen. 
 -- label: %s schema: %s""" % (
                item_type.objects.filter(**search_args).count(),
                item.label,
                schema_label,
            ))
            return item_type.objects.filter(**search_args).first()
        except Exception as e:
            # NOTE(review): silently swallows every other error and
            # implicitly returns None — consider narrowing
            pass

    def _try_create_item(self, item, update=False, parrent_label=None):
        """Get-or-create ``item``; returns the live (db-backed) item or
        None on failure.

        When ``update`` or self.overwrite_db_objects is set the existing
        row is deleted and replaced.
        NOTE(review): ``isinstance(item_type, Schema)`` tests the *type
        object*, which is never a Schema instance, so remove_version is
        always True here — probably meant isinstance(item, Schema).
        """
        item_type = type(item)
        remove_version = not isinstance(item_type, Schema)
        # test if exists
        item.label = self.standardize_string(item.label,
                                             remove_version=remove_version)
        try:
            # this "with transaction.atomic():"
            # is used to make tests run due to some
            # random error, atomic something.
            # it works fine when runs normally
            with transaction.atomic():
                # we want to be able to tell uniquely if this is the same
                # object, so test on all objects, not only the label,
                # so that it is possible to know if we are overwriting
                # objects that shouldnt
                return_item = self._try_get_item(
                    item, parrent_label=parrent_label)
                # the object exists,
                if return_item:
                    if update or self.overwrite_db_objects:
                        if self.save_to_db:
                            return_item.delete()
                            item.save()
                        else:
                            # if not updated, return the fetched one
                            item = return_item
                    else:
                        item = return_item
                # does not exists, create it!
                else:
                    try:
                        if self.save_to_db:
                            item.save()
                        self.added_meta_items.append(item)
                    except Exception as e:
                        # on update add to debug list
                        self._error_list.append((item, e))
                        return None
                # on success return the item, either fetched, or saved
                # so that the referenced object lives in the database
                self.touched_meta_items.append(item)
                return item
        except (transaction.TransactionManagementError, ) as e:
            return None
        except Exception as e:
            return None

    def is_objects_connected(self, obj_from, obj_to, objects):
        """Depth-first search: True when ``obj_to`` is reachable from
        ``obj_from`` via from_relations, restricted to ``objects``."""
        relations = obj_from.from_relations.all()
        related_objects = list(map(lambda x: x.to_object.get(), relations))
        related_objects = list(filter(lambda x: x in objects,
                                      related_objects))
        for obj in related_objects:
            if obj == obj_to:
                return True
            elif self.is_objects_connected(obj, obj_to, objects):
                return True
        return False

    def get_foaf_person(self):
        """Fetch (and cache on self) the foaf "person" Object."""
        if not self.foaf_person:
            schema = Schema.objects.get(label="friend_of_a_friend")
            self.foaf_person = Object.objects.get(label="person",
                                                  schema=schema)
        return self.foaf_person

    def att_to_att_inst(self, attr):
        """Map an Attribute's data_type to its instance model class."""
        data_type = self.inverse_dict(Attribute.data_type_map,
                                      attr.data_type)
        return self.inverse_dict(self.att_inst_to_type_map, data_type)

    def get_connected_attribute_pairs(self, att_1, att_2):
        """For every instance of the first object common to both
        attributes' paths to foaf person, return (value1, value2)."""
        foaf1, att1_list = BaseMetaDataService.path_to_object(
            att_1, self.get_foaf_person(), childrens=[])
        foaf2, att2_list = BaseMetaDataService.path_to_object(
            att_2, self.get_foaf_person(), childrens=[])
        # get first common object
        common_set = set(att1_list) & set(att2_list)
        common_obj = next(filter(lambda x: x in common_set, att1_list))
        # truncate list down to common object
        att1_list = att1_list[:att1_list.index(common_obj) + 1]
        att2_list = att2_list[:att2_list.index(common_obj) + 1]
        returns = []
        common_instances = ObjectInstance.objects.filter(base=common_obj)
        for common_instance in common_instances:
            value1 = self.get_specific_child(common_instance, att_1,
                                             path=att1_list)
            value2 = self.get_specific_child(common_instance, att_2,
                                             path=att2_list)
            returns.append((value1, value2))
        return returns

    def get_specific_child(self, obj_inst, child, path=None):
        """ get a descendent object, either att, or obj
        of a specific type """
        if path is None:
            path = self.path_to_object(child, obj_inst.base)
        search_args = self.build_search_args_from_list(path, obj_inst)
        AttributeInstance = self.att_to_att_inst(child)
        try:
            return AttributeInstance.objects.get(**search_args)
        except ObjectDoesNotExist as e:
            return None
        except MultipleObjectsReturned as e:
            print("WARNING: obj, contains multiple object, first is taken")
            return next(AttributeInstance.objects.filter(**search_args))

    def build_search_args_from_list(self, path, obj_inst):
        """Turn a path of meta objects into Django ORM lookup kwargs
        anchored at ``obj_inst``'s primary key."""
        search_args = {}
        base_arg_name = "from_relations__from_object__"
        arg_name = ""
        # loop though all but last
        for obj in path:
            if isinstance(obj, Attribute):
                arg_name += "object__"
                search_args["base__label"] = obj.label
            elif obj == obj_inst.base:
                # last elm add primary key
                search_args[arg_name + "pk"] = obj_inst.pk
            else:
                # Not neccesary as long as pk is being added
                # search_args[arg_name + "base__label"] = obj.label
                arg_name += base_arg_name
        return search_args

    @staticmethod
    def path_to_object(obj, root_obj, childrens=[]):
        """Recursively walk from_relations upwards from ``obj`` until
        ``root_obj`` is reached; returns (root_obj, path) or
        (None, path) when this branch is exhausted.

        NOTE(review): mutable default argument — ``childrens`` persists
        across calls unless the caller passes a fresh list (as
        get_connected_attribute_pairs does with childrens=[]).
        """
        if isinstance(obj, Attribute):
            # add to path
            childrens.append(obj)
            obj = obj.object
        if obj == root_obj:
            return obj, childrens
        else:
            parrent_rels = obj.from_relations.all()
            childrens.append(obj)
            for parrent_rel in parrent_rels:
                parrent_obj = parrent_rel.from_object
                obj, childrens = BaseMetaDataService.path_to_object(
                    parrent_obj, root_obj, childrens=childrens)
                if obj == root_obj:
                    return obj, list(childrens)
            # this branch has been exhausted, return none
            return None, childrens
def get_related_schema(self):
    """Return the schema named after this dataprovider, creating an
    empty one when it does not exist yet."""
    provider_name = self.dataprovider.provider_name
    existing = self._try_get_item(Schema(label=provider_name))
    if existing:
        return existing
    return self.create_new_empty_schema(provider_name)
class RdfSchemaService(BaseRdfSchemaService):
    """Exports db schemas to RDF (turtle) files and imports schemas,
    objects, attributes and object relations from RDF graphs."""

    def __init__(self):
        super(RdfSchemaService, self).__init__()

    def export_schema_from_db(self, schema):
        """Serialize ``schema`` with all its objects, attributes and
        relations into a turtle file saved on ``schema.rdfs_file``.

        Returns the saved schema.
        """
        g = Graph()
        # reset objects created (exported)
        self.touched_meta_items = []
        self.schema = schema
        # to know which have been exported
        self.touched_meta_items.append(self.schema)

        objects = Object.objects.filter(schema=self.schema)
        # to know which have been exported
        self.touched_meta_items.extend(objects)

        namespace = self.schema.url.replace(".ttl", "#")
        Ontology = Namespace(namespace)
        g.bind(schema.label, Ontology)

        rdf_schema = URIRef(Ontology)

        # define the ontology
        g.add((rdf_schema, RDF.type, OWL.Ontology))
        g.add((rdf_schema, DC.title, Literal(self.schema.label)))
        g.add((rdf_schema, DC.description, Literal(self.schema.description)))

        for obj in objects:
            # make sure that there is no space in the url, and the object is
            # unique
            obj_name = self.create_uri_ref(obj)

            # type
            g.add((obj_name, RDF.type, RDFS.Class))
            # description
            g.add((obj_name, RDFS.label, Literal(obj.label)))
            g.add((obj_name, RDFS.comment, Literal(obj.description)))
            # is defined by what schema
            g.add((obj_name, RDFS.isDefinedBy, rdf_schema))

            attributes = obj.attributes.all()
            # add attributes
            for attribute in attributes:
                # make sure that there is no space in the url
                attribute_name = self.create_uri_ref(attribute)

                g.add((attribute_name, RDF.type, RDF.Property))
                # this one relates the attribute to the object or domain
                g.add((attribute_name, RDFS.domain, obj_name))

                rdf_data_type = self.att_type_to_rdfs_uri(attribute.data_type)
                # data_type
                g.add((attribute_name, RDFS.range, rdf_data_type))
                # label and description
                g.add((attribute_name, RDFS.label, Literal(attribute.label)))
                g.add((attribute_name, RDFS.comment,
                       Literal(attribute.description)))
                # defined by
                g.add((attribute_name, RDFS.isDefinedBy, rdf_schema))

            # to know which have been exported
            self.touched_meta_items.extend(attributes)

        relations = ObjectRelation.objects.filter(schema=self.schema)
        # to know which have been exported
        self.touched_meta_items.extend(relations)

        for relation in relations:
            # the "R_" is to avoid naming conflict with classes
            relation_name = self.create_uri_ref(relation)
            # make sure that there is no space in the url
            from_object_name = self.create_uri_ref(relation.from_object)
            to_object_name = self.create_uri_ref(relation.to_object)

            # here a realtion object is created
            g.add((relation_name, RDF.type, RDF.Property))
            # a relation is a property with the domain of the object
            # and range another object
            # from_object
            g.add((relation_name, RDFS.domain, from_object_name))
            # to_object
            g.add((relation_name, RDFS.range, to_object_name))
            # label and description
            g.add((relation_name, RDFS.label, Literal(relation.label)))
            g.add((relation_name, RDFS.comment,
                   Literal(relation.description)))
            # defined by
            g.add((relation_name, RDFS.isDefinedBy, rdf_schema))

        ttl_data = g.serialize(format='turtle')
        content = ContentFile(ttl_data)
        # schema.rdfs_file.delete()
        self.schema.rdfs_file.save(self.schema.label + ".ttl", content)
        self.schema.save()
        return self.schema

    def write_to_db_baseschema(self):
        """Load every default schema url and write its schema, objects,
        relations and attributes to the db, pass by pass."""
        graph_list = [self._create_graph_from_url(url)
                      for url in self.default_list]
        # each pass over all graphs depends on items created by the
        # previous pass, so the order of the loops matters
        for g in graph_list:
            self._create_schema_from_graph(g)
        for g in graph_list:
            self._create_objects_from_graph(g)
        for g in graph_list:
            self._create_object_references_from_graphV2(g)
        for g in graph_list:
            self._create_attributes_from_graph(g)

    def read_objects_from_rdfs(self, rdf_url):
        """Parse the schema at ``rdf_url`` WITHOUT saving to the db and
        return the touched meta items."""
        self.save_to_db = False
        g = self._create_graph_from_url(rdf_url)
        self.schema = self._create_schema_from_graph(g)
        self._create_objects_from_graph(g)
        self._create_object_references_from_graphV2(g)
        self._create_attributes_from_graph(g)
        return self.touched_meta_items

    def write_to_db(self, rdf_url, overwrite=False):
        """Parse the schema at ``rdf_url`` and persist its schema,
        objects, relations and attributes to the db."""
        self.overwrite_db_objects = overwrite
        g = self._create_graph_from_url(rdf_url)
        self.schema = self._create_schema_from_graph(g)
        self._create_objects_from_graph(g)
        self._create_object_references_from_graphV2(g)
        self._create_attributes_from_graph(g)

    def _create_graph_from_url(self, rdf_url):
        """Build an rdflib Graph from a url, a file path or a file object.

        Raises when a local file cannot be parsed or a url cannot be
        fetched.
        """
        g = Graph()

        # this is needed for the parser to be able to read files
        register(
            # 'text/rdf+n3', Parser,
            'text/plain', Parser,
            'rdflib.plugins.parsers.notation3', 'N3Parser')

        # if not a string, its not an url
        # assume its a file
        if hasattr(rdf_url, 'read') or rdf_url[-4:] in [".ttl", ".xml"]:
            try:
                # default is n3 (ttl) if not xml (ttl is default)
                # NOTE(review): for file objects the `in` test iterates the
                # file; the seek(0) below rewinds it again -- TODO confirm
                format = None if ".xml" in rdf_url else "n3"
                # make sure that the parser is reading from the beginning
                try:
                    rdf_url.seek(0)
                except Exception:
                    pass
                g.parse(rdf_url, format=format)
                return g
            except Exception as e:
                raise Exception(
                    "could not load specified file as a graph.") from e

        # cant load from raw github ttl if format is not set
        format = "n3" if ".ttl" in rdf_url else None

        if rdf_url in self.selfhosted:
            rdf_url = self.selfhosted[rdf_url]

        try:
            g.parse(rdf_url, format=format)
        except URLError:
            print("could not fetch schema from url: " + rdf_url)
            # re-raise the original error (an unreachable `return None`
            # that followed the raise has been removed)
            raise
        return g

    def _validate_dependencies(self, g):
        """Return the namespaces used by ``g`` that have no matching
        Schema in the db."""
        missing_list = []
        for prefix, namespace in g.namespaces():
            namespace = str(namespace)
            if not self._validate_namespace(namespace):
                missing_list.append(namespace)
        return missing_list

    def _create_schema_from_graph(self, g):
        """Identify the schema subject in ``g`` and return the matching
        Schema from the db, or a new (optionally saved) one.

        Returns None when no schema subject can be identified.
        """
        # identify schema attributes
        # TODO: identify rdf_url from graph instead
        # some cases where the url is not the same as the uri
        get_schema_keys = [OWL.Ontology, RDFS.Class, RDF.Property, None]
        rdf_url = None
        for key in get_schema_keys:
            try:
                schema_subject, _, _ = next(g.triples((None, RDF.type, key)))
                rdf_url = str(schema_subject)
                break
            except Exception:
                pass
        if rdf_url is None:
            return None

        label_keys = [DC.title, RDFS.label, DCTERMS.title]
        description_keys = [DC.description, DCTERMS.description]
        label = "Not available"
        description = "Not available"
        for key in label_keys:
            try:
                rdf_url, _, label = next(
                    g.triples((schema_subject, key, None)))
                break
            except Exception:
                pass
        for key in description_keys:
            try:
                rdf_url, _, description = next(
                    g.triples((schema_subject, key, None)))
                break
            except Exception:
                pass

        # only save if it does not exists
        try:
            self.schema = Schema.objects.get(url=str(rdf_url))
        except Exception:
            self.schema = Schema(
                label=self.standardize_string(label, remove_version=True),
                url=str(rdf_url),
                description=str(description))
            if self.save_to_db:
                self.schema.save()
        return self.schema

    def _create_objects_from_graph(self, g):
        """Create an Object for every RDFS.Class in ``g`` that has a label
        and whose defining schema is known to the db."""
        # now create all objects
        for s, p, o in g.triples((None, None, RDFS.Class)):
            # BUG fixed: `comment` was unbound when the first class had no
            # RDFS.comment (NameError) and stale on later iterations
            comment = "could not find"
            # mandatory
            try:
                # Property label
                label = next(g.triples((s, RDFS.label, None)))[2]
                # Property Class/domain
                schema_url = next(g.triples((s, RDFS.isDefinedBy, None)))[2]
            except Exception:
                continue
            # volentary
            try:
                # Property comment
                comment = next(g.triples((s, RDFS.comment, None)))[2]
            except Exception:
                pass
            try:
                self.schema = Schema.objects.get(url=schema_url)
            except Exception:
                pass
            else:
                self._try_create_item(
                    Object(label=str(label),
                           description=str(comment),
                           schema=self.schema))

    def _create_object_references_from_graphV2(self, g):
        """Create an ObjectRelation for every RDF.Property in ``g`` whose
        domain and range both resolve to objects.

        An object reference is in fact a property with range and domain
        pointing at 2 objects.
        """
        # now create all object references
        for s, p, o in g.triples((None, None, RDF.Property)):
            # similar to property
            try:
                # Property label
                label = next(g.triples((s, RDFS.label, None)))[2]
                # Property comment
                try:
                    comment = next(g.triples((s, RDFS.comment, None)))[2]
                except Exception:
                    comment = "could not find"
                # Property Class/domain
                domain = next(g.triples((s, RDFS.domain, None)))[2]
                # Property data_type
                o_range = next(g.triples((s, RDFS.range, None)))[2]

                from_schema_url, from_obj_label = \
                    self._split_rdfs_url(domain)
                to_schema_url, to_obj_label = self._split_rdfs_url(o_range)

                # get the right schema from either the current schema or
                # from the db, so the object is selected from the right one
                if self.schema.url == from_schema_url:
                    from_schema = self.schema
                else:
                    from_schema = Schema.objects.get(url=from_schema_url)
                if self.schema.url == to_schema_url:
                    to_schema = self.schema
                else:
                    to_schema = Schema.objects.get(url=to_schema_url)

                # standardize the labels to match what has been created
                from_obj_label = self.standardize_string(from_obj_label)
                to_obj_label = self.standardize_string(to_obj_label)

                # first try find the objects in the created list
                from_object = next(
                    filter(lambda x: x.label == from_obj_label,
                           self.touched_meta_items), None)
                to_object = next(
                    filter(lambda x: x.label == to_obj_label,
                           self.touched_meta_items), None)

                # TODO: consider the case if 2 objects has the same label?
                if not from_object:
                    from_object = Object.objects.filter(
                        label=from_obj_label, schema=from_schema).first()
                if not to_object:
                    to_object = Object.objects.filter(
                        label=to_obj_label, schema=to_schema).first()
            # if no such 2 objects exists
            except Exception:
                continue

            if from_object and to_object:
                self._try_create_item(
                    ObjectRelation(from_object=from_object,
                                   to_object=to_object,
                                   label=label,
                                   schema=self.schema,
                                   description=comment))

    def _create_attributes_from_graph(self, g):
        """Create an Attribute for every RDF.Property in ``g`` whose range
        is a plain data type and whose domain object can be resolved."""
        for s, p, o in g.triples((None, None, RDFS.Property)) \
                if False else g.triples((None, None, RDF.Property)):
            try:
                # Property label
                label = next(g.triples((s, RDFS.label, None)))[2]
                # Property comment (fetched inside the try on purpose: a
                # property without a comment is skipped, as before)
                comment = next(g.triples((s, RDFS.comment, None)))[2]
                # Property Class/domain
                domain = next(g.triples((s, RDFS.domain, None)))[2]
                # Property data_type (renamed from `range`, which shadowed
                # the builtin)
                att_range = next(g.triples((s, RDFS.range, None)))[2]

                _, obj_label = self._split_rdfs_url(domain)
                obj_label = self.standardize_string(obj_label)

                # find the object in the created list first
                obj = next(
                    filter(lambda x: x.label == obj_label,
                           self.touched_meta_items), None)
                # if not found look in the database
                if obj is None:
                    obj = Object.objects.filter(label=obj_label).first()
            except Exception:
                continue

            if obj is None:
                continue
            if att_range not in self.valid_data_types:
                continue
            self._try_create_item(
                Attribute(data_type=self.rdfs_to_att_type(att_range),
                          label=label,
                          object=obj))

    def _validate_namespace(self, namespace):
        """Return True when a Schema with ``namespace`` as url exists."""
        try:
            # BUG fixed: was `Schema.objects.Get` (capital G), which raised
            # AttributeError and made this always return False
            Schema.objects.get(url=str(namespace))
        except Exception:
            return False
        return True

    def _split_rdfs_url(self, url):
        """Split an rdfs url into (base_url, label).

        Tries a '#' split first, then a '/' split. Returns None when
        ``url`` is not a URIRef or no split yields a label.
        """
        if not isinstance(url, (term.URIRef, URIRef)):
            return None
        methodlist = [
            lambda x: x.split("#"),
            lambda x: ("/".join(x.split("/")[:-1]) + "/",
                       x.split("/")[-1])
        ]
        for method in methodlist:
            try:
                url, label = method(str(url))
                if label == "":
                    continue
                url += "/" if url[-1] != "/" else ""
                return url, label
            except Exception:
                pass
        return None