def parse_identity_key(legacy_id: Any):
    """Try to turn a legacy identifier into a string usable in a non-obfuscated KGIRI.

    Supported inputs are ``int``, ``str`` and ``Timestamp`` values:

    * ``int``: formatted as-is.
    * ``str``: translated through ``special_char_map``, underscored, passed
      through ``_translate_to_human_readable``, dasherized, the first word
      character after each word boundary capitalised, and finally
      parameterized with ``-`` as separator.
    * ``Timestamp``: wrapped in a ``Literal`` and lower-cased.

    For any other type an error line is printed to stderr and ``None`` is
    returned. Otherwise the key has ``--`` replaced by ``-`` (two passes) and
    is handed to ``strip_end(key, '-')`` before being returned.
    """
    if isinstance(legacy_id, int):
        candidate = f"{legacy_id}"
    elif isinstance(legacy_id, str):
        translated = legacy_id.translate(special_char_map)
        readable = _translate_to_human_readable(inflection.underscore(translated))
        capitalised = re.sub(
            r"\b('?\w)",
            lambda found: found.group(1).capitalize(),
            inflection.dasherize(readable),
        )
        candidate = inflection.parameterize(capitalised, separator='-')
    elif isinstance(legacy_id, Timestamp):
        candidate = Literal(legacy_id).lower()
    else:
        # Deliberately a plain print: switching this to log.error causes a
        # circular dependency (TODO: fix that).
        print(
            f"ERROR: While parsing an identity key: encountered unknown type {type(legacy_id)} for value {legacy_id}",
            file=sys.stderr
        )
        return None

    # Two passes of the same replacement, exactly as before.
    for _ in range(2):
        candidate = candidate.replace('--', '-')
    return strip_end(candidate, '-')
def createBikeGraph(arg, g):
    """Add the triples describing one bike-hire dock to graph ``g`` and return it.

    ``arg`` is an indexable record; the positions read here are:

    * ``arg[0]`` - identifier containing an underscore; the part after the
      first underscore is used as the bike id
    * ``arg[2]`` - comma-separated address string
    * ``arg[5]`` - creation date (emitted as ``xsd:dateTime``)
    * ``arg[7]`` - total number of docks
    * ``arg[8]``, ``arg[9]`` - latitude and longitude (parsed with ``float``)

    Namespaces are looked up by prefix in the mapping returned by
    ``readDict()``. Three resources are described: the bike park itself, its
    geometry and its address.

    :param arg: record describing one dock (see positions above)
    :param g: graph object exposing ``add((s, p, o))``
    :return: the same ``g`` instance, after the triples have been added
    """
    nspaces = readDict()

    def namespace(prefix):
        # Build the Namespace registered under ``prefix``.
        return Namespace(nspaces.get(prefix))

    schema = namespace('schema')
    naptan = namespace('naptan')
    xsd = namespace('xsd')
    vcard = namespace('vcard')
    locationOnt = namespace('locationOnt')
    geom = namespace('geom')
    geo = namespace('geo')
    geosparql = namespace('geosparql')
    rdf = namespace('rdf')
    dcterms = namespace('dcterms')
    dul = namespace('dul')
    locn = namespace('locn')
    dc = namespace('dc')

    # NOTE(review): on Python 3 ``.encode`` yields ``bytes``, so ``str(...)`` of
    # these values renders as "b'...'" in the label and the dock count. Left
    # unchanged because ``getUid`` may rely on receiving the encoded value -
    # confirm against the interpreter version this module targets.
    bikeid = arg[0].split('_')[1].encode('utf-8')
    bikeGUID = getUid(bikeid, naptan)
    nTotalDocks = str(arg[7].encode('utf-8'))

    bikeLat, bikeLong = float(arg[8]), float(arg[9])
    bikeLats = format(bikeLat, 'f')
    bikeLongs = format(bikeLong, 'f')
    # NOTE(review): coordinates are written latitude first; verify this is the
    # axis order consumers of the WKT literal expect.
    bikeGeometry = "POINT (" + str(bikeLat) + " " + str(bikeLong) + ")"

    # Label = last comma-separated address component + the bike id.
    address = arg[2].split(',')
    bikeLabel = address[-1].lstrip() + ' ' + str(bikeid)

    # Replace every '&' that does not start a named, decimal or hexadecimal
    # character reference with the word 'and'.
    bikeAddress = Literal(re.sub(r'&(?![A-Za-z]+[0-9]*;|#[0-9]+;|#x[0-9a-fA-F]+;)', r'and', arg[2]))
    # Locality = everything after the first comma, minus the first space.
    bikeAddressSplit = Literal(bikeAddress.split(',', 1)[-1])
    bikeAddressLocality = Literal(bikeAddressSplit.replace(' ', '', 1))
    bikeCreatedDate = arg[5]

    singleBike = createBikeParkID(bikeGUID)
    singleAddress = createAddress(bikeGUID)
    singleGeometry = createGeometry(bikeGUID)
    bikePublisher = URIRef('https://api.tfl.gov.uk/#BikePoint')
    bikeBusinessType = URIRef('http://data.linkedevents.org/kos/3cixty/bikestation')

    triples = [
        # --- the bike park ---
        (singleBike, rdf.type, dul.Place),
        (singleBike, rdf.type, locationOnt.bikePark),
        (singleBike, dcterms.identifier, Literal(bikeLabel)),
        (singleBike, dcterms.description, Literal("London TFL Bike hire docks")),
        (singleBike, schema.dateCreated, Literal(bikeCreatedDate, datatype=xsd.dateTime)),
        (singleBike, locationOnt.nTotalDocks, Literal(nTotalDocks, datatype=xsd.int)),
        (singleBike, dc.publisher, bikePublisher),
        (singleBike, locationOnt.businessType, bikeBusinessType),
        (singleBike, geom.geometry, singleGeometry),
        (singleBike, schema.geo, singleGeometry),
        (singleBike, geosparql.hasGeometry, singleGeometry),
        (singleBike, locn.geometry, singleGeometry),
        (singleBike, vcard.hasAddress, singleAddress),
        # Fixed: the predicate was misspelled ``locn.addresss``; the rest of
        # this function uses ``locn.address``.
        (singleBike, locn.address, singleAddress),
        (singleBike, schema.location, singleAddress),
        # --- the geometry ---
        (singleGeometry, rdf.type, geosparql.hasGeometry),
        (singleGeometry, rdf.type, geom.geometry),
        (singleGeometry, rdf.type, locn.geometry),
        (singleGeometry, rdf.type, schema.geo),
        (singleGeometry, geo.geometry, Literal(bikeGeometry, datatype=geosparql.wktLiteral)),
        (singleGeometry, geo.lat, Literal(bikeLats, datatype=xsd.double)),
        (singleGeometry, geo.long, Literal(bikeLongs, datatype=xsd.double)),
        (singleGeometry, schema.latitude, Literal(bikeLats, datatype=xsd.double)),
        (singleGeometry, schema.longitude, Literal(bikeLongs, datatype=xsd.double)),
        # --- the address ---
        (singleAddress, rdf.type, locn.address),
        (singleAddress, rdf.type, schema.location),
        (singleAddress, rdf.type, vcard.hasAddress),
        (singleAddress, dcterms.title, bikeAddress),
        (singleAddress, schema.streetAddress, bikeAddress),
        (singleAddress, locn.address, bikeAddress),
        # NOTE(review): the vCard ontology spells this property with a hyphen
        # ("street-address"); left unchanged pending confirmation.
        (singleAddress, vcard.street_address, bikeAddress),
        (singleAddress, schema.addressLocality, bikeAddressLocality),
    ]
    for triple in triples:
        g.add(triple)
    return g
def literal_match(self, literal: Literal, surface: str):
    """Yield at most one ``LiteralMatchResult`` scoring ``surface`` against ``literal``.

    Both values are compared as stripped strings. How they are compared
    depends on the literal's ``datatype`` attribute (if it has one):

    * datatype equal to ``self.DATETIME``: both sides are converted to
      timestamps and the score falls linearly from 1 (same instant) to 0
      (365 days or more apart). A surface matching ``YEAR_PATTERN`` is read
      as 1 January of that year; otherwise ISO parsing is tried first and
      ``self._dateparse`` second.
    * any other datatype: both sides are parsed as floats (commas removed)
      and scored by relative difference; only scores above 0.95 count.
    * typed literals that did not match above fall back to a
      case-insensitive equality check.
    * untyped literals: token Jaccard overlap or normalised Levenshtein
      similarity, selected by ``self.stringmatch``.

    :param literal: candidate value; its ``datatype`` attribute is used when present
    :param surface: text to compare against the literal
    :return: generator yielding zero or one ``LiteralMatchResult(score, literal, dtype)``
    """
    dtype = literal.datatype if hasattr(literal, "datatype") else None
    literal, surface = str(literal).strip(), str(surface).strip()

    score = 0.0
    if dtype:
        # Typed literals should match well
        if str(dtype) == str(self.DATETIME):
            seconds_per_year = 60 * 60 * 24 * 365
            try:
                literal_ts = datetime.datetime.fromisoformat(literal).timestamp()
                yearmatch = YEAR_PATTERN.match(surface)
                if yearmatch:
                    year = int(yearmatch.groups()[0])
                    surface_ts = datetime.datetime(year, 1, 1).timestamp()
                else:
                    try:
                        surface_ts = datetime.datetime.fromisoformat(
                            surface).timestamp()
                    except Exception:
                        # Not ISO formatted: fall back to the configured parser.
                        surface_ts = self._dateparse(surface).timestamp()
            except Exception:
                # Best effort: anything unparseable simply does not
                # date-match and drops through to the equality check.
                pass
            else:
                # A timestamp of exactly 0.0 (the epoch) is a valid date, so
                # no truthiness test is applied to it here.
                score = max(0, 1 - (abs(surface_ts - literal_ts) / seconds_per_year))
                if score:
                    yield LiteralMatchResult(score, literal, dtype)
                    return
        else:
            try:
                surface_num = float(surface.replace(",", ""))
                literal_num = float(literal.replace(",", ""))
            except ValueError:
                # Not numeric: drop through to the equality check.
                pass
            else:
                largest = max(abs(surface_num), abs(literal_num))
                if largest == 0:
                    # Both values are zero: identical, and the relative
                    # difference below would divide by zero.
                    score = 1.0
                else:
                    score = max(0, 1 - (abs(surface_num - literal_num) / largest))
                if score > 0.95:
                    yield LiteralMatchResult(score, literal, dtype)
                    return

        # Fallback for typed literals: exact, case-insensitive equality.
        score = bool(surface.lower() == literal.lower())

    elif surface and literal:
        # Strings may match approximately
        if self.stringmatch == "jaccard":
            stok, ltok = set(surface.lower().split()), set(
                literal.lower().split())
            if stok and ltok:
                score = len(stok & ltok) / len(stok | ltok)
        elif self.stringmatch == "levenshtein":
            # Imported here, so the package is only required when this
            # strategy is actually used.
            import Levenshtein

            slow, llow = surface.lower(), literal.lower()
            if slow and llow:
                shortest = min(len(slow), len(llow))
                score = max(0, (shortest - Levenshtein.distance(slow, llow)) / shortest)

    if score:
        yield LiteralMatchResult(score, literal, dtype)
def kgiri_replace_iri_in_literal(value: Literal):
    """Return ``value`` with ``kgiri_base_replace`` swapped for ``kgiri_base``.

    When ``kgiri_replace_enabled`` is falsy the input is returned untouched.
    Otherwise every occurrence of ``kgiri_base_replace`` in the literal's
    text is replaced by ``kgiri_base`` and a new ``Literal`` is built from
    the result. The new literal keeps the datatype and language tag of the
    input, so typed or language-tagged values are not downgraded to plain
    string literals.

    :param value: the literal to rewrite
    :return: ``value`` itself when replacement is disabled, else a new ``Literal``
    """
    if not kgiri_replace_enabled:
        return value
    replaced_text = value.replace(kgiri_base_replace, kgiri_base)
    # ``getattr`` keeps plain ``str`` inputs working, as they did before.
    return Literal(
        replaced_text,
        lang=getattr(value, "language", None),
        datatype=getattr(value, "datatype", None),
    )
def commentStatements(user, commentUri, realComment):
    """Build the statement list that stores a comment's text.

    The comment literal is rebuilt without carriage returns, keeping its
    datatype, and returned as a single ``(commentUri, CONTENT.encoded,
    literal)`` triple inside a list. ``user`` is accepted but not used here.
    """
    # Hook point: further processing of the comment text can be added here.
    # Carriage returns are removed because rdflib's n3 can't read them back.
    cleaned_text = realComment.replace("\r", "")
    cleaned_comment = Literal(cleaned_text, datatype=realComment.datatype)
    statement = (commentUri, CONTENT.encoded, cleaned_comment)
    return [statement]