def create_time_interval(start, end, timeline=TL.UniversalTimeline):
    """Build a tl:Interval node and register it in the module-level graph ``g``.

    :param start: begin value of the interval, stored as a plain Literal
    :param end: end value of the interval, stored as a plain Literal
    :param timeline: timeline resource the interval is placed on
        (defaults to ``TL.UniversalTimeline``)
    :return: the blank node representing the interval
    """
    interval = rdflib.BNode()
    triples = (
        (interval, RDF.type, TL.Interval),
        (interval, TL.beginsAt, Literal(start)),
        (interval, TL.endsAt, Literal(end)),
        (interval, TL.onTimeLine, timeline),
    )
    for triple in triples:
        g.add(triple)
    return interval
def format_adjective_satelites(self):
    """Rename AdjectiveSatelliteSynset URIs ending in "-a" to end in "-s".

    Synsets typed ``SCHEMA.AdjectiveSatelliteSynset`` are expected to carry
    an "-s" suffix; any found with an "-a" suffix are renamed graph-wide via
    ``_replace_node`` and their ``SCHEMA.synsetId`` literal is swapped for
    one matching the new URI.  Synsets without the "-a" suffix are only
    logged as warnings.  Totals of added/removed triples are logged at the
    end.
    """
    count = 0
    self.logger.info(f"start formatting AdjectiveSatelliteSynset")
    satellite_synsets = self.graph.subjects(
        RDF.type, SCHEMA.AdjectiveSatelliteSynset)
    for synset in satellite_synsets:
        if synset.endswith("-a"):
            count += 1
            # fix: only rewrite the TRAILING "-a".  str.replace("-a", "-s")
            # replaced the first occurrence anywhere in the URI, which
            # corrupts URIs containing "-a" earlier in their path.
            new_synset = URIRef(synset[:-2] + "-s")
            self.logger.debug(
                f"replacing '{synset.n3()}' by '{new_synset.n3()}'")
            self._replace_node(synset, new_synset, "format_adjective_satelites")
            # replace synset id: drop the id derived from the old URI tail,
            # add the one derived from the new URI tail
            synset_id = synset.split("synset-")[-1]
            new_synset_id = new_synset.split("synset-")[-1]
            self._drop_triple(
                (new_synset, SCHEMA.synsetId, Literal(synset_id)),
                "format_adjective_satelites")
            self._add_triple(
                (new_synset, SCHEMA.synsetId, Literal(new_synset_id)),
                "format_adjective_satelites")
        else:
            self.logger.warning(
                f"ill formed AdjectiveSatelliteSynset '{synset.n3()}'")
    # resulting added and removed triples
    self.logger.info(f"action applied to {count} valid synsets"
                     f"\n\ttotal: {self.added_triples} triples added"
                     f"\n\ttotal: {self.removed_triples} triples removed")
def format_synset_id(self):
    """Normalise the synsetId property on every synset.

    For each synset, any existing ``SCHEMA.synsetId`` triple is dropped,
    then a fresh id derived from the URI tail after "/synset-" is added,
    together with a ``SCHEMA.offset`` holding the part before the first
    "-".  Totals of added/removed triples are logged at the end.
    """
    count = 0
    self.logger.info(f"start formatting property synsetId")
    # _get_all_synsets() yields 1-tuples (query result rows), hence the
    # single-element unpacking in the loop header
    for synset, in self._get_all_synsets():
        count += 1
        # removes old property
        old_id = self.graph.value(synset, SCHEMA.synsetId)
        if old_id:
            self._drop_triple((synset, SCHEMA.synsetId, old_id))
        # replaces property
        new_id = Literal(synset.split("/synset-")[-1])
        offset = Literal(new_id.split("-")[0])
        self._add_triple((synset, SCHEMA.offset, offset))
        self._add_triple((synset, SCHEMA.synsetId, new_id))
    # resulting added and removed triples
    self.logger.info(f"action applied to {count} cases"
                     f"\n\ttotal: {self.added_triples} triples added"
                     f"\n\ttotal: {self.removed_triples} triples removed")
def create_time_interval(g, start, end):
    """Build a tl:Interval anchored at *start* with duration ``end - start``
    on the universal timeline, adding it to graph *g*.

    :param g: rdflib Graph to add the triples to
    :param start: begin value (also used to compute the duration)
    :param end: end value; ``end - start`` must be well-defined
    :return: the blank node representing the interval
    """
    eventnode = rdflib.BNode()
    g.add((eventnode, RDF.type, TL.Interval))
    g.add((eventnode, TL.at, Literal(start)))
    g.add((eventnode, TL.duration, Literal(str(end - start))))
    # fix: "TL.timelime" was a typo (no such term in the Timeline ontology);
    # the sibling create_time_interval in this codebase uses TL.onTimeLine
    g.add((eventnode, TL.onTimeLine, TL.universaltimeline))
    return eventnode
def create_instrument(inst_label):
    """Mint a new mo:Instrument in the module-level graph ``g``.

    Both the original and the DTL instrument labels are initialised to
    *inst_label*.  Returns the new instrument URI.
    """
    # create instrument URI
    uri = create_uri("instruments", uuid.uuid4())
    # add instrument metadata
    metadata = (
        (RDF.type, MO.Instrument),
        (DTL.orig_inst_label, Literal(inst_label)),
        (DTL.dtl_inst_label, Literal(inst_label)),
    )
    for predicate, obj in metadata:
        g.add((uri, predicate, obj))
    logging.debug("Instrument %s created", inst_label)
    return uri
def create_track(self, trackTitle, tnum):
    """Mint a mo:Track with the given title and track number.

    :param trackTitle: title stored as dc:title
    :param tnum: track number, stringified into mo:track_number
    :return: the new track URI
    """
    track = self.create_uri("tracks", uuid.uuid4())
    self.add((track, RDF.type, MO.Track))
    self.add((track, DC.title, Literal(trackTitle)))
    self.add((track, MO.track_number, Literal(str(tnum))))
    logging.debug("Track created")
    return track
def create_release(self, title, dcount):
    """Mint a mo:Release titled *title* with *dcount* records.

    New releases are flagged as not remastered ("0").
    Returns the new release URI.
    """
    release = self.create_uri("releases", uuid.uuid4())
    metadata = (
        (RDF.type, MO.Release),
        (DC.title, Literal(title)),
        (MO.record_count, Literal(str(dcount))),
        (DTL.is_remastered, Literal("0")),
    )
    for predicate, obj in metadata:
        self.add((release, predicate, obj))
    logging.debug("Release created")
    return release
def process_tunes():
    """Create one mo:MusicalWork per row of ``tunes_table``.

    Each tune gets its dc:title and the originating lord id attached.
    """
    ids = tunes_table.id
    titles = tunes_table.name
    logging.info("\ncreating %i tunes", len(ids))
    for tune_id, title in zip(ids, titles):
        logging.debug("creating tune %s", title)
        uri = create_uri("tunes", tune_id)
        g.add((uri, RDF.type, MO.MusicalWork))
        g.add((uri, DC.title, Literal(title)))
        g.add((uri, DTL.lord_id, Literal(tune_id)))
    logging.info("tunes created")
def process_bands():
    """Create one mo:MusicGroup per row of ``leaders_table``.

    The band is named after the leader and keeps the leader's lord id.
    """
    ids = leaders_table.id
    names = leaders_table.name
    logging.info("\ncreating %i bands", len(ids))
    for leader_id, name in zip(ids, names):
        logging.debug("creating band %s", name)
        uri = create_uri("bands", leader_id)
        g.add((uri, RDF.type, MO.MusicGroup))
        g.add((uri, FOAF.name, Literal(name)))
        g.add((uri, DTL.lord_id, Literal(leader_id)))
    logging.info("bands created")
def process_musicians():
    """Create one mo:MusicArtist per row of ``musicians_table``.

    Each musician gets a foaf:name and the originating lord id.
    """
    ids = musicians_table.id
    names = musicians_table.name
    logging.info("\ncreating %i musicians", len(ids))
    for musician_id, name in zip(ids, names):
        logging.debug("creating musician %s", name)
        uri = create_uri("musicians", musician_id)
        g.add((uri, RDF.type, MO.MusicArtist))
        g.add((uri, FOAF.name, Literal(name)))
        g.add((uri, DTL.lord_id, Literal(musician_id)))
    logging.info("musicians created")
def create_album(self, title, *, is_compilation="1", is_remix="0", is_live="0"):
    """Mint a mo:SignalGroup (album) titled *title*.

    The three DTL flags were previously hard-coded; they are now
    keyword-only parameters whose defaults reproduce the old values
    ("1", "0", "0"), so existing callers are unaffected.

    :param title: album title, stored as dc:title
    :param is_compilation: DTL.is_compilation flag literal (default "1")
    :param is_remix: DTL.is_remix flag literal (default "0")
    :param is_live: DTL.is_live flag literal (default "0")
    :return: the new album URI
    """
    # create album URI
    albumURI = self.create_uri("albums", uuid.uuid4())
    # add album metadata
    self.add((albumURI, RDF.type, MO.SignalGroup))
    self.add((albumURI, DC.title, Literal(title)))
    self.add((albumURI, DTL.is_compilation, Literal(is_compilation)))
    self.add((albumURI, DTL.is_remix, Literal(is_remix)))
    self.add((albumURI, DTL.is_live, Literal(is_live)))
    logging.debug("Album created")
    return albumURI
def create_release_event(self, title, date, place="Germany"):
    """Mint a mo:ReleaseEvent with title, date and place.

    Bug fix: the *date* argument was previously ignored and the literal
    "2008-10-24" was always written.  The place is now a parameter whose
    default preserves the old hard-coded value ("Germany").

    :param title: event title, stored as dc:title
    :param date: date string passed to ``create_date``
    :param place: event place literal (default "Germany")
    :return: the new release-event URI
    """
    # create uri
    releaseEventURI = self.create_uri("release_events", uuid.uuid4())
    # add metadata
    self.add((releaseEventURI, RDF.type, MO.ReleaseEvent))
    self.add((releaseEventURI, DC.title, Literal(title)))
    # add date — NOTE(review): 'g' is a module-level graph here while the
    # other triples go through self.add; confirm this is intentional
    self.add((releaseEventURI, EVENT.time, create_date(g, date)))
    # add place
    self.add((releaseEventURI, EVENT.place, Literal(place)))
    logging.debug("Release event created")
    return releaseEventURI
def create_medium(self, title, dnum, tcount):
    """Mint a mo:Record (medium) with title and record number.

    The mo:track_count triple is only added when *tcount* is positive.
    Returns the new medium URI.
    """
    # NOTE: open question from the original author — should the medium
    # uid be defined through the release?
    medium = self.create_uri("mediums", uuid.uuid4())
    self.add((medium, RDF.type, MO.Record))
    self.add((medium, MO.record_number, Literal(str(dnum))))
    self.add((medium, DC.title, Literal(title)))
    if tcount > 0:
        self.add((medium, MO.track_count, Literal(str(tcount))))
    logging.debug("Medium created")
    return medium
def find_instrument(self, inst_label):
    """Look up an instrument by its DTL.je_inst_label literal.

    Instruments are matched by exact label string, which will not match
    everything: the same instrument may appear under multiple names,
    spellings, or typos.

    :param inst_label: label to match against DTL.je_inst_label
    :return: the matching instrument node, or None when no triple matches
    :raises rdflib.exceptions.UniquenessError: if several instruments
        share the label (``any=False``), as in the original code
    """
    # single scan: value() already returns the default (None) when no
    # triple matches, so the previous `(None, p, o) in self.g` pre-check
    # was a redundant second pass over the graph
    instrument = self.g.value(subject=None, predicate=DTL.je_inst_label,
                              object=Literal(inst_label),
                              default=None, any=False)
    if instrument is not None:
        logging.debug("Instrument %s found: %s", inst_label, instrument)
    return instrument
def create_performer(self, artistURI, instrument, confidence):
    """Mint a dtl:Performer linking an artist to an instrument.

    :param artistURI: the musician resource
    :param instrument: instrument name, stored as a Literal
    :param confidence: 2-sequence of
        (musician_confidence, instrument_confidence); attached via a
        dedicated PerformerConfidence blank node
    :return: the new performer URI
    """
    performer = self.create_uri("Performers", uuid.uuid4())
    self.add((performer, RDF.type, DTL.Performer))
    self.add((performer, DTL.musician, artistURI))
    self.add((performer, DTL.instrument, Literal(instrument)))
    # confidence scores live on their own blank node
    musician_conf = confidence[0]
    instrument_conf = confidence[1]
    conf_node = rdflib.BNode()
    self.add((conf_node, RDF.type, DTL.PerformerConfidence))
    self.add((conf_node, DTL.musician_confidence, Literal(musician_conf)))
    self.add((conf_node, DTL.instrument_confidence, Literal(instrument_conf)))
    self.add((performer, DTL.performer_confidence, conf_node))
    logging.debug("Performer created")
    return performer
def process_instruments():
    """Create one mo:Instrument per row of ``instruments_table``.

    Each instrument carries the lord and original labels (both set to the
    table name) plus the lord id.
    """
    ids = instruments_table.id
    titles = instruments_table.name
    logging.info("\ncreating %i instruments", len(ids))
    for instrument_id, title in zip(ids, titles):
        logging.debug("creating instrument %s", title)
        uri = create_uri("instruments", instrument_id)
        g.add((uri, RDF.type, MO.Instrument))
        g.add((uri, DTL.lord_inst_label, Literal(title)))
        g.add((uri, DTL.orig_inst_label, Literal(title)))
        # open question from the original author: are these lord uids?
        g.add((uri, DTL.lord_id, Literal(instrument_id)))
    logging.info("instruments created")
def process_tracks():
    """Create one mo:Track per row of ``tracks_table``.

    Each track gets its title, stringified track number and lord id.
    """
    ids = tracks_table.id
    titles = tracks_table.name
    numbers = tracks_table.track_nr
    logging.info("\ncreating %i tracks", len(ids))
    for track_id, title, tnum in zip(ids, titles, numbers):
        logging.debug("creating track %s with number %s", title, tnum)
        uri = create_uri("tracks", track_id)
        g.add((uri, RDF.type, MO.Track))
        g.add((uri, DC.title, Literal(title)))
        g.add((uri, MO.track_number, Literal(str(tnum))))
        # open question from the original author: are these lord uids?
        g.add((uri, DTL.lord_id, Literal(track_id)))
    logging.info("tracks created")
def process_time_area():
    """Parse each session's combined "area / date" string and attach place
    and date triples to the corresponding session resources.

    Bug fixes relative to the original:
      * the second loop zipped over undefined ``session_ids`` (NameError) —
        it now iterates the ids actually read from ``sessions_table``
      * the loop body logged/stored stale leftovers from the parsing loop
        (``areastr``/``datestr``) and the undefined name ``areaString``,
        instead of the per-session values from the zip
      * the place triple is added via ``g.add`` (bare ``add`` was undefined)
    """
    # parse areadate strings
    session_areadate_strings = sessions_table.location_time_str
    logging.info("\nparsing %i area date strings", len(session_areadate_strings))
    session_area_strings = []
    session_date_strings = []
    for session_areadate_str in session_areadate_strings:
        areastr, datestr = Lord_time_area_parser.parse_location_time_str(
            session_areadate_str)
        logging.debug("area: %s, date: %s", areastr, datestr)
        session_area_strings.append(areastr)
        session_date_strings.append(datestr)
    logging.debug("area - date strings parsed")
    # add area and date to sessions
    session_ids = sessions_table.sessionId
    logging.info("\nadding areas and dates to %i sessions", len(session_ids))
    for session_id, session_area_str, session_date_str in zip(
            session_ids, session_area_strings, session_date_strings):
        sessionURI = find_by_id("sessions", session_id)
        logging.debug('add place: %s', session_area_str)
        g.add((sessionURI, EVENT.place, Literal(session_area_str)))
        logging.debug('datestr: %s', session_date_str)
        # NOTE(review): the sibling process_time_area passes the graph as
        # the first argument (dtlutil.add_datestr(g, ...)) — confirm which
        # signature is current
        dtlutil.add_datestr(sessionURI, session_date_str)
    logging.debug("\ndates and areas added to sessions")
def create_label(self, title):
    """Mint a mo:Label with dc:title *title* and return its URI."""
    label = self.create_uri("labels", uuid.uuid4())
    self.add((label, RDF.type, MO.Label))
    self.add((label, DC.title, Literal(title)))
    logging.debug("Label created")
    return label
def create_artist(self, artistName):
    """Mint a mo:MusicArtist with foaf:name *artistName* and return its URI."""
    artist = self.create_uri("artists", uuid.uuid4())
    self.add((artist, RDF.type, MO.MusicArtist))
    self.add((artist, FOAF.name, Literal(artistName)))
    logging.debug("Artist created")
    return artist
def create_band(self, bandName):
    """Mint a mo:MusicGroup with foaf:name *bandName* and return its URI."""
    band = self.create_uri("bands", uuid.uuid4())
    self.add((band, RDF.type, MO.MusicGroup))
    self.add((band, FOAF.name, Literal(bandName)))
    logging.debug("Band created")
    return band
def create_arranger(self, name):
    """Mint a mo:MusicArtist acting as arranger and return its URI."""
    arranger = self.create_uri("arrangers", uuid.uuid4())
    self.add((arranger, RDF.type, MO.MusicArtist))
    self.add((arranger, FOAF.name, Literal(name)))
    logging.debug("Arranger created")
    return arranger
def create_composer(self, name):
    """Mint a mo:MusicArtist acting as composer and return its URI."""
    composer = self.create_uri("composers", uuid.uuid4())
    self.add((composer, RDF.type, MO.MusicArtist))
    self.add((composer, FOAF.name, Literal(name)))
    logging.debug("Composer created")
    return composer
def create_instrument(self, inst_label):
    """Mint a mo:Instrument tagged with its JE label and return its URI."""
    instrument = self.create_uri("instruments", uuid.uuid4())
    self.add((instrument, RDF.type, MO.Instrument))
    self.add((instrument, DTL.je_inst_label, Literal(inst_label)))
    logging.debug("Instrument %s created", inst_label)
    return instrument
def get_instrument(instrument):
    """Return the instrument node whose DTL.orig_inst_label matches
    *instrument*, creating a fresh instrument when none exists yet.

    With ``any=False``, ``g.value`` raises if several instruments share
    the same label (unchanged from the original behaviour).
    """
    instrumentURI = g.value(subject=None, predicate=DTL.orig_inst_label,
                            object=Literal(instrument),
                            default=None, any=False)
    # idiom fix: test for missing value with `is None` — `== None` invokes
    # rich comparison on the node object
    if instrumentURI is None:
        instrumentURI = create_instrument(instrument)
    else:
        logging.debug("found instrument %s", instrument)
    return instrumentURI
def process_releases():
    """Create one mo:Release per row of ``releases_table``.

    Each release carries its title, label-id string, free-text notes and
    lord id.
    """
    ids = releases_table.full_id
    titles = releases_table.title
    label_id_strs = releases_table.label_id_str
    notes = releases_table.notes_str
    logging.info("\ncreating %i releases", len(ids))
    for release_id, release_title, label_id_str, note in zip(
            ids, titles, label_id_strs, notes):
        logging.debug("creating release %s", release_title)
        uri = create_uri("releases", release_id)
        g.add((uri, RDF.type, MO.Release))
        g.add((uri, DC.title, Literal(release_title)))
        g.add((uri, DTL.lord_label_id_str, Literal(label_id_str)))
        g.add((uri, DTL.lord_release_notes, Literal(note)))
        # open question from the original author: label_id_str? notes_str?
        g.add((uri, DTL.lord_id, Literal(release_id)))
    logging.info("releases created")
def add_adjective_markers(self, senses: Graph, adjective_lines):
    """Parse adjective data lines and attach adjPosition markers to senses.

    Each line is whitespace-split; field 4 is a hexadecimal word count
    (``int(words_count, base=16)``) and the words follow in pairs, so
    ``tail[2 * i]`` picks each word — this matches the WordNet database
    file layout (assumed; confirm against the data source).  Words carry
    a position suffix "(a)", "(p)" or "(ip)"; the suffix is stripped, the
    word normalised via ``_format_lexical``, and for every synset with a
    matching synsetId the corresponding sense receives a
    ``SCHEMA.adjPosition`` literal.  Words without a suffix are skipped;
    suffixed words whose sense cannot be located are logged as warnings.

    :param senses: graph that receives the adjPosition triples
    :param adjective_lines: iterable of raw data-file lines
    """
    self.logger.info(f"start processing {len(adjective_lines)} lines")
    # format data: collect (synset_id, raw_word) pairs from every line
    adjective_data = []
    for line in adjective_lines:
        synset_id, _, _, words_count, *tail = line.split()
        # word count is hexadecimal; words sit at even offsets of `tail`
        for i in range(int(words_count, base=16)):
            adjective_data.append((synset_id, tail[2 * i]))
    # find and add adjective markers
    count = 0
    for synset_id, word in adjective_data:
        marker = None
        if word.endswith("(a)"):
            marker = "a"  # predicate position
        elif word.endswith("(p)"):
            marker = "p"  # prenominal (attributive) position
        elif word.endswith("(ip)"):
            marker = "ip"  # immediately postnominal position
        else:
            continue  # word carries no position marker
        # adds marker: strip the "(...)" suffix, then normalise the word
        valid = False
        word = self._format_lexical(word[:word.find(f"({marker})")], True)
        # a synsetId may match several synsets; mark every located sense
        for synset in self.graph.subjects(SCHEMA.synsetId, Literal(synset_id)):
            sense = self._get_sense(synset, word)
            if sense is not None:
                valid = True
                count += 1
                self.logger.debug(
                    f"adding marker '{marker}' from word '{word}' to sense '{sense.n3()}'"
                )
                senses.add((sense, SCHEMA.adjPosition, Literal(marker)))
        # validates the result: warn when no sense could be found at all
        if not valid:
            self.logger.warning(
                f"could not add marker '{marker}' from word '{word}' to synset '{synset_id}'"
            )
    # print statistics
    self.logger.info(f"after action {count} triples were added")
def process_sessions():
    """Create one mo:Performance per id in ``sessions_table`` and tag it
    with the originating lord id."""
    session_ids = sessions_table.full_id
    logging.info("\ncreating %i sessions", len(session_ids))
    for counter, session_id in enumerate(session_ids):
        logging.debug(str(counter))
        uri = create_uri("sessions", session_id)
        g.add((uri, RDF.type, MO.Performance))
        g.add((uri, DTL.lord_id, Literal(session_id)))
    logging.info("sessions created")
def create_qualified_date(g, freetext_date, startdate, enddate,
                          is_apprx=True, apprxq=None):
    """Create a qualified date node in graph *g*.

    When *startdate* and *enddate* differ an interval node is created
    (typed DTL.QualifiedDateInterval); otherwise a single instant
    (DTL.QualifiedDateInstant).  The node is annotated with the free-text
    source string, an approximation flag serialised as "1"/"0", and — when
    given — an approximation qualifier.

    :param g: rdflib Graph to add triples to
    :param freetext_date: original free-text date string
    :param startdate: begin date value
    :param enddate: end date value
    :param is_apprx: whether the date is approximate (default True)
    :param apprxq: optional approximation qualifier literal
    :return: the timespan node
    """
    if startdate != enddate:
        timespan_node = create_time_interval(g, startdate, enddate)
        g.add((timespan_node, RDF.type, DTL.QualifiedDateInterval))
    else:
        timespan_node = create_date(g, startdate)
        g.add((timespan_node, RDF.type, DTL.QualifiedDateInstant))
    g.add((timespan_node, DTL.freetext_timespan, Literal(freetext_date)))
    g.add((timespan_node, DTL.is_approximate, Literal(str(int(is_apprx)))))
    # idiom fix: `is not None` instead of `!= None`
    if apprxq is not None:
        g.add((timespan_node, DTL.approximation_qualifier, Literal(apprxq)))
    return timespan_node
def process_time_area(): parser = LordAreaDateParser() # parse areadate strings session_areadate_strings = sessions_table.location_time_str logging.info("\nparsing %i area date strings", len(session_areadate_strings)) session_area_strings = [] session_date_strings = [] for session_areadate_str in session_areadate_strings: areastr, datestr = parser.parse_area_date_str(session_areadate_str) ## except parser.UnparsableAreaDateStringWarning as e: ## logging.warning(e.message) ## areastr = session_areadate_str ## datestr = "" logging.debug("area: %s, date: %s", areastr, datestr) session_area_strings.append(areastr) session_date_strings.append(datestr) logging.info("area - date strings parsed") # g.add area and date to sessions from dateParser import DateParser session_idxs = sessions_table.id logging.info("\nadding areas and dates to %i sessions", len(session_idxs)) for session_idx, session_area_str, session_date_str in zip( session_idxs, session_area_strings, session_date_strings): logging.debug("session %i", session_idx) session_id = get_session_id_by_idx(session_idx) sessionURI = find_by_id("sessions", session_id) logging.debug('add place: %s', session_area_str) g.add((sessionURI, EVENT.place, Literal(session_area_str))) g.add((sessionURI, DTL.orig_date, Literal(session_date_str))) logging.debug('datestr: %s', session_date_str) try: dtlutil.add_datestr(g, sessionURI, session_date_str) except DateParser.UnparsableDateWarning as e0: logging.warning(e0.message) except DateParser.YearOutOfRangeWarning as e1: logging.warning(e1.message) logging.info("\ndates and areas added to sessions")