class RecordMaker:
    """
    Create a valid collection record by
    - extracting certain fields from the provided (invalid) record
    - writing them into a valid "template" record
    - changing the ID and writing the new record to disk
    """

    # Every occurrence of old_prefix in the record id is replaced by new_prefix.
    old_prefix = "DCS-COLLECTION"
    new_prefix = "NCAR-COLLECTION"

    def __init__(self, path):
        """
        Load the invalid record and build its replacement.

        path -- path to the invalid record. Only its base name is used to
        read the record, which is loaded from the module-level `recs`
        directory; the directory part of path is used later by write() as
        the default output location.
        """
        # print(...) with a single argument prints the same text on
        # Python 2 and Python 3.
        print(path)
        self.path = path
        self.recname = os.path.basename(path)
        self.baseRec = DleseCollectRecord(
            path=os.path.join(recs, self.recname))

        # Fields carried over from the invalid record.
        self.fullTitle = self.baseRec.getFullTitle()
        self.shortTitle = self.baseRec.getShortTitle()
        # Read here, but _makeNewRec builds its own description text.
        self.description = self.baseRec.getDescription()
        self.key = self.baseRec.getKey()
        self.id = self.baseRec.getId()

        self.newId = self.getNewId(self.id)
        self.newRec = self._makeNewRec()

    def report(self, rec):
        """
        Print out the key fields of the provided record, one per line.

        Each line is formatted as a single string so the output is the
        same on Python 2 and Python 3 (label, two spaces, value).
        """
        print('id:  %s' % (rec.getId(),))
        print('key:  %s' % (rec.getKey(),))
        print('fullTitle:  %s' % (rec.getFullTitle(),))
        print('shortTitle:  %s' % (rec.getShortTitle(),))
        print('description:  %s' % (rec.getDescription(),))

    def getNewId(self, id):
        """
        Return id with old_prefix replaced by new_prefix.

        An id that does not contain old_prefix is returned unchanged.
        """
        return id.replace(self.old_prefix, self.new_prefix)

    def _makeNewRec(self):
        """
        Return a record built from the template at the module-level
        `templatepath`, filled in with the new id and the fields taken
        from the invalid record.
        """
        newRec = DleseCollectRecord(path=templatepath)
        newRec.setId(self.newId)
        newRec.setFullTitle(self.fullTitle)
        newRec.setShortTitle(self.shortTitle)
        # The description is derived from the short title, not copied
        # from the invalid record.
        newRec.setDescription("NCAR Library " + self.shortTitle)
        newRec.setKey(self.key)
        return newRec

    def write(self, path=None):
        """
        Write the new record to disk.

        path -- destination file; defaults to "<newId>.xml" in the
        directory of the path given to the constructor.
        """
        if path is None:
            path = os.path.join(os.path.dirname(self.path),
                                self.newId + ".xml")
        DleseCollectRecord.write(self.newRec, path)
        print("wrote to " + path)
def __init__(self, instance):
    """
    Fill this mapping with the collection records of `instance`.

    Every file whose name ends in ".xml" (case-insensitive) under
    <instance.path>/dlese_collect/collect is loaded as a
    DleseCollectRecord and stored under the record's key.
    """
    UserDict.__init__(self)
    self.instance = instance
    self.collectPath = os.path.join(instance.path, "dlese_collect", "collect")

    xml_names = [name for name in os.listdir(self.collectPath)
                 if name.lower().endswith(".xml")]
    for name in xml_names:
        record_path = os.path.join(self.collectPath, name)
        record = DleseCollectRecord(path=record_path)
        # keyed by collection key, not by file name
        self[record.getKey()] = record
def findDleseCollectionRecord(field, value):
    """
    Return the first DleseCollectRecord whose `field` equals `value`.

    Records are read from <dlese_records>/dlese_collect/collect in the
    order os.listdir reports them. Returns None when no record matches.
    """
    collect_dir = os.path.join(dlese_records, 'dlese_collect', 'collect')
    for name in os.listdir(collect_dir):
        # NOTE: the suffix test is 'xml' with no leading dot
        if not name.endswith('xml'):
            continue
        candidate = DleseCollectRecord(path=os.path.join(collect_dir, name))
        if candidate.get(field) == value:
            return candidate
    return None
def __init__(self, path):
    """
    Load the invalid record and build its replacement.

    path -- path to the invalid record. Only its base name is used to
    read the record, which is loaded from the module-level `recs`
    directory; the full path is kept in self.path.
    """
    # print(...) with a single argument prints the same text on
    # Python 2 and Python 3, unlike the bare print statement.
    print(path)
    self.path = path
    self.recname = os.path.basename(path)
    self.baseRec = DleseCollectRecord(
        path=os.path.join(recs, self.recname))

    # Fields carried over from the invalid record.
    self.fullTitle = self.baseRec.getFullTitle()
    self.shortTitle = self.baseRec.getShortTitle()
    self.description = self.baseRec.getDescription()
    self.key = self.baseRec.getKey()
    self.id = self.baseRec.getId()

    # Derive the new id, then build the replacement record from it.
    self.newId = self.getNewId(self.id)
    self.newRec = self._makeNewRec()
def merge(self):
    """
    Copy the collection records in self.src_dir to self.dst_dir, naming
    each copy "<collection key>.xml".

    Only file names accepted by self.acceptItem are considered, and
    records whose key is rejected by self.acceptCollectionKey are skipped
    (the original note describes these as keys containing 'hsbio').

    Collisions (destination file already exists) are reported when the
    module-level `dowrites` is False. When `dowrites` is True, a
    collision raises an Exception.
    """
    for filename in filter(self.acceptItem, os.listdir(self.src_dir)):
        src_path = os.path.join(self.src_dir, filename)
        src_rec = DleseCollectRecord(path=src_path)
        src_key = src_rec.getKey()

        if not self.acceptCollectionKey(src_key):
            if verbose:
                print('SKIPPING: %s' % (src_key,))
            continue

        dst_path = os.path.join(self.dst_dir, src_key + '.xml')
        if os.path.exists(dst_path):
            # COLLISION
            if dowrites:
                raise Exception('dst_path exists at %s' % dst_path)
            # debugging - print out collisions
            print('COLLISION: dst_path exists at %s' % dst_path)
            continue

        if dowrites:
            # the copy takes the collection key as its record id
            src_rec.setId(src_key)
            src_rec.write(dst_path)
        else:
            # The original guard was "if 1 or verbose", i.e. always true,
            # so the dry-run report is printed unconditionally.
            print('would have copied %s to ...\n\t%s' % (
                os.path.basename(src_path), dst_path))
def __init__(self, dcr_path, id=None):
    """
    Build the NCS collection record (self.ncr) from the DLESE collection
    record at dcr_path (self.dcr).

    dcr_path -- path of the DLESE collection record to read
    id -- when truthy, used as the id of the NCS record

    self.initialized is False while construction is in progress and True
    once it has completed.
    """
    self.initialized = False

    self.dcr = DleseCollectRecord(path=dcr_path)
    self.ncr = NCSCollectRecord(path=self.ncs_collect_template)
    self.process_field_mappings()
    self.injectContributors()

    # libraryFormat -> (view context, metadataPrefix); any other format
    # leaves the NCS record untouched
    format_settings = (
        ('adn', 'DLESECollections', 'nsdl_dc'),
        ('dlese_anno', 'DLESEAnnotations', 'comm_anno'),
    )
    xmlFormat = self.dcr.get('libraryFormat')
    for library_format, view_context, prefix in format_settings:
        if xmlFormat == library_format:
            self.ncr.addViewContext(view_context)
            self.ncr.set('metadataPrefix', prefix)
            break

    if id:
        self.ncr.setId(id)

    # set the destination (ncr) path for writing
    out_name = self.ncr.getId() + '.xml'
    self.ncr.path = os.path.join(self.tmp_output_path, out_name)

    self.initialized = True
def write(self, path=None):
    """
    Write the new record (self.newRec) to disk and report where it went.

    path -- destination file; defaults to "<self.newId>.xml" in the
    directory of self.path.
    """
    if path is None:
        path = os.path.join(os.path.dirname(self.path),
                            self.newId + ".xml")
    DleseCollectRecord.write(self.newRec, path)
    # print(...) with a single argument prints the same text on
    # Python 2 and Python 3, unlike the bare print statement.
    print("wrote to " + path)
def _makeNewRec(self):
    """
    Return a fresh record loaded from the module-level `templatepath`
    and filled in with this maker's new id, titles and key.
    """
    template = DleseCollectRecord(path=templatepath)
    assignments = (
        (template.setId, self.newId),
        (template.setFullTitle, self.fullTitle),
        (template.setShortTitle, self.shortTitle),
        # description is derived from the short title
        (template.setDescription, "NCAR Library " + self.shortTitle),
        (template.setKey, self.key),
    )
    for setter, field_value in assignments:
        setter(field_value)
    return template
def updateCollectionRecord(self, new_key, new_name=None):
    """
    Re-key the collection record whose key equals self.key.

    - find the collection record with this collection's key -- the
      records under <self.repo>/dlese_collect/collect are examined
      one by one
    - set its key and id to new_key (and, when new_name is given, its
      short and full titles to new_name)
    - if self.dowrites: write the record and rename its file to
      "<new_key>.xml"; otherwise only print what would have been written

    The scan stops after the first matching record. Returns None.
    """
    collect = os.path.join(self.repo, "dlese_collect", "collect")
    for filename in filter(lambda x: x.endswith('xml'), os.listdir(collect)):
        path = os.path.join(collect, filename)
        rec = DleseCollectRecord(path=path)
        old_key = rec.getKey()
        if old_key != self.key:
            continue

        print('old key: %s' % old_key)
        rec.setKey(new_key)
        rec.setId(new_key)
        if new_name:
            rec.setShortTitle(new_name)
            rec.setFullTitle(new_name)

        if self.dowrites:
            # the record is written in place first, then the file is
            # renamed to match the new key
            rec.write()
            os.rename(path, os.path.join(collect, new_key + '.xml'))
            print('wrote collection record: %s' % rec.getId())
        else:
            print(rec)
            print('WOULD have written collection record: %s' % rec.getId())
        return
class DleseToNcsCollectTransform:
    """
    Build an NCS collection record (self.ncr) from a DLESE collection
    record (self.dcr), starting from the NCS template record.
    """

    ncs_collect_template = '/Users/ostwald/devel/python/python-lib/uconn/ncs_collect_template.xml'
    tmp_output_path = '/Users/ostwald/tmp/dlese_to_ncs_collect'

    # field_mappings: each entry is either a single field name or a
    # [dlese_collect field, ncs_collect field] pair.
    # NOTE(review): presumably consumed by process_field_mappings, which
    # is not visible here -- confirm.
    field_mappings = [
        # dlese_collect -> ncs_collect
        'id',  # we do id by hand
        'description',
        ['collectionLocation', 'url'],
        ['fullTitle', 'title'],
        ['id', 'collSetSpec'],
        ['created', 'dateTime'],  # involves a massage
        'libraryFormat',
        ['key', 'oaiSetSpec']
    ]

    def __init__(self, dcr_path, id=None):
        """
        Read the DLESE record at dcr_path and populate the NCS record.

        dcr_path -- path of the DLESE collection record to read
        id -- when truthy, used as the id of the NCS record

        self.initialized is False while construction is in progress and
        True once it has completed.
        """
        self.initialized = False
        self.dcr = DleseCollectRecord(path=dcr_path)
        self.ncr = NCSCollectRecord(path=self.ncs_collect_template)
        self.process_field_mappings()
        self.injectContributors()

        # The library format selects the view context and metadata
        # prefix; any other format leaves the NCS record untouched.
        xmlFormat = self.dcr.get('libraryFormat')
        if xmlFormat == 'adn':
            self.ncr.addViewContext('DLESECollections')
            self.ncr.set('metadataPrefix', 'nsdl_dc')
        elif xmlFormat == 'dlese_anno':
            self.ncr.addViewContext('DLESEAnnotations')
            self.ncr.set('metadataPrefix', 'comm_anno')

        if id:
            self.ncr.setId(id)

        # set the destination (ncr) path for writing
        self.ncr.path = os.path.join(self.tmp_output_path,
                                     self.ncr.getId() + '.xml')
        self.initialized = True

    def injectField(self, dcr_field, ncr_field=None):
        """
        Copy the value of dcr_field in the DLESE record into ncr_field of
        the NCS record.

        dcr_field -- field to read from self.dcr
        ncr_field -- field to set on self.ncr; defaults to dcr_field

        A missing or unreadable source value is reported with a WARN line
        and an empty string is injected instead. A failure to set the
        value is reported with an ERROR line; no exception propagates
        from either step.
        """
        if ncr_field is None:
            ncr_field = dcr_field

        # get value from dcr_field
        try:
            value = self.dcr.get(dcr_field)
            if not value:
                raise Exception("no value in metadata")
        except Exception as msg:
            print("WARN ingest did NOT get value for %s at '%s': %s" % (
                self.dcr.getId(), dcr_field, msg))
            value = ""

        # kludges for certain fields -- applied only to a real value, so
        # a missing source does not become a bare 'T00:00:00Z' or 'ncs-'
        if value:
            if dcr_field == 'created':
                value += 'T00:00:00Z'
            if ncr_field == 'collSetSpec':
                value = 'ncs-' + value

        # inject value in ncr_field
        try:
            print('setting "%s" at "%s"' % (value, ncr_field))
            self.ncr.set(ncr_field, value)
        except Exception as msg:
            print("ERROR setting value at '%s': %s" % (ncr_field, msg))