def main():
    """
    Report which candidate objects do (or do not) have an image.

    Every candidate ID is looked up in the dictionary built from the image
    folder(s).  By default the IDs that are *not* in the image folder are
    printed to ``outfile``; IDs that are found are traced and, if a report
    file is open, printed to ``reportfile``.  With ``--invert`` the sense is
    reversed and the IDs that *are* in the image folder are printed to
    ``outfile``.  A summary line is printed to stdout at the end.
    """
    # x.normalized extracts the first entry in the namedtuple Obj_id.
    # valid_idnums is a set of all the IDs in the XML file
    valid_idnums = {x.normalized for x in list_objects(_args.modesfile)}
    if _args.candidatefile:
        candidate_set = build_candidate_set(valid_idnums)
    else:
        candidate_set = valid_idnums
    img_dict = {}
    build_img_dict(img_dict, _args.imgdir)

    numneeded = 0
    for nid in sorted(candidate_set):
        denid = denormalize_id(nid)
        in_folder = nid in img_dict
        if _args.invert:
            if not in_folder:
                trace(2, 'Not in image folder: {}', denid)
                continue
        elif in_folder:
            trace(2, 'In image folder: {}', denid)
            if reportfile:
                print(denid, file=reportfile)
            continue
        # print IDs of objects needed as they are not in the image folder(s).
        # if --invert is set, print the objects in the image folder(s).
        print(denid, file=outfile)
        numneeded += 1

    needq = f'{"not " if _args.invert else ""}needed'
    print(f'Total {needq}: {numneeded} of {len(candidate_set)} candidates.')
def main():
    """
    Copy Object elements from ``infile`` to ``outfile``, updating those
    whose normalized ID is a key in ``newvals``.

    An Object is written if ``one_element`` reports that it was updated or
    if ``--all`` was given.  Each ID that is matched is removed from
    ``newvals`` so that, after the input has been read, whatever remains
    can be reported as present in the CSV file but absent from the XML.
    The global ``nwritten`` counts the Objects written.
    """
    global nwritten
    outfile.write(b'<?xml version="1.0" encoding="UTF-8"?><Interchange>\n')
    for _event, elem in ET.iterparse(infile):
        if elem.tag != 'Object':
            continue
        idelem = elem.find(cfg.record_id_xpath)
        idnum = idelem.text if idelem is not None else None
        nidnum = normalize_id(idnum)
        trace(3, 'idnum: {}', idnum)
        if nidnum and nidnum in newvals:
            updated = one_element(elem, nidnum)
            # Delete with exactly the key that the membership test above
            # found.  Re-deriving the key (for example with .upper()) could
            # raise KeyError or remove a different entry.
            del newvals[nidnum]
        else:
            updated = False
            if _args.missing:
                trace(2, 'Not in CSV file: "{}"', idnum)
        if updated or _args.all:
            outfile.write(ET.tostring(elem, encoding='utf-8'))
            nwritten += 1
        if _args.short:
            break
    outfile.write(b'</Interchange>')
    # Anything still in newvals was never matched by an Object in the XML.
    for nidnum in newvals:
        trace(1, 'In CSV but not XML: "{}"', denormalize_id(nidnum))
def main():
    """
    Pass every Object element in ``infile`` to ``one_object`` and then
    write one section per box to ``writer``.

    Each section consists of two blank rows, a ``Box <location>`` heading,
    a dashed underline and then one row per object in the box holding the
    denormalized ID and the title from ``titledict``.  Boxes, and the
    objects within each box, are written in sorted order.
    """
    for _event, obj in ET.iterparse(infile):
        if obj.tag != 'Object':
            continue
        one_object(obj)
        obj.clear()  # the element has been processed; release its contents

    for box in sorted(boxdict):
        for _ in range(2):
            writer.writerow([''])
        writer.writerow([f'Box {unpad_loc(box)}'])
        writer.writerow(['--------------'])
        for nnum in sorted(boxdict[box]):
            row = [denormalize_id(nnum), titledict[nnum]]
            writer.writerow(row)
if not cat: continue ac = row['Accn. No.'] try: accnum = normalize_id(ac.strip()) except ValueError: print(f'bad accnum: skipping {cat}: "{ac}"') continue if accnum not in objset: print(f'skipping {cat}: "{accnum}"') continue cat2accn[cat] = accnum for filename in imgs: m = re.match(r'(\d+\w?)\.', filename) if m: cat = m.group(1) else: print(f'skipping image: {filename}') continue try: accnum = cat2accn[cat] except KeyError: continue newfilename = denormalize_id(accnum) + '.jpg' source = os.path.join(IMGSPATH, filename) dest = os.path.join(ACCESSIONEDPATH, newfilename) shutil.copy(source, dest) print(f'{dest=}, {source=}') print(accnum, file=includecsvfile)
def one_object(objelt, idnum, exhibition: ExhibitionTuple, catalog_num=''):
    """
    Insert, update or delete an Exhibition element under one Object.

    The existing Exhibition children of ``objelt`` are examined.  Empty
    templates are dropped.  An element matching the exhibition (or the
    "old" key given by the globals ``_oldname``/``_oldplace``/``_olddate``)
    is updated in place, or removed if ``_args.delete`` is set.  If no
    element matches, a new Exhibition element is created.  Finally all of
    the Exhibition elements that are kept are re-inserted in date order
    with the most recent first.

    Sets the global ``found_old_key`` to True when an existing Exhibition
    element matches.

    :param objelt: the Object element; it is modified in place
    :param idnum: the ObjectIdentity/Number text (for trace)
    :param exhibition: the Exhibition tuple corresponding to exhibition_list.py
    :param catalog_num: for the CatalogueNumber element
    :return: None
    """
    global found_old_key

    def new_exhib():
        """
        Build a new Exhibition element from ``exhibition``.

        :return: a tuple containing the BeginDate, the new Exhibition element
        """
        newelt = ET.Element('Exhibition')
        subelt = ET.SubElement(newelt, 'ExhibitionName')
        subelt.text = exhibition.ExhibitionName
        if catalog_num is not None:
            subelt = ET.SubElement(newelt, 'CatalogueNumber')
            subelt.text = str(catalog_num)
        subelt = ET.SubElement(newelt, 'Place')
        subelt.text = exhibition.Place
        dateelt = ET.SubElement(newelt, 'Date')
        subelt = ET.SubElement(dateelt, 'DateBegin')
        subelt.text = modesdate(exhibition.DateBegin)
        subelt = ET.SubElement(dateelt, 'DateEnd')
        subelt.text = modesdate(exhibition.DateEnd)
        return exhibition.DateBegin, newelt

    def one_exhibition(exhib_elt):
        """
        Handle an existing exhibition

        :param exhib_elt: an Exhibition element (under Object)
        :return: 0 if it is an empty template
                 1 if the input element's ExhibitionName is different from
                   the one we are inserting
                 2 (also update the values) if the ExhibitionName values
                   match

        It is possible to have duplicate exhibition names but exhibitions
        are guaranteed to have unique name+place+begindate. So if we are
        replacing the name, make sure that we are updating the correct
        exhibition by checking the complete keys.
        """
        exhibname = exhib_elt.find('ExhibitionName')
        if exhibname is None:
            return 0  # This is an empty Exhibition template

        # Updating the exhibition name is a special case since it's used as
        # a key. The old name is in the XML file and is to be replaced. We
        # could skip this step as the full key compare will work anyhow, but
        # this gets us out of here quickly in most cases.
        if exhibname.text not in (_oldname, exhibition.ExhibitionName):
            return 1  # not a match so just keep this element as is

        # The exhibition names match but if they are duplicated elsewhere in
        # the list, we must look deeper.
        #
        # The parentheses matter: without them the ':' separator binds only
        # to the "else" operand and is lost whenever an old value is given,
        # so the key could never equal the name:place:date key built below.
        exhibkey = (_oldname or exhibition.ExhibitionName) + ':'
        exhibkey += (_oldplace or exhibition.Place) + ':'
        exhibkey += _olddate or exhibition.DateBegin.isoformat()[:10]

        # Element text is None for an empty element; use '' so that the key
        # can still be built and simply fails to match.
        xmlkey = exhibname.text or ''
        xmlplace = ''
        xmldate = ''
        subelts = list(exhib_elt)
        for subelt in subelts:
            tag = subelt.tag
            if tag == "Place":
                xmlplace = subelt.text or ''
            elif tag == "Date":
                for dateelt in subelt:
                    if dateelt.tag == 'DateBegin':
                        xmldate = dateelt.text or ''
                        break
        xmlkey += ':' + xmlplace + ':' + xmldate

        # And finally, confirm that it's really the one we want to update.
        trace(3, '{}: exhibkey={}\nxmlkey={}', idnum, exhibkey, xmlkey)
        if exhibkey != xmlkey:
            return 1

        # The keys match so update the values
        for subelt in subelts:
            tag = subelt.tag
            if tag == "ExhibitionName":
                subelt.text = exhibition.ExhibitionName
            elif tag == "CatalogueNumber":
                subelt.text = str(catalog_num)
            elif tag == "Place":
                subelt.text = exhibition.Place
            elif tag == "Date":
                for dateelt in subelt:
                    if dateelt.tag == 'DateBegin':
                        dateelt.text = modesdate(exhibition.DateBegin)
                    elif dateelt.tag == 'DateEnd':
                        dateelt.text = modesdate(exhibition.DateEnd)
            else:
                # NOTE(review): the first placeholder ("ID {}") receives
                # subelt.text and the second receives display_id; the
                # argument order looks suspicious - confirm the intent.
                trace(
                    1, 'ID {}: Unknown subelt in {} Exhibition element: {},'
                    ' element not updated.', subelt.text, display_id, tag)
        return 2
        # end one_exhibition

    display_id = denormalize_id(idnum)
    trace(2, 'one_element: {} {}', display_id, exhibition)
    elts = list(objelt)  # the children of Object
    exhibs_to_insert = []  # (begindate, element): all current plus any new
    exhibs_to_remove = []  # empty Exhibition template or to be deleted
    firstexix = None  # index of the first Exhibition element
    need_new = True
    for n, elt in enumerate(elts):
        if elt.tag != "Exhibition":
            continue
        if firstexix is None:
            firstexix = n
        status = one_exhibition(elt)
        if status == 0:  # This is an empty Exhibition template
            exhibs_to_remove.append(elt)
            continue
        begindate, _ = datefrommodes(elt.find('./Date/DateBegin').text)
        if status == 1:  # Not this exhibition
            exhibs_to_insert.append((begindate, elt))  # will sort on date
            continue
        # status == 2
        # Sanity check that we got at least one hit on the --old_xxxx
        # parameter
        found_old_key = True
        need_new = False
        if _args.delete:
            exhibs_to_remove.append(elt)
        else:
            exhibs_to_insert.append((begindate, elt))  # will sort on date

    if firstexix is None:  # no Exhibition elements were found
        etype = objelt.get('elementtype')
        trace(1, 'Object number {}: No Exhibition element. etype: {}',
              idnum, etype)
        for n, elt in enumerate(elts):
            if elt.tag == "Acquisition":
                firstexix = n + 1  # insert the new elt after <Acquisition>
                break
        # NOTE(review): if there is no Acquisition element either,
        # firstexix is still None here and the insert below will raise.

    # Remove all the Exhibition elements and re-insert the ones we're
    # keeping in date order.
    for _edate, exhib in exhibs_to_insert:
        objelt.remove(exhib)
    for exhib in exhibs_to_remove:
        objelt.remove(exhib)
    if need_new:
        newexhibit = new_exhib()  # returns a tuple of (date, element)
        exhibs_to_insert.append(newexhibit)

    # Insert the Exhibition elements with the most recent one first: they
    # are taken in ascending date order and each one is inserted at the same
    # index, pushing the earlier ones down.  Sort on the date alone; sorting
    # the whole tuple would try to compare Element objects (TypeError) when
    # two exhibitions have the same begin date.
    for _edate, exhib in sorted(exhibs_to_insert, key=lambda pair: pair[0]):
        objelt.insert(firstexix, exhib)
def test_04(self):
    # An ID with a nine-digit number is expected to come back unchanged.
    self.assertEqual(denormalize_id('JB999999999'), 'JB999999999')
def test_03(self):
    # The zero padding is expected to be stripped from each of the trailing
    # numeric fields of a dotted ID.
    normalized = 'LDHRM.2018.000001.000002'
    self.assertEqual(denormalize_id(normalized), 'LDHRM.2018.1.2')
def test_01(self):
    # A six-digit padded number is expected to be reduced to three digits.
    self.assertEqual(denormalize_id('JB000001'), 'JB001')
def main(argv):
    """
    Extract fields from the top-level records of an XML file to a CSV file.

    Can be called either by __main__ or test_xml2csv.

    :param argv: the argument list, passed to getargs
    :return: a tuple of (number of top-level records read,
             number of rows written, number of fields not found in the XML)
    """
    global _args, _logfile
    _args = getargs(argv)
    inpath = _args.infile
    outpath = _args.outfile
    cfgpath = _args.cfgfile
    _logfile = open(_args.logfile, 'w') if _args.logfile else sys.stdout
    infile = openfile(inpath)
    nlines = 0
    notfound = 0
    nwritten = 0
    Config.reset_config()  # needed by test_xml2csv
    cfgfile = open(cfgpath) if cfgpath else None
    if cfgfile is None:
        trace(
            1,
            'Warning: Config file omitted. Only accession numbers will be '
            'output.'
        )
    config = Config(cfgfile, dump=_args.verbose >= 2, logfile=_logfile)
    outcsv, outfile = opencsvwriter(outpath, config.delimiter)
    outlist = []
    titles = yaml_fieldnames(config)
    trace(1, 'Columns: {}', ', '.join(titles))
    if _args.heading:
        outcsv.writerow(titles)
    else:
        trace(1, 'Heading row not written.')

    if _args.object:
        expanded = [normalize_id(obj) for obj in expand_idnum(_args.object)]
        includeset = set(expanded)  # JB001-002 -> JB001, JB002
        includes = dict.fromkeys(includeset)
    else:
        includes = read_include_dict(_args.include, _args.include_column,
                                     _args.include_skip, _args.verbose,
                                     logfile=_logfile,
                                     allow_blanks=_args.allow_blanks)

    depth = 0  # nesting level of record elements
    for event, elem in ET.iterparse(infile, events=('start', 'end')):
        is_record = elem.tag == config.record_tag  # default: Object
        if event == 'start':
            if is_record:
                depth += 1
            continue
        # It's an "end" event.
        if not is_record:
            continue
        depth -= 1
        if depth:
            continue  # It's not a top level Object.

        idelem = elem.find(config.record_id_xpath)
        if idelem is not None:
            idnum = idelem.text
        else:
            idnum = ''
        trace(3, 'idnum: {}', idnum)
        nlines += 1
        if not config.select(elem, includes, exclude=_args.exclude):
            continue
        norm_idnum = normalize_id(idnum, _args.mdacode,
                                  verbose=_args.verbose)

        # We have selected the id but only write the row if there is
        # something to display. There will always be at least the ID number
        # in the first column unless skip_number was specified in the
        # config.
        data = []
        writerow = not config.skip_number
        if writerow:
            # Insert the ID number as the first column.
            data.append(norm_idnum)
        for document in config.col_docs:
            text, command = one_document(document, elem, config)
            if text is None:
                notfound += 1
                trace(2, '{}: cmd: {}, "{}" is not found in XML.', idnum,
                      command, document[Stmt.TITLE])
                text = ''
            if text:
                writerow = True
            data.append(text)
        if writerow:
            nwritten += 1
            outlist.append(data)
            trace(3, '{} written.', idnum)
        elem.clear()
        if includes and not _args.exclude:
            includes.pop(norm_idnum)
        if _args.short:
            break

    if config.sort_numeric:
        outlist.sort(key=lambda x: int(x[0]))
    else:
        outlist.sort()

    # Create a list of flags indicating whether the value needs to be
    # de-normalized.
    norm = [] if config.skip_number else [True]  # True: the Serial number
    norm.extend(Stmt.NORMALIZE in doc
                for doc in config.col_docs
                if doc[Stmt.CMD] not in Cmd.get_control_cmds())
    for row in outlist:
        for n, (flag, cell) in enumerate(zip(norm, row)):
            if flag:
                row[n] = denormalize_id(cell, _args.mdacode)
        outcsv.writerow(row)

    infile.close()
    if cfgfile:
        cfgfile.close()
    outfile.close()

    if includes:
        trace(1, '{} items in include list not in XML.', len(includes))
        if _args.verbose > 1:
            print('In include list but not xml:', file=_logfile)
            for accnum in includes:
                print(accnum, file=_logfile)
    if not _args.bom:
        trace(1, 'BOM not written.')
    return nlines, nwritten, notfound