def get_csv_dict(csvfile):
    """
    Build a mapping from normalized accession number to exhibition data.

    :param csvfile: path of a CSV file that contains the accession number
        in the column specified by --col_acc, optionally the exhibition
        number in the column specified by --col_ex, and optionally the
        catalog number in the column specified by --col_cat.
    :return: A dict with the key of the accession number and the value being
        a tuple of (exhibition number, catalogue number).
    :raises KeyError: if the same accession number appears more than once.
    """
    def one_accession_number(accno):
        # NOTE: reads ``row``, ``exhibition`` and ``cdict`` from the
        # enclosing scope; called once per (possibly range-expanded) id.
        # print(f'{row=}')
        try:
            accnum = normalize_id(accno)
        except ValueError:
            # Malformed id: report it and skip, don't abort the whole file.
            print(f"Skipping in csv: {accno}")
            return
        if accnum in cdict:
            raise KeyError(f'Duplicate accession number: {accnum}')
        cataloguenumber = None
        if _args.col_cat is not None:
            cataloguenumber = row[_args.col_cat]
            try:  # convert "33." to 33
                cataloguenumber = int(float(cataloguenumber))
            except ValueError:
                pass  # ok, doesn't have to be an integer
        # print(row)
        # print(exhibition, cataloguenumber)
        cdict[accnum] = (exhibition, cataloguenumber)
    # 'utf-8-sig' transparently strips a leading BOM if the file has one.
    with codecs.open(csvfile, 'r', 'utf-8-sig') as mapfile:
        cdict = {}
        reader = csv.reader(mapfile)
        # Skip leading non-data rows as requested on the command line.
        for n in range(_args.skiprows):
            next(reader)
        for row in reader:
            accnumber = row[_args.col_acc]
            if not accnumber:
                continue  # blank accession number
            if _args.exhibition:
                # A single exhibition number was given on the command line
                # and applies to every row.
                exhibition = _args.exhibition
            else:
                col_ex = _args.col_ex
                try:
                    exhibition = int(row[col_ex])
                except (IndexError, ValueError) as e:
                    if _args.allow_missing:
                        trace(2, 'Missing exhibition number, skipping {}',
                              accnumber)
                        continue
                    print(f'Missing column {col_ex}, accession #: {accnumber}')
                    raise e
            # The "accnumber" might actually be a range of accession numbers
            # in the form JB001-002:
            accnumlist = expand_idnum(accnumber)
            for accn in accnumlist:
                one_accession_number(accn)
    trace(2, 'get_csv_dict: {}', cdict)
    return cdict
def loadcsv():
    """
    Read the CSV file containing objectid -> location mappings, specified
    by the --mapfile argument.

    :return: the dictionary containing the mappings
    """
    location_dict = {}
    # The validate subcommand never consults the map.
    if _args.subp == 'validate':
        return location_dict
    loc_arg = _args.location
    heading_pending = bool(_args.heading)
    rownum = 0
    with codecs.open(_args.mapfile, 'r', 'utf-8-sig') as mapfile:
        for row in csv.reader(mapfile):
            rownum += 1
            trace(3, 'row: {}', row)
            if heading_pending:
                heading_pending = False
                # if --location is given just skip the first row
                if loc_arg:
                    continue
                if (row[_args.col_loc].strip().lower()
                        != _args.heading.lower()):
                    print(f'Fatal error: Failed heading check. '
                          f'{row[_args.col_loc].lower()} is not '
                          f'{_args.heading.lower()}.')
                    sys.exit(1)
                continue
            objid = row[_args.col_acc].strip().upper()
            if not objid and ''.join(row):
                trace(2, 'Skipping row with blank object id: {}', row)
                continue
            # An id may denote a range such as JB001-002; record each one.
            for one_id in expand_idnum(objid):
                nobjid = nd.normalize_id(one_id)
                if not nobjid:
                    print(f'Warning: Blank object ID row {rownum}: {row}')
                    continue  # blank number
                if nobjid in location_dict:
                    print(
                        f'Fatal error: Duplicate object ID row {rownum}: {row}.'
                    )
                    sys.exit(1)
                if loc_arg:
                    location_dict[nobjid] = loc_arg
                else:
                    location_dict[nobjid] = row[_args.col_loc].strip()
    return location_dict
def main():
    """
    Stream Object elements from ``infile`` to ``outfile``, updating those
    whose normalized id appears in the accession-number map with exhibition
    and catalogue data via one_object().

    Reads module-level ``infile``, ``outfile`` and ``_args``.
    """
    outfile.write(b'<?xml version="1.0"?><Interchange>\n')
    if _args.object:
        objlist = expand_idnum(_args.object)  # JB001-002 -> JB001, JB002
        exmap = {normalize_id(obj):  # JB001 -> JB00000001
                 (_args.exhibition, _args.catalogue) for obj in objlist}
    else:
        exmap = get_csv_dict(
            _args.mapfile)  # acc # -> (exhibition #, catalog #)
    exdict = get_exhibition_dict()  # exhibition # -> Exhibition tuple
    written = 0
    numupdated = 0
    # iterparse with default events yields each element at its "end" event,
    # so the Object subtree is complete when we see it.
    for event, elem in ET.iterparse(infile):
        if elem.tag != 'Object':
            continue
        idelem = elem.find(Stmt.get_default_record_id_xpath())
        idnum = idelem.text if idelem is not None else None
        idnum = normalize_id(idnum)
        trace(3, 'idnum: {}', idnum)
        if idnum and idnum in exmap:
            exnum, cataloguenumber = exmap[idnum]
            one_object(elem, idnum, exdict[exnum], cataloguenumber)
            # Remove handled entries so leftovers can be reported below.
            del exmap[idnum]
            updated = True
            numupdated += 1
        else:
            updated = False
        if updated or _args.all:
            outfile.write(ET.tostring(elem, encoding='utf-8'))
            written += 1
        if updated and _args.short:
            break  # --short: stop after the first update
    outfile.write(b'</Interchange>')
    # Anything still in the map was never matched in the XML.
    for idnum in exmap:
        trace(1, 'In CSV but not XML: "{}"', idnum)
    trace(
        1, f'End exhibition.py. {written} object'
        f'{"s" if written != 1 else ""} written '
        f'of which {numupdated} updated.')
def main(argv):
    # can be called either by __main__ or test_xml2csv
    """
    Convert selected records of an XML file to CSV rows according to the
    configuration file.

    :param argv: command-line argument list passed to getargs().
    :return: tuple of (nlines, nwritten, notfound) counts.
    """
    global _args, _logfile
    _args = getargs(argv)
    infilename = _args.infile
    outfilename = _args.outfile
    cfgfilename = _args.cfgfile
    if _args.logfile:
        _logfile = open(_args.logfile, 'w')
    else:
        _logfile = sys.stdout
    infile = openfile(infilename)
    nlines = notfound = nwritten = 0
    Config.reset_config()  # needed by test_xml2csv
    if cfgfilename:
        cfgfile = open(cfgfilename)
    else:
        cfgfile = None
        trace(
            1,
            'Warning: Config file omitted. Only accession numbers will be output.'
        )
    config = Config(cfgfile, dump=_args.verbose >= 2, logfile=_logfile)
    outcsv, outfile = opencsvwriter(outfilename, config.delimiter)
    outlist = []
    titles = yaml_fieldnames(config)
    trace(1, 'Columns: {}', ', '.join(titles))
    if not _args.heading:
        trace(1, 'Heading row not written.')
    if _args.heading:
        outcsv.writerow(titles)
    objectlevel = 0
    if _args.object:
        expanded = [normalize_id(obj) for obj in expand_idnum(_args.object)]
        includeset = set(expanded)  # JB001-002 -> JB001, JB002
        includes = dict.fromkeys(includeset)
    else:
        includes = read_include_dict(_args.include, _args.include_column,
                                     _args.include_skip, _args.verbose,
                                     logfile=_logfile,
                                     allow_blanks=_args.allow_blanks)
    # Track nesting depth of record tags so only top-level records are
    # processed ("start" events increment, "end" events decrement).
    for event, elem in ET.iterparse(infile, events=('start', 'end')):
        # print(event)
        if event == 'start':
            # print(elem.tag)
            if elem.tag == config.record_tag:
                objectlevel += 1
            continue
        # It's an "end" event.
        if elem.tag != config.record_tag:  # default: Object
            continue
        objectlevel -= 1
        if objectlevel:
            continue  # It's not a top level Object.
        data = []
        idelem = elem.find(config.record_id_xpath)
        idnum = idelem.text if idelem is not None else ''
        trace(3, 'idnum: {}', idnum)
        nlines += 1
        writerow = config.select(elem, includes, exclude=_args.exclude)
        # print(f'{writerow=}')
        if not writerow:
            continue
        norm_idnum = normalize_id(idnum, _args.mdacode,
                                  verbose=_args.verbose)
        # We have selected the id but only write the row if there is something
        # to display. There will always be at least the ID number in the first
        # column unless skip_number was specified in the config.
        if config.skip_number:
            writerow = False
        else:
            # Insert the ID number as the first column.
            data.append(norm_idnum)
        for document in config.col_docs:
            text, command = one_document(document, elem, config)
            # print(f'{command=}')
            if text is None:
                notfound += 1
                trace(2, '{}: cmd: {}, "{}" is not found in XML.', idnum,
                      command, document[Stmt.TITLE])
                text = ''
            if text:
                writerow = True
            data.append(text)
        if writerow:
            nwritten += 1
            outlist.append(data)
            trace(3, '{} written.', idnum)
        elem.clear()  # free memory for the processed subtree
        if includes and not _args.exclude:
            includes.pop(norm_idnum)
        if _args.short:
            break  # --short: process only the first selected record
    if config.sort_numeric:
        outlist.sort(key=lambda x: int(x[0]))
    else:
        outlist.sort()
    # Create a list of flags indicating whether the value needs to be
    # de-normalized.
    norm = []
    if not config.skip_number:
        norm.append(True)  # for the Serial number
    for doc in config.col_docs:
        if doc[Stmt.CMD] in Cmd.get_control_cmds():
            continue
        norm.append(Stmt.NORMALIZE in doc)
    lennorm = len(norm)
    for row in outlist:
        for n, cell in enumerate(row[:lennorm]):
            if norm[n]:
                row[n] = denormalize_id(cell, _args.mdacode)
        outcsv.writerow(row)
    infile.close()
    if cfgfile:
        cfgfile.close()
    outfile.close()
    # Report ids requested via the include list but never seen in the XML.
    if includes and len(includes):
        trace(1, '{} items in include list not in XML.', len(includes))
        if _args.verbose > 1:
            print('In include list but not xml:', file=_logfile)
            for accnum in includes:
                print(accnum, file=_logfile)
    if not _args.bom:
        trace(1, 'BOM not written.')
    return nlines, nwritten, notfound
called_from_sphinx = True if __name__ == '__main__': called_from_sphinx = False assert sys.version_info >= (3, 6) is_diff = sys.argv[1] == 'diff' is_select = sys.argv[1] == 'select' is_update = sys.argv[1] == 'update' is_validate = sys.argv[1] == 'validate' _args = getargs(sys.argv) verbose = _args.verbose if is_update and _args.object: if not _args.location: raise (ValueError('You specified the object id. You must also ' 'specify the location.')) objectlist = expand_idnum(_args.object) newlocs = {nd.normalize_id(obj): _args.location for obj in objectlist} trace(2, 'Object(s) specified, newlocs= {}', newlocs) else: newlocs = loadcsv() total_in_csvfile = len(newlocs) total_updated = total_written = 0 total_failed = total_objects = 0 # validate only infile = open(_args.infile, encoding=_args.encoding) if is_update: outfile = open(_args.outfile, 'wb') else: outfile = None main() if is_update: print(f'Total Updated: {total_updated}/{total_in_csvfile}\n'
def test_01(self):
    """Lower-case input is not folded to the upper-case form."""
    result = expand_idnum('jb001')
    self.assertNotEqual(result, ['JB001'])
def test_19(self):
    """A one-digit range end replaces the trailing digit of the start id."""
    expected = ['SH104', 'SH105']
    self.assertEqual(expand_idnum('SH104-5'), expected)
def test_06(self):
    """A range end with more digits than the start expands correctly."""
    self.assertEqual(expand_idnum('SH9-10'), ['SH9', 'SH10'])
# Script driver: parse arguments, open the input/output targets, load the
# configuration and include list, then delegate to main().
objcount = selcount = 0
object_number = ''
_args = getargs()
infile = open(_args.infile)
if _args.directory:
    # Output is a directory of files; refuse to clobber a non-empty one
    # unless --force is given.
    outfile = _args.outfile
    ld = os.listdir(outfile)
    if ld and not _args.force:
        print(f'Directory {outfile} is not empty. Exiting.')
        sys.exit()
else:
    outfile = open(_args.outfile, 'wb')
if _args.cfgfile:
    cfgfile = open(_args.cfgfile)
else:
    cfgfile = None
config = Config(cfgfile, dump=_args.verbose >= 2)
if _args.object:
    expanded = [normalize_id(obj) for obj in expand_idnum(_args.object)]
    includeset = set(expanded)  # JB001-002 -> JB001, JB002
    includes = dict.fromkeys(includeset)
else:
    includes = read_include_dict(_args.include, _args.include_column,
                                 _args.include_skip, _args.verbose)
main()
basename = os.path.basename(sys.argv[0])
print(
    f'{selcount} object{"" if selcount == 1 else "s"} selected from {objcount}.'
)
print(f'End {basename.split(".")[0]}')
def test_15(self):
    """Leading zeros in the range end do not pad the generated ids."""
    got = expand_idnum('SH1-02')
    self.assertEqual(got, ['SH1', 'SH2'])
def test_14(self):
    """Mixed '&' and ',' separators expand to the individual ids."""
    expected = ['LDHRM.2021.2', 'LDHRM.2021.17', 'JB001']
    self.assertEqual(expand_idnum('LDHRM.2021.2 & 17,JB001'), expected)
def test_11(self):
    """A dotted-prefix range expands over its final numeric component."""
    expected = [f'LDHRM.2021.{n}' for n in range(2, 18)]
    self.assertEqual(expand_idnum('LDHRM.2021.2-17'), expected)
def test_10(self):
    """A large range keeps zero padding only while widths still match."""
    expected = ['JB08', 'JB09']
    expected += [f'JB{n}' for n in range(10, 1111)]
    self.assertEqual(expand_idnum('JB08-1110'), expected)
def test_09(self):
    """A zero-padded range preserves the padding of the start id."""
    self.assertEqual(expand_idnum('JB001-002'), ['JB001', 'JB002'])
def test_07(self):
    """A two-digit range crossing a tens boundary expands correctly."""
    got = expand_idnum('JB09-10')
    self.assertEqual(got, ['JB09', 'JB10'])
def test_16(self):
    """Multiple '&'-joined suffixes share the prefix of the first id."""
    expected = ['SH1', 'SH2', 'SH3']
    self.assertEqual(expand_idnum('SH1&2&3'), expected)
def test_17(self):
    """A range end smaller than the start's last digits extends upward."""
    got = expand_idnum('SH10-3')
    self.assertEqual(got, ['SH10', 'SH11', 'SH12', 'SH13'])
def test_one_idnum(self):
    """Expanding every id in IDNUM_TESTS yields IDNUM_RESULTS in order."""
    expanded = []
    for num in IDNUM_TESTS:
        expanded.extend(expand_idnum(num))
    self.assertEqual(expanded, IDNUM_RESULTS)
def test_02(self):
    """A plain id with no range or separator is returned unchanged."""
    self.assertEqual(expand_idnum('JB001'), ['JB001'])