Exemple #1
0
def main():
    """
    Copy Object elements from the input XML to the output, updating some.

    An object is updated (via one_object) when its normalized id starts
    with 'SH' or is exactly 'JB00000314'. Updated objects are always
    written; other objects are written only when --all is set. With
    --short the loop stops after the first updated object. Finally a
    summary is traced and every id still in ``conserved`` is printed as
    "Not processed" (presumably one_object removes the ids it handles --
    confirm against one_object).
    """
    outfile.write(b'<?xml version="1.0"?><Interchange>\n')
    written = numupdated = 0
    conserved, condition = make_conserved()
    for _event, obj in ET.iterparse(infile):
        if obj.tag != 'Object':
            continue
        id_node = obj.find(Stmt.get_default_record_id_xpath())
        raw_id = None if id_node is None else id_node.text
        idnum = normalize_id(raw_id)
        trace(3, 'idnum: {}', idnum)
        updated = idnum.startswith('SH') or idnum == 'JB00000314'
        if updated:
            one_object(obj, idnum, conserved, condition)
            numupdated += 1
        if updated or _args.all:
            outfile.write(ET.tostring(obj, encoding='utf-8'))
            written += 1
        if updated and _args.short:
            break
    outfile.write(b'</Interchange>')
    plural = '' if written == 1 else 's'
    trace(
        1, f'End SH_acquisition.py. {written} object{plural} written '
        f'of which {numupdated} updated.')
    for leftover in conserved:
        print(f'Not processed: {leftover}')
Exemple #2
0
def main():
    """
    Copy Object elements from the input XML to the output, applying the
    new values loaded from the CSV file.

    For every Object whose normalized id is a key of ``newvals``,
    one_element() is called and the entry is removed from ``newvals``.
    Objects are written when one_element() reports an update or when --all
    is set. With --short the loop stops after the first Object. Ids left
    in ``newvals`` at the end are reported as present in the CSV file but
    absent from the XML.
    """
    global nwritten
    outfile.write(b'<?xml version="1.0" encoding="UTF-8"?><Interchange>\n')
    for event, elem in ET.iterparse(infile):
        if elem.tag != 'Object':
            continue
        idelem = elem.find(cfg.record_id_xpath)
        idnum = idelem.text if idelem is not None else None
        nidnum = normalize_id(idnum)
        trace(3, 'idnum: {}', idnum)
        if nidnum and nidnum in newvals:
            updated = one_element(elem, nidnum)
            # Delete with the exact key that was just tested for membership.
            # The previous code deleted nidnum.upper(), which raises KeyError
            # (or removes a different entry) whenever the two differ.
            del newvals[nidnum]
        else:
            updated = False
            if _args.missing:
                trace(2, 'Not in CSV file: "{}"', idnum)
        if updated or _args.all:
            outfile.write(ET.tostring(elem, encoding='utf-8'))
            nwritten += 1
        if _args.short:
            break
    outfile.write(b'</Interchange>')
    # Whatever is still in newvals was never matched by an XML object.
    for nidnum in newvals:
        trace(1, 'In CSV but not XML: "{}"', denormalize_id(nidnum))
Exemple #3
0
def handle_diff(idnum, elem):
    """
    Report a difference between an object's location in the XML and the
    location given on the command line or in the CSV mapping.

    Only the first ObjectLocation whose element type is selected by
    --normal / --current is examined. When the new location comes from
    ``newlocs`` the entry is removed from that dict once it has been used.

    :param idnum: the object's id as found in the XML
    :param elem: the Object element
    :return: None
    """
    if _args.all and idnum not in newlocs and _args.warn:
        trace(3, 'Not in CSV file: {}', idnum)
        return
    for objloc in elem.findall('./ObjectLocation'):
        loctype = objloc.get(ELEMENTTYPE)
        wanted = ((_args.normal and loctype == NORMAL_LOCATION)
                  or (_args.current and loctype == CURRENT_LOCATION))
        if not wanted:
            continue
        location = objloc.find('./Location')
        text = (None if location.text is None
                else location.text.strip().upper())
        if _args.location:
            newtext = _args.location
        else:
            nidnum = nd.normalize_id(idnum)
            newtext = newlocs.get(nidnum, None)
            if newtext is None:
                return  # nothing to compare against
            del newlocs[nidnum]
        trace(2, 'New location for {}: {}', idnum, newtext)
        if text != newtext:
            trace(1, 'Different {}: XML: {}, CSV: {}', idnum, text,
                  newtext)
        break  # only the first matching ObjectLocation is considered
Exemple #4
0
def main():
    """
    Copy Object elements from the input XML to the output, adding
    exhibition information to the objects named in the mapping.

    The mapping (normalized accession id -> (exhibition #, catalogue #))
    is built from --object when given, otherwise read from the map file.
    Matched objects are passed to one_object() and removed from the
    mapping; ids still in the mapping at the end are reported. Objects
    are written when updated or when --all is set, and --short stops
    after the first updated object.
    """
    outfile.write(b'<?xml version="1.0"?><Interchange>\n')
    if _args.object:
        # JB001-002 -> JB001, JB002; each key is normalized,
        # e.g. JB001 -> JB00000001
        exmap = {}
        for obj in expand_idnum(_args.object):
            exmap[normalize_id(obj)] = (_args.exhibition, _args.catalogue)
    else:
        # acc # -> (exhibition #, catalog #)
        exmap = get_csv_dict(_args.mapfile)
    exdict = get_exhibition_dict()  # exhibition # -> Exhibition tuple
    written = numupdated = 0
    for _event, obj_elem in ET.iterparse(infile):
        if obj_elem.tag != 'Object':
            continue
        id_node = obj_elem.find(Stmt.get_default_record_id_xpath())
        raw_id = None if id_node is None else id_node.text
        idnum = normalize_id(raw_id)
        trace(3, 'idnum: {}', idnum)
        updated = bool(idnum and idnum in exmap)
        if updated:
            exnum, cataloguenumber = exmap[idnum]
            one_object(obj_elem, idnum, exdict[exnum], cataloguenumber)
            del exmap[idnum]
            numupdated += 1
        if updated or _args.all:
            outfile.write(ET.tostring(obj_elem, encoding='utf-8'))
            written += 1
        if updated and _args.short:
            break
    outfile.write(b'</Interchange>')
    for unmatched in exmap:
        trace(1, 'In CSV but not XML: "{}"', unmatched)
    plural = '' if written == 1 else 's'
    trace(
        1, f'End exhibition.py. {written} object{plural} written '
        f'of which {numupdated} updated.')
Exemple #5
0
def one_object(elt):
    """
    Record one Object element in the module-level ``boxdict`` and
    ``titledict``.

    The normalized object number is appended to the list stored in
    ``boxdict`` under the object's padded current location ('unknown' when
    the location element is missing or empty), and the title is stored in
    ``titledict`` under the normalized number.

    :param elt: the Object element
    """
    num = elt.find('./ObjectIdentity/Number').text
    loc_elem = elt.find(
        './ObjectLocation[@elementtype="current location"]/Location')
    has_location = loc_elem is not None and loc_elem.text
    location = pad_loc(loc_elem.text) if has_location else 'unknown'
    title = elt.find('./Identification/Title').text

    key = normalize_id(num)
    boxdict[location].append(key)
    titledict[key] = title
Exemple #6
0
def make_conserved() -> tuple[set[str], dict[str, str]]:
    """
    Load the list of conserved objects and their conditions.

    CSVFILE is read line by line; a line that does not start with 'SH' or
    'JB' gets an 'SH' prefix before being normalized. CONDFILE is read as
    CSV with the id in column 0 and the condition in column 1. Duplicate
    ids in either file are reported on stdout; for conditions the later
    row replaces the earlier one.

    Both files are now closed when reading finishes (they were previously
    left open).

    :return: (set of normalized conserved ids,
              dict of normalized id -> condition text)
    """
    conserved = set()
    with codecs.open(CSVFILE, 'r', 'utf-8-sig') as confile:
        trace(1, 'using list of conserved objects in: {}', confile.name)
        for row in confile:
            row = row.strip()
            if not row.startswith(('SH', 'JB')):
                row = 'SH' + row
            normid = normalize_id(row)
            if normid in conserved:
                print(f'Duplicate id: {normid}')
            conserved.add(normid)
    print(f'{len(conserved)=}')
    condition = {}
    with codecs.open(CONDFILE, 'r', 'utf-8-sig') as condfile:
        trace(1, 'using condition of conserved objects in: {}',
              condfile.name)
        for row in csv.reader(condfile):
            normid = normalize_id(row[0])
            if normid in condition:
                print(f'Duplicate id: {normid}')
            condition[normid] = row[1]
    print(f'{len(condition)=}')
    return conserved, condition
Exemple #7
0
def loadnewvals(allow_blanks=False):
    """
    Read the CSV map file of object id -> new element values.

    The first ``_args.skip_rows`` rows are skipped. When ``_args.heading``
    is set, the next row is checked (case-insensitively) against the
    titles of the configured columns, ignoring the first CSV column; a
    mismatch prints a message and exits with status 1.

    :param allow_blanks: if True, rows with a blank accession number are
            skipped. Otherwise a ValueError exception is raised.
    :return: a dict whose key is the normalized object id and whose value
             is the list of the remaining (stripped) columns of the row
    """
    mapping = {}
    with codecs.open(_args.mapfile, 'r', 'utf-8-sig') as mapfile:
        reader = csv.reader(mapfile)
        for _ in range(_args.skip_rows):  # default = 0
            skipped = next(reader)
            if _args.verbose >= 1:
                print(f'Skipping row in map file: {skipped}')
        if _args.heading:
            # The heading row must name the same columns as the title
            # statements of the YAML file.
            headings = iter([cell.strip() for cell in next(reader)])
            next(headings)  # skip Serial column
            for doc in cfg.col_docs:
                col = next(headings)
                title = doc[Stmt.TITLE]
                if col.lower() != title.lower():
                    print(f'Mismatch on heading: "{title}" in config !='
                          f' "{col}" in CSV file')
                    sys.exit(1)

        for rawrow in reader:
            cells = [cell.strip() for cell in rawrow]
            idnum = cells[0]
            if idnum:
                # Drop the accession number so the remaining list lines up
                # with the columns in the config file. This relies on the
                # accession number being the first column.
                mapping[normalize_id(idnum)] = cells[1:]
                continue
            if not allow_blanks:
                raise ValueError('Blank accession number in include file;'
                                 ' --allow_blank not selected.')
            trace(2, 'Row with blank accession number skipped: {}',
                  cells)
    return mapping
Exemple #8
0
def dir2list(jpegdir, normalize=False):
    """
    List the accession numbers encoded in the .jpg file names of a
    directory.

    A file name contributes the text between an optional leading
    'collection_' and '.jpg'. Names that do not match are reported on
    stdout and skipped.

    :param jpegdir: directory to scan
    :param normalize: if True, pass each accession number through
                      normalize_id; a number that raises ValueError is
                      kept un-normalized
    :return: the sorted list of accession numbers
    """
    accessions = []
    for name in os.listdir(jpegdir):
        matched = re.match(r'(collection_)?(.+)\.jpg', name)
        if matched is None:
            print(f'dir2list skipping: {name}')
            continue
        accn = matched.group(2)
        if normalize:
            try:
                accn = normalize_id(accn, verbose=2)
            except ValueError:
                pass  # keep the raw name when it cannot be normalized
        accessions.append(accn)
    accessions.sort()
    return accessions
Exemple #9
0
def loadcsv():
    """
    Read the CSV file named by --mapfile into an object id -> location
    dict.

    Nothing is read for the 'validate' subcommand. When --heading is
    given the first row is treated as a heading; unless --location is
    also given, its location column must equal the heading
    (case-insensitively) or the program exits. Each object id cell is
    expanded with expand_idnum and normalized; a duplicate id is fatal.

    :return: the dictionary containing the mappings (empty for 'validate')
    """
    location_dict = {}
    if _args.subp == 'validate':
        return location_dict
    loc_arg = _args.location
    need_heading = bool(_args.heading)
    with codecs.open(_args.mapfile, 'r', 'utf-8-sig') as mapfile:
        for rownum, row in enumerate(csv.reader(mapfile), start=1):
            trace(3, 'row: {}', row)
            if need_heading:
                # if --location is given just skip the first row
                if not loc_arg:
                    found = row[_args.col_loc].strip().lower()
                    if found != _args.heading.lower():
                        print(f'Fatal error: Failed heading check. '
                              f'{row[_args.col_loc].lower()} is not '
                              f'{_args.heading.lower()}.')
                        sys.exit(1)
                need_heading = False
                continue
            objid = row[_args.col_acc].strip().upper()
            if not objid and ''.join(row):
                trace(2, 'Skipping row with blank object id: {}', row)
                continue
            for one_id in expand_idnum(objid):
                nobjid = nd.normalize_id(one_id)
                if not nobjid:
                    print(f'Warning: Blank object ID row {rownum}: {row}')
                    continue  # blank number
                if nobjid in location_dict:
                    print(
                        f'Fatal error: Duplicate object ID row {rownum}: {row}.'
                    )
                    sys.exit(1)
                # --location overrides whatever the row says.
                location_dict[nobjid] = (loc_arg
                                         or row[_args.col_loc].strip())
    return location_dict
Exemple #10
0
def main():
    """
    Write the Object elements of the input XML to the output sorted by
    normalized object number.

    When a number occurs more than once, only the first occurrence is
    kept and a message is printed for each later one.
    """
    serialized = {}
    outfile.write(b'<?xml version="1.0" encoding="utf-8"?><Interchange>\n')
    objects = (el for _event, el in ET.iterparse(infile)
               if el.tag == 'Object')
    for seq, obj in enumerate(objects, start=1):
        num = normalize_id(obj.find('./ObjectIdentity/Number').text,
                           _args.mdacode)
        if num in serialized:
            print(f'seq {seq}, ID {num} is a duplicate, ignored.')
            continue
        serialized[num] = ET.tostring(obj, encoding='utf-8').strip()
    for num in sorted(serialized):
        outfile.write(serialized[num])
        outfile.write(b'\n')
    outfile.write(b'</Interchange>')
Exemple #11
0
 def onefile(imgf: str):
     """
     Register one image file name in the enclosing scope's ``img_ids``.

     A leading 'collection_' is dropped, only .jpg/.png files are
     considered, and the remaining stem is normalized. The first file seen
     for an id is stored as (name, dirpath); later ones are reported as
     duplicates.

     :param imgf: the file name (without directory)
     """
     imgf2 = imgf.removeprefix('collection_')
     stem, ext = os.path.splitext(imgf2)
     if ext.lower() not in ('.jpg', '.png'):
         hidden = imgf.startswith('.')  # ignore .DS_Store
         if _args.verbose > 1 and not hidden:
             print('not image:', imgf)
         return
     try:
         nid = normalize_id(stem)
     except ValueError as ve:
         print(f'Skipping {imgf}: {ve}')
         return
     if nid not in img_ids:
         img_ids[nid] = (imgf2, dirpath)
         return
     # NOTE(review): element [0] of the stored tuple is the file name, not
     # the directory -- confirm whether [1] was intended here.
     print(
         f'Duplicate: {stem} in {dirpath.removeprefix(_args.imgdir)},'
         f'original in {img_ids[nid][0].removeprefix(_args.imgdir)}')
Exemple #12
0
 def onefile(imgf: str):
     """
     Register one image file name in the enclosing scope's ``img_ids``.

     An optional leading 'collection_' is removed, only .jpg/.png files
     are considered, and the remaining stem is normalized. The first file
     seen for an id is stored as (name, dirpath); later ones are reported
     as duplicates.

     :param imgf: the file name (without directory)
     """
     # Group 2 is the name without the optional leading 'collection_'.
     imgf2 = re.match(r'(collection_)?(.*)', imgf).group(2)
     stem, ext = os.path.splitext(imgf2)
     if ext.lower() not in ('.jpg', '.png'):
         if _args.verbose > 1:
             print('not image:', imgf)
         return
     try:
         nid = normalize_id(stem)
     except ValueError:
         print(f'Skipping {imgf}')
         return
     if nid not in img_ids:
         img_ids[nid] = (imgf2, dirpath)
         return
     # NOTE(review): element [0] of the stored tuple is the file name, not
     # the directory -- confirm whether [1] was intended here.
     print(
         f'Duplicate: {stem} in {dirpath.removeprefix(_args.imgdir)},'
         f'original in {img_ids[nid][0].removeprefix(_args.imgdir)}')
Exemple #13
0
 def add_one_id(candidate):
     """
     :param candidate: filename with trailing .csv removed
     :return: None. The nonlocal candidate_set is updated if the name
              was valid.
     """
     nonlocal notinmodes
     candidate2 = candidate.removeprefix('collection_')
     try:
         normid = normalize_id(candidate2)
     except ValueError as ve:
         if not candidate2.startswith('.'):
             trace(1, '{}', ve)
         return
     if normid in valid_idnums:
         candidate_set.add(normid)
     else:
         trace(2, 'Skipping {}, not in Modes.', candidate2)
         notinmodes += 1
Exemple #14
0
def main():
    """
    Open every image in the directory given as the first command-line
    argument, one at a time, ordered by normalized id.

    Each file is shown with ``open -W`` and the loop waits for that
    command to return. Files whose extension is not .jpg/.jpeg/.png are
    reported and skipped. Ctrl-C prints a message and exits with
    status 1.
    """
    targetdir = sys.argv[1]

    try:
        keyed = [(normalize_id(os.path.splitext(name)[0], strict=False),
                  name)
                 for name in os.listdir(targetdir)]
        keyed.sort(key=lambda pair: pair[0])
        numtargets = len(keyed)
        for ntarg, (_key, target) in enumerate(keyed, start=1):
            print(f'file {ntarg} of {numtargets}: {target}')
            extension = os.path.splitext(target)[1]
            if extension.lower() not in ('.jpg', '.jpeg', '.png'):
                print('skipping', target)
                continue
            subprocess.run(['open', '-W', os.path.join(targetdir, target)])
    except KeyboardInterrupt:
        print('\nExiting.')
        sys.exit(1)
Exemple #15
0
 def one_accession_number(accno):
     """
     Store (exhibition, catalogue number) for one accession number in the
     enclosing scope's ``cdict``.

     The catalogue number is taken from column ``_args.col_cat`` of the
     enclosing ``row`` when that option is set, and converted to an int
     when it parses as a number; otherwise it is left unchanged (or None
     when the option is unset).

     :param accno: the accession number as found in the CSV file
     :raises KeyError: if the normalized number is already in ``cdict``
     """
     try:
         accnum = normalize_id(accno)
     except ValueError:
         print(f"Skipping in csv: {accno}")
         return
     if accnum in cdict:
         raise KeyError(f'Duplicate accession number: {accnum}')
     if _args.col_cat is None:
         cataloguenumber = None
     else:
         cataloguenumber = row[_args.col_cat]
         try:
             # convert "33." to 33
             cataloguenumber = int(float(cataloguenumber))
         except ValueError:
             pass  # ok, doesn't have to be an integer
     cdict[accnum] = (exhibition, cataloguenumber)
Exemple #16
0
def update_normal_location(ol, idnum):
    """
    Set the Location text of a normal-location element to the new value.

    The new value is --location when given, otherwise the ``newlocs``
    entry for the normalized id. The existing text is compared after
    stripping and upper-casing it.

    :param ol: the ObjectLocation element
    :param idnum: the ObjectIdentity/Number text (we've tested that idnum
                  is in newlocs)
    :return: True if the object is updated, False otherwise
    """
    location = ol.find('./Location')
    current = location.text
    text = None if current is None else current.strip().upper()

    nidnum = nd.normalize_id(idnum)
    newtext = _args.location or newlocs[nidnum]
    if text == newtext:
        trace(2, '{}: Normal location unchanged: {}', idnum, text)
        return False
    trace(2, '{}: Updated normal {} -> {}', idnum, text, newtext)
    location.text = newtext
    return True
Exemple #17
0
def read_include_dict(includes_file, include_column, include_skip, verbos=1,
                      logfile=sys.stdout, allow_blanks=False):
    """
    Read the optional CSV file from the --include argument. Build a dict
    of normalized accession IDs (upper-cased before normalizing) for use
    by cfgutil.select. The value of the dict is the row from the CSV file.

    :param includes_file: path of the CSV file, or a false value if
                          --include was not specified
    :param include_column: index of the column holding the accession ID
    :param include_skip: number of leading rows to skip
    :param verbos: verbosity; >= 1 reports skipped rows and duplicate ids
    :param logfile: file object that receives the messages
    :param allow_blanks: if True, rows with a blank accession ID are
                         skipped instead of raising ValueError
    :return: a dict or None if --include was not specified
    :raises ValueError: if the file name does not end in .csv, or a blank
                        accession ID is found and allow_blanks is False
    """

    if not includes_file:
        return None
    # os.path.splitext keeps the leading dot on the extension, so the
    # comparison must be against '.csv'. Comparing with 'csv' rejected
    # every file.
    if os.path.splitext(includes_file)[1].lower() != '.csv':
        raise ValueError('mapfile must be a CSV file.')
    includedict: dict = {}
    with codecs.open(includes_file, 'r', 'utf-8-sig') as incfile:
        includereader = csv.reader(incfile)
        for _ in range(include_skip):  # default in xml2csv = 0
            skipped = next(includereader)  # skip header
            if verbos >= 1:
                print(f'Skipping row in "include" file: {skipped}',
                      file=logfile)
        for row in includereader:
            if not row:
                continue
            # cfgutil.select needs uppercase
            idnum = row[include_column].upper()
            if not idnum:
                if allow_blanks:
                    continue  # skip blank accession numbers
                raise ValueError('Blank accession number in include file;'
                                 ' --allow_blank not selected.')
            idnumlist: list[str] = [normalize_id(i)
                                    for i in expand_idnum(idnum)]
            if verbos >= 1:
                # Check against earlier rows only, before this row's ids
                # are added.
                for num in idnumlist:
                    if num in includedict:
                        print(f'Warning: Duplicate id number in include '
                              f'file, {num}, ignored.', file=logfile)
            # A later row replaces the entry stored by an earlier one.
            for num in idnumlist:
                includedict[num] = row
    return includedict
Exemple #18
0
def main():
    """
    Copy the selected top-level records of the input XML to the output.

    Without --directory the records are written to ``outfile`` wrapped in
    an XML declaration and an Interchange element. With --directory each
    selected record is written to its own '<idnum>.xml' file inside the
    directory named by ``_args.outfile``. Nested records (same tag inside
    a record) are not treated separately. The globals ``objcount`` and
    ``selcount`` count the records seen and selected.
    """
    global objcount, selcount
    single_file = not _args.directory
    if single_file:
        declaration = f'<?xml version="1.0" encoding="{_args.encoding}"?>\n'
        outfile.write(bytes(declaration, encoding=_args.encoding))
        outfile.write(b'<Interchange>\n')
    depth = 0  # nesting level of record elements currently open
    for event, oldobject in ET.iterparse(infile, events=('start', 'end')):
        if event == 'start':
            if oldobject.tag == config.record_tag:
                depth += 1
            continue
        # It's an "end" event.
        if oldobject.tag != config.record_tag:
            continue
        depth -= 1
        if depth:
            continue  # It's not a top level Object.
        idelem = oldobject.find(config.record_id_xpath)
        idnum = None if idelem is None else idelem.text
        if _args.normalize:
            idnum = normalize_id(idnum)
        selected = config.select(oldobject, includes, _args.exclude)
        objcount += 1
        if selected:
            selcount += 1
            outstring = ET.tostring(oldobject, encoding=_args.encoding)
            if _args.directory:
                objfilename = os.path.join(_args.outfile, idnum + '.xml')
                with open(objfilename, 'wb') as objfile:
                    objfile.write(outstring)
            else:
                outfile.write(outstring)
        oldobject.clear()
        if _args.short:
            break
    if single_file:
        outfile.write(b'</Interchange>')
Exemple #19
0
def one_document(document, parent, config: Config):
    """
    Extract the text for one column document from a record element.

    :param document: the column document; supplies the command, the
                     xpath and any command-specific statements
    :param parent: the element the xpath is evaluated against
    :param config: the Config instance (not used in this function)
    :return: a tuple (text, command). text is None when the xpath is
             missing or matches nothing.
    """
    command = document[Stmt.CMD]
    eltstr = document.get(Stmt.XPATH)
    element = parent.find(eltstr) if eltstr else None
    if element is None:
        return None, command
    text = None
    if command == Cmd.ATTRIB:
        text = element.get(document[Stmt.ATTRIBUTE])
    elif command == Cmd.COUNT:
        text = f'{len(list(parent.findall(eltstr)))}'
    elif command == Cmd.KEYWORD:
        # Only use the Keyword child when the element's own text matches.
        wanted = document[Stmt.VALUE]
        if element.text.strip() == wanted:
            text = element.find('Keyword').text.strip()
    elif command == Cmd.MULTIPLE:
        delimiter = document[Stmt.MULTIPLE_DELIMITER]
        pieces = [e.text for e in parent.findall(eltstr)
                  if e.text is not None]
        text = delimiter.join(pieces)
    elif element.text is None:
        text = ''
    else:
        text = element.text.strip()
    if Stmt.NORMALIZE in document:
        text = normalize_id(text, _args.mdacode)
    if Stmt.WIDTH in document:
        width = int(document[Stmt.WIDTH])
        text = text[:width]
    return text, command
Exemple #20
0
def handle_update(idnum, elem):
    """
    Apply the location update for one object and write it out.

    When the normalized id is a key of the global ``newlocs`` dict, the
    object's locations are validated, the helpers selected by --normal /
    --current / --previous are run, and the id is deleted from
    ``newlocs``. The element is written when any helper reported a change
    or when --all is set; the globals ``total_updated`` and
    ``total_written`` are incremented accordingly.

    :param idnum: the object's id as found in the XML
    :param elem: the Object element
    :return: None
    """
    global total_updated, total_written
    updated = False
    nidnum = nd.normalize_id(idnum)
    if nidnum not in newlocs:  # newlocs: returned by loadcsv()
        if _args.warn:
            trace(1, '{}: Not in CSV file', idnum)
    else:
        if not validate_locations(idnum, elem):
            trace(1, 'Failed pre-update validation.')
            sys.exit(1)
        if _args.normal:
            normal = elem.find(
                './ObjectLocation[@elementtype="normal location"]')
            updated = update_normal_location(normal, idnum)
        if _args.current:
            updated |= update_current_location(elem, idnum)
        if _args.previous:
            updated |= update_previous_location(elem, idnum)
        del newlocs[nidnum]
    # NOTE(review): nidnum has just been deleted from newlocs whenever it
    # was present, so this check looks unreachable unless a helper
    # re-inserts it -- confirm whether post-update validation was meant to
    # run for every updated object.
    if nidnum in newlocs and not validate_locations(idnum, elem):
        trace(1, 'Failed post-update validation.')
        sys.exit(1)
    if updated:
        total_updated += 1
    if updated or _args.all:
        outfile.write(ET.tostring(elem, encoding='utf-8'))
        total_written += 1
Exemple #21
0
 def test_01b(self):
     """'JB1a' is expected to normalize to 'JB000001A'."""
     self.assertEqual(normalize_id('JB1a'), 'JB000001A')
Exemple #22
0
def select(cfg: Config, elem, includes=None, exclude=False):
    """
    Decide whether an Object element should be written out.

    The element is first checked against the include/exclude list by its
    normalized id, then against every control document in
    ``cfg.ctrl_docs``; the first failing test rejects the element.

    :param cfg: the Config instance
    :param elem: the Object element
    :param includes: A set or dict of id numbers of objects to be included
                     in the output CSV file. The list must be all uppercase.
    :param exclude: Treat the include list as an exclude list.
    :return: selected is true if the Object element should be written out
    """
    # print('select')
    selected = True
    idelem = elem.find(cfg.record_id_xpath)
    idnum = normalize_id(idelem.text) if idelem is not None else None
    # print(f'{idnum=}')
    if idnum and exclude and includes:
        # Exclude mode: reject ids that are in the list.
        if idnum.upper() in includes:
            return False
    elif includes is not None:
        # Include mode: reject objects with no id or an id not in the list.
        if not idnum or idnum.upper() not in includes:
            # print('select return false')
            return False
    for document in cfg.ctrl_docs:
        command = document[Stmt.CMD]
        if command == Cmd.GLOBAL:
            continue
        eltstr = document.get(Stmt.XPATH)
        if eltstr:
            element = elem.find(eltstr)
        else:
            element = None
        # print(f'{element=}')
        if element is None:
            # A missing element always rejects the object; the message is
            # printed only when the document is marked as required.
            if Stmt.REQUIRED in document:
                print(f'*** Required element {eltstr} is missing from'
                      f' {idnum}. Object excluded.', file=cfg.logfile)
            selected = False
            break
        elif command == Cmd.IFELT:
            continue  # if the element exists
        # Fetch the text to test: an attribute value for the attribute
        # commands, otherwise the element's stripped text.
        if command in (Cmd.ATTRIB, Cmd.IFATTRIB, Cmd.IFATTRIBEQ, Cmd.IFATTRIBNOTEQ):
            attribute = document[Stmt.ATTRIBUTE]
            text = element.get(attribute).strip()
        elif element is None or element.text is None:
            # element cannot be None here (handled above), so this branch
            # is taken only when the element has no text.
            text = ''
        else:
            # noinspection PyUnresolvedReferences
            text = element.text.strip()
        # print(f'{text=}')
        if text:
            # Non-empty text fails an IFNOT test.
            if command == Cmd.IFNOT:
                selected = False
                break
        else:
            if Stmt.REQUIRED in document:
                print(f'*** Required text in {eltstr} is missing from'
                      f' {idnum}. Object excluded.', file=cfg.logfile)
            # Empty text fails every test that needs a value to be present.
            if command in (Cmd.IF, Cmd.IFATTRIB, Cmd.IFCONTAINS,
                           Cmd.IFEQ, Cmd.IFATTRIBEQ):
                selected = False
                break
        # Value comparisons; case-insensitive unless the document is
        # marked case sensitive.
        if command in (Cmd.IFEQ, Cmd.IFNOTEQ, Cmd.IFCONTAINS,
                       Cmd.IFATTRIBEQ, Cmd.IFATTRIBNOTEQ):
            value = document[Stmt.VALUE]
            textvalue = text
            if Stmt.CASESENSITIVE not in document:
                value = value.lower()
                textvalue = textvalue.lower()
            if command == Cmd.IFCONTAINS and value not in textvalue:
                selected = False
                break
            elif (command in (Cmd.IFEQ, Cmd.IFATTRIBEQ)
                  and value != textvalue):
                selected = False
                break
            elif (command in (Cmd.IFNOTEQ, Cmd.IFATTRIBNOTEQ)
                  and value == textvalue):
                selected = False
                break
            continue
    # print(f'{selected=}')
    return selected
Exemple #23
0
def norm(e):
    """Return normalize_id applied to the first item of *e*."""
    first = e[0]
    return normalize_id(first)
Exemple #24
0
def main(argv):  # can be called either by __main__ or test_xml2csv
    """
    Convert the selected records of an XML file to rows of a CSV file.

    Each top-level record that passes config.select() yields one row: the
    normalized id (unless skip_number is configured) followed by one cell
    per column document. Rows are collected, sorted, de-normalized where
    flagged, and then written.

    :param argv: the argument list passed to getargs()
    :return: a tuple (records seen, rows written, column values not found)
    """
    global _args, _logfile
    _args = getargs(argv)
    infilename = _args.infile
    outfilename = _args.outfile
    cfgfilename = _args.cfgfile
    if _args.logfile:
        _logfile = open(_args.logfile, 'w')
    else:
        _logfile = sys.stdout
    infile = openfile(infilename)
    nlines = notfound = nwritten = 0
    Config.reset_config()  # needed by test_xml2csv
    if cfgfilename:
        cfgfile = open(cfgfilename)
    else:
        cfgfile = None
        trace(
            1,
            'Warning: Config file omitted. Only accession numbers will be output.'
        )
    config = Config(cfgfile, dump=_args.verbose >= 2, logfile=_logfile)
    outcsv, outfile = opencsvwriter(outfilename, config.delimiter)
    outlist = []  # rows are buffered here so they can be sorted before writing
    titles = yaml_fieldnames(config)
    trace(1, 'Columns: {}', ', '.join(titles))
    if not _args.heading:
        trace(1, 'Heading row not written.')
    if _args.heading:
        outcsv.writerow(titles)
    objectlevel = 0  # nesting depth of record elements currently open
    # Build the include list either from --object or from the include file.
    if _args.object:
        expanded = [normalize_id(obj) for obj in expand_idnum(_args.object)]
        includeset = set(expanded)  # JB001-002 -> JB001, JB002
        includes = dict.fromkeys(includeset)
    else:
        includes = read_include_dict(_args.include,
                                     _args.include_column,
                                     _args.include_skip,
                                     _args.verbose,
                                     logfile=_logfile,
                                     allow_blanks=_args.allow_blanks)
    for event, elem in ET.iterparse(infile, events=('start', 'end')):
        # print(event)
        if event == 'start':
            # print(elem.tag)
            if elem.tag == config.record_tag:
                objectlevel += 1
            continue
        # It's an "end" event.
        if elem.tag != config.record_tag:  # default: Object
            continue
        objectlevel -= 1
        if objectlevel:
            continue  # It's not a top level Object.
        data = []
        idelem = elem.find(config.record_id_xpath)
        idnum = idelem.text if idelem is not None else ''
        trace(3, 'idnum: {}', idnum)
        nlines += 1

        writerow = config.select(elem, includes, exclude=_args.exclude)
        # print(f'{writerow=}')
        if not writerow:
            # Note: rejected elements are not cleared (elem.clear() is only
            # reached for selected records).
            continue
        norm_idnum = normalize_id(idnum, _args.mdacode, verbose=_args.verbose)
        # We have selected the id but only write the row if there is something
        # to display. There will always be at least the ID number in the first
        # column unless skip_number was specified in the config.
        if config.skip_number:
            writerow = False
        else:
            # Insert the ID number as the first column.
            data.append(norm_idnum)

        for document in config.col_docs:
            text, command = one_document(document, elem, config)
            # print(f'{command=}')
            if text is None:
                notfound += 1
                trace(2, '{}: cmd: {}, "{}" is not found in XML.', idnum,
                      command, document[Stmt.TITLE])
                text = ''
            if text:
                writerow = True
            data.append(text)

        if writerow:
            nwritten += 1
            outlist.append(data)
            trace(3, '{} written.', idnum)
        elem.clear()
        # Remove matched ids so that whatever remains in includes at the end
        # can be reported as missing from the XML.
        if includes and not _args.exclude:
            includes.pop(norm_idnum)
        if _args.short:
            break
    if config.sort_numeric:
        outlist.sort(key=lambda x: int(x[0]))
    else:
        outlist.sort()
    # Create a list of flags indicating whether the value needs to be
    # de-normalized.
    norm = []
    if not config.skip_number:
        norm.append(True)  # for the Serial number
    for doc in config.col_docs:
        if doc[Stmt.CMD] in Cmd.get_control_cmds():
            continue
        norm.append(Stmt.NORMALIZE in doc)
    lennorm = len(norm)
    for row in outlist:
        for n, cell in enumerate(row[:lennorm]):
            if norm[n]:
                row[n] = denormalize_id(cell, _args.mdacode)
        outcsv.writerow(row)
    infile.close()
    if cfgfile:
        cfgfile.close()
    outfile.close()
    # NOTE(review): _logfile is not closed here when it was opened from
    # --logfile -- confirm whether a caller closes it.
    if includes and len(includes):
        trace(1, '{} items in include list not in XML.', len(includes))
        if _args.verbose > 1:
            print('In include list but not xml:', file=_logfile)
            for accnum in includes:
                print(accnum, file=_logfile)
    if not _args.bom:
        trace(1, 'BOM not written.')
    return nlines, nwritten, notfound
Exemple #25
0
    # Script start-up code; the enclosing header is outside this view.
    # Counters updated by main(): records seen and records selected.
    objcount = selcount = 0
    object_number = ''
    _args = getargs()
    infile = open(_args.infile)
    if _args.directory:
        # In directory mode "outfile" is the directory path, not a file
        # object. A non-empty directory is refused unless --force is given.
        outfile = _args.outfile
        ld = os.listdir(outfile)
        if ld and not _args.force:
            print(f'Directory {outfile} is not empty. Exiting.')
            sys.exit()
    else:
        outfile = open(_args.outfile, 'wb')
    if _args.cfgfile:
        cfgfile = open(_args.cfgfile)
    else:
        cfgfile = None
    config = Config(cfgfile, dump=_args.verbose >= 2)
    # Build the include list either from --object or from the include file.
    if _args.object:
        expanded = [normalize_id(obj) for obj in expand_idnum(_args.object)]
        includeset = set(expanded)  # JB001-002 -> JB001, JB002
        includes = dict.fromkeys(includeset)
    else:
        includes = read_include_dict(_args.include, _args.include_column,
                                     _args.include_skip, _args.verbose)
    main()
    basename = os.path.basename(sys.argv[0])
    print(
        f'{selcount} object{"" if selcount == 1 else "s"} selected from {objcount}.'
    )
    # Report the script name without its extension.
    print(f'End {basename.split(".")[0]}')
# Top-level script fragment: map catalogue numbers to accession numbers
# and start matching image file names to catalogue numbers. The fragment
# ends before the image loop does anything with the matched number.
objects = list_objects(XMLPATH)
imgs = os.listdir(IMGSPATH)

includecsvfile = open(INCLUDECSVPATH, 'w')
csvfile = codecs.open(CSVPATH, 'r', 'utf-8-sig')
reader = csv.DictReader(csvfile)
# Normalized ids of all objects returned by list_objects.
objset = set(n.normalized for n in objects)

# catalogue number -> normalized accession number
cat2accn = {}
for row in reader:
    cat = row['Cat'].strip()  # catalogue number
    if not cat:
        continue
    ac = row['Accn. No.']
    try:
        accnum = normalize_id(ac.strip())
    except ValueError:
        print(f'bad accnum: skipping {cat}: "{ac}"')
        continue
    # Only keep accession numbers that exist among the listed objects.
    if accnum not in objset:
        print(f'skipping {cat}: "{accnum}"')
        continue
    cat2accn[cat] = accnum

for filename in imgs:
    # The catalogue number is the leading digits plus an optional single
    # word character, followed by a dot.
    m = re.match(r'(\d+\w?)\.', filename)
    if m:
        cat = m.group(1)
    else:
        print(f'skipping image: {filename}')
        continue
Exemple #27
0
 def test_01a(self):
     """'JB1' is expected to normalize to 'JB000001'."""
     self.assertEqual(normalize_id('JB1'), 'JB000001')
Exemple #28
0
 def test_02(self):
     """'LDHRM.2018.1' is expected to normalize to 'LDHRM.2018.000001'."""
     self.assertEqual(normalize_id('LDHRM.2018.1'), 'LDHRM.2018.000001')
Exemple #29
0
 def test_04(self):
     """normalize_id('JB9999999') is expected to raise AssertionError."""
     self.assertRaises(AssertionError, normalize_id, 'JB9999999')
Exemple #30
0
 def test_03(self):
     """Both trailing numeric fields of 'LDHRM.2018.1.2' are zero-padded."""
     expected = 'LDHRM.2018.000001.000002'
     self.assertEqual(normalize_id('LDHRM.2018.1.2'), expected)