Ejemplo n.º 1
0
def main():
    """
    List the candidate IDs that have no image in the image folder(s).

    The IDs come from the objects returned by ``list_objects`` for
    ``_args.modesfile`` (the ``normalized`` field of each entry). When
    ``_args.candidatefile`` is set the set is reduced by
    ``build_candidate_set``. With ``_args.invert`` the sense is reversed and
    the IDs that *are* in the image folder(s) are listed instead. IDs are
    printed to ``outfile`` in sorted order, followed by a summary line on
    stdout.
    """
    # All the IDs in the XML file, in normalized form.
    all_ids = {obj.normalized for obj in list_objects(_args.modesfile)}
    candidates = (build_candidate_set(all_ids) if _args.candidatefile
                  else all_ids)
    images = {}
    build_img_dict(images, _args.imgdir)
    listed = 0
    for norm_id in sorted(candidates):
        display_id = denormalize_id(norm_id)
        has_image = norm_id in images
        if _args.invert and not has_image:
            trace(2, 'Not in image folder: {}', display_id)
            continue
        if not _args.invert and has_image:
            trace(2, 'In image folder: {}', display_id)
            if reportfile:
                print(display_id, file=reportfile)
            continue
        # Normal mode: the object has no image yet, so it is "needed".
        # Inverted mode: the object already has an image.
        print(display_id, file=outfile)
        listed += 1
    qualifier = 'not ' if _args.invert else ''
    print(f'Total {qualifier}needed: {listed} of {len(candidates)} '
          f'candidates.')
Ejemplo n.º 2
0
def main():
    """
    Copy Object elements from ``infile`` to ``outfile``, applying updates.

    Each ``Object`` element whose normalized ID is a key in ``newvals`` is
    passed to ``one_element`` and its entry is then removed from ``newvals``.
    An element is written if ``one_element`` reported an update or if
    ``_args.all`` is set. Whatever is left in ``newvals`` afterwards was in
    the CSV file but not in the XML file and is reported at trace level 1.

    Increments the global ``nwritten`` once per element written.
    """
    global nwritten
    outfile.write(b'<?xml version="1.0" encoding="UTF-8"?><Interchange>\n')
    for _event, elem in ET.iterparse(infile):
        if elem.tag != 'Object':
            continue
        idelem = elem.find(cfg.record_id_xpath)
        idnum = idelem.text if idelem is not None else None
        # NOTE(review): normalize_id() receives None when the ID element is
        # missing; confirm that it tolerates that.
        nidnum = normalize_id(idnum)
        trace(3, 'idnum: {}', idnum)
        if nidnum and nidnum in newvals:
            updated = one_element(elem, nidnum)
            # Remove the entry with exactly the key that was just found.
            # Deleting with a re-cased key would raise KeyError (or drop a
            # different entry) whenever the two spellings differ.
            del newvals[nidnum]
        else:
            updated = False
            if _args.missing:
                trace(2, 'Not in CSV file: "{}"', idnum)
        if updated or _args.all:
            outfile.write(ET.tostring(elem, encoding='utf-8'))
            nwritten += 1
        if _args.short:
            # Short mode: stop after the first Object.
            break
    outfile.write(b'</Interchange>')
    for nidnum in newvals:
        trace(1, 'In CSV but not XML: "{}"', denormalize_id(nidnum))
Ejemplo n.º 3
0
def main():
    """
    Write a per-box listing of objects to the CSV ``writer``.

    Every ``Object`` element in ``infile`` is handed to ``one_object`` and
    then cleared. Afterwards, for each key of ``boxdict`` in sorted order,
    two blank rows, a ``Box ...`` heading and a separator row are written,
    followed by one row per ID in that box holding the denormalized ID and
    its entry in ``titledict``.
    """
    for _event, element in ET.iterparse(infile):
        if element.tag != 'Object':
            continue
        one_object(element)
        element.clear()  # the element's content is no longer needed
    for box in sorted(boxdict):
        for _ in range(2):
            writer.writerow([''])
        writer.writerow([f'Box {unpad_loc(box)}'])
        writer.writerow(['--------------'])
        for norm_id in sorted(boxdict[box]):
            row = [denormalize_id(norm_id), titledict[norm_id]]
            writer.writerow(row)
Ejemplo n.º 4
0
    if not cat:
        continue
    ac = row['Accn. No.']
    try:
        accnum = normalize_id(ac.strip())
    except ValueError:
        print(f'bad accnum: skipping {cat}: "{ac}"')
        continue
    if accnum not in objset:
        print(f'skipping {cat}: "{accnum}"')
        continue
    cat2accn[cat] = accnum

# Copy each image whose leading catalogue number maps to an accession number
# into ACCESSIONEDPATH, renamed to the denormalized accession number, and
# record the accession number in includecsvfile.
for filename in imgs:
    # The catalogue number is the leading digits (plus at most one trailing
    # word character) in front of the first dot of the file name.
    m = re.match(r'(\d+\w?)\.', filename)
    if m is None:
        print(f'skipping image: {filename}')
        continue
    cat = m.group(1)
    try:
        accnum = cat2accn[cat]
    except KeyError:
        # No accession number was recorded for this catalogue number.
        continue
    newfilename = denormalize_id(accnum) + '.jpg'
    source = os.path.join(IMGSPATH, filename)
    dest = os.path.join(ACCESSIONEDPATH, newfilename)
    shutil.copy(source, dest)
    print(f'{dest=},  {source=}')
    print(accnum, file=includecsvfile)
Ejemplo n.º 5
0
def one_object(objelt, idnum, exhibition: ExhibitionTuple, catalog_num=''):
    """
    Insert, update or delete one Exhibition element under an Object element.

    Every existing Exhibition child of ``objelt`` is classified by
    ``one_exhibition``: empty templates are dropped, the element matching the
    name/place/begin-date key is updated in place (or dropped when
    ``_args.delete`` is set) and all others are kept unchanged. If nothing
    matched, a new Exhibition element is built from ``exhibition``. The
    surviving elements are then re-inserted at the position of the first
    original Exhibition element so that the latest begin date comes first.

    Sets the global ``found_old_key`` to True when an existing element
    matched the key.

    :param objelt: the Object element; it is modified in place
    :param idnum: the ObjectIdentity/Number text (for trace)
    :param exhibition: the Exhibition tuple corresponding to exhibition_list.py
    :param catalog_num: for the CatalogueNumber element
    :return: None
    """
    global found_old_key

    def new_exhib():
        """
        Build a new Exhibition element from ``exhibition``.

        :return: a tuple containing the BeginDate, the new Exhibition element
        """
        newelt = ET.Element('Exhibition')
        subelt = ET.SubElement(newelt, 'ExhibitionName')
        subelt.text = exhibition.ExhibitionName
        if catalog_num is not None:
            subelt = ET.SubElement(newelt, 'CatalogueNumber')
            subelt.text = str(catalog_num)
        subelt = ET.SubElement(newelt, 'Place')
        subelt.text = exhibition.Place
        dateelt = ET.SubElement(newelt, 'Date')
        subelt = ET.SubElement(dateelt, 'DateBegin')
        subelt.text = modesdate(exhibition.DateBegin)
        subelt = ET.SubElement(dateelt, 'DateEnd')
        subelt.text = modesdate(exhibition.DateEnd)
        return exhibition.DateBegin, newelt

    def one_exhibition(exhib_elt):
        """
        Handle an existing exhibition
        :param exhib_elt: an Exhibition element (under Object)
        :return: 0 if it is an empty template
                 1 if the input element's ExhibitionName is different from
                    the one we are inserting
                 2 (also update the values) if the ExhibitionName values match

        It is possible to have duplicate exhibition names but exhibitions are
        guaranteed to have unique name+place+begindate. So if we are replacing
        the name, make sure that we are updating the correct exhibition by
        checking the complete keys.
        """

        exhibname = exhib_elt.find('ExhibitionName')
        if exhibname is None:
            return 0  # This is an empty Exhibition template
        # Updating the exhibition name is a special case since it's used as
        # a key.  The old name is in the XML file and is to be replaced. We
        # could skip this step as the full key compare will work anyhow, but
        # this gets us out of here quickly in most cases.
        if exhibname.text not in (_oldname, exhibition.ExhibitionName):
            return 1  # not a match so just keep this element as is
        # The exhibition names match but if they are duplicated elsewhere in
        # the list, we must look deeper.
        # Build "name:place:date", preferring the --old_xxxx override for
        # each part. The separators must be added whether or not an override
        # is used; a conditional expression binds looser than "+", so
        # "a if a else b + ':'" would drop the ':' whenever the override
        # is set and the key could never match.
        exhibkey = ':'.join((
            _oldname or exhibition.ExhibitionName,
            _oldplace or exhibition.Place,
            _olddate or exhibition.DateBegin.isoformat()[:10],
        ))
        xmlkey = exhibname.text
        xmlplace = ''
        xmldate = ''
        subelts = list(exhib_elt)
        for subelt in subelts:
            tag = subelt.tag
            if tag == "Place":
                # An empty element has text None; treat it as ''.
                xmlplace = subelt.text or ''
            elif tag == "Date":
                for dateelt in subelt:
                    if dateelt.tag == 'DateBegin':
                        xmldate = dateelt.text or ''
                        break
        xmlkey += ':' + xmlplace + ':' + xmldate
        # And finally, confirm that it's really the one we want to update.
        trace(3, '{}: exhibkey={}\nxmlkey={}', idnum, exhibkey, xmlkey)
        if exhibkey != xmlkey:
            return 1
        # The keys match so update the values
        for subelt in subelts:
            tag = subelt.tag
            if tag == "ExhibitionName":
                subelt.text = exhibition.ExhibitionName
            elif tag == "CatalogueNumber":
                subelt.text = str(catalog_num)
            elif tag == "Place":
                subelt.text = exhibition.Place
            elif tag == "Date":
                for dateelt in subelt:
                    if dateelt.tag == 'DateBegin':
                        dateelt.text = modesdate(exhibition.DateBegin)
                    elif dateelt.tag == 'DateEnd':
                        dateelt.text = modesdate(exhibition.DateEnd)
            else:
                # NOTE(review): the arguments look out of order for this
                # message (the "ID" placeholder receives subelt.text);
                # confirm the intended order before changing it.
                trace(
                    1, 'ID {}: Unknown subelt in {} Exhibition element: {},'
                    ' element not updated.', subelt.text, display_id, tag)
        return 2

    # end one_exhibition

    display_id = denormalize_id(idnum)
    trace(2, 'one_element: {} {}', display_id, exhibition)
    elts = list(objelt)  # the children of Object
    exhibs_to_insert = list()  # all current plus any new
    exhibs_to_remove = list()  # empty Exhibition template or to be deleted
    firstexix = None  # index of the first Exhibition element
    need_new = True
    for n, elt in enumerate(elts):
        if elt.tag != "Exhibition":
            continue
        if firstexix is None:
            firstexix = n
        status = one_exhibition(elt)
        if status == 0:  # This is an empty Exhibition template
            exhibs_to_remove.append(elt)
            continue
        begindate, _ = datefrommodes(elt.find('./Date/DateBegin').text)
        if status == 1:  # Not this exhibition
            exhibs_to_insert.append((begindate, elt))  # will sort on date
            continue
        # status == 2
        # Sanity check that we got at least one hit on the --old_xxxx
        # parameter
        found_old_key = True
        need_new = False
        if _args.delete:
            exhibs_to_remove.append(elt)
        else:
            exhibs_to_insert.append((begindate, elt))  # will sort on date
    if firstexix is None:  # no Exhibition elements were found
        etype = objelt.get('elementtype')
        trace(1, 'Object number {}: No Exhibition element. etype: {}', idnum,
              etype)
        for n, elt in enumerate(elts):
            if elt.tag == "Acquisition":
                firstexix = n + 1  # insert the new elt after <Acquisition>
                break
        else:
            # No <Acquisition> either: append at the end instead of failing
            # with a TypeError on insert(None, ...). Nothing is removed in
            # this case so len(elts) is still the child count.
            firstexix = len(elts)
    # Remove all the Exhibition elements and re-insert the ones we're
    # keeping in date order.
    for _edate, exhib in exhibs_to_insert:
        objelt.remove(exhib)
    for exhib in exhibs_to_remove:
        objelt.remove(exhib)
    # NOTE(review): need_new is still True when _args.delete is set but no
    # element matched, so a new element is added in that case; confirm that
    # this is intended.
    if need_new:
        newexhibit = new_exhib()  # returns a tuple of (date, element)
        exhibs_to_insert.append(newexhibit)
    # Insert the Exhibition elements with the most recent one first: they
    # are taken in ascending date order and each is inserted at the same
    # index, pushing the earlier ones down. Sort on the date only; comparing
    # whole tuples would fall through to comparing Element objects when two
    # dates are equal and raise TypeError.
    for _edate, exhib in sorted(exhibs_to_insert, key=lambda pair: pair[0]):
        objelt.insert(firstexix, exhib)
Ejemplo n.º 6
0
 def test_04(self):
     """An ID with no zero padding to strip is returned unchanged."""
     expected = 'JB999999999'
     self.assertEqual(denormalize_id('JB999999999'), expected)
Ejemplo n.º 7
0
 def test_03(self):
     """Zero-padded numeric fields of a dotted ID are unpadded."""
     expected = 'LDHRM.2018.1.2'
     self.assertEqual(denormalize_id('LDHRM.2018.000001.000002'), expected)
Ejemplo n.º 8
0
 def test_01(self):
     """A padded JB number is reduced to three digits."""
     expected = 'JB001'
     self.assertEqual(denormalize_id('JB000001'), expected)
Ejemplo n.º 9
0
def main(argv):  # can be called either by __main__ or test_xml2csv
    """
    Extract fields from the records of an XML file and write them as CSV.

    Parses the arguments, opens the input, log, config and output files,
    then walks the XML with iterparse. For every top-level record element
    selected by ``config.select`` a row is built with one cell per entry in
    ``config.col_docs``, preceded by the normalized ID unless
    ``config.skip_number`` is set. Rows are collected in memory, sorted,
    de-normalized where flagged and only then written.

    Sets the globals ``_args`` and ``_logfile``. The log file, when one is
    opened here, is not closed by this function.

    :param argv: the argument list handed to getargs()
    :return: a tuple (nlines, nwritten, notfound): the number of top-level
             records seen, the number of rows written and the number of
             cells for which one_document() returned no text
    """
    global _args, _logfile
    _args = getargs(argv)
    infilename = _args.infile
    outfilename = _args.outfile
    cfgfilename = _args.cfgfile
    if _args.logfile:
        _logfile = open(_args.logfile, 'w')
    else:
        _logfile = sys.stdout
    infile = openfile(infilename)
    nlines = notfound = nwritten = 0
    Config.reset_config()  # needed by test_xml2csv
    if cfgfilename:
        cfgfile = open(cfgfilename)
    else:
        cfgfile = None
        trace(
            1,
            'Warning: Config file omitted. Only accession numbers will be output.'
        )
    config = Config(cfgfile, dump=_args.verbose >= 2, logfile=_logfile)
    outcsv, outfile = opencsvwriter(outfilename, config.delimiter)
    outlist = []  # rows are buffered here so they can be sorted before output
    titles = yaml_fieldnames(config)
    trace(1, 'Columns: {}', ', '.join(titles))
    if not _args.heading:
        trace(1, 'Heading row not written.')
    if _args.heading:
        outcsv.writerow(titles)
    objectlevel = 0  # nesting depth of record elements currently open
    if _args.object:
        # IDs given on the command line take the place of an include file.
        expanded = [normalize_id(obj) for obj in expand_idnum(_args.object)]
        includeset = set(expanded)  # JB001-002 -> JB001, JB002
        includes = dict.fromkeys(includeset)
    else:
        includes = read_include_dict(_args.include,
                                     _args.include_column,
                                     _args.include_skip,
                                     _args.verbose,
                                     logfile=_logfile,
                                     allow_blanks=_args.allow_blanks)
    for event, elem in ET.iterparse(infile, events=('start', 'end')):
        # print(event)
        if event == 'start':
            # print(elem.tag)
            # Only count the depth here; all processing happens on "end".
            if elem.tag == config.record_tag:
                objectlevel += 1
            continue
        # It's an "end" event.
        if elem.tag != config.record_tag:  # default: Object
            continue
        objectlevel -= 1
        if objectlevel:
            continue  # It's not a top level Object.
        data = []
        idelem = elem.find(config.record_id_xpath)
        idnum = idelem.text if idelem is not None else ''
        trace(3, 'idnum: {}', idnum)
        nlines += 1

        writerow = config.select(elem, includes, exclude=_args.exclude)
        # print(f'{writerow=}')
        if not writerow:
            continue
        norm_idnum = normalize_id(idnum, _args.mdacode, verbose=_args.verbose)
        # We have selected the id but only write the row if there is something
        # to display. There will always be at least the ID number in the first
        # column unless skip_number was specified in the config.
        if config.skip_number:
            # Without the ID column the row is written only if at least one
            # of the cells below turns out to be non-empty.
            writerow = False
        else:
            # Insert the ID number as the first column.
            data.append(norm_idnum)

        for document in config.col_docs:
            text, command = one_document(document, elem, config)
            # print(f'{command=}')
            if text is None:
                # Count the miss and emit an empty cell in its place.
                notfound += 1
                trace(2, '{}: cmd: {}, "{}" is not found in XML.', idnum,
                      command, document[Stmt.TITLE])
                text = ''
            if text:
                writerow = True
            data.append(text)

        if writerow:
            nwritten += 1
            outlist.append(data)
            trace(3, '{} written.', idnum)
        elem.clear()
        if includes and not _args.exclude:
            # Tick the ID off so that whatever remains in includes at the
            # end is known to be missing from the XML.
            # NOTE(review): pop() has no default, so this raises KeyError if
            # norm_idnum is not a key of includes; confirm that select()
            # guarantees it is.
            includes.pop(norm_idnum)
        if _args.short:
            # Short mode: stop after the first selected record.
            break
    if config.sort_numeric:
        # Sort on the integer value of the first column.
        outlist.sort(key=lambda x: int(x[0]))
    else:
        outlist.sort()
    # Create a list of flags indicating whether the value needs to be
    # de-normalized.
    norm = []
    if not config.skip_number:
        norm.append(True)  # for the Serial number
    for doc in config.col_docs:
        # NOTE(review): control commands get no flag here, yet the row
        # building loop above appends a cell for every entry of col_docs;
        # presumably col_docs holds no control commands - confirm.
        if doc[Stmt.CMD] in Cmd.get_control_cmds():
            continue
        norm.append(Stmt.NORMALIZE in doc)
    lennorm = len(norm)
    for row in outlist:
        for n, cell in enumerate(row[:lennorm]):
            if norm[n]:
                row[n] = denormalize_id(cell, _args.mdacode)
        outcsv.writerow(row)
    infile.close()
    if cfgfile:
        cfgfile.close()
    outfile.close()
    if includes and len(includes):
        # These IDs were asked for but never seen in the XML.
        trace(1, '{} items in include list not in XML.', len(includes))
        if _args.verbose > 1:
            print('In include list but not xml:', file=_logfile)
            for accnum in includes:
                print(accnum, file=_logfile)
    if not _args.bom:
        trace(1, 'BOM not written.')
    return nlines, nwritten, notfound