Ejemplo n.º 1
0
def main():
    """Write a CSV report of the self-nested tags found in a directory of EADs.

    ``find_nested_tags`` is handed to ``EADDir.characterize_dir``; every item
    that call returns is treated as a sequence of CSV rows and written to
    ``eads_with_nested_tags.csv`` beneath a single header row.
    """
    ead_directory = "path/to/your/ead/files"
    header = ["ead file name", "xpath to self-nested tag", "tag type"]

    per_file_rows = EADDir(ead_directory).characterize_dir(find_nested_tags)

    # NOTE(review): binary mode is presumably for the Python 2 csv module --
    # confirm before running under Python 3.
    with open("eads_with_nested_tags.csv", mode="wb") as report:
        csv_out = csv.writer(report)
        csv_out.writerow(header)
        for rows in per_file_rows:
            csv_out.writerows(rows)
Ejemplo n.º 2
0
def main():
    """Collect self-nested tags from a directory of EADs into a CSV file.

    The rows come from ``EADDir.characterize_dir(find_nested_tags)``; each
    returned item is written as a batch of rows to
    ``eads_with_nested_tags.csv``, after the column headings.
    """
    column_names = ["ead file name", "xpath to self-nested tag", "tag type"]
    source_dir = EADDir("path/to/your/ead/files")
    row_batches = source_dir.characterize_dir(find_nested_tags)

    # NOTE(review): "wb" looks like the Python 2 csv convention -- confirm
    # the target interpreter.
    with open("eads_with_nested_tags.csv", mode="wb") as out_file:
        sheet = csv.writer(out_file)
        sheet.writerow(column_names)
        for batch in row_batches:
            sheet.writerows(batch)
Ejemplo n.º 3
0
def write_ead_summary(summary_by_ead, digital_only):
    """Write one CSV row of removable-media counts per EAD file.

    Every counter in ``summary_by_ead`` is normalized in place: each expected
    ``(category, media type)`` key missing from it is added with a count of
    0, every key is renamed to the label ``u"<category>: <media type>"``, and
    a ``u"collection name"`` entry holding the EAD's title is added. The
    resulting rows are written to ``removable_media_summary_by_ead.csv`` in
    order of EAD file name.

    :param summary_by_ead: dict mapping an EAD file name to a Counter (or
                           dict) keyed by ``(category, media type)`` tuples.
                           The counters are modified in place.
    :param digital_only: when truthy only the digital media columns are
                         written; otherwise digital, video and audio columns.
    :return: None
    """
    digital_keys = [("digital", "CDs"), ("digital", "DVDs"),
                    ("digital", "USB thumb drives"),
                    ("digital", 'floppy disks: 3.5"'),
                    ("digital", 'floppy disks: 5.25"'),
                    ("digital", "floppy disks: size not known"),
                    ("digital", "magneto-optical disks"),
                    ("digital", "zip disks")]
    video_keys = [('video', '(type not listed)'),
                  ('video', '8mm videocassettes'), ('video', 'Betacam tapes'),
                  ('video', 'Sony DVCAM videocassettes'),
                  ('video', 'U-matic tapes'), ('video', 'VHS tapes'),
                  ('video', 'film reels'), ('video', 'mini-DV tapes'),
                  ('video', 'open reel videotapes'),
                  ('video', '2-inch videotapes'),
                  ('video', '1-inch videotapes'),
                  ("video", "videocassette (unknown type)"),
                  ("video", "(unknown video type)"), ('video', 'oversize')]
    audio_keys = [('audio', 'audiocassettes'), ('audio', 'microcassettes'),
                  ('audio', 'phonograph records'),
                  ('audio', 'reel-to-reel tapes'),
                  ("audio", "wire recordings")]

    if digital_only:
        keys = digital_keys
    else:
        keys = digital_keys + video_keys + audio_keys

    # Fill in missing counts and re-key every counter by its column label.
    ead_dir = EADDir()
    for ead_file, counter in summary_by_ead.items():
        for key in keys:
            if key not in counter:
                counter[key] = 0

        # Iterate over a snapshot of the keys: the loop body inserts and
        # deletes entries, which must not happen while walking a live view.
        for key in list(counter.keys()):
            label = u"{}: {}".format(unicode(key[0]), unicode(key[1]))
            counter[label] = counter[key]
            del counter[key]

        counter[u"collection name"] = EAD(
            os.path.join(ead_dir.input_dir, ead_file)).title

    headers = [u"collection name"]
    headers.extend(u"{}: {}".format(key[0], key[1]) for key in keys)

    # NOTE(review): only the expected columns are declared as fieldnames. If
    # DictUnicodeWriter behaves like csv.DictWriter, a counter holding any
    # other key would be rejected -- confirm against its implementation.
    with open("removable_media_summary_by_ead.csv", mode="wb") as f:
        writer = DictUnicodeWriter(f, fieldnames=headers)
        writer.writeheader()
        # File names are unique dict keys, so sorting them fixes row order.
        for ead_file in sorted(summary_by_ead):
            writer.writerow(summary_by_ead[ead_file])
Ejemplo n.º 4
0
def grab_all_subjects(ead_directory_path):
    """Return the set of subjects collected from every EAD in a directory.

    Each file listed by ``EADDir.ead_files`` is parsed with lxml and passed
    to ``grab_all_subjects_from_etree``; whatever that helper returns is
    merged into a single set.

    :param ead_directory_path: path handed to ``EADDir`` as ``input_dir``.
    :return: a set holding the union of the subjects found in all files.
    """
    ead_dir = EADDir(input_dir=ead_directory_path)
    subjects = set()

    for ead in tqdm(ead_dir.ead_files,
                    desc="extracting all unique subjects from EAD files..."):
        tree = etree.parse(os.path.join(ead_dir.input_dir, ead))
        # Merge in place; rebuilding the set with union() for every file
        # copies all previously collected subjects each time.
        subjects.update(grab_all_subjects_from_etree(tree))

    return subjects
Ejemplo n.º 5
0
def main():
    """Pretty-print how often each suspicious corpname occurs across the EADs.

    The output is a list of ``(value, count)`` pairs ordered by descending
    count, with ties ordered by ascending value.
    """
    ead_dir = EADDir()
    suspicious = []
    for filename in tqdm(ead_dir.ead_files):
        ead_path = os.path.join(ead_dir.input_dir, filename)
        suspicious.extend(get_suspicious_corpnames(EAD(ead_path)))

    tallies = Counter(suspicious)
    # Two stable passes: order by value first, then by count (highest first)
    # so that equal counts keep their value ordering.
    by_value = sorted(tallies.items(), key=lambda pair: pair[0])
    ranked = sorted(by_value, key=lambda pair: pair[1], reverse=True)
    pprint(ranked)
Ejemplo n.º 6
0
def main():
    """Gather removable-media results from the EADs and write three reports.

    The raw results are summarized once by type and once by EAD, and then the
    extent inventory, the type summary and the per-EAD summary are written,
    in that order.
    """
    only_digital = True
    source_dir = EADDir(input_dir="path/to/your/ead/files")

    per_ead_results, extents = get_raw_results(source_dir, only_digital)

    # Each summarizer receives its own deep copy of the raw results
    # (presumably because the summarizers modify their argument -- confirm).
    type_summary = summarize_results_by_type(deepcopy(per_ead_results))
    ead_summary = summarize_results_by_ead(deepcopy(per_ead_results))

    write_removable_media_inventory(extents)
    write_type_summary(type_summary, only_digital)
    write_ead_summary(ead_summary, only_digital)
def main():
    """Export physfacet texts from the EADs as two CSV reports.

    ``physfacet_counts.csv`` lists each distinct physfacet text (the first
    item of every collected row) with its number of occurrences, highest
    count first and ties in ascending text order.
    ``physfacets_with_locations.csv`` lists every collected row, sorted.
    """
    ead_dir = EADDir()
    physfacet_rows = []
    for filename in tqdm(ead_dir.ead_files):
        ead_path = os.path.join(ead_dir.input_dir, filename)
        physfacet_rows.extend(get_physfacet_texts(EAD(ead_path)))

    tally = Counter(row[0] for row in physfacet_rows)

    with open("physfacet_counts.csv", mode="wb") as counts_file:
        counts_out = UnicodeWriter(counts_file)
        counts_out.writerow(["physfacet", "count"])
        # Stable two-pass sort: alphabetical first, then by count descending.
        alphabetical = sorted(tally.items())
        ranked = sorted(alphabetical, key=lambda pair: pair[1], reverse=True)
        counts_out.writerows(ranked)

    with open("physfacets_with_locations.csv", mode="wb") as locations_file:
        locations_out = UnicodeWriter(locations_file)
        locations_out.writerow(
            ["physfacet text", "collection", "filename", "xpath"])
        locations_out.writerows(sorted(physfacet_rows))
Ejemplo n.º 8
0
def write_new_lc_ids(subjects, input_directory, output_directory):
    """Apply authority data to matching terms in every EAD and write the result.

    :param subjects: iterable of four-item sequences. Items 0 and 1 form the
                     lookup key that is compared with ``create_tuple_key`` of
                     an element; items 2 and 3 are the authoritative text and
                     the authority link applied on a match.
    :param input_directory: directory the EAD files are read from.
    :param output_directory: passed to ``write_ead_to_file`` for each tree.
    """
    ead_dir = EADDir(input_dir=input_directory)

    local_term_map = {}
    for subject in subjects:
        local_term_map[(subject[0], subject[1])] = (subject[2], subject[3])

    for filename in ead_dir.ead_files:
        tree = etree.parse(os.path.join(input_directory, filename))

        # Children of <controlaccess> first, then children of <origination>.
        candidates = tree.xpath("//controlaccess/*")
        candidates.extend(tree.xpath("//origination/*"))

        for element in candidates:
            key = create_tuple_key(element)
            if key not in local_term_map:
                continue

            auth_text, auth_link = local_term_map[key]
            create_authfilenumber_attrib(auth_link, element)
            update_deathdate(auth_text, element)

        # Every file is written out, whether or not anything matched.
        write_ead_to_file(tree, filename, output_directory)
Ejemplo n.º 9
0
def get_all_agents(input_dir):
    """
    Collects the agents found in every EAD file of a directory.

    :param input_dir: filepath to the input directory
    :return: a dictionary with the keys "corpname", "persname" and "famname",
             each mapping to a dictionary merged from the per-file results of
             get_agents_from_ead; when the same entry occurs in several files
             the value from the file processed last is kept.
    """

    agents = {agent_type: {}
              for agent_type in ("corpname", "persname", "famname")}

    ead_dir = EADDir(input_dir=input_dir)

    for ead in tqdm(ead_dir.ead_files, desc="grabbing all agents from eads"):
        tree = etree.parse(os.path.join(ead_dir.input_dir, ead))

        for agent_type, found in get_agents_from_ead(tree).items():
            agents[agent_type].update(found)

    return agents
Ejemplo n.º 10
0
def main():
    """Run ``add_uuids`` over the default EAD directory.

    The directory's own ``input_dir`` is passed as ``output_dir``
    (presumably so the files are updated in place -- confirm against
    ``EADDir.apply_function_to_dir``).
    """
    eads = EADDir()
    destination = eads.input_dir
    eads.apply_function_to_dir(add_uuids, output_dir=destination)
import csv

from lxml import etree

from utilities.ead_utilities.ead_utilities import EADDir

def find_multiple_physdescs(ead):
    """Return one row per <physdesc> that has more than one <extent> child.

    :param ead: object exposing ``filename`` and an lxml ``tree``.
    :return: list of ``[filename, xpath of the physdesc, number of extents,
             serialized physdesc]`` rows; empty when nothing matches.
    """
    rows = []
    for physdesc in ead.tree.xpath("//physdesc"):
        extents = physdesc.xpath("extent")
        if len(extents) <= 1:
            continue

        filename = ead.filename
        location = ead.tree.getpath(physdesc)
        # The element is serialized, re-parsed and serialized again
        # (presumably to drop the text that trails it in the source -- confirm).
        markup = etree.tostring(etree.fromstring(etree.tostring(physdesc)))
        rows.append([filename, location, len(extents), markup])

    return rows

if __name__ == "__main__":
    # Script entry point: characterize every EAD in the directory and dump
    # the collected rows (no header row) to a CSV file.
    input_dir = r'C:\Users\wboyle\PycharmProjects\vandura\Real_Masters_all'
    results = EADDir(input_dir).characterize_dir(find_multiple_physdescs)

    # NOTE(review): "wb" looks like the Python 2 csv convention -- confirm.
    with open("eads_with_multiple_extents.csv", mode="wb") as report:
        csv_out = csv.writer(report)
        for rows in results:
            csv_out.writerows(rows)
Ejemplo n.º 12
0
def add_aspace_ids_to_all_agents_in_dir(name_to_id_map, path_to_eads):
    """Run ``add_aspace_ids_to_agents`` over every EAD in ``path_to_eads``.

    :param name_to_id_map: forwarded to the applied function as ``var1``.
    :param path_to_eads: directory used both as the input and as the output
                         directory.
    """
    EADDir(input_dir=path_to_eads).apply_function_to_dir(
        function=add_aspace_ids_to_agents,
        var1=name_to_id_map,
        output_dir=path_to_eads)
Ejemplo n.º 13
0
from lxml import etree

from utilities.ead_utilities.ead_utilities import EADDir


def find_multiple_physdescs(ead):
    """List the <physdesc> elements that contain more than one <extent>.

    :param ead: object exposing ``filename`` and an lxml ``tree``.
    :return: a list with one ``[filename, xpath, extent count, serialized
             element]`` entry per matching <physdesc>; empty if none match.
    """
    matches = []
    for node in ead.tree.xpath("//physdesc"):
        extent_count = len(node.xpath("extent"))
        if extent_count > 1:
            source_file = ead.filename
            node_path = ead.tree.getpath(node)
            # Round trip through the parser before the final serialization
            # (presumably to shed trailing text -- confirm).
            reparsed = etree.fromstring(etree.tostring(node))
            matches.append(
                [source_file, node_path, extent_count,
                 etree.tostring(reparsed)])

    return matches


if __name__ == "__main__":
    # Script entry point: run find_multiple_physdescs over the directory and
    # write every returned batch of rows to a CSV file (no header row).
    input_dir = r'C:\Users\wboyle\PycharmProjects\vandura\Real_Masters_all'
    source_dir = EADDir(input_dir)
    results = source_dir.characterize_dir(find_multiple_physdescs)

    # NOTE(review): binary mode is presumably for the Python 2 csv module --
    # confirm the target interpreter.
    with open("eads_with_multiple_extents.csv", mode="wb") as out_file:
        sheet = csv.writer(out_file)
        for batch in results:
            sheet.writerows(batch)