Example #1
def main(args):
    print(f"Adding {args.award} to {args.anthology_id}...")

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(
        f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is None:
        print(f"Error: Can't find paper {args.anthology_id}, quitting")
        sys.exit(1)

    existing_award = paper.find("./award")
    if existing_award is not None and existing_award.text.lower() == args.award:
        print(
            f"Error: Award {args.award} already exists for {args.anthology_id}, quitting"
        )
        sys.exit(1)

    make_simple_element("award", args.award, parent=paper)
    indent(tree.getroot())

    tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
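
All of these examples lean on a handful of shared helpers (deconstruct_anthology_id, make_simple_element, indent) that are imported rather than defined here. A minimal sketch of their assumed behavior, for orientation only:

import lxml.etree as ET

def deconstruct_anthology_id(anthology_id):
    # Assumed behavior (not the actual anthology.utils implementation):
    # "2020.acl-main.1" -> ("2020.acl", "main", "1")   [new-style ID]
    # "P18-1001"        -> ("P18", "1", "001")         [old-style ID]
    collection_id, rest = anthology_id.split("-", 1)
    if "." in collection_id:  # new-style IDs contain a dot before the hyphen
        volume_id, _, paper_id = rest.partition(".")
        return collection_id, volume_id, paper_id
    # old-style IDs; two-digit workshop volume numbers are glossed over here
    return collection_id, rest[0], rest[1:]

def make_simple_element(tag, text=None, attrib=None, parent=None):
    # Create an element, set its text and attributes, and optionally
    # append it to a parent node.
    element = ET.Element(tag, attrib=attrib or {})
    element.text = text
    if parent is not None:
        parent.append(element)
    return element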
Example #2
def process_volume(anthology_volume):

    collection_id, volume_id, _ = deconstruct_anthology_id(anthology_volume)

    print(f'Attempting to add DOIs for {anthology_volume}', file=sys.stderr)

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'xml', f'{collection_id}.xml')
    tree = ET.parse(xml_file)

    formatter = MarkupFormatter()

    num_added = 0

    volume = tree.getroot().find(f"./volume[@id='{volume_id}']")
    if volume is not None:
        volume_booktitle = volume.find("./meta/booktitle")
        volume_title = formatter.as_text(volume_booktitle)
        print(f'-> found existing volume "{volume_title}"', file=sys.stderr)

        # Iterate through all papers
        for paper in chain(volume.findall('frontmatter'), volume.findall('paper')):
            added = add_doi(paper, collection_id, volume_id, force=args.force)
            if added:
                num_added += 1
                sleep(1)

        indent(tree.getroot())

        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
        print(f'-> added {num_added} DOIs to the XML for collection {collection_id}', file=sys.stderr)

    else:
        print(f'-> FATAL: volume {anthology_volume} not found in the Anthology', file=sys.stderr)
        sys.exit(1)
Example #3
def main(args):

    for xml_file in args.xml_files:
        tree = ET.parse(xml_file)
        for paper in tree.getroot().findall(".//paper"):
            make_simple_element("language", "eng", parent=paper)

        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
Example #4
def main(args):

    for lineno, line in enumerate(sys.stdin, 1):
        # attachments/D/D15/D15-1272.Attachment.pdf
        tokens = line.rstrip().split("/")
        attachment_file_name = tokens[-1]
        try:
            anth_id, kind, *rest = attachment_file_name.split(".")
        except ValueError:
            print(f"Couldn't parse file {attachment_file_name} into 3 pieces")
            continue

        try:
            collection_id, volume_id, paper_id = deconstruct_anthology_id(
                anth_id)
        except Exception:
            print(f"[{lineno}] BAD LINE {line.rstrip()}")
            continue

        # Update XML
        xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data",
                                "xml", f"{collection_id}.xml")
        tree = ET.parse(xml_file)

        if int(paper_id) == 0:
            paper = tree.getroot().find(
                f"./volume[@id='{volume_id}']/frontmatter")
        else:
            paper = tree.getroot().find(
                f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
        if paper is not None:
            # Check if attachment already exists
            for attachment in paper.findall("attachment"):
                if attachment.text == attachment_file_name:
                    # attachment already exists in the XML; skip it
                    break
            else:
                attachment = ET.Element("attachment")
                attachment.attrib["type"] = kind.lower()
                attachment.text = attachment_file_name

                paper.append(attachment)
                indent(tree.getroot())
                tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
                print(
                    f"-> [{lineno}] added attachment {attachment_file_name} to the XML",
                    file=sys.stderr,
                )

        else:
            print(
                f"-> FATAL: [{lineno}] paper ({anth_id}) not found in the Anthology",
                file=sys.stderr,
            )
            sys.exit(1)
Example #5
def main(args):
    combo_df = combine_tsv(args['tsv_files'])
    combo_df_uniques = combo_df['anthology_id'].apply(split_anth_id).unique()

    for xml in os.listdir(data_dir):
        fname, ext = os.path.splitext(xml)
        if fname in combo_df_uniques.tolist() or fname == "2020.acl":
            tree = et.parse(os.path.join(data_dir, xml))

            df_subset = combo_df[combo_df['anthology_id'].str.startswith(fname)]
            df_subset.apply(add_video_tag, axis=1, xml_parse=tree)

            with open(os.path.join(data_dir, fname + ".xml"), 'wb') as f:
                indent(tree.getroot())
                tree.write(f, encoding="UTF-8", xml_declaration=True)
Example #6
def write_bibkeys(anthology, srcdir, commit=False):
    for volume_id, volume in anthology.volumes.items():
        papers_without_bibkey = []

        for paper in volume:
            bibkey = paper.bibkey
            if bibkey is None or bibkey == paper.full_id:
                papers_without_bibkey.append(paper)

        if papers_without_bibkey:
            log.info(
                f"Found {len(papers_without_bibkey):4d} papers without bibkeys in volume {volume_id}"
            )
            if not commit:
                continue
        else:
            continue

        # We got some new bibkeys and need to write them to the XML
        xml_file = os.path.join(srcdir, "xml", f"{volume.collection_id}.xml")
        tree = ET.parse(xml_file)
        root = tree.getroot()

        for paper in papers_without_bibkey:
            if paper.paper_id == "0":
                node = root.find(
                    f"./volume[@id='{paper.volume_id}']/frontmatter")
                if node is None:  # dummy frontmatter
                    continue
            else:
                node = root.find(
                    f"./volume[@id='{paper.volume_id}']/paper[@id='{paper.paper_id}']"
                )
            if node is None:
                log.error(f"Paper {paper.full_id} not found in {xml_file}")
                continue

            # Generate unique bibkey
            bibkey = anthology.pindex.create_bibkey(paper,
                                                    vidx=anthology.venues)
            make_simple_element("bibkey", bibkey, parent=node)

        indent(root)
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
Example #7
def main(args):
    """
    Downloads an Anthology paper and adds a RETRACTED watermark, then updates the XML
    with appropriate <revision> and <retracted> tags.
    """

    with tempfile.TemporaryDirectory() as tempdir:

        new_pdf = add_watermark(args.anthology_id, workdir=tempdir)

        add_revision(
            args.anthology_id,
            new_pdf,
            explanation="Retracted.",
            change_type="revision",
            dry_run=False,
        )

        xml_file = get_xml_file(args.anthology_id)
        collection_id, volume_id, paper_id = deconstruct_anthology_id(
            args.anthology_id)
        tree = ET.parse(xml_file)
        if paper_id == "0":
            paper = tree.getroot().find(
                f"./volume[@id='{volume_id}']/frontmatter")
        else:
            paper = tree.getroot().find(
                f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")

        if paper is None:
            print(f"Couldn't find paper {args.anthology_id}!", file=sys.stderr)
            sys.exit(2)

        print("Modifying the XML", file=sys.stderr)
        now = datetime.now()
        date = f"{now.year}-{now.month:02d}-{now.day:02d}"
        retracted_node = make_simple_element("retracted",
                                             args.explanation,
                                             attrib={"date": date},
                                             parent=paper)
        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
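
The get_xml_file helper used above isn't shown; judging from the path construction repeated in the other examples, it presumably resolves an Anthology ID to its collection's XML file, along these lines (a sketch, not the real implementation):

import os
import sys

def get_xml_file(anthology_id):
    # Assumed behavior: mirror the os.path.join(...) pattern used in the
    # other examples to locate the collection's XML file.
    collection_id, _, _ = deconstruct_anthology_id(anthology_id)
    return os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                        f"{collection_id}.xml")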
Example #8
def main(args):

    for line in args.isbn_file:
        venue, isbn = line.rstrip().split()

        xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data",
                                "xml", f"{venue}.xml")
        if not os.path.exists(xml_file):
            print(f"Can't find {xml_file}")
            continue
        tree = ET.parse(xml_file)
        meta = tree.getroot().find(".//volume[@id='1']/meta")
        if meta is not None and meta.find("./isbn") is None:
            print(f"Adding {isbn} to {venue} meta block")
            make_simple_element("isbn", isbn, parent=meta)
        elif meta is not None and meta.find("./isbn") is not None:
            print(f"{venue} already done")

        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
Example #9
def main(args):

    for xml_file in args.files:
        # Update XML
        tree = ET.parse(xml_file)
        tree.getroot().tail = '\n'

        for paper in tree.getroot().findall('.//paper'):
            tail = paper.tail
            seen = []
            for attachment in paper.findall('./attachment'):
                if attachment.text in seen:
                    print(f'Removing: {attachment.text}')
                    paper.remove(attachment)
                seen.append(attachment.text)

            indent(paper, level=2)
            paper.tail = tail

        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
Example #10
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..",
                                     "data")
    venue_index = VenueIndex(srcdir=anthology_datadir)
    venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()]

    sig_index = SIGIndex(srcdir=anthology_datadir)

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        venue_abbrev = meta["abbrev"]
        venue_slug = venue_index.get_slug(venue_abbrev)

        if str(datetime.now().year) in venue_abbrev:
            print(
                f"Fatal: Venue assembler put year in acronym: '{venue_abbrev}'"
            )
            sys.exit(1)

        if venue_slug not in venue_keys:
            unseen_venues.append((venue_slug, venue_abbrev, meta["title"]))

        meta["path"] = proceedings

        meta["collection_id"] = collection_id = meta["year"] + "." + venue_slug
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print(f"Error: volume {volume_full_id} already exists")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

        if "sig" in meta:
            print(
                f"Add this line to {anthology_datadir}/sigs/{meta['sig'].lower()}.yaml:"
            )
            print(f"  - {meta['year']}:")
            print(f"    - {volume_full_id} # {meta['booktitle']}")

    # Make sure all venues exist
    if len(unseen_venues) > 0:
        for venue in unseen_venues:
            slug, abbrev, title = venue
            print(f"Creating venue '{abbrev}' ({title})")
            venue_index.add_venue(abbrev, title)
        venue_index.dump(directory=anthology_datadir)

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        # copy the book
        book_src_filename = f'{year}.{meta["abbrev"]}-{volume_name}.pdf'
        book_src_path = os.path.join(root_path, book_src_filename)
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") +
                ".pdf")

            if not args.dry_run:
                maybe_copy(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # Skip . files
            if os.path.basename(pdf_file).startswith("."):
                continue

            # names are {abbrev}{number}.pdf
            match = re.match(r".*\.(\d+)\.pdf", pdf_file)

            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir,
                    f"{collection_id}-{volume_name}.{paper_num}.pdf")
                if not args.dry_run:
                    maybe_copy(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir,
                                                venue_name)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(
                    os.path.join(root_path, "additional")):
                if os.path.basename(attachment_file).startswith("."):
                    continue
                attachment_file_path = os.path.join(root_path, "additional",
                                                    attachment_file)
                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$",
                    attachment_file)
                if match is None:
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}",
                        args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num][
                    "attachments"].append((dest_path, type_))

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def correct_caps(person, name_node, anth_id):
        """
        Many people submit their names in "ALL CAPS" or "all lowercase".
        Correct this with heuristics.
        """
        name = name_node.text
        if name.islower() or name.isupper():
            # capitalize all parts
            corrected = " ".join(part.capitalize() for part in name.split())
            print(
                f"-> Correcting capitalization of '{name}' to '{corrected}'",
                file=sys.stderr,
            )
            name_node.text = corrected

    def disambiguate_name(node, anth_id):
        name = PersonName.from_element(node)
        ids = people.get_ids(name)

        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))

            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        # Newly added volumes, so we can normalize and name-disambig later
        newly_added_volumes = []

        collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                       f"{collection_id}.xml")
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection",
                                            attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume",
                attrib={
                    "id": volume_id,
                    "ingest-date": args.ingest_date
                },
            )

            # Replace the existing one if present
            existing_volume_node = root_node.find(
                f"./volume[@id='{volume_id}']")
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for author_or_editor in chain(
                            paper_node.findall("./author"),
                            paper_node.findall("./editor")):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={
                                "hash": compute_hash_from_file(book_dest_path)
                            },
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields
                    for child in paper_node:
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                # Adjust the language tag
                language_node = paper_node.find("./language")
                if language_node is not None:
                    try:
                        lang = iso639.languages.get(name=language_node.text)
                    except KeyError:
                        raise Exception(
                            f"Can't find language '{language_node.text}'")
                    language_node.text = lang.part3
                    print(language_node.text)

                # Fix author names
                for name_node in chain(paper_node.findall("./author"),
                                       paper_node.findall("./editor")):
                    disambiguate_name(name_node, paper_id_full)
                    person = PersonName.from_element(name_node)
                    for name_part in name_node:
                        correct_caps(person, name_part, paper_id_full)

        # Other data from the meta file
        if "isbn" in meta:
            make_simple_element("isbn", meta["isbn"], parent=meta_node)

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(collection_file,
                   encoding="UTF-8",
                   xml_declaration=True,
                   with_tail=True)
Example #11
"""
One-time script that converted the old revision style

    <revision id="2">P18-1001v2</revision>

to the new revision style that mandates an explanation

    <revision id="2" href="P18-1001v2">Added new references.</revision>
"""

import lxml.etree as etree
import re
import sys

from anthology.utils import infer_url, test_url, indent

filename = sys.argv[1]
outfilename = sys.argv[2]
tree = etree.parse(filename)
root = tree.getroot()
collection_id = root.attrib["id"]

papers = list(root.findall(".//paper")) + list(root.findall(".//frontmatter"))

for paper in papers:
    for revision in paper.findall("revision"):
        revision.attrib["href"] = revision.text
        revision.text = "No description of the changes was recorded."

indent(root)
tree.write(outfilename, encoding="UTF-8", xml_declaration=True, with_tail=True)
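
The script takes its input and output XML files as positional arguments, so converting a single collection would be invoked roughly like this (the script name is illustrative):

python convert_revision_style.py data/xml/P18.xml data/xml/P18.xml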
Example #12
def main(args):

    print(f'Processing attachment for {args.anthology_id}', file=sys.stderr)

    if args.path.startswith('http'):
        _, input_file_path = tempfile.mkstemp()
        try:
            print('-> Downloading file from {}'.format(args.path),
                  file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(
                    input_file_path, mode='wb') as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print(
                '-> FATAL: An SSL error was encountered in downloading the files.',
                file=sys.stderr)
            sys.exit(1)
    else:
        input_file_path = args.path

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)
    paper_extension = args.path.split('.')[-1]

    if paper_extension not in ALLOWED_TYPES:
        print(
            f'-> FATAL: {args.anthology_id} unknown file extension {paper_extension}',
            file=sys.stderr)
        sys.exit(1)

    attachment_file_name = f'{args.anthology_id}.{args.type}.{paper_extension}'

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'xml',
                            f'{collection_id}.xml')
    tree = ET.parse(xml_file)

    paper = tree.getroot().find(
        f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        # Check if attachment already exists
        for attachment in paper.findall('attachment'):
            if attachment.text == attachment_file_name:
                print(
                    f'-> attachment {attachment_file_name} already exists in the XML',
                    file=sys.stderr)
                break
        else:
            attachment = ET.Element('attachment')
            attachment.attrib['type'] = args.type.lower()
            attachment.text = attachment_file_name

            paper.append(attachment)
            indent(tree.getroot())
            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> added attachment {attachment_file_name} to the XML',
                  file=sys.stderr)

    else:
        print(
            f'-> FATAL: paper (volume={volume_id}, paper={paper_id}) not found in the Anthology',
            file=sys.stderr)
        sys.exit(1)

    # Make sure directory exists
    output_dir = os.path.join(args.attachment_root, collection_id[0],
                              collection_id)
    if not os.path.exists(output_dir):
        print(f'-> Creating directory {output_dir}', file=sys.stderr)
        os.makedirs(output_dir)

    # Copy file
    dest_path = os.path.join(output_dir, attachment_file_name)
    if os.path.exists(dest_path):
        print(
            f'-> target file {dest_path} already in place, refusing to overwrite',
            file=sys.stderr)
    else:
        shutil.copy(input_file_path, dest_path)
        os.chmod(dest_path, 0o644)
        print(f'-> copied {input_file_path} to {dest_path} and fixed perms',
              file=sys.stderr)

    # Clean up
    if args.path.startswith('http'):
        os.remove(input_file_path)
Example #13
def main(args):
    year, venue, _ = os.path.basename(args.tsv_file.name).split(".")

    # Set the volume name from the collection file, or default to 1
    # The file name is either "2012.eamt.tsv" or "2012.eamt-main.tsv".
    # The default volume name is "1".
    if "-" in venue:
        venue, volume_id = venue.split("-")
    else:
        volume_id = "1"

    collection_id = f"{year}.{venue}"

    tree = etree.ElementTree(
        make_simple_element("collection", attrib={"id": collection_id}))

    now = datetime.now()
    today = f"{now.year}-{now.month:02d}-{now.day:02d}"

    volume = make_simple_element("volume",
                                 attrib={
                                     "id": volume_id,
                                     "ingest-date": today
                                 })
    tree.getroot().insert(0, volume)

    # Location of entire-proceedings PDF
    proceedings_pdf = args.proceedings

    # Create the metadata for the paper
    meta = None
    for row in csv.DictReader(args.meta_file, delimiter="\t"):
        current_collection_id = f"{row['Year']}.{row['Conference code']}"
        if current_collection_id == collection_id:
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle",
                                row["Conference title"],
                                parent=meta)
            make_simple_element("publisher", row["Publisher"], parent=meta)
            make_simple_element("address", row["Location"], parent=meta)
            make_simple_element("month", row["Dates held"], parent=meta)
            make_simple_element("year", row["Year"], parent=meta)

            url = row["URL"]

            if url.endswith(".pdf"):
                if proceedings_pdf:
                    print(
                        "Overriding --proceedings with proceedings PDF found in conference list",
                        file=sys.stderr,
                    )
                proceedings_pdf = url

            elif "Complete PDF" in row and row["Complete PDF"] != "":
                proceedings_pdf = row["Complete PDF"]

            # volume PDF
            if proceedings_pdf is not None:
                volume_anth_id = f"{collection_id}-{volume_id}"
                pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                              f"{volume_anth_id}.pdf")
                download(proceedings_pdf, pdf_local_path)
                with open(pdf_local_path, "rb") as f:
                    checksum = compute_hash(f.read())
                make_simple_element("url",
                                    volume_anth_id,
                                    attrib={"hash": checksum},
                                    parent=meta)
                proceedings_pdf = pdf_local_path

            if row["Editors"] != "" and "?" not in row["Editors"]:
                editors = row["Editors"].split(" and ")
                for editor_name in editors:
                    editor = make_simple_element("editor", parent=meta)
                    if ", " in editor_name:
                        last, first = editor_name.split(", ")
                    else:
                        first, last = (
                            ' '.join(editor_name.split()[:-1]),
                            editor_name.split()[-1],
                        )
                    make_simple_element("first", first, parent=editor)
                    make_simple_element("last", last, parent=editor)
            break
    else:
        print(
            f"Couldn't find conference code {collection_id} in 'Conference code' field of metadata file {args.meta_file.name}",
            file=sys.stderr,
        )
        sys.exit(1)

    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    paperid = 0
    # Create entries for all the papers
    for row in csv.DictReader(args.tsv_file, delimiter='\t'):
        pages = row.get("Pagenumbers", None)

        title_text = row["Title"]

        # The first row might be front matter (needs a special name)
        if title_text == "Frontmatter" and paperid == 0:
            paper = make_simple_element("frontmatter", parent=volume)

        else:
            paperid += 1
            paper = make_simple_element("paper",
                                        attrib={"id": str(paperid)},
                                        parent=volume)
            # Only make the title for not-the-frontmatter
            make_simple_element("title", title_text, parent=paper)

        author_list = row["Authors"].split(" and ")

        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            if ", " in author_name:
                last, first = author_name.split(", ")
            else:
                first, last = ' '.join(
                    author_name.split()[:-1]), author_name.split()[-1]
            make_simple_element("first", first, parent=author)
            make_simple_element("last", last, parent=author)

        if pages is not None:
            make_simple_element("pages", pages, parent=paper)

        # Find the PDF, either listed directly, or extracted from the proceedings PDF
        anth_id = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                      f"{anth_id}.pdf")
        url = None
        if "Pdf" in row and row["Pdf"] != "":
            if download(row["Pdf"], pdf_local_path):
                url = anth_id

        elif "pages in pdf" in row:
            pdf_pages = row["pages in pdf"]
            extract_pages(proceedings_pdf, pdf_pages, pdf_local_path)
            url = anth_id

        if url is not None:
            with open(pdf_local_path, "rb") as f:
                checksum = compute_hash(f.read())

            make_simple_element("url",
                                url,
                                attrib={"hash": checksum},
                                parent=paper)

        if "Abstract" in row:
            make_simple_element("abstract", row["Abstract"], parent=paper)

        if "Presentation" in row:
            url = row["Presentation"]
            if url is not None and url != "" and url != "None":
                extension = row["Presentation"].split(".")[-1]
                name = f"{anth_id}.Presentation.{extension}"
                local_path = os.path.join(
                    args.anthology_files_path,
                    "..",
                    "attachments",
                    venue,
                    name,
                )
                if download(row["Presentation"], local_path):
                    make_simple_element("attachment",
                                        name,
                                        attrib={"type": "presentation"},
                                        parent=paper)

        # Normalize
        for node in paper:
            normalize(node, informat="latex")

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    collection_file = os.path.join(args.anthology, "data", "xml",
                                   f"{collection_id}.xml")
    tree.write(collection_file,
               encoding="UTF-8",
               xml_declaration=True,
               with_tail=True)
Example #14
def main(args):
    code, year, _ = os.path.basename(args.tsv_file.name).split(".")

    collection_id = f"{year}.{code}"

    tree = etree.ElementTree(
        make_simple_element("collection", attrib={"id": collection_id}))

    volume_id = "1"
    volume = make_simple_element("volume", attrib={"id": volume_id})
    tree.getroot().insert(0, volume)

    # Create the metadata for the paper
    meta = None
    for row in csv.DictReader(args.meta_file, delimiter="\t"):
        if row["Conference code"] == collection_id:
            if row["Completed"] == "FALSE":
                print(
                    f"Warning: Conference {collection_id} is not marked as completed, can't ingest."
                )
                sys.exit(1)

            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle",
                                row["Conference title"],
                                parent=meta)
            make_simple_element("publisher", row["Publisher"], parent=meta)
            make_simple_element("address", row["Location"], parent=meta)
            make_simple_element("month", row["Dates held"], parent=meta)
            make_simple_element("year", row["Year"], parent=meta)
            if row["Editors"] != "" and "?" not in row["Editors"]:
                editors = row["Editors"].split(" and ")
                for editor_name in editors:
                    editor = make_simple_element("editor", parent=meta)
                    last, first = editor_name.split(", ")
                    make_simple_element("first", first, parent=editor)
                    make_simple_element("last", last, parent=editor)
            break
    else:
        print(
            f"Couldn't find conference code {collection_id} in 'Conference code' field of metadata file {args.meta_file.name}",
            file=sys.stderr)
        sys.exit(1)

    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    # Create entries for all the papers
    for paperid, row in enumerate(
            csv.DictReader(args.tsv_file, delimiter='\t'), 1):
        title_text = row["Title"]
        author_list = row["Authors"].split(" and ")
        pdf = row["Pdf"]

        paper = make_simple_element("paper",
                                    attrib={"id": str(paperid)},
                                    parent=volume)

        make_simple_element("title", title_text, parent=paper)
        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            print(author_name)
            last, first = author_name.split(", ")
            make_simple_element("first", first, parent=author)
            make_simple_element("last", last, parent=author)

        url = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(collection_id, f"{url}.pdf")
        make_simple_element("url", url, parent=paper)
        download(pdf, pdf_local_path)

        if "Abstract" in row:
            make_simple_element("abstract", row["Abstract"], parent=paper)

        if "Presentation" in row:
            extension = row["Presentation"].split(".")[-1]
            filename = f"{collection_id}-{volume_id}.{paperid}.Presentation.{extension}"
            make_simple_element("attachment",
                                filename,
                                attrib={"type": "presentation"})
            download(row["Presentation"], os.path.join(collection_id,
                                                       filename))

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    collection_file = os.path.join(args.anthology, "data", "xml",
                                   f"{collection_id}.xml")
    tree.write(collection_file,
               encoding="UTF-8",
               xml_declaration=True,
               with_tail=True)
Example #15
def add_attachment(anthology_id, path, attach_type, overwrite=False):
    """
    Adds a single attachment to the Anthology data files.

    Arguments:
    - The ACL ID of the paper (e.g., P17-1012)
    - The path to the attachment (can be a URL)
    - The attachment type (poster, presentation, note, software)
    - Whether to overwrite the downloaded file.
    """

    collection_id, volume_id, paper_id = deconstruct_anthology_id(anthology_id)

    if path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f"-> Downloading file from {path} to {input_file_path}",
                  file=sys.stderr)
            request = urllib.request.Request(
                path, headers={'User-Agent': 'Mozilla/5.0'})
            with urllib.request.urlopen(request) as url, open(
                    input_file_path, mode="wb") as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            raise Exception(f"Could not download {path}")
    else:
        input_file_path = path

    file_extension = path.replace("?dl=1", "").split(".")[-1]
    # Many links from file sharing services are not informative and don't have
    # extensions, so we try to guess the type from the file contents.
    if file_extension not in ALLOWED_TYPES:
        detected = filetype.guess(input_file_path)
        if detected is not None:
            file_extension = detected.mime.split("/")[-1]
            if file_extension not in ALLOWED_TYPES:
                print(
                    f"Could not determine file extension for {anthology_id} at {path}",
                    file=sys.stderr,
                )

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)

    attachment_file_name = f"{anthology_id}.{attach_type}.{file_extension}"

    paper = tree.getroot().find(
        f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        # Check if attachment already exists
        for attachment in paper.findall("attachment"):
            if attachment.text == attachment_file_name:
                print(
                    f"-> attachment {attachment_file_name} already exists in the XML",
                    file=sys.stderr,
                )
                break
        else:
            attachment = ET.Element("attachment")
            attachment.attrib["type"] = attach_type.lower()
            attachment.attrib["hash"] = checksum
            attachment.text = attachment_file_name

            paper.append(attachment)
            indent(tree.getroot())
            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f"-> added attachment {attachment_file_name} to the XML",
                  file=sys.stderr)

    else:
        print(f"Paper {anthology_id} not found in the Anthology",
              file=sys.stderr)

    # Make sure directory exists
    output_dir = os.path.join(args.attachment_root, collection_id[0],
                              collection_id)
    if not os.path.exists(output_dir):
        # print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    # Copy file
    dest_path = os.path.join(output_dir, attachment_file_name)
    if os.path.exists(dest_path) and not overwrite:
        print(
            f"-> target file {dest_path} already in place, refusing to overwrite",
            file=sys.stderr,
        )
        return None

    shutil.copy(input_file_path, dest_path)
    os.chmod(dest_path, 0o644)
    print(f"-> copied {input_file_path} to {dest_path} and fixed perms",
          file=sys.stderr)

    # Clean up
    if path.startswith("http"):
        os.remove(input_file_path)

    return dest_path
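
A call might look like the following, using the paper ID from the docstring; the URL is a placeholder, and the argument parsing that supplies the global args.attachment_root used above isn't shown:

# Hypothetical invocation of add_attachment (illustrative values).
dest = add_attachment("P17-1012",
                      "https://example.org/P17-1012.Poster.pdf",
                      "poster",
                      overwrite=False)
if dest is not None:
    print(f"Attachment stored at {dest}")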
Example #16
def main(args):
    def maybe_copy(file_from, file_to):
        if not args.dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        download_file(args.path, input_file_path)
    else:
        input_file_path = args.path

    validate_file_type(input_file_path)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)
    paper_extension = args.path.split(".")[-1]

    # The new version
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
    if is_newstyle_id(args.anthology_id):
        venue_name = collection_id.split(".")[1]
        output_dir = os.path.join(args.anthology_dir, "pdf", venue_name)
    else:
        output_dir = os.path.join(args.anthology_dir, "pdf", collection_id[0],
                                  collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    if paper_id == "0":
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not args.dry_run:
            # Update the URL hash on the <url> tag
            url = paper.find("./url")
            if url is not None:
                url.attrib["hash"] = checksum

            # Guard on <url> so current_version_url and old_checksum are
            # always defined before they are used below
            if (not args.erratum and revno == 2
                    and paper.find("./url") is not None):
                current_version_url = infer_url(
                    paper.find("./url").text) + ".pdf"

                # Download original file
                # There are no versioned files the first time around, so create the first one
                # (essentially backing up the original version)
                revised_file_v1_path = os.path.join(
                    output_dir, f"{args.anthology_id}{change_letter}1.pdf")

                download_file(current_version_url, revised_file_v1_path)
                validate_file_type(revised_file_v1_path)

                with open(revised_file_v1_path, "rb") as f:
                    old_checksum = compute_hash(f.read())

                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{args.anthology_id}{change_letter}1",
                        "hash": old_checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                args.explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{args.anthology_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": args.date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)

    else:
        print(
            f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    if args.path.startswith("http"):
        os.remove(input_file_path)
Example #17
def main(args):

    change_type = 'erratum' if args.erratum else 'revision'
    change_letter = 'e' if args.erratum else 'v'

    print(f'Processing {change_type} to {args.anthology_id}...')

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith('http'):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f'-> Downloading file from {args.path}', file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(input_file_path, mode='wb') as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print('An SSL error was encountered in downloading the files.', file=sys.stderr)
            sys.exit(1)
    else:
        input_file_path = args.path

    collection_id, volume_id, paper_id = deconstruct_anthology_id(args.anthology_id)
    paper_extension = args.path.split('.')[-1]

    # The new version
    revno = None

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'xml', f'{collection_id}.xml')
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib['id']) + 1

        if args.do:
            revision = ET.Element(change_type)
            revision.attrib['id'] = str(revno)
            revision.attrib['href'] = f'{args.anthology_id}{change_letter}{revno}'
            revision.text = args.explanation

            paper.append(revision)

            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML', file=sys.stderr)

    else:
        print(f'-> FATAL: paper ID {args.anthology_id} not found in the Anthology', file=sys.stderr)
        sys.exit(1)

    output_dir = os.path.join(args.anthology_dir, 'pdf', collection_id[0], collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f'-> Creating directory {output_dir}', file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f'{args.anthology_id}.pdf')

    if not args.erratum and revno == 2:
        # There are no versioned files the first time around, so create the first one
        # (essentially backing up the original version)
        revised_file_v1_path = os.path.join(output_dir, f'{args.anthology_id}{change_letter}1.pdf')

        current_version = ANTHOLOGY_PDF.format(args.anthology_id)
        if args.do:
            try:
                print(f'-> Downloading file from {current_version} to {revised_file_v1_path}', file=sys.stderr)
                with urllib.request.urlopen(current_version) as url, open(revised_file_v1_path, mode='wb') as fh:
                    fh.write(url.read())
            except ssl.SSLError:
                print(f'-> FATAL: An SSL error was encountered in downloading {current_version}.', file=sys.stderr)
                sys.exit(1)
        else:
            print(f'-> DRY RUN: Downloading file from {current_version} to {revised_file_v1_path}', file=sys.stderr)


    revised_file_versioned_path = os.path.join(output_dir, f'{args.anthology_id}{change_letter}{revno}.pdf')

    maybe_copy(input_file_path, revised_file_versioned_path, args.do)
    maybe_copy(input_file_path, canonical_path, args.do)

    if args.path.startswith('http'):
        os.remove(input_file_path)
Example #18
def main(args):
    def maybe_copy(file_from, file_to):
        if not args.dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f"-> Downloading file from {args.path}", file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(
                    input_file_path, mode="wb") as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print("An SSL error was encountered in downloading the files.",
                  file=sys.stderr)
            sys.exit(1)
    else:
        input_file_path = args.path

    detected = filetype.guess(input_file_path)
    if detected is None or not detected.mime.endswith(detected.extension):
        mime_type = 'UNKNOWN' if detected is None else detected.mime
        print(
            f"FATAL: {args.anthology_id} file {args.path} has MIME type {mime_type}",
            file=sys.stderr,
        )
        sys.exit(1)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)
    paper_extension = args.path.split(".")[-1]

    # The new version
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(
        f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not args.dry_run:
            if not args.erratum and revno == 2:
                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{args.anthology_id}{change_letter}1",
                        "hash": checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                args.explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{args.anthology_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": args.date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)

    else:
        print(
            f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    output_dir = os.path.join(args.anthology_dir, "pdf", collection_id[0],
                              collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    if not args.erratum and revno == 2:
        # There are no versioned files the first time around, so create the first one
        # (essentially backing up the original version)
        revised_file_v1_path = os.path.join(
            output_dir, f"{args.anthology_id}{change_letter}1.pdf")

        current_version = ANTHOLOGY_PDF.format(args.anthology_id)
        if not args.dry_run:
            try:
                print(
                    f"-> Downloading file from {args.path} to {revised_file_v1_path}",
                    file=sys.stderr,
                )
                with urllib.request.urlopen(current_version) as url, open(
                        revised_file_v1_path, mode="wb") as fh:
                    fh.write(url.read())
            except ssl.SSLError:
                print(
                    f"-> FATAL: An SSL error was encountered in downloading {args.path}.",
                    file=sys.stderr,
                )
                sys.exit(1)
        else:
            print(
                f"-> DRY RUN: Downlading file from {args.path} to {revised_file_v1_path}",
                file=sys.stderr,
            )

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    if args.path.startswith("http"):
        os.remove(input_file_path)
Example #19
def main(args):
    year = args.year
    venue = args.venue
    volume_id = args.volume
    collection_id = f"{year}.{venue}"

    splitter = NameSplitter(anthology_dir=args.anthology_dir)

    collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                   f"{collection_id}.xml")
    if os.path.exists(collection_file):
        tree = etree.parse(collection_file)
    else:
        tree = etree.ElementTree(
            make_simple_element("collection", attrib={"id": collection_id}))

    now = datetime.now()
    today = f"{now.year}-{now.month:02d}-{now.day:02d}"

    volume_node = tree.getroot().find(f"./volume[@id='{volume_id}']")
    if volume_node is not None:
        tree.getroot().remove(volume_node)

    volume = make_simple_element("volume",
                                 attrib={
                                     "id": volume_id,
                                     "ingest-date": today
                                 },
                                 parent=tree.getroot())

    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    # Create entries for all the papers
    for paperid, row in enumerate(
            csv.DictReader(args.tsv_file, delimiter=args.delimiter)):
        pages = row.get("pages", None)

        if paperid == 0:
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row["booktitle"], parent=meta)
            make_simple_element("publisher", row["publisher"], parent=meta)
            make_simple_element("address", row["address"], parent=meta)
            make_simple_element("month", row["month"], parent=meta)
            make_simple_element("year", year, parent=meta)

            editors = row["author"].split(" and ")
            row["author"] = ""
            for editor_name in editors:
                editor = make_simple_element("editor", parent=meta)
                surname, givenname = splitter.best_split(editor_name)
                make_simple_element("first", givenname, parent=editor)
                make_simple_element("last", surname, parent=editor)

            # volume PDF
            proceedings_pdf = args.proceedings_pdf
            if proceedings_pdf is not None:
                volume_anth_id = f"{collection_id}-{volume_id}"
                pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                              f"{volume_anth_id}.pdf")
                retrieve_url(proceedings_pdf, pdf_local_path)
                checksum = compute_hash_from_file(pdf_local_path)
                make_simple_element("url",
                                    volume_anth_id,
                                    attrib={"hash": checksum},
                                    parent=meta)
                proceedings_pdf = pdf_local_path

        title_text = row["title"]

        # The first row might be front matter (needs a special name)
        if paperid == 0 and title_text.lower() in [
                "frontmatter", "front matter"
        ]:
            paper = make_simple_element("frontmatter", parent=volume)
        else:
            if paperid == 0:
                # Not frontmatter, so paper 1
                paperid += 1

            paper = make_simple_element("paper",
                                        attrib={"id": str(paperid)},
                                        parent=volume)
            # Only make the title for not-the-frontmatter
            make_simple_element("title", title_text, parent=paper)

        author_list = row["author"].split(" and ")

        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            surname, givenname = splitter.best_split(author_name)
            make_simple_element("first", givenname, parent=author)
            make_simple_element("last", surname, parent=author)

        if pages is not None and pages != "":
            make_simple_element("pages", pages, parent=paper)

        # Find the PDF, either listed directly, or extracted from the proceedings PDF
        anth_id = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                      f"{anth_id}.pdf")
        url = None
        if "pdf" in row and row["pdf"] != "":
            if retrieve_url(row["pdf"], pdf_local_path):
                url = anth_id
            else:
                print("Can't find", row["pdf"])

        elif "pages in pdf" in row:
            pdf_pages = row["pages"]
            extract_pages(proceedings_pdf, pdf_pages, pdf_local_path)
            url = anth_id

        if url is not None:
            checksum = compute_hash_from_file(pdf_local_path)

            make_simple_element("url",
                                url,
                                attrib={"hash": checksum},
                                parent=paper)

        if "abstract" in row and row["abstract"] != "":
            make_simple_element("abstract", row["abstract"], parent=paper)

        if "presentation" in row:
            url = row["presentation"]
            if url is not None and url != "" and url != "None":
                extension = row["presentation"].split(".")[-1]
                name = f"{anth_id}.Presentation.{extension}"
                local_path = os.path.join(
                    args.anthology_files_path,
                    "..",
                    "attachments",
                    venue,
                    name,
                )
                if retrieve_url(row["presentation"], local_path):
                    make_simple_element(
                        "attachment",
                        name,
                        attrib={
                            "type": "presentation",
                            "hash": compute_hash_from_file(local_path),
                        },
                        parent=paper,
                    )

        # Normalize
        for node in paper:
            normalize(node, informat="latex")

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    tree.write(collection_file,
               encoding="UTF-8",
               xml_declaration=True,
               with_tail=True)
def add_attachment(anthology_id, path, attach_type, overwrite=False):
    """
    Adds a single attachment to the Anthology data files.

    Arguments:
    - The ACL ID of the paper (e.g., P17-1012)
    - The path to the attachment (can be a URL)
    - The attachment type (poster, presentation, note, software)
    - Whether to overwrite an existing destination file (default False).
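
    Example (hypothetical URL; assumes the script's global args, including
    args.attachment_root, have been parsed):

        add_attachment("P17-1012", "https://example.com/P17-1012.Poster.pdf", "poster")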
    """

    collection_id, volume_id, paper_id = deconstruct_anthology_id(anthology_id)
    paper_extension = path.replace("?dl=1", "").split(".")[-1]

    output_dir = os.path.join(args.attachment_root, collection_id[0],
                              collection_id)
    attachment_file_name = f"{anthology_id}.{attach_type}.{paper_extension}"
    dest_path = os.path.join(output_dir, attachment_file_name)
    if os.path.exists(dest_path) and not overwrite:
        print(
            f"-> target file {dest_path} already in place, refusing to overwrite",
            file=sys.stderr,
        )
        return None

    if path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f"-> Downloading file from {path} to {input_file_path}",
                  file=sys.stderr)
            with urllib.request.urlopen(path) as url, open(
                    input_file_path, mode="wb") as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            raise Exception(f"Could not download {path}")
    else:
        input_file_path = path

    detected = filetype.guess(input_file_path)
    if detected is None or not detected.mime.endswith(detected.extension):
        mime_type = 'UNKNOWN' if detected is None else detected.mime
        raise Exception(
            f"{anthology_id} file {path} has MIME type {mime_type}")

    if paper_extension not in ALLOWED_TYPES:
        raise Exception(
            f"-> Unknown file extension {paper_extension} for {path}")

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)

    paper = tree.getroot().find(
        f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        # Check if attachment already exists
        for attachment in paper.findall("attachment"):
            if attachment.text == attachment_file_name:
                print(
                    f"-> attachment {attachment_file_name} already exists in the XML",
                    file=sys.stderr,
                )
                break
        else:
            attachment = ET.Element("attachment")
            attachment.attrib["type"] = attach_type.lower()
            attachment.text = attachment_file_name

            paper.append(attachment)
            indent(tree.getroot())
            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f"-> added attachment {attachment_file_name} to the XML",
                  file=sys.stderr)

    else:
        raise Exception(f"Paper {anthology_id} not found in the Anthology")

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    # Copy file
    shutil.copy(input_file_path, dest_path)
    os.chmod(dest_path, 0o644)
    print(f"-> copied {input_file_path} to {dest_path} and fixed perms",
          file=sys.stderr)

    # Clean up
    if path.startswith("http"):
        os.remove(input_file_path)

    return dest_path
            else:
                old_video = old_paper.find("video")
                logging.info(old_video)
                if old_video is not None:
                    logging.info("Had old video!")
                    old_video_href = old_video.attrib["href"]
                    old_video_href_https = old_video_href.replace(
                        "http://", "https://"
                    )  # Fix for techtalks.tv links
                    old_video.attrib["href"] = old_video_href_https
                    logging.info(old_video_href)
                    papernode.append(old_video)

                old_attachment = old_paper.find("attachment")
                logging.info(old_attachment)
                if old_attachment is not None:
                    logging.info("Had an old attachment!")
                    old_attachment_type = old_attachment.attrib["type"]
                    logging.info(old_attachment_type)
                    papernode.append(old_attachment)

        # Normalize
        for oldnode in papernode:
            normalize(oldnode, informat="latex")
        volume.append(papernode)
        i += 1

    indent(collection)  # from anthology.utils
    et = etree.ElementTree(collection)
    et.write(args.outfile, encoding="UTF-8", xml_declaration=True, with_tail=True)
Example #22
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    # Build list of volumes, confirm uniqueness
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))
        meta["path"] = proceedings

        meta["collection_id"] = collection_id = (meta["year"] + "." +
                                                 meta["abbrev"].lower())
        volume_name = meta["volume_name"]
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print("Error: ")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume_name"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        print(f"VOLUME: {volume}")

        # copy the book
        book_src_filename = meta["abbrev"] + "-" + meta["year"]
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") +
                ".pdf")

            log(f"Copying {book_src_path} -> {book_dest_path}", args.dry_run)
            if not args.dry_run:
                shutil.copyfile(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # names are {abbrev}{number}.pdf
            abbrev = meta["abbrev"]
            match = re.match(rf"{abbrev}(\d+)\.pdf", pdf_file)

            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir,
                    f"{collection_id}-{volume_name}.{paper_num}.pdf")
                log(
                    f"Copying [{paper_id_full}] {pdf_src_path} -> {pdf_dest_path}",
                    args.dry_run,
                )
                if not args.dry_run:
                    shutil.copyfile(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir,
                                                collection_id)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(
                    os.path.join(root_path, "additional")):
                attachment_file_path = os.path.join(root_path, "additional",
                                                    attachment_file)
                match = re.match(rf"{abbrev}(\d+)_(\w+)\.(\w+)",
                                 attachment_file)
                if match is not None:
                    paper_num, type_, ext = match.groups()
                    paper_num = int(paper_num)

                    file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                    dest_path = os.path.join(attachments_dest_dir, file_name)
                    log(f"Copying {attachment_file_path} -> {dest_path}",
                        args.dry_run)
                    if not args.dry_run:
                        shutil.copyfile(attachment_file_path, dest_path)

                    # Store (path, type) so the XML pass below can emit both
                    collections[collection_id][volume_name][paper_num][
                        "attachments"].append((dest_path, type_))

    people = AnthologyIndex(None,
                            srcdir=os.path.join(os.path.dirname(sys.argv[0]),
                                                "..", "data"))

    for collection_id, collection in collections.items():
        collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                       f"{collection_id}.xml")
        root_node = make_simple_element("collection",
                                        attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element("volume",
                                              attrib={"id": volume_id},
                                              parent=root_node)
            meta = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta.append(title_node)
                    for editor in paper_node.findall("editor"):
                        meta.append(editor)
                    meta.append(paper_node.find("publisher"))
                    meta.append(paper_node.find("address"))
                    meta.append(paper_node.find("month"))
                    meta.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            parent=meta)

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields
                    for child in paper_node:
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                for attachment in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=attachment.path,
                        attrib={
                            "type": attachment.type,
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

        # Normalize
        for paper in root_node.findall(".//paper"):
            for oldnode in paper:
                normalize(oldnode, informat="latex")

        # Ensure names are properly identified
        ambiguous = {}
        for paper in root_node.findall(".//paper"):
            anth_id = build_anthology_id(collection_id,
                                         paper.getparent().attrib["id"],
                                         paper.attrib["id"])

        for node in chain(paper.findall("author"), paper.findall("editor")):
            name = PersonName.from_element(node)
            ids = people.get_ids(name)
            if len(ids) > 1:
                print(
                    f"WARNING ({anth_id}): ambiguous author {name}, defaulting to first of {ids}"
                )
                ambiguous[anth_id] = (name, ids)

                node.attrib["id"] = ids[0]

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(collection_file,
                   encoding="UTF-8",
                   xml_declaration=True,
                   with_tail=True)
Example #23
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..",
                                     "data")
    venue_keys = [
        venue["slug"].lower()
        for _, venue in VenueIndex(srcdir=anthology_datadir).items()
    ]

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        venue_name = meta["abbrev"].lower()

        if venue_name not in venue_keys:
            unseen_venues.append(meta["abbrev"])

        meta["path"] = proceedings

        meta["collection_id"] = collection_id = (meta["year"] + "." +
                                                 meta["abbrev"].lower())
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print("Error: ")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Make sure all venues exist
    if len(unseen_venues) > 0:
        print("FATAL: The following venue(s) don't exist in venues.yaml")
        for venue in unseen_venues:
            print(f"- {venue}")
        print("Please create entries for them and re-ingest.")
        sys.exit(1)

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        # copy the book
        book_src_filename = meta["abbrev"] + "-" + year
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") +
                ".pdf")

            if not args.dry_run and not os.path.exists(book_dest_path):
                log(f"Copying {book_src_path} -> {book_dest_path}",
                    args.dry_run)
                shutil.copyfile(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # names look like {collection_id}-{volume_name}.{number}.pdf
            match = re.match(rf".*\.(\d+)\.pdf", pdf_file)

            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir,
                    f"{collection_id}-{volume_name}.{paper_num}.pdf")
                if not args.dry_run and not os.path.exists(pdf_dest_path):
                    log(f"Copying {pdf_src_path} -> {pdf_dest_path}",
                        args.dry_run)
                    shutil.copyfile(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir,
                                                venue_name)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(
                    os.path.join(root_path, "additional")):
                attachment_file_path = os.path.join(root_path, "additional",
                                                    attachment_file)
                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$",
                    attachment_file)
                if match is None:
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}",
                        args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num][
                    "attachments"].append((dest_path, type_))

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def disambiguate_name(node, anth_id):
        name = PersonName.from_element(node)
        ids = people.get_ids(name)

        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; Please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))

            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():

        collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                       f"{collection_id}.xml")
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection",
                                            attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume",
                attrib={
                    "id": volume_id,
                    "ingest-date": args.ingest_date
                },
            )

            # Replace the existing volume node if present
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for author_or_editor in chain(
                            paper_node.findall("./author"),
                            paper_node.findall("./editor")):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={
                                "hash": compute_hash_from_file(book_dest_path)
                            },
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields
                    for child in paper_node:
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                for name_node in chain(paper_node.findall("./author"),
                                       paper_node.findall("./editor")):
                    disambiguate_name(name_node, paper_id_full)

            # Other data from the meta file (look up this volume's own entry,
            # since `meta` still holds whatever the copy loop last assigned)
            volume_meta = volumes[f"{collection_id}-{volume_id}"]
            if "isbn" in volume_meta and meta_node is not None:
                make_simple_element("isbn", volume_meta["isbn"],
                                    parent=meta_node)

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(collection_file,
                   encoding="UTF-8",
                   xml_declaration=True,
                   with_tail=True)
Example #24
def add_revision(anth_id,
                 pdf_path,
                 explanation,
                 change_type="revision",
                 dry_run=True,
                 date=None):
    """
    Takes an Anthology ID. It then adds a revision to the Anthology XML,
    updating and writing the XML file, and copies the PDFs into place.
    For PDFs, the revised PDF is saved to {anth_id}.pdf and {anth_id}v{version}.pdf.
    For the first revision, we first copy {anth_id}.pdf to {anth_id}v1.pdf.
    """
    if date is None:
        now = datetime.now()
        date = f"{now.year}-{now.month:02d}-{now.day:02d}"

    def maybe_copy(file_from, file_to):
        if not dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    # The new version
    revno = None

    change_letter = "e" if change_type == "erratum" else "v"

    checksum = compute_hash_from_file(pdf_path)

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
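    # For example, following the convention above, get_pdf_dir would be
    # expected to return ".../pdf/P/P19" for "P19-1001" but ".../pdf/2020.acl"
    # for "2020.acl-main.1".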
    output_dir = get_pdf_dir(anth_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{anth_id}.pdf")

    # Update XML
    xml_file = get_xml_file(anth_id)
    collection_id, volume_id, paper_id = deconstruct_anthology_id(anth_id)
    tree = ET.parse(xml_file)
    if paper_id == "0":
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if change_type == "erratum" else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not dry_run:
            # Update the URL hash on the <url> tag
            url = paper.find("./url")
            if url is not None:
                url.attrib["hash"] = checksum

            if change_type == "revision" and revno == 2:
                if paper.find("./url") is not None:
                    current_version_url = infer_url(
                        paper.find("./url").text) + ".pdf"

                # Download original file
                # There are no versioned files the first time around, so create the first one
                # (essentially backing up the original version)
                revised_file_v1_path = os.path.join(
                    output_dir, f"{anth_id}{change_letter}1.pdf")

                retrieve_url(current_version_url, revised_file_v1_path)
                validate_file_type(revised_file_v1_path)

                old_checksum = compute_hash_from_file(revised_file_v1_path)

                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{anth_id}{change_letter}1",
                        "hash": old_checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{anth_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)

    else:
        print(
            f"-> FATAL: paper ID {anth_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    revised_file_versioned_path = os.path.join(
        output_dir, f"{anth_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(pdf_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if change_type == "revision":
        maybe_copy(pdf_path, canonical_path)
Example #25
            # Find the insertion point among the other volumes
            insertion_point = 0
            for i, volume in enumerate(existing_tree.getroot()):
                if new_volume_id < int(volume.attrib["id"]):
                    break
                insertion_point = i + 1
            print(
                f"Inserting volume {new_volume_id} at collection position {insertion_point}"
            )
            existing_tree.getroot().insert(insertion_point, new_volume)
        else:
            # Append to existing volume (useful for TACL, which has a single volume each year) if requested
            if args.append:
                for paper in new_volume.findall("./paper"):
                    print(f'Appending {paper.attrib["id"]}')
                    paper.attrib["ingest-date"] = args.ingest_date
                    existing_volume.append(paper)
            else:
                print(
                    f"Skipping volume {new_volume_id}, which has already been inserted into {collection_file}.\n"
                    "You can append to this volume by passing `--append` (or `-a`) to this script.\n"
                )
                continue

    indent(existing_tree.getroot())
    existing_tree.write(collection_file,
                        encoding="UTF-8",
                        xml_declaration=True,
                        with_tail=True)
Example #26
def dump_collection(tree, collection_file):
    indent(tree.getroot())
    tree.write(collection_file,
               encoding="UTF-8",
               xml_declaration=True,
               with_tail=True)