Example #1
def maybe_copy(source_path, dest_path):
    """Copies the file if it's different from the target."""
    if not os.path.exists(dest_path) or compute_hash_from_file(
        source_path
    ) != compute_hash_from_file(dest_path):
        log(f"Copying {source_path} -> {dest_path}", args.dry_run)
        shutil.copyfile(source_path, dest_path)
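
Example #1 leans on two helpers that are not shown: log, which reports an action and whether it is a dry run, and compute_hash_from_file, which fingerprints a file's contents. A minimal sketch of plausible implementations, assuming a CRC32-style hex fingerprint and a stderr logger; the real helpers may differ:

import sys
from zlib import crc32


def compute_hash_from_file(path):
    # Assumption: a short hex fingerprint of the file contents.
    with open(path, "rb") as f:
        return f"{crc32(f.read()) & 0xFFFFFFFF:08x}"


def log(text, is_dry_run=False):
    # Assumption: dry-run mode only changes the log prefix; callers decide
    # whether to actually perform the action.
    prefix = "[DRY RUN] " if is_dry_run else ""
    print(prefix + text, file=sys.stderr)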
Example #2
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..",
                                     "data")
    venue_keys = [
        venue["slug"].lower()
        for _, venue in VenueIndex(srcdir=anthology_datadir).items()
    ]

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        venue_name = meta["abbrev"].lower()

        if venue_name not in venue_keys:
            unseen_venues.append(meta["abbrev"])

        meta["path"] = proceedings

        meta["collection_id"] = collection_id = (meta["year"] + "." +
                                                 meta["abbrev"].lower())
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print("Error: ")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Make sure all venues exist
    if len(unseen_venues) > 0:
        print("FATAL: The following venue(s) don't exist in venues.yaml")
        for venue in unseen_venues:
            print(f"- {venue}")
        print("Please create entries for them and re-ingest.")
        sys.exit(1)

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        # copy the book
        book_src_filename = meta["abbrev"] + "-" + year
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") +
                ".pdf")

            if not args.dry_run and not os.path.exists(book_dest_path):
                log(f"Copying {book_src_path} -> {book_dest_path}",
                    args.dry_run)
                shutil.copyfile(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # file names end in .{number}.pdf
            match = re.match(r".*\.(\d+)\.pdf", pdf_file)

            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir,
                    f"{collection_id}-{volume_name}.{paper_num}.pdf")
                if not args.dry_run and not os.path.exists(pdf_dest_path):
                    log(f"Copying {pdf_src_path} -> {pdf_dest_path}",
                        args.dry_run)
                    shutil.copyfile(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir,
                                                venue_name)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(
                    os.path.join(root_path, "additional")):
                attachment_file_path = os.path.join(root_path, "additional",
                                                    attachment_file)
                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$",
                    attachment_file)
                if match is None:
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}",
                        args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num][
                    "attachments"].append((dest_path, type_))

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def disambiguate_name(node, anth_id):
        name = PersonName.from_element(node)
        ids = people.get_ids(name)

        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; Please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))

            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        # Newly added volumes, so we can normalize and name-disambig later
        newly_added_volumes = []

        collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                       f"{collection_id}.xml")
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection",
                                            attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume",
                attrib={
                    "id": volume_id,
                    "ingest-date": args.ingest_date
                },
            )

            # Replace the existing one if present
            existing_volume_node = root_node.find(
                f"./volume[@id='{volume_id}']")
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for author_or_editor in chain(
                            paper_node.findall("./author"),
                            paper_node.findall("./editor")):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    # NB: book_dest_path and volume_name carry over from the
                    # PDF-copying loop above (their values from its last iteration)
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={
                                "hash": compute_hash_from_file(book_dest_path)
                            },
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields
                    for child in paper_node:
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                for name_node in chain(paper_node.findall("./author"),
                                       paper_node.findall("./editor")):
                    disambiguate_name(name_node, paper_id_full)

        # Other data from the meta file
        if "isbn" in meta:
            make_simple_element("isbn", meta["isbn"], parent=meta_node)

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(collection_file,
                   encoding="UTF-8",
                   xml_declaration=True,
                   with_tail=True)
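
Example #2 reads several attributes off args. A hypothetical argument parser consistent with those accesses (option names and defaults are inferred, not taken from the source):

import argparse
from datetime import datetime

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("proceedings", nargs="+",
                        help="proceedings directories, each containing a 'meta' file")
    parser.add_argument("--pdfs-dir", required=True)
    parser.add_argument("--attachments-dir", required=True)
    parser.add_argument("--anthology-dir", required=True)
    parser.add_argument("--ingest-date",
                        default=datetime.now().strftime("%Y-%m-%d"))
    parser.add_argument("--dry-run", action="store_true")
    main(parser.parse_args())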
Example #3
def main(args):
    year = args.year
    venue = args.venue
    volume_id = args.volume
    collection_id = f"{year}.{venue}"

    splitter = NameSplitter(anthology_dir=args.anthology_dir)

    collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                   f"{collection_id}.xml")
    if os.path.exists(collection_file):
        tree = etree.parse(collection_file)
    else:
        tree = etree.ElementTree(
            make_simple_element("collection", attrib={"id": collection_id}))

    now = datetime.now()
    today = f"{now.year}-{now.month:02d}-{now.day:02d}"

    volume_node = tree.getroot().find(f"./volume[@id='{volume_id}']")
    if volume_node is not None:
        tree.getroot().remove(volume_node)

    volume = make_simple_element("volume",
                                 attrib={
                                     "id": volume_id,
                                     "ingest-date": today
                                 },
                                 parent=tree.getroot())

    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    # Create entries for all the papers; paperid counts real papers so that
    # IDs stay sequential whether or not the first row is front matter
    paperid = 0
    for rownum, row in enumerate(
            csv.DictReader(args.tsv_file, delimiter=args.delimiter)):
        pages = row.get("pages", None)

        if rownum == 0:
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row["booktitle"], parent=meta)
            make_simple_element("publisher", row["publisher"], parent=meta)
            make_simple_element("address", row["address"], parent=meta)
            make_simple_element("month", row["month"], parent=meta)
            make_simple_element("year", year, parent=meta)

            editors = row["author"].split(" and ")
            row["author"] = ""
            for editor_name in editors:
                editor = make_simple_element("editor", parent=meta)
                surname, givenname = splitter.best_split(editor_name)
                make_simple_element("first", givenname, parent=editor)
                make_simple_element("last", surname, parent=editor)

            # volume PDF
            proceedings_pdf = args.proceedings_pdf
            if proceedings_pdf is not None:
                volume_anth_id = f"{collection_id}-{volume_id}"
                pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                              f"{volume_anth_id}.pdf")
                retrieve_url(proceedings_pdf, pdf_local_path)
                checksum = compute_hash_from_file(pdf_local_path)
                make_simple_element("url",
                                    volume_anth_id,
                                    attrib={"hash": checksum},
                                    parent=meta)
                proceedings_pdf = pdf_local_path

        title_text = row["title"]

        # The first row might be front matter (needs a special name)
        if rownum == 0 and title_text.lower() in [
                "frontmatter", "front matter"
        ]:
            paper = make_simple_element("frontmatter", parent=volume)
        else:
            paperid += 1

            paper = make_simple_element("paper",
                                        attrib={"id": str(paperid)},
                                        parent=volume)
            # Only make the title for not-the-frontmatter
            make_simple_element("title", title_text, parent=paper)

        author_list = row["author"].split(" and ")

        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            surname, givenname = splitter.best_split(author_name)
            make_simple_element("first", givenname, parent=author)
            make_simple_element("last", surname, parent=author)

        if pages is not None and pages != "":
            make_simple_element("pages", pages, parent=paper)

        # Find the PDF, either listed directly, or extracted from the proceedings PDF
        anth_id = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                      f"{anth_id}.pdf")
        url = None
        if "pdf" in row and row["pdf"] != "":
            if retrieve_url(row["pdf"], pdf_local_path):
                url = anth_id
            else:
                print("Can't find", row["pdf"])

        elif "pages in pdf" in row:
            pdf_pages = row["pages"]
            extract_pages(proceedings_pdf, pdf_pages, pdf_local_path)
            url = anth_id

        if url is not None:
            checksum = compute_hash_from_file(pdf_local_path)

            make_simple_element("url",
                                url,
                                attrib={"hash": checksum},
                                parent=paper)

        if "abstract" in row and row["abstract"] != "":
            make_simple_element("abstract", row["abstract"], parent=paper)

        if "presentation" in row:
            url = row["presentation"]
            if url is not None and url != "" and url != "None":
                extension = row["presentation"].split(".")[-1]
                name = f"{anth_id}.Presentation.{extension}"
                local_path = os.path.join(
                    args.anthology_files_path,
                    "..",
                    "attachments",
                    venue,
                    name,
                )
                if retrieve_url(row["presentation"], local_path):
                    make_simple_element(
                        "attachment",
                        name,
                        attrib={
                            "type": "presentation",
                            "hash": compute_hash_from_file(local_path),
                        },
                        parent=paper,
                    )

        # Normalize
        for node in paper:
            normalize(node, informat="latex")

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    tree.write(collection_file,
               encoding="UTF-8",
               xml_declaration=True,
               with_tail=True)
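
The DictReader loop in Example #3 implies a specific set of TSV columns. A tiny self-contained input that would exercise the main branches (column names are taken from the row[...] accesses; the values are invented):

import csv
import io

sample = io.StringIO(
    "booktitle\tpublisher\taddress\tmonth\ttitle\tauthor\tpages\tpdf\tabstract\n"
    "Proc. of X\tACL\tOnline\tMay\tFront Matter\tJane Doe and Rex Roe\t\t\t\n"
    "Proc. of X\tACL\tOnline\tMay\tA Paper\tJane Doe\t1-10\thttp://example.com/1.pdf\tAn abstract.\n"
)
for row in csv.DictReader(sample, delimiter="\t"):
    print(row["title"], "|", row["author"])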
Example #4
        # Check if the paper is already present in the volume
        doi_text = papernode.find("./doi").text
        doi_node = collection.xpath(f'.//doi[text()="{doi_text}"]')
        if len(doi_node):
            logging.info(
                f"Skipping existing paper {anth_id}/{doi_text} with title {papernode.find('title').text}"
            )
            continue

        papernode.attrib["id"] = f"{paper_id}"

        destination = pdf_destination / f"{anth_id}.pdf"
        print(f"Copying {pdf_path} to {destination}")
        shutil.copyfile(pdf_path, destination)
        checksum = compute_hash_from_file(pdf_path)

        url_text = anth_id
        url = etree.Element("url")
        url.attrib["hash"] = checksum
        url.text = url_text
        papernode.append(url)

        # Normalize
        for oldnode in papernode:
            normalize(oldnode, informat="latex")
        volume.append(papernode)

        paper_id += 1

    indent(collection)  # from anthology.utils
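
The duplicate check in Example #4 relies on lxml's xpath() returning a list of matching nodes, so an empty result is falsy. A standalone illustration of the idiom:

from lxml import etree

collection = etree.fromstring(
    "<collection><volume><paper>"
    "<doi>10.1000/xyz123</doi>"
    "</paper></volume></collection>"
)
doi_text = "10.1000/xyz123"
doi_node = collection.xpath(f'.//doi[text()="{doi_text}"]')
print(len(doi_node))  # 1 -> this paper would be skipped as already present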
Example #5
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..",
                                     "data")
    venue_index = VenueIndex(srcdir=anthology_datadir)
    venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()]

    sig_index = SIGIndex(srcdir=anthology_datadir)

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        venue_abbrev = meta["abbrev"]
        venue_slug = venue_index.get_slug(venue_abbrev)

        if str(datetime.now().year) in venue_abbrev:
            print(
                f"Fatal: Venue assembler put year in acronym: '{venue_abbrev}'"
            )
            sys.exit(1)

        if venue_slug not in venue_keys:
            unseen_venues.append((venue_slug, venue_abbrev, meta["title"]))

        meta["path"] = proceedings

        meta["collection_id"] = collection_id = meta["year"] + "." + venue_slug
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print("Error: ")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

        if "sig" in meta:
            print(
                f"Add this line to {anthology_datadir}/sigs/{meta['sig'].lower()}.yaml:"
            )
            print(f"  - {meta['year']}:")
            print(f"    - {volume_full_id} # {meta['booktitle']}")

    # Make sure all venues exist
    if len(unseen_venues) > 0:
        for venue in unseen_venues:
            slug, abbrev, title = venue
            print(f"Creating venue '{abbrev}' ({title})")
            venue_index.add_venue(abbrev, title)
        venue_index.dump(directory=anthology_datadir)

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        # copy the book
        book_src_filename = f'{year}.{meta["abbrev"]}-{volume_name}.pdf'
        book_src_path = os.path.join(root_path, book_src_filename)
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") +
                ".pdf")

            if not args.dry_run:
                maybe_copy(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # Skip . files
            if os.path.basename(pdf_file).startswith("."):
                continue

            # file names end in .{number}.pdf
            match = re.match(r".*\.(\d+)\.pdf", pdf_file)

            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir,
                    f"{collection_id}-{volume_name}.{paper_num}.pdf")
                if not args.dry_run:
                    maybe_copy(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir,
                                                venue_name)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(
                    os.path.join(root_path, "additional")):
                if os.path.basename(attachment_file).startswith("."):
                    continue
                attachment_file_path = os.path.join(root_path, "additional",
                                                    attachment_file)
                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$",
                    attachment_file)
                if match is None:
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}",
                        args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num][
                    "attachments"].append((dest_path, type_))

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def correct_caps(person, name_node, anth_id):
        """
        Many people submit their names in "ALL CAPS" or "all lowercase".
        Correct this with heuristics.
        """
        name = name_node.text
        if name.islower() or name.isupper():
            # capitalize all parts
            corrected = " ".join(
                list(map(lambda x: x.capitalize(), name.split())))
            print(
                f"-> Correcting capitalization of '{name}' to '{corrected}'",
                file=sys.stderr,
            )
            name_node.text = corrected

    def disambiguate_name(node, anth_id):
        name = PersonName.from_element(node)
        ids = people.get_ids(name)

        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; Please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))

            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        # Newly added volumes, so we can normalize and name-disambig later
        newly_added_volumes = []

        collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                       f"{collection_id}.xml")
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection",
                                            attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume",
                attrib={
                    "id": volume_id,
                    "ingest-date": args.ingest_date
                },
            )

            # Replace the existing one if present
            existing_volume_node = root_node.find(
                f"./volume[@id='{volume_id}']")
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for author_or_editor in chain(
                            paper_node.findall("./author"),
                            paper_node.findall("./editor")):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    # NB: book_dest_path and volume_name carry over from the
                    # PDF-copying loop above (their values from its last iteration)
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={
                                "hash": compute_hash_from_file(book_dest_path)
                            },
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields
                    for child in paper_node:
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                # Adjust the language tag
                language_node = paper_node.find("./language")
                if language_node is not None:
                    try:
                        lang = iso639.languages.get(name=language_node.text)
                    except KeyError:
                        raise Exception(
                            f"Can't find language '{language_node.text}'")
                    language_node.text = lang.part3
                    print(f"Set <language> to {language_node.text}", file=sys.stderr)

                # Fix author names
                for name_node in chain(paper_node.findall("./author"),
                                       paper_node.findall("./editor")):
                    disambiguate_name(name_node, paper_id_full)
                    person = PersonName.from_element(name_node)
                    for name_part in name_node:
                        correct_caps(person, name_part, paper_id_full)

        # Other data from the meta file
        if "isbn" in meta:
            make_simple_element("isbn", meta["isbn"], parent=meta_node)

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(collection_file,
                   encoding="UTF-8",
                   xml_declaration=True,
                   with_tail=True)
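
The language handling in Example #5 uses the iso-639 package to map an English language name to its ISO 639-3 code, which is the value written into the <language> element. For instance:

import iso639

# languages.get raises KeyError for unknown names (hence the try/except above)
lang = iso639.languages.get(name="French")
print(lang.part3)  # -> "fra"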
Example #6
def add_revision(anth_id,
                 pdf_path,
                 explanation,
                 change_type="revision",
                 dry_run=True,
                 date=None):
    """
    Takes an Anthology ID. It then adds a revision to the Anthology XML,
    updating and writing the XML file, and copies the PDFs into place.
    For PDFs, the revised PDF is saved to {anth_id}.pdf and {anth_id}v{version}.pdf.
    For the first revision, we first copy {anth_id}.pdf to {anth_id}v1.pdf.
    """
    if date is None:
        now = datetime.now()
        date = f"{now.year}-{now.month:02d}-{now.day:02d}"

    def maybe_copy(file_from, file_to):
        if not dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    # The new version
    revno = None

    change_letter = "e" if change_type == "erratum" else "v"

    checksum = compute_hash_from_file(pdf_path)

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
    output_dir = get_pdf_dir(anth_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{anth_id}.pdf")

    # Update XML
    xml_file = get_xml_file(anth_id)
    collection_id, volume_id, paper_id = deconstruct_anthology_id(anth_id)
    tree = ET.parse(xml_file)
    if paper_id == "0":
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if change_type == "erratum" else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not dry_run:
            # Update the URL hash on the <url> tag
            url = paper.find("./url")
            if url is not None:
                url.attrib["hash"] = checksum

            if change_type == "revision" and revno == 2:
                if paper.find("./url") is not None:
                    current_version_url = infer_url(
                        paper.find("./url").text) + ".pdf"

                # Download original file
                # There are no versioned files the first time around, so create the first one
                # (essentially backing up the original version)
                revised_file_v1_path = os.path.join(
                    output_dir, f"{anth_id}{change_letter}1.pdf")

                retrieve_url(current_version_url, revised_file_v1_path)
                validate_file_type(revised_file_v1_path)

                old_checksum = compute_hash_from_file(revised_file_v1_path)

                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{anth_id}{change_letter}1",
                        "hash": old_checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{anth_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)

    else:
        print(
            f"-> FATAL: paper ID {anth_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    revised_file_versioned_path = os.path.join(
        output_dir, f"{anth_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(pdf_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if change_type == "revision":
        maybe_copy(pdf_path, canonical_path)
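
add_revision depends on deconstruct_anthology_id and the two ID schemes its comments describe. A simplified sketch of the mapping assumed here; the real helper in anthology.utils handles more edge cases (e.g. two-digit workshop volume numbers):

def deconstruct_anthology_id(anth_id):
    if "." in anth_id.split("-")[0]:
        # new-style: "2020.acl-main.1" -> ("2020.acl", "main", "1")
        collection_id, rest = anth_id.split("-", 1)
        volume_id, paper_id = rest.rsplit(".", 1)
        return collection_id, volume_id, paper_id
    # old-style: "P19-1001" -> ("P19", "1", "001")
    collection_id, rest = anth_id.split("-")
    return collection_id, rest[0], rest[1:]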
Example #7
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    # Build list of volumes, confirm uniqueness
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))
        meta["path"] = proceedings

        meta["collection_id"] = collection_id = (
            meta["year"] + "." + meta["abbrev"].lower()
        )
        volume_name = meta["volume"]
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print("Error: ")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        print(f"VOLUME: {volume}")

        # copy the book
        book_src_filename = meta["abbrev"] + "-" + meta["year"]
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf"
            )

            log(f"Copying {book_src_path} -> {book_dest_path}", args.dry_run)
            if not args.dry_run:
                shutil.copyfile(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # file names end in .{number}.pdf
            match = re.match(r".*\.(\d+)\.pdf", pdf_file)

            if match is None:
                print(f"Warning: no paper number found in {pdf_file}",
                      file=sys.stderr)
            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf"
                )
                log(
                    f"Copying [{paper_id_full}] from {pdf_src_path} -> {pdf_dest_path}",
                    args.dry_run,
                )
                if not args.dry_run:
                    shutil.copyfile(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir, collection_id)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(os.path.join(root_path, "additional")):
                match = re.match(rf"{abbrev}(\d+)_(\w+)\.(\w+)")
                if match is not None:
                    paper_num, type_, ext = match.groups()
                    paper_num = int(paper_num)

                    file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                    dest_path = os.path.join(attachments_dest_dir, file_name)
                    log(f"Copying {attachment_file} -> {dest_path}", args.dry_run)
                    if not args.dry_run:
                        shutil.copyfile(attachment_file_path, dest_path)

                    collections[collection_id][volume_name][paper_num][
                        "attachments"
                    ].append((dest_path, type_))

    people = AnthologyIndex(
        None, srcdir=os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
    )

    for collection_id, collection in collections.items():
        collection_file = os.path.join(
            args.anthology_dir, "data", "xml", f"{collection_id}.xml"
        )
        root_node = make_simple_element("collection", attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume", attrib={"id": volume_id}, parent=root_node
            )
            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for editor in paper_node.findall("editor"):
                        meta_node.append(editor)
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={"hash": compute_hash_from_file(book_dest_path)},
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields
                    for child in paper_node:
                        if child.tag in [
                            "editor",
                            "address",
                            "booktitle",
                            "publisher",
                            "year",
                            "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for attachment in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=attachment.path,
                        attrib={
                            "type": attachment.type,
                            "hash": compute_hash_from_file(attachment.path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

        # Other data from the meta file
        if "isbn" in meta:
            make_simple_element("isbn", meta["isbn"], parent=meta_node)

        # Normalize
        for paper in root_node.findall(".//paper"):
            for oldnode in paper:
                normalize(oldnode, informat="latex")

            # Ensure names are properly identified
            ambiguous = {}
            anth_id = build_anthology_id(
                collection_id, paper.getparent().attrib["id"], paper.attrib["id"]
            )

            for node in chain(paper.findall("author"), paper.findall("editor")):
                name = PersonName.from_element(node)
                ids = people.get_ids(name)
                if len(ids) > 1:
                    print(
                        f"WARNING ({anth_id}): ambiguous author {name}, defaulting to first of {ids}"
                    )
                    ambiguous[anth_id] = (name, ids)

                    node.attrib["id"] = ids[0]

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(
            collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True
        )
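
Nearly every example builds XML through make_simple_element. A minimal lxml-based sketch consistent with the call sites above (text, attrib, parent keywords); the real anthology.utils version does more:

from lxml import etree

def make_simple_element(tag, text=None, attrib=None, parent=None):
    """Create an element, optionally set its text and attributes, and
    append it to a parent node."""
    el = (etree.SubElement(parent, tag)
          if parent is not None else etree.Element(tag))
    el.text = text
    for key, value in (attrib or {}).items():
        el.attrib[key] = value
    return el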
Example #8
    def download_file(self, fname: str, hash: str, type: str) -> None:
        if type == "pdf":
            remote_url = self.source + "/" + fname
        elif type == "attachment":
            remote_url = self.source + "/attachments/" + fname
        else:
            log.error("unrecognized type: " + type)
            exit(1)
        local_target = ""
        # mkstemp opens the file but we only need the file name, so close it again.
        tmpfd, tmp_target = tempfile.mkstemp(prefix="aclmirrorer_", suffix=".pdf")
        os.close(tmpfd)
        match = NEW_ID_RE.match(fname)
        if match:
            local_target = os.path.join(self.to, type, match.groups()[1], fname)
        else:
            match = OLD_ID_RE.match(fname)
            if match:
                local_target = os.path.join(
                    self.to,
                    type,
                    match.groups()[0],
                    match.groups()[0] + match.groups()[1],
                    fname,
                )
            else:
                log.error("unrecognized format for " + fname)
                exit(1)

        if os.path.exists(local_target):
            existing_hash = anthology_utils.compute_hash_from_file(local_target)
            if existing_hash == hash:
                log.debug(
                    "File {} already up to date, not downloading again".format(
                        local_target
                    )
                )
                return
            else:
                log.debug("File {} changed, redownloading ...".format(local_target))
        else:
            log.debug("Downloading {} from {} ...".format(local_target, remote_url))

        if self.is_dry_run:
            return

        local_path = os.path.dirname(local_target)
        os.makedirs(local_path, exist_ok=True)
        try:
            urlretrieve(remote_url, tmp_target)
        except Exception:
            log.error("could not download " + remote_url)
            self.not_downloadable.append(remote_url)
            os.remove(tmp_target)
            return
        new_hash = anthology_utils.compute_hash_from_file(tmp_target)
        if new_hash == hash:
            # all good, store downloaded file in the proper place
            shutil.move(tmp_target, local_target)
        else:
            log.error(
                "Hash mismatch for file {}, downloaded from {}. was {} should be {}".format(
                    local_target, remote_url, new_hash, hash
                )
            )
            self.hash_mismatches.append(remote_url)
        return
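
download_file depends on two module-level patterns, NEW_ID_RE and OLD_ID_RE, that are not shown. Sketches consistent with how the match groups are used to build target directories; the exact patterns are assumptions:

import re

# "2020.acl-main.1.pdf": groups()[1] == "2020.acl" becomes the directory
NEW_ID_RE = re.compile(r"((\d{4}\.[a-z\d]+)-\w+\.\d+)")
# "P19-1001.pdf": stored under "P/P19/" via groups()[0] and groups()[1]
OLD_ID_RE = re.compile(r"([A-Z])(\d{2})-(\d+)")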
Example #9
def main(args):
    year, venue, _ = os.path.basename(args.tsv_file.name).split(".")

    # Set the volume name from the collection file, or default to 1
    # The file name is either "2012.eamt.tsv" or "2012.eamt-main.tsv".
    # The default volume name is "1".
    if "-" in venue:
        venue, volume_id = venue.split("-")
    else:
        volume_id = "1"

    collection_id = f"{year}.{venue}"

    tree = etree.ElementTree(
        make_simple_element("collection", attrib={"id": collection_id}))

    now = datetime.now()
    today = f"{now.year}-{now.month:02d}-{now.day:02d}"

    volume = make_simple_element("volume",
                                 attrib={
                                     "id": volume_id,
                                     "ingest-date": today
                                 })
    tree.getroot().insert(0, volume)

    # Location of entire-proceedings PDF
    proceedings_pdf = args.proceedings

    # Create the metadata for the paper
    meta = None
    for row in csv.DictReader(args.meta_file, delimiter=args.delimiter):
        current_collection_id = f"{row['Year']}.{row['Conference code']}"
        if current_collection_id == collection_id:
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle",
                                row["Conference title"],
                                parent=meta)
            make_simple_element("publisher", row["Publisher"], parent=meta)
            make_simple_element("address", row["Location"], parent=meta)
            make_simple_element("month", row["Dates held"], parent=meta)
            make_simple_element("year", row["Year"], parent=meta)

            url = row["URL"]

            if url.endswith(".pdf"):
                if proceedings_pdf:
                    print(
                        "Overriding --proceedings with proceedings PDF found in conference list",
                        file=sys.stderr,
                    )
                proceedings_pdf = url

            elif "Complete PDF" in row and row["Complete PDF"] != "":
                proceedings_pdf = row["Complete PDF"]

            # volume PDF
            if proceedings_pdf is not None:
                volume_anth_id = f"{collection_id}-{volume_id}"
                pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                              f"{volume_anth_id}.pdf")
                retrieve_url(proceedings_pdf, pdf_local_path)
                checksum = compute_hash_from_file(pdf_local_path)
                make_simple_element("url",
                                    volume_anth_id,
                                    attrib={"hash": checksum},
                                    parent=meta)
                proceedings_pdf = pdf_local_path

            if row["Editors"] != "" and "?" not in row["Editors"]:
                editors = row["Editors"].split(" and ")
                for editor_name in editors:
                    editor = make_simple_element("editor", parent=meta)
                    if ", " in editor_name:
                        last, first = editor_name.split(", ")
                    else:
                        first, last = (
                            ' '.join(editor_name.split()[:-1]),
                            editor_name.split()[-1],
                        )
                    make_simple_element("first", first, parent=editor)
                    make_simple_element("last", last, parent=editor)
            break
    else:
        print(
            f"Couldn't find conference code {collection_id} in 'Conference code' field of metadata file {args.meta_file.name}",
            file=sys.stderr,
        )
        sys.exit(1)

    paperid = 0
    # Create entries for all the papers
    for row in csv.DictReader(args.tsv_file, delimiter=args.delimiter):
        pages = row.get("Pagenumbers", None)

        title_text = row["Title"]

        # The first row might be front matter (needs a special name)
        if title_text == "Frontmatter" and paperid == 0:
            paper = make_simple_element("frontmatter", parent=volume)

        else:
            paperid += 1
            paper = make_simple_element("paper",
                                        attrib={"id": str(paperid)},
                                        parent=volume)
            # Only make the title for not-the-frontmatter
            make_simple_element("title", title_text, parent=paper)

            author_list = row["Authors"].split(" and ")
            for author_name in author_list:
                if author_name == "":
                    continue
                author = make_simple_element("author", parent=paper)
                if ", " in author_name:
                    last, first = author_name.split(", ")
                else:
                    first, last = (
                        ' '.join(author_name.split()[:-1]),
                        author_name.split()[-1],
                    )
                make_simple_element("first", first, parent=author)
                make_simple_element("last", last, parent=author)

        if pages is not None and pages != "":
            make_simple_element("pages", pages, parent=paper)

        # Find the PDF, either listed directly, or extracted from the proceedings PDF
        anth_id = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                      f"{anth_id}.pdf")
        url = None
        if "Pdf" in row and row["Pdf"] != "":
            if retrieve_url(row["Pdf"], pdf_local_path):
                url = anth_id

        elif "pages in pdf" in row:
            pdf_pages = row["pages in pdf"]
            extract_pages(proceedings_pdf, pdf_pages, pdf_local_path)
            url = anth_id

        if url is not None:
            checksum = compute_hash_from_file(pdf_local_path)

            make_simple_element("url",
                                url,
                                attrib={"hash": checksum},
                                parent=paper)

        if "Abstract" in row:
            make_simple_element("abstract", row["Abstract"], parent=paper)

        if "Presentation" in row:
            url = row["Presentation"]
            if url is not None and url != "" and url != "None":
                extension = row["Presentation"].split(".")[-1]
                name = f"{anth_id}.Presentation.{extension}"
                local_path = os.path.join(
                    args.anthology_files_path,
                    "..",
                    "attachments",
                    venue,
                    name,
                )
                if retrieve_url(row["Presentation"], local_path):
                    checksum = compute_hash_from_file(local_path)
                    make_simple_element(
                        "attachment",
                        name,
                        attrib={
                            "type": "presentation",
                            "hash": checksum
                        },
                        parent=paper,
                    )

        # Normalize
        for node in paper:
            normalize(node, informat="latex")

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    collection_file = os.path.join(args.anthology, "data", "xml",
                                   f"{collection_id}.xml")
    tree.write(collection_file,
               encoding="UTF-8",
               xml_declaration=True,
               with_tail=True)
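
extract_pages pulls a page range such as "5-12" out of the complete proceedings PDF. A hypothetical sketch using pypdf; the real helper and its page-range syntax may differ:

from pypdf import PdfReader, PdfWriter

def extract_pages(proceedings_pdf, pdf_pages, dest_path):
    # Assumption: pdf_pages is a 1-based inclusive range like "5-12".
    first, last = (int(p) for p in pdf_pages.split("-"))
    reader = PdfReader(proceedings_pdf)
    writer = PdfWriter()
    for page in reader.pages[first - 1:last]:
        writer.add_page(page)
    with open(dest_path, "wb") as f:
        writer.write(f)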
Example #10
def main(args):
    anth = anthology.Anthology(importdir=os.path.join(args.anthology, "data"))
    splitter = NameSplitter(anth)

    paper_nums = {}
    venue = "lilt"
    prev_year = None
    prev_volume = None
    for row in csv.DictReader(args.tsv_file, delimiter='\t'):
        year = row.get("year")
        month = row.get("month")
        issue = row.get("issue#", "")
        abstract = row.get("abstract")
        collection_id = f"{year}.lilt"
        if year != prev_year:
            if prev_year is not None:
                dump_collection(
                    tree,
                    os.path.join(args.anthology, "data", "xml",
                                 f"{prev_year}.lilt.xml"),
                )

            tree = etree.ElementTree(
                make_simple_element("collection", attrib={"id":
                                                          collection_id}))
            root = tree.getroot()
        prev_year = year

        volume_name = row.get("Volume#")
        if volume_name != prev_volume:
            volume = make_simple_element("volume",
                                         attrib={"id": volume_name},
                                         parent=root)
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row.get("Booktitle"), parent=meta)
            make_simple_element("publisher", "CSLI Publications", parent=meta)
            make_simple_element("year", year, parent=meta)
            if month:
                make_simple_element("month", month, parent=meta)

        paper_num = paper_nums[volume_name] = paper_nums.get(volume_name, 0) + 1

        prev_volume = volume_name

        paper = make_simple_element("paper",
                                    attrib={"id": str(paper_num)},
                                    parent=volume)
        paper_id = f"{collection_id}-{volume_name}.{paper_num}"
        make_simple_element("title", row.get("title"), parent=paper)
        authors = row.get("authors")
        for author_name in authors.split(" and "):
            author = make_simple_element("author", parent=paper)
            surname, givenname = splitter.best_split(author_name)
            make_simple_element("first", givenname, parent=author)
            make_simple_element("last", surname, parent=author)

        if abstract != "":
            make_simple_element("abstract", abstract, parent=paper)
        if issue != "":
            make_simple_element("issue", issue, parent=paper)

        for node in paper:
            normalize(node, "latex")

        dest_dir = f"{args.anthology_files_path}/lilt"
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)

        source_path = os.path.join(
            "pdf",
            row.get("PDF").replace("\\", "/").replace("../", ""))
        if os.path.exists(source_path):
            dest_path = os.path.join(
                dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf")
            print(f"Copying {source_path} to {dest_path}", file=sys.stderr)
            shutil.copy(source_path, dest_path)
            os.chmod(dest_path, 0o644)
            checksum = compute_hash_from_file(dest_path)
            make_simple_element("url",
                                paper_id,
                                attrib={"hash": checksum},
                                parent=paper)

    dump_collection(
        tree,
        os.path.join(args.anthology, "data", "xml", f"{collection_id}.xml"))