Example #1
0
    def disambiguate_name(node):
        """Interactively resolve an ambiguous author/editor name.

        Looks up all known person IDs for the name carried by *node*; if
        more than one matches, lists the candidates and prompts the user
        to choose, then records the chosen ID in the node's ``id``
        attribute. Nodes with an unambiguous name are left untouched.
        """
        name = PersonName.from_element(node)
        ids = people.get_ids(name)

        if node.tag == "editor":
            # meta block, get volume
            anth_id = build_anthology_id(
                collection_id,
                node.getparent().getparent().attrib["id"])
        elif node.tag == "author":
            # paper, get full ID
            anth_id = build_anthology_id(
                collection_id,
                node.getparent().getparent().attrib["id"],
                node.getparent().attrib["id"],
            )
        else:
            # fix: anth_id was left unbound for any other tag, producing a
            # confusing NameError below; fail fast with a clear message
            raise ValueError(f"Unexpected tag: {node.tag}")

        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; Please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                try:
                    choice = int(input("--> "))
                except ValueError:
                    # fix: non-numeric input used to crash the whole run;
                    # re-prompt instead
                    continue

            node.attrib["id"] = ids[choice]
Example #2
0
def add_doi(xml_node, collection_id, volume_id, force=False):
    """Add a <doi> child element to *xml_node* once the DOI resolves.

    Computes the anthology ID (frontmatter nodes, which have no ``id``
    attribute, get paper ID 0), verifies the DOI URL answers with HTTP
    200 (retrying up to 3 times, honoring Retry-After on 429, giving up
    on 404), and appends the <doi> element.

    :param xml_node: paper or frontmatter XML element to annotate
    :param collection_id: collection the paper belongs to
    :param volume_id: volume the paper belongs to
    :param force: overwrite an existing <doi> element instead of refusing
    :return: True if a <doi> element was added, False otherwise
    """
    if 'id' in xml_node.attrib:
        # normal paper
        paper_id = int(xml_node.attrib['id'])
    else:
        # frontmatter
        paper_id = 0

    anth_id = build_anthology_id(collection_id, volume_id, paper_id)
    new_doi_text = f'{data.DOI_PREFIX}{anth_id}'

    doi = xml_node.find('doi')
    if doi is not None:
        if not force:
            # fix: the force flag was accepted but never honored, despite
            # the message telling the user to pass --force
            print(f'-> [{anth_id}] Cowardly refusing to overwrite existing DOI {doi.text} (use --force)', file=sys.stderr)
            return False
        # --force: drop the stale element so we don't end up with two <doi>s
        xml_node.remove(doi)

    doi_url = f'{data.DOI_URL_PREFIX}{data.DOI_PREFIX}{anth_id}'
    for tries in [1, 2, 3]:  # lots of random failures
        result = test_url_code(doi_url)
        if result.status_code == 200:
            doi = make_simple_element('doi', text=new_doi_text)
            print(f'-> Adding DOI {new_doi_text}', file=sys.stderr)
            xml_node.append(doi)
            return True
        elif result.status_code == 429:  # too many requests
            pause_for = int(result.headers['Retry-After'])
            print(f'--> Got 429, pausing for {pause_for} seconds', file=sys.stderr)
            sleep(pause_for + 1)
        elif result.status_code == 404:  # not found
            break

    print(f"-> Couldn't add DOI {doi_url}", file=sys.stderr)
    return False
Example #3
0
def add_doi(xml_node, collection_id, volume_id, force=False):
    """Add a <doi> child element to *xml_node* if the DOI URL resolves.

    Computes the anthology ID (frontmatter nodes, which have no ``id``
    attribute, get paper ID 0), checks the DOI URL exists, and appends
    the <doi> element.

    :param xml_node: paper or frontmatter XML element to annotate
    :param collection_id: collection the paper belongs to
    :param volume_id: volume the paper belongs to
    :param force: overwrite an existing <doi> element instead of refusing
    :return: True if a <doi> element was added, False otherwise
    """
    if 'id' in xml_node.attrib:
        # normal paper
        paper_id = int(xml_node.attrib['id'])
    else:
        # frontmatter
        paper_id = 0

    anth_id = build_anthology_id(collection_id, volume_id, paper_id)
    new_doi_text = f'{data.DOI_PREFIX}{anth_id}'
    doi_url = f'{data.DOI_URL_PREFIX}{data.DOI_PREFIX}{anth_id}'
    if not test_url(doi_url):
        print(f"-> [{anth_id}] Skipping since DOI {doi_url} doesn't exist")
        return False

    doi = xml_node.find('doi')
    if doi is not None:
        if not force:
            # fix: the force flag was accepted but never honored, despite
            # the message telling the user to pass --force
            print(
                f'-> [{anth_id}] Cowardly refusing to overwrite existing DOI {doi.text} (use --force)',
                file=sys.stderr)
            return False
        # --force: remove the stale element before adding the new one
        xml_node.remove(doi)

    doi = make_simple_element('doi', text=new_doi_text)
    print(f'Adding DOI {new_doi_text}', file=sys.stderr)
    xml_node.append(doi)
    return True
Example #4
0
                            srcdir=os.path.join(os.path.dirname(sys.argv[0]),
                                                "..", "data"))

    # Parse the XML file being ingested; the PDFs are expected alongside it.
    pdf_directory = os.path.dirname(args.infile)
    tree_being_added = etree.parse(args.infile)

    # Ensure nested format
    root_being_added = make_nested(tree_being_added.getroot(),
                                   pdf_path=os.path.dirname(args.infile))
    collection_id = root_being_added.attrib["id"]

    # Ensure names are properly identified
    # When a name maps to multiple known person IDs, default to the first
    # and record the case in `ambiguous` (presumably for later review —
    # the consumer of this dict is outside this view; confirm upstream).
    ambiguous = {}
    for paper in root_being_added.findall(".//paper"):
        # Full anthology ID: collection + parent volume ID + paper ID.
        anth_id = build_anthology_id(collection_id,
                                     paper.getparent().attrib["id"],
                                     paper.attrib["id"])

        for node in chain(paper.findall("author"), paper.findall("editor")):
            name = PersonName.from_element(node)
            ids = people.get_ids(name)
            if len(ids) > 1:
                print(
                    f"WARNING ({anth_id}): ambiguous author {name}, defaulting to first of {ids}"
                )
                ambiguous[anth_id] = (name, ids)

                node.attrib["id"] = ids[0]

        # Ensure PDF exists. PDF should be in the same directory as the XML file being ingested.
        pdf_path = os.path.join(pdf_directory, f"{anth_id}.pdf")
Example #5
0
def main(args):
    """Ingest proceedings volumes into the Anthology.

    For each directory in ``args.proceedings``: reads its ``meta`` file,
    copies the volume book PDF, the paper PDFs, and any attachments into
    place, then builds one ``<collection>`` XML file per collection from
    the per-paper .bib files and writes it under ``args.anthology_dir``.
    """
    collections = defaultdict(OrderedDict)
    volumes = {}

    # Build list of volumes, confirm uniqueness
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))
        meta["path"] = proceedings

        meta["collection_id"] = collection_id = (meta["year"] + "." +
                                                 meta["abbrev"].lower())
        volume_name = meta["volume_name"]
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            # fix: the original printed a truncated "Error: " message and
            # then silently overwrote the earlier volume; abort instead
            print(f"Error: duplicate volume ID {volume_full_id}",
                  file=sys.stderr)
            sys.exit(1)

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume_name"]
        # fix: hoisted out of the PDF loop below; it is also needed by the
        # attachments loop and was unbound when the pdf dir was empty
        abbrev = meta["abbrev"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        print(f"VOLUME: {volume}")

        # copy the book (full-proceedings PDF), if present
        book_src_filename = meta["abbrev"] + "-" + meta["year"]
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") +
                ".pdf")

            log(f"Copying {book_src_path} -> {book_dest_path}", args.dry_run)
            if not args.dry_run:
                shutil.copyfile(book_src_path, book_dest_path)
        # fix: remember the book path per volume; the XML-writing loop below
        # used to read the loop variable leaked from the *last* iteration
        meta["book_dest_path"] = book_dest_path

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # names are {abbrev}{number}.pdf
            match = re.match(rf"{abbrev}(\d+)\.pdf", pdf_file)

            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                # fix: the old replace("/pdf", "/bib/") was a no-op on a
                # bare file name; only the extension needs swapping
                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir,
                    f"{collection_id}-{volume_name}.{paper_num}.pdf")
                log(
                    f"Copying [{paper_id_full}] {pdf_src_path} -> {pdf_dest_path}",
                    args.dry_run,
                )
                if not args.dry_run:
                    shutil.copyfile(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        attachments_src_dir = os.path.join(root_path, "additional")
        if os.path.exists(attachments_src_dir):
            attachments_dest_dir = os.path.join(args.attachments_dir,
                                                collection_id)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(attachments_src_dir):
                # fix: re.match was called without the string to match
                # against, which raises a TypeError
                match = re.match(rf"{abbrev}(\d+)_(\w+)\.(\w+)",
                                 attachment_file)
                if match is not None:
                    paper_num, type_, ext = match.groups()
                    paper_num = int(paper_num)

                    file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                    dest_path = os.path.join(attachments_dest_dir, file_name)
                    # fix: copy from the source directory — the bare
                    # listdir name would resolve against the CWD
                    attachment_src_path = os.path.join(attachments_src_dir,
                                                       attachment_file)
                    log(f"Copying {attachment_src_path} -> {dest_path}",
                        args.dry_run)
                    if not args.dry_run:
                        shutil.copyfile(attachment_src_path, dest_path)

                    # fix: store path and type together; the XML-writing
                    # loop below reads both fields (the original stored a
                    # bare string and then read .path/.type attributes)
                    collections[collection_id][volume_name][paper_num][
                        "attachments"].append({
                            "path": dest_path,
                            "type": type_,
                        })

    people = AnthologyIndex(None,
                            srcdir=os.path.join(os.path.dirname(sys.argv[0]),
                                                "..", "data"))

    for collection_id, collection in collections.items():
        collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                       f"{collection_id}.xml")
        root_node = make_simple_element("collection",
                                        attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element("volume",
                                              attrib={"id": volume_id},
                                              parent=root_node)
            meta = None
            # fix: look up this volume's book path instead of reading
            # variables leaked from the copy loop above (which always held
            # the values of the last volume processed there)
            book_dest_path = volumes[
                f"{collection_id}-{volume_id}"]["book_dest_path"]

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)

                if paper_node.attrib["id"] == "0":
                    # frontmatter: build the volume's <meta> subtree from it
                    meta = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta.append(title_node)
                    for editor in paper_node.findall("editor"):
                        meta.append(editor)
                    meta.append(paper_node.find("publisher"))
                    meta.append(paper_node.find("address"))
                    meta.append(paper_node.find("month"))
                    meta.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        # fix: use this volume's ID (volume_name was stale)
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_id}",
                            parent=meta)

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove fields that live on the volume's <meta> instead;
                    # fix: iterate a snapshot — removing children while
                    # iterating the element skips siblings
                    for child in list(paper_node):
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                for attachment in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=attachment["path"],
                        attrib={
                            "type": attachment["type"],
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

        # Normalize
        for paper in root_node.findall(".//paper"):
            for oldnode in paper:
                normalize(oldnode, informat="latex")

        # Ensure names are properly identified
        ambiguous = {}
        for paper in root_node.findall(".//paper"):
            anth_id = build_anthology_id(collection_id,
                                         paper.getparent().attrib["id"],
                                         paper.attrib["id"])

            # fix: this loop was dedented out of the paper loop, so only
            # the last paper's authors/editors were ever checked
            for node in chain(paper.findall("author"),
                              paper.findall("editor")):
                name = PersonName.from_element(node)
                ids = people.get_ids(name)
                if len(ids) > 1:
                    print(
                        f"WARNING ({anth_id}): ambiguous author {name}, defaulting to first of {ids}"
                    )
                    ambiguous[anth_id] = (name, ids)

                    node.attrib["id"] = ids[0]

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(collection_file,
                   encoding="UTF-8",
                   xml_declaration=True,
                   with_tail=True)