Code example #1
File: add_award.py Project: pkolachi/acl-anthology
def main(args):
    print(f"Adding {args.award} to {args.anthology_id}...")

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(
        f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is None:
        print(f"Error: Can't find paper {args.anthology_id}, quitting")
        sys.exit(1)

    existing_award = paper.find("./award")
    if existing_award is not None and existing_award.text.lower() == args.award.lower():
        print(
            f"Error: Award {args.award} already exists for {args.anthology_id}, quitting"
        )
        sys.exit(1)
    make_simple_element("award", args.award, parent=paper)
    indent(tree.getroot())

    tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
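
Every example on this page leans on the same helpers from the Anthology's scripts: make_simple_element builds and attaches an XML node, indent re-indents the tree before serialization, and deconstruct_anthology_id splits an ID into its parts. As a rough sketch of what the calls above assume (behavior inferred from the call sites, not copied from the repository):

import lxml.etree as etree

def make_simple_element(tag, text=None, attrib=None, parent=None, namespaces=None):
    # Sketch: build an element, optionally set text/attributes, attach to parent.
    # The namespaces handling is a guess based on the Crossref example further below.
    el = etree.Element(tag, nsmap=namespaces) if namespaces else etree.Element(tag)
    if text is not None:
        el.text = text
    for key, value in (attrib or {}).items():
        el.attrib[key] = value
    if parent is not None:
        parent.append(el)
    return el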
Code example #2
def main(args):

    for xml_file in args.xml_files:
        tree = ET.parse(xml_file)
        for paper in tree.getroot().findall(".//paper"):
            make_simple_element("language", "eng", parent=paper)

        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
Code example #3
def add_video_tag(anth_paper, xml_parse):
    coll_id, vol_id, paper_id = deconstruct_anthology_id(anth_paper.anthology_id)
    paper = xml_parse.find(f'./volume[@id="{vol_id}"]/paper[@id="{paper_id}"]')

    if anth_paper.presentation_id.startswith("http"):
        video_url = anth_paper.presentation_id
    else:
        video_url = "https://slideslive.com/{}".format(anth_paper.presentation_id)

    make_simple_element("video", attrib={"tag": "video", "href": video_url}, parent=paper)
Code example #4
File: add_dois.py Project: dbonadiman/acl-anthology
def add_doi(xml_node, collection_id, volume_id, force=False):
    if 'id' in xml_node.attrib:
        # normal paper
        paper_id = int(xml_node.attrib['id'])
    else:
        # frontmatter
        paper_id = 0

    anth_id = build_anthology_id(collection_id, volume_id, paper_id)
    new_doi_text = f'{data.DOI_PREFIX}{anth_id}'
    doi_url = f'{data.DOI_URL_PREFIX}{data.DOI_PREFIX}{anth_id}'
    if not test_url(doi_url):
        print(f"-> [{anth_id}] Skipping since DOI {doi_url} doesn't exist")
        return False

    doi = xml_node.find('doi')
    if doi is not None:
        if not force:
            print(
                f'-> [{anth_id}] Cowardly refusing to overwrite existing DOI {doi.text} (use --force)',
                file=sys.stderr)
            return False
        xml_node.remove(doi)  # --force: drop the stale DOI before re-adding

    doi = make_simple_element('doi', text=new_doi_text)
    print(f'Adding DOI {new_doi_text}', file=sys.stderr)
    xml_node.append(doi)
    return True
Code example #5
File: add_dois.py Project: sashank06/acl-anthology
def add_doi(xml_node, collection_id, volume_id, force=False):
    if 'id' in xml_node.attrib:
        # normal paper
        paper_id = int(xml_node.attrib['id'])
    else:
        # frontmatter
        paper_id = 0

    anth_id = build_anthology_id(collection_id, volume_id, paper_id)
    new_doi_text = f'{data.DOI_PREFIX}{anth_id}'

    doi = xml_node.find('doi')
    if doi is not None:
        if not force:
            print(f'-> [{anth_id}] Cowardly refusing to overwrite existing DOI {doi.text} (use --force)', file=sys.stderr)
            return False
        xml_node.remove(doi)  # --force: drop the stale DOI before re-adding

    doi_url = f'{data.DOI_URL_PREFIX}{data.DOI_PREFIX}{anth_id}'
    for tries in [1, 2, 3]:  # lots of random failures
        result = test_url_code(doi_url)
        if result.status_code == 200:
            doi = make_simple_element('doi', text=new_doi_text)
            print(f'-> Adding DOI {new_doi_text}', file=sys.stderr)
            xml_node.append(doi)
            return True
        elif result.status_code == 429:  # too many requests
            pause_for = int(result.headers['Retry-After'])
            print(f'--> Got 429, pausing for {pause_for} seconds', file=sys.stderr)
            sleep(pause_for + 1)
        elif result.status_code == 404:  # not found
            break

    print(f"-> Couldn't add DOI {doi_url}", file=sys.stderr)
    return False
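
The loop above is the usual dance for a rate-limited endpoint: honor Retry-After on 429, give up immediately on 404, and otherwise retry a few times. test_url_code and test_url aren't shown in these excerpts; a plausible stand-in with requests (names kept from the call sites, implementation assumed):

import requests

def test_url_code(url):
    # HEAD keeps the probe cheap; callers only look at .status_code and .headers
    return requests.head(url, allow_redirects=True, timeout=30)

def test_url(url):
    # Boolean variant used by code example #4
    return test_url_code(url).status_code == 200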
Code example #6
def process_xml(xml: Path, is_tacl: bool):  # returns (paper, info, issue)
    logging.info("Reading {}".format(xml))

    tree = etree.parse(str(xml))
    root = tree.getroot()
    front = root.find("front")

    info, issue = get_article_journal_info(front, is_tacl)

    paper = etree.Element("paper")

    title_text = get_title(front)
    title = etree.Element("title")
    title.text = title_text
    paper.append(title)

    authors = get_authors(front)
    for given_names, surname in authors:
        first = etree.Element("first")
        first.text = given_names

        last = etree.Element("last")
        last.text = surname

        author = etree.Element("author")
        author.append(first)
        author.append(last)

        paper.append(author)

    doi_text = get_doi(front)
    doi = etree.Element("doi")
    doi.text = doi_text
    paper.append(doi)

    abstract_text = get_abstract(front)
    if abstract_text:
        make_simple_element("abstract", abstract_text, parent=paper)

    pages_tuple = get_pages(front)
    pages = etree.Element("pages")
    pages.text = "–".join(pages_tuple)  # en-dash, not hyphen!
    paper.append(pages)

    return paper, info, issue
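
Most of the element plumbing in process_xml could be collapsed with the same make_simple_element helper the other examples use; the author loop, for instance, reduces to something like:

    for given_names, surname in get_authors(front):
        author = make_simple_element("author", parent=paper)
        make_simple_element("first", given_names, parent=author)
        make_simple_element("last", surname, parent=author)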
Code example #7
def write_bibkeys(anthology, srcdir, commit=False):
    for volume_id, volume in anthology.volumes.items():
        papers_without_bibkey = []

        for paper in volume:
            bibkey = paper.bibkey
            if bibkey is None or bibkey == paper.full_id:
                papers_without_bibkey.append(paper)

        if papers_without_bibkey:
            log.info(
                f"Found {len(papers_without_bibkey):4d} papers without bibkeys in volume {volume_id}"
            )
            if not commit:
                continue
        else:
            continue

        # We got some new bibkeys and need to write them to the XML
        xml_file = os.path.join(srcdir, "xml", f"{volume.collection_id}.xml")
        tree = ET.parse(xml_file)
        root = tree.getroot()

        for paper in papers_without_bibkey:
            if paper.paper_id == "0":
                node = root.find(
                    f"./volume[@id='{paper.volume_id}']/frontmatter")
                if node is None:  # dummy frontmatter
                    continue
            else:
                node = root.find(
                    f"./volume[@id='{paper.volume_id}']/paper[@id='{paper.paper_id}']"
                )
            if node is None:
                log.error(f"Paper {paper.full_id} not found in {xml_file}")
                continue

            # Generate unique bibkey
            bibkey = anthology.pindex.create_bibkey(paper,
                                                    vidx=anthology.venues)
            make_simple_element("bibkey", bibkey, parent=node)

        indent(root)
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
Code example #8
def main(args):

    for line in args.isbn_file:
        venue, isbn = line.rstrip().split()

        xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data",
                                "xml", f"{venue}.xml")
        if not os.path.exists(xml_file):
            print(f"Can't find {xml_file}")
            continue
        tree = ET.parse(xml_file)
        meta = tree.getroot().find(".//volume[@id='1']/meta")
        if meta is not None and meta.find("./isbn") is None:
            print(f"Adding {isbn} to {venue} meta block")
            make_simple_element("isbn", isbn, parent=meta)
        elif meta is not None and meta.find("./isbn") is not None:
            print(f"{venue} already done")

        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
Code example #9
def main(args):
    """
    Downloads an Anthology paper and adds a RETRACTED watermark, then updates the XML
    with an appropriate <revision> and <retracted> tag.
    """

    with tempfile.TemporaryDirectory() as tempdir:

        new_pdf = add_watermark(args.anthology_id, workdir=tempdir)

        add_revision(
            args.anthology_id,
            new_pdf,
            explanation="Retracted.",
            change_type="revision",
            dry_run=False,
        )

        xml_file = get_xml_file(args.anthology_id)
        collection_id, volume_id, paper_id = deconstruct_anthology_id(
            args.anthology_id)
        tree = ET.parse(xml_file)
        if paper_id == "0":
            paper = tree.getroot().find(
                f"./volume[@id='{volume_id}']/frontmatter")
        else:
            paper = tree.getroot().find(
                f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")

        if paper is None:
            print(f"Couldn't find paper {args.anthology_id}!", file=sys.stderr)
            sys.exit(2)

        print("Modifying the XML", file=sys.stderr)
        now = datetime.now()
        date = f"{now.year}-{now.month:02d}-{now.day:02d}"
        retracted_node = make_simple_element("retracted",
                                             args.explanation,
                                             attrib={"date": date},
                                             parent=paper)
        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
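
add_watermark isn't part of this excerpt. A common way to implement it is to render a diagonal stamp with reportlab and merge it onto every page with pypdf; the sketch below assumes exactly that (function name adapted from the call site, implementation guessed):

import io
from pypdf import PdfReader, PdfWriter
from reportlab.pdfgen import canvas

def add_watermark_sketch(in_pdf, out_pdf, text="RETRACTED"):
    # Render a one-page overlay carrying the stamp text
    buf = io.BytesIO()
    c = canvas.Canvas(buf)
    c.translate(300, 400)
    c.rotate(45)
    c.setFont("Helvetica-Bold", 60)
    c.setFillGray(0.5, 0.4)  # mid-gray at 40% opacity
    c.drawCentredString(0, 0, text)
    c.save()
    stamp = PdfReader(buf).pages[0]

    # Merge the overlay onto every page of the original
    writer = PdfWriter()
    for page in PdfReader(in_pdf).pages:
        page.merge_page(stamp)
        writer.add_page(page)
    with open(out_pdf, "wb") as f:
        writer.write(f)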
Code example #10
File: ingest.py Project: dbonadiman/acl-anthology
    # Normalize
    for paper in root_being_added.findall('.//paper'):
        for oldnode in paper:
            process(oldnode, informat='xml')

    # Ingest each volume.
    # First, find the XML file.
    collection_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data',
                                   'xml', f'{collection_id}.xml')

    if os.path.exists(collection_file):
        existing_tree = etree.parse(collection_file)
    else:
        existing_tree = etree.ElementTree(
            make_simple_element('collection', attrib={'id': collection_id}))

    # Insert each volume
    for new_volume in root_being_added.findall('volume'):
        new_volume_id = int(new_volume.attrib['id'])
        existing_volume = existing_tree.getroot().find(
            f"./volume[@id='{new_volume_id}']")
        if existing_volume is None:
            new_volume.attrib['ingest-date'] = args.ingest_date

            # Find the insertion point among the other volumes
            insertion_point = 0
            for i, volume in enumerate(existing_tree.getroot()):
                if new_volume_id < int(volume.attrib['id']):
                    break
                insertion_point = i + 1
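
The excerpt stops right after computing insertion_point; presumably the next step is the actual insertion, along the lines of:

            existing_tree.getroot().insert(insertion_point, new_volume)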
Code example #11
def main(args):
    code, year, _ = os.path.basename(args.tsv_file.name).split(".")

    collection_id = f"{year}.{code}"

    tree = etree.ElementTree(
        make_simple_element("collection", attrib={"id": collection_id}))

    volume_id = "1"
    volume = make_simple_element("volume", attrib={"id": volume_id})
    tree.getroot().insert(0, volume)

    # Create the metadata for the paper
    meta = None
    for row in csv.DictReader(args.meta_file, delimiter="\t"):
        if row["Conference code"] == collection_id:
            if row["Completed"] == "FALSE":
                print(
                    f"Warning: Conference {collection_id} is not marked as completed, can't ingest."
                )
                sys.exit(1)

            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle",
                                row["Conference title"],
                                parent=meta)
            make_simple_element("publisher", row["Publisher"], parent=meta)
            make_simple_element("address", row["Location"], parent=meta)
            make_simple_element("month", row["Dates held"], parent=meta)
            make_simple_element("year", row["Year"], parent=meta)
            if row["Editors"] != "" and "?" not in row["Editors"]:
                editors = row["Editors"].split(" and ")
                for editor_name in editors:
                    editor = make_simple_element("editor", parent=meta)
                    last, first = editor_name.split(", ")
                    make_simple_element("first", first, parent=editor)
                    make_simple_element("last", last, parent=editor)
            break
    else:
        print(
            f"Couldn't find conference code {collection_id} in 'Conference code' field of metadata file {args.meta_file.name}",
            file=sys.stderr)
        sys.exit(1)

    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    # Create entries for all the papers
    for paperid, row in enumerate(
            csv.DictReader(args.tsv_file, delimiter='\t'), 1):
        title_text = row["Title"]
        author_list = row["Authors"].split(" and ")
        pdf = row["Pdf"]

        paper = make_simple_element("paper",
                                    attrib={"id": str(paperid)},
                                    parent=volume)

        make_simple_element("title", title_text, parent=paper)
        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            print(author_name)
            last, first = author_name.split(", ")
            make_simple_element("first", first, parent=author)
            make_simple_element("last", last, parent=author)

        url = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(collection_id, f"{url}.pdf")
        make_simple_element("url", url, parent=paper)
        download(pdf, pdf_local_path)

        if "Abstract" in row:
            make_simple_element("abstract", row["Abstract"], parent=paper)

        if "Presentation" in row:
            extension = row["Presentation"].split(".")[-1]
            filename = f"{collection_id}-{volume_id}.{paperid}.Presentation.{extension}"
            make_simple_element("attachment",
                                filename,
                                attrib={"type": "presentation"})
            download(row["Presentation"], os.path.join(collection_id,
                                                       filename))

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    collection_file = os.path.join(args.anthology, "data", "xml",
                                   f"{collection_id}.xml")
    tree.write(collection_file,
               encoding="UTF-8",
               xml_declaration=True,
               with_tail=True)
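
download(url, local_path) is another repository helper these ingest scripts assume; the check in code example #16 (if download(...)) suggests it returns a truthy value on success. A minimal stand-in with that assumed contract:

import sys
import urllib.error
import urllib.request

def download(url, local_path):
    try:
        print(f"Downloading {url} to {local_path}", file=sys.stderr)
        with urllib.request.urlopen(url) as remote, open(local_path, "wb") as fh:
            fh.write(remote.read())
    except (OSError, urllib.error.URLError) as e:
        print(f"Download failed: {e}", file=sys.stderr)
        return False
    return True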
Code example #12
def main(args):
    def maybe_copy(file_from, file_to):
        if not args.dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f"-> Downloading file from {args.path}", file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(
                    input_file_path, mode="wb") as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print("An SSL error was encountered in downloading the files.",
                  file=sys.stderr)
            sys.exit(1)
    else:
        input_file_path = args.path

    detected = filetype.guess(input_file_path)
    if detected is None or not detected.mime.endswith(detected.extension):
        mime_type = 'UNKNOWN' if detected is None else detected.mime
        print(
            f"FATAL: {args.anthology_id} file {args.path} has MIME type {mime_type}",
            file=sys.stderr,
        )
        sys.exit(1)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)
    paper_extension = args.path.split(".")[-1]

    # The new version
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(
        f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not args.dry_run:
            if not args.erratum and revno == 2:
                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{args.anthology_id}{change_letter}1",
                        "hash": checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                args.explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{args.anthology_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": args.date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)

    else:
        print(
            f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    output_dir = os.path.join(args.anthology_dir, "pdf", collection_id[0],
                              collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    if not args.erratum and revno == 2:
        # There are no versioned files the first time around, so create the first one
        # (essentially backing up the original version)
        revised_file_v1_path = os.path.join(
            output_dir, f"{args.anthology_id}{change_letter}1.pdf")

        current_version = ANTHOLOGY_PDF.format(args.anthology_id)
        if not args.dry_run:
            try:
                print(
                    f"-> Downloading file from {args.path} to {revised_file_v1_path}",
                    file=sys.stderr,
                )
                with urllib.request.urlopen(current_version) as url, open(
                        revised_file_v1_path, mode="wb") as fh:
                    fh.write(url.read())
            except ssl.SSLError:
                print(
                    f"-> FATAL: An SSL error was encountered in downloading {args.path}.",
                    file=sys.stderr,
                )
                sys.exit(1)
        else:
            print(
                f"-> DRY RUN: Downlading file from {args.path} to {revised_file_v1_path}",
                file=sys.stderr,
            )

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    if args.path.startswith("http"):
        os.remove(input_file_path)
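
compute_hash (and compute_hash_from_file, used by the ingest examples below) produces the hash attribute the Anthology stores next to each URL so that changed files can be detected. The actual digest algorithm isn't visible in these excerpts; any stable digest of the raw bytes has the right shape, e.g.:

import hashlib

def compute_hash(data: bytes) -> str:
    # Stand-in digest; the Anthology's real algorithm choice isn't shown here
    return hashlib.sha1(data).hexdigest()

def compute_hash_from_file(path: str) -> str:
    with open(path, "rb") as f:
        return compute_hash(f.read())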
Code example #13
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..",
                                     "data")
    venue_index = VenueIndex(srcdir=anthology_datadir)
    venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()]

    sig_index = SIGIndex(srcdir=anthology_datadir)

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        venue_abbrev = meta["abbrev"]
        venue_slug = venue_index.get_slug(venue_abbrev)

        if str(datetime.now().year) in venue_abbrev:
            print(
                f"Fatal: Venue assembler put year in acronym: '{venue_abbrev}'"
            )
            sys.exit(1)

        if venue_slug not in venue_keys:
            unseen_venues.append((venue_slug, venue_abbrev, meta["title"]))

        meta["path"] = proceedings

        meta["collection_id"] = collection_id = meta["year"] + "." + venue_slug
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print(f"Error: duplicate volume ID {volume_full_id}, quitting")
            sys.exit(1)

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

        if "sig" in meta:
            print(
                f"Add this line to {anthology_datadir}/sigs/{meta['sig'].lower()}.yaml:"
            )
            print(f"  - {meta['year']}:")
            print(f"    - {volume_full_id} # {meta['booktitle']}")

    # Make sure all venues exist
    if len(unseen_venues) > 0:
        for venue in unseen_venues:
            slug, abbrev, title = venue
            print(f"Creating venue '{abbrev}' ({title})")
            venue_index.add_venue(abbrev, title)
        venue_index.dump(directory=anthology_datadir)

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        # copy the book
        book_src_filename = f'{year}.{meta["abbrev"]}-{volume_name}.pdf'
        book_src_path = os.path.join(root_path, book_src_filename)
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") +
                ".pdf")

            if not args.dry_run:
                maybe_copy(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # Skip . files
            if os.path.basename(pdf_file).startswith("."):
                continue

            # names are {abbrev}{number}.pdf
            match = re.match(rf".*\.(\d+)\.pdf", pdf_file)

            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir,
                    f"{collection_id}-{volume_name}.{paper_num}.pdf")
                if not args.dry_run:
                    maybe_copy(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir,
                                                venue_name)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(
                    os.path.join(root_path, "additional")):
                if os.path.basename(attachment_file).startswith("."):
                    continue
                attachment_file_path = os.path.join(root_path, "additional",
                                                    attachment_file)
                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$",
                    attachment_file)
                if match is None:
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}",
                        args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num][
                    "attachments"].append((dest_path, type_))

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def correct_caps(person, name_node, anth_id):
        """
        Many people submit their names in "ALL CAPS" or "all lowercase".
        Correct this with heuristics.
        """
        name = name_node.text
        if name.islower() or name.isupper():
            # capitalize all parts
            corrected = " ".join(
                list(map(lambda x: x.capitalize(), name.split())))
            print(
                f"-> Correcting capitalization of '{name}' to '{corrected}'",
                file=sys.stderr,
            )
            name_node.text = corrected

    def disambiguate_name(node, anth_id):
        name = PersonName.from_element(node)
        ids = people.get_ids(name)

        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; Please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))

            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        # Newly added volumes, so we can normalize and name-disambig later
        newly_added_volumes = []

        collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                       f"{collection_id}.xml")
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection",
                                            attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume",
                attrib={
                    "id": volume_id,
                    "ingest-date": args.ingest_date
                },
            )

            # Replace the existing one if present
            existing_volume_node = root_node.find(
                f"./volume[@id='{volume_id}']")
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for author_or_editor in chain(
                            paper_node.findall("./author"),
                            paper_node.findall("./editor")):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={
                                "hash": compute_hash_from_file(book_dest_path)
                            },
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields
                    for child in paper_node:
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                # Adjust the language tag
                language_node = paper_node.find("./language")
                if language_node is not None:
                    try:
                        lang = iso639.languages.get(name=language_node.text)
                    except KeyError:
                        raise Exception(
                            f"Can't find language '{language_node.text}'")
                    language_node.text = lang.part3
                    print(language_node.text)

                # Fix author names
                for name_node in chain(paper_node.findall("./author"),
                                       paper_node.findall("./editor")):
                    disambiguate_name(name_node, paper_id_full)
                    person = PersonName.from_element(name_node)
                    for name_part in name_node:
                        correct_caps(person, name_part, paper_id_full)

        # Other data from the meta file
        if "isbn" in meta:
            make_simple_element("isbn", meta["isbn"], parent=meta_node)

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(collection_file,
                   encoding="UTF-8",
                   xml_declaration=True,
                   with_tail=True)
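
read_meta, called at the top of this script, parses the per-proceedings meta file into a flat dict; the keys referenced above (abbrev, year, volume, title, sig, isbn) suggest a simple key/value layout. A plausible sketch, assuming one "key value" pair per line (the real format may differ):

def read_meta(path):
    meta = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            key, value = line.split(maxsplit=1)
            meta[key] = value
    return meta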
Code example #14
    logging.basicConfig(level=args.verbose)

    is_tacl = "tacl" in args.year_root.stem

    venue = TACL if is_tacl else CL  # J for CL, Q for TACL.
    year = args.year_root.stem.split(".")[1]
    year_suffix = year[-2:]  # Feels hacky, too.

    collection_id = year + "." + venue

    collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                   f"{collection_id}.xml")
    if os.path.exists(collection_file):
        collection = etree.parse(collection_file).getroot()
    else:
        collection = make_simple_element("collection",
                                         attrib={"id": collection_id})

    tacl_glob = "tacl.20*.*/tacl.20*.*.xml"
    # volume_info = get_volume_info(list(args.year_root.glob("*.*.*/*.*.*.xml"))[0])
    # volume.append(volume_info)

    pdf_destination = Path(args.pdfs_dir)
    pdf_destination = pdf_destination / "pdf" / venue
    pdf_destination.mkdir(parents=True, exist_ok=True)

    previous_issue_info = None

    papers = []
    for xml in sorted(args.year_root.glob("*_a_*/*.xml")):
        # print(xml)
Code example #15
def main(volumes):

    formatter = MarkupFormatter()

    ## Assemble container
    doi_batch = make_simple_element(
        'doi_batch',
        attrib={
            'xmlns': 'http://www.crossref.org/schema/4.4.1',
            '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation':
            'http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd',
            'version': '4.4.1'
        },
        namespaces={'xsi': 'http://www.w3.org/2001/XMLSchema-instance'})
    new_volume = etree.ElementTree(doi_batch)

    ## Assemble head
    head = make_simple_element('head', parent=new_volume.getroot())
    dbi = make_simple_element('doi_batch_id',
                              text=str(int(time.time())),
                              parent=head)

    timestamp = make_simple_element('timestamp',
                                    text=str(int(time.time())),
                                    parent=head)

    depositor = make_simple_element('depositor', parent=head)
    depositor_name = make_simple_element('depositor_name',
                                         text=DEPOSITOR_NAME,
                                         parent=depositor)
    email_address = make_simple_element('email_address',
                                        text=EMAIL_ADDRESS,
                                        parent=depositor)

    registrant = make_simple_element('registrant',
                                     text=REGISTRANT,
                                     parent=head)

    ## Assemble body
    body = make_simple_element('body', parent=new_volume.getroot())

    year = ""
    start_month = ""
    end_month = ""

    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)

        collection_file = os.path.join(os.path.dirname(sys.argv[0]), '..',
                                       'data', 'xml', f'{collection_id}.xml')
        tree = etree.parse(collection_file)

        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        ## Assemble frontmatter
        c = make_simple_element('conference', parent=body)
        contribs = make_simple_element('contributors', parent=c)
        editor_index = 0

        meta = v.find('./meta')
        for tag in meta:
            if tag.tag == 'year':
                year = tag.text
            elif tag.tag == 'month':
                month = tag.text
                try:
                    start_month = MONTH_HASH[re.split('[-–]', month)[0]]
                    end_month = MONTH_HASH[re.split('[-–]', month)[1]]
                except IndexError as e:  # only one month
                    start_month = MONTH_HASH[month]
                    end_month = MONTH_HASH[month]
            elif tag.tag == 'url':
                url = tag.text
            elif tag.tag == 'booktitle':
                booktitle = tag.text
            elif tag.tag == 'address':
                address = tag.text
            elif tag.tag == 'publisher':
                publisher = tag.text
            elif tag.tag == 'editor':
                pn = make_simple_element(
                    'person_name',
                    parent=contribs,
                    attrib={
                        'contributor_role': 'chair',
                        'sequence':
                        'first' if editor_index == 0 else 'additional'
                    })
                editor_index += 1

                for name_part in tag:
                    if name_part.tag == 'first':
                        gn = make_simple_element('given_name',
                                                 parent=pn,
                                                 text=name_part.text)
                    elif name_part.tag == 'last':
                        sn = make_simple_element('surname',
                                                 text=name_part.text,
                                                 parent=pn)

        # Assemble Event Metadata
        em = make_simple_element('event_metadata', parent=c)
        cn = make_simple_element('conference_name', parent=em, text=booktitle)
        cl = make_simple_element('conference_location',
                                 parent=em,
                                 text=address)
        cd = make_simple_element('conference_date',
                                 parent=em,
                                 attrib={
                                     'start_year': year,
                                     'end_year': year,
                                     'start_month': start_month,
                                     'end_month': end_month
                                 })

        # Assemble Proceedings Metadata
        pm = make_simple_element('proceedings_metadata',
                                 parent=c,
                                 attrib={'language': 'en'})
        pt = make_simple_element('proceedings_title',
                                 parent=pm,
                                 text=booktitle)
        p = make_simple_element('publisher', parent=pm)
        pn = make_simple_element('publisher_name', parent=p, text=publisher)
        pp = make_simple_element('publisher_place',
                                 parent=p,
                                 text=PUBLISHER_PLACE)
        pd = make_simple_element('publication_date', parent=pm)
        y = make_simple_element('year', parent=pd, text=year)
        noisbn = make_simple_element('noisbn',
                                     parent=pm,
                                     attrib={'reason': 'simple_series'})

        # DOI assignation data
        dd = make_simple_element('doi_data', parent=pm)
        doi = make_simple_element('doi', parent=dd, text=DOI_PREFIX + url)
        resource = make_simple_element('resource',
                                       parent=dd,
                                       text=ANTHOLOGY_URL.format(url))

        for paper in v.findall('./paper'):
            ## Individual Paper Data

            # TODO: this is not future-proof, should use anthology.util library functions
            aa_id = ""
            if len(url) == 6:
                aa_id = '{:02d}'.format(int(paper.attrib['id']))
            elif len(url) == 5:
                aa_id = '{:03d}'.format(int(paper.attrib['id']))

            cp = make_simple_element('conference_paper', parent=c)

            # contributors
            contribs = make_simple_element('contributors', parent=cp)
            author_index = 0
            for author in paper.findall('./author'):
                pn = make_simple_element(
                    'person_name',
                    parent=contribs,
                    attrib={
                        'contributor_role': 'author',
                        'sequence':
                        'first' if author_index == 0 else 'additional'
                    })
                author_index += 1

                for name_part in author:
                    if name_part.tag == 'first':
                        gn = make_simple_element('given_name',
                                                 parent=pn,
                                                 text=name_part.text)
                    elif name_part.tag == 'last':
                        sn = make_simple_element('surname',
                                                 text=name_part.text,
                                                 parent=pn)

            for title in paper.iter(tag='title'):
                o_titles = make_simple_element('titles', parent=cp)
                o_title = make_simple_element('title',
                                              parent=o_titles,
                                              text=formatter.as_text(title))

            pd = make_simple_element('publication_date', parent=cp)
            o_year = make_simple_element('year', parent=pd)
            o_year.text = year

            for pages in paper.iter(tag='pages'):
                o_pages = make_simple_element('pages', parent=cp)
                fp = make_simple_element('first_page', parent=o_pages)
                lp = make_simple_element('last_page', parent=o_pages)
                try:
                    fp.text = re.split('[-–]', pages.text)[0]
                    lp.text = re.split('[-–]', pages.text)[1]
                except IndexError as e:  # only one page
                    fp.text = pages.text
                    lp.text = pages.text

            # DOI assignation data
            dd = make_simple_element('doi_data', parent=cp)
            doi = make_simple_element('doi',
                                      parent=dd,
                                      text=DOI_PREFIX + url + aa_id)
            resource = make_simple_element('resource',
                                           parent=dd,
                                           text=ANTHOLOGY_URL.format(url +
                                                                     aa_id))

    print(
        etree.tostring(new_volume,
                       pretty_print=True,
                       encoding='UTF-8',
                       xml_declaration=True,
                       with_tail=True).decode('utf-8'))
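
Two lxml details carry the Crossref example above: the namespaces argument presumably becomes the root element's nsmap, and namespaced attributes are addressed in Clark notation ({uri}localname), which is why the schemaLocation key is spelled the way it is. In bare lxml that combination looks like:

import lxml.etree as etree

XSI = "http://www.w3.org/2001/XMLSchema-instance"
root = etree.Element("doi_batch", nsmap={"xsi": XSI})
root.attrib[f"{{{XSI}}}schemaLocation"] = (
    "http://www.crossref.org/schema/4.4.1 "
    "http://www.crossref.org/schema/deposit/crossref4.4.1.xsd"
)
# Serializes as <doi_batch xmlns:xsi="..." xsi:schemaLocation="...">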
Code example #16
def main(args):
    year, venue, _ = os.path.basename(args.tsv_file.name).split(".")

    # Set the volume name from the collection file, or default to 1
    # The file name is either "2012.eamt.tsv" or "2012.eamt-main.tsv".
    # The default volume name is "1".
    if "-" in venue:
        venue, volume_id = venue.split("-")
    else:
        volume_id = "1"

    collection_id = f"{year}.{venue}"

    tree = etree.ElementTree(
        make_simple_element("collection", attrib={"id": collection_id}))

    now = datetime.now()
    today = f"{now.year}-{now.month:02d}-{now.day:02d}"

    volume = make_simple_element("volume",
                                 attrib={
                                     "id": volume_id,
                                     "ingest-date": today
                                 })
    tree.getroot().insert(0, volume)

    # Location of entire-proceedings PDF
    proceedings_pdf = args.proceedings

    # Create the metadata for the paper
    meta = None
    for row in csv.DictReader(args.meta_file, delimiter="\t"):
        current_collection_id = f"{row['Year']}.{row['Conference code']}"
        if current_collection_id == collection_id:
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle",
                                row["Conference title"],
                                parent=meta)
            make_simple_element("publisher", row["Publisher"], parent=meta)
            make_simple_element("address", row["Location"], parent=meta)
            make_simple_element("month", row["Dates held"], parent=meta)
            make_simple_element("year", row["Year"], parent=meta)

            url = row["URL"]

            if url.endswith(".pdf"):
                if proceedings_pdf:
                    print(
                        "Overriding --proceedings with proceedings PDF found in conference list",
                        file=sys.stderr,
                    )
                proceedings_pdf = url

            elif "Complete PDF" in row and row["Complete PDF"] != "":
                proceedings_pdf = row["Complete PDF"]

            # volume PDF
            if proceedings_pdf is not None:
                volume_anth_id = f"{collection_id}-{volume_id}"
                pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                              f"{volume_anth_id}.pdf")
                download(proceedings_pdf, pdf_local_path)
                with open(pdf_local_path, "rb") as f:
                    checksum = compute_hash(f.read())
                make_simple_element("url",
                                    volume_anth_id,
                                    attrib={"hash": checksum},
                                    parent=meta)
                proceedings_pdf = pdf_local_path

            if row["Editors"] != "" and "?" not in row["Editors"]:
                editors = row["Editors"].split(" and ")
                for editor_name in editors:
                    editor = make_simple_element("editor", parent=meta)
                    if ", " in editor_name:
                        last, first = editor_name.split(", ")
                    else:
                        first, last = (
                            ' '.join(editor_name.split()[:-1]),
                            editor_name.split()[-1],
                        )
                    make_simple_element("first", first, parent=editor)
                    make_simple_element("last", last, parent=editor)
            break
    else:
        print(
            f"Couldn't find conference code {collection_id} in 'Conference code' field of metadata file {args.meta_file.name}",
            file=sys.stderr,
        )
        sys.exit(1)

    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    paperid = 0
    # Create entries for all the papers
    for row in csv.DictReader(args.tsv_file, delimiter='\t'):
        pages = row.get("Pagenumbers", None)

        title_text = row["Title"]

        # The first row might be front matter (needs a special name)
        if title_text == "Frontmatter" and paperid == 0:
            paper = make_simple_element("frontmatter", parent=volume)

        else:
            paperid += 1
            paper = make_simple_element("paper",
                                        attrib={"id": str(paperid)},
                                        parent=volume)
            # Only make the title for not-the-frontmatter
            make_simple_element("title", title_text, parent=paper)

        author_list = row["Authors"].split(" and ")

        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            if ", " in author_name:
                last, first = author_name.split(", ")
            else:
                first, last = ' '.join(
                    author_name.split()[:-1]), author_name.split()[-1]
            make_simple_element("first", first, parent=author)
            make_simple_element("last", last, parent=author)

        if pages is not None:
            make_simple_element("pages", pages, parent=paper)

        # Find the PDF, either listed directly, or extracted from the proceedings PDF
        anth_id = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                      f"{anth_id}.pdf")
        url = None
        if "Pdf" in row and row["Pdf"] != "":
            if download(row["Pdf"], pdf_local_path):
                url = anth_id

        elif "pages in pdf" in row:
            pdf_pages = row["pages in pdf"]
            extract_pages(proceedings_pdf, pdf_pages, pdf_local_path)
            url = anth_id

        if url is not None:
            with open(pdf_local_path, "rb") as f:
                checksum = compute_hash(f.read())

            make_simple_element("url",
                                url,
                                attrib={"hash": checksum},
                                parent=paper)

        if "Abstract" in row:
            make_simple_element("abstract", row["Abstract"], parent=paper)

        if "Presentation" in row:
            url = row["Presentation"]
            if url is not None and url != "" and url != "None":
                extension = row["Presentation"].split(".")[-1]
                name = f"{anth_id}.Presentation.{extension}"
                local_path = os.path.join(
                    args.anthology_files_path,
                    "..",
                    "attachments",
                    venue,
                    name,
                )
                if download(row["Presentation"], local_path):
                    make_simple_element("attachment",
                                        name,
                                        attrib={"type": "presentation"},
                                        parent=paper)

        # Normalize
        for node in paper:
            normalize(node, informat="latex")

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    collection_file = os.path.join(args.anthology, "data", "xml",
                                   f"{collection_id}.xml")
    tree.write(collection_file,
               encoding="UTF-8",
               xml_declaration=True,
               with_tail=True)
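
extract_pages carves an individual paper out of the whole-proceedings PDF. Assuming the "pages in pdf" column holds a 1-based inclusive range like "17-24", a sketch with pypdf (the helper's name comes from the call site; the range format is a guess):

from pypdf import PdfReader, PdfWriter

def extract_pages(proceedings_pdf, page_range, out_path):
    # page_range assumed to look like "17-24" (1-based, inclusive)
    start, end = (int(p) for p in page_range.split("-"))
    reader = PdfReader(proceedings_pdf)
    writer = PdfWriter()
    for page in reader.pages[start - 1:end]:
        writer.add_page(page)
    with open(out_path, "wb") as f:
        writer.write(f)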
Code example #17
File: ingest.py Project: namrathaurs/acl-anthology
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..",
                                     "data")
    venue_keys = [
        venue["slug"].lower()
        for _, venue in VenueIndex(srcdir=anthology_datadir).items()
    ]

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        venue_name = meta["abbrev"].lower()

        if venue_name not in venue_keys:
            unseen_venues.append(meta["abbrev"])

        meta["path"] = proceedings

        meta["collection_id"] = collection_id = (meta["year"] + "." +
                                                 meta["abbrev"].lower())
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print(f"Error: duplicate volume ID {volume_full_id}, quitting")
            sys.exit(1)

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Make sure all venues exist
    if len(unseen_venues) > 0:
        print("FATAL: The following venue(s) don't exist in venues.yaml")
        for venue in unseen_venues:
            print(f"- {venue}")
        print("Please create entries for them and re-ingest.")
        sys.exit(1)

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        # copy the book
        book_src_filename = meta["abbrev"] + "-" + year
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") +
                ".pdf")

            if not args.dry_run and not os.path.exists(book_dest_path):
                log(f"Copying {book_src_path} -> {book_dest_path}",
                    args.dry_run)
                shutil.copyfile(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # names are {abbrev}{number}.pdf
            match = re.match(rf".*\.(\d+)\.pdf", pdf_file)

            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir,
                    f"{collection_id}-{volume_name}.{paper_num}.pdf")
                if not args.dry_run and not os.path.exists(pdf_dest_path):
                    log(f"Copying {pdf_src_path} -> {pdf_dest_path}",
                        args.dry_run)
                    shutil.copyfile(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir,
                                                venue_name)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(
                    os.path.join(root_path, "additional")):
                attachment_file_path = os.path.join(root_path, "additional",
                                                    attachment_file)
                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$",
                    attachment_file)
                if match is None:
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}",
                        args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num][
                    "attachments"].append((dest_path, type_))

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def disambiguate_name(node, anth_id):
        name = PersonName.from_element(node)
        ids = people.get_ids(name)

        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; Please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))

            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        # Newly added volumes, so we can normalize and name-disambig later
        newly_added_volumes = []

        collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                       f"{collection_id}.xml")
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection",
                                            attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume",
                attrib={
                    "id": volume_id,
                    "ingest-date": args.ingest_date
                },
            )

            # Replace the existing volume if present, otherwise append
            existing_volume_node = root_node.find(
                f"./volume[@id='{volume_id}']")
            if existing_volume_node is not None:
                root_node.replace(existing_volume_node, volume_node)
            else:
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for author_or_editor in chain(
                            paper_node.findall("./author"),
                            paper_node.findall("./editor")):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={
                                "hash": compute_hash_from_file(book_dest_path)
                            },
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields
                    for child in paper_node:
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                for name_node in chain(paper_node.findall("./author"),
                                       paper_node.findall("./editor")):
                    disambiguate_name(name_node, paper_id_full)

        # Other data from the meta file
        if meta_node is not None and "isbn" in meta:
            make_simple_element("isbn", meta["isbn"], parent=meta_node)

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(collection_file,
                   encoding="UTF-8",
                   xml_declaration=True,
                   with_tail=True)
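# The checksum helpers used for the "hash" attributes above are not shown in
# this snippet. A minimal sketch, assuming a plain content digest (the
# Anthology's actual choice of hash function may differ):
import hashlib

def compute_hash(data):
    # digest of raw bytes
    return hashlib.sha1(data).hexdigest()

def compute_hash_from_file(path):
    with open(path, "rb") as f:
        return compute_hash(f.read())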
Code example #18
File: ingest_tsv.py Project: xinru1414/acl-anthology
def main(args):
    year = args.year
    venue = args.venue
    volume_id = args.volume
    collection_id = f"{year}.{venue}"

    splitter = NameSplitter(anthology_dir=args.anthology_dir)

    collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                   f"{collection_id}.xml")
    if os.path.exists(collection_file):
        tree = etree.parse(collection_file)
    else:
        tree = etree.ElementTree(
            make_simple_element("collection", attrib={"id": collection_id}))

    now = datetime.now()
    today = f"{now.year}-{now.month:02d}-{now.day:02d}"

    volume_node = tree.getroot().find(f"./volume[@id='{volume_id}']")
    if volume_node is not None:
        tree.getroot().remove(volume_node)

    volume = make_simple_element("volume",
                                 attrib={
                                     "id": volume_id,
                                     "ingest-date": today
                                 },
                                 parent=tree.getroot())

    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    # Create entries for all the papers
    has_frontmatter = False
    for rowid, row in enumerate(
            csv.DictReader(args.tsv_file, delimiter=args.delimiter)):
        pages = row.get("pages", None)

        if rowid == 0:
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row["booktitle"], parent=meta)
            make_simple_element("publisher", row["publisher"], parent=meta)
            make_simple_element("address", row["address"], parent=meta)
            make_simple_element("month", row["month"], parent=meta)
            make_simple_element("year", year, parent=meta)

            editors = row["author"].split(" and ")
            row["author"] = ""
            for editor_name in editors:
                editor = make_simple_element("editor", parent=meta)
                surname, givenname = splitter.best_split(editor_name)
                make_simple_element("first", givenname, parent=editor)
                make_simple_element("last", surname, parent=editor)

            # volume PDF
            proceedings_pdf = args.proceedings_pdf
            if proceedings_pdf is not None:
                volume_anth_id = f"{collection_id}-{volume_id}"
                pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                              f"{volume_anth_id}.pdf")
                retrieve_url(proceedings_pdf, pdf_local_path)
                checksum = compute_hash_from_file(pdf_local_path)
                make_simple_element("url",
                                    volume_anth_id,
                                    attrib={"hash": checksum},
                                    parent=meta)
                proceedings_pdf = pdf_local_path

        title_text = row["title"]

        # The first row might be front matter (needs a special name)
        if rowid == 0 and title_text.lower() in [
                "frontmatter", "front matter"
        ]:
            has_frontmatter = True
            paperid = 0
            paper = make_simple_element("frontmatter", parent=volume)
        else:
            # Paper ids start at 1; when there is no front matter row, shift
            # the 0-based row index so that no two rows share an id
            paperid = rowid if has_frontmatter else rowid + 1
            paper = make_simple_element("paper",
                                        attrib={"id": str(paperid)},
                                        parent=volume)
            # Only make the title for not-the-frontmatter
            make_simple_element("title", title_text, parent=paper)

        author_list = row["author"].split(" and ")

        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            surname, givenname = splitter.best_split(author_name)
            make_simple_element("first", givenname, parent=author)
            make_simple_element("last", surname, parent=author)

        if pages is not None and pages != "":
            make_simple_element("pages", pages, parent=paper)

        # Find the PDF, either listed directly, or extracted from the proceedings PDF
        anth_id = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                      f"{anth_id}.pdf")
        url = None
        if "pdf" in row and row["pdf"] != "":
            if retrieve_url(row["pdf"], pdf_local_path):
                url = anth_id
            else:
                print("Can't find", row["pdf"])

        elif "pages in pdf" in row:
            pdf_pages = row["pages"]
            extract_pages(proceedings_pdf, pdf_pages, pdf_local_path)
            url = anth_id

        if url is not None:
            checksum = compute_hash_from_file(pdf_local_path)

            make_simple_element("url",
                                url,
                                attrib={"hash": checksum},
                                parent=paper)

        if "abstract" in row and row["abstract"] != "":
            make_simple_element("abstract", row["abstract"], parent=paper)

        if "presentation" in row:
            url = row["presentation"]
            if url is not None and url != "" and url != "None":
                extension = row["presentation"].split(".")[-1]
                name = f"{anth_id}.Presentation.{extension}"
                local_path = os.path.join(
                    args.anthology_files_path,
                    "..",
                    "attachments",
                    venue,
                    name,
                )
                if retrieve_url(row["presentation"], local_path):
                    make_simple_element(
                        "attachment",
                        name,
                        attrib={
                            "type": "presentation",
                            "hash": compute_hash_from_file(local_path),
                        },
                        parent=paper,
                    )

        # Normalize
        for node in paper:
            normalize(node, informat="latex")

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    tree.write(collection_file,
               encoding="UTF-8",
               xml_declaration=True,
               with_tail=True)
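# extract_pages() (used above to pull a single paper out of the full
# proceedings PDF) is not defined in this snippet. A sketch of what it might
# do, using pypdf; the real helper and its page-range format are assumptions:
import re
from pypdf import PdfReader, PdfWriter

def extract_pages(src_pdf, page_range, dest_pdf):
    # page_range is assumed to look like "11-24" or "11–24" (1-based, inclusive)
    parts = re.split("[-–]", page_range)
    first, last = int(parts[0]), int(parts[-1])
    reader = PdfReader(src_pdf)
    writer = PdfWriter()
    for page in reader.pages[first - 1:last]:
        writer.add_page(page)
    with open(dest_pdf, "wb") as f:
        writer.write(f)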
Code example #19
def main(args):
    def maybe_copy(file_from, file_to):
        if not args.dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        download_file(args.path, input_file_path)
    else:
        input_file_path = args.path

    validate_file_type(input_file_path)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)
    paper_extension = args.path.split(".")[-1]

    # The new version
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
    if is_newstyle_id(args.anthology_id):
        venue_name = collection_id.split(".")[1]
        output_dir = os.path.join(args.anthology_dir, "pdf", venue_name)
    else:
        output_dir = os.path.join(args.anthology_dir, "pdf", collection_id[0],
                                  collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml",
                            f"{collection_id}.xml")
    tree = ET.parse(xml_file)
    if paper_id == "0":
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not args.dry_run:
            # Update the URL hash on the <url> tag
            url = paper.find("./url")
            if url is not None:
                url.attrib["hash"] = checksum

            if not args.erratum and revno == 2:
                url_node = paper.find("./url")
                if url_node is None:
                    print(
                        f"-> FATAL: {args.anthology_id} has no <url> to back up",
                        file=sys.stderr,
                    )
                    sys.exit(1)
                current_version_url = infer_url(url_node.text) + ".pdf"

                # Download original file
                # There are no versioned files the first time around, so create the first one
                # (essentially backing up the original version)
                revised_file_v1_path = os.path.join(
                    output_dir, f"{args.anthology_id}{change_letter}1.pdf")

                download_file(current_version_url, revised_file_v1_path)
                validate_file_type(revised_file_v1_path)

                with open(revised_file_v1_path, "rb") as f:
                    old_checksum = compute_hash(f.read())

                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{args.anthology_id}{change_letter}1",
                        "hash": old_checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                args.explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{args.anthology_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": args.date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)

    else:
        print(
            f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    if args.path.startswith("http"):
        os.remove(input_file_path)
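# validate_file_type() is called twice above but not shown. A plausible
# minimal sketch (the real implementation is an assumption): check the PDF
# magic number and bail out on anything else.
import sys

def validate_file_type(path):
    with open(path, "rb") as f:
        if not f.read(5).startswith(b"%PDF-"):
            print(f"FATAL: {path} does not look like a PDF", file=sys.stderr)
            sys.exit(1)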
Code example #20
def main(volumes):

    formatter = MarkupFormatter()

    ## Assemble container
    doi_batch = make_simple_element(
        "doi_batch",
        attrib={
            "xmlns": "http://www.crossref.org/schema/4.4.1",
            "{http://www.w3.org/2001/XMLSchema-instance}schemaLocation":
            "http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd",
            "version": "4.4.1",
        },
        namespaces={"xsi": "http://www.w3.org/2001/XMLSchema-instance"},
    )
    new_volume = etree.ElementTree(doi_batch)

    ## Assemble head
    head = make_simple_element("head", parent=new_volume.getroot())
    dbi = make_simple_element("doi_batch_id",
                              text=str(int(time.time())),
                              parent=head)

    timestamp = make_simple_element("timestamp",
                                    text=str(int(time.time())),
                                    parent=head)

    depositor = make_simple_element("depositor", parent=head)
    depositor_name = make_simple_element("depositor_name",
                                         text=DEPOSITOR_NAME,
                                         parent=depositor)
    email_address = make_simple_element("email_address",
                                        text=EMAIL_ADDRESS,
                                        parent=depositor)

    registrant = make_simple_element("registrant",
                                     text=REGISTRANT,
                                     parent=head)

    ## Assemble body
    body = make_simple_element("body", parent=new_volume.getroot())

    year = ""
    start_month = ""
    end_month = ""

    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)

        collection_file = os.path.join(os.path.dirname(sys.argv[0]), "..",
                                       "data", "xml", f"{collection_id}.xml")
        tree = etree.parse(collection_file)

        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        ## Assemble frontmatter
        c = make_simple_element("conference", parent=body)
        contribs = make_simple_element("contributors", parent=c)
        editor_index = 0

        meta = v.find("./meta")
        for tag in meta:
            if tag.tag == "year":
                year = tag.text
            elif tag.tag == "month":
                month = tag.text
                try:
                    start_month = MONTH_HASH[re.split("[-–]", month)[0]]
                    end_month = MONTH_HASH[re.split("[-–]", month)[1]]
                except IndexError as e:  # only one month
                    start_month = MONTH_HASH[month]
                    end_month = MONTH_HASH[month]
                except Exception as e:
                    print(
                        f"FATAL: can't parse month {month} in {full_volume_id}",
                        file=sys.stderr,
                    )
                    sys.exit(1)
            elif tag.tag == "url":
                url = tag.text
            elif tag.tag == "booktitle":
                booktitle = formatter.as_text(tag)
            elif tag.tag == "address":
                address = tag.text
            elif tag.tag == "publisher":
                publisher = tag.text
            elif tag.tag == "editor":
                pn = make_simple_element(
                    "person_name",
                    parent=contribs,
                    attrib={
                        "contributor_role": "chair",
                        "sequence":
                        "first" if editor_index == 0 else "additional",
                    },
                )
                editor_index += 1

                for name_part in tag:
                    if name_part.tag == "first":
                        gn = make_simple_element("given_name",
                                                 parent=pn,
                                                 text=name_part.text)
                    elif name_part.tag == "last":
                        sn = make_simple_element("surname",
                                                 text=name_part.text,
                                                 parent=pn)

        # Assemble Event Metadata
        em = make_simple_element("event_metadata", parent=c)
        cn = make_simple_element("conference_name", parent=em, text=booktitle)
        cl = make_simple_element("conference_location",
                                 parent=em,
                                 text=address)
        cd = make_simple_element(
            "conference_date",
            parent=em,
            attrib={
                "start_year": year,
                "end_year": year,
                "start_month": start_month,
                "end_month": end_month,
            },
        )

        # Assemble Proceedings Metadata
        pm = make_simple_element("proceedings_metadata",
                                 parent=c,
                                 attrib={"language": "en"})
        pt = make_simple_element("proceedings_title",
                                 parent=pm,
                                 text=booktitle)
        p = make_simple_element("publisher", parent=pm)
        pn = make_simple_element("publisher_name", parent=p, text=publisher)
        pp = make_simple_element("publisher_place",
                                 parent=p,
                                 text=PUBLISHER_PLACE)
        pd = make_simple_element("publication_date", parent=pm)
        y = make_simple_element("year", parent=pd, text=year)
        noisbn = make_simple_element("noisbn",
                                     parent=pm,
                                     attrib={"reason": "simple_series"})

        # DOI assignation data
        dd = make_simple_element("doi_data", parent=pm)
        doi = make_simple_element("doi", parent=dd, text=DOI_PREFIX + url)
        resource = make_simple_element("resource",
                                       parent=dd,
                                       text=ANTHOLOGY_URL.format(url))

        for paper in v.findall("./paper"):
            ## Individual Paper Data

            # TODO: this is not future-proof, should use anthology.util library functions
            aa_id = ""
            if len(url) == 6:
                aa_id = "{:02d}".format(int(paper.attrib["id"]))
            else:
                if len(url) == 5:
                    aa_id = "{:03d}".format(int(paper.attrib["id"]))

            cp = make_simple_element("conference_paper", parent=c)

            # contributors
            contribs = make_simple_element("contributors", parent=cp)
            author_index = 0
            for author in paper.findall("./author"):
                pn = make_simple_element(
                    "person_name",
                    parent=contribs,
                    attrib={
                        "contributor_role": "author",
                        "sequence":
                        "first" if author_index == 0 else "additional",
                    },
                )
                author_index += 1

                for name_part in author:
                    if name_part.tag == "first":
                        gn = make_simple_element("given_name",
                                                 parent=pn,
                                                 text=name_part.text)
                    elif name_part.tag == "last":
                        sn = make_simple_element("surname",
                                                 text=name_part.text,
                                                 parent=pn)

            for title in paper.iter(tag="title"):
                o_titles = make_simple_element("titles", parent=cp)
                o_title = make_simple_element("title",
                                              parent=o_titles,
                                              text=formatter.as_text(title))

            pd = make_simple_element("publication_date", parent=cp)
            o_year = make_simple_element("year", parent=pd)
            o_year.text = year

            for pages in paper.iter(tag="pages"):
                o_pages = make_simple_element("pages", parent=cp)
                fp = make_simple_element("first_page", parent=o_pages)
                lp = make_simple_element("last_page", parent=o_pages)
                try:
                    fp.text = re.split("[-–]", pages.text)[0]
                    lp.text = re.split("[-–]", pages.text)[1]
                except IndexError as e:  # only one page
                    fp.text = pages.text
                    lp.text = pages.text

            # DOI assignation data
            dd = make_simple_element("doi_data", parent=cp)
            doi = make_simple_element("doi",
                                      parent=dd,
                                      text=DOI_PREFIX + url + aa_id)
            resource = make_simple_element("resource",
                                           parent=dd,
                                           text=ANTHOLOGY_URL.format(url +
                                                                     aa_id))

    print(
        etree.tostring(
            new_volume,
            pretty_print=True,
            encoding="UTF-8",
            xml_declaration=True,
            with_tail=True,
        ).decode("utf-8"))
Code example #21
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    # Build list of volumes, confirm uniqueness
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))
        meta["path"] = proceedings

        meta["collection_id"] = collection_id = (meta["year"] + "." +
                                                 meta["abbrev"].lower())
        volume_name = meta["volume_name"]
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print(f"Error: duplicate volume ID {volume_full_id}",
                  file=sys.stderr)
            sys.exit(1)

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume_name"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        print(f"VOLUME: {volume}")

        # copy the book
        book_src_filename = meta["abbrev"] + "-" + meta["year"]
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") +
                ".pdf")

            log(f"Copying {book_src_path} -> {book_dest_path}", args.dry_run)
            if not args.dry_run:
                shutil.copyfile(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # names are {abbrev}{number}.pdf
            abbrev = meta["abbrev"]
            match = re.match(rf"{abbrev}(\d+)\.pdf", pdf_file)

            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir,
                    f"{collection_id}-{volume_name}.{paper_num}.pdf")
                log(
                    f"Copying [{paper_id_full}] {pdf_src_path} -> {pdf_dest_path}",
                    args.dry_run,
                )
                if not args.dry_run:
                    shutil.copyfile(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir,
                                                collection_id)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(
                    os.path.join(root_path, "additional")):
                attachment_file_path = os.path.join(root_path, "additional",
                                                    attachment_file)
                match = re.match(rf"{abbrev}(\d+)_(\w+)\.(\w+)",
                                 attachment_file)
                if match is not None:
                    paper_num, type_, ext = match.groups()
                    paper_num = int(paper_num)

                    file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                    dest_path = os.path.join(attachments_dest_dir, file_name)
                    log(f"Copying {attachment_file} -> {dest_path}",
                        args.dry_run)
                    if not args.dry_run:
                        shutil.copyfile(attachment_file_path, dest_path)

                    collections[collection_id][volume_name][paper_num][
                        "attachments"].append((dest_path, type_))

    people = AnthologyIndex(None,
                            srcdir=os.path.join(os.path.dirname(sys.argv[0]),
                                                "..", "data"))

    for collection_id, collection in collections.items():
        collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                       f"{collection_id}.xml")
        root_node = make_simple_element("collection",
                                        attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element("volume",
                                              attrib={"id": volume_id},
                                              parent=root_node)
            meta = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta.append(title_node)
                    for editor in paper_node.findall("editor"):
                        meta.append(editor)
                    meta.append(paper_node.find("publisher"))
                    meta.append(paper_node.find("address"))
                    meta.append(paper_node.find("month"))
                    meta.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            parent=meta)

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields
                    for child in paper_node:
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                for attachment in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=attachment.path,
                        attrib={
                            "type": attachment.type,
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

        # Normalize
        for paper in root_node.findall(".//paper"):
            for oldnode in paper:
                normalize(oldnode, informat="latex")

        # Ensure names are properly identified
        ambiguous = {}
        for paper in root_node.findall(".//paper"):
            anth_id = build_anthology_id(collection_id,
                                         paper.getparent().attrib["id"],
                                         paper.attrib["id"])

            for node in chain(paper.findall("author"),
                              paper.findall("editor")):
                name = PersonName.from_element(node)
                ids = people.get_ids(name)
                if len(ids) > 1:
                    print(
                        f"WARNING ({anth_id}): ambiguous author {name}, defaulting to first of {ids}"
                    )
                    ambiguous[anth_id] = (name, ids)
                    node.attrib["id"] = ids[0]

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(collection_file,
                   encoding="UTF-8",
                   xml_declaration=True,
                   with_tail=True)
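# read_meta() (used at the top of main) parses the ACLPUB "meta" file for a
# proceedings directory. A minimal sketch, assuming one "key value" pair per
# line (the real format and helper are assumptions):
def read_meta(path):
    meta = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line and not line.startswith("#"):
                key, _, value = line.partition(" ")
                meta[key] = value.strip()
    return meta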
Code example #22
File: ingest.py Project: namrathaurs/acl-anthology
def bib2xml(bibfilename, anthology_id):
    """
    Moved here from ACLPUB's anthology_xml.py script.
    """

    fields = [
        'title',
        'author',
        'editor',
        'booktitle',
        'month',
        'year',
        'address',
        'publisher',
        'pages',
        'abstract',
        'url',
        'doi',
        'language',
    ]

    collection_id, volume_name, paper_no = deconstruct_anthology_id(
        anthology_id)
    if paper_no == '':
        return  # skip the master bib file; we only process the individual files

    bibdata = read_bibtex(bibfilename)
    if len(bibdata.entries) != 1:
        log(f"more than one entry in {bibfilename}")
    bibkey, bibentry = bibdata.entries.items()[0]
    if len(bibentry.fields) == 0:
        log(f"parsing bib of paper {paper_no} failed")
        sys.exit(1)

    paper = make_simple_element("paper", attrib={"id": paper_no})
    for field in list(bibentry.fields) + list(bibentry.persons):
        if field not in fields:
            log(f"unknown field {field}")

    for field in fields:
        if field in ['author', 'editor']:
            if field in bibentry.persons:
                for person in bibentry.persons[field]:
                    first_text = ' '.join(person.bibtex_first_names)
                    last_text = ' '.join(person.prelast_names +
                                         person.last_names)
                    if person.lineage_names:
                        last_text += ', ' + ' '.join(person.lineage_names)

                    # Don't distinguish between authors that have only a first name
                    # vs. authors that have only a last name; always make it a last name.
                    if last_text.strip() in [
                            '',
                            '-',
                    ]:  # Some START users have '-' for null
                        last_text = first_text
                        first_text = ''

                    name_node = make_simple_element(field, parent=paper)
                    make_simple_element("first", first_text, parent=name_node)
                    make_simple_element("last", last_text, parent=name_node)
        else:
            if field == 'url':
                value = f"{anthology_id}"
            elif field in bibentry.fields:
                value = bibentry.fields[field]
            elif field == 'bibtype':
                value = bibentry.type
            elif field == 'bibkey':
                value = bibkey
            else:
                continue

            make_simple_element(field, text=value, parent=paper)

    return paper
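# make_simple_element() appears throughout these examples. A sketch
# consistent with the call sites above (the Anthology's real utility may
# handle more cases, e.g. sorting attributes):
from lxml import etree

def make_simple_element(tag, text=None, attrib=None, parent=None,
                        namespaces=None):
    el = (etree.SubElement(parent, tag, nsmap=namespaces)
          if parent is not None else etree.Element(tag, nsmap=namespaces))
    for key, value in (attrib or {}).items():
        el.attrib[key] = value
    if text is not None:
        el.text = str(text)
    return el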
Code example #23
def add_revision(anth_id,
                 pdf_path,
                 explanation,
                 change_type="revision",
                 dry_run=True,
                 date=None):
    """
    Takes an Anthology ID. It then adds a revision to the Anthology XML,
    updating and writing the XML file, and copies the PDFs into place.
    For PDFs, the revised PDF is saved to {anth_id}.pdf and {anth_id}v{version}.pdf.
    For the first revision, we first copy {anth_id}.pdf to {anth_id}v1.pdf.
    """
    if date is None:
        now = datetime.now()
        date = f"{now.year}-{now.month:02d}-{now.day:02d}"

    def maybe_copy(file_from, file_to):
        if not dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    # The new version
    revno = None

    change_letter = "e" if change_type == "erratum" else "v"

    checksum = compute_hash_from_file(pdf_path)

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
    output_dir = get_pdf_dir(anth_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{anth_id}.pdf")

    # Update XML
    xml_file = get_xml_file(anth_id)
    collection_id, volume_id, paper_id = deconstruct_anthology_id(anth_id)
    tree = ET.parse(xml_file)
    if paper_id == "0":
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if change_type == "erratum" else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not dry_run:
            # Update the URL hash on the <url> tag
            url = paper.find("./url")
            if url is not None:
                url.attrib["hash"] = checksum

            if change_type == "revision" and revno == 2:
                if paper.find("./url") is not None:
                    current_version_url = infer_url(
                        paper.find("./url").text) + ".pdf"

                # Download original file
                # There are no versioned files the first time around, so create the first one
                # (essentially backing up the original version)
                revised_file_v1_path = os.path.join(
                    output_dir, f"{anth_id}{change_letter}1.pdf")

                retrieve_url(current_version_url, revised_file_v1_path)
                validate_file_type(revised_file_v1_path)

                old_checksum = compute_hash_from_file(revised_file_v1_path)

                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{anth_id}{change_letter}1",
                        "hash": old_checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{anth_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)

    else:
        print(
            f"-> FATAL: paper ID {anth_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    revised_file_versioned_path = os.path.join(
        output_dir, f"{anth_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(pdf_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if change_type == "revision":
        maybe_copy(pdf_path, canonical_path)
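# get_pdf_dir() wraps the storage-layout rules described in the comments
# above. A hedged sketch (anthology_files_root is a hypothetical config
# value; the real helper may resolve paths differently):
anthology_files_root = "/path/to/anthology-files"

def get_pdf_dir(anth_id):
    collection_id, _, _ = deconstruct_anthology_id(anth_id)
    if is_newstyle_id(anth_id):
        # e.g., anthology-files/pdf/2020.acl/
        venue_name = collection_id.split(".")[1]
        return os.path.join(anthology_files_root, "pdf", venue_name)
    # e.g., anthology-files/pdf/P/P19/
    return os.path.join(anthology_files_root, "pdf", collection_id[0],
                        collection_id)

# Example call (arguments are hypothetical):
# add_revision("2020.acl-main.1", "/tmp/revised.pdf",
#              "Corrected author affiliation", dry_run=False)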
Code example #24
File: ingest.py Project: sjmielke/acl-anthology
    # Normalize
    for paper in root_being_added.findall(".//paper"):
        for oldnode in paper:
            normalize(oldnode, informat="latex")

    # Ingest each volume.
    # First, find the XML file.
    collection_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data",
                                   "xml", f"{collection_id}.xml")

    if os.path.exists(collection_file):
        existing_tree = etree.parse(collection_file)
    else:
        existing_tree = etree.ElementTree(
            make_simple_element("collection", attrib={"id": collection_id}))

    # Insert each volume
    for i, new_volume in enumerate(root_being_added.findall("volume")):
        new_volume_id = int(new_volume.attrib["id"])
        existing_volume = existing_tree.getroot().find(
            f"./volume[@id='{new_volume_id}']")
        if existing_volume is None:
            new_volume.attrib["ingest-date"] = args.ingest_date

            # Find the insertion point among the other volumes
            insertion_point = 0
            for i, volume in enumerate(existing_tree.getroot()):
                if new_volume_id < int(volume.attrib["id"]):
                    break
                insertion_point = i + 1
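            # Presumably followed by inserting the volume at the computed
            # position (a hedged sketch of the likely next step):
            #   existing_tree.getroot().insert(insertion_point, new_volume)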
Code example #25
def main(args):
    anth = anthology.Anthology(importdir=os.path.join(args.anthology, "data"))
    splitter = NameSplitter(anth)

    paper_nums = {}
    venue = "lilt"
    prev_year = None
    prev_volume = None
    for row in csv.DictReader(args.tsv_file, delimiter='\t'):
        year = row.get("year")
        month = row.get("month")
        issue = row.get("issue#", "")
        abstract = row.get("abstract")
        collection_id = f"{year}.lilt"
        if year != prev_year:
            if prev_year is not None:
                dump_collection(
                    tree,
                    os.path.join(args.anthology, "data", "xml",
                                 f"{prev_year}.lilt.xml"),
                )

            tree = etree.ElementTree(
                make_simple_element("collection",
                                    attrib={"id": collection_id}))
            root = tree.getroot()
        prev_year = year

        volume_name = row.get("Volume#")
        if volume_name != prev_volume:
            volume = make_simple_element("volume",
                                         attrib={"id": volume_name},
                                         parent=root)
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row.get("Booktitle"), parent=meta)
            make_simple_element("publisher", "CSLI Publications", parent=meta)
            make_simple_element("year", year, parent=meta)
            if month:
                make_simple_element("month", month, parent=meta)

        paper_num = paper_nums[volume_name] = paper_nums.get(volume_name,
                                                             0) + 1

        prev_volume = volume_name

        paper = make_simple_element("paper",
                                    attrib={"id": str(paper_num)},
                                    parent=volume)
        paper_id = f"{collection_id}-{volume_name}.{paper_num}"
        make_simple_element("title", row.get("title"), parent=paper)
        authors = row.get("authors")
        for author_name in authors.split(" and "):
            author = make_simple_element("author", parent=paper)
            surname, givenname = splitter.best_split(author_name)
            make_simple_element("first", givenname, parent=author)
            make_simple_element("last", surname, parent=author)

        if abstract != "":
            make_simple_element("abstract", abstract, parent=paper)
        if issue != "":
            make_simple_element("issue", issue, parent=paper)

        for node in paper:
            normalize(node, "latex")

        dest_dir = f"{args.anthology_files_path}/lilt"
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)

        source_path = os.path.join(
            "pdf",
            row.get("PDF").replace("\\", "/").replace("../", ""))
        if os.path.exists(source_path):
            dest_path = os.path.join(
                dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf")
            print(f"Copying {source_path} to {dest_path}", file=sys.stderr)
            shutil.copy(source_path, dest_path)
            os.chmod(dest_path, 0o644)
            checksum = compute_hash_from_file(dest_path)
            make_simple_element("url",
                                paper_id,
                                attrib={"hash": checksum},
                                parent=paper)

    dump_collection(
        tree,
        os.path.join(args.anthology, "data", "xml", f"{collection_id}.xml"))