def maybe_copy(source_path, dest_path):
    """Copy ``source_path`` over ``dest_path`` when the contents differ.

    The copy is skipped only when the destination already exists *and*
    hashes identically to the source.  Uses the module-level ``args`` for
    the dry-run flag that is forwarded to ``log``.
    """
    if os.path.exists(dest_path):
        # Destination present: compare content hashes and bail out if equal.
        if compute_hash_from_file(source_path) == compute_hash_from_file(dest_path):
            return
    log(f"Copying {source_path} -> {dest_path}", args.dry_run)
    shutil.copyfile(source_path, dest_path)
def main(args):
    """Ingest one or more proceedings directories into the Anthology.

    For each proceedings directory listed in ``args.proceedings`` this
    reads its ``meta`` file, copies the book/paper PDFs and attachments
    into the Anthology file tree, then creates or updates the collection
    XML file (one per collection id, e.g. ``2020.acl.xml``), interactively
    disambiguating ambiguous author names along the way.
    """
    # collection_id -> volume name -> {paper_num -> paper info dict}
    collections = defaultdict(OrderedDict)
    # volume_full_id -> meta dict for that volume
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
    venue_keys = [
        venue["slug"].lower()
        for _, venue in VenueIndex(srcdir=anthology_datadir).items()
    ]

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))
        venue_name = meta["abbrev"].lower()

        if venue_name not in venue_keys:
            unseen_venues.append(meta["abbrev"])

        meta["path"] = proceedings
        meta["collection_id"] = collection_id = (meta["year"] + "." +
                                                 meta["abbrev"].lower())
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            # NOTE(review): duplicate volume detected, but the message is
            # truncated and ingestion continues anyway — confirm intent.
            print("Error: ")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Make sure all venues exist
    if len(unseen_venues) > 0:
        print("FATAL: The following venue(s) don't exist in venues.yaml")
        for venue in unseen_venues:
            print(f"- {venue}")
        print("Please create entries for them and re-ingest.")
        sys.exit(1)

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        # copy the book (whole-proceedings PDF), if the assembler produced one
        book_src_filename = meta["abbrev"] + "-" + year
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}")
                + ".pdf")
            if not args.dry_run and not os.path.exists(book_dest_path):
                log(f"Copying {book_src_path} -> {book_dest_path}",
                    args.dry_run)
                shutil.copyfile(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # names are {abbrev}{number}.pdf
            match = re.match(rf".*\.(\d+)\.pdf", pdf_file)
            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir,
                    f"{collection_id}-{volume_name}.{paper_num}.pdf")
                if not args.dry_run and not os.path.exists(pdf_dest_path):
                    log(f"Copying {pdf_src_path} -> {pdf_dest_path}",
                        args.dry_run)
                    shutil.copyfile(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir,
                                                venue_name)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(
                    os.path.join(root_path, "additional")):
                attachment_file_path = os.path.join(root_path, "additional",
                                                    attachment_file)
                # attachment names look like {year}.{venue}-{vol}.{num}_{type}.{ext}
                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$",
                    attachment_file)
                if match is None:
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}",
                        args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num][
                    "attachments"].append((dest_path, type_))

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def disambiguate_name(node, anth_id):
        # Interactively resolve an ambiguous author/editor name to one of
        # the candidate person ids known to the AnthologyIndex.
        name = PersonName.from_element(node)
        ids = people.get_ids(name)

        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; Please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))

            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        # Newly added volumes, so we can normalize and name-disambig later
        newly_added_volumes = []

        collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                       f"{collection_id}.xml")
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection",
                                            attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume",
                attrib={
                    "id": volume_id,
                    "ingest-date": args.ingest_date
                },
            )

            # Replace the existing one if present
            existing_volume_node = root_node.find(
                f"./volume[@id='{volume_id}']")
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                # for/else: no matching volume was found, so append
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # paper 0 is the volume-level entry: create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    # authors of the volume entry become <editor>s of the volume
                    for author_or_editor in chain(
                            paper_node.findall("./author"),
                            paper_node.findall("./editor")):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    # NOTE(review): book_dest_path/volume_name carry over from
                    # the *last* iteration of the copying loop above — this
                    # assumes a single volume per run; verify for multi-volume
                    # ingests.
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={
                                "hash":
                                compute_hash_from_file(book_dest_path)
                            },
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields duplicated from the volume meta
                    for child in paper_node:
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                for name_node in chain(paper_node.findall("./author"),
                                       paper_node.findall("./editor")):
                    disambiguate_name(name_node, paper_id_full)

            # Other data from the meta file
            if "isbn" in meta:
                make_simple_element("isbn", meta["isbn"], parent=meta_node)

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(collection_file,
                   encoding="UTF-8",
                   xml_declaration=True,
                   with_tail=True)
def main(args):
    """Ingest a single volume described by a TSV file into the Anthology.

    Builds (or replaces) the ``<volume>`` element for
    ``{args.year}.{args.venue}-{args.volume}`` in the collection XML,
    downloading the proceedings/paper PDFs and optional presentation
    attachments as it goes, then writes the XML file back to disk.
    """
    year = args.year
    venue = args.venue
    volume_id = args.volume
    collection_id = f"{year}.{venue}"

    splitter = NameSplitter(anthology_dir=args.anthology_dir)

    collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                   f"{collection_id}.xml")
    if os.path.exists(collection_file):
        tree = etree.parse(collection_file)
    else:
        tree = etree.ElementTree(
            make_simple_element("collection", attrib={"id": collection_id}))

    now = datetime.now()
    today = f"{now.year}-{now.month:02d}-{now.day:02d}"

    # Remove any existing volume with this id so it is fully replaced.
    volume_node = tree.getroot().find(f"./volume[@id='{volume_id}']")
    if volume_node is not None:
        tree.getroot().remove(volume_node)

    volume = make_simple_element("volume",
                                 attrib={
                                     "id": volume_id,
                                     "ingest-date": today
                                 },
                                 parent=tree.getroot())

    # NOTE(review): this creates a directory named after the collection id
    # in the current working directory — confirm that is intentional.
    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    # Create entries for all the papers
    for paperid, row in enumerate(
            csv.DictReader(args.tsv_file, delimiter=args.delimiter)):
        pages = row.get("pages", None)

        if paperid == 0:
            # First row defines the volume <meta>; its authors are the editors.
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row["booktitle"], parent=meta)
            make_simple_element("publisher", row["publisher"], parent=meta)
            make_simple_element("address", row["address"], parent=meta)
            make_simple_element("month", row["month"], parent=meta)
            make_simple_element("year", year, parent=meta)

            editors = row["author"].split(" and ")
            # Blank out so the loop below does not also add them as authors.
            row["author"] = ""
            for editor_name in editors:
                editor = make_simple_element("editor", parent=meta)
                surname, givenname = splitter.best_split(editor_name)
                make_simple_element("first", givenname, parent=editor)
                make_simple_element("last", surname, parent=editor)

            # volume PDF
            # NOTE(review): proceedings_pdf is only bound inside this
            # paperid == 0 branch but is read by extract_pages() below for
            # later rows — relies on the first row always being processed.
            proceedings_pdf = args.proceedings_pdf
            if proceedings_pdf is not None:
                volume_anth_id = f"{collection_id}-{volume_id}"
                pdf_local_path = os.path.join(args.anthology_files_path,
                                              venue, f"{volume_anth_id}.pdf")
                retrieve_url(proceedings_pdf, pdf_local_path)
                checksum = compute_hash_from_file(pdf_local_path)
                make_simple_element("url",
                                    volume_anth_id,
                                    attrib={"hash": checksum},
                                    parent=meta)
                proceedings_pdf = pdf_local_path

        title_text = row["title"]

        # The first row might be front matter (needs a special name)
        if paperid == 0 and title_text.lower() in [
                "frontmatter", "front matter"
        ]:
            paper = make_simple_element("frontmatter", parent=volume)
        else:
            if paperid == 0:
                # Not frontmatter, so paper 1
                paperid += 1
            paper = make_simple_element("paper",
                                        attrib={"id": str(paperid)},
                                        parent=volume)
            # Only make the title for not-the-frontmatter
            make_simple_element("title", title_text, parent=paper)

        author_list = row["author"].split(" and ")
        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            surname, givenname = splitter.best_split(author_name)
            make_simple_element("first", givenname, parent=author)
            make_simple_element("last", surname, parent=author)

        if pages is not None and pages != "":
            make_simple_element("pages", pages, parent=paper)

        # Find the PDF, either listed directly, or extracted from the proceedings PDF
        anth_id = f"{collection_id}-{volume_id}.{paperid}"

        pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                      f"{anth_id}.pdf")
        url = None
        if "pdf" in row and row["pdf"] != "":
            if retrieve_url(row["pdf"], pdf_local_path):
                url = anth_id
            else:
                print("Can't find", row["pdf"])
        elif "pages in pdf" in row:
            # NOTE(review): the page range is read from row["pages"], not
            # row["pages in pdf"] — confirm which column holds the range.
            pdf_pages = row["pages"]
            extract_pages(proceedings_pdf, pdf_pages, pdf_local_path)
            url = anth_id

        if url is not None:
            checksum = compute_hash_from_file(pdf_local_path)
            make_simple_element("url",
                                url,
                                attrib={"hash": checksum},
                                parent=paper)

        if "abstract" in row and row["abstract"] != "":
            make_simple_element("abstract", row["abstract"], parent=paper)

        if "presentation" in row:
            url = row["presentation"]
            if url is not None and url != "" and url != "None":
                extension = row["presentation"].split(".")[-1]
                name = f"{anth_id}.Presentation.{extension}"
                local_path = os.path.join(
                    args.anthology_files_path,
                    "..",
                    "attachments",
                    venue,
                    name,
                )
                if retrieve_url(row["presentation"], local_path):
                    make_simple_element(
                        "attachment",
                        name,
                        attrib={
                            "type": "presentation",
                            "hash": compute_hash_from_file(local_path),
                        },
                        parent=paper,
                    )

        # Normalize
        for node in paper:
            normalize(node, informat="latex")

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    tree.write(collection_file,
               encoding="UTF-8",
               xml_declaration=True,
               with_tail=True)
# Check if the paper is already present in the volume doi_text = papernode.find("./doi").text doi_node = collection.xpath(f'.//doi[text()="{doi_text}"]') if len(doi_node): logging.info( f"Skipping existing paper {anth_id}/{doi_text} with title {papernode.find('title').text}" ) continue papernode.attrib["id"] = f"{paper_id}" destination = pdf_destination / f"{anth_id}.pdf" print(f"Copying {pdf_path} to {destination}") shutil.copyfile(pdf_path, destination) checksum = compute_hash_from_file(pdf_path) url_text = anth_id url = etree.Element("url") url.attrib["hash"] = checksum url.text = url_text papernode.append(url) # Normalize for oldnode in papernode: normalize(oldnode, informat="latex") volume.append(papernode) paper_id += 1 indent(collection) # from anthology.utils
def main(args):
    """Ingest proceedings directories into the Anthology (newer variant).

    Like the other ingest entry points, this copies PDFs/attachments into
    the Anthology file tree and creates or updates the collection XML.
    This variant additionally: resolves venue slugs through VenueIndex and
    auto-creates unknown venues, prints SIG-file reminders, skips dot
    files, fixes ALL-CAPS/lowercase author names, and converts <language>
    names to ISO 639 part-3 codes.
    """
    # collection_id -> volume name -> {paper_num -> paper info dict}
    collections = defaultdict(OrderedDict)
    # volume_full_id -> meta dict for that volume
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")

    venue_index = VenueIndex(srcdir=anthology_datadir)
    venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()]

    # NOTE(review): sig_index is constructed but never used below.
    sig_index = SIGIndex(srcdir=anthology_datadir)

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        venue_abbrev = meta["abbrev"]
        venue_slug = venue_index.get_slug(venue_abbrev)

        # Guard against acronyms like "ACL2021" produced by the assembler.
        if str(datetime.now().year) in venue_abbrev:
            print(
                f"Fatal: Venue assembler put year in acronym: '{venue_abbrev}'"
            )
            sys.exit(1)

        if venue_slug not in venue_keys:
            unseen_venues.append((venue_slug, venue_abbrev, meta["title"]))

        meta["path"] = proceedings
        meta["collection_id"] = collection_id = meta["year"] + "." + venue_slug
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            # NOTE(review): duplicate volume detected, but the message is
            # truncated and ingestion continues anyway — confirm intent.
            print("Error: ")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

        # Remind the operator to register the volume with its SIG manually.
        if "sig" in meta:
            print(
                f"Add this line to {anthology_datadir}/sigs/{meta['sig'].lower()}.yaml:"
            )
            print(f" - {meta['year']}:")
            print(f" - {volume_full_id} # {meta['booktitle']}")

    # Make sure all venues exist
    if len(unseen_venues) > 0:
        for venue in unseen_venues:
            slug, abbrev, title = venue
            print(f"Creating venue '{abbrev}' ({title})")
            venue_index.add_venue(abbrev, title)
        venue_index.dump(directory=anthology_datadir)

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        # copy the book (whole-proceedings PDF), if present
        book_src_filename = f'{year}.{meta["abbrev"]}-{volume_name}.pdf'
        book_src_path = os.path.join(root_path, book_src_filename)
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}")
                + ".pdf")
            if not args.dry_run:
                maybe_copy(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # Skip . files
            if os.path.basename(pdf_file).startswith("."):
                continue

            # names are {abbrev}{number}.pdf
            match = re.match(rf".*\.(\d+)\.pdf", pdf_file)
            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir,
                    f"{collection_id}-{volume_name}.{paper_num}.pdf")
                if not args.dry_run:
                    maybe_copy(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir,
                                                venue_name)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(
                    os.path.join(root_path, "additional")):
                if os.path.basename(attachment_file).startswith("."):
                    continue
                attachment_file_path = os.path.join(root_path, "additional",
                                                    attachment_file)
                # attachment names look like {year}.{venue}-{vol}.{num}_{type}.{ext}
                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$",
                    attachment_file)
                if match is None:
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}",
                        args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num][
                    "attachments"].append((dest_path, type_))

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def correct_caps(person, name_node, anth_id):
        """
        Many people submit their names in "ALL CAPS" or "all lowercase".
        Correct this with heuristics.
        """
        name = name_node.text
        if name.islower() or name.isupper():
            # capitalize all parts
            corrected = " ".join(
                list(map(lambda x: x.capitalize(), name.split())))
            print(
                f"-> Correcting capitalization of '{name}' to '{corrected}'",
                file=sys.stderr,
            )
            name_node.text = corrected

    def disambiguate_name(node, anth_id):
        # Interactively resolve an ambiguous author/editor name to one of
        # the candidate person ids known to the AnthologyIndex.
        name = PersonName.from_element(node)
        ids = people.get_ids(name)

        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; Please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))

            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        # Newly added volumes, so we can normalize and name-disambig later
        newly_added_volumes = []

        collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                       f"{collection_id}.xml")
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection",
                                            attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume",
                attrib={
                    "id": volume_id,
                    "ingest-date": args.ingest_date
                },
            )

            # Replace the existing one if present
            existing_volume_node = root_node.find(
                f"./volume[@id='{volume_id}']")
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                # for/else: no matching volume was found, so append
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)

                if paper_node.attrib["id"] == "0":
                    # paper 0 is the volume-level entry: create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    # authors of the volume entry become <editor>s of the volume
                    for author_or_editor in chain(
                            paper_node.findall("./author"),
                            paper_node.findall("./editor")):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    # NOTE(review): book_dest_path/volume_name carry over from
                    # the *last* iteration of the copying loop above — this
                    # assumes a single volume per run; verify for multi-volume
                    # ingests.
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={
                                "hash":
                                compute_hash_from_file(book_dest_path)
                            },
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields duplicated from the volume meta
                    for child in paper_node:
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                # Adjust the language tag: map a language *name* to its
                # ISO 639 part-3 code (e.g. "French" -> "fra").
                language_node = paper_node.find("./language")
                if language_node is not None:
                    try:
                        lang = iso639.languages.get(name=language_node.text)
                    except KeyError:
                        raise Exception(
                            f"Can't find language '{language_node.text}'")
                    language_node.text = lang.part3
                    # NOTE(review): debug print left in — remove if noisy.
                    print(language_node.text)

                # Fix author names
                for name_node in chain(paper_node.findall("./author"),
                                       paper_node.findall("./editor")):
                    disambiguate_name(name_node, paper_id_full)
                    person = PersonName.from_element(name_node)
                    for name_part in name_node:
                        correct_caps(person, name_part, paper_id_full)

            # Other data from the meta file
            if "isbn" in meta:
                make_simple_element("isbn", meta["isbn"], parent=meta_node)

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(collection_file,
                   encoding="UTF-8",
                   xml_declaration=True,
                   with_tail=True)
def add_revision(anth_id,
                 pdf_path,
                 explanation,
                 change_type="revision",
                 dry_run=True,
                 date=None):
    """
    Takes an Anthology ID. It then adds a revision to the Anthology XML,
    updating and writing the XML file, and copies the PDFs into place.
    For PDFs, the revised PDF is saved to {anth_id}.pdf and {anth_id}v{version}.pdf.
    For the first revision, we first copy {anth_id}.pdf to {anth_id}v1.pdf.

    :param anth_id: Anthology ID of the paper being revised (e.g. "2020.acl-main.1").
    :param pdf_path: local path of the new (revised) PDF.
    :param explanation: text stored in the new <revision>/<erratum> node.
    :param change_type: "revision" or "erratum"; errata don't replace the
        canonical PDF and use the "e" file suffix instead of "v".
    :param dry_run: when True, only print what would be copied/written.
    :param date: ISO date for the change; defaults to today.
    """
    if date is None:
        now = datetime.now()
        date = f"{now.year}-{now.month:02d}-{now.day:02d}"

    def maybe_copy(file_from, file_to):
        # Copy honoring the dry-run flag; 0o644 so the web server can read it.
        if not dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    # The new version
    revno = None

    change_letter = "e" if change_type == "erratum" else "v"

    checksum = compute_hash_from_file(pdf_path)

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
    output_dir = get_pdf_dir(anth_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{anth_id}.pdf")

    # Update XML
    xml_file = get_xml_file(anth_id)
    collection_id, volume_id, paper_id = deconstruct_anthology_id(anth_id)
    tree = ET.parse(xml_file)
    # paper id "0" denotes the volume front matter
    if paper_id == "0":
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        # Revisions start at 2 because "1" is reserved for the original;
        # errata start at 1.  Existing nodes bump the number past the max.
        revno = 1 if change_type == "erratum" else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not dry_run:
            # Update the URL hash on the <url> tag
            url = paper.find("./url")
            if url is not None:
                url.attrib["hash"] = checksum

            if change_type == "revision" and revno == 2:
                # NOTE(review): if <url> is missing here, current_version_url
                # is unbound when retrieve_url() is called below — confirm
                # a <url> is guaranteed for revised papers.
                if paper.find("./url") is not None:
                    current_version_url = infer_url(
                        paper.find("./url").text) + ".pdf"

                # Download original file
                # There are no versioned files the first time around, so create the first one
                # (essentially backing up the original version)
                revised_file_v1_path = os.path.join(
                    output_dir, f"{anth_id}{change_letter}1.pdf")

                retrieve_url(current_version_url, revised_file_v1_path)
                validate_file_type(revised_file_v1_path)

                old_checksum = compute_hash_from_file(revised_file_v1_path)

                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{anth_id}{change_letter}1",
                        "hash": old_checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{anth_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": date,
                },
                parent=paper,
            )
            indent(tree.getroot())

            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)

    else:
        print(
            f"-> FATAL: paper ID {anth_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    revised_file_versioned_path = os.path.join(
        output_dir, f"{anth_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(pdf_path, revised_file_versioned_path)

    # Copy it over the canonical path (errata leave the canonical PDF alone)
    if change_type == "revision":
        maybe_copy(pdf_path, canonical_path)
def main(args):
    """Ingest proceedings directories into the Anthology (early variant).

    Copies book/paper PDFs and attachments into the Anthology file tree
    and writes a fresh collection XML file per collection id.  Unlike the
    newer variants it overwrites the collection file from scratch and
    auto-picks the first candidate for ambiguous author names.
    """
    # collection_id -> volume name -> {paper_num -> paper info dict}
    collections = defaultdict(OrderedDict)
    # volume_full_id -> meta dict for that volume
    volumes = {}

    # Build list of volumes, confirm uniqueness
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        meta["path"] = proceedings
        meta["collection_id"] = collection_id = (
            meta["year"] + "." + meta["abbrev"].lower()
        )
        volume_name = meta["volume"]
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print("Error: ")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        print(f"VOLUME: {volume}")

        # copy the book (whole-proceedings PDF), if present
        book_src_filename = meta["abbrev"] + "-" + meta["year"]
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf"
            )
            log(f"Copying {book_src_path} -> {book_dest_path}", args.dry_run)
            if not args.dry_run:
                shutil.copyfile(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # names are {abbrev}{number}.pdf
            match = re.match(rf".*\.(\d+)\.pdf", pdf_file)
            if match is None:
                # BUGFIX: this previously printed the undefined name
                # `abbrev` (NameError); report the unmatched file instead.
                print("whoa", pdf_file)
            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf"
                )
                log(
                    f"Copying [{paper_id_full}] from {pdf_src_path} -> {pdf_dest_path}",
                    args.dry_run,
                )
                if not args.dry_run:
                    shutil.copyfile(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir, collection_id)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(os.path.join(root_path, "additional")):
                # BUGFIX: re.match() was called without the string to match
                # and with the undefined name `abbrev` — both crashed at
                # runtime.  Attachment names look like {abbrev}{num}_{type}.{ext}.
                match = re.match(
                    rf"{meta['abbrev']}(\d+)_(\w+)\.(\w+)", attachment_file
                )
                if match is not None:
                    paper_num, type_, ext = match.groups()
                    paper_num = int(paper_num)

                    file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                    dest_path = os.path.join(attachments_dest_dir, file_name)
                    log(f"Copying {attachment_file} -> {dest_path}", args.dry_run)
                    if not args.dry_run:
                        # BUGFIX: the bare directory-entry name is not a valid
                        # source path; join it with its directory.
                        shutil.copyfile(
                            os.path.join(root_path, "additional", attachment_file),
                            dest_path,
                        )
                    # BUGFIX: store (path, type) tuples — the consumer below
                    # needs both (matches the other ingest variants).
                    collections[collection_id][volume_name][paper_num][
                        "attachments"
                    ].append((dest_path, type_))

    people = AnthologyIndex(
        None, srcdir=os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
    )

    for collection_id, collection in collections.items():
        collection_file = os.path.join(
            args.anthology_dir, "data", "xml", f"{collection_id}.xml"
        )
        root_node = make_simple_element("collection", attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume", attrib={"id": volume_id}, parent=root_node
            )
            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # paper 0 is the volume-level entry: create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for editor in paper_node.findall("editor"):
                        meta_node.append(editor)
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={"hash": compute_hash_from_file(book_dest_path)},
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields duplicated from the volume meta
                    for child in paper_node:
                        if child.tag in [
                            "editor",
                            "address",
                            "booktitle",
                            "publisher",
                            "year",
                            "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                # BUGFIX: attachments are (path, type) tuples, not objects
                # with .path/.type attributes; unpack them and use the base
                # name as the element text, as the other ingest variants do.
                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

            # Other data from the meta file
            if "isbn" in meta:
                make_simple_element("isbn", meta["isbn"], parent=meta_node)

        # Normalize
        for paper in root_node.findall(".//paper"):
            for oldnode in paper:
                normalize(oldnode, informat="latex")

            # Ensure names are properly identified
            ambiguous = {}
            anth_id = build_anthology_id(
                collection_id, paper.getparent().attrib["id"], paper.attrib["id"]
            )

            for node in chain(paper.findall("author"), paper.findall("editor")):
                name = PersonName.from_element(node)
                ids = people.get_ids(name)
                if len(ids) > 1:
                    print(
                        f"WARNING ({anth_id}): ambiguous author {name}, defaulting to first of {ids}"
                    )
                    ambiguous[anth_id] = (name, ids)

                    node.attrib["id"] = ids[0]

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(
            collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True
        )
def download_file(self, fname: str, hash: str, type: str) -> None:
    """Mirror one remote file (a "pdf" or an "attachment") locally.

    Skips the download when the local copy already matches ``hash``;
    otherwise downloads to a temp file first and only moves it into place
    when the checksum matches, recording failures in
    ``self.not_downloadable`` / ``self.hash_mismatches``.

    :param fname: remote file name (old- or new-style Anthology id).
    :param hash: expected content hash of the file.
    :param type: "pdf" or "attachment"; anything else aborts the program.
    """
    if type == "pdf":
        remote_url = self.source + "/" + fname
    elif type == "attachment":
        remote_url = self.source + "/attachments/" + fname
    else:
        log.error("unrecognized type: " + type)
        exit(1)

    # Derive the local target path from the file-name style.
    local_target = ""
    match = NEW_ID_RE.match(fname)
    if match:
        local_target = os.path.join(self.to, type, match.groups()[1], fname)
    else:
        match = OLD_ID_RE.match(fname)
        if match:
            local_target = os.path.join(
                self.to,
                type,
                match.groups()[0],
                match.groups()[0] + match.groups()[1],
                fname,
            )
        else:
            log.error("unrecognized format for " + fname)
            exit(1)

    if os.path.exists(local_target):
        existing_hash = anthology_utils.compute_hash_from_file(local_target)
        if existing_hash == hash:
            log.debug(
                "File {} already up to date, not downloading again".format(
                    local_target
                )
            )
            return
        else:
            log.debug("File {} changed, redownloading ...".format(local_target))
    else:
        log.debug("Downloading {} from {} ...".format(local_target, remote_url))

    if self.is_dry_run:
        return

    # mkstemp opens the file but we only need the file name, so close it again.
    # BUGFIX: create the temp file only after the early returns above, so
    # up-to-date files and dry runs no longer leak an empty temp file.
    tmpfd, tmp_target = tempfile.mkstemp(prefix="aclmirrorer_", suffix=".pdf")
    os.close(tmpfd)

    local_path = os.path.dirname(local_target)
    os.makedirs(local_path, exist_ok=True)

    try:
        urlretrieve(remote_url, tmp_target)
    except Exception:  # BUGFIX: was a bare except that also caught SystemExit
        log.error("could not download " + remote_url)
        self.not_downloadable.append(remote_url)
        os.remove(tmp_target)
        return

    new_hash = anthology_utils.compute_hash_from_file(tmp_target)
    if new_hash == hash:
        # all good, store downloaded file in the proper place
        shutil.move(tmp_target, local_target)
    else:
        log.error(
            "Hash mismatch for file {}, downloaded from {}. was {} should be {}".format(
                local_target, remote_url, new_hash, hash
            )
        )
        self.hash_mismatches.append(remote_url)
        # BUGFIX: don't leave the rejected download behind in the temp dir.
        os.remove(tmp_target)
        return
def main(args):
    """Ingest one proceedings volume from a TSV paper list plus a conference
    metadata CSV, producing ``{collection_id}.xml`` under the Anthology's
    ``data/xml`` directory and fetching PDFs/attachments alongside.

    Collection/volume IDs are derived from the TSV file name; volume-level
    metadata comes from the row of ``args.meta_file`` whose conference code
    matches; one <paper> (or <frontmatter>) element is created per TSV row.
    """
    # File name is e.g. "2012.eamt.tsv" -> year="2012", venue="eamt".
    year, venue, _ = os.path.basename(args.tsv_file.name).split(".")

    # Set the volume name from the collection file, or default to 1
    # The file name is either "2012.eamt.tsv" or "2012.eamt-main.tsv".
    # The default volume name is "1".
    if "-" in venue:
        venue, volume_id = venue.split("-")
    else:
        volume_id = "1"
    collection_id = f"{year}.{venue}"

    tree = etree.ElementTree(
        make_simple_element("collection", attrib={"id": collection_id}))

    # Record today's date as the ingest date on the volume.
    now = datetime.now()
    today = f"{now.year}-{now.month:02d}-{now.day:02d}"

    volume = make_simple_element("volume", attrib={
        "id": volume_id,
        "ingest-date": today
    })
    tree.getroot().insert(0, volume)

    # Location of entire-proceedings PDF
    proceedings_pdf = args.proceedings

    # Create the metadata for the paper
    meta = None
    for row in csv.DictReader(args.meta_file, delimiter=args.delimiter):
        current_collection_id = f"{row['Year']}.{row['Conference code']}"
        if current_collection_id == collection_id:
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row["Conference title"], parent=meta)
            make_simple_element("publisher", row["Publisher"], parent=meta)
            make_simple_element("address", row["Location"], parent=meta)
            make_simple_element("month", row["Dates held"], parent=meta)
            make_simple_element("year", row["Year"], parent=meta)

            url = row["URL"]

            # A direct-PDF URL in the metadata row wins over --proceedings;
            # otherwise fall back to an optional "Complete PDF" column.
            if url.endswith(".pdf"):
                if proceedings_pdf:
                    print(
                        "Overriding --proceedings with proceedings PDF found in conference list",
                        file=sys.stderr,
                    )
                proceedings_pdf = url
            elif "Complete PDF" in row and row["Complete PDF"] != "":
                proceedings_pdf = row["Complete PDF"]

            # volume PDF
            if proceedings_pdf is not None:
                volume_anth_id = f"{collection_id}-{volume_id}"
                pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                              f"{volume_anth_id}.pdf")
                retrieve_url(proceedings_pdf, pdf_local_path)
                checksum = compute_hash_from_file(pdf_local_path)
                make_simple_element("url",
                                    volume_anth_id,
                                    attrib={"hash": checksum},
                                    parent=meta)
                # From here on refer to the downloaded local copy (used
                # later for per-paper page extraction).
                proceedings_pdf = pdf_local_path

            # Editors are "Last, First and ..." or "First Last and ...";
            # a "?" marks unknown editors and suppresses the field.
            if row["Editors"] != "" and "?" not in row["Editors"]:
                editors = row["Editors"].split(" and ")
                for editor_name in editors:
                    editor = make_simple_element("editor", parent=meta)
                    if ", " in editor_name:
                        last, first = editor_name.split(", ")
                    else:
                        first, last = (
                            ' '.join(editor_name.split()[:-1]),
                            editor_name.split()[-1],
                        )
                    make_simple_element("first", first, parent=editor)
                    make_simple_element("last", last, parent=editor)
            break
    else:
        # for-else: no metadata row matched the collection id -> fatal.
        print(
            f"Couldn't find conference code {collection_id} in 'Conference code' field of metadata file {args.meta_file.name}",
            file=sys.stderr,
        )
        sys.exit(1)

    paperid = 0
    # Create entries for all the papers
    for row in csv.DictReader(args.tsv_file, delimiter=args.delimiter):
        pages = row.get("Pagenumbers", None)

        title_text = row["Title"]

        # The first row might be front matter (needs a special name)
        if title_text == "Frontmatter" and paperid == 0:
            paper = make_simple_element("frontmatter", parent=volume)
        else:
            paperid += 1
            paper = make_simple_element("paper",
                                        attrib={"id": str(paperid)},
                                        parent=volume)
            # Only make the title for not-the-frontmatter
            make_simple_element("title", title_text, parent=paper)

        author_list = row["Authors"].split(" and ")
        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            if ", " in author_name:
                last, first = author_name.split(", ")
            else:
                first, last = (
                    ' '.join(author_name.split()[:-1]),
                    author_name.split()[-1],
                )
            make_simple_element("first", first, parent=author)
            make_simple_element("last", last, parent=author)

        if pages is not None:
            make_simple_element("pages", pages, parent=paper)

        # Find the PDF, either listed directly, or extracted from the proceedings PDF
        anth_id = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(args.anthology_files_path, venue,
                                      f"{anth_id}.pdf")
        url = None
        if "Pdf" in row and row["Pdf"] != "":
            if retrieve_url(row["Pdf"], pdf_local_path):
                url = anth_id
        elif "pages in pdf" in row:
            # NOTE(review): assumes proceedings_pdf is set (downloaded
            # above or passed via --proceedings) when this column exists.
            pdf_pages = row["pages in pdf"]
            extract_pages(proceedings_pdf, pdf_pages, pdf_local_path)
            url = anth_id

        if url is not None:
            checksum = compute_hash_from_file(pdf_local_path)
            make_simple_element("url", url, attrib={"hash": checksum}, parent=paper)

        if "Abstract" in row:
            make_simple_element("abstract", row["Abstract"], parent=paper)

        # Optional presentation attachment, stored under ../attachments.
        if "Presentation" in row:
            url = row["Presentation"]
            if url is not None and url != "" and url != "None":
                extension = row["Presentation"].split(".")[-1]
                name = f"{anth_id}.Presentation.{extension}"
                local_path = os.path.join(
                    args.anthology_files_path,
                    "..",
                    "attachments",
                    venue,
                    name,
                )
                if retrieve_url(row["Presentation"], local_path):
                    checksum = compute_hash_from_file(local_path)
                    make_simple_element(
                        "attachment",
                        name,
                        attrib={
                            "type": "presentation",
                            "hash": checksum
                        },
                        parent=paper,
                    )

        # Normalize
        for node in paper:
            normalize(node, informat="latex")

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    collection_file = os.path.join(args.anthology, "data", "xml",
                                   f"{collection_id}.xml")
    tree.write(collection_file,
               encoding="UTF-8",
               xml_declaration=True,
               with_tail=True)
def main(args):
    """Ingest LiLT (Linguistic Issues in Language Technology) papers from a
    tab-separated file into per-year Anthology collections.

    Rows are assumed grouped by year and volume: a new <collection> tree is
    started (and the previous one dumped) whenever the year changes, and a
    new <volume> with its <meta> whenever the volume number changes. PDFs
    are copied from a local ``pdf/`` directory into
    ``{args.anthology_files_path}/lilt``.
    """
    anth = anthology.Anthology(importdir=os.path.join(args.anthology, "data"))
    # Used to split author names into surname/given-name parts.
    splitter = NameSplitter(anth)

    # Running paper counter per volume name.
    paper_nums = {}

    venue = "lilt"
    prev_year = None
    prev_volume = None
    for row in csv.DictReader(args.tsv_file, delimiter='\t'):
        year = row.get("year")
        month = row.get("month")
        issue = row.get("issue#", "")
        abstract = row.get("abstract")

        collection_id = f"{year}.lilt"
        if year != prev_year:
            # Year rollover: write out the finished collection (if any)
            # and start a fresh tree for the new year.
            if prev_year is not None:
                dump_collection(
                    tree,
                    os.path.join(args.anthology, "data", "xml",
                                 f"{prev_year}.lilt.xml"),
                )
            tree = etree.ElementTree(
                make_simple_element("collection", attrib={"id": collection_id}))
            root = tree.getroot()
            prev_year = year

        volume_name = row.get("Volume#")
        if volume_name != prev_volume:
            # New volume: create its element and metadata once.
            volume = make_simple_element("volume",
                                         attrib={"id": volume_name},
                                         parent=root)
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row.get("Booktitle"), parent=meta)
            make_simple_element("publisher", "CSLI Publications", parent=meta)
            make_simple_element("year", year, parent=meta)
            if month:
                make_simple_element("month", month, parent=meta)

        # Next sequential paper number within this volume.
        paper_num = paper_nums[volume_name] = paper_nums.get(volume_name, 0) + 1
        prev_volume = volume_name

        paper = make_simple_element("paper",
                                    attrib={"id": str(paper_num)},
                                    parent=volume)
        paper_id = f"{collection_id}-{volume_name}.{paper_num}"

        make_simple_element("title", row.get("title"), parent=paper)

        # Author field uses " and " as separator, e.g. "A. One and B. Two".
        authors = row.get("authors")
        for author_name in authors.split(" and "):
            author = make_simple_element("author", parent=paper)
            surname, givenname = splitter.best_split(author_name)
            make_simple_element("first", givenname, parent=author)
            make_simple_element("last", surname, parent=author)

        if abstract != "":
            make_simple_element("abstract", abstract, parent=paper)

        if issue != "":
            make_simple_element("issue", issue, parent=paper)

        # Normalize LaTeX markup in all child fields.
        for node in paper:
            normalize(node, "latex")

        dest_dir = f"{args.anthology_files_path}/lilt"
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        # PDF paths in the sheet may use backslashes and "../" prefixes;
        # strip those and resolve relative to a local "pdf" directory.
        source_path = os.path.join(
            "pdf",
            row.get("PDF").replace("\\", "/").replace("../", ""))
        if os.path.exists(source_path):
            dest_path = os.path.join(
                dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf")
            shutil.copy(source_path, dest_path)
            print(f"Copying {source_path} to {dest_path}", file=sys.stderr)
            os.chmod(dest_path, 0o644)

            checksum = compute_hash_from_file(dest_path)
            make_simple_element("url",
                                paper_id,
                                attrib={"hash": checksum},
                                parent=paper)

    # Dump the final (last year's) collection.
    dump_collection(
        tree,
        os.path.join(args.anthology, "data", "xml", f"{collection_id}.xml"))