def main(args): print(f"Adding {args.award} to {args.anthology_id}...") collection_id, volume_id, paper_id = deconstruct_anthology_id( args.anthology_id) # Update XML xml_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml") tree = ET.parse(xml_file) paper = tree.getroot().find( f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']") if paper is None: print(f"Error: Can't find paper {args.anthology_id}, quitting") existing_award = paper.find("./award") if existing_award is not None and award.text.lower() == args.award: print( f"Error: Award {args.award} already exists for {args.anthology_id}, quitting" ) make_simple_element("award", args.award, parent=paper) indent(tree.getroot()) tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
def process_volume(anthology_volume):
    collection_id, volume_id, _ = deconstruct_anthology_id(anthology_volume)

    print(f"Attempting to add DOIs for {anthology_volume}", file=sys.stderr)

    # Update XML
    xml_file = os.path.join(
        os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml"
    )
    tree = ET.parse(xml_file)

    formatter = MarkupFormatter()

    num_added = 0
    volume = tree.getroot().find(f"./volume[@id='{volume_id}']")
    if volume is not None:
        volume_booktitle = volume.find("./meta/booktitle")
        volume_title = formatter.as_text(volume_booktitle)
        print(f'-> found existing volume "{volume_title}"', file=sys.stderr)

        # Iterate through all papers
        for paper in chain(volume.find("frontmatter"), volume.findall("paper")):
            added = add_doi(paper, collection_id, volume_id, force=args.force)
            if added:
                num_added += 1
                sleep(1)

        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
        print(
            f"-> added {num_added} DOIs to the XML for collection {collection_id}",
            file=sys.stderr,
        )
    else:
        print(
            f"-> FATAL: volume {anthology_volume} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)
def main(args):
    for xml_file in args.xml_files:
        tree = ET.parse(xml_file)
        for paper in tree.getroot().findall(".//paper"):
            make_simple_element("language", "eng", parent=paper)
        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
def main(args):
    for lineno, line in enumerate(sys.stdin, 1):
        # attachments/D/D15/D15-1272.Attachment.pdf
        tokens = line.rstrip().split("/")
        attachment_file_name = tokens[-1]
        try:
            anth_id, kind, *rest = attachment_file_name.split(".")
        except ValueError:
            print(f"Couldn't parse file {attachment_file_name} into 3 pieces")
            continue

        try:
            collection_id, volume_id, paper_id = deconstruct_anthology_id(anth_id)
        except Exception:
            print(f"[{lineno}] BAD LINE {line.rstrip()}")
            continue  # can't proceed without a parsed ID

        # Update XML
        xml_file = os.path.join(
            os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml"
        )
        tree = ET.parse(xml_file)

        if int(paper_id) == 0:
            paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
        else:
            paper = tree.getroot().find(
                f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']"
            )

        if paper is not None:
            # Check if attachment already exists
            for attachment in paper.findall("attachment"):
                if attachment.text == attachment_file_name:
                    # print(f'-> attachment {attachment_file_name} already exists in the XML', file=sys.stderr)
                    break
            else:
                attachment = ET.Element("attachment")
                attachment.attrib["type"] = kind.lower()
                attachment.text = attachment_file_name
                paper.append(attachment)
                indent(tree.getroot())
                tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
                print(
                    f"-> [{lineno}] added attachment {attachment_file_name} to the XML",
                    file=sys.stderr,
                )
        else:
            print(
                f"-> FATAL: [{lineno}] paper ({anth_id}) not found in the Anthology",
                file=sys.stderr,
            )
            sys.exit(1)
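# The for/else pattern above is what keeps attachments unique: the else
# branch runs only when the loop finishes without hitting `break`.
# A self-contained toy demonstration of the same idiom (names here are
# illustrative only):
def append_if_missing(existing, candidate):
    for item in existing:
        if item == candidate:
            break  # already present; the else branch is skipped
    else:
        existing.append(candidate)
    return existing

assert append_if_missing(["a.pdf"], "a.pdf") == ["a.pdf"]
assert append_if_missing(["a.pdf"], "b.pdf") == ["a.pdf", "b.pdf"]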
def main(args):
    combo_df = combine_tsv(args["tsv_files"])
    combo_df_uniques = combo_df["anthology_id"].apply(split_anth_id).unique()
    for xml in os.listdir(data_dir):
        fname, ext = os.path.splitext(xml)
        if fname in combo_df_uniques.tolist() or fname == "2020.acl":
            tree = et.parse(os.path.join(data_dir, xml))
            df_subset = combo_df[combo_df["anthology_id"].str.startswith(fname)]
            df_subset.apply(add_video_tag, axis=1, xml_parse=tree)
            with open(os.path.join(data_dir, fname + ".xml"), "wb") as f:
                indent(tree.getroot())
                tree.write(f, encoding="UTF-8", xml_declaration=True)
def write_bibkeys(anthology, srcdir, commit=False):
    for volume_id, volume in anthology.volumes.items():
        papers_without_bibkey = []

        for paper in volume:
            bibkey = paper.bibkey
            if bibkey is None or bibkey == paper.full_id:
                papers_without_bibkey.append(paper)

        if papers_without_bibkey:
            log.info(
                f"Found {len(papers_without_bibkey):4d} papers without bibkeys in volume {volume_id}"
            )
            if not commit:
                continue
        else:
            continue

        # We got some new bibkeys and need to write them to the XML
        xml_file = os.path.join(srcdir, "xml", f"{volume.collection_id}.xml")
        tree = ET.parse(xml_file)
        root = tree.getroot()
        for paper in papers_without_bibkey:
            if paper.paper_id == "0":
                node = root.find(f"./volume[@id='{paper.volume_id}']/frontmatter")
                if node is None:  # dummy frontmatter
                    continue
            else:
                node = root.find(
                    f"./volume[@id='{paper.volume_id}']/paper[@id='{paper.paper_id}']"
                )
                if node is None:
                    log.error(f"Paper {paper.full_id} not found in {xml_file}")
                    continue

            # Generate unique bibkey
            bibkey = anthology.pindex.create_bibkey(paper, vidx=anthology.venues)
            make_simple_element("bibkey", bibkey, parent=node)

        indent(root)
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
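# A hypothetical driver for write_bibkeys(), assuming the repo's
# `anthology` package and its Anthology(importdir=...) constructor; the
# path and the commit flag here are illustrative, not a confirmed CLI.
if __name__ == "__main__":
    from anthology import Anthology

    srcdir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
    anthology = Anthology(importdir=srcdir)
    write_bibkeys(anthology, srcdir, commit=True)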
def main(args): """ Downloads an Anthology paper and adds a RETRACTED watermark, then updates the XML with an appropriate <revision> and <retracted> tag. """ with tempfile.TemporaryDirectory() as tempdir: new_pdf = add_watermark(args.anthology_id, workdir=tempdir) add_revision( args.anthology_id, new_pdf, explanation="Retracted.", change_type="revision", dry_run=False, ) xml_file = get_xml_file(args.anthology_id) collection_id, volume_id, paper_id = deconstruct_anthology_id( args.anthology_id) tree = ET.parse(xml_file) if paper_id == "0": paper = tree.getroot().find( f"./volume[@id='{volume_id}']/frontmatter") else: paper = tree.getroot().find( f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']") if paper is None: print(f"Couldn't find paper {args.anthology_id}!", file=sys.stderr) sys.exit(2) print("Modifying the XML", file=sys.stderr) now = datetime.now() date = f"{now.year}-{now.month:02d}-{now.day:02d}" retracted_node = make_simple_element("retracted", args.explanation, attrib={"date": date}, parent=paper) indent(tree.getroot()) tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
def main(args):
    for line in args.isbn_file:
        venue, isbn = line.rstrip().split()
        xml_file = os.path.join(
            os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{venue}.xml"
        )
        if not os.path.exists(xml_file):
            print(f"Can't find {xml_file}")
            continue

        tree = ET.parse(xml_file)
        meta = tree.getroot().find(".//volume[@id='1']/meta")
        if meta is not None and meta.find("./isbn") is None:
            print(f"Adding {isbn} to {venue} meta block")
            make_simple_element("isbn", isbn, parent=meta)
        elif meta is not None and meta.find("./isbn") is not None:
            print(f"{venue} already done")

        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
def main(args):
    for xml_file in args.files:
        # Update XML
        tree = ET.parse(xml_file)
        tree.getroot().tail = "\n"
        for paper in tree.getroot().findall(".//paper"):
            tail = paper.tail
            seen = []
            for attachment in paper.findall("./attachment"):
                if attachment.text in seen:
                    print(f"Removing: {attachment.text}")
                    paper.remove(attachment)
                seen.append(attachment.text)
            indent(paper, level=2)
            paper.tail = tail
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
    venue_index = VenueIndex(srcdir=anthology_datadir)
    venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()]

    sig_index = SIGIndex(srcdir=anthology_datadir)

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))
        venue_abbrev = meta["abbrev"]
        venue_slug = venue_index.get_slug(venue_abbrev)

        if str(datetime.now().year) in venue_abbrev:
            print(f"Fatal: Venue assembler put year in acronym: '{venue_abbrev}'")
            sys.exit(1)

        if venue_slug not in venue_keys:
            unseen_venues.append((venue_slug, venue_abbrev, meta["title"]))

        meta["path"] = proceedings
        meta["collection_id"] = collection_id = meta["year"] + "." + venue_slug
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print(f"Error: volume {volume_full_id} already exists")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

        if "sig" in meta:
            print(f"Add this line to {anthology_datadir}/sigs/{meta['sig'].lower()}.yaml:")
            print(f"  - {meta['year']}:")
            print(f"    - {volume_full_id} # {meta['booktitle']}")

    # Make sure all venues exist
    if len(unseen_venues) > 0:
        for venue in unseen_venues:
            slug, abbrev, title = venue
            print(f"Creating venue '{abbrev}' ({title})")
            venue_index.add_venue(abbrev, title)
        venue_index.dump(directory=anthology_datadir)

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        # copy the book
        book_src_filename = f'{year}.{meta["abbrev"]}-{volume_name}.pdf'
        book_src_path = os.path.join(root_path, book_src_filename)
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf"
            )
            if not args.dry_run:
                maybe_copy(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # Skip .files
            if os.path.basename(pdf_file).startswith("."):
                continue

            # names are {abbrev}{number}.pdf
            match = re.match(rf".*\.(\d+)\.pdf", pdf_file)
            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf"
                )
                if not args.dry_run:
                    maybe_copy(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir, venue_name)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(os.path.join(root_path, "additional")):
                if os.path.basename(attachment_file).startswith("."):
                    continue
                attachment_file_path = os.path.join(root_path, "additional", attachment_file)

                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$", attachment_file
                )
                if match is None:
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}", args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num]["attachments"].append(
                    (dest_path, type_)
                )

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def correct_caps(person, name_node, anth_id):
        """
        Many people submit their names in "ALL CAPS" or "all lowercase".
        Correct this with heuristics.
        """
        name = name_node.text
        if name.islower() or name.isupper():
            # capitalize all parts
            corrected = " ".join(list(map(lambda x: x.capitalize(), name.split())))
            print(
                f"-> Correcting capitalization of '{name}' to '{corrected}'",
                file=sys.stderr,
            )
            name_node.text = corrected

    def disambiguate_name(node, anth_id):
        name = PersonName.from_element(node)
        ids = people.get_ids(name)

        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; Please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))

            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        # Newly added volumes, so we can normalize and name-disambig later
        newly_added_volumes = []

        collection_file = os.path.join(
            args.anthology_dir, "data", "xml", f"{collection_id}.xml"
        )
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection", attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume",
                attrib={"id": volume_id, "ingest-date": args.ingest_date},
            )

            # Replace the existing one if present
            existing_volume_node = root_node.find(f"./volume[@id='{volume_id}']")
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for author_or_editor in chain(
                        paper_node.findall("./author"), paper_node.findall("./editor")
                    ):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={"hash": compute_hash_from_file(book_dest_path)},
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields
                    for child in paper_node:
                        if child.tag in [
                            "editor",
                            "address",
                            "booktitle",
                            "publisher",
                            "year",
                            "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                # Adjust the language tag
                language_node = paper_node.find("./language")
                if language_node is not None:
                    try:
                        lang = iso639.languages.get(name=language_node.text)
                    except KeyError:
                        raise Exception(f"Can't find language '{language_node.text}'")
                    language_node.text = lang.part3
                    print(language_node.text)

                # Fix author names
                for name_node in chain(
                    paper_node.findall("./author"), paper_node.findall("./editor")
                ):
                    disambiguate_name(name_node, paper_id_full)
                    person = PersonName.from_element(name_node)
                    for name_part in name_node:
                        correct_caps(person, name_part, paper_id_full)

        # Other data from the meta file
        if "isbn" in meta:
            make_simple_element("isbn", meta["isbn"], parent=meta_node)

        indent(root_node)

        tree = etree.ElementTree(root_node)
        tree.write(collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True)
"""
One-time script that converted the old revision style

    <revision id="2">P18-1001v2</revision>

to the new revision style that mandates an explanation

    <revision id="2" href="P18-1001v2">Added new references.</revision>
"""

import lxml.etree as etree
import re
import sys

from anthology.utils import infer_url, test_url, indent

filename = sys.argv[1]
outfilename = sys.argv[2]

tree = etree.parse(filename)
root = tree.getroot()
collection_id = root.attrib["id"]
papers = list(root.findall(".//paper")) + list(root.findall(".//frontmatter"))
for paper in papers:
    for revision in paper.findall("revision"):
        revision.attrib["href"] = revision.text
        revision.text = "No description of the changes was recorded."

indent(root)

tree.write(outfilename, encoding="UTF-8", xml_declaration=True, with_tail=True)
def main(args):
    print(f"Processing attachment for {args.anthology_id}", file=sys.stderr)

    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f"-> Downloading file from {args.path}", file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(
                input_file_path, mode="wb"
            ) as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print(
                "-> FATAL: An SSL error was encountered in downloading the files.",
                file=sys.stderr,
            )
            sys.exit(1)
    else:
        input_file_path = args.path

    collection_id, volume_id, paper_id = deconstruct_anthology_id(args.anthology_id)
    paper_extension = args.path.split(".")[-1]
    if paper_extension not in ALLOWED_TYPES:
        print(
            f"-> FATAL: {args.anthology_id} unknown file extension {paper_extension}",
            file=sys.stderr,
        )
        sys.exit(1)

    attachment_file_name = f"{args.anthology_id}.{args.type}.{paper_extension}"

    # Update XML
    xml_file = os.path.join(
        os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml"
    )
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        # Check if attachment already exists
        for attachment in paper.findall("attachment"):
            if attachment.text == attachment_file_name:
                print(
                    f"-> attachment {attachment_file_name} already exists in the XML",
                    file=sys.stderr,
                )
                break
        else:
            attachment = ET.Element("attachment")
            attachment.attrib["type"] = args.type.lower()
            attachment.text = attachment_file_name
            paper.append(attachment)
            indent(tree.getroot())
            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(
                f"-> added attachment {attachment_file_name} to the XML",
                file=sys.stderr,
            )
    else:
        print(
            f"-> FATAL: paper (volume={volume_id}, paper={paper_id}) not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    # Make sure directory exists
    output_dir = os.path.join(args.attachment_root, collection_id[0], collection_id)
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    # Copy file
    dest_path = os.path.join(output_dir, attachment_file_name)
    if os.path.exists(dest_path):
        print(
            f"-> target file {dest_path} already in place, refusing to overwrite",
            file=sys.stderr,
        )
    else:
        shutil.copy(input_file_path, dest_path)
        os.chmod(dest_path, 0o644)
        print(
            f"-> copied {input_file_path} to {dest_path} and fixed perms",
            file=sys.stderr,
        )

    # Clean up
    if args.path.startswith("http"):
        os.remove(input_file_path)
def main(args):
    year, venue, _ = os.path.basename(args.tsv_file.name).split(".")

    # Set the volume name from the collection file, or default to 1.
    # The file name is either "2012.eamt.tsv" or "2012.eamt-main.tsv".
    # The default volume name is "1".
    if "-" in venue:
        venue, volume_id = venue.split("-")
    else:
        volume_id = "1"
    collection_id = f"{year}.{venue}"

    tree = etree.ElementTree(
        make_simple_element("collection", attrib={"id": collection_id})
    )

    now = datetime.now()
    today = f"{now.year}-{now.month:02d}-{now.day:02d}"
    volume = make_simple_element(
        "volume", attrib={"id": volume_id, "ingest-date": today}
    )
    tree.getroot().insert(0, volume)

    # Location of entire-proceedings PDF
    proceedings_pdf = args.proceedings

    # Create the metadata for the paper
    meta = None
    for row in csv.DictReader(args.meta_file, delimiter="\t"):
        current_collection_id = f"{row['Year']}.{row['Conference code']}"
        if current_collection_id == collection_id:
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row["Conference title"], parent=meta)
            make_simple_element("publisher", row["Publisher"], parent=meta)
            make_simple_element("address", row["Location"], parent=meta)
            make_simple_element("month", row["Dates held"], parent=meta)
            make_simple_element("year", row["Year"], parent=meta)

            url = row["URL"]
            if url.endswith(".pdf"):
                if proceedings_pdf:
                    print(
                        "Overriding --proceedings with proceedings PDF found in conference list",
                        file=sys.stderr,
                    )
                proceedings_pdf = url
            elif "Complete PDF" in row and row["Complete PDF"] != "":
                proceedings_pdf = row["Complete PDF"]

            # volume PDF
            if proceedings_pdf is not None:
                volume_anth_id = f"{collection_id}-{volume_id}"
                pdf_local_path = os.path.join(
                    args.anthology_files_path, venue, f"{volume_anth_id}.pdf"
                )
                download(proceedings_pdf, pdf_local_path)
                with open(pdf_local_path, "rb") as f:
                    checksum = compute_hash(f.read())
                make_simple_element(
                    "url", volume_anth_id, attrib={"hash": checksum}, parent=meta
                )
                proceedings_pdf = pdf_local_path

            if row["Editors"] != "" and "?" not in row["Editors"]:
                editors = row["Editors"].split(" and ")
                for editor_name in editors:
                    editor = make_simple_element("editor", parent=meta)
                    if ", " in editor_name:
                        last, first = editor_name.split(", ")
                    else:
                        first, last = (
                            " ".join(editor_name.split()[:-1]),
                            editor_name.split()[-1],
                        )
                    make_simple_element("first", first, parent=editor)
                    make_simple_element("last", last, parent=editor)
            break
    else:
        print(
            f"Couldn't find conference code {collection_id} in 'Conference code' field of metadata file {args.meta_file.name}",
            file=sys.stderr,
        )
        sys.exit(1)

    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    paperid = 0
    # Create entries for all the papers
    for row in csv.DictReader(args.tsv_file, delimiter="\t"):
        pages = row.get("Pagenumbers", None)

        title_text = row["Title"]

        # The first row might be front matter (needs a special name)
        if title_text == "Frontmatter" and paperid == 0:
            paper = make_simple_element("frontmatter", parent=volume)
        else:
            paperid += 1
            paper = make_simple_element(
                "paper", attrib={"id": str(paperid)}, parent=volume
            )
            # Only make the title for not-the-frontmatter
            make_simple_element("title", title_text, parent=paper)

        author_list = row["Authors"].split(" and ")

        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            if ", " in author_name:
                last, first = author_name.split(", ")
            else:
                first, last = (
                    " ".join(author_name.split()[:-1]),
                    author_name.split()[-1],
                )
            make_simple_element("first", first, parent=author)
            make_simple_element("last", last, parent=author)

        if pages is not None:
            make_simple_element("pages", pages, parent=paper)

        # Find the PDF, either listed directly, or extracted from the proceedings PDF
        anth_id = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(args.anthology_files_path, venue, f"{anth_id}.pdf")
        url = None
        if "Pdf" in row and row["Pdf"] != "":
            if download(row["Pdf"], pdf_local_path):
                url = anth_id
        elif "pages in pdf" in row:
            pdf_pages = row["pages in pdf"]
            extract_pages(proceedings_pdf, pdf_pages, pdf_local_path)
            url = anth_id

        if url is not None:
            with open(pdf_local_path, "rb") as f:
                checksum = compute_hash(f.read())
            make_simple_element("url", url, attrib={"hash": checksum}, parent=paper)

        if "Abstract" in row:
            make_simple_element("abstract", row["Abstract"], parent=paper)

        if "Presentation" in row:
            url = row["Presentation"]
            if url is not None and url != "" and url != "None":
                extension = row["Presentation"].split(".")[-1]
                name = f"{anth_id}.Presentation.{extension}"
                local_path = os.path.join(
                    args.anthology_files_path,
                    "..",
                    "attachments",
                    venue,
                    name,
                )
                if download(row["Presentation"], local_path):
                    make_simple_element(
                        "attachment", name, attrib={"type": "presentation"}, parent=paper
                    )

        # Normalize
        for node in paper:
            normalize(node, informat="latex")

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    collection_file = os.path.join(args.anthology, "data", "xml", f"{collection_id}.xml")
    tree.write(collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True)
def main(args):
    code, year, _ = os.path.basename(args.tsv_file.name).split(".")
    collection_id = f"{year}.{code}"

    tree = etree.ElementTree(
        make_simple_element("collection", attrib={"id": collection_id})
    )
    volume_id = "1"
    volume = make_simple_element("volume", attrib={"id": volume_id})
    tree.getroot().insert(0, volume)

    # Create the metadata for the paper
    meta = None
    for row in csv.DictReader(args.meta_file, delimiter="\t"):
        if row["Conference code"] == collection_id:
            if row["Completed"] == "FALSE":
                print(
                    f"Warning: Conference {collection_id} is not marked as completed, can't ingest."
                )
                sys.exit(1)

            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row["Conference title"], parent=meta)
            make_simple_element("publisher", row["Publisher"], parent=meta)
            make_simple_element("address", row["Location"], parent=meta)
            make_simple_element("month", row["Dates held"], parent=meta)
            make_simple_element("year", row["Year"], parent=meta)

            if row["Editors"] != "" and "?" not in row["Editors"]:
                editors = row["Editors"].split(" and ")
                for editor_name in editors:
                    editor = make_simple_element("editor", parent=meta)
                    last, first = editor_name.split(", ")
                    make_simple_element("first", first, parent=editor)
                    make_simple_element("last", last, parent=editor)
            break
    else:
        print(
            f"Couldn't find conference code {collection_id} in 'Conference code' field of metadata file {args.meta_file.name}",
            file=sys.stderr,
        )
        sys.exit(1)

    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    # Create entries for all the papers
    for paperid, row in enumerate(csv.DictReader(args.tsv_file, delimiter="\t"), 1):
        title_text = row["Title"]
        author_list = row["Authors"].split(" and ")
        pdf = row["Pdf"]

        paper = make_simple_element("paper", attrib={"id": str(paperid)}, parent=volume)

        make_simple_element("title", title_text, parent=paper)
        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            print(author_name)
            last, first = author_name.split(", ")
            make_simple_element("first", first, parent=author)
            make_simple_element("last", last, parent=author)

        url = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(collection_id, f"{url}.pdf")
        make_simple_element("url", url, parent=paper)
        download(pdf, pdf_local_path)

        if "Abstract" in row:
            make_simple_element("abstract", row["Abstract"], parent=paper)

        if "Presentation" in row:
            extension = row["Presentation"].split(".")[-1]
            filename = f"{collection_id}-{volume_id}.{paperid}.Presentation.{extension}"
            make_simple_element(
                "attachment", filename, attrib={"type": "presentation"}, parent=paper
            )
            download(row["Presentation"], os.path.join(collection_id, filename))

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    collection_file = os.path.join(args.anthology, "data", "xml", f"{collection_id}.xml")
    tree.write(collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True)
def add_attachment(anthology_id, path, attach_type, overwrite=False):
    """
    Adds a single attachment to the Anthology data files.

    Arguments:
    - The ACL ID of the paper (e.g., P17-1012)
    - The path to the attachment (can be a URL)
    - The attachment type (poster, presentation, note, software)
    - Whether to overwrite the downloaded file.
    """
    collection_id, volume_id, paper_id = deconstruct_anthology_id(anthology_id)

    if path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(
                f"-> Downloading file from {path} to {input_file_path}",
                file=sys.stderr,
            )
            request = urllib.request.Request(path, headers={"User-Agent": "Mozilla/5.0"})
            with urllib.request.urlopen(request) as url, open(
                input_file_path, mode="wb"
            ) as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            raise Exception(f"Could not download {path}")
        except Exception as e:
            raise e
    else:
        input_file_path = path

    file_extension = path.replace("?dl=1", "").split(".")[-1]
    # Many links from file sharing services are not informative and don't have
    # extensions, so we could try to guess.
    if file_extension not in ALLOWED_TYPES:
        detected = filetype.guess(input_file_path)
        if detected is not None:
            file_extension = detected.mime.split("/")[-1]
        if file_extension not in ALLOWED_TYPES:
            print(
                f"Could not determine file extension for {anthology_id} at {path}",
                file=sys.stderr,
            )

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Update XML
    xml_file = os.path.join(
        os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml"
    )
    tree = ET.parse(xml_file)

    attachment_file_name = f"{anthology_id}.{attach_type}.{file_extension}"

    paper = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        # Check if attachment already exists
        for attachment in paper.findall("attachment"):
            if attachment.text == attachment_file_name:
                print(
                    f"-> attachment {attachment_file_name} already exists in the XML",
                    file=sys.stderr,
                )
                break
        else:
            attachment = ET.Element("attachment")
            attachment.attrib["type"] = attach_type.lower()
            attachment.attrib["hash"] = checksum
            attachment.text = attachment_file_name
            paper.append(attachment)
            indent(tree.getroot())
            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(
                f"-> added attachment {attachment_file_name} to the XML",
                file=sys.stderr,
            )
    else:
        print(f"Paper {anthology_id} not found in the Anthology", file=sys.stderr)

    # Make sure directory exists
    output_dir = os.path.join(args.attachment_root, collection_id[0], collection_id)
    if not os.path.exists(output_dir):
        # print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    # Copy file
    dest_path = os.path.join(output_dir, attachment_file_name)
    if os.path.exists(dest_path) and not overwrite:
        print(
            f"-> target file {dest_path} already in place, refusing to overwrite",
            file=sys.stderr,
        )
        return None

    shutil.copy(input_file_path, dest_path)
    os.chmod(dest_path, 0o644)
    print(f"-> copied {input_file_path} to {dest_path} and fixed perms", file=sys.stderr)

    # Clean up
    if path.startswith("http"):
        os.remove(input_file_path)

    return dest_path
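# A hypothetical invocation of add_attachment() above; the ID, URL, and
# type here are made up for illustration. Note the function body assumes a
# module-level `args` object exposing attachment_root.
if __name__ == "__main__":
    dest = add_attachment("P17-1012", "https://example.org/P17-1012.Poster.pdf", "Poster")
    if dest is not None:
        print(f"Attachment installed at {dest}", file=sys.stderr)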
def main(args):
    def maybe_copy(file_from, file_to):
        if not args.dry_run:
            print(f"-> Copying from {file_from} -> {file_to}", file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(f"-> DRY RUN: Copying from {file_from} -> {file_to}", file=sys.stderr)

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        download_file(args.path, input_file_path)
    else:
        input_file_path = args.path

    validate_file_type(input_file_path)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(args.anthology_id)
    paper_extension = args.path.split(".")[-1]

    # The new version
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
    if is_newstyle_id(args.anthology_id):
        venue_name = collection_id.split(".")[1]
        output_dir = os.path.join(args.anthology_dir, "pdf", venue_name)
    else:
        output_dir = os.path.join(
            args.anthology_dir, "pdf", collection_id[0], collection_id
        )

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    # Update XML
    xml_file = os.path.join(
        os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml"
    )
    tree = ET.parse(xml_file)
    if paper_id == "0":
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']"
        )
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not args.dry_run:
            # Update the URL hash on the <url> tag
            url = paper.find("./url")
            if url is not None:
                url.attrib["hash"] = checksum

            if not args.erratum and revno == 2:
                if paper.find("./url") is not None:
                    current_version_url = infer_url(paper.find("./url").text) + ".pdf"

                # Download original file
                # There are no versioned files the first time around, so create the first one
                # (essentially backing up the original version)
                revised_file_v1_path = os.path.join(
                    output_dir, f"{args.anthology_id}{change_letter}1.pdf"
                )
                download_file(current_version_url, revised_file_v1_path)
                validate_file_type(revised_file_v1_path)

                with open(revised_file_v1_path, "rb") as f:
                    old_checksum = compute_hash(f.read())

                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{args.anthology_id}{change_letter}1",
                        "hash": old_checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                args.explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{args.anthology_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": args.date,
                },
                parent=paper,
            )
            indent(tree.getroot())
            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML', file=sys.stderr)
    else:
        print(
            f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf"
    )

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    if args.path.startswith("http"):
        os.remove(input_file_path)
def main(args):
    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f"-> Downloading file from {args.path}", file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(
                input_file_path, mode="wb"
            ) as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print("An SSL error was encountered in downloading the files.", file=sys.stderr)
            sys.exit(1)
    else:
        input_file_path = args.path

    collection_id, volume_id, paper_id = deconstruct_anthology_id(args.anthology_id)
    paper_extension = args.path.split(".")[-1]

    # The new version
    revno = None

    # Update XML
    xml_file = os.path.join(
        os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml"
    )
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if args.do:
            revision = ET.Element(change_type)
            revision.attrib["id"] = str(revno)
            revision.attrib["href"] = f"{args.anthology_id}{change_letter}{revno}"
            revision.text = args.explanation

            # Set tails to maintain proper indentation
            paper[-1].tail += "  "
            revision.tail = "\n    "  # newline and two levels of indent

            paper.append(revision)
            indent(tree.getroot())
            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML', file=sys.stderr)
    else:
        print(
            f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    output_dir = os.path.join(args.anthology_dir, "pdf", collection_id[0], collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    if not args.erratum and revno == 2:
        # There are no versioned files the first time around, so create the first one
        # (essentially backing up the original version)
        revised_file_v1_path = os.path.join(
            output_dir, f"{args.anthology_id}{change_letter}1.pdf"
        )
        current_version = ANTHOLOGY_PDF.format(args.anthology_id)
        if args.do:
            try:
                print(
                    f"-> Downloading file from {current_version} to {revised_file_v1_path}",
                    file=sys.stderr,
                )
                with urllib.request.urlopen(current_version) as url, open(
                    revised_file_v1_path, mode="wb"
                ) as fh:
                    fh.write(url.read())
            except ssl.SSLError:
                print(
                    f"-> FATAL: An SSL error was encountered in downloading {current_version}.",
                    file=sys.stderr,
                )
                sys.exit(1)
        else:
            print(
                f"-> DRY RUN: Downloading file from {current_version} to {revised_file_v1_path}",
                file=sys.stderr,
            )

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf"
    )

    maybe_copy(input_file_path, revised_file_versioned_path, args.do)
    maybe_copy(input_file_path, canonical_path, args.do)

    if args.path.startswith("http"):
        os.remove(input_file_path)
def main(args):
    def maybe_copy(file_from, file_to):
        if not args.dry_run:
            print(f"-> Copying from {file_from} -> {file_to}", file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(f"-> DRY RUN: Copying from {file_from} -> {file_to}", file=sys.stderr)

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f"-> Downloading file from {args.path}", file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(
                input_file_path, mode="wb"
            ) as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print("An SSL error was encountered in downloading the files.", file=sys.stderr)
            sys.exit(1)
    else:
        input_file_path = args.path

    detected = filetype.guess(input_file_path)
    if detected is None or not detected.mime.endswith(detected.extension):
        mime_type = "UNKNOWN" if detected is None else detected.mime
        print(
            f"FATAL: {args.anthology_id} file {args.path} has MIME type {mime_type}",
            file=sys.stderr,
        )
        sys.exit(1)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(args.anthology_id)
    paper_extension = args.path.split(".")[-1]

    # The new version
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Update XML
    xml_file = os.path.join(
        os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml"
    )
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if args.erratum else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not args.dry_run:
            if not args.erratum and revno == 2:
                # First revision requires making the original version explicit
                revision = make_simple_element(
                    change_type,
                    None,
                    attrib={
                        "id": "1",
                        "href": f"{args.anthology_id}{change_letter}1",
                        "hash": checksum,
                    },
                    parent=paper,
                )

            revision = make_simple_element(
                change_type,
                args.explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{args.anthology_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": args.date,
                },
                parent=paper,
            )
            indent(tree.getroot())
            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML', file=sys.stderr)
    else:
        print(
            f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    output_dir = os.path.join(args.anthology_dir, "pdf", collection_id[0], collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    if not args.erratum and revno == 2:
        # There are no versioned files the first time around, so create the first one
        # (essentially backing up the original version)
        revised_file_v1_path = os.path.join(
            output_dir, f"{args.anthology_id}{change_letter}1.pdf"
        )
        current_version = ANTHOLOGY_PDF.format(args.anthology_id)
        if not args.dry_run:
            try:
                print(
                    f"-> Downloading file from {current_version} to {revised_file_v1_path}",
                    file=sys.stderr,
                )
                with urllib.request.urlopen(current_version) as url, open(
                    revised_file_v1_path, mode="wb"
                ) as fh:
                    fh.write(url.read())
            except ssl.SSLError:
                print(
                    f"-> FATAL: An SSL error was encountered in downloading {current_version}.",
                    file=sys.stderr,
                )
                sys.exit(1)
        else:
            print(
                f"-> DRY RUN: Downloading file from {current_version} to {revised_file_v1_path}",
                file=sys.stderr,
            )

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf"
    )

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    if args.path.startswith("http"):
        os.remove(input_file_path)
def main(args):
    year = args.year
    venue = args.venue
    volume_id = args.volume
    collection_id = f"{year}.{venue}"

    splitter = NameSplitter(anthology_dir=args.anthology_dir)

    collection_file = os.path.join(
        args.anthology_dir, "data", "xml", f"{collection_id}.xml"
    )
    if os.path.exists(collection_file):
        tree = etree.parse(collection_file)
    else:
        tree = etree.ElementTree(
            make_simple_element("collection", attrib={"id": collection_id})
        )

    now = datetime.now()
    today = f"{now.year}-{now.month:02d}-{now.day:02d}"

    volume_node = tree.getroot().find(f"./volume[@id='{volume_id}']")
    if volume_node is not None:
        tree.getroot().remove(volume_node)

    volume = make_simple_element(
        "volume",
        attrib={"id": volume_id, "ingest-date": today},
        parent=tree.getroot(),
    )

    if not os.path.exists(collection_id):
        print(f"Creating {collection_id}", file=sys.stderr)
        os.makedirs(collection_id)

    # Create entries for all the papers
    for paperid, row in enumerate(csv.DictReader(args.tsv_file, delimiter=args.delimiter)):
        pages = row.get("pages", None)

        if paperid == 0:
            meta = make_simple_element("meta", parent=volume)
            make_simple_element("booktitle", row["booktitle"], parent=meta)
            make_simple_element("publisher", row["publisher"], parent=meta)
            make_simple_element("address", row["address"], parent=meta)
            make_simple_element("month", row["month"], parent=meta)
            make_simple_element("year", year, parent=meta)

            editors = row["author"].split(" and ")
            row["author"] = ""
            for editor_name in editors:
                editor = make_simple_element("editor", parent=meta)
                surname, givenname = splitter.best_split(editor_name)
                make_simple_element("first", givenname, parent=editor)
                make_simple_element("last", surname, parent=editor)

            # volume PDF
            proceedings_pdf = args.proceedings_pdf
            if proceedings_pdf is not None:
                volume_anth_id = f"{collection_id}-{volume_id}"
                pdf_local_path = os.path.join(
                    args.anthology_files_path, venue, f"{volume_anth_id}.pdf"
                )
                retrieve_url(proceedings_pdf, pdf_local_path)
                checksum = compute_hash_from_file(pdf_local_path)
                make_simple_element(
                    "url", volume_anth_id, attrib={"hash": checksum}, parent=meta
                )
                proceedings_pdf = pdf_local_path

        title_text = row["title"]

        # The first row might be front matter (needs a special name)
        if paperid == 0 and title_text.lower() in ["frontmatter", "front matter"]:
            paper = make_simple_element("frontmatter", parent=volume)
        else:
            if paperid == 0:
                # Not frontmatter, so paper 1
                paperid += 1
            paper = make_simple_element(
                "paper", attrib={"id": str(paperid)}, parent=volume
            )
            # Only make the title for not-the-frontmatter
            make_simple_element("title", title_text, parent=paper)

        author_list = row["author"].split(" and ")

        for author_name in author_list:
            if author_name == "":
                continue
            author = make_simple_element("author", parent=paper)
            surname, givenname = splitter.best_split(author_name)
            make_simple_element("first", givenname, parent=author)
            make_simple_element("last", surname, parent=author)

        if pages is not None and pages != "":
            make_simple_element("pages", pages, parent=paper)

        # Find the PDF, either listed directly, or extracted from the proceedings PDF
        anth_id = f"{collection_id}-{volume_id}.{paperid}"
        pdf_local_path = os.path.join(args.anthology_files_path, venue, f"{anth_id}.pdf")
        url = None
        if "pdf" in row and row["pdf"] != "":
            if retrieve_url(row["pdf"], pdf_local_path):
                url = anth_id
            else:
                print("Can't find", row["pdf"])
        elif "pages in pdf" in row:
            pdf_pages = row["pages in pdf"]
            extract_pages(proceedings_pdf, pdf_pages, pdf_local_path)
            url = anth_id

        if url is not None:
            checksum = compute_hash_from_file(pdf_local_path)
            make_simple_element("url", url, attrib={"hash": checksum}, parent=paper)

        if "abstract" in row and row["abstract"] != "":
            make_simple_element("abstract", row["abstract"], parent=paper)

        if "presentation" in row:
            url = row["presentation"]
            if url is not None and url != "" and url != "None":
                extension = row["presentation"].split(".")[-1]
                name = f"{anth_id}.Presentation.{extension}"
                local_path = os.path.join(
                    args.anthology_files_path,
                    "..",
                    "attachments",
                    venue,
                    name,
                )
                if retrieve_url(row["presentation"], local_path):
                    make_simple_element(
                        "attachment",
                        name,
                        attrib={
                            "type": "presentation",
                            "hash": compute_hash_from_file(local_path),
                        },
                        parent=paper,
                    )

        # Normalize
        for node in paper:
            normalize(node, informat="latex")

    indent(tree.getroot())

    # Write the file to disk: acl-anthology/data/xml/{collection_id}.xml
    tree.write(collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True)
def add_attachment(anthology_id, path, attach_type, overwrite=False):
    """
    Adds a single attachment to the Anthology data files.

    Arguments:
    - The ACL ID of the paper (e.g., P17-1012)
    - The path to the attachment (can be a URL)
    - The attachment type (poster, presentation, note, software)
    - Whether to overwrite the downloaded file.
    """
    collection_id, volume_id, paper_id = deconstruct_anthology_id(anthology_id)

    paper_extension = path.replace("?dl=1", "").split(".")[-1]

    output_dir = os.path.join(args.attachment_root, collection_id[0], collection_id)
    attachment_file_name = f"{anthology_id}.{attach_type}.{paper_extension}"
    dest_path = os.path.join(output_dir, attachment_file_name)
    if os.path.exists(dest_path) and not overwrite:
        print(
            f"-> target file {dest_path} already in place, refusing to overwrite",
            file=sys.stderr,
        )
        return None

    if path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(
                f"-> Downloading file from {path} to {input_file_path}",
                file=sys.stderr,
            )
            with urllib.request.urlopen(path) as url, open(
                input_file_path, mode="wb"
            ) as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            raise Exception(f"Could not download {path}")
    else:
        input_file_path = path

    detected = filetype.guess(input_file_path)
    if detected is None or not detected.mime.endswith(detected.extension):
        mime_type = "UNKNOWN" if detected is None else detected.mime
        raise Exception(f"{anthology_id} file {path} has MIME type {mime_type}")

    if paper_extension not in ALLOWED_TYPES:
        raise Exception(f"-> Unknown file extension {paper_extension} for {path}")

    # Update XML
    xml_file = os.path.join(
        os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml"
    )
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is not None:
        # Check if attachment already exists
        for attachment in paper.findall("attachment"):
            if attachment.text == attachment_file_name:
                print(
                    f"-> attachment {attachment_file_name} already exists in the XML",
                    file=sys.stderr,
                )
                break
        else:
            attachment = ET.Element("attachment")
            attachment.attrib["type"] = attach_type.lower()
            attachment.text = attachment_file_name
            paper.append(attachment)
            indent(tree.getroot())
            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(
                f"-> added attachment {attachment_file_name} to the XML",
                file=sys.stderr,
            )
    else:
        raise Exception(f"Paper {anthology_id} not found in the Anthology")

    # Make sure directory exists
    if not os.path.exists(output_dir):
        # print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    # Copy file
    shutil.copy(input_file_path, dest_path)
    os.chmod(dest_path, 0o644)
    print(f"-> copied {input_file_path} to {dest_path} and fixed perms", file=sys.stderr)

    # Clean up
    if path.startswith("http"):
        os.remove(input_file_path)

    return dest_path
else: old_video = old_paper.find("video") logging.info(old_video) if old_video is not None: logging.info("Had old video!") old_video_href = old_video.attrib["href"] old_video_href_https = old_video_href.replace( "http://", "https://" ) # Fix for techtalkx.tv links old_video.attrib["href"] = old_video_href_https logging.info(old_video_href) papernode.append(old_video) old_attachment = old_paper.find("attachment") logging.info(old_attachment) if old_attachment is not None: logging.info("Had an old attachment!") old_attachment_type = old_attachment.attrib["type"] logging.info(old_attachment_type) papernode.append(old_attachment) # Normalize for oldnode in papernode: normalize(oldnode, informat="latex") volume.append(papernode) i += 1 indent(collection) # from anthology.utils et = etree.ElementTree(collection) et.write(args.outfile, encoding="UTF-8", xml_declaration=True, with_tail=True)
def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    # Build list of volumes, confirm uniqueness
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        meta["path"] = proceedings
        meta["collection_id"] = collection_id = meta["year"] + "." + meta["abbrev"].lower()
        volume_name = meta["volume_name"]
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print(f"Error: volume {volume_full_id} already exists")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume_name"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        print(f"VOLUME: {volume}")

        # copy the book
        book_src_filename = meta["abbrev"] + "-" + meta["year"]
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf"
            )
            log(f"Copying {book_src_path} -> {book_dest_path}", args.dry_run)
            if not args.dry_run:
                shutil.copyfile(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # names are {abbrev}{number}.pdf
            abbrev = meta["abbrev"]
            match = re.match(rf"{abbrev}(\d+)\.pdf", pdf_file)
            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf"
                )
                log(
                    f"Copying [{paper_id_full}] {pdf_src_path} -> {pdf_dest_path}",
                    args.dry_run,
                )
                if not args.dry_run:
                    shutil.copyfile(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir, collection_id)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(os.path.join(root_path, "additional")):
                match = re.match(rf"{abbrev}(\d+)_(\w+)\.(\w+)", attachment_file)
                if match is not None:
                    paper_num, type_, ext = match.groups()
                    paper_num = int(paper_num)

                    file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                    dest_path = os.path.join(attachments_dest_dir, file_name)
                    log(f"Copying {attachment_file} -> {dest_path}", args.dry_run)
                    if not args.dry_run:
                        shutil.copyfile(attachment_file, dest_path)

                    collections[collection_id][volume_name][paper_num][
                        "attachments"
                    ].append((dest_path, type_))

    people = AnthologyIndex(
        None, srcdir=os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
    )

    for collection_id, collection in collections.items():
        collection_file = os.path.join(
            args.anthology_dir, "data", "xml", f"{collection_id}.xml"
        )
        root_node = make_simple_element("collection", attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume", attrib={"id": volume_id}, parent=root_node
            )
            meta = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta.append(title_node)
                    for editor in paper_node.findall("editor"):
                        meta.append(editor)
                    meta.append(paper_node.find("publisher"))
                    meta.append(paper_node.find("address"))
                    meta.append(paper_node.find("month"))
                    meta.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url", text=f"{collection_id}-{volume_name}", parent=meta
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields
                    for child in paper_node:
                        if child.tag in [
                            "editor",
                            "address",
                            "booktitle",
                            "publisher",
                            "year",
                            "month",
                        ]:
                            paper_node.remove(child)

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={"type": type_},
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

        # Normalize
        for paper in root_node.findall(".//paper"):
            for oldnode in paper:
                normalize(oldnode, informat="latex")

        # Ensure names are properly identified
        ambiguous = {}
        for paper in root_node.findall(".//paper"):
            anth_id = build_anthology_id(
                collection_id, paper.getparent().attrib["id"], paper.attrib["id"]
            )

            for node in chain(paper.findall("author"), paper.findall("editor")):
                name = PersonName.from_element(node)
                ids = people.get_ids(name)
                if len(ids) > 1:
                    print(
                        f"WARNING ({anth_id}): ambiguous author {name}, defaulting to first of {ids}"
                    )
                    ambiguous[anth_id] = (name, ids)

                    node.attrib["id"] = ids[0]

        indent(root_node)

        tree = etree.ElementTree(root_node)
        tree.write(collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True)
def main(args): collections = defaultdict(OrderedDict) volumes = {} anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data") venue_keys = [ venue["slug"].lower() for _, venue in VenueIndex(srcdir=anthology_datadir).items() ] # Build list of volumes, confirm uniqueness unseen_venues = [] for proceedings in args.proceedings: meta = read_meta(os.path.join(proceedings, "meta")) venue_name = meta["abbrev"].lower() if venue_name not in venue_keys: unseen_venues.append(meta["abbrev"]) meta["path"] = proceedings meta["collection_id"] = collection_id = (meta["year"] + "." + meta["abbrev"].lower()) volume_name = meta["volume"].lower() volume_full_id = f"{collection_id}-{volume_name}" if volume_full_id in volumes: print("Error: ") collections[collection_id][volume_name] = {} volumes[volume_full_id] = meta # Make sure all venues exist if len(unseen_venues) > 0: print("FATAL: The following venue(s) don't exist in venues.yaml") for venue in unseen_venues: print(f"- {venue}") print("Please create entries for them and re-ingest.") sys.exit(1) # Copy over the PDFs and attachments for volume, meta in volumes.items(): root_path = os.path.join(meta["path"], "cdrom") collection_id = meta["collection_id"] venue_name = meta["abbrev"].lower() volume_name = meta["volume"].lower() year = meta["year"] pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name) if not os.path.exists(pdfs_dest_dir): os.makedirs(pdfs_dest_dir) # copy the book book_src_filename = meta["abbrev"] + "-" + year book_src_path = os.path.join(root_path, book_src_filename) + ".pdf" book_dest_path = None if os.path.exists(book_src_path): book_dest_path = ( os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf") if not args.dry_run and not os.path.exists(book_dest_path): log(f"Copying {book_src_path} -> {book_dest_path}", args.dry_run) shutil.copyfile(book_src_path, book_dest_path) # copy the paper PDFs pdf_src_dir = os.path.join(root_path, "pdf") for pdf_file in os.listdir(pdf_src_dir): # names are {abbrev}{number}.pdf match = re.match(rf".*\.(\d+)\.pdf", pdf_file) if match is not None: paper_num = int(match[1]) paper_id_full = f"{collection_id}-{volume_name}.{paper_num}" bib_path = os.path.join( root_path, "bib", pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"), ) pdf_src_path = os.path.join(pdf_src_dir, pdf_file) pdf_dest_path = os.path.join( pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf") if not args.dry_run and not os.path.exists(pdf_dest_path): log(f"Copying {pdf_src_path} -> {pdf_dest_path}", args.dry_run) shutil.copyfile(pdf_src_path, pdf_dest_path) collections[collection_id][volume_name][paper_num] = { "anthology_id": paper_id_full, "bib": bib_path, "pdf": pdf_dest_path, "attachments": [], } # copy the attachments if os.path.exists(os.path.join(root_path, "additional")): attachments_dest_dir = os.path.join(args.attachments_dir, venue_name) if not os.path.exists(attachments_dest_dir): os.makedirs(attachments_dest_dir) for attachment_file in os.listdir( os.path.join(root_path, "additional")): attachment_file_path = os.path.join(root_path, "additional", attachment_file) match = re.match( rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$", attachment_file) if match is None: print( f"* Warning: no attachment match for {attachment_file}", file=sys.stderr, ) sys.exit(2) paper_num, type_, ext = match.groups() paper_num = int(paper_num) file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}" dest_path = os.path.join(attachments_dest_dir, file_name) if not args.dry_run and 
                    log(f"Copying {attachment_file} -> {dest_path}",
                        args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num][
                    "attachments"].append((dest_path, type_))

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def disambiguate_name(node, anth_id):
        name = PersonName.from_element(node)
        ids = people.get_ids(name)
        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))
            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        # Newly added volumes, so we can normalize and name-disambig later
        newly_added_volumes = []

        collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                       f"{collection_id}.xml")
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection",
                                            attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            # Look up this volume's metadata, stored by the copy loop above
            # (avoids relying on loop variables that have gone stale)
            volume_meta = volumes[f"{collection_id}-{volume_id}"]
            book_dest_path = volume_meta["book_dest_path"]

            volume_node = make_simple_element(
                "volume",
                attrib={
                    "id": volume_id,
                    "ingest-date": args.ingest_date
                },
            )

            # Replace the existing volume if present, else append
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for author_or_editor in chain(
                            paper_node.findall("./author"),
                            paper_node.findall("./editor")):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_id}",
                            attrib={
                                "hash": compute_hash_from_file(book_dest_path)
                            },
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields (iterate over a copy, since we
                    # remove children)
                    for child in list(paper_node):
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                for name_node in chain(paper_node.findall("./author"),
                                       paper_node.findall("./editor")):
                    disambiguate_name(name_node, paper_id_full)

            # Other data from the meta file
            if "isbn" in volume_meta and meta_node is not None:
                make_simple_element("isbn", volume_meta["isbn"],
                                    parent=meta_node)

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(collection_file,
                   encoding="UTF-8",
                   xml_declaration=True,
                   with_tail=True)
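
# For reference, a sketch of the bookkeeping structure that the copy loop in
# main() builds and the XML loop consumes. All paths, IDs, and the attachment
# type below are illustrative only.
def _example_collections_shape():
    collections = {
        "2020.acl": {                   # collection_id = "{year}.{abbrev}"
            "main": {                   # volume_name
                1: {                    # paper_num
                    "anthology_id": "2020.acl-main.1",
                    "bib": "proceedings/cdrom/bib/2020.acl-main.1.bib",
                    "pdf": "pdfs/acl/2020.acl-main.1.pdf",
                    # (dest_path, type_) tuples, as appended above
                    "attachments": [
                        ("attachments/acl/2020.acl-main.1.Software.zip",
                         "Software"),
                    ],
                },
            },
        },
    }
    return collections
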
def add_revision(anth_id,
                 pdf_path,
                 explanation,
                 change_type="revision",
                 dry_run=True,
                 date=None):
    """
    Takes an Anthology ID. It then adds a revision to the Anthology XML,
    updating and writing the XML file, and copies the PDFs into place.
    For PDFs, the revised PDF is saved to {anth_id}.pdf and
    {anth_id}v{version}.pdf. For the first revision, we first copy
    {anth_id}.pdf to {anth_id}v1.pdf.
    """
    if date is None:
        now = datetime.now()
        date = f"{now.year}-{now.month:02d}-{now.day:02d}"

    def maybe_copy(file_from, file_to):
        if not dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to),
                  file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    # The new version
    revno = None

    change_letter = "e" if change_type == "erratum" else "v"

    checksum = compute_hash_from_file(pdf_path)

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
    output_dir = get_pdf_dir(anth_id)

    # Make sure the directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{anth_id}.pdf")

    # Update XML
    xml_file = get_xml_file(anth_id)
    collection_id, volume_id, paper_id = deconstruct_anthology_id(anth_id)
    tree = ET.parse(xml_file)
    if paper_id == "0":
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")

    if paper is not None:
        revisions = paper.findall(change_type)
        revno = 1 if change_type == "erratum" else 2
        for revision in revisions:
            revno = int(revision.attrib["id"]) + 1

        if not dry_run:
            # Update the URL hash on the <url> tag
            url = paper.find("./url")
            if url is not None:
                url.attrib["hash"] = checksum

            if change_type == "revision" and revno == 2:
                if paper.find("./url") is not None:
                    current_version_url = infer_url(
                        paper.find("./url").text) + ".pdf"

                    # Download the original file. There are no versioned files
                    # the first time around, so create the first one
                    # (essentially backing up the original version)
                    revised_file_v1_path = os.path.join(
                        output_dir, f"{anth_id}{change_letter}1.pdf")
                    retrieve_url(current_version_url, revised_file_v1_path)
                    validate_file_type(revised_file_v1_path)

                    old_checksum = compute_hash_from_file(revised_file_v1_path)

                    # First revision requires making the original version explicit
                    revision = make_simple_element(
                        change_type,
                        None,
                        attrib={
                            "id": "1",
                            "href": f"{anth_id}{change_letter}1",
                            "hash": old_checksum,
                        },
                        parent=paper,
                    )

            revision = make_simple_element(
                change_type,
                explanation,
                attrib={
                    "id": str(revno),
                    "href": f"{anth_id}{change_letter}{revno}",
                    "hash": checksum,
                    "date": date,
                },
                parent=paper,
            )
            indent(tree.getroot())
            tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
            print(f'-> Added {change_type} node "{revision.text}" to XML',
                  file=sys.stderr)
    else:
        print(
            f"-> FATAL: paper ID {anth_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    revised_file_versioned_path = os.path.join(
        output_dir, f"{anth_id}{change_letter}{revno}.pdf")

    # Copy the file to the versioned path
    maybe_copy(pdf_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if change_type == "revision":
        maybe_copy(pdf_path, canonical_path)
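
# A hedged usage sketch for add_revision(). The Anthology ID, PDF path, and
# explanation below are hypothetical; with dry_run=True the copy steps are only
# printed, though the PDF must exist so its checksum can be computed.
def _example_add_revision():
    add_revision(
        "P19-1001",                     # hypothetical Anthology ID
        "/tmp/P19-1001v2.pdf",          # hypothetical revised PDF
        "Corrects a typo in Table 2.",  # explanation recorded in the XML
        change_type="revision",
        dry_run=True,
    )
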
# existing_volume is assumed to hold the volume already present in the
# collection file (None if the volume is new)
if existing_volume is None:
    # Find the insertion point among the other volumes
    insertion_point = 0
    for i, volume in enumerate(existing_tree.getroot()):
        if new_volume_id < int(volume.attrib["id"]):
            break
        insertion_point = i + 1

    print(
        f"Inserting volume {new_volume_id} at collection position {insertion_point}"
    )
    existing_tree.getroot().insert(insertion_point, new_volume)
else:
    # Append to the existing volume (useful for TACL, which has a single
    # volume each year), if requested
    if args.append:
        for paper in new_volume.findall("./paper"):
            print(f'Appending {paper.attrib["id"]}')
            paper.attrib["ingest-date"] = args.ingest_date
            existing_volume.append(paper)
    else:
        print(
            f"Skipping volume {new_volume_id}, which has already been inserted into {collection_file}.\n"
            "You can append to this volume by passing `--append` (or `-a`) to this script.\n"
        )
        continue

indent(existing_tree.getroot())
existing_tree.write(collection_file,
                    encoding="UTF-8",
                    xml_declaration=True,
                    with_tail=True)
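
# A minimal, self-contained illustration of the insertion-point scan above,
# with plain integers standing in for <volume> elements (assumes volume ids
# appear in ascending order, as in the collection XML):
def _example_insertion_point():
    existing_ids = [1, 2, 4]
    new_volume_id = 3
    insertion_point = 0
    for i, vid in enumerate(existing_ids):
        if new_volume_id < vid:
            break
        insertion_point = i + 1
    existing_ids.insert(insertion_point, new_volume_id)
    assert existing_ids == [1, 2, 3, 4]
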
def dump_collection(tree, collection_file):
    indent(tree.getroot())
    tree.write(collection_file,
               encoding="UTF-8",
               xml_declaration=True,
               with_tail=True)
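
# A minimal usage sketch for dump_collection() (illustrative only; the path is
# hypothetical, and the tree is assumed to be parsed with lxml, as elsewhere in
# these scripts):
def _example_dump_collection():
    collection_file = "data/xml/2020.acl.xml"  # hypothetical path
    tree = etree.parse(collection_file)
    # ... mutate the tree in place, e.g. with make_simple_element(...) ...
    dump_collection(tree, collection_file)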