def main(args):
    """Add one venue to the venue index and write the index back to disk.

    The data directory is resolved relative to the running script
    (``<directory of sys.argv[0]>/../data``); the index is loaded from and
    dumped to that same directory.

    Args:
        args: Parsed command-line arguments. Reads ``acronym``, ``name``,
            ``acl`` and ``url``.
    """
    script_dir = os.path.dirname(sys.argv[0])
    data_path = os.path.join(script_dir, "..", "data")

    index = VenueIndex(srcdir=data_path)

    print(f"Adding '{args.acronym}' ({args.name})")
    index.add_venue(args.acronym, args.name, is_acl=args.acl, url=args.url)

    index.dump(data_path)
def main(args):
    """Ingest a set of proceedings directories into collection XML files.

    Works in three stages:

    1. Read each proceedings' ``meta`` file, derive its collection ID
       (``<year>.<venue slug>``) and volume ID, and create any venue that is
       not yet in the venue index.
    2. Copy the book PDF, paper PDFs and attachments into ``args.pdfs_dir`` /
       ``args.attachments_dir`` (copies are skipped when ``args.dry_run`` is
       set, but destination directories are still created).
    3. For every collection, build (or replace) the ``<volume>`` nodes from
       the per-paper bib files and write ``<collection_id>.xml`` under
       ``args.anthology_dir/data/xml``.

    Args:
        args: Parsed command-line arguments. Reads ``proceedings``,
            ``pdfs_dir``, ``attachments_dir``, ``anthology_dir``,
            ``ingest_date`` and ``dry_run``.

    Exits with status 2 if an attachment file name does not match the
    expected ``<year>.<venue>-<volume>.<num>[_]<type>.<ext>`` pattern.
    """
    collections = defaultdict(OrderedDict)
    volumes = {}
    # Destination path of each volume's full-proceedings PDF (None when the
    # source has no book PDF), keyed by full volume ID. Recorded per volume so
    # the XML stage does not depend on variables left over from the copy loop.
    book_dest_paths = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
    venue_index = VenueIndex(srcdir=anthology_datadir)
    venue_keys = {venue["slug"].lower() for _, venue in venue_index.items()}

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        venue_abbrev = meta["abbrev"]
        venue_slug = venue_index.get_slug(venue_abbrev)

        if venue_slug not in venue_keys:
            unseen_venues.append((venue_slug, venue_abbrev, meta["title"]))

        meta["path"] = proceedings

        meta["collection_id"] = collection_id = meta["year"] + "." + venue_slug
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            # Not fatal (as before): the later entry replaces the earlier one.
            print(
                f"Error: duplicate volume '{volume_full_id}': '{proceedings}' "
                f"replaces '{volumes[volume_full_id]['path']}'",
                file=sys.stderr,
            )

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Make sure all venues exist
    if unseen_venues:
        for _slug, abbrev, title in unseen_venues:
            print(f"Creating venue '{abbrev}' ({title})")
            venue_index.add_venue(abbrev, title)
        venue_index.dump(directory=anthology_datadir)

    # Copy over the PDFs and attachments
    for volume_full_id, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        os.makedirs(pdfs_dest_dir, exist_ok=True)

        # copy the book
        book_src_filename = meta["abbrev"] + "-" + year
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf"
            )
            if not args.dry_run:
                maybe_copy(book_src_path, book_dest_path)
        book_dest_paths[volume_full_id] = book_dest_path

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # Skip . files
            if os.path.basename(pdf_file).startswith("."):
                continue

            # names are {abbrev}{number}.pdf
            match = re.match(r".*\.(\d+)\.pdf", pdf_file)
            if match is None:
                continue

            paper_num = int(match[1])
            paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

            bib_path = os.path.join(
                root_path,
                "bib",
                pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
            )

            pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
            pdf_dest_path = os.path.join(
                pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf"
            )
            if not args.dry_run:
                maybe_copy(pdf_src_path, pdf_dest_path)

            collections[collection_id][volume_name][paper_num] = {
                "anthology_id": paper_id_full,
                "bib": bib_path,
                "pdf": pdf_dest_path,
                "attachments": [],
            }

        # copy the attachments
        additional_dir = os.path.join(root_path, "additional")
        if os.path.exists(additional_dir):
            attachments_dest_dir = os.path.join(args.attachments_dir, venue_name)
            os.makedirs(attachments_dest_dir, exist_ok=True)

            for attachment_file in os.listdir(additional_dir):
                if os.path.basename(attachment_file).startswith("."):
                    continue
                attachment_file_path = os.path.join(
                    root_path, "additional", attachment_file
                )
                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$", attachment_file
                )
                if match is None:
                    # Despite the "Warning" label this aborts the whole run.
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}", args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num]["attachments"].append(
                    (dest_path, type_)
                )

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def correct_caps(person, name_node, anth_id):
        """
        Many people submit their names in "ALL CAPS" or "all lowercase".
        Correct this with heuristics.

        Rewrites ``name_node.text`` in place; ``person`` and ``anth_id`` are
        accepted but not used.
        """
        name = name_node.text
        if name is None:
            # Empty name part (no text): nothing to correct.
            return
        if name.islower() or name.isupper():
            # capitalize all parts
            corrected = " ".join(part.capitalize() for part in name.split())
            print(
                f"-> Correcting capitalization of '{name}' to '{corrected}'",
                file=sys.stderr,
            )
            name_node.text = corrected

    def disambiguate_name(node, anth_id):
        """Ask the user to pick an ID when a name maps to several IDs.

        Sets ``node.attrib["id"]`` to the chosen ID; does nothing when the
        name maps to at most one ID.
        """
        name = PersonName.from_element(node)
        ids = people.get_ids(name)
        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; Please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                try:
                    choice = int(input("--> "))
                except ValueError:
                    # Non-numeric answer: ask again instead of crashing.
                    choice = -1

            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        collection_file = os.path.join(
            args.anthology_dir, "data", "xml", f"{collection_id}.xml"
        )
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection", attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            # Per-volume data gathered in the earlier stages.
            volume_full_id = f"{collection_id}-{volume_id}"
            volume_meta = volumes[volume_full_id]
            book_dest_path = book_dest_paths[volume_full_id]

            volume_node = make_simple_element(
                "volume",
                attrib={"id": volume_id, "ingest-date": args.ingest_date},
            )

            # Replace the existing one if present
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for author_or_editor in chain(
                        paper_node.findall("./author"), paper_node.findall("./editor")
                    ):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=volume_full_id,
                            attrib={"hash": compute_hash_from_file(book_dest_path)},
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields; iterate over a snapshot so the
                    # removals cannot disturb the iteration
                    for child in list(paper_node):
                        if child.tag in [
                            "editor",
                            "address",
                            "booktitle",
                            "publisher",
                            "year",
                            "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                for name_node in chain(
                    paper_node.findall("./author"), paper_node.findall("./editor")
                ):
                    disambiguate_name(name_node, paper_id_full)
                    person = PersonName.from_element(name_node)
                    for name_part in name_node:
                        correct_caps(person, name_part, paper_id_full)

            # Other data from the meta file
            if "isbn" in volume_meta:
                make_simple_element("isbn", volume_meta["isbn"], parent=meta_node)

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(
            collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True
        )
def main(args):
    """Ingest a set of proceedings directories, one volume at a time.

    First reads each proceedings' ``meta`` file, derives its collection ID
    (``<year>.<venue slug>``) and volume ID, sanity-checks the venue acronym,
    and creates any venue missing from the venue index. Then, for each volume:
    copies the book PDF, paper PDFs and attachments into ``args.pdfs_dir`` /
    ``args.attachments_dir`` (copies are skipped when ``args.dry_run`` is set),
    builds the ``<volume>`` node from the per-paper bib files, and writes
    ``<collection_id>.xml`` under ``args.anthology_dir/data/xml``.

    Args:
        args: Parsed command-line arguments. Reads ``proceedings``,
            ``pdfs_dir``, ``attachments_dir``, ``anthology_dir``,
            ``ingest_date`` and ``dry_run``.

    Raises:
        Exception: If a paper's ``<language>`` text is not a language name
            known to ``iso639``.

    Exits with status 1 if a venue acronym contains the current year, and
    with status 2 if an attachment file name does not match the expected
    pattern.
    """
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
    venue_index = VenueIndex(srcdir=anthology_datadir)
    venue_keys = {venue["slug"].lower() for _, venue in venue_index.items()}

    # NOTE(review): constructed but not otherwise used in this function;
    # kept in case loading it is relied on as a check — confirm before removing.
    sig_index = SIGIndex(srcdir=anthology_datadir)

    people = AnthologyIndex(srcdir=anthology_datadir)
    people.bibkeys = load_bibkeys(anthology_datadir)

    def correct_caps(name):
        """
        Many people submit their names in "ALL CAPS" or "all lowercase".
        Correct this with heuristics.

        Returns the corrected name, or ``name`` unchanged (including ``None``).
        """
        if name is None:
            # Empty name part (no text): nothing to correct.
            return name
        if name.islower() or name.isupper():
            # capitalize all parts
            corrected = " ".join(part.capitalize() for part in name.split())
            print(
                f"-> Correcting capitalization of '{name}' to '{corrected}'",
                file=sys.stderr,
            )
            name = corrected

        return name

    def disambiguate_name(node, anth_id):
        """Resolve the person ID for a name node, prompting when ambiguous.

        Returns ``(id, choice)``. ``choice`` is -1 when no prompt was needed
        (callers use that to leave the node's ``id`` attribute alone);
        ``id`` is ``None`` when the index returns no IDs at all.
        """
        name = PersonName.from_element(node)
        ids = people.get_ids(name)
        choice = -1
        if not ids:
            # Avoid ids[-1] on an empty list.
            return None, choice
        if len(ids) > 1:
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; Please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                try:
                    choice = int(input("--> "))
                except ValueError:
                    # Non-numeric answer: ask again instead of crashing.
                    choice = -1

        # With a single candidate, choice is -1 and ids[-1] is that candidate.
        return ids[choice], choice

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        venue_abbrev = meta["abbrev"]
        venue_slug = venue_index.get_slug(venue_abbrev)

        if str(datetime.now().year) in venue_abbrev:
            print(f"Fatal: Venue assembler put year in acronym: '{venue_abbrev}'")
            sys.exit(1)

        if re.match(r".*\d$", venue_abbrev) is not None:
            print(
                f"WARNING: Venue {venue_abbrev} ends in a number, this is probably a mistake"
            )

        if venue_slug not in venue_keys:
            unseen_venues.append((venue_slug, venue_abbrev, meta["title"]))

        meta["path"] = proceedings

        meta["collection_id"] = collection_id = meta["year"] + "." + venue_slug
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            # Not fatal (as before): the later entry replaces the earlier one.
            print(
                f"Error: duplicate volume '{volume_full_id}': '{proceedings}' "
                f"replaces '{volumes[volume_full_id]['path']}'",
                file=sys.stderr,
            )

        volumes[volume_full_id] = meta

        if "sig" in meta:
            print(
                f"Add this line to {anthology_datadir}/sigs/{meta['sig'].lower()}.yaml:"
            )
            print(f" - {meta['year']}:")
            print(f" - {volume_full_id} # {meta['booktitle']}")

    # Make sure all venues exist
    if unseen_venues:
        for _slug, abbrev, title in unseen_venues:
            print(f"Creating venue '{abbrev}' ({title})")
            venue_index.add_venue(abbrev, title)
        venue_index.dump(directory=anthology_datadir)

    # Copy over the PDFs and attachments
    for volume_full_id, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        os.makedirs(pdfs_dest_dir, exist_ok=True)

        # copy the book from the top-level proceedings/ dir, named "VENUE-year.pdf"
        book_path = f'cdrom/{venue_name.upper()}-{year}.pdf'
        book_src_path = os.path.join(meta["path"], book_path)
        # Stays None in dry-run mode, so no book <url> is emitted then.
        book_dest_path = None
        if os.path.exists(book_src_path) and not args.dry_run:
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") + ".pdf"
            )
            maybe_copy(book_src_path, book_dest_path)

        # temp holder for papers in each volume
        volume = dict()

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # Skip . files
            if os.path.basename(pdf_file).startswith("."):
                continue

            # names are {abbrev}{number}.pdf
            match = re.match(r".*\.(\d+)\.pdf", pdf_file)
            if match is None:
                continue

            paper_num = int(match[1])
            paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

            bib_path = os.path.join(
                root_path,
                "bib",
                pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
            )

            pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
            pdf_dest_path = os.path.join(
                pdfs_dest_dir, f"{collection_id}-{volume_name}.{paper_num}.pdf"
            )
            if not args.dry_run:
                maybe_copy(pdf_src_path, pdf_dest_path)

            volume[paper_num] = {
                "anthology_id": paper_id_full,
                "bib": bib_path,
                "pdf": pdf_dest_path,
                "attachments": [],
            }

        # copy the attachments
        additional_dir = os.path.join(root_path, "additional")
        if os.path.exists(additional_dir):
            attachments_dest_dir = os.path.join(args.attachments_dir, venue_name)
            os.makedirs(attachments_dest_dir, exist_ok=True)

            for attachment_file in os.listdir(additional_dir):
                if os.path.basename(attachment_file).startswith("."):
                    continue
                attachment_file_path = os.path.join(
                    root_path, "additional", attachment_file
                )
                # Find the attachment file, using a bit of a fuzzy
                # match. The fuzzy match is because sometimes people
                # generate the proceedings with the wrong venue
                # code. If we correct it, we still need to be able to
                # find the file.
                match = re.match(
                    rf"{year}\..*-\w+\.(\d+)_?(\w+)\.(\w+)$", attachment_file
                )
                if match is None:
                    # Despite the "Warning" label this aborts the whole run.
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}", args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                volume[paper_num]["attachments"].append((dest_path, type_))

        # create xml
        collection_file = os.path.join(
            args.anthology_dir, "data", "xml", f"{collection_id}.xml"
        )
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection", attrib={"id": collection_id})

        volume_node = make_simple_element(
            "volume",
            attrib={"id": volume_name, "ingest-date": args.ingest_date},
        )

        # Replace the existing one if present
        for i, child in enumerate(root_node):
            if child.attrib["id"] == volume_name:
                root_node[i] = volume_node
                break
        else:
            root_node.append(volume_node)

        meta_node = None

        for paper_num, paper in sorted(volume.items()):
            paper_id_full = paper["anthology_id"]
            bibfile = paper["bib"]
            paper_node = bib2xml(bibfile, paper_id_full)

            if paper_node.attrib["id"] == "0":
                # create metadata subtree
                meta_node = make_simple_element("meta", parent=volume_node)
                title_node = paper_node.find("title")
                title_node.tag = "booktitle"
                meta_node.append(title_node)
                for author_or_editor in chain(
                    paper_node.findall("./author"), paper_node.findall("./editor")
                ):
                    disamb_name, name_choice = disambiguate_name(
                        author_or_editor, paper_id_full
                    )
                    if name_choice != -1:
                        author_or_editor.attrib["id"] = disamb_name
                    for name_part in author_or_editor:
                        name_part.text = correct_caps(name_part.text)
                    meta_node.append(author_or_editor)
                    author_or_editor.tag = "editor"

                # Here, we grab the publisher from the meta file, in case it's not in the
                # frontmatter paper. We don't handle the situation where it's in neither!
                publisher_node = paper_node.find("publisher")
                if publisher_node is None:
                    publisher_node = make_simple_element("publisher", meta["publisher"])
                meta_node.append(publisher_node)

                # Look for the address in the bib file, then the meta file
                address_node = paper_node.find("address")
                if address_node is None:
                    address_node = make_simple_element("address", meta["location"])
                meta_node.append(address_node)

                meta_node.append(paper_node.find("month"))
                meta_node.append(paper_node.find("year"))
                if book_dest_path is not None:
                    make_simple_element(
                        "url",
                        text=f"{collection_id}-{volume_name}",
                        attrib={"hash": compute_hash_from_file(book_dest_path)},
                        parent=meta_node,
                    )

                # modify frontmatter tag
                paper_node.tag = "frontmatter"
                del paper_node.attrib["id"]
            else:
                # remove unneeded fields; iterate over a snapshot so the
                # removals cannot disturb the iteration
                for child in list(paper_node):
                    if child.tag in [
                        "editor",
                        "address",
                        "booktitle",
                        "publisher",
                        "year",
                        "month",
                    ]:
                        paper_node.remove(child)

            url = paper_node.find("./url")
            if url is not None:
                url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

            for path, type_ in paper["attachments"]:
                make_simple_element(
                    "attachment",
                    text=os.path.basename(path),
                    attrib={
                        "type": type_,
                        "hash": compute_hash_from_file(path),
                    },
                    parent=paper_node,
                )

            if len(paper_node) > 0:
                volume_node.append(paper_node)

            # Normalize
            for oldnode in paper_node:
                normalize(oldnode, informat="latex")

            # Adjust the language tag
            language_node = paper_node.find("./language")
            if language_node is not None:
                try:
                    lang = iso639.languages.get(name=language_node.text)
                except KeyError as err:
                    raise Exception(
                        f"Can't find language '{language_node.text}'"
                    ) from err
                language_node.text = lang.part3

            # Fix author names
            for name_node in chain(
                paper_node.findall("./author"), paper_node.findall("./editor")
            ):
                disamb_name, name_choice = disambiguate_name(name_node, paper_id_full)
                if name_choice != -1:
                    name_node.attrib["id"] = disamb_name
                for name_part in name_node:
                    name_part.text = correct_caps(name_part.text)

        # Other data from the meta file
        if "isbn" in meta:
            make_simple_element("isbn", meta["isbn"], parent=meta_node)

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(
            collection_file, encoding="UTF-8", xml_declaration=True, with_tail=True
        )