def process_volume(anthology_volume):
    """Download missing PDFs for the papers of an Anthology volume.

    For every ``<paper>`` in the collection's XML file whose inferred
    Anthology URL is not currently reachable, fetch the PDF from its MIT
    Press DOI and save it under the venue's directory in
    ``args.anthology_files_dir``.

    :param anthology_volume: a full Anthology volume ID, e.g.
        "2020.tacl-1" (new style) or "Q19-1" (old style).
    """
    collection_id, volume_id, _ = deconstruct_anthology_id(anthology_volume)
    if is_newstyle_id(anthology_volume):
        # New-style collection IDs look like "2020.tacl": venue is the second field
        venue_path = collection_id.split(".")[1]
    else:
        # Old-style files live under e.g. Q/Q19/
        venue_path = os.path.join(collection_id[0], collection_id)

    print(f"Downloading PDFs for {anthology_volume}", file=sys.stderr)

    # Update XML
    xml_file = os.path.join(
        os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml"
    )
    tree = ET.parse(xml_file)
    # NOTE(review): this iterates over every <paper> in the collection file,
    # not only the requested volume — preserved from the original.
    for paper in tree.getroot().findall(".//paper"):
        url_node = paper.find("./url")
        if url_node is None:
            # No Anthology ID to check against; nothing we can do here.
            continue
        anthid = url_node.text
        # Only fetch papers the Anthology does not already serve
        if not test_url_code(infer_url(anthid)):
            doi_node = paper.find("./doi")
            if doi_node is None:
                print(f"* No DOI found for {anthid}, skipping", file=sys.stderr)
                continue
            doi_pdf = f"https://www.mitpressjournals.org/doi/pdf/{doi_node.text}"
            local_path = os.path.join(
                args.anthology_files_dir, venue_path, f"{anthid}.pdf"
            )
            # exist_ok avoids the check-then-create race of the original
            os.makedirs(os.path.dirname(local_path), exist_ok=True)
            retrieve_url(doi_pdf, local_path)
            print(f"Saved {doi_pdf} to {local_path}")
            sleep(1)  # be polite to the publisher's server
def __init__(self, acronym, letter, anth_id):
    """Derive venue identity fields from an Anthology ID.

    Sets ``venue`` to the venue code extracted from the collection ID, and
    ``is_parent_venue`` to whether that code matches the lowercased acronym
    (new-style IDs) or the collection letter (old-style IDs).
    """
    self.parent_venue = acronym.lower()
    self.anth_id = anth_id
    collection_id, self.volume_id, _ = deconstruct_anthology_id(anth_id)
    newstyle = is_newstyle_id(collection_id)
    # New-style collection IDs look like "2020.acl"; old-style like "P19".
    self.venue = collection_id.split(".")[1] if newstyle else collection_id[0]
    self.is_parent_venue = self.venue == (self.parent_venue if newstyle else letter)
def main(args):
    """Record a revision or erratum for a single Anthology paper.

    Copies the new PDF into the anthology files directory under its
    versioned name (e.g. ``P19-1001v2.pdf``) and, for revisions, over the
    canonical name as well, then appends a <revision>/<erratum> node to the
    collection's XML file.
    """

    def maybe_copy(file_from, file_to):
        # Copy file_from to file_to with mode 0644 unless --dry-run is set.
        if not args.dry_run:
            print("-> Copying from {} -> {}".format(file_from, file_to), file=sys.stderr)
            shutil.copy(file_from, file_to)
            os.chmod(file_to, 0o644)
        else:
            print(
                "-> DRY RUN: Copying from {} -> {}".format(file_from, file_to),
                file=sys.stderr,
            )

    change_type = "erratum" if args.erratum else "revision"
    change_letter = "e" if args.erratum else "v"

    print(f"Processing {change_type} to {args.anthology_id}...")

    # If given a URL, download it to a temp file first
    if args.path.startswith("http"):
        _, input_file_path = tempfile.mkstemp()
        download_file(args.path, input_file_path)
    else:
        input_file_path = args.path

    validate_file_type(input_file_path)

    collection_id, volume_id, paper_id = deconstruct_anthology_id(args.anthology_id)

    # Revision number of the new version (determined from the XML below)
    revno = None

    with open(input_file_path, "rb") as f:
        checksum = compute_hash(f.read())

    # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
    # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
    if is_newstyle_id(args.anthology_id):
        venue_name = collection_id.split(".")[1]
        output_dir = os.path.join(args.anthology_dir, "pdf", venue_name)
    else:
        output_dir = os.path.join(
            args.anthology_dir, "pdf", collection_id[0], collection_id
        )

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f"-> Creating directory {output_dir}", file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f"{args.anthology_id}.pdf")

    # Update XML
    xml_file = os.path.join(
        os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml"
    )
    tree = ET.parse(xml_file)
    if paper_id == "0":
        # Paper 0 denotes the volume's frontmatter
        paper = tree.getroot().find(f"./volume[@id='{volume_id}']/frontmatter")
    else:
        paper = tree.getroot().find(
            f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']"
        )

    if paper is None:
        print(
            f"-> FATAL: paper ID {args.anthology_id} not found in the Anthology",
            file=sys.stderr,
        )
        sys.exit(1)

    # Next revision number: errata start at 1, revisions at 2 (v1 is the
    # original); any existing nodes of this change type bump it further.
    revisions = paper.findall(change_type)
    revno = 1 if args.erratum else 2
    for revision in revisions:
        revno = int(revision.attrib["id"]) + 1

    if not args.dry_run:
        # Update the URL hash on the <url> tag
        url = paper.find("./url")
        if url is not None:
            url.attrib["hash"] = checksum

        if not args.erratum and revno == 2:
            # First revision: the original PDF must be backed up as v1.
            if url is None:
                # BUGFIX: the original skipped only the assignment when <url>
                # was missing, then crashed on an unbound current_version_url.
                # Fail with a clear message instead.
                print(
                    f"-> FATAL: paper {args.anthology_id} has no <url> to back up",
                    file=sys.stderr,
                )
                sys.exit(1)
            current_version_url = infer_url(paper.find("./url").text) + ".pdf"

            # There are no versioned files the first time around, so create
            # the first one (essentially backing up the original version)
            revised_file_v1_path = os.path.join(
                output_dir, f"{args.anthology_id}{change_letter}1.pdf"
            )
            download_file(current_version_url, revised_file_v1_path)
            validate_file_type(revised_file_v1_path)

            with open(revised_file_v1_path, "rb") as f:
                old_checksum = compute_hash(f.read())

            # First revision requires making the original version explicit
            revision = make_simple_element(
                change_type,
                None,
                attrib={
                    "id": "1",
                    "href": f"{args.anthology_id}{change_letter}1",
                    "hash": old_checksum,
                },
                parent=paper,
            )

        revision = make_simple_element(
            change_type,
            args.explanation,
            attrib={
                "id": str(revno),
                "href": f"{args.anthology_id}{change_letter}{revno}",
                "hash": checksum,
                "date": args.date,
            },
            parent=paper,
        )
        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
        print(f'-> Added {change_type} node "{revision.text}" to XML', file=sys.stderr)

    revised_file_versioned_path = os.path.join(
        output_dir, f"{args.anthology_id}{change_letter}{revno}.pdf"
    )

    # Copy the file to the versioned path
    maybe_copy(input_file_path, revised_file_versioned_path)

    # Copy it over the canonical path
    if not args.erratum:
        maybe_copy(input_file_path, canonical_path)

    # Clean up the temp file if we downloaded one
    if args.path.startswith("http"):
        os.remove(input_file_path)
def main(volumes):
    """Generate a Crossref DOI deposit file for the given Anthology volumes.

    Builds a Crossref 4.4.1 ``<doi_batch>`` document containing conference,
    proceedings, and per-paper metadata for every volume ID in ``volumes``,
    and prints the serialized XML to stdout.
    """
    formatter = MarkupFormatter()

    ## Assemble container
    doi_batch = make_simple_element(
        "doi_batch",
        attrib={
            "xmlns": "http://www.crossref.org/schema/4.4.1",
            "{http://www.w3.org/2001/XMLSchema-instance}schemaLocation": "http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd",
            "version": "4.4.1",
        },
        namespaces={"xsi": "http://www.w3.org/2001/XMLSchema-instance"},
    )
    new_volume = etree.ElementTree(doi_batch)

    ## Assemble head
    head = make_simple_element("head", parent=new_volume.getroot())
    dbi = make_simple_element("doi_batch_id", text=str(int(time.time())), parent=head)
    timestamp = make_simple_element("timestamp", text=str(int(time.time())), parent=head)
    depositor = make_simple_element("depositor", parent=head)
    depositor_name = make_simple_element("depositor_name", text=DEPOSITOR_NAME, parent=depositor)
    email_address = make_simple_element("email_address", text=EMAIL_ADDRESS, parent=depositor)
    registrant = make_simple_element("registrant", text=REGISTRANT, parent=head)

    ## Assemble body
    body = make_simple_element("body", parent=new_volume.getroot())
    year = ""
    start_month = ""
    end_month = ""
    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)
        collection_file = os.path.join(
            os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml"
        )
        tree = etree.parse(collection_file)
        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        ## Assemble frontmatter
        c = make_simple_element("conference", parent=body)
        contribs = make_simple_element("contributors", parent=c)
        editor_index = 0
        meta = v.find("./meta")
        for tag in meta:
            if tag.tag == "year":
                year = tag.text
            elif tag.tag == "month":
                month = tag.text
                try:
                    # The month may be a range, split on hyphen or en-dash
                    start_month = MONTH_HASH[re.split("[-–]", month)[0]]
                    end_month = MONTH_HASH[re.split("[-–]", month)[1]]
                except IndexError:
                    # only one month
                    start_month = MONTH_HASH[month]
                    end_month = MONTH_HASH[month]
                except Exception:
                    print(
                        f"FATAL: can't parse month {month} in {full_volume_id}",
                        file=sys.stderr,
                    )
                    sys.exit(1)
            elif tag.tag == "url":
                url = tag.text
            elif tag.tag == "booktitle":
                booktitle = formatter.as_text(tag)
            elif tag.tag == "address":
                address = tag.text
            elif tag.tag == "publisher":
                publisher = tag.text
            elif tag.tag == "editor":
                pn = make_simple_element(
                    "person_name",
                    parent=contribs,
                    attrib={
                        "contributor_role": "chair",
                        "sequence": "first" if editor_index == 0 else "additional",
                    },
                )
                editor_index += 1
                for name_part in tag:
                    # Check if empty (e.g., "Mausam")
                    if name_part.tag == "first" and name_part.text != "":
                        gn = make_simple_element("given_name", parent=pn, text=name_part.text)
                    elif name_part.tag == "last":
                        sn = make_simple_element("surname", text=name_part.text, parent=pn)

        # Assemble Event Metadata
        em = make_simple_element("event_metadata", parent=c)
        cn = make_simple_element("conference_name", parent=em, text=booktitle)
        cl = make_simple_element("conference_location", parent=em, text=address)
        cd = make_simple_element(
            "conference_date",
            parent=em,
            attrib={
                "start_year": year,
                "end_year": year,
                "start_month": start_month,
                "end_month": end_month,
            },
        )

        # Assemble Proceedings Metadata
        pm = make_simple_element("proceedings_metadata", parent=c, attrib={"language": "en"})
        pt = make_simple_element("proceedings_title", parent=pm, text=booktitle)
        p = make_simple_element("publisher", parent=pm)
        pn = make_simple_element("publisher_name", parent=p, text=publisher)
        pp = make_simple_element("publisher_place", parent=p, text=PUBLISHER_PLACE)
        pd = make_simple_element("publication_date", parent=pm)
        y = make_simple_element("year", parent=pd, text=year)
        noisbn = make_simple_element("noisbn", parent=pm, attrib={"reason": "simple_series"})

        # DOI assignation data
        dd = make_simple_element("doi_data", parent=pm)
        doi = make_simple_element("doi", parent=dd, text=DOI_PREFIX + url)
        resource = make_simple_element("resource", parent=dd, text=ANTHOLOGY_URL.format(url))

        for paper in v.findall("./paper"):
            ## Individual Paper Data
            paper_id = paper.attrib["id"]
            if paper.find("./url") is not None:
                url = paper.find("./url").text
            else:
                # Reconstruct the Anthology URL slug from volume + paper IDs
                if is_newstyle_id(full_volume_id):
                    url = f"{full_volume_id}.{paper_id}"
                elif len(full_volume_id) == 6:
                    # BUGFIX: paper_id is a string (XML attribute), so it must
                    # be converted to int before zero-padded "d" formatting —
                    # f"{paper_id:02d}" raises ValueError on a str.
                    url = f"{full_volume_id}{int(paper_id):02d}"
                elif len(full_volume_id) == 5:
                    url = f"{full_volume_id}{int(paper_id):03d}"

            cp = make_simple_element("conference_paper", parent=c)

            # contributors
            contribs = make_simple_element("contributors", parent=cp)
            author_index = 0
            for author in paper.findall("./author"):
                pn = make_simple_element(
                    "person_name",
                    parent=contribs,
                    attrib={
                        "contributor_role": "author",
                        "sequence": "first" if author_index == 0 else "additional",
                    },
                )
                author_index += 1
                for name_part in author:
                    if name_part.tag == "first" and name_part.text != "":
                        gn = make_simple_element("given_name", parent=pn, text=name_part.text)
                    elif name_part.tag == "last":
                        sn = make_simple_element("surname", text=name_part.text, parent=pn)

            for title in paper.iter(tag="title"):
                o_titles = make_simple_element("titles", parent=cp)
                o_title = make_simple_element(
                    "title", parent=o_titles, text=formatter.as_text(title)
                )

            pd = make_simple_element("publication_date", parent=cp)
            o_year = make_simple_element("year", parent=pd)
            o_year.text = year

            for pages in paper.iter(tag="pages"):
                o_pages = make_simple_element("pages", parent=cp)
                fp = make_simple_element("first_page", parent=o_pages)
                lp = make_simple_element("last_page", parent=o_pages)
                try:
                    fp.text = re.split("[-–]", pages.text)[0]
                    lp.text = re.split("[-–]", pages.text)[1]
                except IndexError:
                    # only one page
                    fp.text = pages.text
                    lp.text = pages.text

            # DOI assignation data
            dd = make_simple_element("doi_data", parent=cp)
            doi = make_simple_element("doi", parent=dd, text=DOI_PREFIX + url)
            resource = make_simple_element(
                "resource", parent=dd, text=ANTHOLOGY_URL.format(url)
            )

    print(
        etree.tostring(
            new_volume,
            pretty_print=True,
            encoding="UTF-8",
            xml_declaration=True,
            with_tail=True,
        ).decode("utf-8")
    )