def main(volumes):
    """Build and print a Crossref 4.4.1 DOI deposit batch for the given volumes.

    For each anthology volume id in *volumes*, reads the collection XML from
    ../data/xml/, assembles <conference>/<conference_paper> deposit records,
    and prints the whole <doi_batch> document to stdout as pretty-printed XML.
    Volumes that cannot be found are reported on stderr and skipped.
    """
    formatter = MarkupFormatter()

    ## Assemble container: the <doi_batch> root with the Crossref 4.4.1 schema
    doi_batch = make_simple_element(
        "doi_batch",
        attrib={
            "xmlns": "http://www.crossref.org/schema/4.4.1",
            "{http://www.w3.org/2001/XMLSchema-instance}schemaLocation": "http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd",
            "version": "4.4.1",
        },
        namespaces={"xsi": "http://www.w3.org/2001/XMLSchema-instance"},
    )
    new_volume = etree.ElementTree(doi_batch)

    ## Assemble head: batch id and timestamp are both the current Unix time
    head = make_simple_element("head", parent=new_volume.getroot())
    dbi = make_simple_element("doi_batch_id", text=str(int(time.time())), parent=head)
    timestamp = make_simple_element("timestamp", text=str(int(time.time())), parent=head)
    depositor = make_simple_element("depositor", parent=head)
    depositor_name = make_simple_element("depositor_name", text=DEPOSITOR_NAME, parent=depositor)
    email_address = make_simple_element("email_address", text=EMAIL_ADDRESS, parent=depositor)
    registrant = make_simple_element("registrant", text=REGISTRANT, parent=head)

    ## Assemble body: one <conference> record per requested volume
    body = make_simple_element("body", parent=new_volume.getroot())
    year = ""
    start_month = ""
    end_month = ""
    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)
        # Collection XML lives relative to the script's own location
        collection_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml")
        tree = etree.parse(collection_file)
        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        ## Assemble frontmatter: volume-level metadata from <meta>
        c = make_simple_element("conference", parent=body)
        contribs = make_simple_element("contributors", parent=c)
        editor_index = 0
        meta = v.find("./meta")
        for tag in meta:
            if tag.tag == "year":
                year = tag.text
            elif tag.tag == "month":
                month = tag.text
                # A month may be a range like "July-August" (ASCII or en dash)
                try:
                    start_month = MONTH_HASH[re.split("[-–]", month)[0]]
                    end_month = MONTH_HASH[re.split("[-–]", month)[1]]
                except IndexError as e:
                    # only one month
                    start_month = MONTH_HASH[month]
                    end_month = MONTH_HASH[month]
                except Exception as e:
                    # e.g. a month name missing from MONTH_HASH is fatal
                    print(
                        f"FATAL: can't parse month {month} in {full_volume_id}",
                        file=sys.stderr,
                    )
                    sys.exit(1)
            elif tag.tag == "url":
                url = tag.text
            elif tag.tag == "booktitle":
                # Strip markup from the title via the formatter
                booktitle = formatter.as_text(tag)
            elif tag.tag == "address":
                address = tag.text
            elif tag.tag == "publisher":
                publisher = tag.text
            elif tag.tag == "editor":
                # Editors become "chair" contributors; only the first editor
                # gets sequence="first"
                pn = make_simple_element(
                    "person_name",
                    parent=contribs,
                    attrib={
                        "contributor_role": "chair",
                        "sequence": "first" if editor_index == 0 else "additional",
                    },
                )
                editor_index += 1
                for name_part in tag:
                    if name_part.tag == "first":
                        gn = make_simple_element("given_name", parent=pn, text=name_part.text)
                    elif name_part.tag == "last":
                        sn = make_simple_element("surname", text=name_part.text, parent=pn)

        # Assemble Event Metadata
        # NOTE(review): year/booktitle/address/publisher/url are assumed to be
        # present in every volume's <meta>; a missing tag would raise
        # NameError (or reuse the previous volume's value) here — TODO confirm
        em = make_simple_element("event_metadata", parent=c)
        cn = make_simple_element("conference_name", parent=em, text=booktitle)
        cl = make_simple_element("conference_location", parent=em, text=address)
        cd = make_simple_element(
            "conference_date",
            parent=em,
            attrib={
                "start_year": year,
                "end_year": year,
                "start_month": start_month,
                "end_month": end_month,
            },
        )

        # Assemble Proceedings Metadata
        pm = make_simple_element("proceedings_metadata", parent=c, attrib={"language": "en"})
        pt = make_simple_element("proceedings_title", parent=pm, text=booktitle)
        p = make_simple_element("publisher", parent=pm)
        pn = make_simple_element("publisher_name", parent=p, text=publisher)
        pp = make_simple_element("publisher_place", parent=p, text=PUBLISHER_PLACE)
        pd = make_simple_element("publication_date", parent=pm)
        y = make_simple_element("year", parent=pd, text=year)
        noisbn = make_simple_element("noisbn", parent=pm, attrib={"reason": "simple_series"})

        # DOI assignation data for the proceedings volume itself
        dd = make_simple_element("doi_data", parent=pm)
        doi = make_simple_element("doi", parent=dd, text=DOI_PREFIX + url)
        resource = make_simple_element("resource", parent=dd, text=ANTHOLOGY_URL.format(url))

        for paper in v.findall("./paper"):
            ## Individual Paper Data
            # TODO: this is not future-proof, should use anthology.util library functions
            # Paper id is zero-padded based on the volume URL length
            # (6 chars -> 2-digit ids, 5 chars -> 3-digit ids)
            aa_id = ""
            if len(url) == 6:
                aa_id = "{:02d}".format(int(paper.attrib["id"]))
            else:
                if len(url) == 5:
                    aa_id = "{:03d}".format(int(paper.attrib["id"]))

            cp = make_simple_element("conference_paper", parent=c)

            # contributors: authors, first author gets sequence="first"
            contribs = make_simple_element("contributors", parent=cp)
            author_index = 0
            for author in paper.findall("./author"):
                pn = make_simple_element(
                    "person_name",
                    parent=contribs,
                    attrib={
                        "contributor_role": "author",
                        "sequence": "first" if author_index == 0 else "additional",
                    },
                )
                author_index += 1
                for name_part in author:
                    if name_part.tag == "first":
                        gn = make_simple_element("given_name", parent=pn, text=name_part.text)
                    elif name_part.tag == "last":
                        sn = make_simple_element("surname", text=name_part.text, parent=pn)

            for title in paper.iter(tag="title"):
                o_titles = make_simple_element("titles", parent=cp)
                o_title = make_simple_element("title", parent=o_titles, text=formatter.as_text(title))

            pd = make_simple_element("publication_date", parent=cp)
            o_year = make_simple_element("year", parent=pd)
            o_year.text = year

            for pages in paper.iter(tag="pages"):
                o_pages = make_simple_element("pages", parent=cp)
                fp = make_simple_element("first_page", parent=o_pages)
                lp = make_simple_element("last_page", parent=o_pages)
                # Page ranges use ASCII hyphen or en dash
                try:
                    fp.text = re.split("[-–]", pages.text)[0]
                    lp.text = re.split("[-–]", pages.text)[1]
                except IndexError as e:
                    # only one page
                    fp.text = pages.text
                    lp.text = pages.text

            # DOI assignation data for the individual paper
            dd = make_simple_element("doi_data", parent=cp)
            doi = make_simple_element("doi", parent=dd, text=DOI_PREFIX + url + aa_id)
            resource = make_simple_element("resource", parent=dd, text=ANTHOLOGY_URL.format(url + aa_id))

    # Emit the whole batch to stdout as pretty-printed XML
    print(
        etree.tostring(
            new_volume,
            pretty_print=True,
            encoding="UTF-8",
            xml_declaration=True,
            with_tail=True,
        ).decode("utf-8"))
def main(volumes):
    """Build and print a Crossref 4.4.1 DOI deposit batch for the given volumes.

    NOTE(review): this is a near-duplicate of another main(volumes) in this
    file; this variant uses the raw tag text for the booktitle and has no
    fatal handler for unparseable months — presumably an older copy. TODO:
    confirm which variant is current and remove the other.
    """
    formatter = MarkupFormatter()

    ## Assemble container: the <doi_batch> root with the Crossref 4.4.1 schema
    doi_batch = make_simple_element(
        'doi_batch',
        attrib={
            'xmlns': 'http://www.crossref.org/schema/4.4.1',
            '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd',
            'version': '4.4.1'
        },
        namespaces={'xsi': 'http://www.w3.org/2001/XMLSchema-instance'})
    new_volume = etree.ElementTree(doi_batch)

    ## Assemble head: batch id and timestamp are both the current Unix time
    head = make_simple_element('head', parent=new_volume.getroot())
    dbi = make_simple_element('doi_batch_id', text=str(int(time.time())), parent=head)
    timestamp = make_simple_element('timestamp', text=str(int(time.time())), parent=head)
    depositor = make_simple_element('depositor', parent=head)
    depositor_name = make_simple_element('depositor_name', text=DEPOSITOR_NAME, parent=depositor)
    email_address = make_simple_element('email_address', text=EMAIL_ADDRESS, parent=depositor)
    registrant = make_simple_element('registrant', text=REGISTRANT, parent=head)

    ## Assemble body: one <conference> record per requested volume
    body = make_simple_element('body', parent=new_volume.getroot())
    year = ""
    start_month = ""
    end_month = ""
    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)
        # Collection XML lives relative to the script's own location
        collection_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'xml', f'{collection_id}.xml')
        tree = etree.parse(collection_file)
        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        ## Assemble frontmatter: volume-level metadata from <meta>
        c = make_simple_element('conference', parent=body)
        contribs = make_simple_element('contributors', parent=c)
        editor_index = 0
        meta = v.find('./meta')
        for tag in meta:
            if tag.tag == 'year':
                year = tag.text
            elif tag.tag == 'month':
                month = tag.text
                # A month may be a range like "July-August" (ASCII or en dash)
                # NOTE(review): a month name missing from MONTH_HASH raises
                # KeyError here, uncaught in this variant — TODO confirm
                try:
                    start_month = MONTH_HASH[re.split('[-–]', month)[0]]
                    end_month = MONTH_HASH[re.split('[-–]', month)[1]]
                except IndexError as e:
                    # only one month
                    start_month = MONTH_HASH[month]
                    end_month = MONTH_HASH[month]
            elif tag.tag == 'url':
                url = tag.text
            elif tag.tag == 'booktitle':
                # NOTE(review): unlike the sibling variant, this takes raw
                # .text and drops any inline markup in the booktitle
                booktitle = tag.text
            elif tag.tag == 'address':
                address = tag.text
            elif tag.tag == 'publisher':
                publisher = tag.text
            elif tag.tag == 'editor':
                # Editors become "chair" contributors; only the first editor
                # gets sequence="first"
                pn = make_simple_element(
                    'person_name',
                    parent=contribs,
                    attrib={
                        'contributor_role': 'chair',
                        'sequence': 'first' if editor_index == 0 else 'additional'
                    })
                editor_index += 1
                for name_part in tag:
                    if name_part.tag == 'first':
                        gn = make_simple_element('given_name', parent=pn, text=name_part.text)
                    elif name_part.tag == 'last':
                        sn = make_simple_element('surname', text=name_part.text, parent=pn)

        # Assemble Event Metadata
        em = make_simple_element('event_metadata', parent=c)
        cn = make_simple_element('conference_name', parent=em, text=booktitle)
        cl = make_simple_element('conference_location', parent=em, text=address)
        cd = make_simple_element('conference_date',
                                 parent=em,
                                 attrib={
                                     'start_year': year,
                                     'end_year': year,
                                     'start_month': start_month,
                                     'end_month': end_month
                                 })

        # Assemble Proceedings Metadata
        pm = make_simple_element('proceedings_metadata', parent=c, attrib={'language': 'en'})
        pt = make_simple_element('proceedings_title', parent=pm, text=booktitle)
        p = make_simple_element('publisher', parent=pm)
        pn = make_simple_element('publisher_name', parent=p, text=publisher)
        pp = make_simple_element('publisher_place', parent=p, text=PUBLISHER_PLACE)
        pd = make_simple_element('publication_date', parent=pm)
        y = make_simple_element('year', parent=pd, text=year)
        noisbn = make_simple_element('noisbn', parent=pm, attrib={'reason': 'simple_series'})

        # DOI assignation data for the proceedings volume itself
        dd = make_simple_element('doi_data', parent=pm)
        doi = make_simple_element('doi', parent=dd, text=DOI_PREFIX + url)
        resource = make_simple_element('resource', parent=dd, text=ANTHOLOGY_URL.format(url))

        for paper in v.findall('./paper'):
            ## Individual Paper Data
            # TODO: this is not future-proof, should use anthology.util library functions
            # Paper id is zero-padded based on the volume URL length
            # (6 chars -> 2-digit ids, 5 chars -> 3-digit ids)
            aa_id = ""
            if (len(url) == 6):
                aa_id = '{:02d}'.format(int(paper.attrib['id']))
            else:
                if (len(url) == 5):
                    aa_id = '{:03d}'.format(int(paper.attrib['id']))

            cp = make_simple_element('conference_paper', parent=c)

            # contributors: authors, first author gets sequence="first"
            contribs = make_simple_element('contributors', parent=cp)
            author_index = 0
            for author in paper.findall('./author'):
                pn = make_simple_element(
                    'person_name',
                    parent=contribs,
                    attrib={
                        'contributor_role': 'author',
                        'sequence': 'first' if author_index == 0 else 'additional'
                    })
                author_index += 1
                for name_part in author:
                    if name_part.tag == 'first':
                        gn = make_simple_element('given_name', parent=pn, text=name_part.text)
                    elif name_part.tag == 'last':
                        sn = make_simple_element('surname', text=name_part.text, parent=pn)

            for title in paper.iter(tag='title'):
                o_titles = make_simple_element('titles', parent=cp)
                o_title = make_simple_element('title', parent=o_titles, text=formatter.as_text(title))

            pd = make_simple_element('publication_date', parent=cp)
            o_year = make_simple_element('year', parent=pd)
            o_year.text = year

            for pages in paper.iter(tag='pages'):
                o_pages = make_simple_element('pages', parent=cp)
                fp = make_simple_element('first_page', parent=o_pages)
                lp = make_simple_element('last_page', parent=o_pages)
                # Page ranges use ASCII hyphen or en dash
                try:
                    fp.text = re.split('[-–]', pages.text)[0]
                    lp.text = re.split('[-–]', pages.text)[1]
                except IndexError as e:
                    # only one page
                    fp.text = pages.text
                    lp.text = pages.text

            # DOI assignation data for the individual paper
            dd = make_simple_element('doi_data', parent=cp)
            doi = make_simple_element('doi', parent=dd, text=DOI_PREFIX + url + aa_id)
            resource = make_simple_element('resource', parent=dd, text=ANTHOLOGY_URL.format(url + aa_id))

    # Emit the whole batch to stdout as pretty-printed XML
    print(
        etree.tostring(new_volume,
                       pretty_print=True,
                       encoding='UTF-8',
                       xml_declaration=True,
                       with_tail=True).decode('utf-8'))
def main(args):
    """Summarize newly added attachments, revisions, and errata.

    Reads a diff of the Anthology XML from stdin, considers only added lines
    (prefixed with "+"), and prints a human-readable summary grouped by
    change kind. Lines that fail to parse are reported on stderr and skipped.
    """
    scriptdir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data')
    anthology = Anthology(importdir=scriptdir)

    attachments = defaultdict(list)  # attachment type -> [(title, url), ...]
    revisions = []  # (title, url, explanation)
    errata = []  # (title, url)
    for line in sys.stdin:
        # Only lines the diff added are of interest
        if not line.startswith("+"):
            continue
        line = line[1:].strip()

        if line.startswith("<attachment"):
            match_str = rf'<attachment type="(\w+)">({ANTHOLOGY_ID_REGEX}).*'
            match = re.match(match_str, line)
            if match is None:
                # BUG FIX: the old bare except printed a warning but then fell
                # through and used a stale/undefined anthology_id; skip instead.
                print(f"* Couldn't match '{match_str}' to '{line}'", file=sys.stderr)
                continue
            attach_type, anthology_id = match.groups()
            attachments[attach_type].append((
                anthology.papers[anthology_id].get_title('plain'),
                ANTHOLOGY_URL.format(anthology_id),
            ))
        elif line.startswith("<revision"):
            match_str = rf'<revision.*href="({ANTHOLOGY_ID_REGEX}).*>.*'
            match = re.match(match_str, line)
            if match is None:
                print(f"* Couldn't match '{match_str}' to '{line}'", file=sys.stderr)
                continue
            anthology_id = match.group(1)
            paper = anthology.papers[anthology_id]
            # The newest revision entry carries the explanation text
            explanation = paper.attrib["revision"][-1]["explanation"]
            revisions.append((
                paper.get_title("plain"),
                ANTHOLOGY_URL.format(anthology_id),
                explanation,
            ))
        elif line.startswith("<errat"):
            match_str = rf"<errat.*?>({ANTHOLOGY_ID_REGEX}).*"
            match = re.match(match_str, line)
            if match is None:
                print(f"* Couldn't match '{match_str}' to '{line}'", file=sys.stderr)
                continue
            anthology_id = match.group(1)
            errata.append((
                anthology.papers[anthology_id].get_title('plain'),
                ANTHOLOGY_URL.format(anthology_id),
            ))

    inflector = inflect.engine()
    # BUG FIX: the loop variable no longer shadows the `attachments` dict
    for attach_type, attached in attachments.items():
        phrase = inflector.a(attach_type)  # e.g. "a poster", "an erratum"
        print(f"\nAdded {phrase}:")
        for title, url in attached:
            print("-", title, "\n ", url, "\n")

    if revisions:
        print("\nRevisions:")
        for title, url, explanation in revisions:
            print("-", title, "\n ", url, "\n ", explanation, "\n")

    if errata:
        print("\nErrata:")
        for title, url in errata:
            print("-", title, "\n ", url, "\n")
def main(args):
    """Register a revision or erratum for an Anthology paper.

    Adds a <revision>/<erratum> node to the collection XML, backs up the
    original PDF as v1 when this is the first revision, and installs the new
    file both under its versioned name and as the canonical PDF. All
    destructive steps are gated on args.do (dry run otherwise).
    """
    change_type = 'erratum' if args.erratum else 'revision'
    change_letter = 'e' if args.erratum else 'v'

    print(f'Processing {change_type} to {args.anthology_id}...')

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith('http'):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f'-> Downloading file from {args.path}', file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(
                    input_file_path, mode='wb') as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print('An SSL error was encountered in downloading the files.',
                  file=sys.stderr)
            sys.exit(1)
    else:
        input_file_path = args.path

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)

    # The new version number; stays None only if the paper lookup fails
    revno = None

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'xml',
                            f'{collection_id}.xml')
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(
        f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is None:
        print(
            f'-> FATAL: paper ID {args.anthology_id} not found in the Anthology',
            file=sys.stderr)
        sys.exit(1)

    # Errata start at 1; revisions start at 2 (the original counts as v1).
    # Existing nodes bump the number past the highest recorded id.
    revno = 1 if args.erratum else 2
    for revision in paper.findall(change_type):
        revno = int(revision.attrib['id']) + 1

    if args.do:
        revision = ET.Element(change_type)
        revision.attrib['id'] = str(revno)
        revision.attrib[
            'href'] = f'{args.anthology_id}{change_letter}{revno}'
        revision.text = args.explanation
        # Set tails to maintain proper indentation
        # BUG FIX: guard against a None tail (TypeError on `None += str`)
        if len(paper):
            paper[-1].tail = (paper[-1].tail or '') + '  '
        revision.tail = '\n    '  # newline and two levels of indent
        paper.append(revision)
        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
        print(f'-> Added {change_type} node "{revision.text}" to XML',
              file=sys.stderr)

    output_dir = os.path.join(args.anthology_dir, 'pdf', collection_id[0],
                              collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f'-> Creating directory {output_dir}', file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f'{args.anthology_id}.pdf')

    if not args.erratum and revno == 2:
        # There are no versioned files the first time around, so create the first one
        # (essentially backing up the original version)
        revised_file_v1_path = os.path.join(
            output_dir, f'{args.anthology_id}{change_letter}1.pdf')
        current_version = ANTHOLOGY_URL.format(args.anthology_id)
        if args.do:
            try:
                # BUG FIX: the backup is fetched from the live Anthology URL,
                # not args.path — the message now says so
                print(
                    f'-> Downloading file from {current_version} to {revised_file_v1_path}',
                    file=sys.stderr)
                with urllib.request.urlopen(current_version) as url, open(
                        revised_file_v1_path, mode='wb') as fh:
                    fh.write(url.read())
            except ssl.SSLError:
                print(
                    f'-> FATAL: An SSL error was encountered in downloading {current_version}.',
                    file=sys.stderr)
                sys.exit(1)
        else:
            # BUG FIX: "Downlading" typo, and same source-URL correction
            print(
                f'-> DRY RUN: Downloading file from {current_version} to {revised_file_v1_path}',
                file=sys.stderr)

    revised_file_versioned_path = os.path.join(
        output_dir, f'{args.anthology_id}{change_letter}{revno}.pdf')

    maybe_copy(input_file_path, revised_file_versioned_path, args.do)
    maybe_copy(input_file_path, canonical_path, args.do)

    # Clean up the temp file created for a downloaded input
    if args.path.startswith('http'):
        os.remove(input_file_path)