import os
import shutil
import urllib.parse
from argparse import Namespace
from pathlib import Path
from subprocess import call

import git
import regex
import requests
from bs4 import BeautifulSoup
from ftfy import fix_text
from pkg_resources import resource_filename

import se
import se.formatting


def create_draft(args: Namespace) -> int:
	"""
	Entry point for `se create-draft`
	"""

	if args.create_github_repo and not args.create_se_repo:
		se.print_error("--create-github-repo option specified, but --create-se-repo option not specified.")
		return se.InvalidInputException.code

	if args.pg_url and not regex.match("^https?://www.gutenberg.org/ebooks/[0-9]+$", args.pg_url):
		se.print_error("Project Gutenberg URL must look like: https://www.gutenberg.org/ebooks/<EBOOK-ID>")
		return se.InvalidInputException.code

	# Put together some variables for later use
	identifier = se.formatting.make_url_safe(args.author) + "/" + se.formatting.make_url_safe(args.title)
	title_string = args.title.replace("'", "’") + ", by " + args.author.replace("'", "’")
	sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", args.title)
	pg_producers = []

	if args.translator:
		identifier = identifier + "/" + se.formatting.make_url_safe(args.translator)
		title_string = title_string + ". Translated by " + args.translator

	if args.illustrator:
		identifier = identifier + "/" + se.formatting.make_url_safe(args.illustrator)
		title_string = title_string + ". Illustrated by " + args.illustrator

	repo_name = identifier.replace("/", "_")

	if os.path.isdir(repo_name):
		se.print_error("./{}/ already exists.".format(repo_name))
		return se.InvalidInputException.code

	# Download PG HTML and do some fixups
	if args.pg_url:
		args.pg_url = args.pg_url.replace("http://", "https://")

		# Get the ebook metadata
		try:
			response = requests.get(args.pg_url)
			pg_metadata_html = response.text
		except Exception as ex:
			se.print_error("Couldn’t download Project Gutenberg ebook metadata page. Error: {}".format(ex))
			return se.RemoteCommandErrorException.code

		soup = BeautifulSoup(pg_metadata_html, "lxml")

		# Get the ebook HTML URL from the metadata
		pg_ebook_url = None
		for element in soup.select("a[type^=\"text/html\"]"):
			pg_ebook_url = regex.sub("^//", "https://", element["href"])

		if not pg_ebook_url:
			se.print_error("Could download ebook metadata, but couldn’t find URL for the ebook HTML.")
			return se.RemoteCommandErrorException.code

		# Get the ebook LCSH categories
		pg_subjects = []
		for element in soup.select("td[property=\"dcterms:subject\"]"):
			if element["datatype"] == "dcterms:LCSH":
				for subject_link in element.find("a"):
					pg_subjects.append(subject_link.strip())

		# Get the PG publication date
		pg_publication_year = None
		for element in soup.select("td[itemprop=\"datePublished\"]"):
			pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", element.text)

		# Get the actual ebook URL
		try:
			response = requests.get(pg_ebook_url)
			pg_ebook_html = response.text
		except Exception as ex:
			se.print_error("Couldn’t download Project Gutenberg ebook HTML. Error: {}".format(ex))
			return se.RemoteCommandErrorException.code

		try:
			fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
			pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
		except Exception as ex:
			se.print_error("Couldn’t determine text encoding of Project Gutenberg HTML file. Error: {}".format(ex))
			return se.InvalidEncodingException.code

		# Try to guess the ebook language
		pg_language = "en-US"
		if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
			pg_language = "en-GB"

	# Create necessary directories
	os.makedirs(os.path.join(repo_name, "images"))
	os.makedirs(os.path.join(repo_name, "src", "epub", "css"))
	os.makedirs(os.path.join(repo_name, "src", "epub", "images"))
	os.makedirs(os.path.join(repo_name, "src", "epub", "text"))
	os.makedirs(os.path.join(repo_name, "src", "META-INF"))

	# Write PG data if we have it
	if args.pg_url and pg_ebook_html:
		soup = BeautifulSoup(pg_ebook_html, "html.parser")

		# Try to get the PG producers. We only try this if there's a <pre> block with the header info (which is not always the case)
		for element in soup(text=regex.compile(r"\*\*\*\s*Produced by.+$", flags=regex.DOTALL)):
			if element.parent.name == "pre":
				pg_producers = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", element, flags=regex.DOTALL)
				pg_producers = regex.sub(r"\(.+?\)", "", pg_producers, flags=regex.DOTALL)
				pg_producers = regex.sub(r"(at )?https?://www\.pgdp\.net", "", pg_producers, flags=regex.DOTALL)
				pg_producers = regex.sub(r"[\r\n]+", " ", pg_producers, flags=regex.DOTALL)
				pg_producers = regex.sub(r",? and ", ", and ", pg_producers)
				pg_producers = pg_producers.replace(" and the Online", " and The Online")
				pg_producers = pg_producers.replace(", and ", ", ").strip().split(", ")

		# Try to strip out the PG header
		for element in soup(text=regex.compile(r"\*\*\*\s*START OF THIS")):
			for sibling in element.parent.find_previous_siblings():
				sibling.decompose()
			element.parent.decompose()

		# Try to strip out the PG license footer
		for element in soup(text=regex.compile(r"End of (the )?Project Gutenberg")):
			for sibling in element.parent.find_next_siblings():
				sibling.decompose()
			element.parent.decompose()

		with open(os.path.join(repo_name, "src", "epub", "text", "body.xhtml"), "w") as file:
			file.write(str(soup))

	# Copy over templates
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "gitignore")), os.path.normpath(repo_name + "/.gitignore"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "LICENSE.md")), os.path.normpath(repo_name + "/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "META-INF", "container.xml")), os.path.normpath(repo_name + "/src/META-INF/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "mimetype")), os.path.normpath(repo_name + "/src/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "content.opf")), os.path.normpath(repo_name + "/src/epub/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "onix.xml")), os.path.normpath(repo_name + "/src/epub/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "toc.xhtml")), os.path.normpath(repo_name + "/src/epub/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "core.css")), os.path.normpath(repo_name + "/src/epub/css/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "local.css")), os.path.normpath(repo_name + "/src/epub/css/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "logo.svg")), os.path.normpath(repo_name + "/src/epub/images/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "colophon.xhtml")), os.path.normpath(repo_name + "/src/epub/text/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "imprint.xhtml")), os.path.normpath(repo_name + "/src/epub/text/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "titlepage.xhtml")), os.path.normpath(repo_name + "/src/epub/text/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "uncopyright.xhtml")), os.path.normpath(repo_name + "/src/epub/text/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "titlepage.svg")), os.path.normpath(repo_name + "/images/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "cover.jpg")), os.path.normpath(repo_name + "/images/cover.jpg"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "cover.svg")), os.path.normpath(repo_name + "/images/cover.svg"))

	# Try to find Wikipedia links if possible
	author_wiki_url, author_nacoaf_url = _get_wikipedia_url(args.author, True)
	ebook_wiki_url, _ = _get_wikipedia_url(args.title, False)
	translator_wiki_url = None
	if args.translator:
		translator_wiki_url, translator_nacoaf_url = _get_wikipedia_url(args.translator, True)

	# Pre-fill a few templates
	se.replace_in_file(os.path.normpath(repo_name + "/src/epub/text/titlepage.xhtml"), "TITLESTRING", title_string)
	se.replace_in_file(os.path.normpath(repo_name + "/images/titlepage.svg"), "TITLESTRING", title_string)
	se.replace_in_file(os.path.normpath(repo_name + "/images/cover.svg"), "TITLESTRING", title_string)

	# Create the titlepage SVG
	contributors = {}
	if args.translator:
		contributors["translated by"] = args.translator
	if args.illustrator:
		contributors["illustrated by"] = args.illustrator

	with open(os.path.join(repo_name, "images", "titlepage.svg"), "w") as file:
		file.write(_generate_titlepage_svg(args.title, args.author, contributors, title_string))

	# Create the cover SVG
	with open(os.path.join(repo_name, "images", "cover.svg"), "w") as file:
		file.write(_generate_cover_svg(args.title, args.author, title_string))

	if args.pg_url:
		se.replace_in_file(os.path.normpath(repo_name + "/src/epub/text/imprint.xhtml"), "PGLINK", args.pg_url)

	with open(os.path.join(repo_name, "src", "epub", "text", "colophon.xhtml"), "r+", encoding="utf-8") as file:
		colophon_xhtml = file.read()

		colophon_xhtml = colophon_xhtml.replace("SEIDENTIFIER", identifier)
		colophon_xhtml = colophon_xhtml.replace(">AUTHOR<", ">{}<".format(args.author))
		colophon_xhtml = colophon_xhtml.replace("TITLE", args.title)

		if author_wiki_url:
			colophon_xhtml = colophon_xhtml.replace("AUTHORWIKILINK", author_wiki_url)

		if args.pg_url:
			colophon_xhtml = colophon_xhtml.replace("PGLINK", args.pg_url)

			if pg_publication_year:
				colophon_xhtml = colophon_xhtml.replace("PG_YEAR", pg_publication_year)

			if pg_producers:
				producers_xhtml = ""
				for i, producer in enumerate(pg_producers):
					if "Distributed Proofreading" in producer:
						producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
					else:
						producers_xhtml = producers_xhtml + "<span class=\"name\">{}</span>".format(producer)

					if i < len(pg_producers) - 1:
						producers_xhtml = producers_xhtml + ", "

					if i == len(pg_producers) - 2:
						producers_xhtml = producers_xhtml + "and "

				producers_xhtml = producers_xhtml + "<br/>"

				colophon_xhtml = colophon_xhtml.replace("<span class=\"name\">TRANSCRIBER1</span>, <span class=\"name\">TRANSCRIBER2</span>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>", producers_xhtml)

		file.seek(0)
		file.write(colophon_xhtml)
		file.truncate()

	with open(os.path.join(repo_name, "src", "epub", "content.opf"), "r+", encoding="utf-8") as file:
		metadata_xhtml = file.read()

		metadata_xhtml = metadata_xhtml.replace("SEIDENTIFIER", identifier)
		metadata_xhtml = metadata_xhtml.replace(">AUTHOR<", ">{}<".format(args.author))
		metadata_xhtml = metadata_xhtml.replace(">TITLESORT<", ">{}<".format(sorted_title))
		metadata_xhtml = metadata_xhtml.replace(">TITLE<", ">{}<".format(args.title))
		metadata_xhtml = metadata_xhtml.replace("VCSIDENTIFIER", repo_name)

		if pg_producers:
			producers_xhtml = ""
			i = 1
			for producer in pg_producers:
				producers_xhtml = producers_xhtml + "\t\t<dc:contributor id=\"transcriber-{}\">{}</dc:contributor>\n".format(i, producer)

				if "Distributed Proofreading" in producer:
					producers_xhtml = producers_xhtml + "\t\t<meta property=\"file-as\" refines=\"#transcriber-{}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{}\">https://pgdp.net</meta>\n".format(i, i)
				else:
					producers_xhtml = producers_xhtml + "\t\t<meta property=\"file-as\" refines=\"#transcriber-{}\">TRANSCRIBERSORT</meta>\n".format(i)

				producers_xhtml = producers_xhtml + "\t\t<meta property=\"role\" refines=\"#transcriber-{}\" scheme=\"marc:relators\">trc</meta>\n".format(i)

				i = i + 1

			metadata_xhtml = regex.sub(r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBERSORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">LINK</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>", "\t\t" + producers_xhtml.strip(), metadata_xhtml, flags=regex.DOTALL)

		if author_wiki_url:
			metadata_xhtml = metadata_xhtml.replace(">AUTHORWIKILINK<", ">{}<".format(author_wiki_url))

		if author_nacoaf_url:
			metadata_xhtml = metadata_xhtml.replace(">AUTHORNACOAFLINK<", ">{}<".format(author_nacoaf_url))

		if ebook_wiki_url:
			metadata_xhtml = metadata_xhtml.replace(">EBOOKWIKILINK<", ">{}<".format(ebook_wiki_url))

		if args.translator:
			metadata_xhtml = metadata_xhtml.replace(">TRANSLATOR<", ">{}<".format(args.translator))

			if translator_wiki_url:
				metadata_xhtml = metadata_xhtml.replace(">TRANSLATORWIKILINK<", ">{}<".format(translator_wiki_url))

			if translator_nacoaf_url:
				metadata_xhtml = metadata_xhtml.replace(">TRANSLATORNACOAFLINK<", ">{}<".format(translator_nacoaf_url))
		else:
			metadata_xhtml = regex.sub(r"<dc:contributor id=\"translator\">.+?<dc:contributor id=\"artist\">", "<dc:contributor id=\"artist\">", metadata_xhtml, flags=regex.DOTALL)

		if args.pg_url:
			if pg_subjects:
				subject_xhtml = ""

				i = 1
				for subject in pg_subjects:
					subject_xhtml = subject_xhtml + "\t\t<dc:subject id=\"subject-{}\">{}</dc:subject>\n".format(i, subject)
					i = i + 1

				i = 1
				for subject in pg_subjects:
					subject_xhtml = subject_xhtml + "\t\t<meta property=\"meta-auth\" refines=\"#subject-{}\">{}</meta>\n".format(i, args.pg_url)
					i = i + 1

				metadata_xhtml = regex.sub(r"\t\t<dc:subject id=\"subject-1\">SUBJECT1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT2</dc:subject>\s*<meta property=\"meta-auth\" refines=\"#subject-1\">LOCLINK1</meta>\s*<meta property=\"meta-auth\" refines=\"#subject-2\">LOCLINK2</meta>", "\t\t" + subject_xhtml.strip(), metadata_xhtml)

			metadata_xhtml = metadata_xhtml.replace("<dc:language>LANG</dc:language>", "<dc:language>{}</dc:language>".format(pg_language))
			metadata_xhtml = metadata_xhtml.replace("<dc:source>LINK</dc:source>", "<dc:source>{}</dc:source>".format(args.pg_url))

		file.seek(0)
		file.write(metadata_xhtml)
		file.truncate()

	# Set up local git repo
	repo = git.Repo.init(repo_name)

	if args.email:
		with repo.config_writer() as config:
			config.set_value("user", "email", args.email)

	# Set up remote git repos
	if args.create_se_repo:
		git_command = git.cmd.Git(repo_name)
		git_command.remote("add", "origin", "standardebooks.org:/standardebooks.org/ebooks/{}.git".format(repo_name))

		# Set git to automatically push to SE
		git_command.config("branch.master.remote", "origin")
		git_command.config("branch.master.merge", "refs/heads/master")

		github_option = ""
		if args.create_github_repo:
			github_option = "--github"

		return_code = call(["ssh", "standardebooks.org", "/standardebooks.org/scripts/init-se-repo --repo-name={} --title-string=\"{}\" {}".format(repo_name, title_string, github_option)])
		if return_code != 0:
			se.print_error("Failed to create repository on Standard Ebooks server: ssh returned code {}.".format(return_code))
			return se.RemoteCommandErrorException.code

	return 0
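
# A minimal sketch of how this return-code variant of create_draft() might be
# wired up from a command-line entry point. The option names below are
# assumptions inferred from the attributes the function reads (args.author,
# args.pg_url, args.create_se_repo, and so on); they are not taken from this
# file, and the real `se` CLI surface may differ.
def _main_sketch() -> None:
	# Imports kept local to this illustrative sketch
	import argparse
	import sys

	parser = argparse.ArgumentParser(description="Create a skeleton of a new Standard Ebook.")
	parser.add_argument("-a", "--author", required=True, help="the author of the ebook")
	parser.add_argument("-t", "--title", required=True, help="the title of the ebook")
	parser.add_argument("-r", "--translator", help="the translator of the ebook")
	parser.add_argument("-i", "--illustrator", help="the illustrator of the ebook")
	parser.add_argument("-p", "--pg-url", dest="pg_url", help="the Project Gutenberg URL for the source ebook")
	parser.add_argument("-e", "--email", help="use this email address as the git committer")
	parser.add_argument("-s", "--create-se-repo", action="store_true", help="create a repository on the Standard Ebooks server")
	parser.add_argument("-g", "--create-github-repo", action="store_true", help="also create a GitHub repository (requires --create-se-repo)")

	# create_draft() returns 0 on success or an exception's .code on failure,
	# so its result maps directly to the process exit status
	sys.exit(create_draft(parser.parse_args()))
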
def create_draft(args: Namespace):
	"""
	Entry point for `se create-draft`
	"""

	# Put together some variables for later use
	identifier = se.formatting.make_url_safe(args.author) + "/" + se.formatting.make_url_safe(args.title)
	title_string = args.title.replace("'", "’") + ", by " + args.author.replace("'", "’")
	sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", args.title)
	pg_producers = []

	if args.translator:
		identifier = identifier + "/" + se.formatting.make_url_safe(args.translator)
		title_string = title_string + ". Translated by " + args.translator

	if args.illustrator:
		identifier = identifier + "/" + se.formatting.make_url_safe(args.illustrator)
		title_string = title_string + ". Illustrated by " + args.illustrator

	repo_name = Path(identifier.replace("/", "_"))

	if repo_name.is_dir():
		raise se.InvalidInputException(f"./{repo_name}/ already exists.")

	# Download PG HTML and do some fixups
	if args.pg_url:
		args.pg_url = args.pg_url.replace("http://", "https://")

		# Get the ebook metadata
		try:
			response = requests.get(args.pg_url)
			pg_metadata_html = response.text
		except Exception as ex:
			raise se.RemoteCommandErrorException(f"Couldn’t download Project Gutenberg ebook metadata page. Error: {ex}")

		soup = BeautifulSoup(pg_metadata_html, "lxml")

		# Get the ebook HTML URL from the metadata
		pg_ebook_url = None
		for element in soup.select("a[type^=\"text/html\"]"):
			pg_ebook_url = regex.sub(r"^//", "https://", element["href"])
			pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/", pg_ebook_url)

		if not pg_ebook_url:
			raise se.RemoteCommandErrorException("Could download ebook metadata, but couldn’t find URL for the ebook HTML.")

		# Get the ebook LCSH categories
		pg_subjects = []
		for element in soup.select("td[property=\"dcterms:subject\"]"):
			if element["datatype"] == "dcterms:LCSH":
				for subject_link in element.find("a"):
					pg_subjects.append(subject_link.strip())

		# Get the PG publication date
		pg_publication_year = None
		for element in soup.select("td[itemprop=\"datePublished\"]"):
			pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", element.text)

		# Get the actual ebook URL
		try:
			response = requests.get(pg_ebook_url)
			pg_ebook_html = response.text
		except Exception as ex:
			raise se.RemoteCommandErrorException(f"Couldn’t download Project Gutenberg ebook HTML. Error: {ex}")

		try:
			fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
			pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
		except Exception as ex:
			raise se.InvalidEncodingException(f"Couldn’t determine text encoding of Project Gutenberg HTML file. Error: {ex}")

		# Try to guess the ebook language
		pg_language = "en-US"
		if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
			pg_language = "en-GB"

	# Create necessary directories
	(repo_name / "images").mkdir(parents=True)
	(repo_name / "src" / "epub" / "css").mkdir(parents=True)
	(repo_name / "src" / "epub" / "images").mkdir(parents=True)
	(repo_name / "src" / "epub" / "text").mkdir(parents=True)
	(repo_name / "src" / "META-INF").mkdir(parents=True)

	is_pg_html_parsed = True

	# Write PG data if we have it
	if args.pg_url and pg_ebook_html:
		try:
			soup = BeautifulSoup(pg_ebook_html, "html.parser")

			# Try to get the PG producers. We only try this if there's a <pre> block with the header info (which is not always the case)
			for element in soup(text=regex.compile(r"\*\*\*\s*Produced by.+$", flags=regex.DOTALL)):
				if element.parent.name == "pre":
					producers_text = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", element, flags=regex.DOTALL)
					producers_text = regex.sub(r"\(.+?\)", "", producers_text, flags=regex.DOTALL)
					producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net", "", producers_text, flags=regex.DOTALL)
					producers_text = regex.sub(r"[\r\n]+", " ", producers_text, flags=regex.DOTALL)
					producers_text = regex.sub(r",? and ", ", and ", producers_text)
					producers_text = producers_text.replace(" and the Online", " and The Online")
					producers_text = producers_text.replace(", and ", ", ").strip()

					pg_producers = producers_text.split(", ")

			# Try to strip out the PG header
			for element in soup(text=regex.compile(r"\*\*\*\s*START OF THIS")):
				for sibling in element.parent.find_previous_siblings():
					sibling.decompose()
				element.parent.decompose()

			# Try to strip out the PG license footer
			for element in soup(text=regex.compile(r"End of (the )?Project Gutenberg")):
				for sibling in element.parent.find_next_siblings():
					sibling.decompose()
				element.parent.decompose()

			with open(repo_name / "src" / "epub" / "text" / "body.xhtml", "w", encoding="utf-8") as file:
				file.write(str(soup))
		except OSError as ex:
			raise se.InvalidFileException(f"Couldn’t write to ebook directory. Error: {ex}")
		except:
			# Save this error for later, because it's still useful to complete the create-draft process
			# even if we've failed to parse PG's HTML source.
			is_pg_html_parsed = False
			se.quiet_remove(repo_name / "src" / "epub" / "text" / "body.xhtml")

	# Copy over templates
	_copy_template_file("gitignore", repo_name / ".gitignore")
	_copy_template_file("LICENSE.md", repo_name)
	_copy_template_file("container.xml", repo_name / "src" / "META-INF")
	_copy_template_file("mimetype", repo_name / "src")
	_copy_template_file("content.opf", repo_name / "src" / "epub")
	_copy_template_file("onix.xml", repo_name / "src" / "epub")
	_copy_template_file("toc.xhtml", repo_name / "src" / "epub")
	_copy_template_file("core.css", repo_name / "src" / "epub" / "css")
	_copy_template_file("local.css", repo_name / "src" / "epub" / "css")
	_copy_template_file("logo.svg", repo_name / "src" / "epub" / "images")
	_copy_template_file("colophon.xhtml", repo_name / "src" / "epub" / "text")
	_copy_template_file("imprint.xhtml", repo_name / "src" / "epub" / "text")
	_copy_template_file("titlepage.xhtml", repo_name / "src" / "epub" / "text")
	_copy_template_file("uncopyright.xhtml", repo_name / "src" / "epub" / "text")
	_copy_template_file("titlepage.svg", repo_name / "images")
	_copy_template_file("cover.jpg", repo_name / "images" / "cover.jpg")
	_copy_template_file("cover.svg", repo_name / "images" / "cover.svg")

	# Try to find Wikipedia links if possible
	author_wiki_url, author_nacoaf_url = _get_wikipedia_url(args.author, True)
	ebook_wiki_url, _ = _get_wikipedia_url(args.title, False)
	translator_wiki_url = None
	if args.translator:
		translator_wiki_url, translator_nacoaf_url = _get_wikipedia_url(args.translator, True)

	# Pre-fill a few templates
	se.replace_in_file(repo_name / "src" / "epub" / "text" / "titlepage.xhtml", "TITLE_STRING", title_string)
	se.replace_in_file(repo_name / "images" / "titlepage.svg", "TITLE_STRING", title_string)
	se.replace_in_file(repo_name / "images" / "cover.svg", "TITLE_STRING", title_string)

	# Create the titlepage SVG
	contributors = {}
	if args.translator:
		contributors["translated by"] = args.translator
	if args.illustrator:
		contributors["illustrated by"] = args.illustrator

	with open(repo_name / "images" / "titlepage.svg", "w", encoding="utf-8") as file:
		file.write(_generate_titlepage_svg(args.title, args.author, contributors, title_string))

	# Create the cover SVG
	with open(repo_name / "images" / "cover.svg", "w", encoding="utf-8") as file:
		file.write(_generate_cover_svg(args.title, args.author, title_string))

	if args.pg_url:
		se.replace_in_file(repo_name / "src" / "epub" / "text" / "imprint.xhtml", "PG_URL", args.pg_url)

	with open(repo_name / "src" / "epub" / "text" / "colophon.xhtml", "r+", encoding="utf-8") as file:
		colophon_xhtml = file.read()

		colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
		colophon_xhtml = colophon_xhtml.replace(">AUTHOR<", f">{args.author}<")
		colophon_xhtml = colophon_xhtml.replace("TITLE", args.title)

		if author_wiki_url:
			colophon_xhtml = colophon_xhtml.replace("AUTHOR_WIKI_URL", author_wiki_url)

		if args.pg_url:
			colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

			if pg_publication_year:
				colophon_xhtml = colophon_xhtml.replace("PG_YEAR", pg_publication_year)

			if pg_producers:
				producers_xhtml = ""
				for i, producer in enumerate(pg_producers):
					if "Distributed Proofreading" in producer:
						producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
					else:
						producers_xhtml = producers_xhtml + f"<b class=\"name\">{producer}</b>"

					if i < len(pg_producers) - 1:
						producers_xhtml = producers_xhtml + ", "

					if i == len(pg_producers) - 2:
						producers_xhtml = producers_xhtml + "and "

				producers_xhtml = producers_xhtml + "<br/>"

				colophon_xhtml = colophon_xhtml.replace("<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>", producers_xhtml)

		file.seek(0)
		file.write(colophon_xhtml)
		file.truncate()

	with open(repo_name / "src" / "epub" / "content.opf", "r+", encoding="utf-8") as file:
		metadata_xhtml = file.read()

		metadata_xhtml = metadata_xhtml.replace("SE_IDENTIFIER", identifier)
		metadata_xhtml = metadata_xhtml.replace(">AUTHOR<", f">{args.author}<")
		metadata_xhtml = metadata_xhtml.replace(">TITLE_SORT<", f">{sorted_title}<")
		metadata_xhtml = metadata_xhtml.replace(">TITLE<", f">{args.title}<")
		metadata_xhtml = metadata_xhtml.replace("VCS_IDENTIFIER", str(repo_name))

		if pg_producers:
			producers_xhtml = ""
			i = 1
			for producer in pg_producers:
				producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">{producer}</dc:contributor>\n"

				if "Distributed Proofreading" in producer:
					producers_xhtml = producers_xhtml + "\t\t<meta property=\"file-as\" refines=\"#transcriber-{0}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{0}\">https://pgdp.net</meta>\n".format(i)
				else:
					producers_xhtml = producers_xhtml + f"\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">TRANSCRIBER_SORT</meta>\n"

				producers_xhtml = producers_xhtml + f"\t\t<meta property=\"role\" refines=\"#transcriber-{i}\" scheme=\"marc:relators\">trc</meta>\n"

				i = i + 1

			metadata_xhtml = regex.sub(r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>", "\t\t" + producers_xhtml.strip(), metadata_xhtml, flags=regex.DOTALL)

		if author_wiki_url:
			metadata_xhtml = metadata_xhtml.replace(">AUTHOR_WIKI_URL<", f">{author_wiki_url}<")

		if author_nacoaf_url:
			metadata_xhtml = metadata_xhtml.replace(">AUTHOR_NACOAF_URL<", f">{author_nacoaf_url}<")

		if ebook_wiki_url:
			metadata_xhtml = metadata_xhtml.replace(">EBOOK_WIKI_URL<", f">{ebook_wiki_url}<")

		if args.translator:
			metadata_xhtml = metadata_xhtml.replace(">TRANSLATOR<", f">{args.translator}<")

			if translator_wiki_url:
				metadata_xhtml = metadata_xhtml.replace(">TRANSLATOR_WIKI_URL<", f">{translator_wiki_url}<")

			if translator_nacoaf_url:
				metadata_xhtml = metadata_xhtml.replace(">TRANSLATOR_NACOAF_URL<", f">{translator_nacoaf_url}<")
		else:
			metadata_xhtml = regex.sub(r"<dc:contributor id=\"translator\">.+?<dc:contributor id=\"artist\">", "<dc:contributor id=\"artist\">", metadata_xhtml, flags=regex.DOTALL)

		if args.pg_url:
			if pg_subjects:
				subject_xhtml = ""

				i = 1
				for subject in pg_subjects:
					subject_xhtml = subject_xhtml + f"\t\t<dc:subject id=\"subject-{i}\">{subject}</dc:subject>\n"
					i = i + 1

				i = 1
				for subject in pg_subjects:
					subject_xhtml = subject_xhtml + f"\t\t<meta property=\"authority\" refines=\"#subject-{i}\">LCSH</meta>\n"

					# Now, get the LCSH ID by querying LCSH directly.
					try:
						response = requests.get("http://id.loc.gov/search/?q=%22{}%22".format(urllib.parse.quote(subject)))
						result = regex.search(r"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{}</a>".format(regex.escape(subject.replace(" -- ", "--"))), response.text)

						loc_id = "Unknown"
						try:
							loc_id = result.group(1)
						except Exception:
							pass

						subject_xhtml = subject_xhtml + f"\t\t<meta property=\"term\" refines=\"#subject-{i}\">{loc_id}</meta>\n"
					except Exception as ex:
						raise se.RemoteCommandErrorException(f"Couldn’t connect to id.loc.gov. Error: {ex}")

					i = i + 1

				metadata_xhtml = regex.sub(r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>", "\t\t" + subject_xhtml.strip(), metadata_xhtml)

			metadata_xhtml = metadata_xhtml.replace("<dc:language>LANG</dc:language>", f"<dc:language>{pg_language}</dc:language>")
			metadata_xhtml = metadata_xhtml.replace("<dc:source>PG_URL</dc:source>", f"<dc:source>{args.pg_url}</dc:source>")

		file.seek(0)
		file.write(metadata_xhtml)
		file.truncate()

	# Set up local git repo
	repo = git.Repo.init(repo_name)

	if args.email:
		with repo.config_writer() as config:
			config.set_value("user", "email", args.email)

	# Set up remote git repos
	if args.create_se_repo:
		git_command = git.cmd.Git(repo_name)
		git_command.remote("add", "origin", f"standardebooks.org:/standardebooks.org/ebooks/{repo_name}.git")

		# Set git to automatically push to SE
		git_command.config("branch.master.remote", "origin")
		git_command.config("branch.master.merge", "refs/heads/master")

		github_option = ""
		if args.create_github_repo:
			github_option = "--github"

		return_code = call(["ssh", "standardebooks.org", f"/standardebooks.org/scripts/init-se-repo --repo-name={repo_name} --title-string=\"{title_string}\" {github_option}"])
		if return_code != 0:
			raise se.RemoteCommandErrorException(f"Failed to create repository on Standard Ebooks server: ssh returned code {return_code}.")

	if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
		raise se.InvalidXhtmlException("Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook.")
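
# _copy_template_file() is called throughout the refactored version above but
# isn't defined in this file. This is a minimal sketch of its assumed behavior,
# inferred from the call sites: copy the named template out of se's bundled
# data/templates directory to dest_path (shutil.copy keeps the template's own
# filename when dest_path is an existing directory). The recursive search is an
# assumption, made so that templates living in subdirectories, like
# META-INF/container.xml, are also found.
def _copy_template_file(filename: str, dest_path: Path) -> None:
	"""
	Copy a bundled template file to the given destination Path.
	"""
	templates_path = Path(resource_filename("se", str(Path("data") / "templates")))

	# Take the first match; this raises StopIteration if no such template exists
	source_path = next(templates_path.glob(f"**/{filename}"))

	shutil.copy(source_path, dest_path)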