def build_images() -> int:
    """
    Entry point for `se build-images`

    Parses the command line, then for each DIRECTORY argument strips metadata
    from every `cover.*` file found beneath it and asks `SeEpub` to generate
    the cover and titlepage SVGs.

    Returns:
        0 when every directory was processed. If an `se.SeException` is
        raised, it is reported with `se.print_error` and its `code` is
        returned immediately; remaining directories are not processed.
    """

    parser = argparse.ArgumentParser(
        description="Build ebook covers and titlepages for a Standard Ebook source directory, and place the output in DIRECTORY/src/epub/images/."
    )
    parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
    parser.add_argument("directories", metavar="DIRECTORY", nargs="+", help="a Standard Ebooks source directory")
    args = parser.parse_args()

    for directory in args.directories:
        directory = Path(directory)

        if args.verbose:
            # Echo the path as the user typed it, before it is resolved
            print(f"Processing {directory} ...")

        directory = directory.resolve()

        try:
            # Load the ebook inside the `try` so that an `se.SeException`
            # raised here is reported and turned into a return code like
            # every other failure, instead of escaping to the caller
            se_epub = SeEpub(directory)

            if args.verbose:
                print("\tCleaning metadata ...", end="", flush=True)

            # Remove useless metadata from cover source files
            for root, _, filenames in os.walk(directory):
                for filename in fnmatch.filter(filenames, "cover.*"):
                    se.images.remove_image_metadata(Path(root) / filename)

            if args.verbose:
                print(" OK")
                print("\tBuilding cover.svg ...", end="", flush=True)

            se_epub.generate_cover_svg()

            if args.verbose:
                print(" OK")
                print("\tBuilding titlepage.svg ...", end="", flush=True)

            se_epub.generate_titlepage_svg()

            if args.verbose:
                print(" OK")

        except se.SeException as ex:
            se.print_error(ex)
            return ex.code

    return 0
def build_images() -> int:
    """
    Entry point for `se build-images`

    Parses the command line, then for each DIRECTORY argument asks `SeEpub`
    to generate the cover and titlepage SVGs.

    Returns:
        0 when every directory was processed. If an `se.SeException` is
        raised, it is reported with `se.print_error` and its `code` is
        returned immediately; remaining directories are not processed.
    """

    parser = argparse.ArgumentParser(
        description="Build ebook covers and titlepages for a Standard Ebook source directory, and place the output in DIRECTORY/src/epub/images/."
    )
    parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
    parser.add_argument("directories", metavar="DIRECTORY", nargs="+", help="a Standard Ebooks source directory")
    args = parser.parse_args()

    for directory in args.directories:
        directory = Path(directory)

        if args.verbose:
            # Echo the path as the user typed it, before it is resolved
            print(f"Processing {directory} ...")

        directory = directory.resolve()

        try:
            # Load the ebook inside the `try` so that an `se.SeException`
            # raised here is reported and turned into a return code like
            # every other failure, instead of escaping to the caller
            se_epub = SeEpub(directory)

            if args.verbose:
                print("\tBuilding cover.svg ...", end="", flush=True)

            se_epub.generate_cover_svg()

            if args.verbose:
                print(" OK")
                print("\tBuilding titlepage.svg ...", end="", flush=True)

            se_epub.generate_titlepage_svg()

            if args.verbose:
                print(" OK")

        except se.SeException as ex:
            se.print_error(ex)
            return ex.code

    return 0
def build_images(plain_output: bool) -> int:
    """
    Entry point for `se build-images`

    For each DIRECTORY given on the command line: load it as an `SeEpub`,
    strip metadata from every `cover.*` file beneath it, then generate the
    cover and titlepage SVGs.

    Args:
        plain_output: Forwarded to `se.prep_output` for every message that
            contains console markup.

    Returns:
        0 on success, or the `code` of the first `se.SeException` raised
        (after it has been reported with `se.print_error`).
    """

    arg_parser = argparse.ArgumentParser(
        description="Build ebook covers and titlepages for a Standard Ebook source directory, and place the output in DIRECTORY/src/epub/images/."
    )
    arg_parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
    arg_parser.add_argument("directories", metavar="DIRECTORY", nargs="+", help="a Standard Ebooks source directory")
    args = arg_parser.parse_args()
    verbose = args.verbose

    # Syntax highlighting will do weird things when printing paths;
    # force_terminal prints colors when called from GNU Parallel
    console = Console(
        highlight=False,
        theme=se.RICH_THEME,
        force_terminal=se.is_called_from_parallel(),
    )

    for raw_directory in args.directories:
        ebook_dir = Path(raw_directory).resolve()

        if verbose:
            processing_message = f"Processing [path][link=file://{ebook_dir}]{ebook_dir}[/][/] ..."
            console.print(se.prep_output(processing_message, plain_output))

        try:
            epub = SeEpub(ebook_dir)

            if verbose:
                console.print("\tCleaning metadata ...", end="")

            # Remove useless metadata from cover source files
            for cover_source in ebook_dir.glob("**/cover.*"):
                se.images.remove_image_metadata(cover_source)

            # Each step first closes the previous progress line with " OK",
            # announces itself, and then runs
            build_steps = (
                ("cover.svg", epub.generate_cover_svg),
                ("titlepage.svg", epub.generate_titlepage_svg),
            )

            for svg_name, generate_svg in build_steps:
                if verbose:
                    console.print(" OK")
                    building_message = f"\tBuilding [path][link=file://{ebook_dir / 'src/epub/images' / svg_name}]{svg_name}[/][/] ..."
                    console.print(se.prep_output(building_message, plain_output), end="")

                generate_svg()

            if verbose:
                console.print(" OK")

        except se.SeException as ex:
            se.print_error(ex)
            return ex.code

    return 0
def _create_draft(args: Namespace):
    """
    Implementation for `se create-draft`

    Builds a new ebook draft directory, named after the author(s), title,
    translator(s) and illustrator(s), relative to the current working
    directory. The steps, in order:

    1. Derive the title string, identifier and repository name.
    2. Unless `args.offline`, look up Wikipedia/NACOAF URLs for each
       non-anonymous contributor.
    3. If `args.pg_url` is set, download the Project Gutenberg metadata page
       and ebook HTML, and extract subjects, publication year, a language
       guess and the list of producers.
    4. Create the directory skeleton, copy the template files and fill in the
       titlepage, cover, imprint, colophon and `content.opf`.
    5. Initialise a git repository in the new directory.

    Args:
        args: Parsed command-line namespace. The attributes read are `title`,
            `author`, `translator`, `illustrator`, `offline`, `pg_url` and
            `email`. `author`, `translator` and `illustrator` are iterated as
            sequences of names; `args.pg_url` is rewritten in place to use
            `https://`.

    Raises:
        se.InvalidInputException: The target directory already exists.
        se.RemoteCommandErrorException: `pg_url` was combined with `offline`,
            a download failed, the ebook HTML URL could not be found, or
            id.loc.gov could not be reached.
        se.InvalidEncodingException: The downloaded HTML could not be fixed up.
        se.InvalidFileException: `body.xhtml` could not be written.
        se.InvalidXhtmlException: Raised at the very end, after the draft has
            otherwise been completed, if the Project Gutenberg HTML could not
            be parsed.
    """

    # Put together some variables for later use
    pg_producers = []

    title = args.title.replace("'", "’")

    authors = [
        {"name": author.replace("'", "’"), "wiki_url": None, "nacoaf_url": None}
        for author in args.author
    ]
    # `translator`/`illustrator` may be unset, in which case the lists stay empty
    translators = [
        {"name": translator.replace("'", "’"), "wiki_url": None, "nacoaf_url": None}
        for translator in args.translator or []
    ]
    illustrators = [
        {"name": illustrator.replace("'", "’"), "wiki_url": None, "nacoaf_url": None}
        for illustrator in args.illustrator or []
    ]

    title_string = title
    if authors and authors[0]["name"].lower() != "anonymous":
        title_string += ", by " + _generate_contributor_string(authors, False)

    # Contributors of the same kind are joined with "_"; the different kinds
    # (and the title) are separated with "/"
    identifier = "_".join(se.formatting.make_url_safe(author["name"]) for author in authors)
    identifier = identifier.rstrip("_") + "/" + se.formatting.make_url_safe(title)

    sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", title)

    if translators:
        title_string = title_string + ". Translated by " + _generate_contributor_string(translators, False)

        identifier = identifier + "/" + "_".join(
            se.formatting.make_url_safe(translator["name"]) for translator in translators
        )
        identifier = identifier.rstrip("_")

    if illustrators:
        title_string = title_string + ". Illustrated by " + _generate_contributor_string(illustrators, False)

        identifier = identifier + "/" + "_".join(
            se.formatting.make_url_safe(illustrator["name"]) for illustrator in illustrators
        )
        identifier = identifier.rstrip("_")

    repo_name = identifier.replace("/", "_")

    repo_path = Path(repo_name).resolve()

    if repo_path.is_dir():
        raise se.InvalidInputException(
            f"Directory already exists: [path][link=file://{repo_path}]{repo_path}[/][/]."
        )

    # Get data on authors, then translators, then illustrators
    if not args.offline:
        for contributor in authors + translators + illustrators:
            if contributor["name"].lower() != "anonymous":
                contributor["wiki_url"], contributor["nacoaf_url"] = _get_wikipedia_url(contributor["name"], True)

    # Download PG HTML and do some fixups
    if args.pg_url:
        if args.offline:
            raise se.RemoteCommandErrorException(
                "Cannot download Project Gutenberg ebook when offline option is enabled."
            )

        args.pg_url = args.pg_url.replace("http://", "https://")

        # Get the ebook metadata
        try:
            response = requests.get(args.pg_url)
            pg_metadata_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook metadata page. Exception: {ex}"
            ) from ex

        parser = etree.HTMLParser()
        dom = etree.parse(StringIO(pg_metadata_html), parser)

        # Get the ebook HTML URL from the metadata; if several links match, the last one wins
        pg_ebook_url = None
        for node in dom.xpath("/html/body//a[contains(@type, 'text/html')]"):
            pg_ebook_url = regex.sub(r"^//", "https://", node.get("href"))
            pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/", pg_ebook_url)

        if not pg_ebook_url:
            raise se.RemoteCommandErrorException(
                "Could download ebook metadata, but couldn’t find URL for the ebook HTML."
            )

        # Get the ebook LCSH categories
        pg_subjects = []
        for node in dom.xpath("/html/body//td[contains(@property, 'dcterms:subject')]"):
            if node.get("datatype") == "dcterms:LCSH":
                for subject_link in node.xpath("./a"):
                    pg_subjects.append(subject_link.text.strip())

        # Get the PG publication date
        pg_publication_year = None
        for node in dom.xpath("//td[@itemprop='datePublished']"):
            pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", node.text)

        # Get the actual ebook URL
        try:
            response = requests.get(pg_ebook_url)
            pg_ebook_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook HTML. Exception: {ex}"
            ) from ex

        try:
            fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
            pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
        except Exception as ex:
            raise se.InvalidEncodingException(
                f"Couldn’t determine text encoding of Project Gutenberg HTML file. Exception: {ex}"
            ) from ex

        # Try to guess the ebook language
        pg_language = "en-US"
        if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
            pg_language = "en-GB"

    # Create necessary directories
    (repo_path / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "css").mkdir(parents=True)
    (repo_path / "src" / "epub" / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "text").mkdir(parents=True)
    (repo_path / "src" / "META-INF").mkdir(parents=True)

    is_pg_html_parsed = True

    # Write PG data if we have it
    if args.pg_url and pg_ebook_html:
        try:
            dom = etree.parse(StringIO(regex.sub(r"encoding=\".+?\"", "", pg_ebook_html)), parser)
            namespaces = {"re": "http://exslt.org/regular-expressions"}

            # Try to get the PG producers from the "Produced by" credit line
            for node in dom.xpath("//*[re:test(text(), '\\*\\*\\*\\s*Produced by.+')]", namespaces=namespaces):
                # Drop the element's own opening and closing tags
                producers_text = regex.sub(r"^<[^>]+?>", "", etree.tostring(node, encoding=str, with_tail=False))
                producers_text = regex.sub(r"<[^>]+?>$", "", producers_text)

                producers_text = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", producers_text, flags=regex.DOTALL)
                producers_text = regex.sub(r"\(.+?\)", "", producers_text, flags=regex.DOTALL)
                producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net", "", producers_text, flags=regex.DOTALL)
                producers_text = regex.sub(r"[\r\n]+", " ", producers_text, flags=regex.DOTALL)
                producers_text = regex.sub(r",? and ", ", and ", producers_text)
                producers_text = producers_text.replace(" and the Online", " and The Online")
                producers_text = producers_text.replace(", and ", ", ").strip()

                pg_producers = [producer.strip() for producer in regex.split(',|;', producers_text)]

            # Try to strip out the PG header
            for node in dom.xpath("//*[re:test(text(), '\\*\\*\\*\\s*START OF THIS')]", namespaces=namespaces):
                for sibling_node in node.xpath("./preceding-sibling::*"):
                    easy_node = se.easy_xml.EasyXmlElement(sibling_node)
                    easy_node.remove()

                easy_node = se.easy_xml.EasyXmlElement(node)
                easy_node.remove()

            # Try to strip out the PG license footer
            for node in dom.xpath("//*[re:test(text(), 'End of (the )?Project Gutenberg')]", namespaces=namespaces):
                for sibling_node in node.xpath("./following-sibling::*"):
                    easy_node = se.easy_xml.EasyXmlElement(sibling_node)
                    easy_node.remove()

                easy_node = se.easy_xml.EasyXmlElement(node)
                easy_node.remove()

            # lxml will put the xml declaration in a weird place, remove it first
            output = regex.sub(r"<\?xml.+?\?>", "", etree.tostring(dom, encoding="unicode"))

            # Now re-add it
            output = """<?xml version="1.0" encoding="utf-8"?>\n""" + output

            # lxml can also output duplicate default namespace declarations so keep the first one only
            output = regex.sub(r"(xmlns=\".+?\")(\sxmlns=\".+?\")+", r"\1", output)

            with open(repo_path / "src" / "epub" / "text" / "body.xhtml", "w", encoding="utf-8") as file:
                file.write(output)

        except OSError as ex:
            raise se.InvalidFileException(f"Couldn’t write to ebook directory. Exception: {ex}") from ex
        except Exception:
            # Save this error for later, because it's still useful to complete the create-draft process
            # even if we've failed to parse PG's HTML source.
            is_pg_html_parsed = False
            se.quiet_remove(repo_path / "src" / "epub" / "text" / "body.xhtml")

    # Copy over templates
    _copy_template_file("gitignore", repo_path / ".gitignore")
    _copy_template_file("LICENSE.md", repo_path)
    _copy_template_file("container.xml", repo_path / "src" / "META-INF")
    _copy_template_file("mimetype", repo_path / "src")
    _copy_template_file("content.opf", repo_path / "src" / "epub")
    _copy_template_file("onix.xml", repo_path / "src" / "epub")
    _copy_template_file("toc.xhtml", repo_path / "src" / "epub")
    _copy_template_file("core.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("local.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("se.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("logo.svg", repo_path / "src" / "epub" / "images")
    _copy_template_file("colophon.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("imprint.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("uncopyright.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.svg", repo_path / "images")
    _copy_template_file("cover.jpg", repo_path / "images" / "cover.jpg")
    _copy_template_file("cover.svg", repo_path / "images" / "cover.svg")

    # Try to find Wikipedia links if possible
    ebook_wiki_url = None

    # There's a "Short Fiction" Wikipedia article, so make an exception for that case
    if not args.offline and title != "Short Fiction":
        ebook_wiki_url, _ = _get_wikipedia_url(title, False)

    # Pre-fill a few templates
    _replace_in_file(repo_path / "src" / "epub" / "text" / "titlepage.xhtml", "TITLE_STRING", title_string)
    _replace_in_file(repo_path / "images" / "titlepage.svg", "TITLE_STRING", title_string)
    _replace_in_file(repo_path / "images" / "cover.svg", "TITLE_STRING", title_string)

    # Create the titlepage SVG
    contributors = {}
    if args.translator:
        contributors["translated by"] = _generate_contributor_string(translators, False)

    if args.illustrator:
        contributors["illustrated by"] = _generate_contributor_string(illustrators, False)

    author_names = [author["name"] for author in authors]

    with open(repo_path / "images" / "titlepage.svg", "w", encoding="utf-8") as file:
        file.write(_generate_titlepage_svg(title, author_names, contributors, title_string))

    # Create the cover SVG
    with open(repo_path / "images" / "cover.svg", "w", encoding="utf-8") as file:
        file.write(_generate_cover_svg(title, author_names, title_string))

    # Build the cover/titlepage for distribution
    epub = SeEpub(repo_path)
    epub.generate_cover_svg()
    epub.generate_titlepage_svg()

    if args.pg_url:
        _replace_in_file(repo_path / "src" / "epub" / "text" / "imprint.xhtml", "PG_URL", args.pg_url)

    # Fill out the colophon
    with open(repo_path / "src" / "epub" / "text" / "colophon.xhtml", "r+", encoding="utf-8") as file:
        colophon_xhtml = file.read()

        colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
        colophon_xhtml = colophon_xhtml.replace("TITLE", title)

        contributor_string = _generate_contributor_string(authors, True)

        if contributor_string == "":
            # No author string to show, so remove the " by" lead-in together with the placeholder
            colophon_xhtml = colophon_xhtml.replace(" by<br/>\n\t\t\t<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>", contributor_string)
        else:
            colophon_xhtml = colophon_xhtml.replace("<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>", contributor_string)

        if translators:
            translator_block = f"It was translated from ORIGINAL_LANGUAGE in TRANSLATION_YEAR by<br/>\n\t\t\t{_generate_contributor_string(translators, True)}.</p>"
            colophon_xhtml = colophon_xhtml.replace(
                "</p>\n\t\t\t<p>This ebook was produced for the<br/>",
                f"<br/>\n\t\t\t{translator_block}\n\t\t\t<p>This ebook was produced for the<br/>"
            )

        if args.pg_url:
            colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

            if pg_publication_year:
                colophon_xhtml = colophon_xhtml.replace("PG_YEAR", pg_publication_year)

            if pg_producers:
                producers_xhtml = ""
                for i, producer in enumerate(pg_producers):
                    if "Distributed Proofread" in producer:
                        producers_xhtml += "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
                    elif "anonymous" in producer.lower():
                        producers_xhtml += "<b class=\"name\">An Anonymous Volunteer</b>"
                    else:
                        producers_xhtml += f"<b class=\"name\">{_add_name_abbr(producer).strip('.')}</b>"

                    # Comma after every producer but the last ...
                    if i < len(pg_producers) - 1:
                        producers_xhtml += ", "

                    # ... and an "and" before the last one
                    if i == len(pg_producers) - 2:
                        producers_xhtml += "and "

                producers_xhtml += "<br/>"

                colophon_xhtml = colophon_xhtml.replace(
                    "<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>",
                    producers_xhtml
                )

        file.seek(0)
        file.write(colophon_xhtml)
        file.truncate()

    # Fill out the metadata file
    with open(repo_path / "src" / "epub" / "content.opf", "r+", encoding="utf-8") as file:
        metadata_xml = file.read()

        metadata_xml = metadata_xml.replace("SE_IDENTIFIER", identifier)
        # Replace the sort title before the title itself
        metadata_xml = metadata_xml.replace(">TITLE_SORT<", f">{sorted_title}<")
        metadata_xml = metadata_xml.replace(">TITLE<", f">{title}<")
        metadata_xml = metadata_xml.replace("VCS_IDENTIFIER", str(repo_name))

        if pg_producers:
            producers_xhtml = ""
            for i, producer in enumerate(pg_producers, start=1):
                if "Distributed Proofread" in producer:
                    producers_xhtml += f"\t\t<dc:contributor id=\"transcriber-{i}\">The Online Distributed Proofreading Team</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{i}\">https://pgdp.net</meta>\n"
                elif "anonymous" in producer.lower():
                    producers_xhtml += f"\t\t<dc:contributor id=\"transcriber-{i}\">An Anonymous Volunteer</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Anonymous Volunteer, An</meta>\n"
                else:
                    producers_xhtml += f"\t\t<dc:contributor id=\"transcriber-{i}\">{producer.strip('.')}</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">TRANSCRIBER_SORT</meta>\n"

                producers_xhtml += f"\t\t<meta property=\"role\" refines=\"#transcriber-{i}\" scheme=\"marc:relators\">trc</meta>\n"

            metadata_xml = regex.sub(
                r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>",
                "\t\t" + producers_xhtml.strip(),
                metadata_xml,
                flags=regex.DOTALL
            )

        if ebook_wiki_url:
            metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<", f">{ebook_wiki_url}<")

        # The contributor XML generator emits <dc:contributor>; authors are <dc:creator>
        authors_xml = _generate_metadata_contributor_xml(authors, "author")
        authors_xml = authors_xml.replace("dc:contributor", "dc:creator")
        metadata_xml = regex.sub(
            r"<dc:creator id=\"author\">AUTHOR</dc:creator>.+?scheme=\"marc:relators\">aut</meta>",
            authors_xml,
            metadata_xml,
            flags=regex.DOTALL
        )

        if translators:
            translators_xml = _generate_metadata_contributor_xml(translators, "translator")
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>",
                translators_xml,
                metadata_xml,
                flags=regex.DOTALL
            )
        else:
            # No translators: remove the placeholder block and its trailing indentation
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>\n\t\t",
                "",
                metadata_xml,
                flags=regex.DOTALL
            )

        if illustrators:
            illustrators_xml = _generate_metadata_contributor_xml(illustrators, "illustrator")
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>",
                illustrators_xml,
                metadata_xml,
                flags=regex.DOTALL
            )
        else:
            # No illustrators: remove the placeholder block and its trailing indentation
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>\n\t\t",
                "",
                metadata_xml,
                flags=regex.DOTALL
            )

        if args.pg_url:
            if pg_subjects:
                subject_xhtml = ""

                # All <dc:subject> elements come first, followed by their <meta> refinements
                for i, subject in enumerate(pg_subjects, start=1):
                    subject_xhtml += f"\t\t<dc:subject id=\"subject-{i}\">{subject}</dc:subject>\n"

                for i, subject in enumerate(pg_subjects, start=1):
                    subject_xhtml += f"\t\t<meta property=\"authority\" refines=\"#subject-{i}\">LCSH</meta>\n"

                    # Now, get the LCSH ID by querying LCSH directly.
                    # Only the request itself is guarded, so the "couldn't connect" error is accurate.
                    try:
                        response = requests.get(
                            f"https://id.loc.gov/search/?q=%22{urllib.parse.quote(subject)}%22"
                        )
                    except Exception as ex:
                        raise se.RemoteCommandErrorException(
                            f"Couldn’t connect to [url][link=https://id.loc.gov]https://id.loc.gov[/][/]. Exception: {ex}"
                        ) from ex

                    result = regex.search(
                        fr"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{regex.escape(subject.replace(' -- ', '--'))}</a>",
                        response.text
                    )

                    # No exact match on the results page: record the ID as unknown
                    loc_id = result.group(1) if result else "Unknown"

                    subject_xhtml += f"\t\t<meta property=\"term\" refines=\"#subject-{i}\">{loc_id}</meta>\n"

                metadata_xml = regex.sub(
                    r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>",
                    "\t\t" + subject_xhtml.strip(),
                    metadata_xml
                )

            metadata_xml = metadata_xml.replace("<dc:language>LANG</dc:language>", f"<dc:language>{pg_language}</dc:language>")
            metadata_xml = metadata_xml.replace("<dc:source>PG_URL</dc:source>", f"<dc:source>{args.pg_url}</dc:source>")

        file.seek(0)
        file.write(metadata_xml)
        file.truncate()

    # Set up local git repo
    repo = git.Repo.init(repo_path)

    if args.email:
        with repo.config_writer() as config:
            config.set_value("user", "email", args.email)

    # Report the deferred parse failure now that the rest of the draft exists
    if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
        raise se.InvalidXhtmlException(
            "Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook."
        )
def _create_draft(args: Namespace):
    """
    Implementation for `se create-draft`

    Builds a new ebook draft directory, named after the author, title,
    translator and illustrator, relative to the current working directory.
    The steps, in order:

    1. Derive the title string, identifier and repository name.
    2. If `args.pg_url` is set, download the Project Gutenberg metadata page
       and ebook HTML, and extract subjects, publication year, a language
       guess and the list of producers.
    3. Create the directory skeleton and copy the template files.
    4. Unless `args.offline`, look up Wikipedia/NACOAF URLs for the author,
       the title and the translator.
    5. Fill in the titlepage, cover, imprint, colophon and `content.opf`.
    6. Initialise a git repository in the new directory.

    Args:
        args: Parsed command-line namespace. The attributes read are `title`,
            `author`, `translator`, `illustrator`, `offline`, `pg_url` and
            `email`. `author`, `translator` and `illustrator` are each used
            as a single string; `args.pg_url` is rewritten in place to use
            `https://`.

    Raises:
        se.InvalidInputException: The target directory already exists.
        se.RemoteCommandErrorException: `pg_url` was combined with `offline`,
            a download failed, the ebook HTML URL could not be found, or
            id.loc.gov could not be reached.
        se.InvalidEncodingException: The downloaded HTML could not be fixed up.
        se.InvalidFileException: `body.xhtml` could not be written.
        se.InvalidXhtmlException: Raised at the very end, after the draft has
            otherwise been completed, if the Project Gutenberg HTML could not
            be parsed.
    """

    # Put together some variables for later use
    identifier = se.formatting.make_url_safe(args.author) + "/" + se.formatting.make_url_safe(args.title)
    title_string = args.title.replace("'", "’") + ", by " + args.author.replace("'", "’")
    sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", args.title)
    pg_producers = []

    if args.translator:
        identifier = identifier + "/" + se.formatting.make_url_safe(args.translator)
        title_string = title_string + ". Translated by " + args.translator

    if args.illustrator:
        identifier = identifier + "/" + se.formatting.make_url_safe(args.illustrator)
        title_string = title_string + ". Illustrated by " + args.illustrator

    repo_name = identifier.replace("/", "_")

    repo_path = Path(repo_name).resolve()

    if repo_path.is_dir():
        raise se.InvalidInputException(
            f"Directory already exists: [path][link=file://{repo_path}]{repo_path}[/][/]."
        )

    # Download PG HTML and do some fixups
    if args.pg_url:
        if args.offline:
            raise se.RemoteCommandErrorException(
                "Cannot download Project Gutenberg ebook when offline option is enabled."
            )

        args.pg_url = args.pg_url.replace("http://", "https://")

        # Get the ebook metadata
        try:
            response = requests.get(args.pg_url)
            pg_metadata_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook metadata page. Exception: {ex}"
            ) from ex

        soup = BeautifulSoup(pg_metadata_html, "lxml")

        # Get the ebook HTML URL from the metadata; if several links match, the last one wins
        pg_ebook_url = None
        for element in soup.select("a[type^=\"text/html\"]"):
            pg_ebook_url = regex.sub(r"^//", "https://", element["href"])
            pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/", pg_ebook_url)

        if not pg_ebook_url:
            raise se.RemoteCommandErrorException(
                "Could download ebook metadata, but couldn’t find URL for the ebook HTML."
            )

        # Get the ebook LCSH categories
        pg_subjects = []
        for element in soup.select("td[property=\"dcterms:subject\"]"):
            # `.get()` so a cell without a datatype attribute is skipped rather than raising KeyError
            if element.get("datatype") == "dcterms:LCSH":
                # `find_all()` visits every link and yields nothing when there is none;
                # `find()` returns only the first link, or None
                for subject_link in element.find_all("a"):
                    pg_subjects.append(subject_link.get_text().strip())

        # Get the PG publication date
        pg_publication_year = None
        for element in soup.select("td[itemprop=\"datePublished\"]"):
            pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", element.text)

        # Get the actual ebook URL
        try:
            response = requests.get(pg_ebook_url)
            pg_ebook_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook HTML. Exception: {ex}"
            ) from ex

        try:
            fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
            pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
        except Exception as ex:
            raise se.InvalidEncodingException(
                f"Couldn’t determine text encoding of Project Gutenberg HTML file. Exception: {ex}"
            ) from ex

        # Try to guess the ebook language
        pg_language = "en-US"
        if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
            pg_language = "en-GB"

    # Create necessary directories
    (repo_path / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "css").mkdir(parents=True)
    (repo_path / "src" / "epub" / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "text").mkdir(parents=True)
    (repo_path / "src" / "META-INF").mkdir(parents=True)

    is_pg_html_parsed = True

    # Write PG data if we have it
    if args.pg_url and pg_ebook_html:
        try:
            soup = BeautifulSoup(pg_ebook_html, "html.parser")

            # Try to get the PG producers. We only try this if there's a <pre> block with the header info (which is not always the case)
            for element in soup(text=regex.compile(r"\*\*\*\s*Produced by.+$", flags=regex.DOTALL)):
                if element.parent.name == "pre":
                    producers_text = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", element, flags=regex.DOTALL)
                    producers_text = regex.sub(r"\(.+?\)", "", producers_text, flags=regex.DOTALL)
                    producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net", "", producers_text, flags=regex.DOTALL)
                    producers_text = regex.sub(r"[\r\n]+", " ", producers_text, flags=regex.DOTALL)
                    producers_text = regex.sub(r",? and ", ", and ", producers_text)
                    producers_text = producers_text.replace(" and the Online", " and The Online")
                    producers_text = producers_text.replace(", and ", ", ").strip()

                    pg_producers = producers_text.split(", ")

            # Try to strip out the PG header
            for element in soup(text=regex.compile(r"\*\*\*\s*START OF THIS")):
                for sibling in element.parent.find_previous_siblings():
                    sibling.decompose()

                element.parent.decompose()

            # Try to strip out the PG license footer
            for element in soup(text=regex.compile(r"End of (the )?Project Gutenberg")):
                for sibling in element.parent.find_next_siblings():
                    sibling.decompose()

                element.parent.decompose()

            with open(repo_path / "src" / "epub" / "text" / "body.xhtml", "w", encoding="utf-8") as file:
                file.write(str(soup))

        except OSError as ex:
            raise se.InvalidFileException(f"Couldn’t write to ebook directory. Exception: {ex}") from ex
        except Exception:
            # `Exception`, not a bare `except`, so KeyboardInterrupt/SystemExit still propagate.
            # Save this error for later, because it's still useful to complete the create-draft process
            # even if we've failed to parse PG's HTML source.
            is_pg_html_parsed = False
            se.quiet_remove(repo_path / "src" / "epub" / "text" / "body.xhtml")

    # Copy over templates
    _copy_template_file("gitignore", repo_path / ".gitignore")
    _copy_template_file("LICENSE.md", repo_path)
    _copy_template_file("container.xml", repo_path / "src" / "META-INF")
    _copy_template_file("mimetype", repo_path / "src")
    _copy_template_file("content.opf", repo_path / "src" / "epub")
    _copy_template_file("onix.xml", repo_path / "src" / "epub")
    _copy_template_file("toc.xhtml", repo_path / "src" / "epub")
    _copy_template_file("core.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("local.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("logo.svg", repo_path / "src" / "epub" / "images")
    _copy_template_file("colophon.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("imprint.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("uncopyright.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.svg", repo_path / "images")
    _copy_template_file("cover.jpg", repo_path / "images" / "cover.jpg")
    _copy_template_file("cover.svg", repo_path / "images" / "cover.svg")

    # Try to find Wikipedia links if possible.
    # Everything starts out as None so each name is bound whichever lookups run.
    author_wiki_url = None
    author_nacoaf_url = None
    ebook_wiki_url = None
    translator_wiki_url = None
    translator_nacoaf_url = None

    if not args.offline:
        author_wiki_url, author_nacoaf_url = _get_wikipedia_url(args.author, True)

        # There's a "Short Fiction" Wikipedia article, so make an exception for that case
        if args.title != "Short Fiction":
            ebook_wiki_url, _ = _get_wikipedia_url(args.title, False)

        if args.translator:
            translator_wiki_url, translator_nacoaf_url = _get_wikipedia_url(args.translator, True)

    # Pre-fill a few templates
    _replace_in_file(repo_path / "src" / "epub" / "text" / "titlepage.xhtml", "TITLE_STRING", title_string)
    _replace_in_file(repo_path / "images" / "titlepage.svg", "TITLE_STRING", title_string)
    _replace_in_file(repo_path / "images" / "cover.svg", "TITLE_STRING", title_string)

    # Create the titlepage SVG
    contributors = {}
    if args.translator:
        contributors["translated by"] = args.translator

    if args.illustrator:
        contributors["illustrated by"] = args.illustrator

    with open(repo_path / "images" / "titlepage.svg", "w", encoding="utf-8") as file:
        file.write(_generate_titlepage_svg(args.title, args.author, contributors, title_string))

    # Create the cover SVG
    with open(repo_path / "images" / "cover.svg", "w", encoding="utf-8") as file:
        file.write(_generate_cover_svg(args.title, args.author, title_string))

    # Build the cover/titlepage for distribution
    epub = SeEpub(repo_path)
    epub.generate_cover_svg()
    epub.generate_titlepage_svg()

    if args.pg_url:
        _replace_in_file(repo_path / "src" / "epub" / "text" / "imprint.xhtml", "PG_URL", args.pg_url)

    # Fill out the colophon
    with open(repo_path / "src" / "epub" / "text" / "colophon.xhtml", "r+", encoding="utf-8") as file:
        colophon_xhtml = file.read()

        colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
        colophon_xhtml = colophon_xhtml.replace(">AUTHOR<", f">{args.author}<")
        colophon_xhtml = colophon_xhtml.replace("TITLE", args.title)

        if author_wiki_url:
            colophon_xhtml = colophon_xhtml.replace("AUTHOR_WIKI_URL", author_wiki_url)

        if args.pg_url:
            colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

            if pg_publication_year:
                colophon_xhtml = colophon_xhtml.replace("PG_YEAR", pg_publication_year)

            if pg_producers:
                producers_xhtml = ""
                for i, producer in enumerate(pg_producers):
                    if "Distributed Proofread" in producer:
                        producers_xhtml += "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
                    elif "anonymous" in producer.lower():
                        producers_xhtml += "<b class=\"name\">An Anonymous Volunteer</b>"
                    else:
                        producers_xhtml += f"<b class=\"name\">{producer.strip('.')}</b>"

                    # Comma after every producer but the last ...
                    if i < len(pg_producers) - 1:
                        producers_xhtml += ", "

                    # ... and an "and" before the last one
                    if i == len(pg_producers) - 2:
                        producers_xhtml += "and "

                producers_xhtml += "<br/>"

                colophon_xhtml = colophon_xhtml.replace(
                    "<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>",
                    producers_xhtml
                )

        file.seek(0)
        file.write(colophon_xhtml)
        file.truncate()

    # Fill out the metadata file
    with open(repo_path / "src" / "epub" / "content.opf", "r+", encoding="utf-8") as file:
        metadata_xml = file.read()

        metadata_xml = metadata_xml.replace("SE_IDENTIFIER", identifier)
        metadata_xml = metadata_xml.replace(">AUTHOR<", f">{args.author}<")
        # Replace the sort title before the title itself
        metadata_xml = metadata_xml.replace(">TITLE_SORT<", f">{sorted_title}<")
        metadata_xml = metadata_xml.replace(">TITLE<", f">{args.title}<")
        metadata_xml = metadata_xml.replace("VCS_IDENTIFIER", str(repo_name))

        if pg_producers:
            producers_xhtml = ""
            for i, producer in enumerate(pg_producers, start=1):
                if "Distributed Proofread" in producer:
                    producers_xhtml += f"\t\t<dc:contributor id=\"transcriber-{i}\">The Online Distributed Proofreading Team</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{i}\">https://pgdp.net</meta>\n"
                elif "anonymous" in producer.lower():
                    producers_xhtml += f"\t\t<dc:contributor id=\"transcriber-{i}\">An Anonymous Volunteer</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Anonymous Volunteer, An</meta>\n"
                else:
                    producers_xhtml += f"\t\t<dc:contributor id=\"transcriber-{i}\">{producer.strip('.')}</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">TRANSCRIBER_SORT</meta>\n"

                producers_xhtml += f"\t\t<meta property=\"role\" refines=\"#transcriber-{i}\" scheme=\"marc:relators\">trc</meta>\n"

            metadata_xml = regex.sub(
                r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>",
                "\t\t" + producers_xhtml.strip(),
                metadata_xml,
                flags=regex.DOTALL
            )

        if author_wiki_url:
            metadata_xml = metadata_xml.replace(">AUTHOR_WIKI_URL<", f">{author_wiki_url}<")

        if author_nacoaf_url:
            metadata_xml = metadata_xml.replace(">AUTHOR_NACOAF_URL<", f">{author_nacoaf_url}<")

        if ebook_wiki_url:
            metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<", f">{ebook_wiki_url}<")

        if args.translator:
            metadata_xml = metadata_xml.replace(">TRANSLATOR<", f">{args.translator}<")

            if translator_wiki_url:
                metadata_xml = metadata_xml.replace(">TRANSLATOR_WIKI_URL<", f">{translator_wiki_url}<")

            if translator_nacoaf_url:
                metadata_xml = metadata_xml.replace(">TRANSLATOR_NACOAF_URL<", f">{translator_nacoaf_url}<")
        else:
            # No translator: cut everything from the translator placeholder up to the artist entry
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?<dc:contributor id=\"artist\">",
                "<dc:contributor id=\"artist\">",
                metadata_xml,
                flags=regex.DOTALL
            )

        if args.pg_url:
            if pg_subjects:
                subject_xhtml = ""

                # All <dc:subject> elements come first, followed by their <meta> refinements
                for i, subject in enumerate(pg_subjects, start=1):
                    subject_xhtml += f"\t\t<dc:subject id=\"subject-{i}\">{subject}</dc:subject>\n"

                for i, subject in enumerate(pg_subjects, start=1):
                    subject_xhtml += f"\t\t<meta property=\"authority\" refines=\"#subject-{i}\">LCSH</meta>\n"

                    # Now, get the LCSH ID by querying LCSH directly.
                    # Only the request itself is guarded, so the "couldn't connect" error is accurate.
                    try:
                        response = requests.get(
                            f"https://id.loc.gov/search/?q=%22{urllib.parse.quote(subject)}%22"
                        )
                    except Exception as ex:
                        raise se.RemoteCommandErrorException(
                            f"Couldn’t connect to [url][link=https://id.loc.gov]https://id.loc.gov[/][/]. Exception: {ex}"
                        ) from ex

                    result = regex.search(
                        fr"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{regex.escape(subject.replace(' -- ', '--'))}</a>",
                        response.text
                    )

                    # No exact match on the results page: record the ID as unknown
                    loc_id = result.group(1) if result else "Unknown"

                    subject_xhtml += f"\t\t<meta property=\"term\" refines=\"#subject-{i}\">{loc_id}</meta>\n"

                metadata_xml = regex.sub(
                    r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>",
                    "\t\t" + subject_xhtml.strip(),
                    metadata_xml
                )

            metadata_xml = metadata_xml.replace("<dc:language>LANG</dc:language>", f"<dc:language>{pg_language}</dc:language>")
            metadata_xml = metadata_xml.replace("<dc:source>PG_URL</dc:source>", f"<dc:source>{args.pg_url}</dc:source>")

        file.seek(0)
        file.write(metadata_xml)
        file.truncate()

    # Set up local git repo
    repo = git.Repo.init(repo_path)

    if args.email:
        with repo.config_writer() as config:
            config.set_value("user", "email", args.email)

    # Report the deferred parse failure now that the rest of the draft exists
    if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
        raise se.InvalidXhtmlException(
            "Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook."
        )