Example #1
def split_file() -> int:
	"""
	Entry point for `se split-file`
	"""

	parser = argparse.ArgumentParser(description="Split an XHTML file into many files at all instances of <!--se:split-->, and include a header template for each file.")
	parser.add_argument("-f", "--filename-format", metavar="STRING", type=str, default="chapter-%n.xhtml", help="a format string for the output files; `%%n` is replaced with the current chapter number; defaults to `chapter-%%n.xhtml`")
	parser.add_argument("-s", "--start-at", metavar="INTEGER", type=se.is_positive_integer, default="1", help="start numbering chapters at this number, instead of at 1")
	parser.add_argument("-t", "--template-file", metavar="FILE", type=str, default="", help="a file containing an XHTML template to use for each chapter; the string `NUMBER` is replaced by the chapter number, and the string `TEXT` is replaced by the chapter body")
	parser.add_argument("filename", metavar="FILE", help="an HTML/XHTML file")
	args = parser.parse_args()

	try:
		filename = Path(args.filename).resolve()
		with open(filename, "r", encoding="utf-8") as file:
			xhtml = se.strip_bom(file.read())
	except FileNotFoundError:
		se.print_error(f"Couldn’t open file: [path][link=file://{filename}]{filename}[/][/].")
		return se.InvalidFileException.code

	if args.template_file:
		try:
			filename = Path(args.template_file).resolve()
			with open(filename, "r", encoding="utf-8") as file:
				template_xhtml = file.read()
		except FileNotFoundError:
			se.print_error(f"Couldn’t open file: [path][link=file://{filename}]{filename}[/][/].")
			return se.InvalidFileException.code
	else:
		with importlib_resources.open_text("se.data.templates", "chapter-template.xhtml", encoding="utf-8") as file:
			template_xhtml = file.read()

	chapter_xhtml = ""

	# Remove leading split tags
	xhtml = regex.sub(r"^\s*<!--se:split-->", "", xhtml)

	for line in xhtml.splitlines():
		if "<!--se:split-->" in line:
			prefix, suffix = line.split("<!--se:split-->")
			chapter_xhtml = chapter_xhtml + prefix
			_split_file_output_file(args.filename_format, args.start_at, template_xhtml, chapter_xhtml)

			args.start_at = args.start_at + 1
			chapter_xhtml = suffix

		else:
			chapter_xhtml = f"{chapter_xhtml}\n{line}"

	if chapter_xhtml and not chapter_xhtml.isspace():
		_split_file_output_file(args.filename_format, args.start_at, template_xhtml, chapter_xhtml)

	return 0
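
This example delegates the actual writing to a `_split_file_output_file()` helper that isn't shown here. Below is a minimal sketch of what such a helper could look like, assuming only the placeholder behaviour described in the `--filename-format` and `--template-file` help strings (`%n` in the output filename, `NUMBER` and `TEXT` in the template); the real helper in the `se` toolset may differ.

def _split_file_output_file(filename_format: str, chapter_number: int, template_xhtml: str, chapter_xhtml: str) -> None:
	"""
	Write one chapter to disk, wrapped in the chapter template.
	"""

	# Fill in the template placeholders described in the --template-file help text
	output_xhtml = template_xhtml.replace("NUMBER", str(chapter_number)).replace("TEXT", chapter_xhtml)

	# `%n` in the filename format is replaced with the chapter number
	output_filename = filename_format.replace("%n", str(chapter_number))

	with open(output_filename, "w", encoding="utf-8") as file:
		file.write(output_xhtml)
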
Example #2
def split_file() -> int:
	"""
	Entry point for `se split-file`
	"""

	parser = argparse.ArgumentParser(description="Split an XHTML file into many files at all instances of <!--se:split-->, and include a header template for each file.")
	parser.add_argument("filename", metavar="FILE", help="an HTML/XHTML file")
	args = parser.parse_args()

	with open(args.filename, "r", encoding="utf-8") as file:
		xhtml = se.strip_bom(file.read())

	with open(resource_filename("se", str(Path("data") / "templates" / "header.xhtml")), "r", encoding="utf-8") as file:
		header_xhtml = file.read()

	chapter_number = 1
	chapter_xhtml = ""

	# Remove leading split tags
	xhtml = regex.sub(r"^\s*<!--se:split-->", "", xhtml)

	for line in xhtml.splitlines():
		if "<!--se:split-->" in line:
			prefix, suffix = line.split("<!--se:split-->")
			chapter_xhtml = chapter_xhtml + prefix
			_split_file_output_file(chapter_number, header_xhtml, chapter_xhtml)

			chapter_number = chapter_number + 1
			chapter_xhtml = suffix

		else:
			chapter_xhtml = chapter_xhtml + "\n" + line

	if chapter_xhtml and not chapter_xhtml.isspace():
		_split_file_output_file(chapter_number, header_xhtml, chapter_xhtml)

	return 0
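
The core splitting logic is the same in both examples: when a line contains `<!--se:split-->`, the text before the marker closes the current chapter and the text after it starts the next one. A small standalone illustration of that behaviour, with made-up content:

xhtml = "<p>End of chapter one.</p><!--se:split--><p>Start of chapter two.</p>"

chapters = []
chapter_xhtml = ""

for line in xhtml.splitlines():
	if "<!--se:split-->" in line:
		prefix, suffix = line.split("<!--se:split-->")
		chapters.append(chapter_xhtml + prefix)
		chapter_xhtml = suffix
	else:
		chapter_xhtml = f"{chapter_xhtml}\n{line}"

if chapter_xhtml and not chapter_xhtml.isspace():
	chapters.append(chapter_xhtml)

print(chapters)
# ['<p>End of chapter one.</p>', '<p>Start of chapter two.</p>']
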
Example #3
def _create_draft(args: Namespace):
    """
	Implementation for `se create-draft`
	"""

    # Put together some variables for later use
    authors = []
    translators = []
    illustrators = []
    pg_producers = []
    title = args.title.replace("'", "’")

    for author in args.author:
        authors.append({
            "name": author.replace("'", "’"),
            "wiki_url": None,
            "nacoaf_url": None
        })

    if args.translator:
        for translator in args.translator:
            translators.append({
                "name": translator.replace("'", "’"),
                "wiki_url": None,
                "nacoaf_url": None
            })

    if args.illustrator:
        for illustrator in args.illustrator:
            illustrators.append({
                "name": illustrator.replace("'", "’"),
                "wiki_url": None,
                "nacoaf_url": None
            })

    title_string = title
    if authors and authors[0]["name"].lower() != "anonymous":
        title_string += ", by " + _generate_contributor_string(authors, False)

    identifier = ""
    for author in authors:
        identifier += se.formatting.make_url_safe(author["name"]) + "_"

    identifier = identifier.rstrip("_") + "/" + se.formatting.make_url_safe(
        title)

    sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", title)

    if translators:
        title_string = title_string + ". Translated by " + _generate_contributor_string(
            translators, False)

        identifier = identifier + "/"

        for translator in translators:
            identifier += se.formatting.make_url_safe(translator["name"]) + "_"

        identifier = identifier.rstrip("_")

    if illustrators:
        title_string = title_string + ". Illustrated by " + _generate_contributor_string(
            illustrators, False)

        identifier = identifier + "/"

        for illustrator in illustrators:
            identifier += se.formatting.make_url_safe(
                illustrator["name"]) + "_"

        identifier = identifier.rstrip("_")

    repo_name = identifier.replace("/", "_")

    repo_path = Path(repo_name).resolve()

    if repo_path.is_dir():
        raise se.InvalidInputException(
            f"Directory already exists: [path][link=file://{repo_path}]{repo_path}[/][/]."
        )

    # Get data on authors
    for i, author in enumerate(authors):
        if not args.offline and author["name"].lower() != "anonymous":
            author["wiki_url"], author["nacoaf_url"] = _get_wikipedia_url(
                author["name"], True)

    # Get data on translators
    for i, translator in enumerate(translators):
        if not args.offline and translator["name"].lower() != "anonymous":
            translator["wiki_url"], translator[
                "nacoaf_url"] = _get_wikipedia_url(translator["name"], True)

    # Get data on illustrators
    for i, illustrator in enumerate(illustrators):
        if not args.offline and illustrator["name"].lower() != "anonymous":
            illustrator["wiki_url"], illustrator[
                "nacoaf_url"] = _get_wikipedia_url(illustrator["name"], True)

    # Download PG HTML and do some fixups
    if args.pg_url:
        if args.offline:
            raise se.RemoteCommandErrorException(
                "Cannot download Project Gutenberg ebook when offline option is enabled."
            )

        args.pg_url = args.pg_url.replace("http://", "https://")

        # Get the ebook metadata
        try:
            response = requests.get(args.pg_url)
            pg_metadata_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook metadata page. Exception: {ex}"
            )

        parser = etree.HTMLParser()
        dom = etree.parse(StringIO(pg_metadata_html), parser)

        # Get the ebook HTML URL from the metadata
        pg_ebook_url = None
        for node in dom.xpath("/html/body//a[contains(@type, 'text/html')]"):
            pg_ebook_url = regex.sub(r"^//", "https://", node.get("href"))
            pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/",
                                     pg_ebook_url)

        if not pg_ebook_url:
            raise se.RemoteCommandErrorException(
                "Could download ebook metadata, but couldn’t find URL for the ebook HTML."
            )

        # Get the ebook LCSH categories
        pg_subjects = []
        for node in dom.xpath(
                "/html/body//td[contains(@property, 'dcterms:subject')]"):
            if node.get("datatype") == "dcterms:LCSH":
                for subject_link in node.xpath("./a"):
                    pg_subjects.append(subject_link.text.strip())

        # Get the PG publication date
        pg_publication_year = None
        for node in dom.xpath("//td[@itemprop='datePublished']"):
            pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", node.text)

        # Get the actual ebook URL
        try:
            response = requests.get(pg_ebook_url)
            pg_ebook_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook HTML. Exception: {ex}"
            )

        try:
            fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
            pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
        except Exception as ex:
            raise se.InvalidEncodingException(
                f"Couldn’t determine text encoding of Project Gutenberg HTML file. Exception: {ex}"
            )

        # Try to guess the ebook language
        pg_language = "en-US"
        if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
            pg_language = "en-GB"

    # Create necessary directories
    (repo_path / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "css").mkdir(parents=True)
    (repo_path / "src" / "epub" / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "text").mkdir(parents=True)
    (repo_path / "src" / "META-INF").mkdir(parents=True)

    is_pg_html_parsed = True

    # Write PG data if we have it
    if args.pg_url and pg_ebook_html:
        try:
            dom = etree.parse(
                StringIO(regex.sub(r"encoding=\".+?\"", "", pg_ebook_html)),
                parser)
            namespaces = {"re": "http://exslt.org/regular-expressions"}

            for node in dom.xpath(
                    "//*[re:test(text(), '\\*\\*\\*\\s*Produced by.+')]",
                    namespaces=namespaces):
                producers_text = regex.sub(
                    r"^<[^>]+?>", "",
                    etree.tostring(node, encoding=str, with_tail=False))
                producers_text = regex.sub(r"<[^>]+?>$", "", producers_text)

                producers_text = regex.sub(r".+?Produced by (.+?)\s*$",
                                           "\\1",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r"\(.+?\)",
                                           "",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net",
                                           "",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r"[\r\n]+",
                                           " ",
                                           producers_text,
                                           flags=regex.DOTALL)
                producers_text = regex.sub(r",? and ", ", and ",
                                           producers_text)
                producers_text = producers_text.replace(
                    " and the Online", " and The Online")
                producers_text = producers_text.replace(", and ", ", ").strip()

                pg_producers = [
                    producer.strip()
                    for producer in regex.split(',|;', producers_text)
                ]

            # Try to strip out the PG header
            for node in dom.xpath(
                    "//*[re:test(text(), '\\*\\*\\*\\s*START OF THIS')]",
                    namespaces=namespaces):
                for sibling_node in node.xpath("./preceding-sibling::*"):
                    easy_node = se.easy_xml.EasyXmlElement(sibling_node)
                    easy_node.remove()

                easy_node = se.easy_xml.EasyXmlElement(node)
                easy_node.remove()

            # Try to strip out the PG license footer
            for node in dom.xpath(
                    "//*[re:test(text(), 'End of (the )?Project Gutenberg')]",
                    namespaces=namespaces):
                for sibling_node in node.xpath("./following-sibling::*"):
                    easy_node = se.easy_xml.EasyXmlElement(sibling_node)
                    easy_node.remove()

                easy_node = se.easy_xml.EasyXmlElement(node)
                easy_node.remove()

            # lxml will put the xml declaration in a weird place, so remove it first
            output = regex.sub(r"<\?xml.+?\?>", "",
                               etree.tostring(dom, encoding="unicode"))

            # Now re-add it
            output = """<?xml version="1.0" encoding="utf-8"?>\n""" + output

            # lxml can also output duplicate default namespace declarations, so keep only the first one
            output = regex.sub(r"(xmlns=\".+?\")(\sxmlns=\".+?\")+", r"\1",
                               output)

            with open(repo_path / "src" / "epub" / "text" / "body.xhtml",
                      "w",
                      encoding="utf-8") as file:
                file.write(output)

        except OSError as ex:
            raise se.InvalidFileException(
                f"Couldn’t write to ebook directory. Exception: {ex}")
        except Exception as ex:
            # Save this error for later, because it's still useful to complete the create-draft process
            # even if we've failed to parse PG's HTML source.
            is_pg_html_parsed = False
            se.quiet_remove(repo_path / "src" / "epub" / "text" / "body.xhtml")

    # Copy over templates
    _copy_template_file("gitignore", repo_path / ".gitignore")
    _copy_template_file("LICENSE.md", repo_path)
    _copy_template_file("container.xml", repo_path / "src" / "META-INF")
    _copy_template_file("mimetype", repo_path / "src")
    _copy_template_file("content.opf", repo_path / "src" / "epub")
    _copy_template_file("onix.xml", repo_path / "src" / "epub")
    _copy_template_file("toc.xhtml", repo_path / "src" / "epub")
    _copy_template_file("core.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("local.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("se.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("logo.svg", repo_path / "src" / "epub" / "images")
    _copy_template_file("colophon.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("imprint.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("uncopyright.xhtml",
                        repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.svg", repo_path / "images")
    _copy_template_file("cover.jpg", repo_path / "images" / "cover.jpg")
    _copy_template_file("cover.svg", repo_path / "images" / "cover.svg")

    # Try to find Wikipedia links if possible
    ebook_wiki_url = None

    if not args.offline and title != "Short Fiction":
        # There's a "Short Fiction" Wikipedia article, so make an exception for that case
        ebook_wiki_url, _ = _get_wikipedia_url(title, False)

    # Pre-fill a few templates
    _replace_in_file(repo_path / "src" / "epub" / "text" / "titlepage.xhtml",
                     "TITLE_STRING", title_string)
    _replace_in_file(repo_path / "images" / "titlepage.svg", "TITLE_STRING",
                     title_string)
    _replace_in_file(repo_path / "images" / "cover.svg", "TITLE_STRING",
                     title_string)

    # Create the titlepage SVG
    contributors = {}
    if args.translator:
        contributors["translated by"] = _generate_contributor_string(
            translators, False)

    if args.illustrator:
        contributors["illustrated by"] = _generate_contributor_string(
            illustrators, False)

    with open(repo_path / "images" / "titlepage.svg", "w",
              encoding="utf-8") as file:
        file.write(
            _generate_titlepage_svg(title,
                                    [author["name"] for author in authors],
                                    contributors, title_string))

    # Create the cover SVG
    with open(repo_path / "images" / "cover.svg", "w",
              encoding="utf-8") as file:
        file.write(
            _generate_cover_svg(title, [author["name"] for author in authors],
                                title_string))

    # Build the cover/titlepage for distribution
    epub = SeEpub(repo_path)
    epub.generate_cover_svg()
    epub.generate_titlepage_svg()

    if args.pg_url:
        _replace_in_file(repo_path / "src" / "epub" / "text" / "imprint.xhtml",
                         "PG_URL", args.pg_url)

    # Fill out the colophon
    with open(repo_path / "src" / "epub" / "text" / "colophon.xhtml",
              "r+",
              encoding="utf-8") as file:
        colophon_xhtml = file.read()

        colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
        colophon_xhtml = colophon_xhtml.replace("TITLE", title)

        contributor_string = _generate_contributor_string(authors, True)

        if contributor_string == "":
            colophon_xhtml = colophon_xhtml.replace(
                " by<br/>\n\t\t\t<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>",
                contributor_string)
        else:
            colophon_xhtml = colophon_xhtml.replace(
                "<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>", contributor_string)

        if translators:
            translator_block = f"It was translated from ORIGINAL_LANGUAGE in TRANSLATION_YEAR by<br/>\n\t\t\t{_generate_contributor_string(translators, True)}.</p>"
            colophon_xhtml = colophon_xhtml.replace(
                "</p>\n\t\t\t<p>This ebook was produced for the<br/>",
                f"<br/>\n\t\t\t{translator_block}\n\t\t\t<p>This ebook was produced for the<br/>"
            )

        if args.pg_url:
            colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

            if pg_publication_year:
                colophon_xhtml = colophon_xhtml.replace(
                    "PG_YEAR", pg_publication_year)

            if pg_producers:
                producers_xhtml = ""
                for i, producer in enumerate(pg_producers):
                    if "Distributed Proofread" in producer:
                        producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
                    elif "anonymous" in producer.lower():
                        producers_xhtml = producers_xhtml + "<b class=\"name\">An Anonymous Volunteer</b>"
                    else:
                        producers_xhtml = producers_xhtml + f"<b class=\"name\">{_add_name_abbr(producer).strip('.')}</b>"

                    if i < len(pg_producers) - 1:
                        producers_xhtml = producers_xhtml + ", "

                    if i == len(pg_producers) - 2:
                        producers_xhtml = producers_xhtml + "and "

                producers_xhtml = producers_xhtml + "<br/>"

                colophon_xhtml = colophon_xhtml.replace(
                    "<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>",
                    producers_xhtml)

        file.seek(0)
        file.write(colophon_xhtml)
        file.truncate()

    # Fill out the metadata file
    with open(repo_path / "src" / "epub" / "content.opf",
              "r+",
              encoding="utf-8") as file:
        metadata_xml = file.read()

        metadata_xml = metadata_xml.replace("SE_IDENTIFIER", identifier)
        metadata_xml = metadata_xml.replace(">TITLE_SORT<",
                                            f">{sorted_title}<")
        metadata_xml = metadata_xml.replace(">TITLE<", f">{title}<")
        metadata_xml = metadata_xml.replace("VCS_IDENTIFIER", str(repo_name))

        if pg_producers:
            producers_xhtml = ""
            i = 1
            for producer in pg_producers:
                if "Distributed Proofread" in producer:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">The Online Distributed Proofreading Team</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{i}\">https://pgdp.net</meta>\n"
                elif "anonymous" in producer.lower():
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">An Anonymous Volunteer</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Anonymous Volunteer, An</meta>\n"
                else:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">{producer.strip('.')}</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">TRANSCRIBER_SORT</meta>\n"

                producers_xhtml = producers_xhtml + f"\t\t<meta property=\"role\" refines=\"#transcriber-{i}\" scheme=\"marc:relators\">trc</meta>\n"

                i = i + 1

            metadata_xml = regex.sub(
                r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>",
                "\t\t" + producers_xhtml.strip(),
                metadata_xml,
                flags=regex.DOTALL)

        if ebook_wiki_url:
            metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<",
                                                f">{ebook_wiki_url}<")

        authors_xml = _generate_metadata_contributor_xml(authors, "author")
        authors_xml = authors_xml.replace("dc:contributor", "dc:creator")
        metadata_xml = regex.sub(
            r"<dc:creator id=\"author\">AUTHOR</dc:creator>.+?scheme=\"marc:relators\">aut</meta>",
            authors_xml,
            metadata_xml,
            flags=regex.DOTALL)

        if translators:
            translators_xml = _generate_metadata_contributor_xml(
                translators, "translator")
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>",
                translators_xml,
                metadata_xml,
                flags=regex.DOTALL)
        else:
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>\n\t\t",
                "",
                metadata_xml,
                flags=regex.DOTALL)

        if illustrators:
            illustrators_xml = _generate_metadata_contributor_xml(
                illustrators, "illustrator")
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>",
                illustrators_xml,
                metadata_xml,
                flags=regex.DOTALL)
        else:
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>\n\t\t",
                "",
                metadata_xml,
                flags=regex.DOTALL)

        if args.pg_url:
            if pg_subjects:
                subject_xhtml = ""

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<dc:subject id=\"subject-{i}\">{subject}</dc:subject>\n"
                    i = i + 1

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<meta property=\"authority\" refines=\"#subject-{i}\">LCSH</meta>\n"

                    # Now, get the LCSH ID by querying LCSH directly.
                    try:
                        response = requests.get(
                            f"https://id.loc.gov/search/?q=%22{urllib.parse.quote(subject)}%22"
                        )
                        result = regex.search(
                            fr"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{regex.escape(subject.replace(' -- ', '--'))}</a>",
                            response.text)

                        loc_id = "Unknown"
                        try:
                            loc_id = result.group(1)
                        except Exception as ex:
                            pass

                        subject_xhtml = subject_xhtml + f"\t\t<meta property=\"term\" refines=\"#subject-{i}\">{loc_id}</meta>\n"

                    except Exception as ex:
                        raise se.RemoteCommandErrorException(
                            f"Couldn’t connect to [url][link=https://id.loc.gov]https://id.loc.gov[/][/]. Exception: {ex}"
                        )

                    i = i + 1

                metadata_xml = regex.sub(
                    r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>",
                    "\t\t" + subject_xhtml.strip(), metadata_xml)

            metadata_xml = metadata_xml.replace(
                "<dc:language>LANG</dc:language>",
                f"<dc:language>{pg_language}</dc:language>")
            metadata_xml = metadata_xml.replace(
                "<dc:source>PG_URL</dc:source>",
                f"<dc:source>{args.pg_url}</dc:source>")

        file.seek(0)
        file.write(metadata_xml)
        file.truncate()

    # Set up local git repo
    repo = git.Repo.init(repo_path)

    if args.email:
        with repo.config_writer() as config:
            config.set_value("user", "email", args.email)

    if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
        raise se.InvalidXhtmlException(
            "Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook."
        )
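
Several of the steps above pre-fill templates with `_replace_in_file()`, which isn't shown here. A minimal sketch of what such a helper might do, assuming it performs a plain string substitution in a single file; the real helper in the `se` toolset may handle more cases:

from pathlib import Path


def _replace_in_file(file_path: Path, search: str, replacement: str) -> None:
    """
    Replace all occurrences of `search` with `replacement` in the given file.
    """

    with open(file_path, "r+", encoding="utf-8") as file:
        text = file.read().replace(search, replacement)
        file.seek(0)
        file.write(text)
        file.truncate()
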
Example #4
def create_draft(args: Namespace) -> int:
	"""
	Entry point for `se create-draft`
	"""

	if args.create_github_repo and not args.create_se_repo:
		se.print_error("--create-github-repo option specified, but --create-se-repo option not specified.")
		return se.InvalidInputException.code

	if args.pg_url and not regex.match("^https?://www.gutenberg.org/ebooks/[0-9]+$", args.pg_url):
		se.print_error("Project Gutenberg URL must look like: https://www.gutenberg.org/ebooks/<EBOOK-ID>")
		return se.InvalidInputException.code

	# Put together some variables for later use
	identifier = se.formatting.make_url_safe(args.author) + "/" + se.formatting.make_url_safe(args.title)
	title_string = args.title.replace("'", "’") + ", by " + args.author.replace("'", "’")
	sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", args.title)
	pg_producers = []

	if args.translator:
		identifier = identifier + "/" + se.formatting.make_url_safe(args.translator)
		title_string = title_string + ". Translated by " + args.translator

	if args.illustrator:
		identifier = identifier + "/" + se.formatting.make_url_safe(args.illustrator)
		title_string = title_string + ". Illustrated by " + args.illustrator

	repo_name = identifier.replace("/", "_")

	if os.path.isdir(repo_name):
		se.print_error("./{}/ already exists.".format(repo_name))
		return se.InvalidInputException.code

	# Download PG HTML and do some fixups
	if args.pg_url:
		args.pg_url = args.pg_url.replace("http://", "https://")

		# Get the ebook metadata
		try:
			response = requests.get(args.pg_url)
			pg_metadata_html = response.text
		except Exception as ex:
			se.print_error("Couldn’t download Project Gutenberg ebook metadata page. Error: {}".format(ex))
			return se.RemoteCommandErrorException.code

		soup = BeautifulSoup(pg_metadata_html, "lxml")

		# Get the ebook HTML URL from the metadata
		pg_ebook_url = None
		for element in soup.select("a[type^=\"text/html\"]"):
			pg_ebook_url = regex.sub("^//", "https://", element["href"])

		if not pg_ebook_url:
			se.print_error("Could download ebook metadata, but couldn’t find URL for the ebook HTML.")
			return se.RemoteCommandErrorException.code

		# Get the ebook LCSH categories
		pg_subjects = []
		for element in soup.select("td[property=\"dcterms:subject\"]"):
			if element["datatype"] == "dcterms:LCSH":
				for subject_link in element.find("a"):
					pg_subjects.append(subject_link.strip())

		# Get the PG publication date
		pg_publication_year = None
		for element in soup.select("td[itemprop=\"datePublished\"]"):
			pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", element.text)

		# Get the actual ebook URL
		try:
			response = requests.get(pg_ebook_url)
			pg_ebook_html = response.text
		except Exception as ex:
			se.print_error("Couldn’t download Project Gutenberg ebook HTML. Error: {}".format(ex))
			return se.RemoteCommandErrorException.code

		try:
			fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
			pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
		except Exception as ex:
			se.print_error("Couldn’t determine text encoding of Project Gutenberg HTML file. Error: {}".format(ex))
			return se.InvalidEncodingException.code

		# Try to guess the ebook language
		pg_language = "en-US"
		if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
			pg_language = "en-GB"

	# Create necessary directories
	os.makedirs(os.path.join(repo_name, "images"))
	os.makedirs(os.path.join(repo_name, "src", "epub", "css"))
	os.makedirs(os.path.join(repo_name, "src", "epub", "images"))
	os.makedirs(os.path.join(repo_name, "src", "epub", "text"))
	os.makedirs(os.path.join(repo_name, "src", "META-INF"))

	# Write PG data if we have it
	if args.pg_url and pg_ebook_html:
		soup = BeautifulSoup(pg_ebook_html, "html.parser")

		# Try to get the PG producers.  We only try this if there's a <pre> block with the header info (which is not always the case)
		for element in soup(text=regex.compile(r"\*\*\*\s*Produced by.+$", flags=regex.DOTALL)):
			if element.parent.name == "pre":
				pg_producers = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", element, flags=regex.DOTALL)
				pg_producers = regex.sub(r"\(.+?\)", "", pg_producers, flags=regex.DOTALL)
				pg_producers = regex.sub(r"(at )?https?://www\.pgdp\.net", "", pg_producers, flags=regex.DOTALL)
				pg_producers = regex.sub(r"[\r\n]+", " ", pg_producers, flags=regex.DOTALL)
				pg_producers = regex.sub(r",? and ", ", and ", pg_producers)
				pg_producers = pg_producers.replace(" and the Online", " and The Online")
				pg_producers = pg_producers.replace(", and ", ", ").strip().split(", ")

		# Try to strip out the PG header
		for element in soup(text=regex.compile(r"\*\*\*\s*START OF THIS")):
			for sibling in element.parent.find_previous_siblings():
				sibling.decompose()

			element.parent.decompose()

		# Try to strip out the PG license footer
		for element in soup(text=regex.compile(r"End of (the )?Project Gutenberg")):
			for sibling in element.parent.find_next_siblings():
				sibling.decompose()

			element.parent.decompose()

		with open(os.path.join(repo_name, "src", "epub", "text", "body.xhtml"), "w") as file:
			file.write(str(soup))

	# Copy over templates
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "gitignore")), os.path.normpath(repo_name + "/.gitignore"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "LICENSE.md")), os.path.normpath(repo_name + "/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "META-INF", "container.xml")), os.path.normpath(repo_name + "/src/META-INF/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "mimetype")), os.path.normpath(repo_name + "/src/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "content.opf")), os.path.normpath(repo_name + "/src/epub/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "onix.xml")), os.path.normpath(repo_name + "/src/epub/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "toc.xhtml")), os.path.normpath(repo_name + "/src/epub/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "core.css")), os.path.normpath(repo_name + "/src/epub/css/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "local.css")), os.path.normpath(repo_name + "/src/epub/css/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "logo.svg")), os.path.normpath(repo_name + "/src/epub/images/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "colophon.xhtml")), os.path.normpath(repo_name + "/src/epub/text/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "imprint.xhtml")), os.path.normpath(repo_name + "/src/epub/text/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "titlepage.xhtml")), os.path.normpath(repo_name + "/src/epub/text/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "uncopyright.xhtml")), os.path.normpath(repo_name + "/src/epub/text/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "titlepage.svg")), os.path.normpath(repo_name + "/images/"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "cover.jpg")), os.path.normpath(repo_name + "/images/cover.jpg"))
	shutil.copy(resource_filename("se", os.path.join("data", "templates", "cover.svg")), os.path.normpath(repo_name + "/images/cover.svg"))

	# Try to find Wikipedia links if possible
	author_wiki_url, author_nacoaf_url = _get_wikipedia_url(args.author, True)
	ebook_wiki_url, _ = _get_wikipedia_url(args.title, False)
	translator_wiki_url = None
	if args.translator:
		translator_wiki_url, translator_nacoaf_url = _get_wikipedia_url(args.translator, True)

	# Pre-fill a few templates
	se.replace_in_file(os.path.normpath(repo_name + "/src/epub/text/titlepage.xhtml"), "TITLESTRING", title_string)
	se.replace_in_file(os.path.normpath(repo_name + "/images/titlepage.svg"), "TITLESTRING", title_string)
	se.replace_in_file(os.path.normpath(repo_name + "/images/cover.svg"), "TITLESTRING", title_string)

	# Create the titlepage SVG
	contributors = {}
	if args.translator:
		contributors["translated by"] = args.translator

	if args.illustrator:
		contributors["illustrated by"] = args.illustrator

	with open(os.path.join(repo_name, "images", "titlepage.svg"), "w") as file:
		file.write(_generate_titlepage_svg(args.title, args.author, contributors, title_string))

	# Create the cover SVG
	with open(os.path.join(repo_name, "images", "cover.svg"), "w") as file:
		file.write(_generate_cover_svg(args.title, args.author, title_string))

	if args.pg_url:
		se.replace_in_file(os.path.normpath(repo_name + "/src/epub/text/imprint.xhtml"), "PGLINK", args.pg_url)

	with open(os.path.join(repo_name, "src", "epub", "text", "colophon.xhtml"), "r+", encoding="utf-8") as file:
		colophon_xhtml = file.read()

		colophon_xhtml = colophon_xhtml.replace("SEIDENTIFIER", identifier)
		colophon_xhtml = colophon_xhtml.replace(">AUTHOR<", ">{}<".format(args.author))
		colophon_xhtml = colophon_xhtml.replace("TITLE", args.title)

		if author_wiki_url:
			colophon_xhtml = colophon_xhtml.replace("AUTHORWIKILINK", author_wiki_url)

		if args.pg_url:
			colophon_xhtml = colophon_xhtml.replace("PGLINK", args.pg_url)

			if pg_publication_year:
				colophon_xhtml = colophon_xhtml.replace("PG_YEAR", pg_publication_year)

			if pg_producers:
				producers_xhtml = ""
				for i, producer in enumerate(pg_producers):
					if "Distributed Proofreading" in producer:
						producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
					else:
						producers_xhtml = producers_xhtml + "<span class=\"name\">{}</span>".format(producer)

					if i < len(pg_producers) - 1:
						producers_xhtml = producers_xhtml + ", "

					if i == len(pg_producers) - 2:
						producers_xhtml = producers_xhtml + "and "

				producers_xhtml = producers_xhtml + "<br/>"

				colophon_xhtml = colophon_xhtml.replace("<span class=\"name\">TRANSCRIBER1</span>, <span class=\"name\">TRANSCRIBER2</span>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>", producers_xhtml)

		file.seek(0)
		file.write(colophon_xhtml)
		file.truncate()

	with open(os.path.join(repo_name, "src", "epub", "content.opf"), "r+", encoding="utf-8") as file:
		metadata_xhtml = file.read()

		metadata_xhtml = metadata_xhtml.replace("SEIDENTIFIER", identifier)
		metadata_xhtml = metadata_xhtml.replace(">AUTHOR<", ">{}<".format(args.author))
		metadata_xhtml = metadata_xhtml.replace(">TITLESORT<", ">{}<".format(sorted_title))
		metadata_xhtml = metadata_xhtml.replace(">TITLE<", ">{}<".format(args.title))
		metadata_xhtml = metadata_xhtml.replace("VCSIDENTIFIER", repo_name)

		if pg_producers:
			producers_xhtml = ""
			i = 1
			for producer in pg_producers:
				producers_xhtml = producers_xhtml + "\t\t<dc:contributor id=\"transcriber-{}\">{}</dc:contributor>\n".format(i, producer)

				if "Distributed Proofreading" in producer:
					producers_xhtml = producers_xhtml + "\t\t<meta property=\"file-as\" refines=\"#transcriber-{}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{}\">https://pgdp.net</meta>\n".format(i, i)
				else:
					producers_xhtml = producers_xhtml + "\t\t<meta property=\"file-as\" refines=\"#transcriber-{}\">TRANSCRIBERSORT</meta>\n".format(i)

				producers_xhtml = producers_xhtml + "\t\t<meta property=\"role\" refines=\"#transcriber-{}\" scheme=\"marc:relators\">trc</meta>\n".format(i)

				i = i + 1

			metadata_xhtml = regex.sub(r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBERSORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">LINK</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>", "\t\t" + producers_xhtml.strip(), metadata_xhtml, flags=regex.DOTALL)

		if author_wiki_url:
			metadata_xhtml = metadata_xhtml.replace(">AUTHORWIKILINK<", ">{}<".format(author_wiki_url))

		if author_nacoaf_url:
			metadata_xhtml = metadata_xhtml.replace(">AUTHORNACOAFLINK<", ">{}<".format(author_nacoaf_url))

		if ebook_wiki_url:
			metadata_xhtml = metadata_xhtml.replace(">EBOOKWIKILINK<", ">{}<".format(ebook_wiki_url))

		if args.translator:
			metadata_xhtml = metadata_xhtml.replace(">TRANSLATOR<", ">{}<".format(args.translator))

			if translator_wiki_url:
				metadata_xhtml = metadata_xhtml.replace(">TRANSLATORWIKILINK<", ">{}<".format(translator_wiki_url))

			if translator_nacoaf_url:
				metadata_xhtml = metadata_xhtml.replace(">TRANSLATORNACOAFLINK<", ">{}<".format(translator_nacoaf_url))
		else:
			metadata_xhtml = regex.sub(r"<dc:contributor id=\"translator\">.+?<dc:contributor id=\"artist\">", "<dc:contributor id=\"artist\">", metadata_xhtml, flags=regex.DOTALL)

		if args.pg_url:
			if pg_subjects:
				subject_xhtml = ""

				i = 1
				for subject in pg_subjects:
					subject_xhtml = subject_xhtml + "\t\t<dc:subject id=\"subject-{}\">{}</dc:subject>\n".format(i, subject)
					i = i + 1

				i = 1
				for subject in pg_subjects:
					subject_xhtml = subject_xhtml + "\t\t<meta property=\"meta-auth\" refines=\"#subject-{}\">{}</meta>\n".format(i, args.pg_url)
					i = i + 1

				metadata_xhtml = regex.sub(r"\t\t<dc:subject id=\"subject-1\">SUBJECT1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT2</dc:subject>\s*<meta property=\"meta-auth\" refines=\"#subject-1\">LOCLINK1</meta>\s*<meta property=\"meta-auth\" refines=\"#subject-2\">LOCLINK2</meta>", "\t\t" + subject_xhtml.strip(), metadata_xhtml)

			metadata_xhtml = metadata_xhtml.replace("<dc:language>LANG</dc:language>", "<dc:language>{}</dc:language>".format(pg_language))
			metadata_xhtml = metadata_xhtml.replace("<dc:source>LINK</dc:source>", "<dc:source>{}</dc:source>".format(args.pg_url))

		file.seek(0)
		file.write(metadata_xhtml)
		file.truncate()

	# Set up local git repo
	repo = git.Repo.init(repo_name)

	if args.email:
		with repo.config_writer() as config:
			config.set_value("user", "email", args.email)

	# Set up remote git repos
	if args.create_se_repo:
		git_command = git.cmd.Git(repo_name)
		git_command.remote("add", "origin", "standardebooks.org:/standardebooks.org/ebooks/{}.git".format(repo_name))

		# Set git to automatically push to SE
		git_command.config("branch.master.remote", "origin")
		git_command.config("branch.master.merge", "refs/heads/master")

		github_option = ""
		if args.create_github_repo:
			github_option = "--github"

		return_code = call(["ssh", "standardebooks.org", "/standardebooks.org/scripts/init-se-repo --repo-name={} --title-string=\"{}\" {}".format(repo_name, title_string, github_option)])
		if return_code != 0:
			se.print_error("Failed to create repository on Standard Ebooks server: ssh returned code {}.".format(return_code))
			return se.RemoteCommandErrorException.code

	return 0
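
All of these variants build the SE identifier and repository name with `se.formatting.make_url_safe()`. The sketch below shows roughly what a slugifying helper like that does; the real implementation in the `se` toolset may differ in its edge cases.

import unicodedata

import regex


def make_url_safe(text: str) -> str:
	"""
	Lowercase a string and reduce it to ASCII letters, digits, and dashes.
	"""

	# Decompose accented characters and drop anything that isn't ASCII
	text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")

	# Remove punctuation, then collapse whitespace and runs of dashes into single dashes
	text = regex.sub(r"[^a-z0-9\s-]", "", text.lower())
	text = regex.sub(r"[\s-]+", "-", text).strip("-")

	return text

# For example, make_url_safe("Leo Tolstoy") would yield "leo-tolstoy", so an identifier
# would look something like "leo-tolstoy/war-and-peace".
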
Example #5
def _create_draft(args: Namespace):
    """
	Implementation for `se create-draft`
	"""

    # Put together some variables for later use
    identifier = se.formatting.make_url_safe(
        args.author) + "/" + se.formatting.make_url_safe(args.title)
    title_string = args.title.replace(
        "'", "’") + ", by " + args.author.replace("'", "’")
    sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", args.title)
    pg_producers = []

    if args.translator:
        identifier = identifier + "/" + se.formatting.make_url_safe(
            args.translator)
        title_string = title_string + ". Translated by " + args.translator

    if args.illustrator:
        identifier = identifier + "/" + se.formatting.make_url_safe(
            args.illustrator)
        title_string = title_string + ". Illustrated by " + args.illustrator

    repo_name = identifier.replace("/", "_")

    repo_path = Path(repo_name).resolve()

    if repo_path.is_dir():
        raise se.InvalidInputException(
            f"Directory already exists: [path][link=file://{repo_path}]{repo_path}[/][/]."
        )

    # Download PG HTML and do some fixups
    if args.pg_url:
        if args.offline:
            raise se.RemoteCommandErrorException(
                "Cannot download Project Gutenberg ebook when offline option is enabled."
            )

        args.pg_url = args.pg_url.replace("http://", "https://")

        # Get the ebook metadata
        try:
            response = requests.get(args.pg_url)
            pg_metadata_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook metadata page. Exception: {ex}"
            )

        soup = BeautifulSoup(pg_metadata_html, "lxml")

        # Get the ebook HTML URL from the metadata
        pg_ebook_url = None
        for element in soup.select("a[type^=\"text/html\"]"):
            pg_ebook_url = regex.sub(r"^//", "https://", element["href"])
            pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/",
                                     pg_ebook_url)

        if not pg_ebook_url:
            raise se.RemoteCommandErrorException(
                "Could download ebook metadata, but couldn’t find URL for the ebook HTML."
            )

        # Get the ebook LCSH categories
        pg_subjects = []
        for element in soup.select("td[property=\"dcterms:subject\"]"):
            if element["datatype"] == "dcterms:LCSH":
                for subject_link in element.find("a"):
                    pg_subjects.append(subject_link.strip())

        # Get the PG publication date
        pg_publication_year = None
        for element in soup.select("td[itemprop=\"datePublished\"]"):
            pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1",
                                            element.text)

        # Get the actual ebook URL
        try:
            response = requests.get(pg_ebook_url)
            pg_ebook_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(
                f"Couldn’t download Project Gutenberg ebook HTML. Exception: {ex}"
            )

        try:
            fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
            pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
        except Exception as ex:
            raise se.InvalidEncodingException(
                f"Couldn’t determine text encoding of Project Gutenberg HTML file. Exception: {ex}"
            )

        # Try to guess the ebook language
        pg_language = "en-US"
        if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
            pg_language = "en-GB"

    # Create necessary directories
    (repo_path / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "css").mkdir(parents=True)
    (repo_path / "src" / "epub" / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "text").mkdir(parents=True)
    (repo_path / "src" / "META-INF").mkdir(parents=True)

    is_pg_html_parsed = True

    # Write PG data if we have it
    if args.pg_url and pg_ebook_html:
        try:
            soup = BeautifulSoup(pg_ebook_html, "html.parser")

            # Try to get the PG producers.  We only try this if there's a <pre> block with the header info (which is not always the case)
            for element in soup(text=regex.compile(r"\*\*\*\s*Produced by.+$",
                                                   flags=regex.DOTALL)):
                if element.parent.name == "pre":
                    producers_text = regex.sub(r".+?Produced by (.+?)\s*$",
                                               "\\1",
                                               element,
                                               flags=regex.DOTALL)
                    producers_text = regex.sub(r"\(.+?\)",
                                               "",
                                               producers_text,
                                               flags=regex.DOTALL)
                    producers_text = regex.sub(
                        r"(at )?https?://www\.pgdp\.net",
                        "",
                        producers_text,
                        flags=regex.DOTALL)
                    producers_text = regex.sub(r"[\r\n]+",
                                               " ",
                                               producers_text,
                                               flags=regex.DOTALL)
                    producers_text = regex.sub(r",? and ", ", and ",
                                               producers_text)
                    producers_text = producers_text.replace(
                        " and the Online", " and The Online")
                    producers_text = producers_text.replace(", and ",
                                                            ", ").strip()

                    pg_producers = producers_text.split(", ")

            # Try to strip out the PG header
            for element in soup(text=regex.compile(r"\*\*\*\s*START OF THIS")):
                for sibling in element.parent.find_previous_siblings():
                    sibling.decompose()

                element.parent.decompose()

            # Try to strip out the PG license footer
            for element in soup(
                    text=regex.compile(r"End of (the )?Project Gutenberg")):
                for sibling in element.parent.find_next_siblings():
                    sibling.decompose()

                element.parent.decompose()

            with open(repo_path / "src" / "epub" / "text" / "body.xhtml",
                      "w",
                      encoding="utf-8") as file:
                file.write(str(soup))
        except OSError as ex:
            raise se.InvalidFileException(
                f"Couldn’t write to ebook directory. Exception: {ex}")
        except:
            # Save this error for later, because it's still useful to complete the create-draft process
            # even if we've failed to parse PG's HTML source.
            is_pg_html_parsed = False
            se.quiet_remove(repo_path / "src" / "epub" / "text" / "body.xhtml")

    # Copy over templates

    _copy_template_file("gitignore", repo_path / ".gitignore")
    _copy_template_file("LICENSE.md", repo_path)
    _copy_template_file("container.xml", repo_path / "src" / "META-INF")
    _copy_template_file("mimetype", repo_path / "src")
    _copy_template_file("content.opf", repo_path / "src" / "epub")
    _copy_template_file("onix.xml", repo_path / "src" / "epub")
    _copy_template_file("toc.xhtml", repo_path / "src" / "epub")
    _copy_template_file("core.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("local.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("logo.svg", repo_path / "src" / "epub" / "images")
    _copy_template_file("colophon.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("imprint.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("uncopyright.xhtml",
                        repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.svg", repo_path / "images")
    _copy_template_file("cover.jpg", repo_path / "images" / "cover.jpg")
    _copy_template_file("cover.svg", repo_path / "images" / "cover.svg")

    # Try to find Wikipedia links if possible
    if args.offline:
        author_wiki_url = None
        author_nacoaf_url = None
        ebook_wiki_url = None
        translator_wiki_url = None
        translator_nacoaf_url = None
    else:
        author_wiki_url, author_nacoaf_url = _get_wikipedia_url(
            args.author, True)
        ebook_wiki_url = None
        if args.title != "Short Fiction":
            # There's a "Short Fiction" Wikipedia article, so make an exception for that case
            ebook_wiki_url, _ = _get_wikipedia_url(args.title, False)
        translator_wiki_url = None
        if args.translator:
            translator_wiki_url, translator_nacoaf_url = _get_wikipedia_url(
                args.translator, True)

    # Pre-fill a few templates
    _replace_in_file(repo_path / "src" / "epub" / "text" / "titlepage.xhtml",
                     "TITLE_STRING", title_string)
    _replace_in_file(repo_path / "images" / "titlepage.svg", "TITLE_STRING",
                     title_string)
    _replace_in_file(repo_path / "images" / "cover.svg", "TITLE_STRING",
                     title_string)

    # Create the titlepage SVG
    contributors = {}
    if args.translator:
        contributors["translated by"] = args.translator

    if args.illustrator:
        contributors["illustrated by"] = args.illustrator

    with open(repo_path / "images" / "titlepage.svg", "w",
              encoding="utf-8") as file:
        file.write(
            _generate_titlepage_svg(args.title, args.author, contributors,
                                    title_string))

    # Create the cover SVG
    with open(repo_path / "images" / "cover.svg", "w",
              encoding="utf-8") as file:
        file.write(_generate_cover_svg(args.title, args.author, title_string))

    # Build the cover/titlepage for distribution
    epub = SeEpub(repo_path)
    epub.generate_cover_svg()
    epub.generate_titlepage_svg()

    if args.pg_url:
        _replace_in_file(repo_path / "src" / "epub" / "text" / "imprint.xhtml",
                         "PG_URL", args.pg_url)

    with open(repo_path / "src" / "epub" / "text" / "colophon.xhtml",
              "r+",
              encoding="utf-8") as file:
        colophon_xhtml = file.read()

        colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
        colophon_xhtml = colophon_xhtml.replace(">AUTHOR<", f">{args.author}<")
        colophon_xhtml = colophon_xhtml.replace("TITLE", args.title)

        if author_wiki_url:
            colophon_xhtml = colophon_xhtml.replace("AUTHOR_WIKI_URL",
                                                    author_wiki_url)

        if args.pg_url:
            colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

            if pg_publication_year:
                colophon_xhtml = colophon_xhtml.replace(
                    "PG_YEAR", pg_publication_year)

            if pg_producers:
                producers_xhtml = ""
                for i, producer in enumerate(pg_producers):
                    if "Distributed Proofread" in producer:
                        producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
                    elif "anonymous" in producer.lower():
                        producers_xhtml = producers_xhtml + "<b class=\"name\">An Anonymous Volunteer</b>"
                    else:
                        producers_xhtml = producers_xhtml + f"<b class=\"name\">{producer.strip('.')}</b>"

                    if i < len(pg_producers) - 1:
                        producers_xhtml = producers_xhtml + ", "

                    if i == len(pg_producers) - 2:
                        producers_xhtml = producers_xhtml + "and "

                producers_xhtml = producers_xhtml + "<br/>"

                colophon_xhtml = colophon_xhtml.replace(
                    "<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>",
                    producers_xhtml)

        file.seek(0)
        file.write(colophon_xhtml)
        file.truncate()

    with open(repo_path / "src" / "epub" / "content.opf",
              "r+",
              encoding="utf-8") as file:
        metadata_xml = file.read()

        metadata_xml = metadata_xml.replace("SE_IDENTIFIER", identifier)
        metadata_xml = metadata_xml.replace(">AUTHOR<", f">{args.author}<")
        metadata_xml = metadata_xml.replace(">TITLE_SORT<",
                                            f">{sorted_title}<")
        metadata_xml = metadata_xml.replace(">TITLE<", f">{args.title}<")
        metadata_xml = metadata_xml.replace("VCS_IDENTIFIER", str(repo_name))

        if pg_producers:
            producers_xhtml = ""
            i = 1
            for producer in pg_producers:
                if "Distributed Proofread" in producer:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">The Online Distributed Proofreading Team</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{i}\">https://pgdp.net</meta>\n"
                elif "anonymous" in producer.lower():
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">An Anonymous Volunteer</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Anonymous Volunteer, An</meta>\n"
                else:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">{producer.strip('.')}</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">TRANSCRIBER_SORT</meta>\n"

                producers_xhtml = producers_xhtml + f"\t\t<meta property=\"role\" refines=\"#transcriber-{i}\" scheme=\"marc:relators\">trc</meta>\n"

                i = i + 1

            metadata_xml = regex.sub(
                r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>",
                "\t\t" + producers_xhtml.strip(),
                metadata_xml,
                flags=regex.DOTALL)

        if author_wiki_url:
            metadata_xml = metadata_xml.replace(">AUTHOR_WIKI_URL<",
                                                f">{author_wiki_url}<")

        if author_nacoaf_url:
            metadata_xml = metadata_xml.replace(">AUTHOR_NACOAF_URL<",
                                                f">{author_nacoaf_url}<")

        if ebook_wiki_url:
            metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<",
                                                f">{ebook_wiki_url}<")

        if args.translator:
            metadata_xml = metadata_xml.replace(">TRANSLATOR<",
                                                f">{args.translator}<")

            if translator_wiki_url:
                metadata_xml = metadata_xml.replace(
                    ">TRANSLATOR_WIKI_URL<", f">{translator_wiki_url}<")

            if translator_nacoaf_url:
                metadata_xml = metadata_xml.replace(
                    ">TRANSLATOR_NACOAF_URL<", f">{translator_nacoaf_url}<")
        else:
            metadata_xml = regex.sub(
                r"<dc:contributor id=\"translator\">.+?<dc:contributor id=\"artist\">",
                "<dc:contributor id=\"artist\">",
                metadata_xml,
                flags=regex.DOTALL)

        if args.pg_url:
            if pg_subjects:
                subject_xhtml = ""

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<dc:subject id=\"subject-{i}\">{subject}</dc:subject>\n"
                    i = i + 1

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<meta property=\"authority\" refines=\"#subject-{i}\">LCSH</meta>\n"

                    # Now, get the LCSH ID by querying LCSH directly.
                    try:
                        response = requests.get(
                            f"https://id.loc.gov/search/?q=%22{urllib.parse.quote(subject)}%22"
                        )
                        result = regex.search(
                            fr"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{regex.escape(subject.replace(' -- ', '--'))}</a>",
                            response.text)

                        loc_id = "Unknown"
                        try:
                            loc_id = result.group(1)
                        except Exception as ex:
                            pass

                        subject_xhtml = subject_xhtml + f"\t\t<meta property=\"term\" refines=\"#subject-{i}\">{loc_id}</meta>\n"

                    except Exception as ex:
                        raise se.RemoteCommandErrorException(
                            f"Couldn’t connect to [url][link=https://id.loc.gov]https://id.loc.gov[/][/]. Exception: {ex}"
                        )

                    i = i + 1

                metadata_xml = regex.sub(
                    r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>",
                    "\t\t" + subject_xhtml.strip(), metadata_xml)

            metadata_xml = metadata_xml.replace(
                "<dc:language>LANG</dc:language>",
                f"<dc:language>{pg_language}</dc:language>")
            metadata_xml = metadata_xml.replace(
                "<dc:source>PG_URL</dc:source>",
                f"<dc:source>{args.pg_url}</dc:source>")

        file.seek(0)
        file.write(metadata_xml)
        file.truncate()

    # Set up local git repo
    repo = git.Repo.init(repo_path)

    if args.email:
        with repo.config_writer() as config:
            config.set_value("user", "email", args.email)

    if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
        raise se.InvalidXhtmlException(
            "Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook."
        )
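
The colophon block above joins producer names into an English-style list ("A, B, and C"). A minimal, standalone sketch of that joining behaviour, with hypothetical names and the <b>/<a> markup omitted (join_names is not part of the se codebase):

def join_names(names: list) -> str:
	# Mirror the loop above: comma-separate the items and put "and " before the last one.
	joined = ""
	for i, name in enumerate(names):
		joined = joined + name

		if i < len(names) - 1:
			joined = joined + ", "

		if i == len(names) - 2:
			joined = joined + "and "

	return joined

# join_names(["Alice", "Bob", "Carol"])  ->  "Alice, Bob, and Carol"
# join_names(["Alice", "Bob"])           ->  "Alice, and Bob" (matches the original's two-name output)
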
def create_draft(args: list):
	"""
	Entry point for `se create-draft`
	"""

	# Put together some variables for later use
	identifier = se.formatting.make_url_safe(args.author) + "/" + se.formatting.make_url_safe(args.title)
	title_string = args.title.replace("'", "’") + ", by " + args.author.replace("'", "’")
	sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", args.title)
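	# e.g. an illustrative title "The Time Machine" becomes "Time Machine, The"; titles without a leading article are left unchanged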
	pg_producers = []

	if args.translator:
		identifier = identifier + "/" + se.formatting.make_url_safe(args.translator)
		title_string = title_string + ". Translated by " + args.translator

	if args.illustrator:
		identifier = identifier + "/" + se.formatting.make_url_safe(args.illustrator)
		title_string = title_string + ". Illustrated by " + args.illustrator

	repo_name = Path(identifier.replace("/", "_"))

	if repo_name.is_dir():
		raise se.InvalidInputException("./{}/ already exists.".format(repo_name))

	# Download PG HTML and do some fixups
	if args.pg_url:
		args.pg_url = args.pg_url.replace("http://", "https://")

		# Get the ebook metadata
		try:
			response = requests.get(args.pg_url)
			pg_metadata_html = response.text
		except Exception as ex:
			raise se.RemoteCommandErrorException("Couldn’t download Project Gutenberg ebook metadata page. Error: {}".format(ex))

		soup = BeautifulSoup(pg_metadata_html, "lxml")

		# Get the ebook HTML URL from the metadata
		pg_ebook_url = None
		for element in soup.select("a[type^=\"text/html\"]"):
			pg_ebook_url = regex.sub(r"^//", "https://", element["href"])
			pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/", pg_ebook_url)

		if not pg_ebook_url:
			raise se.RemoteCommandErrorException("Could download ebook metadata, but couldn’t find URL for the ebook HTML.")

		# Get the ebook LCSH categories
		pg_subjects = []
		for element in soup.select("td[property=\"dcterms:subject\"]"):
			if element["datatype"] == "dcterms:LCSH":
				for subject_link in element.find("a"):
					pg_subjects.append(subject_link.strip())

		# Get the PG publication date
		pg_publication_year = None
		for element in soup.select("td[itemprop=\"datePublished\"]"):
			pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", element.text)

		# Get the actual ebook URL
		try:
			response = requests.get(pg_ebook_url)
			pg_ebook_html = response.text
		except Exception as ex:
			raise se.RemoteCommandErrorException("Couldn’t download Project Gutenberg ebook HTML. Error: {}".format(ex))

		try:
			fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
			pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
		except Exception as ex:
			raise se.InvalidEncodingException("Couldn’t determine text encoding of Project Gutenberg HTML file. Error: {}".format(ex))

		# Try to guess the ebook language
		pg_language = "en-US"
		if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
			pg_language = "en-GB"

	# Create necessary directories
	(repo_name / "images").mkdir(parents=True)
	(repo_name / "src" / "epub" / "css").mkdir(parents=True)
	(repo_name / "src" / "epub" / "images").mkdir(parents=True)
	(repo_name / "src" / "epub" / "text").mkdir(parents=True)
	(repo_name / "src" / "META-INF").mkdir(parents=True)

	is_pg_html_parsed = True

	# Write PG data if we have it
	if args.pg_url and pg_ebook_html:
		try:
			soup = BeautifulSoup(pg_ebook_html, "html.parser")

			# Try to get the PG producers.  We only try this if there's a <pre> block with the header info (which is not always the case)
			for element in soup(text=regex.compile(r"\*\*\*\s*Produced by.+$", flags=regex.DOTALL)):
				if element.parent.name == "pre":
					pg_producers = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", element, flags=regex.DOTALL)
					pg_producers = regex.sub(r"\(.+?\)", "", pg_producers, flags=regex.DOTALL)
					pg_producers = regex.sub(r"(at )?https?://www\.pgdp\.net", "", pg_producers, flags=regex.DOTALL)
					pg_producers = regex.sub(r"[\r\n]+", " ", pg_producers, flags=regex.DOTALL)
					pg_producers = regex.sub(r",? and ", ", and ", pg_producers)
					pg_producers = pg_producers.replace(" and the Online", " and The Online")
					pg_producers = pg_producers.replace(", and ", ", ").strip().split(", ")

			# Try to strip out the PG header
			for element in soup(text=regex.compile(r"\*\*\*\s*START OF THIS")):
				for sibling in element.parent.find_previous_siblings():
					sibling.decompose()

				element.parent.decompose()

			# Try to strip out the PG license footer
			for element in soup(text=regex.compile(r"End of (the )?Project Gutenberg")):
				for sibling in element.parent.find_next_siblings():
					sibling.decompose()

				element.parent.decompose()

			with open(repo_name / "src" / "epub" / "text" / "body.xhtml", "w", encoding="utf-8") as file:
				file.write(str(soup))
		except IOError as ex:
			raise se.InvalidFileException("Couldn’t write to ebook directory. Error: {}".format(ex))
		except Exception:
			# Save this error for later, because it's still useful to complete the create-draft process
			# even if we've failed to parse PG's HTML source.
			is_pg_html_parsed = False
			se.quiet_remove(repo_name / "src" / "epub" / "text" / "body.xhtml")

	# Copy over templates
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "gitignore")), repo_name / ".gitignore")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "LICENSE.md")), repo_name)
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "META-INF" / "container.xml")), repo_name / "src" / "META-INF")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "mimetype")), repo_name / "src")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "content.opf")), repo_name / "src" / "epub")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "onix.xml")), repo_name / "src" / "epub")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "toc.xhtml")), repo_name / "src" / "epub")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "core.css")), repo_name / "src" / "epub" / "css")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "local.css")), repo_name / "src" / "epub" / "css")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "logo.svg")), repo_name / "src" / "epub" / "images")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "colophon.xhtml")), repo_name / "src" / "epub" / "text")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "imprint.xhtml")), repo_name / "src" / "epub" / "text")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "titlepage.xhtml")), repo_name / "src" / "epub" / "text")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "uncopyright.xhtml")), repo_name / "src" / "epub" / "text")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "titlepage.svg")), repo_name / "images")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "cover.jpg")), repo_name / "images" / "cover.jpg")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "cover.svg")), repo_name / "images" / "cover.svg")

	# Try to find Wikipedia links if possible
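	# _get_wikipedia_url returns a (wikipedia_url, nacoaf_url) pair; the boolean appears to control
	# whether the NACOAF (Name Authority Cooperative) lookup is attempted, which is why the second
	# value is discarded for the ebook title below.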
	author_wiki_url, author_nacoaf_url = _get_wikipedia_url(args.author, True)
	ebook_wiki_url, _ = _get_wikipedia_url(args.title, False)
	translator_wiki_url = None
	if args.translator:
		translator_wiki_url, translator_nacoaf_url = _get_wikipedia_url(args.translator, True)

	# Pre-fill a few templates
	se.replace_in_file(repo_name / "src" / "epub" / "text" / "titlepage.xhtml", "TITLE_STRING", title_string)
	se.replace_in_file(repo_name / "images" / "titlepage.svg", "TITLE_STRING", title_string)
	se.replace_in_file(repo_name / "images" / "cover.svg", "TITLE_STRING", title_string)

	# Create the titlepage SVG
	contributors = {}
	if args.translator:
		contributors["translated by"] = args.translator

	if args.illustrator:
		contributors["illustrated by"] = args.illustrator

	with open(repo_name / "images" / "titlepage.svg", "w", encoding="utf-8") as file:
		file.write(_generate_titlepage_svg(args.title, args.author, contributors, title_string))

	# Create the cover SVG
	with open(repo_name / "images" / "cover.svg", "w", encoding="utf-8") as file:
		file.write(_generate_cover_svg(args.title, args.author, title_string))

	if args.pg_url:
		se.replace_in_file(repo_name / "src" / "epub" / "text" / "imprint.xhtml", "PG_URL", args.pg_url)

	with open(repo_name / "src" / "epub" / "text" / "colophon.xhtml", "r+", encoding="utf-8") as file:
		colophon_xhtml = file.read()

		colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
		colophon_xhtml = colophon_xhtml.replace(">AUTHOR<", ">{}<".format(args.author))
		colophon_xhtml = colophon_xhtml.replace("TITLE", args.title)

		if author_wiki_url:
			colophon_xhtml = colophon_xhtml.replace("AUTHOR_WIKI_URL", author_wiki_url)

		if args.pg_url:
			colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

			if pg_publication_year:
				colophon_xhtml = colophon_xhtml.replace("PG_YEAR", pg_publication_year)

			if pg_producers:
				producers_xhtml = ""
				for i, producer in enumerate(pg_producers):
					if "Distributed Proofreading" in producer:
						producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
					else:
						producers_xhtml = producers_xhtml + "<b class=\"name\">{}</b>".format(producer)

					if i < len(pg_producers) - 1:
						producers_xhtml = producers_xhtml + ", "

					if i == len(pg_producers) - 2:
						producers_xhtml = producers_xhtml + "and "

				producers_xhtml = producers_xhtml + "<br/>"

				colophon_xhtml = colophon_xhtml.replace("<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>", producers_xhtml)

		file.seek(0)
		file.write(colophon_xhtml)
		file.truncate()

	with open(repo_name / "src" / "epub" / "content.opf", "r+", encoding="utf-8") as file:
		metadata_xhtml = file.read()

		metadata_xhtml = metadata_xhtml.replace("SE_IDENTIFIER", identifier)
		metadata_xhtml = metadata_xhtml.replace(">AUTHOR<", ">{}<".format(args.author))
		metadata_xhtml = metadata_xhtml.replace(">TITLE_SORT<", ">{}<".format(sorted_title))
		metadata_xhtml = metadata_xhtml.replace(">TITLE<", ">{}<".format(args.title))
		metadata_xhtml = metadata_xhtml.replace("VCS_IDENTIFIER", str(repo_name))

		if pg_producers:
			producers_xhtml = ""
			i = 1
			for producer in pg_producers:
				producers_xhtml = producers_xhtml + "\t\t<dc:contributor id=\"transcriber-{}\">{}</dc:contributor>\n".format(i, producer)

				if "Distributed Proofreading" in producer:
					producers_xhtml = producers_xhtml + "\t\t<meta property=\"file-as\" refines=\"#transcriber-{0}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{0}\">https://pgdp.net</meta>\n".format(i)
				else:
					producers_xhtml = producers_xhtml + "\t\t<meta property=\"file-as\" refines=\"#transcriber-{}\">TRANSCRIBER_SORT</meta>\n".format(i)

				producers_xhtml = producers_xhtml + "\t\t<meta property=\"role\" refines=\"#transcriber-{}\" scheme=\"marc:relators\">trc</meta>\n".format(i)

				i = i + 1

			metadata_xhtml = regex.sub(r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>", "\t\t" + producers_xhtml.strip(), metadata_xhtml, flags=regex.DOTALL)

		if author_wiki_url:
			metadata_xhtml = metadata_xhtml.replace(">AUTHOR_WIKI_URL<", ">{}<".format(author_wiki_url))

		if author_nacoaf_url:
			metadata_xhtml = metadata_xhtml.replace(">AUTHOR_NACOAF_URL<", ">{}<".format(author_nacoaf_url))

		if ebook_wiki_url:
			metadata_xhtml = metadata_xhtml.replace(">EBOOK_WIKI_URL<", ">{}<".format(ebook_wiki_url))

		if args.translator:
			metadata_xhtml = metadata_xhtml.replace(">TRANSLATOR<", ">{}<".format(args.translator))

			if translator_wiki_url:
				metadata_xhtml = metadata_xhtml.replace(">TRANSLATOR_WIKI_URL<", ">{}<".format(translator_wiki_url))

			if translator_nacoaf_url:
				metadata_xhtml = metadata_xhtml.replace(">TRANSLATOR_NACOAF_URL<", ">{}<".format(translator_nacoaf_url))
		else:
			metadata_xhtml = regex.sub(r"<dc:contributor id=\"translator\">.+?<dc:contributor id=\"artist\">", "<dc:contributor id=\"artist\">", metadata_xhtml, flags=regex.DOTALL)

		if args.pg_url:
			if pg_subjects:
				subject_xhtml = ""

				i = 1
				for subject in pg_subjects:
					subject_xhtml = subject_xhtml + "\t\t<dc:subject id=\"subject-{}\">{}</dc:subject>\n".format(i, subject)
					i = i + 1

				i = 1
				for subject in pg_subjects:
					subject_xhtml = subject_xhtml + "\t\t<meta property=\"authority\" refines=\"#subject-{}\">LCSH</meta>\n".format(i)

					# Now, get the LCSH ID by querying LCSH directly.
					try:
						response = requests.get("http://id.loc.gov/search/?q=%22{}%22".format(urllib.parse.quote(subject)))
						result = regex.search(r"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{}</a>".format(regex.escape(subject.replace(" -- ", "--"))), response.text)

						# result.group(1) is the identifier segment of the record URL; if the
						# search page yielded no match, result is None and we keep "Unknown".
						loc_id = "Unknown"
						try:
							loc_id = result.group(1)
						except Exception:
							pass

						subject_xhtml = subject_xhtml + "\t\t<meta property=\"term\" refines=\"#subject-{}\">{}</meta>\n".format(i, loc_id)

					except Exception as ex:
						raise se.RemoteCommandErrorException("Couldn’t connect to id.loc.gov. Error: {}".format(ex))

					i = i + 1

				metadata_xhtml = regex.sub(r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>", "\t\t" + subject_xhtml.strip(), metadata_xhtml)

			metadata_xhtml = metadata_xhtml.replace("<dc:language>LANG</dc:language>", "<dc:language>{}</dc:language>".format(pg_language))
			metadata_xhtml = metadata_xhtml.replace("<dc:source>PG_URL</dc:source>", "<dc:source>{}</dc:source>".format(args.pg_url))

		file.seek(0)
		file.write(metadata_xhtml)
		file.truncate()

	# Set up local git repo
	repo = git.Repo.init(repo_name)

	if args.email:
		with repo.config_writer() as config:
			config.set_value("user", "email", args.email)

	# Set up remote git repos
	if args.create_se_repo:
		git_command = git.cmd.Git(repo_name)
		git_command.remote("add", "origin", "standardebooks.org:/standardebooks.org/ebooks/{}.git".format(repo_name))

		# Set git to automatically push to SE
		git_command.config("branch.master.remote", "origin")
		git_command.config("branch.master.merge", "refs/heads/master")

		github_option = ""
		if args.create_github_repo:
			github_option = "--github"

		return_code = call(["ssh", "standardebooks.org", "/standardebooks.org/scripts/init-se-repo --repo-name={} --title-string=\"{}\" {}".format(repo_name, title_string, github_option)])
		if return_code != 0:
			raise se.RemoteCommandErrorException("Failed to create repository on Standard Ebooks server: ssh returned code {}.".format(return_code))

	if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
		raise se.InvalidXhtmlException("Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook.")
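
For reference, a hedged sketch of driving create_draft programmatically. The attribute names mirror those the function reads above (author, title, translator, illustrator, pg_url, email, create_se_repo, create_github_repo); in practice they are supplied by the `se create-draft` argument parser, and the values here are purely illustrative:

from argparse import Namespace

# Hypothetical invocation; normally these values come from the CLI argument parser.
args = Namespace(
	author="Jane Austen",
	title="Persuasion",
	translator=None,
	illustrator=None,
	pg_url="https://www.gutenberg.org/ebooks/105",
	email=None,
	create_se_repo=False,
	create_github_repo=False,
)

create_draft(args)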