def xml2leo(event, from_string=None):
    """Handle import of an .xml file, placing the new subtree after c.p"""
    c = event['c']
    p = c.p
    if from_string:
        parser_func = etree.fromstring
        file_name = from_string
    else:
        parser_func = etree.parse
        cd_here(c, p)
        file_name = g.app.gui.runOpenFileDialog(c,
                                                title="Open",
                                                filetypes=table,
                                                defaultextension=".xml")
        if not file_name:
            raise Exception("No file selected")
    try:
        xml_ = parser_func(file_name)
    except etree.XMLSyntaxError:
        xml_ = parser_func(file_name, parser=etree.HTMLParser())
    except Exception:
        g.es("Failed to read '%s'" % file_name)
        raise
    if from_string:
        # etree.fromstring and etree.parse return an Element and an
        # ElementTree respectively
        xml_ = etree.ElementTree(xml_)
    nd = p.insertAfter()
    nd.h = os.path.basename(file_name)
    # The root Element isn't necessarily the first thing in the XML file;
    # move to the beginning of the list to capture preceding comments
    # and processing instructions
    toplevel = xml_.getroot()
    while toplevel.getprevious() is not None:
        toplevel = toplevel.getprevious()
    # Move through the list, covering the root Element and any comments
    # or processing instructions which follow it
    while toplevel is not None:
        append_element(toplevel, nd)
        toplevel = toplevel.getnext()
    nd.b = '<?xml version="%s"?>\n' % (xml_.docinfo.xml_version or '1.0')
    if xml_.docinfo.encoding:
        nd.b = '<?xml version="%s" encoding="%s"?>\n' % (
            xml_.docinfo.xml_version or '1.0', xml_.docinfo.encoding)
    if NSMAP:
        for k in sorted(NSMAP):
            if k:
                nd.b += "%s: %s\n" % (k, NSMAP[k])
            else:
                nd.b += "%s\n" % NSMAP[k]
    nd.b += xml_.docinfo.doctype + '\n'
    c.redraw()
    return nd
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the body
    (not necessarily <body>) starts.
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext
    html = ustr(html)
    if not html:
        return ''
    tree = etree.fromstring(html, parser=etree.HTMLParser())
    if body_id is not None:
        source = tree.xpath('//*[@id="%s"]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]
    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            link.text = '%s [%s]' % (link.text, i)
            url_index.append(url)
    html = ustr(etree.tostring(tree, encoding=encoding))
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')
    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub('<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    html = html.replace('&gt;', '>')
    html = html.replace('&lt;', '<')
    html = html.replace('&amp;', '&')
    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')
    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i + 1, url)
    return html
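A quick sanity check for the converter above; the sample markup is invented, and ustr is assumed to behave like str on already-decoded input:

sample = ('<html><body><h1>Title</h1>'
          '<p>See <a href="http://example.com">this link</a>.</p></body></html>')
print(html2plaintext(sample))
# Roughly: '**Title** See this link [1] .\n\n[1] http://example.com\n'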
from lxml import etree

parser = etree.HTMLParser()
tree = etree.parse("app.html", parser)

name_xpath_1 = '/html/body/div[1]/div[7]/div[4]/div[1]/div[2]/div[2]/div[2]/div/div[3]/text()'
name_xpath_2 = '/html/body/div[1]/div[7]/div[4]/div[1]/div[2]/div[1]/div[2]/div/div[3]/text()'

name_1 = tree.xpath(name_xpath_1)
name_2 = tree.xpath(name_xpath_2)

print(name_1)
print(type(name_1))
print(name_2)
print(type(name_2))
def extract_next_links(rawDatas):
    global most_outlinks, visited_subdomains
    outputLinks = list()
    for urlResponse in rawDatas:
        outlinks = []
        # The URL base path
        basePath = urlResponse.url
        hostName = urlparse(basePath).hostname
        if hostName not in visited_subdomains:
            visited_subdomains[hostName] = set()
        # The content of the page
        content = urlResponse.content
        # Stops us from trying to parse pages with no content or an error
        if not urlResponse.error_message and content:
            # Debug
            if DEBUG_VERY_VERBOSE:
                print "Error Message: ", urlResponse.error_message
                print "Headers: ", urlResponse.headers
                print "Is Redirected: ", urlResponse.is_redirected
                print "Final URL: ", urlResponse.final_url
                print "Content: ", urlResponse.content, "-\n"
                print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
            try:
                # Loading the DOM with etree
                parser = etree.HTMLParser(recover=True)
                pageDom = etree.parse(StringIO.StringIO(content), parser)
                # Checks for the presence of a base tag
                if pageDom.xpath('//base/@href'):
                    basePath = pageDom.xpath('//base/@href')[0]
                # Extracting all of the links
                for linkPath in pageDom.xpath('//a/@href'):
                    # absolutePath = urljoin(basePath, relativePath)
                    absoluteUrl = urljoin(basePath, linkPath)
                    # Adding link to list
                    outlinks.append(absoluteUrl)
                    visited_subdomains[hostName].add(absoluteUrl)
                # If most_outlinks is currently empty, assign it a new tuple
                if most_outlinks[0] == "None":
                    most_outlinks = (basePath, len(outlinks))
                # If the stored outlink count is lower than the current one, replace it
                elif most_outlinks[1] < len(outlinks):
                    most_outlinks = (basePath, len(outlinks))
                outputLinks += outlinks
            except AssertionError as err:
                # Setting this as a bad link
                urlResponse.bad_url = True
                # might want to set that built-in bad flag within the url object here???
                if DEBUG:
                    print err.message
        else:
            # Setting this as a bad link
            urlResponse.bad_url = True
            if DEBUG:
                print "No content or an error code exists"
    # Debug
    if DEBUG_VERBOSE:
        print "List of found links: ", outputLinks
    return outputLinks
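The recover=True flag used above is what lets libxml2 push through malformed markup; a minimal stand-alone illustration (sample HTML invented):

from lxml import etree

# recover=True tells the parser to repair what it can instead of raising;
# the unclosed <p> and <a> below still come back as a usable tree.
broken = "<html><body><p>unclosed <a href='x.html'>link"
root = etree.fromstring(broken, etree.HTMLParser(recover=True))
print(root.xpath('//a/@href'))  # ['x.html']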
def parsehtml(file, urlbiglist):
    # Start processing the urls
    for i in range(len(urlbiglist)):
        for j in range(len(urlbiglist[i])):
            # print urlbiglist[i][j]
            for k in range(3):
                try:
                    request = urllib2.Request(url=urlbiglist[i][j], headers=headers)
                    html = urllib2.urlopen(request).read()
                    print "Connected successfully, leaving the retry loop"
                    break
                except urllib2.HTTPError, e:
                    print "Something went wrong, retrying"
                    continue
            # Work out the encoding
            char_type = chardet.detect(html)
            print char_type
            # Check the encoding once more: keep only Chinese-language pages,
            # skip anything outside that range
            language = ['Chinese', '']
            print char_type['language']
            print char_type['language'] in language
            if not (char_type['language'] in language):
                print char_type['language']
                print char_type['language'] in language
                continue
            if (char_type["encoding"] == 'GB2312'):
                try:
                    html = html.decode('gbk').encode('utf-8')
                except UnicodeDecodeError, e:
                    print "Encoding problem, page skipped"
                    continue
            else:
                html = unicode(html, char_type["encoding"]).encode("utf-8")
            pagecontent = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
            # Every page has a different structure, so several patterns are tried.
            # First approach: build one big string (including spaces and blank lines)
            filecontent = ''
            p1 = pagecontent.xpath('//div[@class="main-content"]')
            print type(p1)
            print p1
            print "First attempt"
            for i in range(len(p1)):
                filecontent = filecontent + p1[i].xpath('string()')
            # Strip spaces and blank lines
            filestringcontent = ''
            file.write('\nThis is an article:\n')
            for line in filecontent.splitlines():
                if not line.split():
                    continue
                line = line.strip()  # strips spaces, which also removes the newline
                filestringcontent += line
            file.write(filestringcontent)
            # Marker written after each page is scraped
            if (len(p1)):
                continue
            print "Second attempt"
            p2 = pagecontent.xpath('//body//div//p//text()')
            print p2
            for l in range(len(p2)):
                print 'Printing the content of each page====='
                print p2[l]
                print type(p2[l])
                file.write(p2[l])
from lxml import etree

html = etree.parse("./test.html", etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]')
print(result)
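Several snippets in this collection parse a local ./test.html; a minimal fixture that satisfies their selectors might look like this (markup invented for illustration):

# Hypothetical fixture: write a small test.html so the parse calls
# above and below have something to read.
sample = """
<div>
  <ul>
    <li class="item-0"><a href="link1.html">first item</a></li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a></li>
  </ul>
</div>
"""
with open("test.html", "w") as f:
    f.write(sample)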
def parse_lagoufile():
    parser = etree.HTMLParser(encoding='utf-8')
    htmlElement = etree.parse('lagou.html', parser=parser)
    print(etree.tostring(htmlElement, encoding='utf-8').decode('utf-8'))
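A recurring detail across these examples: etree.parse returns an ElementTree, while etree.fromstring and etree.HTML return an Element. A small sketch of converting between the two (assumes the test.html fixture written above):

from lxml import etree

parser = etree.HTMLParser()
element = etree.fromstring("<html><body><p>hi</p></body></html>", parser)  # an _Element
tree = etree.parse("test.html", parser)                                    # an _ElementTree

as_tree = etree.ElementTree(element)  # wrap an Element when a tree is expected
as_element = tree.getroot()           # unwrap a tree to its root Element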
def main(number, javlibrary_url):
    try:
        htmlcode = get_html('http://' + javlibrary_url +
                            '/ja/vl_searchbyid.php?keyword=' + number).replace(u'\xa0', u' ')
        title = getTitle(htmlcode)
        movie_found = 1
        if title == '':
            # The page is a search-results page rather than a video info page;
            # walk through the search results
            movie_found = 0
            html = etree.fromstring(htmlcode, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
            count_all = len(html.xpath(
                "//div[@class='videothumblist']/div[@class='videos']/div[@class='video']"))
            for count in range(1, count_all + 1):
                number_get = str(html.xpath(
                    "//div[@class='videothumblist']/div[@class='videos']/div[" +
                    str(count) + "]/a/div[1]/text()")).strip(" ['']")
                if number_get == number.upper():
                    url_get = str(html.xpath(
                        "//div[@class='videothumblist']/div[@class='videos']/div[" +
                        str(count) + "]/a/@href")).strip(" ['.']")
                    htmlcode = get_html('http://' + javlibrary_url + '/ja' + url_get).replace(u'\xa0', u' ')
                    movie_found = 1
                    break
        if movie_found == 1:
            try:
                # Fetch the synopsis from dmm
                dww_htmlcode = get_html(
                    "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=" + number.replace("-", '00'))
            except:
                dww_htmlcode = ''
            actor = getActor(htmlcode)
            number = getNum(htmlcode)
            release = getRelease(htmlcode)
            dic = {
                'actor': str(actor).strip(" [',']").replace('\'', ''),
                'title': getTitle(htmlcode).replace('中文字幕', '').replace("\\n", '').replace('_', '-')
                         .replace(number, '').strip().replace(' ', '-').replace('--', '-'),
                'studio': getStudio(htmlcode),
                'publisher': getPublisher(htmlcode),
                'outline': getOutline(dww_htmlcode).replace('\n', '').replace('\\n', '')
                           .replace('\'', '').replace(',', '').replace(' ', ''),
                'runtime': getRuntime(htmlcode),
                'director': str(getDirector(htmlcode)).replace('----', ''),
                'release': release,
                'number': number,
                'cover': getCover(htmlcode),
                'imagecut': 1,
                'tag': getTag(htmlcode),
                'series': '',
                'year': getYear(release),
                'actor_photo': getActorPhoto(actor),
                'website': getWebsite(htmlcode),
                'source': 'javlibrary.py',
            }
        else:
            dic = {
                'title': '',
                'website': '',
            }
    except:
        if htmlcode == 'ProxyError':
            dic = {
                'title': '',
                'website': 'timeout',
            }
        else:
            dic = {
                'title': '',
                'website': '',
            }
    js = json.dumps(
        dic,
        ensure_ascii=False,
        sort_keys=True,
        indent=4,
        separators=(',', ':'),
    )  # .encode('UTF-8')
    return js
def getTitle(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath("//h3[@class='post-title text']/a/text()")).strip(" ['']")
    return result
def getOutline(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")).strip(" ['']")
    return result
def getWebsite(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = 'http:' + str(html.xpath("/html/head/meta[@property='og:url']/@content")).strip(" ['']")
    return result
def getCover(htmlcode):
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = 'http:' + str(html.xpath("//img[@id='video_jacket_img']/@src")).strip(" ['']")
    return result
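The str(...).strip(" ['']") idiom used by these getters serializes the whole result list and then trims away list punctuation, which mangles values containing quotes or commas. A more defensive sketch; the helper name is ours, not the project's:

from lxml import etree

def xpath_first(htmlcode, xpath, default=''):
    # Hypothetical helper: index into the result list directly instead of
    # round-tripping it through str() and strip().
    matches = etree.fromstring(htmlcode, etree.HTMLParser()).xpath(xpath)
    return matches[0].strip() if matches else default

# e.g. xpath_first(htmlcode, "//img[@id='video_jacket_img']/@src")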
def _create_draft(args: Namespace):
    """
    Implementation for `se create-draft`
    """
    # Put together some variables for later use
    authors = []
    translators = []
    illustrators = []
    pg_producers = []
    title = args.title.replace("'", "’")

    for author in args.author:
        authors.append({"name": author.replace("'", "’"), "wiki_url": None, "nacoaf_url": None})

    if args.translator:
        for translator in args.translator:
            translators.append({"name": translator.replace("'", "’"), "wiki_url": None, "nacoaf_url": None})

    if args.illustrator:
        for illustrator in args.illustrator:
            illustrators.append({"name": illustrator.replace("'", "’"), "wiki_url": None, "nacoaf_url": None})

    title_string = title
    if authors and authors[0]["name"].lower() != "anonymous":
        title_string += ", by " + _generate_contributor_string(authors, False)

    identifier = ""
    for author in authors:
        identifier += se.formatting.make_url_safe(author["name"]) + "_"

    identifier = identifier.rstrip("_") + "/" + se.formatting.make_url_safe(title)

    sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", title)

    if translators:
        title_string = title_string + ". Translated by " + _generate_contributor_string(translators, False)
        identifier = identifier + "/"
        for translator in translators:
            identifier += se.formatting.make_url_safe(translator["name"]) + "_"
        identifier = identifier.rstrip("_")

    if illustrators:
        title_string = title_string + ". Illustrated by " + _generate_contributor_string(illustrators, False)
        identifier = identifier + "/"
        for illustrator in illustrators:
            identifier += se.formatting.make_url_safe(illustrator["name"]) + "_"
        identifier = identifier.rstrip("_")

    repo_name = identifier.replace("/", "_")
    repo_path = Path(repo_name).resolve()

    if repo_path.is_dir():
        raise se.InvalidInputException(f"Directory already exists: [path][link=file://{repo_path}]{repo_path}[/][/].")

    # Get data on authors
    for i, author in enumerate(authors):
        if not args.offline and author["name"].lower() != "anonymous":
            author["wiki_url"], author["nacoaf_url"] = _get_wikipedia_url(author["name"], True)

    # Get data on translators
    for i, translator in enumerate(translators):
        if not args.offline and translator["name"].lower() != "anonymous":
            translator["wiki_url"], translator["nacoaf_url"] = _get_wikipedia_url(translator["name"], True)

    # Get data on illustrators
    for i, illustrator in enumerate(illustrators):
        if not args.offline and illustrator["name"].lower() != "anonymous":
            illustrator["wiki_url"], illustrator["nacoaf_url"] = _get_wikipedia_url(illustrator["name"], True)

    # Download PG HTML and do some fixups
    if args.pg_url:
        if args.offline:
            raise se.RemoteCommandErrorException("Cannot download Project Gutenberg ebook when offline option is enabled.")

        args.pg_url = args.pg_url.replace("http://", "https://")

        # Get the ebook metadata
        try:
            response = requests.get(args.pg_url)
            pg_metadata_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(f"Couldn’t download Project Gutenberg ebook metadata page. Exception: {ex}")

        parser = etree.HTMLParser()
        dom = etree.parse(StringIO(pg_metadata_html), parser)

        # Get the ebook HTML URL from the metadata
        pg_ebook_url = None
        for node in dom.xpath("/html/body//a[contains(@type, 'text/html')]"):
            pg_ebook_url = regex.sub(r"^//", "https://", node.get("href"))
            pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/", pg_ebook_url)

        if not pg_ebook_url:
            raise se.RemoteCommandErrorException("Could download ebook metadata, but couldn’t find URL for the ebook HTML.")

        # Get the ebook LCSH categories
        pg_subjects = []
        for node in dom.xpath("/html/body//td[contains(@property, 'dcterms:subject')]"):
            if node.get("datatype") == "dcterms:LCSH":
                for subject_link in node.xpath("./a"):
                    pg_subjects.append(subject_link.text.strip())

        # Get the PG publication date
        pg_publication_year = None
        for node in dom.xpath("//td[@itemprop='datePublished']"):
            pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", node.text)

        # Get the actual ebook URL
        try:
            response = requests.get(pg_ebook_url)
            pg_ebook_html = response.text
        except Exception as ex:
            raise se.RemoteCommandErrorException(f"Couldn’t download Project Gutenberg ebook HTML. Exception: {ex}")

        try:
            fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
            pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
        except Exception as ex:
            raise se.InvalidEncodingException(f"Couldn’t determine text encoding of Project Gutenberg HTML file. Exception: {ex}")

        # Try to guess the ebook language
        pg_language = "en-US"
        if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
            pg_language = "en-GB"

    # Create necessary directories
    (repo_path / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "css").mkdir(parents=True)
    (repo_path / "src" / "epub" / "images").mkdir(parents=True)
    (repo_path / "src" / "epub" / "text").mkdir(parents=True)
    (repo_path / "src" / "META-INF").mkdir(parents=True)

    is_pg_html_parsed = True

    # Write PG data if we have it
    if args.pg_url and pg_ebook_html:
        try:
            dom = etree.parse(StringIO(regex.sub(r"encoding=\".+?\"", "", pg_ebook_html)), parser)
            namespaces = {"re": "http://exslt.org/regular-expressions"}

            for node in dom.xpath("//*[re:test(text(), '\\*\\*\\*\\s*Produced by.+')]", namespaces=namespaces):
                producers_text = regex.sub(r"^<[^>]+?>", "", etree.tostring(node, encoding=str, with_tail=False))
                producers_text = regex.sub(r"<[^>]+?>$", "", producers_text)
                producers_text = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", producers_text, flags=regex.DOTALL)
                producers_text = regex.sub(r"\(.+?\)", "", producers_text, flags=regex.DOTALL)
                producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net", "", producers_text, flags=regex.DOTALL)
                producers_text = regex.sub(r"[\r\n]+", " ", producers_text, flags=regex.DOTALL)
                producers_text = regex.sub(r",? and ", ", and ", producers_text)
                producers_text = producers_text.replace(" and the Online", " and The Online")
                producers_text = producers_text.replace(", and ", ", ").strip()

                pg_producers = [producer.strip() for producer in regex.split(',|;', producers_text)]

            # Try to strip out the PG header
            for node in dom.xpath("//*[re:test(text(), '\\*\\*\\*\\s*START OF THIS')]", namespaces=namespaces):
                for sibling_node in node.xpath("./preceding-sibling::*"):
                    easy_node = se.easy_xml.EasyXmlElement(sibling_node)
                    easy_node.remove()

                easy_node = se.easy_xml.EasyXmlElement(node)
                easy_node.remove()

            # Try to strip out the PG license footer
            for node in dom.xpath("//*[re:test(text(), 'End of (the )?Project Gutenberg')]", namespaces=namespaces):
                for sibling_node in node.xpath("./following-sibling::*"):
                    easy_node = se.easy_xml.EasyXmlElement(sibling_node)
                    easy_node.remove()

                easy_node = se.easy_xml.EasyXmlElement(node)
                easy_node.remove()

            # lxml will put the xml declaration in a weird place, remove it first
            output = regex.sub(r"<\?xml.+?\?>", "", etree.tostring(dom, encoding="unicode"))

            # Now re-add it
            output = """<?xml version="1.0" encoding="utf-8"?>\n""" + output

            # lxml can also output duplicate default namespace declarations, so keep only the first one
            output = regex.sub(r"(xmlns=\".+?\")(\sxmlns=\".+?\")+", r"\1", output)

            with open(repo_path / "src" / "epub" / "text" / "body.xhtml", "w", encoding="utf-8") as file:
                file.write(output)
        except OSError as ex:
            raise se.InvalidFileException(f"Couldn’t write to ebook directory. Exception: {ex}")
        except Exception as ex:
            # Save this error for later, because it's still useful to complete the create-draft process
            # even if we've failed to parse PG's HTML source.
            is_pg_html_parsed = False
            se.quiet_remove(repo_path / "src" / "epub" / "text" / "body.xhtml")

    # Copy over templates
    _copy_template_file("gitignore", repo_path / ".gitignore")
    _copy_template_file("LICENSE.md", repo_path)
    _copy_template_file("container.xml", repo_path / "src" / "META-INF")
    _copy_template_file("mimetype", repo_path / "src")
    _copy_template_file("content.opf", repo_path / "src" / "epub")
    _copy_template_file("onix.xml", repo_path / "src" / "epub")
    _copy_template_file("toc.xhtml", repo_path / "src" / "epub")
    _copy_template_file("core.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("local.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("se.css", repo_path / "src" / "epub" / "css")
    _copy_template_file("logo.svg", repo_path / "src" / "epub" / "images")
    _copy_template_file("colophon.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("imprint.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("uncopyright.xhtml", repo_path / "src" / "epub" / "text")
    _copy_template_file("titlepage.svg", repo_path / "images")
    _copy_template_file("cover.jpg", repo_path / "images" / "cover.jpg")
    _copy_template_file("cover.svg", repo_path / "images" / "cover.svg")

    # Try to find Wikipedia links if possible
    ebook_wiki_url = None
    if not args.offline and title != "Short Fiction":
        # There's a "Short Fiction" Wikipedia article, so make an exception for that case
        ebook_wiki_url, _ = _get_wikipedia_url(title, False)

    # Pre-fill titlepage.xhtml
    _replace_in_file(repo_path / "src" / "epub" / "text" / "titlepage.xhtml", "TITLE_STRING", title_string)

    # Create the titlepage SVG
    contributors = {}
    if args.translator:
        contributors["translated by"] = _generate_contributor_string(translators, False)

    if args.illustrator:
        contributors["illustrated by"] = _generate_contributor_string(illustrators, False)

    with open(repo_path / "images" / "titlepage.svg", "w", encoding="utf-8") as file:
        file.write(_generate_titlepage_svg(title, [author["name"] for author in authors], contributors, title_string))

    # Create the cover SVG
    with open(repo_path / "images" / "cover.svg", "w", encoding="utf-8") as file:
        file.write(_generate_cover_svg(title, [author["name"] for author in authors], title_string))

    # Build the cover/titlepage for distribution
    epub = SeEpub(repo_path)
    epub.generate_cover_svg()
    epub.generate_titlepage_svg()

    if args.pg_url:
        _replace_in_file(repo_path / "src" / "epub" / "text" / "imprint.xhtml", "PG_URL", args.pg_url)

    # Fill out the colophon
    with open(repo_path / "src" / "epub" / "text" / "colophon.xhtml", "r+", encoding="utf-8") as file:
        colophon_xhtml = file.read()

        colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
        colophon_xhtml = colophon_xhtml.replace("TITLE", title)

        contributor_string = _generate_contributor_string(authors, True)

        if contributor_string == "":
            colophon_xhtml = colophon_xhtml.replace(" by<br/>\n\t\t\t<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>", contributor_string)
        else:
            colophon_xhtml = colophon_xhtml.replace("<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>", contributor_string)

        if translators:
            translator_block = f"It was translated from ORIGINAL_LANGUAGE in TRANSLATION_YEAR by<br/>\n\t\t\t{_generate_contributor_string(translators, True)}.</p>"
            colophon_xhtml = colophon_xhtml.replace("</p>\n\t\t\t<p>This ebook was produced for the<br/>", f"<br/>\n\t\t\t{translator_block}\n\t\t\t<p>This ebook was produced for the<br/>")

        if args.pg_url:
            colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

            if pg_publication_year:
                colophon_xhtml = colophon_xhtml.replace("PG_YEAR", pg_publication_year)

            if pg_producers:
                producers_xhtml = ""
                for i, producer in enumerate(pg_producers):
                    if "Distributed Proofread" in producer:
                        producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
                    elif "anonymous" in producer.lower():
                        producers_xhtml = producers_xhtml + "<b class=\"name\">An Anonymous Volunteer</b>"
                    else:
                        producers_xhtml = producers_xhtml + f"<b class=\"name\">{_add_name_abbr(producer).strip('.')}</b>"

                    if i < len(pg_producers) - 1:
                        producers_xhtml = producers_xhtml + ", "

                    if i == len(pg_producers) - 2:
                        producers_xhtml = producers_xhtml + "and "

                producers_xhtml = producers_xhtml + "<br/>"

                colophon_xhtml = colophon_xhtml.replace("<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>", producers_xhtml)

        file.seek(0)
        file.write(colophon_xhtml)
        file.truncate()

    # Fill out the metadata file
    with open(repo_path / "src" / "epub" / "content.opf", "r+", encoding="utf-8") as file:
        metadata_xml = file.read()

        metadata_xml = metadata_xml.replace("SE_IDENTIFIER", identifier)
        metadata_xml = metadata_xml.replace(">TITLE_SORT<", f">{sorted_title}<")
        metadata_xml = metadata_xml.replace(">TITLE<", f">{title}<")
        metadata_xml = metadata_xml.replace("VCS_IDENTIFIER", str(repo_name))

        if pg_producers:
            producers_xhtml = ""
            i = 1
            for producer in pg_producers:
                if "Distributed Proofread" in producer:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">The Online Distributed Proofreading Team</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{i}\">https://pgdp.net</meta>\n"
                elif "anonymous" in producer.lower():
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">An Anonymous Volunteer</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Anonymous Volunteer, An</meta>\n"
                else:
                    producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">{producer.strip('.')}</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">TRANSCRIBER_SORT</meta>\n"

                producers_xhtml = producers_xhtml + f"\t\t<meta property=\"role\" refines=\"#transcriber-{i}\" scheme=\"marc:relators\">trc</meta>\n"

                i = i + 1

            metadata_xml = regex.sub(r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>", "\t\t" + producers_xhtml.strip(), metadata_xml, flags=regex.DOTALL)

        if ebook_wiki_url:
            metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<", f">{ebook_wiki_url}<")

        authors_xml = _generate_metadata_contributor_xml(authors, "author")
        authors_xml = authors_xml.replace("dc:contributor", "dc:creator")
        metadata_xml = regex.sub(r"<dc:creator id=\"author\">AUTHOR</dc:creator>.+?scheme=\"marc:relators\">aut</meta>", authors_xml, metadata_xml, flags=regex.DOTALL)

        if translators:
            translators_xml = _generate_metadata_contributor_xml(translators, "translator")
            metadata_xml = regex.sub(r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>", translators_xml, metadata_xml, flags=regex.DOTALL)
        else:
            metadata_xml = regex.sub(r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>\n\t\t", "", metadata_xml, flags=regex.DOTALL)

        if illustrators:
            illustrators_xml = _generate_metadata_contributor_xml(illustrators, "illustrator")
            metadata_xml = regex.sub(r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>", illustrators_xml, metadata_xml, flags=regex.DOTALL)
        else:
            metadata_xml = regex.sub(r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>\n\t\t", "", metadata_xml, flags=regex.DOTALL)

        if args.pg_url:
            if pg_subjects:
                subject_xhtml = ""

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<dc:subject id=\"subject-{i}\">{subject}</dc:subject>\n"
                    i = i + 1

                i = 1
                for subject in pg_subjects:
                    subject_xhtml = subject_xhtml + f"\t\t<meta property=\"authority\" refines=\"#subject-{i}\">LCSH</meta>\n"

                    # Now, get the LCSH ID by querying LCSH directly.
                    try:
                        response = requests.get(f"https://id.loc.gov/search/?q=cs:http://id.loc.gov/authorities/subjects&q=\"{urllib.parse.quote(subject)}\"")
                        result = regex.search(fr"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{regex.escape(subject.replace(' -- ', '--'))}</a>", response.text)

                        loc_id = "Unknown"
                        try:
                            loc_id = result.group(1)
                        except Exception as ex:
                            pass

                        subject_xhtml = subject_xhtml + f"\t\t<meta property=\"term\" refines=\"#subject-{i}\">{loc_id}</meta>\n"
                    except Exception as ex:
                        raise se.RemoteCommandErrorException(f"Couldn’t connect to [url][link=https://id.loc.gov]https://id.loc.gov[/][/]. Exception: {ex}")

                    i = i + 1

                metadata_xml = regex.sub(r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>", "\t\t" + subject_xhtml.strip(), metadata_xml)

            metadata_xml = metadata_xml.replace("<dc:language>LANG</dc:language>", f"<dc:language>{pg_language}</dc:language>")
            metadata_xml = metadata_xml.replace("<dc:source>PG_URL</dc:source>", f"<dc:source>{args.pg_url}</dc:source>")

        file.seek(0)
        file.write(metadata_xml)
        file.truncate()

    # Set up local git repo
    repo = git.Repo.init(repo_path)

    if args.email:
        with repo.config_writer() as config:
            config.set_value("user", "email", args.email)

    if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
        raise se.InvalidXhtmlException("Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook.")
"--dry", action="store_true", help="Dry run (do not save output)") p.add_argument("-t", "--throttle", action="store_true", help="Throttle requests") args = p.parse_args() # override print function to only print when verbose is specified printv = partial(print_verbose, args.verbose) URL = "https://thuisarts.nl/overzicht/onderwerpen" printv("[1/5] Getting thuisarts onderwerpen...") page = requests.get(URL).text tree = etree.parse(StringIO(page), etree.HTMLParser()) links = tree.xpath('//ul[@class="subject-list"]/li/a') # build result list results = [{ "ID": i, "title": link.text, "link": f'https://thuisarts.nl/{link.get("href")}', } for i, link in enumerate(links)] printv(f"[2/5] Dumping surface level results to ./thuisarts.yaml...") if not args.dry: with open("thuisarts.yaml", "w") as f: yaml.dump(results, f, allow_unicode=True) # scrape the links for each entry in the results list printv(f"[3/5] Scraping individual pages...")
def main():
    """Called from console script"""
    op = _createOptionParser(usage=usage)
    op.add_option("-x", "--xsl", metavar="transform.xsl",
                  help="XSL transform", dest="xsl", default=None)
    op.add_option("--path", metavar="PATH",
                  help="URI path", dest="path", default=None)
    op.add_option("--parameters", metavar="param1=val1,param2=val2",
                  help="Set the values of arbitrary parameters",
                  dest="parameters", default=None)
    op.add_option("--runtrace-xml", metavar="runtrace.xml",
                  help="Write an xml format runtrace to file",
                  dest="runtrace_xml", default=None)
    op.add_option("--runtrace-html", metavar="runtrace.html",
                  help="Write an html format runtrace to file",
                  dest="runtrace_html", default=None)
    (options, args) = op.parse_args()

    if len(args) > 2:
        op.error("Wrong number of arguments.")
    elif len(args) == 2:
        if options.xsl or options.rules:
            op.error("Wrong number of arguments.")
        path, content = args
        if path.lower().endswith('.xsl'):
            options.xsl = path
        else:
            options.rules = path
    elif len(args) == 1:
        content, = args
    else:
        op.error("Wrong number of arguments.")
    if options.rules is None and options.xsl is None:
        op.error("Must supply either rules or an XSL transform.")

    if options.trace:
        logger.setLevel(logging.DEBUG)

    runtrace = False
    if options.runtrace_xml or options.runtrace_html:
        runtrace = True

    parser = etree.HTMLParser()
    parser.resolvers.add(RunResolver(os.path.dirname(content)))

    if options.xsl is not None:
        output_xslt = etree.parse(options.xsl)
    else:
        xsl_params = None
        if options.xsl_params:
            xsl_params = split_params(options.xsl_params)
        output_xslt = compile_theme(
            rules=options.rules,
            theme=options.theme,
            extra=options.extra,
            parser=parser,
            read_network=options.read_network,
            absolute_prefix=options.absolute_prefix,
            includemode=options.includemode,
            indent=options.pretty_print,
            xsl_params=xsl_params,
            runtrace=runtrace,
        )

    if content == '-':
        content = sys.stdin

    if options.read_network:
        access_control = AC_READ_NET
    else:
        access_control = AC_READ_FILE
    transform = etree.XSLT(output_xslt, access_control=access_control)
    content_doc = etree.parse(content, parser=parser)

    params = {}
    if options.path is not None:
        params['path'] = "'%s'" % options.path

    if options.parameters:
        for key, value in split_params(options.parameters).items():
            params[key] = quote_param(value)

    output_html = transform(content_doc, **params)
    if isinstance(options.output, basestring):
        out = open(options.output, 'wt')
    else:
        out = options.output
    out.write(str(output_html))

    if runtrace:
        runtrace_doc = diazo.runtrace.generate_runtrace(
            rules=options.rules,
            error_log=transform.error_log)
        if options.runtrace_xml:
            if options.runtrace_xml == '-':
                out = sys.stdout
            else:
                out = open(options.runtrace_xml, 'wt')
            runtrace_doc.write(out, encoding='utf-8',
                               pretty_print=options.pretty_print)
        if options.runtrace_html:
            if options.runtrace_html == '-':
                out = sys.stdout
            else:
                out = open(options.runtrace_html, 'wt')
            out.write(str(diazo.runtrace.runtrace_to_html(runtrace_doc)))

    for msg in transform.error_log:
        if not msg.message.startswith('<runtrace '):
            logger.warn(msg)
def getStudio(htmlcode):
    # Get the studio
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[3]/a[1]/text()')).strip(" ['']")
    return result
def canBeMultiple(self, weekList, showID):
    url = 'http://www.rte.ie/player/ie/show/' + showID
    showIDs = []
    try:
        parser = etree.HTMLParser(encoding='utf-8')
        tree = etree.parse(url, parser)
        for shows in tree.xpath('//div[@class="more-videos-pane"]//article[@class="thumbnail-module"]//a[@class="thumbnail-programme-link"]/@href'):
            show_split = shows.rsplit('/', 2)
            show = str(show_split[1])
            showIDs.append(show)
    except (Exception) as exception:
        print('canBeMultiple: getShows: Error getting show numbers: ', exception)
        showIDs.append(showID)

    # If zero we only have 1 show in this category
    if len(showIDs) == 0:
        showIDs.append(showID)

    short = ''
    name = ''
    date1 = ''
    stream = ''
    channel = ''
    icon = ''
    duration = ''

    for show in showIDs:
        newUrl = 'http://feeds.rasset.ie/rteavgen/player/playlist?showId=' + show
        try:
            # Parse the XML with lxml
            tree = etree.parse(newUrl)
            # Find the first element <entry>
            for elem in tree.xpath('//*[local-name() = "entry"]'):
                # Iterate through the children of <entry>
                try:
                    stream = str(elem[0].text)
                except (Exception) as exception:
                    print("canBeMultiple: stream parse error: ", exception)
                    stream = ''
                try:
                    name_tmp = str(elem[3].text)
                except (Exception) as exception:
                    print("canBeMultiple: name_tmp parse error: ", exception)
                    name_tmp = ''
                try:
                    short_tmp = str(elem[4].text)
                except (Exception) as exception:
                    print("canBeMultiple: short_tmp parse error: ", exception)
                    short_tmp = ''
                try:
                    channel = str(elem[5].attrib.get('term'))
                except (Exception) as exception:
                    print("canBeMultiple: channel parse error: ", exception)
                    channel = ''
                try:
                    millisecs = int(elem[15].attrib.get('ms'))
                except (Exception) as exception:
                    print("canBeMultiple: millisecs parse error: ", exception)
                    millisecs = 0
                try:
                    lastDate = datetime.fromtimestamp(
                        mktime(strptime(str(elem[1].text), "%Y-%m-%dT%H:%M:%S+00:00")))  # 2012-12-31T12:54:29+00:00
                    date_tmp = lastDate.strftime(u"%a %b %d %Y %H:%M")
                    date1 = _("Added: ") + str(date_tmp)
                except (Exception) as exception:
                    lastDate = datetime.fromtimestamp(
                        mktime(strptime(str(elem[1].text), "%Y-%m-%dT%H:%M:%S+01:00")))  # 2012-12-31T12:54:29+01:00
                    date_tmp = lastDate.strftime(u"%a %b %d %Y %H:%M")
                    date1 = _("Added: ") + str(date_tmp)
                    print("canBeMultiple: date1 parse error: ", exception)

                name = checkUnicode(name_tmp)
                short = checkUnicode(short_tmp)

                # Calculate the stream duration
                duration = _("Duration: ") + str(calcDuration(millisecs))

                # Only set the icon if icons are enabled
                if self.showIcon == 'True':
                    try:
                        icon_url = str(elem[22].attrib.get('url'))
                        icon = icon_url[0:-7] + "-261.jpg"
                    except (Exception) as exception:
                        print("canBeMultiple: icon parse error: ", exception)
                        icon = ''
                else:
                    icon = ''

                weekList.append((date1, name, short, channel, stream, icon, duration, False))
        except (Exception) as exception:
            print("canBeMultiple: Problem parsing data: ", exception)
def getTag(htmlcode):
    # Get the number
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[4]/a/text()'))
    return result.strip(" ['']").replace("'", '').replace(' ', '')
def process_message(self, message_json: str) -> bool:
    self.logger.debug(f"processing message {message_json}")

    # parse the JSON SQS message
    add_article_msg = AddArticleMessage.from_json(message_json)

    try:
        # fetch the content from the URL in the message
        resp = requests.get(add_article_msg.url)
    except Exception:
        self.logger.exception(f"failed to fetch article at url {add_article_msg.url}")
        return False

    self.logger.debug("simplifying content")
    readable_content = Document(resp.text)
    parser = etree.HTMLParser()
    content_dom = etree.fromstring(readable_content.summary(), parser)

    # create an Article model
    article = Article(add_article_msg.user_id)
    article.url = resp.url

    # extract the title from the content
    self.logger.debug("extracting article title")
    article.title = readable_content.title()

    # extract the images from the content
    self.logger.debug("fetching related content")
    for image in content_dom.iter("img"):
        img_url = image.get("src")
        try:
            # fetch the image by the URL
            self.logger.debug(f"fetching related image at {img_url}")
            img_resp = requests.get(img_url)
            img_key = f"{article.user_id}/articles/{article.article_id}/related/{Fetcher.get_filename_from_url(img_resp.url)}"
        except Exception:
            self.logger.exception(f"failed to fetch related image at url {img_url}")
            continue

        # save the image to S3 (the image response, not the article response)
        self.logger.debug(f"writing image {img_url} to S3 with key {img_key}")
        if not self.file_repository.put(img_key, BytesIO(img_resp.content)):
            continue

        # create RelatedContent models for each image and add to the Article
        article.related_content.append(RelatedContent(img_resp.headers["Content-Type"], img_key))

        # re-write the content HTML to point to the new image URL
        self.logger.debug(f"re-writing img element with new URL {img_key}")
        image.set("src", img_key)

    # write the content to S3
    content_key = f"{article.user_id}/articles/{article.article_id}/content.html"
    self.logger.debug(f"writing content to S3 with key {content_key}")
    if not self.file_repository.put(
            content_key,
            BytesIO(etree.tostring(content_dom.getroottree(), pretty_print=True, method="html"))):
        return False

    # update the Article with the content key
    article.content_key = content_key

    # write the Article to Dynamo
    self.logger.debug(f"writing article to dynamo with keys user_id {article.user_id} article_id {article.article_id}")
    if not self.article_repository.put(article):
        return False

    # send a completed message to SQS
    self.logger.debug("writing completed message to SQS")
    if not self.finished_queue_producer.send_message(
            ArticleFetchCompleteMessage(article.user_id, article.article_id).to_json()):
        return False

    return True
async def main():
    # done, pending = await asyncio.wait(futures, timeout=5)
    with open('100k.csv', 'w', newline='', encoding='utf-8') as csvfile:
        hp = etree.HTMLParser(encoding='utf-8')
        writer = csv.writer(csvfile)
        writer.writerow([
            'nro_documento', 'nombres', 'apellidos', 'fecha_nacim', 'sexo',
            'tipo_aseg', 'beneficiarios_activos', 'enrolado',
            'vencimiento_de_fe_de_vida', 'nro_titular', 'titular',
            'estado_titular', 'meses_de_aporte_titular',
            'vencimiento_titular', 'ultimo_periodo_abonado_titular'
        ])
        start_time = time.time()
        async with ClientSession() as session:
            sem = asyncio.Semaphore(100)
            futures = [
                asyncio.ensure_future(fetch_data(sem, param, session))
                for param in param_generator
            ]
            for i, future in enumerate(asyncio.as_completed(futures)):
                # print(future.result())
                try:
                    t, ced, result_html = await future
                    root = html.fromstring(result_html, parser=hp)
                    nro_documento = root.xpath(u"/html/body/center[2]/form/table[2]/tr[2]/td[2]")[0].text.strip()
                    nombres = root.xpath(u"/html/body/center[2]/form/table[2]/tr[2]/td[3]")[0].text.strip()
                    apellidos = root.xpath(u"/html/body/center[2]/form/table[2]/tr[2]/td[4]")[0].text.strip()
                    fecha_nacim = root.xpath(u"/html/body/center[2]/form/table[2]/tr[2]/td[5]")[0].text.strip()
                    sexo = root.xpath(u"/html/body/center[2]/form/table[2]/tr[2]/td[6]")[0].text.strip()
                    tipo_aseg = root.xpath(u"/html/body/center[2]/form/table[2]/tr[2]/td[7]")[0].text.strip()
                    beneficiarios_activos = root.xpath(u"/html/body/center[2]/form/table[2]/tr[2]/td[8]")[0].text.strip()
                    enrolado = root.xpath(u"/html/body/center[2]/form/table[2]/tr[2]/td[9]")[0].text.strip()
                    vencimiento_de_fe_de_vida = root.xpath(u"/html/body/center[2]/form/table[2]/tr[2]/td[10]")[0].text.strip()
                    nro_titular = root.xpath(u"/html/body/center[2]/form/table[3]/tr[2]/td[1]")[0].text.strip()
                    titular = root.xpath(u"/html/body/center[2]/form/table[3]/tr[2]/td[2]")[0].text.strip()
                    estado_titular = root.xpath(u"/html/body/center[2]/form/table[3]/tr[2]/td[3]")[0].text.strip()
                    meses_de_aporte_titular = root.xpath(u"/html/body/center[2]/form/table[3]/tr[2]/td[4]")[0].text.strip()
                    vencimiento_titular = root.xpath(u"/html/body/center[2]/form/table[3]/tr[2]/td[5]")[0].text.strip()
                    ultimo_periodo_abonado_titular = root.xpath(u"/html/body/center[2]/form/table[3]/tr[2]/td[6]")[0].text.strip()
                    print('{}, {}, {} returned in {:.2f} seconds'.format(nro_documento, nombres, apellidos, t))
                    writer.writerow([
                        nro_documento, nombres, apellidos, fecha_nacim, sexo,
                        tipo_aseg, beneficiarios_activos, enrolado,
                        vencimiento_de_fe_de_vida, nro_titular, titular,
                        estado_titular, meses_de_aporte_titular,
                        vencimiento_titular, ultimo_periodo_abonado_titular
                    ])
                except Exception as e:
                    print("Cedula %s does not exist" % (ced,))
                    continue
    t_total = time.time() - start_time
    nr_of_requests = ((stop + 1) - start)
    print("Process took: {:.2f} seconds".format(t_total))
    print("{} requests per second".format(nr_of_requests / t_total))
from lxml import etree

html = etree.parse('./test.html', etree.HTMLParser())  # open a local file
result = html.xpath('//li/a/@href')

# ################# or ####################

parser = etree.HTML(html_text)  # html_text is an HTML string here, not a file
result = parser.xpath('//li/a/text()')

# contains()
result = parser.xpath('//li[contains(@class, "test")]/a/@value')
result = parser.xpath('//li[contains(@class, "li") and @name="item"]/a/text()')

# position
result = parser.xpath('//li[1]/a/text()')
result = parser.xpath('//li[last()]/a/text()')
result = parser.xpath('//li[last()-2]/a/text()')  # the third from last
result = parser.xpath('//li[position()<3]/a/text()')

# axes (ancestor / attribute / child / sibling relations)
result = parser.xpath('//li[1]/ancestor::*')
result = parser.xpath('//li[1]/ancestor::div')
result = parser.xpath('//li[1]/attribute::*')
result = parser.xpath('//li[1]/child::a[@href="link1.html"]')
result = parser.xpath('//li[1]/descendant::span')
result = parser.xpath('//li[1]/following::*[2]')
result = parser.xpath('//li[1]/following-sibling::*')
def html2dom(htmlstr):
    parser = etree.HTMLParser(remove_blank_text=True,
                              remove_comments=True,
                              remove_pis=True)
    domtree = etree.fromstring(htmlstr, parser)
    return etree.ElementTree(domtree)
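A usage sketch for html2dom (input invented); the three parser flags drop whitespace-only text nodes, comments, and processing instructions before the tree is built:

doc = html2dom("<html><body> <!-- note --> <p>kept</p> </body></html>")
print(etree.tostring(doc.getroot()))
# roughly: b'<html><body><p>kept</p></body></html>'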
print char_type['language'] in language
if not (char_type['language'] in language):
    print char_type['language']
    print char_type['language'] in language
if (char_type["encoding"] == 'GB2312'):
    try:
        html = html.decode('gbk').encode('utf-8')
    except UnicodeDecodeError, e:
        print "Encoding problem, page skipped"
else:
    html = unicode(html, char_type["encoding"]).encode("utf-8")
pagecontent = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
# Parse the page
p = pagecontent.xpath('//body//div//p//text()')
print type(p)
print p
# filecontent = ''
for i in range(len(p)):
    print p[i]
    print type(p[i])
    file.write(p[i])
    # filecontent = filecontent + p[i].xpath('string()')
# Strip spaces
# filestringcontent = ''
# for line in filecontent.splitlines():
#     if not line.split():
#         continue
def parms_page(html):
    root = etree.HTML(html, parser=etree.HTMLParser(encoding="utf-8"))
    html_parms = re.findall('', root)
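As written, the findall above cannot run: re.findall needs a pattern and a string, but root is an lxml Element, and the original pattern has been lost here. A sketch of the presumable intent, with a placeholder pattern that is ours, not the original's:

import re
from lxml import etree

def parms_page_sketch(html):
    # Hypothetical reconstruction: serialize the tree back to text,
    # then regex over it; the pattern below is only a placeholder.
    root = etree.HTML(html, parser=etree.HTMLParser(encoding="utf-8"))
    text = etree.tostring(root, encoding="unicode")
    return re.findall(r'data-parm="([^"]+)"', text)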
from lxml import etree

html = etree.parse('./text.html', etree.HTMLParser())

result = html.xpath('//li[@class="item-0"]/text()')
print(result)

result = html.xpath('//li[@class="item-0"]/a/text()')
print(result)

result = html.xpath('//li[@class="item-0"]//text()')
print(result)
def transform(self, pretty_print=True):
    """change the self.html and return it with CSS turned into style
    attributes.
    """
    if etree is None:
        return self.html

    parser = etree.HTMLParser()
    stripped = self.html.strip()
    tree = etree.fromstring(stripped, parser).getroottree()
    page = tree.getroot()
    # lxml inserts a doctype if none exists, so only include it in
    # the root if it was in the original html.
    root = tree if stripped.startswith(tree.docinfo.doctype) else page

    if page is None:
        print repr(self.html)
        raise PremailerError("Could not parse the html")
    assert page is not None

    ##
    ## style selectors
    ##

    rules = []

    for index, style in enumerate(CSSSelector('style')(page)):
        # If we have a media attribute whose value is anything other than
        # 'screen', ignore the ruleset.
        media = style.attrib.get('media')
        if media and media != 'screen':
            continue

        these_rules, these_leftover = self._parse_style_rules(style.text, index)
        rules.extend(these_rules)

        parent_of_style = style.getparent()
        if these_leftover:
            style.text = '\n'.join(['%s {%s}' % (k, v) for (k, v) in these_leftover])
        elif not self.keep_style_tags:
            parent_of_style.remove(style)

    if self.external_styles:
        for stylefile in self.external_styles:
            if stylefile.startswith('http://'):
                css_body = urllib.urlopen(stylefile).read()
            elif os.path.exists(stylefile):
                try:
                    f = codecs.open(stylefile)
                    css_body = f.read()
                finally:
                    f.close()
            else:
                raise ValueError(u"Could not find external style: %s" % stylefile)
            these_rules, these_leftover = self._parse_style_rules(css_body, -1)
            rules.extend(these_rules)

    # rules is a list of (specificity, selector, styles) tuples, where
    # specificity is a tuple ordered such that more specific rules sort larger.
    rules.sort(key=operator.itemgetter(0))

    first_time = []
    first_time_styles = []
    for __, selector, style in rules:
        new_selector = selector
        class_ = ''
        if ':' in selector:
            new_selector, class_ = re.split(':', selector, 1)
            class_ = ':%s' % class_
        # Keep filter-type selectors untouched.
        if class_ in FILTER_PSEUDOSELECTORS:
            class_ = ''
        else:
            selector = new_selector

        sel = CSSSelector(selector)
        for item in sel(page):
            old_style = item.attrib.get('style', '')
            if not item in first_time:
                new_style = merge_styles(old_style, style, class_)
                first_time.append(item)
                first_time_styles.append((item, old_style))
            else:
                new_style = merge_styles(old_style, style, class_)
            item.attrib['style'] = new_style
            self._style_to_basic_html_attributes(item, new_style, force=True)

    # Re-apply initial inline styles.
    for item, inline_style in first_time_styles:
        old_style = item.attrib.get('style', '')
        if not inline_style:
            continue
        new_style = merge_styles(old_style, inline_style, class_)
        item.attrib['style'] = new_style
        self._style_to_basic_html_attributes(item, new_style, force=True)

    if self.remove_classes:
        # now we can delete all 'class' attributes
        for item in page.xpath('//@class'):
            parent = item.getparent()
            del parent.attrib['class']

    ##
    ## URLs
    ##

    if self.base_url:
        for attr in ('href', 'src'):
            for item in page.xpath("//@%s" % attr):
                parent = item.getparent()
                if attr == 'href' and self.preserve_internal_links \
                        and parent.attrib[attr].startswith('#'):
                    continue
                if not self.base_url.endswith('/'):
                    self.base_url += '/'
                parent.attrib[attr] = urlparse.urljoin(self.base_url, parent.attrib[attr].strip('/'))

    out = etree.tostring(root, method="html", pretty_print=pretty_print)
    if self.strip_important:
        out = _importants.sub('', out)
    return out
def displayHtmlEntry(self, entry, author, nick, url):
    prepend = '''\
<div class="status__prepend">
  <span>
    <a href="%s" class="status__display-name">
      <strong>%s</strong>
    </a>
    shared
  </span>
</div>
'''
    status = '''\
<div class="status">
  <div class="status__header">
    <a class="status__relative-time" href="%s">
      <time class="time-ago" datetime="%s">%s</time>
    </a>
    <a class="status__display-name" href="%s">
      <span class="display-name">
        <strong>%s</strong>
        <span>@%s</span>
      </span>
    </a>
  </div>
  <div class="status__content">%s</div>
</div>
'''
    id = entry.xpath('atom:id/text()',
                     namespaces={"atom": "http://www.w3.org/2005/Atom"})[0]
    updated = entry.xpath('atom:updated/text()',
                          namespaces={"atom": "http://www.w3.org/2005/Atom"})[0]
    verb = entry.xpath('activity:verb/text()',
                       namespaces={"activity": "http://activitystrea.ms/spec/1.0/"})[0]
    if verb == 'http://activitystrea.ms/schema/1.0/share':
        print(prepend % (id, author))
        author = entry.xpath('activity:object/atom:author/poco:displayName/text()',
                             namespaces={"activity": "http://activitystrea.ms/spec/1.0/",
                                         "atom": "http://www.w3.org/2005/Atom",
                                         "poco": "http://portablecontacts.net/spec/1.0"})[0]
        nick = entry.xpath('activity:object/atom:author/atom:name/text()',
                           namespaces={"activity": "http://activitystrea.ms/spec/1.0/",
                                       "atom": "http://www.w3.org/2005/Atom"})[0]
        url = entry.xpath('activity:object/atom:author/atom:id/text()',
                          namespaces={"activity": "http://activitystrea.ms/spec/1.0/",
                                      "atom": "http://www.w3.org/2005/Atom"})[0]
    content = entry.xpath('atom:content/text()',
                          namespaces={"atom": "http://www.w3.org/2005/Atom"})[0]
    parser = etree.HTMLParser()
    tree = etree.fromstring(content, parser)
    content = etree.tostring(tree, encoding='unicode', method='html')
    print(status % (id, updated, updated, url, author, nick, content))
import os
import os.path as op
import re
from time import sleep
from urllib.parse import urlsplit, parse_qs

import requests_cache
from lxml import etree
from parse import parse

from validators import validate_raw_files, check_products_detection
from create_csvs import create_csvs
from ers import all_keywords_usa as keywords, fpath_namer, mh_brands, clean_url, shop_inventory_lw_csv
from ers import COLLECTION_DATE, file_hash, img_path_namer, TEST_PAGES_FOLDER_PATH
from ers import clean_xpathd_text
from matcher import BrandMatcher
from custom_browser import CustomDriver

parser = etree.HTMLParser(encoding='utf-8')

# Init variables and assets
shop_id = 'sip_whisky'
root_url = 'https://sipwhiskey.com'
requests_cache.install_cache(fpath_namer(shop_id, 'requests_cache'))
country = 'USA'
searches, categories, products = {}, {}, {}
driver = CustomDriver(headless=True)
brm = BrandMatcher()
</div>
'''

html = etree.HTML(text)        # the HTML class initializes from the text and builds an XPath-ready parse object
result = etree.tostring(html)  # tostring() outputs the corrected HTML code
print(result.decode('utf-8'))

You can also read a local text file and parse it directly:

from lxml import etree

html = etree.parse('./test.html', etree.HTMLParser())
result = etree.tostring(html)
print(result.decode('utf-8'))

text = '''
<div>
  <ul>
    <li class="item-0"><a href="link1.html">first item</a></li>
    <li class="item-1"><a href="link2.html">second item</a></li>
    <li class="item-inactive"><a href="link1.html">third item</a></li>
    <li class="item-1"><a href="link4.html">fourth item</a></li>
    <li class="item-0"><a href="link5.html">fifth item</a>
  </ul>
</div>
def transform(self, pretty_print=True, **kwargs):
    """change the self.html and return it with CSS turned into style
    attributes.
    """
    if etree is None:
        return self.html

    if self.method == 'xml':
        parser = etree.XMLParser(ns_clean=False, resolve_entities=False)
    else:
        parser = etree.HTMLParser()
    stripped = self.html.strip()
    tree = etree.fromstring(stripped, parser).getroottree()
    page = tree.getroot()
    # lxml inserts a doctype if none exists, so only include it in
    # the root if it was in the original html.
    root = tree if stripped.startswith(tree.docinfo.doctype) else page

    if page is None:
        print repr(self.html)
        raise ValueError("Could not parse the html")
    assert page is not None

    ## style tags
    for element in CSSSelector('style,link[rel~=stylesheet]')(page):
        # If we have a media attribute whose value is anything other than
        # 'screen', ignore the ruleset.
        media = element.attrib.get('media')
        if media and media != 'screen':
            continue

        is_style = element.tag == 'style'
        if is_style:
            css_body = element.text
        else:
            href = element.attrib.get('href')
            if not href:
                continue
            css_body = self._load_external(href)

        self._parse_style_rules(css_body)

        parent_of_element = element.getparent()
        if not self.keep_style_tags or not is_style:
            parent_of_element.remove(element)

    ## explicitly defined external style file
    if self.external_styles:
        for stylefile in self.external_styles:
            css_body = self._load_external(stylefile)
            self._parse_style_rules(css_body)

    ## styles from element selectors, runs before class selectors
    for elem in page.xpath('//*'):
        if elem.tag in self.rules:
            old_style = elem.attrib.get('style', '')
            new_style = self.rules[elem.tag]
            if old_style:
                # replace any old property values with the new property value
                old_cleaned_style = ''
                for property in old_style.split(';'):
                    if len(property.split(':')) != 2:
                        continue
                    else:
                        property_name, property_val = property.split(':')
                        if new_style.find(property_name) < 0:
                            old_cleaned_style += property + ';'
                new_style = '; '.join([old_cleaned_style, new_style])
            elem.attrib['style'] = new_style

    ## styles from class selectors
    for tag_classes in page.xpath('//@class'):
        tag = tag_classes.getparent()
        tag_classes = ['.' + c.strip() for c in tag_classes.split(' ') if c.strip()]
        for tag_class in tag_classes:
            if tag_class in self.rules:
                old_style = tag.attrib.get('style', '')
                new_style = self.rules[tag_class]
                if old_style:
                    # replace any old property values with the new property value
                    old_cleaned_style = ''
                    for property in old_style.split(';'):
                        if len(property.split(':')) != 2:
                            continue
                        else:
                            property_name, property_val = property.split(':')
                            if new_style.find(property_name) < 0:
                                old_cleaned_style += property + ';'
                    new_style = '; '.join([old_cleaned_style, new_style])
                tag.attrib['style'] = new_style

    if self.remove_classes:
        # now we can delete all 'class' attributes
        for item in page.xpath('//@class'):
            parent = item.getparent()
            del parent.attrib['class']

    kwargs.setdefault('method', self.method)
    kwargs.setdefault('pretty_print', pretty_print)
    out = etree.tostring(root, **kwargs)
    if self.method == 'xml':
        out = _cdata_regex.sub(lambda m: '/*<![CDATA[*/%s/*]]>*/' % m.group(1), out)
    if self.strip_important:
        out = _importants.sub('', out)
    return out