def _set_data_(self, data, *args, **kwargs):
    if isinstance(data, QtGui.QTextDocument):
        self._docViewer_.setDocument(data)
    elif isinstance(data, str):
        from html.parser import HTMLParser
        parser = HTMLParser(convert_charrefs=True)
        parser.feed(data)
        parser.close()
        # get_starttag_text() is None when no start tag was seen, i.e. the
        # string is plain text rather than markup.
        if parser.get_starttag_text() is None:
            self._docViewer_.document().setPlainText(data)
        else:
            self._docViewer_.document().setHtml(data)
            if data.find("<?xml version=") >= 0:
                self._highlighter_ = xmlutils.XmlSyntaxHighlighter(
                    self._docViewer_.document())
            else:
                self._highlighter_ = None
    else:
        raise TypeError("Expecting a QTextDocument or a str; got %s instead"
                        % type(data).__name__)
    if kwargs.get("show", True):
        self.activateWindow()
def _strip_tags(self, html):
    from html.parser import HTMLParser
    result = []
    parser = HTMLParser()
    # Monkey-patch: route every text node straight into the result list.
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
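# A standalone demonstration of the handle_data monkey-patch used above
# (illustrative only, not part of the original snippet). With Python 3's
# default convert_charrefs=True, character references such as &amp; are
# decoded before they reach handle_data.
from html.parser import HTMLParser

def demo_strip_tags(markup):
    chunks = []
    parser = HTMLParser()
    parser.handle_data = chunks.append  # capture every text node
    parser.feed(markup)
    parser.close()
    return ''.join(chunks)

# demo_strip_tags('<b>a &amp; b</b>') == 'a & b'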
def feed(self, data):
    # Escape any "<!" that does not begin a DOCTYPE, comment, or CDATA section.
    data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
    # Expand short tags such as <br/> through the _shorttag_replace helper.
    data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
    data = data.replace('&#39;', "'")
    data = data.replace('&#34;', '"')
    HTMLParser.feed(self, data)
    HTMLParser.close(self)
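# The feed() override above calls self._shorttag_replace without defining it.
# A plausible sketch, modeled on feedparser's helper of the same name; the
# elements_no_end_tag attribute and its contents are assumptions, not taken
# from this source:
def _shorttag_replace(self, match):
    """Expand <tag/> into <tag></tag>, leaving true void elements alone."""
    tag = match.group(1)
    if tag in self.elements_no_end_tag:  # e.g. {'br', 'hr', 'img', 'input', 'meta'}
        return '<' + tag + ' />'
    return '<' + tag + '></' + tag + '>'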
def parse_links(self):
    with open(self.file, 'r') as f:
        data = f.read()
    # NOTE: this constructor call is the Python 2 htmllib API. In Python 3,
    # html.parser.HTMLParser accepts no formatter argument and has no
    # anchorlist attribute, and the formatter module was removed in 3.10.
    parser = HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(io.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
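# A minimal Python 3 port of the parse_links pattern above. The class name is
# illustrative; html.parser offers no built-in anchor collection, so the href
# values are gathered by hand in handle_starttag:
from html.parser import HTMLParser

class AnchorListParser(HTMLParser):
    """Collect the href value of every <a> tag into self.anchorlist."""

    def __init__(self):
        super().__init__()
        self.anchorlist = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            href = dict(attrs).get('href')
            if href is not None:
                self.anchorlist.append(href)

# Usage:
#     parser = AnchorListParser()
#     parser.feed(html_text)
#     parser.close()
#     links = parser.anchorlist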
def parse_links(self):
    with open(self.file, 'r') as f:
        data = f.read()
    parse = HTMLParser()
    # The base class has no anchorlist attribute; collect the href of every
    # <a> tag by monkey-patching handle_starttag (see AnchorListParser above).
    parse.anchorlist = []

    def _collect(tag, attrs):
        if tag == 'a':
            href = dict(attrs).get('href')
            if href is not None:
                parse.anchorlist.append(href)

    parse.handle_starttag = _collect
    parse.feed(data)
    parse.close()
    return parse.anchorlist
def strip_tags(self, htmlStr):
    htmlStr = htmlStr.strip()
    htmlStr = htmlStr.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(htmlStr)
    parser.close()
    return ''.join(result)
def parse_links(self):
    'Parse out the links found in downloaded HTML file'
    with open(self.file, 'r') as f:
        data = f.read()
    # NOTE: Python 2 htmllib-style call; in Python 3, HTMLParser takes no
    # formatter and has no anchorlist (see the AnchorListParser sketch above).
    parser = HTMLParser(AbstractFormatter(DumbWriter(io.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist
def strip_tags(html):
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
def test_generate_body_with_dummy_data_html(self):
    """Check that the last start tag in the generated body is the mailto anchor"""
    test_email_data = [{'Author': 'Test Author',
                        'Journal': 'Test Journal',
                        'PubDate': datetime.datetime.now().date(),
                        'Title': 'Test Title',
                        'Link': 'https://www.altmetric.com/details/101571224'}]
    test_email_address = '*****@*****.**'
    test_body = api_parser.generate_body(test_email_data, 30, test_email_address)
    parser = HTMLParser()
    parser.feed(test_body)
    # get_starttag_text() returns the text of the most recently opened start tag.
    test_output = parser.get_starttag_text()
    parser.close()
    self.assertEqual(test_output, '<a href="mailto:[email protected]">')
def strip_tags(html):
    from html.parser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def urlparser(url):
    import urllib.request
    from html.parser import HTMLParser
    with urllib.request.urlopen(url) as response:
        data = response.read().decode('utf-8', errors='replace')
    # The Python 2 version piped the parse through formatter.AbstractFormatter;
    # both htmllib and the formatter module are gone in Python 3, so dump the
    # text nodes straight to stdout instead.
    ptext = HTMLParser()
    ptext.handle_data = lambda chunk: print(chunk, end='')
    ptext.feed(data)
    ptext.close()
def strip_tags(html):
    ''' Removes html tags from a string '''
    from html.parser import HTMLParser
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def strip_tags(html):
    """Filter HTML tags out of a string."""
    html = html.strip()
    parser = HTMLParser()
    result = []
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def parse_links(self): 'Parse out the links found in downloaded HTML file' f = open(self.file, 'r') data = f.read() # print(data) f.close() # pa = HTMLParser(AbstractFormatter(DumbWriter(StringIO()))) pa = HTMLParser() pa.feed(data) pa.close() return pa.rawdata
def parse_links(self):
    # Parse the links out of the freshly downloaded page.
    f = codecs.open(self.file, 'rb', 'utf-8')  # read the downloaded page
    data = f.read()  # .decode("utf-8")
    f.close()
    # AbstractFormatter parses the data; DumbWriter emits it, with the StringIO
    # sink keeping it off standard output (writing to a file would be better).
    # NOTE: this is the Python 2 htmllib API; see the AnchorListParser sketch
    # above for a Python 3 port.
    parser = HTMLParser(
        formatter.AbstractFormatter(
            formatter.DumbWriter(io.StringIO())))
    parser.feed(data)
    parser.close()
    return parser.anchorlist  # return the parsed list of anchors
def strip_tags(html):
    if html:
        html = html.strip()
        html = html.strip("\n")
        result = []
        parse = HTMLParser()
        parse.handle_data = result.append
        parse.feed(html)
        parse.close()
        return "".join(result)
    return ''
def close(self):
    HTMLParser.close(self)
    self.pbr()
    self.o('', 0, 'end')
    # self.outtext starts out empty, so this joins the accumulated chunks.
    self.outtext = self.outtext.join(self.outtextlist)
    if options.google_doc:
        self.outtext = self.outtext.replace('&nbsp_place_holder;', ' ')
    return self.outtext
def close(self): """ Tell the parser the feed has ended and clean up the rightmost whitespaces. """ HTMLParser.close(self) string = ''.join(self._string) if string and HtmlExtractor.RSTRIP_REGEX.search(string): # clean up those rightmost whitespaces: self._string = [string.rstrip()] self._shortenTags(self.position)
def __get_content_detail_other(self, detail_url):
    chrome_options = Options()
    # Run Chrome headless (no visible browser window).
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument("disable-extensions")
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument(
        'user-agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"'
    )
    chrome_options.add_argument(
        'Accept-Language: "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7"')
    chrome_options.add_argument('Host: "www.91porn.com"')
    chrome_options.add_argument(
        'Referer: "http://www.91porn.com/v.php?category=top&viewtype=basic"'
    )
    chrome_options.add_argument('Cache-Control: "max-age=0"')
    chrome_options.add_argument(
        'Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"'
    )
    chrome_options.add_argument('Upgrade-Insecure-Requests: "1"')
    # Disable image loading.
    prefs = {'profile.default_content_setting_values': {'images': 2}}
    chrome_options.add_experimental_option('prefs', prefs)
    # chrome_options.add_extension(r'D:\PyWorkSpace\learn\4.10.0_0adblock.crx')
    # Driver path.
    path = r'D:\PyWorkSpace\learn\chromedriver.exe'
    # selenium + headless Chrome works around the JS lazy-loading of the video url.
    browser = webdriver.Chrome(executable_path=path, options=chrome_options)
    # Avoid waiting indefinitely for browser.get to finish loading everything.
    browser.set_page_load_timeout(10)
    browser.set_script_timeout(10)
    try:
        browser.get(detail_url)
    except Exception:
        print("Page loads too slowly; stopping the load and moving on")
        browser.execute_script("window.stop()")
    page_source = browser.page_source
    browser.quit()
    # Decode escaped &-entities in the html. HTMLParser.unescape() was
    # deprecated in Python 3.4 and removed in 3.9; html.unescape() replaces it.
    import html
    page_source = html.unescape(page_source)
    print(page_source)
    return page_source
def html_strip(html):
    html = html.replace("#", '')
    html = html.replace(">", '')
    html = html.replace("-", '')
    html = html.replace("*", '')
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
def strip_tags(html):
    # Filter HTML tags out of a string.
    # >>> str_text = strip_tags("<font color=red>hello</font>")
    # >>> print(str_text)
    # hello
    from html.parser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def strip_tags(self, htmlStr):
    '''
    Filter out HTML tags using HTMLParser.
    :param htmlStr: the HTML string to strip
    '''
    self.htmlStr = htmlStr
    htmlStr = htmlStr.strip()
    htmlStr = htmlStr.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(htmlStr)
    parser.close()
    return ''.join(result)
def close(self, *args, **kwargs):
    self.handle_data(None)
    HTMLParser.close(self, *args, **kwargs)
    if self.stack:
        raise Exception("%s unclosed stack: %s" % (self.name, repr(self.stack)))
    if self.tree:
        raise Exception("%s unclosed tags: %s" % (self.name, repr(self.tree_names)))
    self.buffers["_r"] += "return _c.$t;"
    self.buffers["_r"] += "};"
    # Pull the return buffer out so it is emitted after all the others.
    buffer_r = self.buffers["_r"]
    del self.buffers["_r"]
    final_buffer = "".join(self.buffers.values())
    final_buffer += buffer_r
    return final_buffer
class properties():
    def __init__(self, mls_link):
        self.mls_link = mls_link
        # self.listing_div = None
        self._collect_listings()

    def _collect_listings(self):
        response = requests.get(self.mls_link)
        html = response.text
        html_soup = BeautifulSoup(html, features='html.parser')
        self.listing_div = html_soup.find_all(
            'div', attrs={'class': 'j-resultsPageAsyncDisplays'})

    def convert_to_HTML(self):
        self.p = HTMLParser()
        self.p.feed(str(self.listing_div))
        self.p.close()
def close(self):
    HTMLParser.close(self)
    if self.__core['status'] == 0:
        errored = False
        for data in self.__core['detected_list']:
            if not data['complete']:
                # Void elements never get a closing tag; mark them complete.
                if data['tag'] in self.__SINGLE_TAGS:
                    data['complete'] = True
                    continue
                self.__core['status'] = -1
                self.__core['detail'] = 'Construction Error'
                errored = True
                break
        if not errored:
            self.__core['status'] = 1
            self.__core['detail'] = 'ok'
    return self.__core
def close(self): """This should be called at the end of the file. If in parser mode, it saves the corpus text and the annotations to files. """ while self.tagstack: t, a, _ = self.tagstack[0] if t not in self.autoclose: util.log.error(self.pos() + "(at EOF) Autoclosing tag </%s>, starting at %s", t, a) self.errors = True else: util.log.info(self.pos() + "(at EOF) Autoclosing tag </%s>, starting at %s", t, a) self.handle_endtag(t) self.anchor() if self.skipped: new_elements = sorted(list(self.skipped.items()), key=lambda x: (-x[1], x[0])) new_elements_ann = " ".join(".".join([x[0][0].replace(":", "_"), x[0][1]]) if not x[0][1] is None else x[0][0].replace(":", "_") for x in new_elements) new_elements_ele = " ".join(":".join([x[0][0].replace(":", "\\:"), x[0][1]]) if not x[0][1] is None else x[0][0].replace(":", "\\:") for x in new_elements) if not self.elem_annotations: util.log.info("Found elements:") print() print("vrt_structs_annotations = " + new_elements_ann) print("vrt_structs = " + new_elements_ele) print("xml_elements = " + new_elements_ele) print("xml_annotations = " + new_elements_ann) print() else: print() print("xml_skip = " + new_elements_ele) print() # Only save results if no errors occured if not self.errors: text = u"".join(self.textbuffer) util.write_corpus_text(self.textfile, text, self.pos2anchor) if self.elem_order: for elem in self.elem_order: annot, db = elem[1], self.dbs[elem[1]] util.write_annotation(annot, db) else: for annot, db in list(self.dbs.items()): util.write_annotation(annot, db) for header, db in list(self.header_dbs.items()): util.write_annotation(header, db) HTMLParser.close(self)
def strip_tags_parser(self, html):
    """
    Remove HTML tags from text using HTMLParser.
    Example: str_text = strip_tags("<font color=red>hello</font>")
    :return: String
    """
    from html.parser import HTMLParser
    html = html.strip('\n')
    html = html.strip('\t')
    html = html.strip(' ')
    html = html.strip()
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return '$'.join(result)
def close(self):
    HTMLParser.close(self)
    if self.state != "finished":
        raise IOError("incorrectly-nested tables; state={}".format(self.state))
    st = self.store

    def make(i):
        # Pair column 0 of each stored row with column i, skipping blanks.
        return [(s[0], s[i]) for s in st if s[i] is not None]

    self.releases = {
        "CALDB": make(1),
        "SDP": make(2),
        "CIAO": make(3),
        "L3": make(4),
    }
    if len(self.releases["CALDB"]) == 0:
        raise IOError("No CALDB release information found!")
def _qtree(node: Node, parser: HTMLParser, level: int = 0) -> str:
    parser.reset()
    parser.feed(node.label)
    parser.close()
    lbl = parser.tex if parser.valid else node.label
    if node.value:
        parser.reset()
        parser.feed(node.value)
        parser.close()
        val = parser.tex if parser.valid else node.value
    else:
        val = None
    leaf = not node.children and level > 0
    res = " " * level + f"{'[.' if not leaf else ''}{lbl}"
    if val:
        res += f"\\\\{val}"
    res += "\n"
    for c in node.children:
        res += _qtree(c, parser, level + 1)
    if not leaf:
        res += " " * level + "]\n"
    return res
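# _qtree above assumes an HTMLParser subclass exposing .tex (the converted
# label) and .valid (whether the input parsed cleanly); neither attribute
# exists on the stdlib class. A minimal, purely illustrative sketch of such a
# converter, handling only <i> and <b>:
from html.parser import HTMLParser

class TexLabelParser(HTMLParser):
    def reset(self):
        super().reset()
        self.tex = ""
        self.valid = True

    def handle_starttag(self, tag, attrs):
        if tag == 'i':
            self.tex += r'\textit{'
        elif tag == 'b':
            self.tex += r'\textbf{'
        else:
            self.valid = False  # unknown markup: caller falls back to the raw label

    def handle_endtag(self, tag):
        if tag in ('i', 'b'):
            self.tex += '}'

    def handle_data(self, data):
        self.tex += data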
class Retriever(object):
    """Web crawler."""

    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.html'):
        # Build the local storage path for the URL.
        parsedurl = urlparse(url, 'http:', 0)
        print('parsedurl', parsedurl)
        path = 'D:/' + parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)  # local directory
        if sep != '/':
            ldir = ldir.replace("/", sep)
        if not isdir(ldir):
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        # Download the page.
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError as e:
            retval = '*** ERROR: invalid URL "%s"' % self.url
        return retval

    def parseAndGetLinks(self):
        # Parse the HTML document and save its links.
        # NOTE: Python 2 htmllib-style call; in Python 3, HTMLParser takes no
        # formatter and has no anchorlist (see the AnchorListParser sketch above).
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
def close(self):
    HTMLParser.close(self)
    return self.__builder.close()
def close(self):
    HTMLParser.close(self)
def _check_valid_html(text):
    # html.parser is tolerant and does not raise on merely malformed markup,
    # so this is a crash smoke test rather than real validation.
    p = HTMLParser()
    p.feed(text)
    p.close()
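# If actual well-formedness checking is wanted, the parser needs its own
# bookkeeping. A minimal sketch; the class name and the void-element set are
# assumptions for illustration:
from html.parser import HTMLParser

VOID_TAGS = {'area', 'base', 'br', 'hr', 'img', 'input', 'link', 'meta'}

class BalanceChecker(HTMLParser):
    def __init__(self):
        super().__init__()
        self.stack = []
        self.balanced = True

    def handle_starttag(self, tag, attrs):
        if tag not in VOID_TAGS:
            self.stack.append(tag)

    def handle_endtag(self, tag):
        if self.stack and self.stack[-1] == tag:
            self.stack.pop()
        else:
            self.balanced = False  # mismatched or stray end tag

# Usage:
#     checker = BalanceChecker()
#     checker.feed(text)
#     checker.close()
#     ok = checker.balanced and not checker.stack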
def close(self):
    HTMLParser.close(self)
    return self.liCheckTime
def close(self):
    HTMLParser.close(self)
    return self.attr_values
def submit(request):
    ret = {}
    has_html = False
    if request.POST:
        d = request.POST.dict()
        name = d.get("name", None)
        secret_key = d.get("secret_key", None)
        if name and is_user_data_valid(name):
            parser = HTMLParser()
            parser.feed(name)
            parser.close()
            # Any start tag in the name means HTML was injected.
            if parser.get_starttag_text():
                ret["flag"] = FLAGS["scoreboard_hacking"][0]
                has_html = True
        if name and secret_key and not has_html:
            if not is_user_data_valid(name, data_type=DataType.SHORT_NAME):
                ret["error"] = "Too much data"
            elif not is_user_data_valid(secret_key, data_type=DataType.PASSWORD):
                ret["error"] = "Too much data"
            elif get_leader(name.strip().lower()):
                # If they also provided the correct secret key, update that entry in the database
                secret_key = hashlib.sha512(secret_key.encode('utf-8')).hexdigest()
                leader = get_leader(name.strip().lower(), secret_key)
                if leader:
                    session = get_unauth_session(request)
                    if not session.lifetime_hacker_bucks:
                        ret["error"] = "What makes you think you belong on the leaderboard?"
                    else:
                        # Update the leader with the new info
                        # Create a set of claimed flags and combine the loaded leader with the current session
                        leader_claimed_flags = json.loads(leader.claimed_flags)
                        claimed_flags = list(set(leader_claimed_flags + session.claimed_flags))
                        leader.lifetime_hacker_bucks = calc_lifetime_hacker_bucks_from_claimed_flags(claimed_flags)
                        leader.num_flags_found = len(claimed_flags)
                        leader.claimed_flags = json.dumps(claimed_flags)
                        # This will overwrite their hacker bucks. Only an issue if they didn't load first
                        leader.hacker_bucks = session.hacker_bucks
                        leader.remote_ip = session.remote_ip
                        leader.percent_complete = int(leader.num_flags_found / len(FLAGS) * 100)
                        leader.playtime = str(timezone.now() - leader.session_creation_time)
                        leader_purchased_challenges = json.loads(leader.purchased_challenges)
                        for challenge_id, challenge in session.challenges.items():
                            if challenge.purchased:
                                leader_purchased_challenges.append(challenge_id)
                        leader_purchased_challenges = list(set(leader_purchased_challenges))
                        leader.purchased_challenges = json.dumps(leader_purchased_challenges)
                        # Update the changes
                        leader.save()
                else:
                    ret["error"] = "Already a leader with that name. To update, provide the correct password."
            else:
                name = name.strip()
                session = get_unauth_session(request)
                if not session.lifetime_hacker_bucks:
                    ret["error"] = "What makes you think you belong on the leaderboard?"
                else:
                    leader = LeaderboardEntry()
                    leader.lifetime_hacker_bucks = session.lifetime_hacker_bucks
                    leader.num_flags_found = len(session.claimed_flags)
                    leader.claimed_flags = json.dumps(session.claimed_flags)
                    leader.hacker_bucks = session.hacker_bucks
                    leader.percent_complete = int((leader.num_flags_found / len(FLAGS)) * 100)
                    leader.name = name.lower()
                    leader.display_name = name
                    leader.remote_ip = session.remote_ip
                    leader.session_creation_time = session.creation_time
                    leader.secret_key = hashlib.sha512(secret_key.encode('utf-8')).hexdigest()
                    leader.playtime = str(timezone.now() - session.creation_time)
                    # Get the list of purchased challenge IDs
                    purchased_challenges = []
                    for challenge_id, challenge in session.challenges.items():
                        if challenge.purchased:
                            purchased_challenges.append(challenge_id)
                    leader.purchased_challenges = json.dumps(purchased_challenges)
                    leader.save()
        elif not has_html:
            ret["error"] = "No name/secret key provided for leaderboard entry"
    return HttpResponse(json.dumps(ret))
def close(self):
    HTMLParser.close(self)  # python 2
    return self.data
def run_check(self, test):
    # html.parser will not normally raise on malformed markup, so feeding the
    # body is a smoke test rather than strict validation.
    p = HTMLParser()
    p.feed(str(test.resultBody))
    p.close()
    return True
def close(self):
    self._check_result(force=True)
    if py3:
        super().close()
    else:
        HTMLParser.close(self)
def close(self):
    HTMLParser.close(self)
    # Close any still-open tags in reverse order before finalizing the tree.
    for t in reversed(self._tags):
        self._target.end(t)
    return self._target.close()
def close(self):
    HTMLParser.close(self)
    return self.tb.close()