Example #1
    def _set_data_(self, data, *args, **kwargs):
        if isinstance(data, QtGui.QTextDocument):
            self._docViewer_.setDocument(data)

        elif isinstance(data, str):
            from html.parser import HTMLParser

            parser = HTMLParser(convert_charrefs=True)

            parser.feed(data)

            parser.close()

            if parser.get_starttag_text() is None:
                self._docViewer_.document().setPlainText(data)

            else:
                self._docViewer_.document().setHtml(data)

            if data.find("<?xml version=") >= 0:
                self._highlighter_ = xmlutils.XmlSyntaxHighlighter(
                    self._docViewer_.document())
            else:
                self._highlighter_ = None

        else:
            raise TypeError(
                "Expecting a QTextDdocument or a str; got %s instead" %
                type(data).__name__)

        if kwargs.get("show", True):
            self.activateWindow()
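A note on the detection trick above: get_starttag_text() returns the raw text of the most recently parsed start tag, so a None result after feeding the whole string means no tag was ever seen. A minimal standalone sketch of that heuristic (the function name looks_like_html is ours, not from the example):

from html.parser import HTMLParser

def looks_like_html(text):
    # get_starttag_text() stays None when no start tag was parsed
    parser = HTMLParser(convert_charrefs=True)
    parser.feed(text)
    parser.close()
    return parser.get_starttag_text() is not None

print(looks_like_html("plain text"))         # False
print(looks_like_html("<p>plain text</p>"))  # True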
Example #2
 def _strip_tags(self, html):
     result = []
     parser = HTMLParser()
     parser.handle_data = result.append
     parser.feed(html)
     parser.close()
     return ''.join(result)
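Assigning parser.handle_data = result.append, as above and in many examples below, monkey-patches a bare parser instance. The documented route is a subclass; a minimal sketch of the same strip-tags behavior:

from html.parser import HTMLParser

class TextExtractor(HTMLParser):
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.chunks = []

    def handle_data(self, data):
        # called only for character data, never for tags
        self.chunks.append(data)

def strip_tags(html):
    extractor = TextExtractor()
    extractor.feed(html)
    extractor.close()
    return ''.join(extractor.chunks)

print(strip_tags('<font color=red>hello</font>'))  # hello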
Example #3
 def feed(self, data):
     # Escape "<!" unless it opens a DOCTYPE, a comment ("<!--") or a marked section ("<![")
     data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
     # Rewrite XHTML-style self-closing tags such as <br/> through _shorttag_replace
     data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data)
     # Normalize numeric character references for apostrophe and double quote
     data = data.replace('&#39;', "'")
     data = data.replace('&#34;', '"')
     HTMLParser.feed(self, data)
     HTMLParser.close(self)
Example #5
 def parse_links(self):
     f = open(self.file,'r')
     data = f.read()
     f.close()
     parser = HTMLParser(formatter.AbstractFormatter(formatter.DumbWriter(io.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Example #6
 def parse_links(self):
     f = open(self.file, 'r')
     data = f.read()
     f.close()
     parse = HTMLParser()
     parse.feed(data)
     parse.close()
     return parse.anchorlist
Example #7
 def strip_tags(self, htmlStr):
     htmlStr = htmlStr.strip()
     htmlStr = htmlStr.strip("\n")
     result = []
     parser = HTMLParser()
     parser.handle_data = result.append
     parser.feed(htmlStr)
     parser.close()
     return ''.join(result)
Example #8
 def parse_links(self):
     'Parse out the links found in downloaded HTML file'
     f = open(self.file, 'r')
     data = f.read()
     f.close()
     parser = HTMLParser(AbstractFormatter(DumbWriter(io.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.anchorlist
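Examples #5, #6 and #8 target the Python 2 htmllib.HTMLParser, which took a formatter argument and exposed an anchorlist attribute; html.parser.HTMLParser accepts neither, anchorlist does not exist there, and the formatter module itself was removed in Python 3.10. A rough Python 3 equivalent that keeps the parse_links contract collects href values in handle_starttag (a sketch, not a drop-in port):

from html.parser import HTMLParser

class LinkCollector(HTMLParser):
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.anchorlist = []   # mirrors the old htmllib attribute

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            href = dict(attrs).get('href')
            if href is not None:
                self.anchorlist.append(href)

def parse_links(path):
    with open(path, 'r', encoding='utf-8') as f:
        parser = LinkCollector()
        parser.feed(f.read())
        parser.close()
    return parser.anchorlist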
Example #9
def strip_tags(html):
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
Example #10
 def test_generate_body_with_dummy_data_html(self):
     """Check to make sure that the last tag is an html tag"""
     test_email_data = [{'Author': 'Test Author', 'Journal': 'Test Journal', 'PubDate': datetime.datetime.now().date(), 'Title': 'Test Title', 'Link': 'https://www.altmetric.com/details/101571224'}]
     test_email_address = '*****@*****.**'
     test_body = api_parser.generate_body(test_email_data, 30, test_email_address)
     parser = HTMLParser()
     parser.feed(test_body)
     test_output = parser.get_starttag_text()
     parser.close()
     self.assertEqual(test_output, '<a href="mailto:[email protected]">')
Example #11
def strip_tags(html):
    from html.parser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
Example #12
def urlparser(url):
    import urllib.request, formatter, sys, html
    from html.parser import HTMLParser
    with urllib.request.urlopen(url) as response:
        data = str(response.read())
        response.close()
        format = formatter.AbstractFormatter(formatter.DumbWriter(
            sys.stdout))
        ptext = HTMLParser(format)
        ptext.feed(data)
        ptext.close()
Example #13
def strip_tags(html):
    '''
    Removes html tags from a string
    '''
    from html.parser import HTMLParser
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
Example #14
def strip_tags(html):
    """
    Strip HTML tags from a string.
    """
    html = html.strip()
    parser = HTMLParser()
    result = []
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)  # join the collected fragments into a single string
Example #15
 def parse_links(self):
     'Parse out the links found in downloaded HTML file'
     f = open(self.file, 'r')
     data = f.read()
     # print(data)
     f.close()
     # pa = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
     pa = HTMLParser()
     pa.feed(data)
     pa.close()
     # rawdata is HTMLParser's internal buffer of not-yet-parsed input;
     # it is normally empty once close() has run
     return pa.rawdata
Example #16
 def parse_links(self):  # parse the links out of the freshly downloaded page
     f = codecs.open(self.file, 'rb', 'utf-8')  # read the downloaded page
     data = f.read()  #.decode("utf-8")
     f.close()
     parser = HTMLParser(
         formatter.AbstractFormatter(  # AbstractFormatter handles the parsed data
             formatter.DumbWriter(io.StringIO()))
     )  # DumbWriter emits the output; StringIO keeps it off stdout (a file would be better)
     parser.feed(data)
     parser.close()
     return parser.anchorlist  # return the list of parsed anchors
Example #17
def strip_tags(html):
    if html:
        html = html.strip()
        html = html.strip("\n")
        result = []
        parse = HTMLParser()
        parse.handle_data = result.append
        parse.feed(html)
        parse.close()
        return "".join(result)
    return ''
Example #18
    def close(self):
        HTMLParser.close(self)

        self.pbr()
        self.o('', 0, 'end')

        self.outtext = self.outtext.join(self.outtextlist)

        if options.google_doc:
            self.outtext = self.outtext.replace('&nbsp_place_holder;', ' ')

        return self.outtext
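Example #18 (and most of the close() overrides that follow) shows the standard finalization pattern: chain up to HTMLParser.close() first, so any buffered input is flushed through the handlers, then assemble and return the accumulated result. A bare-bones sketch of the pattern:

from html.parser import HTMLParser

class TextAccumulator(HTMLParser):
    def __init__(self):
        super().__init__()
        self.parts = []

    def handle_data(self, data):
        self.parts.append(data)

    def close(self):
        # flush remaining buffered input through the handlers first
        HTMLParser.close(self)
        # then finalize and hand back the result
        return ''.join(self.parts)

acc = TextAccumulator()
acc.feed('<p>hello ')
acc.feed('world</p>')
print(acc.close())   # hello world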
Example #19
    def close(self):
        """
        Tell the parser the feed has ended and clean up the rightmost
        whitespaces.
        """
        HTMLParser.close(self)
        string = ''.join(self._string)

        if string and HtmlExtractor.RSTRIP_REGEX.search(string):
            # clean up those rightmost whitespaces:
            self._string = [string.rstrip()]
            self._shortenTags(self.position)
Example #21
    def __get_content_detail_other(self, detail_url):

        chrome_options = Options()
        # run Chrome in headless mode
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument("disable-extensions")
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument(
            'user-agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"'
        )
        chrome_options.add_argument(
            'Accept-Language: "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7"')
        chrome_options.add_argument('Host: "www.91porn.com"')
        chrome_options.add_argument(
            'Referer: "http://www.91porn.com/v.php?category=top&viewtype=basic"'
        )
        chrome_options.add_argument('Cache-Control: "max-age=0"')
        chrome_options.add_argument(
            'Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"'
        )
        chrome_options.add_argument('Upgrade-Insecure-Requests: "1"')
        # disable image loading
        prefs = {'profile.default_content_setting_values': {'images': 2}}
        chrome_options.add_experimental_option('prefs', prefs)
        # chrome_options.add_extension(r'D:\PyWorkSpace\learn\4.10.0_0adblock.crx')
        # chromedriver path
        path = r'D:\PyWorkSpace\learn\chromedriver.exe'

        # selenium + headless Chrome works around the JS lazy-loaded video URL
        browser = webdriver.Chrome(executable_path=path,
                                   options=chrome_options)
        # cap load times so browser.get() does not block until everything finishes loading
        browser.set_page_load_timeout(10)
        browser.set_script_timeout(10)

        try:
            browser.get(detail_url)
        except Exception:
            print("page load is too slow; stopping the load and moving on")
            browser.execute_script("window.stop()")

        page_source = browser.page_source

        browser.quit()
        p = HTMLParser()
        p.feed(page_source)
        # un-escape HTML entities such as &amp;
        # (HTMLParser.unescape was deprecated in 3.4 and removed in Python 3.9)
        page_source = p.unescape(page_source)
        print(page_source)
        p.close()
        return page_source
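The p.unescape() call in Example #21 relies on an API that was deprecated in Python 3.4 and removed in 3.9; the module-level html.unescape() is the supported replacement and needs no parser instance at all:

import html

page_source = html.unescape(page_source)  # expands &amp;, &#39;, etc.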
Example #22
def html_strip(html):
    html = html.replace("#", '')
    html = html.replace(">", '')
    html = html.replace("-", '')
    html = html.replace("*", '')
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
Example #23
def strip_tags(html):
    # Strip HTML tags from a string.
    # >>> str_text = strip_tags("<font color=red>hello</font>")
    # >>> print(str_text)
    # hello

    from html.parser import HTMLParser
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
Example #24
    def strip_tags(self, htmlStr):
        '''
        Strip html tags using HTMLParser
        :param htmlStr:
        '''

        self.htmlStr = htmlStr
        htmlStr = htmlStr.strip()
        htmlStr = htmlStr.strip("\n")
        result = []
        parser = HTMLParser()
        parser.handle_data = result.append
        parser.feed(htmlStr)
        parser.close()
        return ''.join(result)
Example #25
 def close(self, *args, **kwargs):
     self.handle_data(None)
     HTMLParser.close(self, *args, **kwargs)
     if self.stack:
         raise Exception("%s unclosed stack: %s" %
                         (self.name, repr(self.stack)))
     if self.tree:
         raise Exception("%s unclosed tags: %s" %
                         (self.name, repr(self.tree_names)))
     self.buffers["_r"] += "return _c.$t;"
     self.buffers["_r"] += "};"
     buffer_r = self.buffers["_r"]
     del self.buffers["_r"]
     final_buffer = "".join(self.buffers.values())
     final_buffer += buffer_r
     return final_buffer
Example #26
class properties():
    def __init__(self, mls_link):
        self.mls_link = mls_link
        # self.listing_div = None
        self._collect_listings()

    def _collect_listings(self):
        response = requests.get(self.mls_link)
        html = response.text
        html_soup = BeautifulSoup(html, features='html.parser')
        self.listing_div = html_soup.find_all(
            'div', attrs={'class': 'j-resultsPageAsyncDisplays'})

    def convert_to_HTML(self):
        # note: a bare HTMLParser has no-op handlers, so this feed() only
        # tokenizes the markup without producing any output
        self.p = HTMLParser()
        self.p.feed(str(self.listing_div))
        self.p.close()
Example #27
 def close(self):
     HTMLParser.close(self)
     if self.__core['status'] == 0:
         errored = False
         for data in self.__core['detected_list']:
             if not data['complete']:
                 if data['tag'] in self.__SINGLE_TAGS:
                     data['complete'] = True
                     continue
                 self.__core['status'] = -1
                 self.__core['detail'] = 'Construction Error'
                 errored = True
                 break
         if not errored:
             self.__core['status'] = 1
             self.__core['detail'] = 'ok'
     return self.__core
Example #28
    def close(self):
        """This should be called at the end of the file. If in parser mode,
        it saves the corpus text and the annotations to files.
        """
        while self.tagstack:
            t, a, _ = self.tagstack[0]
            if t not in self.autoclose:
                util.log.error(self.pos() + "(at EOF) Autoclosing tag </%s>, starting at %s", t, a)
                self.errors = True
            else:
                util.log.info(self.pos() + "(at EOF) Autoclosing tag </%s>, starting at %s", t, a)
            self.handle_endtag(t)
        self.anchor()

        if self.skipped:
            new_elements = sorted(list(self.skipped.items()), key=lambda x: (-x[1], x[0]))
            new_elements_ann = " ".join(".".join([x[0][0].replace(":", "_"), x[0][1]]) if not x[0][1] is None else x[0][0].replace(":", "_") for x in new_elements)
            new_elements_ele = " ".join(":".join([x[0][0].replace(":", "\\:"), x[0][1]]) if not x[0][1] is None else x[0][0].replace(":", "\\:") for x in new_elements)
            if not self.elem_annotations:
                util.log.info("Found elements:")
                print()
                print("vrt_structs_annotations = " + new_elements_ann)
                print("vrt_structs             = " + new_elements_ele)
                print("xml_elements    = " + new_elements_ele)
                print("xml_annotations = " + new_elements_ann)
                print()
            else:
                print()
                print("xml_skip = " + new_elements_ele)
                print()

        # Only save results if no errors occurred
        if not self.errors:
            text = u"".join(self.textbuffer)
            util.write_corpus_text(self.textfile, text, self.pos2anchor)
            if self.elem_order:
                for elem in self.elem_order:
                    annot, db = elem[1], self.dbs[elem[1]]
                    util.write_annotation(annot, db)
            else:
                for annot, db in list(self.dbs.items()):
                    util.write_annotation(annot, db)
            for header, db in list(self.header_dbs.items()):
                util.write_annotation(header, db)

        HTMLParser.close(self)
Example #29
    def strip_tags_parser(self, html):
        """
        Strip HTML tags from the text, using HTMLParser.
        Usage example:
        str_text=strip_tags("<font color=red>hello</font>")

        :return: String
        """
        from html.parser import HTMLParser
        html = html.strip('\n')
        html = html.strip('\t')
        html = html.strip(' ')
        html = html.strip()

        result = []
        parser = HTMLParser()
        parser.handle_data = result.append
        parser.feed(html)
        parser.close()
        return '$'.join(result)
Example #30
    def close(self):
        HTMLParser.close(self)
        if self.state != "finished":
            raise IOError("incorrectly-nested tables; state={}".format(
                self.state))

        st = self.store

        def make(i):
            return [(s[0], s[i]) for s in st if s[i] is not None]

        self.releases = {
            "CALDB": make(1),
            "SDP": make(2),
            "CIAO": make(3),
            "L3": make(4)
        }

        if len(self.releases["CALDB"]) == 0:
            raise IOError("No CALDB release information found!")
Example #31
 def _qtree(node: Node, parser: HTMLParser, level: int = 0) -> str:
     parser.reset()
     parser.feed(node.label)
     parser.close()
     lbl = parser.tex if parser.valid else node.label
     if node.value:
         parser.reset()
         parser.feed(node.value)
         parser.close()
         val = parser.tex if parser.valid else node.value
     else:
         val = None
     leaf = not node.children and level > 0
     res = "  " * level + f"{'[.' if not leaf else ''}{lbl}"
     if val:
         res += f"\\\\{val}"
     res += "\n"
     for c in node.children:
         res += _qtree(c, parser, level + 1)
     if not leaf:
         res += "  " * level + "]\n"
     return res
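Example #31 recycles a single parser across many nodes by calling reset() before each feed(); reset() reinitializes the parser's internal state (the tex and valid attributes belong to the caller's custom subclass, not to html.parser.HTMLParser). A compact sketch of the reuse pattern; remember to chain up when overriding reset():

from html.parser import HTMLParser

class DataGrabber(HTMLParser):
    def reset(self):
        super().reset()   # rebuilds the base parser's internal state
        self.text = ''    # our own state, cleared on every reset

    def handle_data(self, data):
        self.text += data

grabber = DataGrabber()
for fragment in ('<b>one</b>', '<i>two</i>'):
    grabber.reset()
    grabber.feed(fragment)
    grabber.close()
    print(grabber.text)   # prints "one", then "two"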
Example #32
class Retriever(object):
    """ 网络爬虫 """
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.html'):  # local storage path
        parsedurl = urlparse(url, 'http:', 0)
        print('parsedurl', parsedurl)
        path = 'D:/' + parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)  # local directory
        if sep != '/':
            ldir = ldir.replace("/", sep)
        if not isdir(ldir):
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):  # download the page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError as e:
            retval = '*** ERROR: invalid URL "%s"' % self.url
        return retval

    def parseAndGetLinks(self):  # parse the HTML document and save the links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
Example #33
 def close(self):
     HTMLParser.close(self)
     return self.__builder.close()
Example #34
 def close(self):
     HTMLParser.close(self)
Example #35
def _check_valid_html(text):
    p = HTMLParser()
    p.feed(text)
    p.close()
Example #36
 def close(self):
     HTMLParser.close(self)
     return self.liCheckTime
Example #37
 def close(self):
     HTMLParser.close(self)
     return self.attr_values
Example #38
def submit(request):
    ret = {}
    has_html = False

    if request.POST:
        d = request.POST.dict()
        name = d.get("name", None)
        secret_key = d.get("secret_key", None)

        if name and is_user_data_valid(name):
            parser = HTMLParser()
            parser.feed(name)
            parser.close()
            if parser.get_starttag_text():
                ret["flag"] = FLAGS["scoreboard_hacking"][0]
                has_html = True

        if name and secret_key and not has_html:
            if not is_user_data_valid(name, data_type=DataType.SHORT_NAME):
                ret["error"] = "Too much data"
            elif not is_user_data_valid(secret_key, data_type=DataType.PASSWORD):
                ret["error"] = "Too much data"
            elif get_leader(name.strip().lower()):
                # If they also provided the correct secret key, update that entry in the database
                secret_key = hashlib.sha512(secret_key.encode('utf-8')).hexdigest()
                leader = get_leader(name.strip().lower(), secret_key)
                if leader:
                    session = get_unauth_session(request)
                    if not session.lifetime_hacker_bucks:
                        ret["error"] = "What makes you think you belong on the leaderboard?"
                    else:
                        # Update the leader with the new info
                        # Create a set of claimed flags and combine the loaded leader with the current session
                        leader_claimed_flags = json.loads(leader.claimed_flags)
                        claimed_flags = list(set(leader_claimed_flags + session.claimed_flags))

                        leader.lifetime_hacker_bucks = calc_lifetime_hacker_bucks_from_claimed_flags(claimed_flags)
                        leader.num_flags_found = len(claimed_flags)
                        leader.claimed_flags = json.dumps(claimed_flags)

                        # This will overwrite their hacker bucks. Only an issue if they didn't load first
                        leader.hacker_bucks = session.hacker_bucks
                        leader.remote_ip = session.remote_ip
                        leader.percent_complete = int((leader.num_flags_found) / len(FLAGS) * 100)
                        leader.playtime = str(timezone.now() - leader.session_creation_time)

                        leader_purchased_challenges = json.loads(leader.purchased_challenges)
                        for challenge_id, challenge in session.challenges.items():
                            if challenge.purchased:
                                leader_purchased_challenges.append(challenge_id)
                        leader_purchased_challenges = list(set(leader_purchased_challenges))
                        leader.purchased_challenges = json.dumps(leader_purchased_challenges)

                        # Update the changes
                        leader.save()
                else:
                    ret["error"] = "Already a leader with that name. To update, provide the correct password."
            else:
                name = name.strip()
                session = get_unauth_session(request)
                if not session.lifetime_hacker_bucks:
                    ret["error"] = "What makes you think you belong on the leaderboard?"
                else:
                    leader = LeaderboardEntry()
                    leader.lifetime_hacker_bucks = session.lifetime_hacker_bucks
                    leader.num_flags_found = len(session.claimed_flags)
                    leader.claimed_flags = json.dumps(session.claimed_flags)
                    leader.hacker_bucks = session.hacker_bucks
                    leader.percent_complete = int((leader.num_flags_found / len(FLAGS)) * 100)
                    leader.name = name.lower()
                    leader.display_name = name
                    leader.remote_ip = session.remote_ip
                    leader.session_creation_time = session.creation_time
                    leader.secret_key = hashlib.sha512(secret_key.encode('utf-8')).hexdigest()
                    leader.playtime = str(timezone.now() - session.creation_time)

                    # Get the list of purchased challenge IDs
                    purchased_challenges = []
                    for challenge_id, challenge in session.challenges.items():
                        if challenge.purchased:
                            purchased_challenges.append(challenge_id)
                    leader.purchased_challenges = json.dumps(purchased_challenges)

                    leader.save()
        elif not has_html:
            ret["error"] = "No name/secret key provided for leaderboard entry"

    return HttpResponse(json.dumps(ret))
Example #39
 def close(self):
     HTMLParser.close(self)  # python 2
     return self.data
Example #40
 def run_check(self, test):
     p = HTMLParser()
     p.feed(str(test.resultBody))
     p.close()
     return True
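A caveat for Example #40 and for _check_valid_html in Example #35: Python 3's html.parser is deliberately tolerant (the old strict mode and HTMLParseError are gone), so feed() essentially never raises on malformed markup, and a validity check built this way accepts almost anything:

from html.parser import HTMLParser

p = HTMLParser()
p.feed('<this is << not >> well-formed <b>')  # no exception is raised
p.close()
print('parsed without error')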
Example #41
 def close(self):
   self._check_result(force=True)
   if py3:
     super().close()
   else:
     HTMLParser.close(self)
Example #42
	def close(self):
		HTMLParser.close(self)
		for t in reversed(self._tags):
			self._target.end(t)
		return self._target.close()
Example #43
 def close(self):
     HTMLParser.close(self)
     return self.tb.close()
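Examples #33, #42 and #43 all hand close() off to a tree-builder target so that closing the parser yields a document tree. A self-contained sketch of that arrangement with xml.etree's TreeBuilder (it works for well-nested input; real-world HTML needs the end-tag fixups Example #42 performs):

from html.parser import HTMLParser
from xml.etree.ElementTree import TreeBuilder

class TreeParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.tb = TreeBuilder()

    def handle_starttag(self, tag, attrs):
        self.tb.start(tag, dict(attrs))

    def handle_endtag(self, tag):
        self.tb.end(tag)

    def handle_data(self, data):
        self.tb.data(data)

    def close(self):
        HTMLParser.close(self)
        return self.tb.close()   # returns the root Element

parser = TreeParser()
parser.feed('<div><p>hi</p></div>')
root = parser.close()
print(root.tag)   # div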