Python HTMLParserの例、lxml.etree.HTMLParser Pythonの例

コード例 #1

0

ファイルを表示

ファイル: xml_edit.py プロジェクト: ATikhonov2/leo-editor

def xml2leo(event, from_string=None):
    """handle import of an .xml file, places new subtree after c.p
    """
    c = event['c']
    p = c.p

    if from_string:
        parser_func = etree.fromstring
        file_name = from_string
    else:
        parser_func = etree.parse
        cd_here(c, p)
        file_name = g.app.gui.runOpenFileDialog(c,
                                                title="Open",
                                                filetypes=table,
                                                defaultextension=".xml")

        if not file_name:
            raise Exception("No file selected")

    try:
        xml_ = parser_func(file_name)
    except etree.XMLSyntaxError:
        xml_ = parser_func(file_name, parser=etree.HTMLParser())
    except Exception:
        g.es("Failed to read '%s'" % file_name)
        raise

    if from_string:
        # etree.fromstring and etree.parse return Element and
        # ElementTree respectively
        xml_ = etree.ElementTree(xml_)

    nd = p.insertAfter()
    nd.h = os.path.basename(file_name)

    # the root Element isn't necessarily the first thing in the XML file
    # move to the beginning of the list to capture preceding comments
    # and processing instructions
    toplevel = xml_.getroot()
    while toplevel.getprevious() is not None:
        toplevel = toplevel.getprevious()

    # move through list, covering root Element and any  comments
    # or processing instructions which follow it
    while toplevel is not None:
        append_element(toplevel, nd)
        toplevel = toplevel.getnext()

    nd.b = '<?xml version="%s"?>\n' % (xml_.docinfo.xml_version or '1.0')
    if xml_.docinfo.encoding:
        nd.b = '<?xml version="%s" encoding="%s"?>\n' % (
            xml_.docinfo.xml_version or '1.0', xml_.docinfo.encoding)
    if NSMAP:
        for k in sorted(NSMAP):
            if k:
                nd.b += "%s: %s\n" % (k, NSMAP[k])
            else:
                nd.b += "%s\n" % NSMAP[k]
    nd.b += xml_.docinfo.doctype + '\n'

    c.redraw()

    return nd

コード例 #2

0

ファイルを表示

ファイル: mail.py プロジェクト: yemanadep/Flectra

def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext

    html = ustr(html)

    if not html:
        return ''

    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            link.text = '%s [%s]' % (link.text, i)
            url_index.append(url)

    html = ustr(etree.tostring(tree, encoding=encoding))
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub('<br\s*/?>', '\n', html)
    html = re.sub('<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    html = html.replace('&gt;', '>')
    html = html.replace('&lt;', '<')
    html = html.replace('&amp;', '&')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i + 1, url)

    return html

コード例 #3

0

ファイルを表示

from lxml import etree

parser = etree.HTMLParser()
tree = etree.parse("app.html", parser)

name_xpath_1 = '/html/body/div[1]/div[7]/div[4]/div[1]/div[2]/div[2]/div[2]/div/div[3]/text()'
name_xpath_2 = '/html/body/div[1]/div[7]/div[4]/div[1]/div[2]/div[1]/div[2]/div/div[3]/text()'

name_1 = tree.xpath(name_xpath_1)
name_2 = tree.xpath(name_xpath_2)

print(name_1)
print(type(name_1))
print(name_2)
print(type(name_2))

コード例 #4

0

ファイルを表示

ファイル: crawler_frame.py プロジェクト: bdwalker93/Spacetime-Crawler

def extract_next_links(rawDatas):
    global most_outlinks, visited_subdomains
    outputLinks = list()

    for urlResponse in rawDatas:
        outlinks = []

        # The URL base path
        basePath = urlResponse.url

        hostName = urlparse(basePath).hostname
        if hostName not in visited_subdomains:
            visited_subdomains[hostName] = set()

        # The content of the page
        content = urlResponse.content

        # Stops us from trying parse pages with no content or an error
        if not urlResponse.error_message or content:

            # Debug
            if DEBUG_VERY_VERBOSE:
                print "Error Message: ", urlResponse.error_message
                print "Headers: ", urlResponse.headers
                print "Is Redirected: ", urlResponse.is_redirected
                print "Final URL: ", urlResponse.final_url
                print "Content: ", urlResponse.content, "-\n"
                print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"

            try:
                # Loading the DOM with etree
                parser = etree.HTMLParser(recover=True)
                pageDom = etree.parse(StringIO.StringIO(content), parser)

                # Checks for the presence of a base tag
                if pageDom.xpath('//base/@href'):
                    basePath = pageDom.xpath('//base/@href')[0]

                # Extracting all of the links
                for linkPath in pageDom.xpath('//a/@href'):

                    # absolutePath = urljoin(basePath, relativePath)
                    absoluteUrl = urljoin(basePath, linkPath)

                    # Adding link to list
                    outlinks.append(absoluteUrl)
                    visited_subdomains[hostName].add(absoluteUrl)

                #If outlinks is currently empty then assign it new tuple
                if most_outlinks[0] == "None":
                    most_outlinks = (basePath, len(outlinks))
                #If the current tuples outlinks count is lower to current then replace
                elif most_outlinks[1] < len(outlinks):
                    most_outlinks = (basePath, len(outlinks))

                outputLinks += outlinks

            except AssertionError as err:
                # Setting this as a bad link
                urlResponse.bad_url = True

                # might want to set that built in bad within the url object here???
                if DEBUG:
                    print err.message
        else:
            # Setting this as a bad link
            urlResponse.bad_url = True
            if DEBUG:
                print "No content or an error code exists"
    # Debug
    if DEBUG_VERBOSE:
        print "List of found link: ", outputLinks

    return outputLinks

コード例 #5

0

ファイルを表示

ファイル: baiduscrapybody_formattext2.py プロジェクト: Suncicie/DataExtrator

def parsehtml(file,urlbiglist):
    #开始处理url
    for i in range(len(urlbiglist)):
        for j in range(len(urlbiglist[i])):
            # print urlbiglist[i][j]
            for k in range(3):
                try:
                   request = urllib2.Request(url = urlbiglist[i][j],headers=headers)
                   html = urllib2.urlopen(request).read()
                   print "连接成功，跳出循环"
                   break

                except urllib2.HTTPError,e:
                    print "有问题，再连一遍"
                    continue
            #编码问题解决
            char_type = chardet.detect(html)
            print (char_type)
            #再对编码进行一次判断  只要中文字符的网页 若超出范围直接跳出
            language = ['Chinese','']
            print char_type['language']
            print char_type['language'] in language

            if not ( char_type['language'] in language):
                print char_type['language']
                print char_type['language'] in language
                continue
            if(char_type["encoding"]=='GB2312'):
                try:
                     html = html.decode('gbk').encode('utf-8')
                except UnicodeDecodeError,e:
                    print "编码有些问题，已跳过"
                    continue
            else:
                html = unicode(html, char_type["encoding"]).encode("utf-8")
            pagecontent = etree.HTML(html,parser=etree.HTMLParser(encoding='utf-8'))

            #因为每个界面的网页结构不同  所以要查找多种形式

            # 第一种找法  搞一个大的字符串（其中包括空格和空行）
            filecontent = ''
            p1 = pagecontent.xpath('//div[@class="main-content"]')
            print  type(p1)
            print p1
            print ("第一次找")
            for i in range(len(p1)):
                filecontent = filecontent + p1[i].xpath('string()')
            # 去空格  去空行
            filestringcontent = ''
            file.write('\n这是一篇:\n')
            for line in filecontent.splitlines():
                if not line.split():
                    continue
                line = line.strip()#去空格    也是去掉了换行符
                filestringcontent += line
            file.write(filestringcontent)

            # 每一个网页抓完后的标志

            if(len(p1)):
                continue
            print "第二次找"
            p2 = pagecontent.xpath('//body//div//p//text()')

            print p2
            for l in range(len(p2)):
                print '打印每一页的内容====='
                print p2[l]
                print type(p2[l])
                file.write(p2[l])

コード例 #6

0

ファイルを表示

ファイル: test6_match_attr.py プロジェクト: FaneWang/spider_book

from lxml import etree

html = etree.parse("./test.html", etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]')
print(result)

コード例 #7

0

ファイルを表示

def parse_lagoufile():
    parser = etree.HTMLParser(encoding='utf-8')
    htmlElement = etree.parse('lagou.html', parser=parser)
    print(etree.tostring(htmlElement, encoding='utf-8').decode('utf-8'))

コード例 #8

0

ファイルを表示