def get_xpath_text(self, dom_item, path):
    try:
        result = dom_item.xpath(path)
        if len(result) == 1:
            result = result[0]
        return etree.tostring(result)
    except Exception as e:
        self.logger.error("Wrong XPath - %s (%s)" % (path, e))
        return None
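For context, a minimal standalone sketch (the sample HTML is assumed, not from the original project) of what this pattern returns: etree.tostring() yields bytes by default and a str only when encoding='unicode' is requested.

from lxml import etree

doc = etree.HTML("<div><p class='a'>hello</p></div>")
nodes = doc.xpath("//p[@class='a']")
if len(nodes) == 1:
    print(etree.tostring(nodes[0]))                      # b'<p class="a">hello</p>'
    print(etree.tostring(nodes[0], encoding='unicode'))  # <p class="a">hello</p>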
Example #2
def scrappURL(link):
    response = requests.get(link)
    treeObj = lxml.html.fromstring(response.text)
    title = treeObj.xpath("//div[@id='mh_lesson_page']")[0].xpath('.//h1')[0]
    content = treeObj.xpath("//div[@id='mh_lesson_page']")[0].xpath('.//p')[0]
    # serialize the elements, then slice off the surrounding <h1>...</h1> / <p>...</p> tags
    title = etree.tostring(title, with_tail=False, encoding='unicode')[4:-5]
    content = etree.tostring(content, with_tail=False, encoding='unicode')[3:-4]
    print(title)
    if not Dashbord.objects.filter(title=title):
        print('%s saved' % title)
        print(link)
        Dashbord.objects.create(title=title, content=content, url=link)
Example #3
	def get_xpath_text(self, dom_item, path):
		'''
		  - Try to get text from the dom model according to the path.
		'''
		try:
			result = dom_item.xpath(path)
			if len(result) == 1:
				result = result[0]
			return etree.tostring(result)
		except Exception as e:
			self.logger.error("Wrong XPath - %s" % path)
			return None
    def get_node_note_by_id(self, node_id=None):
        node = self.get_node_by_id(node_id)

        if node is None:
            raise self.FreeplaneNodeNotExisting
        else:
            richcontent_node = node.find(self.T_RICHCONTENT)
            if richcontent_node is None:
                self.logger.debug('get_node_note_by_id: No richcontent tag under {0}'.format(node_id))
                a = None
            else:
                if self.A_TYPE in richcontent_node.attrib:
                    if richcontent_node.attrib[self.A_TYPE] == self.V_TYPE_NOTE:
                        note_elements = richcontent_node.find('html')
                        a = ETH.tostring(note_elements)
                    else:
                        self.logger.debug(
                            'get_node_note_by_id: richcontent tag under {0} is not of type note'.format(node_id))
                        a = None
                else:
                    self.logger.debug('get_node_note_by_id: richcontent tag exists but no type defined')
                    raise self.FreeplaneRichContentTagNotProperlyDefined

        if a is not None:
            a = a.decode('ascii')
        return a
Example #5
def get_apocopes(list_urls):
    apo_urls = []
    for list_url in list_urls:
        for node in parse(list_url).findall('.//div[@class="mw-category"]//li/a[@href]'):
            apo_urls.append((node.text, 'http://fr.wiktionary.org' + node.attrib['href']))
    
    with codecs.open('wiki.log', 'w', 'utf-8') as log:
        apos = {}
        for short, url in sorted(apo_urls):
            short = short.lower()
            if short not in apos:
                apos[short] = []
            fulls = apos[short]
            for node in parse(url).findall('.//dl/dd'): #/i/a[@href]
                text = etree.tostring(node, encoding = 'unicode', method = "text").lower().replace('\n', '')
                fulls_sub = []
                for match in extractor.findall(text):
                    for full in match:
                        full = cleaner.sub('\\1', full)
                        if not full:
                            continue
                        fulls_sub.append(full)
                log.write(delim.join([short, str(fulls_sub), text]) + newline)
                if not fulls_sub:
                    print(short, '=>', text)
                    continue
                for full in fulls_sub:
                    if full not in fulls:
                        fulls.append(full)
    return apos
Example #6
 def extract(self, selector, host='', with_body_html=False):
     body = selector.xpath('//body')[0]
     for node in iter_node(body):
         node_hash = hash(node)
         density_info = self.calc_text_density(node)
         text_density = density_info['density']
         ti_text = density_info['ti_text']
         text_tag_count = self.count_text_tag(node, tag='p')
         sbdi = self.calc_sbdi(ti_text, density_info['ti'], density_info['lti'])
         images_list = node.xpath('.//img/@src')
         if host:
             images_list = [pad_host_for_images(host, url) for url in images_list]
         node_info = {'ti': density_info['ti'],
                      'lti': density_info['lti'],
                      'tgi': density_info['tgi'],
                      'ltgi': density_info['ltgi'],
                      'node': node,
                      'density': text_density,
                      'text': ti_text,
                      'images': images_list,
                      'text_tag_count': text_tag_count,
                      'sbdi': sbdi}
         if with_body_html:
             body_source_code = unescape(etree.tostring(node).decode())
             node_info['body_html'] = body_source_code
         self.node_info[node_hash] = node_info
     std = self.calc_standard_deviation()
     self.calc_new_score(std)
     result = sorted(self.node_info.items(), key=lambda x: x[1]['score'], reverse=True)
     return result
def all_purpose_template(parse, title):
    lines = parse.xpath('/html//text()')
    content = "".join(lines)
    title = title.encode("gbk", errors="replace").decode("gbk", errors="replace").replace("?", "")
    open("test.html", "w", encoding="utf-8").write(etree.tostring(parse).decode("utf-8"))
    content = content.split(title, 2)[2].split("评论")[0].encode("utf-8",errors="replace").decode("utf-8",errors="replace")
    return content
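A minimal sketch, with assumed sample HTML, of the text-joining idea used above: xpath('/html//text()') returns every text node under the root, which "".join then concatenates.

from lxml import etree

parse = etree.HTML('<html><body><h1>Title</h1><p>Body text</p></body></html>')
print("".join(parse.xpath('/html//text()')))  # TitleBody text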
 def extract(self,
             selector,
             host='',
             body_xpath='',
             with_body_html=False,
             use_visiable_info=False):
     body_xpath = body_xpath or config.get('body', {}).get('xpath', '')
     use_visiable_info = use_visiable_info or config.get(
         'use_visiable_info', False)
     if body_xpath:
         body = selector.xpath(body_xpath)[0]
     else:
         body = selector.xpath('//body')[0]
     for node in iter_node(body):
         if use_visiable_info:
             if not node.attrib.get('is_visiable', True):
                 continue
             coordinate_json = node.attrib.get('coordinate', '{}')
             coordinate = json.loads(coordinate_json)
             if coordinate.get('height', 0) < 150:  # the main-content block should be taller than 150px
                 continue
         node_hash = hash(node)
         density_info = self.calc_text_density(node)
         text_density = density_info['density']
         ti_text = density_info['ti_text']
         text_tag_count = self.count_text_tag(node, tag='p')
         sbdi = self.calc_sbdi(ti_text, density_info['ti'],
                               density_info['lti'])
         images_list = node.xpath('.//img/@src')
         host = host or config.get('host', '')
         if host:
             images_list = [
                 pad_host_for_images(host, url) for url in images_list
             ]
         node_info = {
             'ti': density_info['ti'],
             'lti': density_info['lti'],
             'tgi': density_info['tgi'],
             'ltgi': density_info['ltgi'],
             'node': node,
             'density': text_density,
             'text': ti_text,
             'images': images_list,
             'text_tag_count': text_tag_count,
             'sbdi': sbdi
         }
         if use_visiable_info:
             node_info['is_visiable'] = node.attrib['is_visiable']
             node_info['coordinate'] = node.attrib.get('coordinate', '')
         if with_body_html or config.get('with_body_html', False):
             body_source_code = unescape(
                 etree.tostring(node, encoding='utf-8').decode())
             node_info['body_html'] = body_source_code
         self.node_info[node_hash] = node_info
     self.calc_new_score()
     result = sorted(self.node_info.items(),
                     key=lambda x: x[1]['score'],
                     reverse=True)
     return result
Example #10
 def FROM_HTML(cls, r, c, x):
     '''Gets the HTML source of an HTML element as a string.

     Args:
         x (lxml.html.HtmlElement): The element to get the source of.
     Returns:
         str: HTML source of the element.
     '''
     return etree.tostring(x, encoding='unicode').strip()
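A hedged usage sketch of the same call (the sample markup is assumed):

import lxml.html
from lxml import etree

el = lxml.html.fromstring("<p>Hello <b>world</b></p>")
print(etree.tostring(el, encoding='unicode').strip())  # <p>Hello <b>world</b></p>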
Example #11
 def string(self):
     """
     return string of element
     :return:
     """
     return etree.tostring(self,
                           pretty_print=True,
                           encoding="utf-8",
                           method='html').decode('utf-8')
 def guess_htmlcontent(self, content_xpath):
     htmlcontent = ''
     try:
         htmlcontent = self.element.xpath(content_xpath)
     except Exception as e:
         pprint(f"error:{e.args},xpath:{content_xpath}", indent=4)
     if htmlcontent:
         htmlcontent = "".join(etree.tostring(
             htmlcontent[0], encoding='utf-8').decode("utf-8"))
     return htmlcontent
Example #13
def visit_page(url,path="",save=False):
    content = {"title" : "", "url" : "", "keywords" : "", "links" : [], "body" : ""}
    if urlparse(url).netloc == "":
        #print "partial:", url
        return content
    
    if urlparse(url).netloc in BANNED:
        #print "banned:", url
        return content
    try:
        resp = urllib2.urlopen(url)
    except URLError as e:
        print e
        return content
    if resp.getcode() != 200:
        "Bad response: ",resp.getcode() 
        return content
    #resolves URL
    content["url"] = resp.url  
    html = resp.read()
    try:
        tree = etree.parse(StringIO.StringIO(html), parser)
    except:
        print "LXML error"
        return content
    content["title"] = tree.xpath("//title//text()")
    if len(content["title"]) > 0:
        content["title"] = content["title"][0].strip()
    content["links"] = tree.xpath("//body//@href")
    content["keywords"] = tree.xpath("//meta[@name='keywords']/@content")
    if content["keywords"] == "":
        content["keywords"] = tree.xpath("//meta[@name='Keywords']/@content")
        print "caught a case ",url
    #content["body"] = cleaner.clean_html(etree.tostring(tree.xpath("//body")[0]))
    body = cleaner.clean_html(etree.tostring(tree.xpath("//body")[0]))
    content["word_count"] = len(body.split(" "))

    #will save full html
    if save:
        filename = urllib.quote_plus(content["url"][0:60])+".txt"
        #filename.replace("http%3A%2F%2F", "")
                
        #if file doesn't already exist
        if not findInSub(filename,path):
            #make that day's path
            path = path + strftime("/%Y/%m/%d/", gmtime())
            if not os.path.exists(path):
                os.makedirs(path)
            f = open(path+filename, "w+")
            f.write(html)
            f.close()
            print "wrote " + path+filename
        else:
            print "already had " + filename
    return content
Example #14
def process_page(sterile_page, target_url):
    """
    Process the page so all the links has it's text wrapped in <em></em> and all the words that are longer than 4
    symbols are wrapped in <strong></strong>
    :param sterile_page: A string, target page's source stripped from all the tags, but <a></a>
    :param target_url: A string, an URL which user gave us
    :return: A string, processed page ready to render in template
    """
    # Parse the inbound page into element tree with lxml
    root = etree.fromstring(sterile_page)

    # First, let's deal with <a></a>
    for a_tag in root.xpath(".//a"):

        # If <a></a> has some text in it
        if a_tag.text and a_tag.text.strip():

            # Create new element <em></em>, assign the text from <a></a> to it, delete the text from <a></a>,
            # and insert <em></em> element instead
            em = etree.Element('em')
            em.text = a_tag.text
            a_tag.text = None
            a_tag.insert(0, em)

            # While we are at it, let's fix all the broken relative links we got from page source
            # #crutch_alert
            try:
                # If it works, we don't need to do anything with the a_tag's href
                valid = URLValidator()
                valid(a_tag.attrib['href'])

            except ValidationError:
                # Good chances are, that this malformed url is _relative_ to target url's domain
                a_tag.attrib['href'] = absolutize_url(
                    schemeful_domain(target_url), a_tag.attrib['href'])

        else:
            # If <a></a> is empty (e.g., after removing an image from anchor's text), remove it altogether with hrefs.
            a_tag.getparent().remove(a_tag)

    # Take every element in the tree and traverse the tree, checking if it has text in it
    # If it does, inflict reinforce_text() which will wrap the words in <strong></strong> if they are longer than 4
    for element in root.iter():

        if element.text and element.text.strip():
            element.text = reinforce_text(element.text)

        if element.tail and element.tail.strip():
            element.tail = reinforce_text(element.tail)

    # The final bit: flatten the modified tree back to string, decode it and then unescape everything what was escaped
    # (< and > in <strong></strong>)
    return unescape(etree.tostring(root, method='html').decode())
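A stripped-down, self-contained sketch (assumed sample markup) of just the <em>-wrapping step described in the docstring above; the URL fixing and <strong> wrapping are omitted.

from html import unescape
from lxml import etree

root = etree.fromstring('<div><a href="/x">link text</a> tail</div>')
for a_tag in root.xpath(".//a"):
    if a_tag.text and a_tag.text.strip():
        em = etree.Element('em')
        em.text = a_tag.text
        a_tag.text = None
        a_tag.insert(0, em)
print(unescape(etree.tostring(root, method='html').decode()))
# <div><a href="/x"><em>link text</em></a> tail</div>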
def sep_tag(elems, split_pattern):
    ret = []
    for e in elems:
        e_str = re.split(split_pattern, etree.tostring(e).decode('utf-8'))
        for s in e_str:
            try:
                s = "".join(etree.HTML(s).xpath("//text()")).strip()
                s = clean_text(s)
                if not re.fullmatch(r"\s*", s):
                    ret.append(s)
            except:
                pass
    return ret
Example #16
 def parse_count(tree):
     match = tree.xpath('//div/br/following-sibling::text()')
     rgx = re.compile('of\\s+(\\d+)\\.')
     if match:
         count = rgx.search(' '.join(match))
     else:
         count = rgx.search(etree.tostring(tree, encoding='unicode'))
     if not count:
         logger.error('Failed to count links in search page, returning 0')
         return 0
     links = int(count.group(1))
     logger.info('Number of links found in search page: %d', links)
     return links
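A small self-contained check of the counting logic above, using assumed sample markup:

import re
from lxml import etree

tree = etree.HTML('<div>Showing 1-10<br/> of 42.</div>')
match = tree.xpath('//div/br/following-sibling::text()')
count = re.search(r'of\s+(\d+)\.', ' '.join(match))
print(int(count.group(1)))  # 42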
Example #17
 def extract(self, html: Union[str, etree._Element]):
     if isinstance(html, etree._Element):
         html = etree.tostring(html, encoding='utf8').decode()
         try:
             res = re.findall(r'<html><body><p>(.*)</p></body></html>', html, flags=re.DOTALL | re.S)
             html = res[0]
         except:
             pass
     if self.many:
         matches = self._re_object.finditer(html)
         return [self._parse_match(match) for match in matches]
     else:
         match = self._re_object.search(html)
         return self._parse_match(match)
Example #18
 def to_string(self, element: Element, limit: int = None):
     """
     convert element to string
     :param element:
     :param limit:
     :return:
     """
     result = etree.tostring(element,
                             pretty_print=True,
                             encoding="utf-8",
                             method='html').decode('utf-8')
     if limit:
         return result[:limit]
     return result
Example #19
    def post_filter(self, args):
       
        title = args[0].split('[[')[-1].split(']]')[0].split('|')[-1]
        if title.strip():
            title = title.strip()

        text = args[1]
        counts = {}
        doc = etree.fromstring(text, etree.HTMLParser())
        hids = []            
        toc_html = '<div id="toc" class="table_of_contents"><h3>%s</h3>\n'%(title)
        for node in doc.xpath('//h1|//h2|//h3|//h4|//h5'):
            if node.tag.lower() == 'h1':
                this_depth = 0
            elif node.tag.lower() == 'h2':
                this_depth = 1
            elif node.tag.lower() == 'h3':
                this_depth = 2
            elif node.tag.lower() == 'h4':
                this_depth = 3
            elif node.tag.lower() == 'h5':
                this_depth = 4
            else:
                continue
            
            p = re.compile(r'[^a-zA-Z0-9\s_]')
            this_id = re.sub(p, '-', node.text).replace(' ','-')
            if this_id in hids:
                counts[this_id] = counts.get(this_id, 0) + 1
                this_id = '%s-%s'%(this_id, counts[this_id])
            hids.append(this_id)           
            
            pat = etree.tostring(node, encoding='unicode')
            rep = '<%s id="%s" class="toc_heading">%s'\
                  '<span class="toc_top"><a href="#toc">&#8617;</a></span></%s>'\
                  '<p style="clear: both;"></p>'\
                  %(node.tag, this_id, node.text, node.tag)
            text = text.replace(pat, rep, 1)
            indent_px = this_depth * 20
            toc_html += '<p style="margin-left: %spx">+ '\
                        '<a href="#%s">%s</a></p>\n'\
                        %(indent_px, this_id, node.text)                

        toc_html += '</div>\n'
        text = toc_html + text
        return text
Example #20
 def extract(self, selector, host='', body_xpath='', with_body_html=False):
     body_xpath = body_xpath or config.get('body', {}).get('xpath', '')
     if body_xpath:
         body = selector.xpath(body_xpath)[0]
     else:
         body = selector.xpath('//body')[0]
     body = self.remove_list_relevant(body)
     for node in iter_node(body):
         density_info = self.calc_text_density(node)
         node_hash = hash(node)
         text_density = density_info['density']
         ti_text = density_info['ti_text']
         text_tag_count = self.count_text_tag(node, tag='p')
         sbdi = self.calc_sbdi(ti_text, density_info['ti'],
                               density_info['lti'])
         images_list = node.xpath('.//img/@src')
         images_list = self.remove_img(images_list)
         host = host or config.get('host', '')
         if host:
             images_list = [
                 pad_host_for_images(host, url) for url in images_list
             ]
         node_info = {
             'ti': density_info['ti'],
             'lti': density_info['lti'],
             'tgi': density_info['tgi'],
             'ltgi': density_info['ltgi'],
             'node': node,
             'body': body,
             'density': text_density,
             'text': ti_text,
             'images': images_list,
             'text_tag_count': text_tag_count,
             'sbdi': sbdi
         }
         if with_body_html or config.get('with_body_html', False):
             body_source_code = unescape(
                 etree.tostring(node, encoding='utf-8').decode())
             node_info['body_html'] = body_source_code
         self.node_info[node_hash] = node_info
     self.calc_new_score()
     result = sorted(self.node_info.items(),
                     key=lambda x: x[1]['score'],
                     reverse=True)
     return result
Example #21
    def filter(
        self,
        html: str,
        inline: bool = False,
        outgoing: bool = False,
        display_name_mentions: Optional[Dict[str, str]] = None,
    ) -> str:
        """Filter and return HTML."""

        mentions = display_name_mentions

        sanit = Sanitizer(self.sanitize_settings(inline, outgoing, mentions))
        html = sanit.sanitize(html).rstrip("\n")

        if not html.strip():
            return html

        tree = etree.fromstring(
            html,
            parser=etree.HTMLParser(encoding="utf-8"),
        )

        for a_tag in tree.iterdescendants("a"):
            self._mentions_to_matrix_to_links(a_tag, mentions, outgoing)

            if not outgoing:
                self._matrix_to_links_add_classes(a_tag)

        html = etree.tostring(tree, encoding="utf-8", method="html").decode()
        html = sanit.sanitize(html).rstrip("\n")

        if outgoing:
            return html

        # Client-side modifications

        html = self.quote_regex.sub(r'\1<span class="quote">\2</span>\3', html)

        if not inline:
            return html

        return self.inline_quote_regex.sub(
            r'\1<span class="quote">\2</span>',
            html,
        )
Example #22
def get_html_at_url(
    url,
    charset='UTF-8',
):
    for banned in banned_list:
        if (banned in url):
            return flask.render_template('error.html')

    try:
        website = urllib.request.Request(url)
    except ValueError:
        return flask.render_template('error.html')

    try:
        html = urllib.request.urlopen(website).read().decode(charset)
    except Exception as e:
        return flask.render_template('error.html')

    root = make_etree(html, url)
    head = root.find('.//head')
    if head is not None:
        base = etree.Element('base', href=url)
        head.insert(0, base)
    profile_photo = copy_profile_photo_to_static(root)

    if profile_photo is not None:
        img_info = get_image_info(profile_photo)
        add_glasses(profile_photo, img_info['faces'][0])
        new_html = etree.tostring(root)

        # Credit:  Alexander J. Quinn.  Used with permission.  https://piazza.com/class/jkspuifikh3s9?cid=789
        mo = re.search(r"\s*<.+?>", html, flags=re.DOTALL)
        if mo is not None:
            doctype = mo.group(0)
            new_html = doctype.encode('utf8') + b"\n" + new_html
        return new_html

    else:
        return flask.render_template('noprofile.html')
def sep_tag(elems, split_pattern):
    """
    在html元素的字符串中进行切割
    :param elems: html元素
    :param split_pattern: 切割所使用的正则表达式
    :return: list of str, 每个字符串都经过清洗
    """
    ret = [""]
    url_head = "https://asoiaf.huijiwiki.com"
    # e_str = re.split(split_pattern, etree.tostring(elems).decode('utf-8'))
    elems = etree.HTML(etree.tostring(elems).decode('utf-8'))
    for s in elems.xpath("//body/*/text()|//body/*/*"):
        try:
            if isinstance(s, str):
                # reached a plain text node
                if not re.fullmatch(r"\s*", s):
                    ret[-1] += clean_text(s)
            else:
                if s.tag == "br":
                    ret.append("")
                elif s.tag == "a":
                    if "new" in s.xpath("@class"):
                        ret[-1] += clean_text("".join(s.xpath(".//text()")))
                    else:
                        ret[-1] += get_header(url_head + s.xpath("@href")[0])
                else:
                    ret[-1] += clean_text("".join(s.xpath(".//text()")))
        except BaseException as e:
            print(e)
    # clean out empty strings, split on commas
    ret_copy = []
    for item in ret:
        if item != "":
            # ret_copy += [*item.split(",")]
            ret_copy.append(item)
    return ret_copy
Example #24
def parse(html_str):
    global images
    html = etree.HTML(html_str)
    pics = html.xpath("//div[@class='single-content']/p")
    for pic in pics:
        images.append(etree.tostring(pic).decode('utf-8'))
    def extract(self, selector, host='', with_body_html=False):
        """
         W3C标准中HTML结构:

        <!DOCTYPE html>
        <html>
          <head>
            <meta charset="UTF-8">
            <title>网页标题</title>
          </head>
          <body>
            <h1>网页正文</h1>
          </body>
        </html>

        :param selector:
        :param host:
        :param with_body_html:
        :return:
        """
        body = selector.xpath('//body')[0]  # select the body tag
        for node in iter_node(body):
            node_hash = hash(node)
            # compute the node's text density
            density_info = self.calc_text_density(
                node
            )  # returns {'density': density, 'ti_text': ti_text, 'ti': ti, 'lti': lti, 'tgi': tgi, 'ltgi': ltgi}

            # compute the text/symbol density
            text_density = density_info['density']
            ti_text = density_info['ti_text']
            text_tag_count = self.count_text_tag(node, tag='p')  # count text tags (p)
            sbdi = self.calc_sbdi(ti_text, density_info['ti'],
                                  density_info['lti'])  # returns sbdi or 1

            # resolve image URLs (collect every img src; prepend the host if one is configured)
            images_list = node.xpath('.//img/@src')
            host = host or config.get('host', '')
            if host:
                images_list = [
                    pad_host_for_images(host, url) for url in images_list
                ]

            node_info = {
                'ti': density_info['ti'],
                'lti': density_info['lti'],
                'tgi': density_info['tgi'],
                'ltgi': density_info['ltgi'],
                'node': node,
                'density': text_density,
                'text': ti_text,
                'images': images_list,
                'text_tag_count': text_tag_count,
                'sbdi': sbdi
            }
            # generate the HTML source of the tag that holds the article body
            if with_body_html or config.get('with_body_html', False):
                body_source_code = unescape(
                    etree.tostring(node, encoding='utf-8').decode())
                node_info['body_html'] = body_source_code
            self.node_info[node_hash] = node_info
        std = self.calc_standard_deviation()  # compute the standard deviation
        self.calc_new_score(std)  # core scoring function
        # sorted(key=...) chooses which element each item is sorted by;
        # here items are sorted by the 'score' value in the second element
        result = sorted(self.node_info.items(),
                        key=lambda x: x[1]['score'],
                        reverse=True)
        return result
Example #26
data = list()
for detail_url in detail_list:
    url = root + detail_url
    rsp = requests.get(url=url, headers=headers)
    html = etree.HTML(rsp.text)
    row = dict()

    # get the title
    row['title'] = html.xpath('//h1[@class="page-title"]/text()')[0].replace(
        ' ', '').replace('\n', '')

    # get the ingredients
    row['ins'] = dict()
    for r in html.xpath('//div[@class="ings"]//tr[@itemprop]'):
        k = r.find('td[@class="name"]')
        k = etree.tostring(k, encoding='utf-8').decode('utf-8')
        k = re.sub('<.*?>', '', k).replace(' ', '').replace('\n', '')
        v = r.find('td[@class="unit"]').text.replace(' ', '').replace('\n', '')
        row['ins'][k] = v

    # get the cooking steps
    row['steps'] = list()
    for r in html.xpath('//div[@class="steps"]/ol/li'):
        li = dict()
        p = r.find('p[@class="text"]')
        img = r.find('img').get('src') if r.find('img') is not None else ''
        text = etree.tostring(p, encoding='utf-8').decode('utf-8')
        text = re.sub('<.*?>', '', text).replace(' ', '').replace('\n', '')
        li['text'] = text
        li['img'] = img
        row['steps'].append(li)
Example #27
            step_.replace('\n', '</p>\n<p>\n') + '</p>\n]]>'
        expect_ = '<![CDATA[<p>\n' + \
            expect_.replace('\n', '</p>\n<p>\n') + '</p>\n]]>'

        test_case = etree.SubElement(testsuite_, 'testcase', name=case_)
        preconditions = etree.SubElement(test_case, 'preconditions')
        preconditions.text = u'{0}'.format(pre_)
        steps = etree.SubElement(test_case, 'steps')
        step = etree.SubElement(steps, 'step')
        step_number = etree.SubElement(step, 'step_number')
        step_number.text = u'1'
        actions = etree.SubElement(step, 'actions')
        actions.text = u'{0}'.format(step_)
        expectedresults = etree.SubElement(step, 'expectedresults')
        expectedresults.text = u'{0}'.format(expect_)
        keywords = etree.SubElement(test_case, 'keywords')
        keyword_list = keywords_.split('\n')
        for kw in keyword_list:
            keyword = etree.SubElement(keywords, 'keyword', name=kw)
    except Exception as e:
        print("line:", seq)
        print(str(e))
        for item in sys.exc_info():
            print(item)

s = etree.tostring(xmlroot, pretty_print=True, encoding='utf-8').decode()
s = s.replace('&lt;', '<')  # temporary forced fix; content that itself contains < or > may break the XML format later and make the import fail.
s = s.replace('&gt;', '>')
with open('output.xml', mode='w+', encoding='utf-8') as target:
    target.write(s)
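For reference, a minimal standalone sketch (element names are assumed, loosely mirroring the snippet above) of building a tree with SubElement and serializing it with pretty_print:

from lxml import etree

xmlroot = etree.Element('testsuite', name='demo')
test_case = etree.SubElement(xmlroot, 'testcase', name='sample case')
steps = etree.SubElement(test_case, 'steps')
step = etree.SubElement(steps, 'step')
etree.SubElement(step, 'step_number').text = '1'
etree.SubElement(step, 'actions').text = 'open the page'
print(etree.tostring(xmlroot, pretty_print=True, encoding='utf-8').decode())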
Example #28
from lxml.html import etree
"""
用lxml来解析html代码
"""

text = '''
<div>
   <ul>
     <li class="item_1"><a href="0.html">item</a><li>
     <li class="item_2"><a href="0.html">item</a><li>
     <li class="item_3"><a href="0.html">item</a><li>
   </ul>
</div>

'''

# use etree.HTML to parse the string into an HTML document

html = etree.HTML(text)
s = etree.tostring(html)
print(s)
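Following on from the snippet above (reusing the html tree parsed there), a short sketch of the tostring options that appear throughout these examples; the output shown is illustrative.

# str instead of bytes
print(etree.tostring(html, encoding='unicode'))

# pretty-printed, HTML-style serialization
print(etree.tostring(html, pretty_print=True, method='html').decode('utf-8'))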
Example #29
try:
  tree = parse(arg['url'])
  #parser = etree.HTMLParser()
  #tree = etree.parse(arg['url'], parser)
except:
  # should not be raised if only some minor HTML errors occur
  err_exit('Unable to parse the given HTML document.')

if arg['b']:
  """handle --beautiful-output argument"""

  # nasty solution, but we need the "cleaned" version (i.e. the
  #   lxml-specific one) of HTML tree
  printw(BeautifulSoup(
    etree.tostring(tree.getroot(), method='html') ).prettify())
  sys.exit(0)

# precompile regexps
eres = {
  'url': re.compile('^(http://)?www[.]', re.IGNORECASE),
  'trail_space': re.compile('\\s*$'),
  'lead_space': re.compile('^\\s*'),
  'whole_blank': re.compile('^\\s*$'),
  'blank': re.compile('[\\s\u00a0]+'),  # \u00a0 == nbsp
  'url_hex': re.compile('(%[A-F0-9]{2}|\\+)+'),
  'year4': re.compile('[(]\\s*([12][0-9][0-9][0-9])\\s*[)]'),
  'year2': re.compile('[(]\\s*([01][0-9])\\s*[)]'),
  'year4r': re.compile('[(]\\s*([12][0-9][0-9][0-9])(?!.*?[12][0-9][0-9][0-9])\\s*[)]'),
  'year2r': re.compile('[(]\\s*([01][0-9])(?!.*?[01][0-9])\\s*[)]'),
  'year4r_no_paren': re.compile('\\s*([12][0-9][0-9][0-9])(?!.*?[12][0-9][0-9][0-9])\\s*'),