Example #1
    def run(self, text):
        soup = BeautifulSoup(text, 'html.parser')
        new_soup = BeautifulSoup()

        content = new_soup.new_tag('div', **{'class': self.content_class})

        for tag in soup.children:
            if isinstance(tag, NavigableString):
                continue

            if tag.name not in self.incut_tags and len(tag.contents) == 1 and tag.contents[0].name in self.incut_tags:
                tag = tag.contents[0]

            if tag.name in self.incut_tags:
                if len(content):
                    new_soup.append(content)
                    content = new_soup.new_tag('div', **{'class': self.content_class})

                klass = self.incut_class
                if tag.name == 'iframe':
                    klass += ' ' + self.incut_video_class

                incut = soup.new_tag('div', **{'class': klass})
                incut.append(tag)
                new_soup.append(incut)
            else:
                content.append(tag)

        if len(content):
            new_soup.append(content)

        return new_soup.decode()
Example #2
 def render_to_response(self, context, **kwargs):
     response = super(ArticleDetail, self).render_to_response(context, **kwargs)
     if self.request.user.is_staff or self.request.GET.get("preview"):
         return response
     cache = caches['default']
     if cache.get(context['object'].get_absolute_url()):
         return cache.get(context['object'].get_absolute_url())
     content = response.rendered_content
     bs = BeautifulSoup(content, "html5lib")
     imgs = bs.find("div", class_="article-content").find_all("img")
     for img in imgs:
         if not img.attrs:
             continue
         ns_attrs = img.attrs
         ns_img = bs.new_tag("img", **ns_attrs)
         img.insert_before(ns_img)
         ns_img.wrap(bs.new_tag("noscript"))
         if img.attrs.get("class") and "lazyload" in img.attrs["class"]:
             continue
         img.attrs["class"] = img.attrs.get("class", []) + ["lazyload"]
         if img.attrs.get("src"):
             img.attrs["data-src"] = img.attrs.get("src")
         if img.attrs.get("srcset"):
             img.attrs["data-srcset"] = img.attrs.get("srcset")
             img.attrs.pop("srcset", "")
         gray_gif = "data:image/gif;base64,R0lGODlhAQABAIAAAMLCwgAAACH5BAAAAAAALAAAAAABAAEAAAICRAEAOw=="
         img.attrs["src"] = gray_gif
     content = unicode(bs)
     response.content = content
     cache.set(context['object'].get_absolute_url(), response)
     return response
Example #3
 def handle_html_content(self, content):
     soup = BeautifulSoup(content, 'html.parser')
     for p_elem in soup.find_all('p'):
         css = None
         if 'style' in p_elem.attrs:
             css = cssutils.parseStyle(p_elem.attrs['style'])
         text_list = p_elem.text.split()
         p_new = soup.new_tag('p', style=css.cssText if css else None)
         for idx, word in enumerate(text_list):
             if len(self.dorks) <= 0:
                 self.dorks = yield from self.get_dorks()
             word += ' '
             if idx % 5 == 0:
                 a_tag = soup.new_tag(
                     'a',
                     href=self.dorks.pop(),
                     style='color:{color};text-decoration:none;cursor:text;'.format(
                         color=css.color if css and 'color' in css.keys() else '#000000'
                     )
                 )
                 a_tag.string = word
                 p_new.append(a_tag)
             else:
                 p_new.append(soup.new_string(word))
         p_elem.replace_with(p_new)
     content = soup.encode('utf-8')
     return content
Example #4
def test_feed():
    data = urllib2.urlopen(MYFEED).read()
    tree = BeautifulSoup(data, features='xml')

    items = tree.find_all('entry')
    for i in items:
        # find the ID
        video = i.find('link', rel='related')['href'].split('/')[-1]

        # remove extraneous gunk
        [q.decompose() for q in i.group.find_all('content')]

        # add or update the enclosure
        enc = tree.new_tag('link')
        enc['rel'] = 'enclosure'
        enc['type'] = 'audio/mpeg'
        enc['title'] = 'mp3'
        enc['href'] = 'http://%s:%s/media/%s' % (HOSTNAME, PORT, video)
        i.append(enc)

        # add the description
        desc = tree.new_tag('description')
        desc.string = i.description.string
        i.append(desc)

    return tree.prettify()
Example #5
def create_new_feed():
    config = yaml.load(open('config.yaml'))
    if not os.path.isfile(config['feed_location']):
        # Create initial feed info
        soup = BeautifulSoup(
            """<feed xmlns="http://www.w3.org/2005/Atom"></feed>""")
        feed_tag = soup.feed

        # Create required tags and add them in the feed tag
        title_tag = soup.new_tag('title')
        id_tag = soup.new_tag('id')
        updated_tag = soup.new_tag('updated')
        feed_tag.append(title_tag)
        feed_tag.append(id_tag)
        feed_tag.append(updated_tag)

        # Put data into the new feeds
        title_tag.string = config['feed_title']
        id_tag.string = config['feed_id']
        updated_tag.string = datetime.datetime.now().isoformat('T')

        feed_file = open(config['feed_location'], 'w')
        print(config['feed_xml'], file=feed_file)
        print(feed_tag.prettify(), file=feed_file)
        feed_file.close()
Example #6
def serializeLabContent(labContent):
	#print labContent
	f = open('template.html', "r")
	labHtml = f.read()
	f.close()
	labTemplate = BeautifulSoup(labHtml)
	articleSection = labTemplate.find_all('div', id="experiment-article-sections")[0]
	sectionNumber = 1
	for sectionName,sectionContent in labContent:
		sectionTag = labTemplate.new_tag('section', id="experiment-article-section-"+str(sectionNumber))
		articleSection.append(sectionTag)
		iconTag = labTemplate.new_tag('div', id="experiment-article-section-"+str(sectionNumber)+"-icon")
		iconTag['class']='icon'
		sectionTag.append(iconTag)
		headingTag = labTemplate.new_tag('div', id="experiment-article-section-"+str(sectionNumber)+"-heading")
		headingTag['class']='heading'
		headingTag.append(sectionName)
		sectionTag.append(headingTag)
		contentTag = labTemplate.new_tag('div', id="experiment-article-section-"+str(sectionNumber)+"-content")
		contentTag['class']='content'
		contentTag.append(sectionContent)
		sectionTag.append(contentTag)
		sectionNumber +=1	
	f = open('content.html', "w+")
	labTemplate = labTemplate.prettify()
	f.write(labTemplate.encode('utf-8'))
	f.close()
Example #7
def split_and_save(sentence, listOfWords, new):
    while len(sentence) > 0:
        res = find_word(sentence, listOfWords)
        word = BeautifulSoup.new_tag(new, "w")
        if res[0] == True:
            
            for i in res[1]:
                an = BeautifulSoup.new_tag(new, "ana")
                an['lex'] = i.lex
                an['transcr'] = i.transcr
                an['sem'] = i.sem
                word.append(an)
            if res[2] < len(sentence) and res[2] > 0:
                sentence = sentence[res[2]:]
            elif res[2] == 0:
                an = BeautifulSoup.new_tag(new, "ana")
                an['lex'] = sentence[0]
                word.append(an)
                sentence = sentence[1:] 
            else:
                sentence = ''
            new.append(word)
        else:
            if 1 < len(sentence):
                an = BeautifulSoup.new_tag(new, "ana")
                an['lex'] = sentence[0]
                word.append(an)
                sentence = sentence[1:]
            else:
                sentence = ''
Example #8
def write_counters():
    try:
        if os.path.exists(common.getConfig("rootDir") + "/report/report.html"):
            pre_rendered = open(common.getConfig("rootDir") + "/report/report.html",'r').read()
            pre_rendered_html = BeautifulSoup(pre_rendered,'html5lib')
            warnings =  len(re.findall(r'badger-warning', str(pre_rendered_html)))
            information =  len(re.findall(r'badger-success', str(pre_rendered_html)))
            vulnerabilities =  len(re.findall(r'badger-danger', str(pre_rendered_html)))
            debug =  len(re.findall(r'debug-level', str(pre_rendered_html)))

            new_div_tag = pre_rendered_html.new_tag("div")
            new_div_tag.string = str(vulnerabilities)
            pre_rendered_html.find("h1", id="vulnerability_count").append(new_div_tag)

            new_div_tag1 = pre_rendered_html.new_tag("div")
            new_div_tag1.string = str(warnings)
            pre_rendered_html.find("h1", id="warning_count").append(new_div_tag1)

            new_div_tag2 = pre_rendered_html.new_tag("div")
            new_div_tag2.string = str(information)
            pre_rendered_html.find("h1", id="information_count").append(new_div_tag2)

            new_div_tag3 = pre_rendered_html.new_tag("div")
            new_div_tag3.string = str(debug)
            pre_rendered_html.find("h1", id="debug_count").append(new_div_tag3)

            with open(common.getConfig("rootDir") + "/report/report.html", "w") as fh:
                fh.write(str(pre_rendered_html.prettify()))
            fh.close()
    except Exception as e:
        common.logger.debug("Error in write_counters: " + str(e))
Example #9
File: AI2KD.py Project: hlpan/AI2KD
def add_inflections():
    """
    添加变形列表
    """
    Fobj = open(file_name_value, "r", encoding="utf-8")
    data = Fobj.read()
    Fobj.close()
    soup = BeautifulSoup(data)
    entrylist = soup.findAll("idx:entry")
    for entry in entrylist:
        # the current word
        word = entry.find("idx:orth")["value"]
        # Look the word up in infl_list.
        # infl_index holds the base forms and infl_list the inflections,
        # so first find the position in infl_index, then read infl_list.
        if word in infl_index:
            pos = infl_index.index(word)
        else:
            continue
        if len(infl_list) <= 1:
            continue
        idx_infl_tag = soup.new_tag("idx:infl")
        for x in range(1, len(infl_list[pos])):
            idx_iform_tag = soup.new_tag("idx:iform", value=infl_list[pos][x])
            idx_infl_tag.append(idx_iform_tag)
        # insert the group of tags right after the idx:orth tag
        entry.find("idx:orth").insert_after(idx_infl_tag)

    name_split = file_name.split(".")
    name_split.insert(-1, "_add_infl.")
    file_name_infl = "".join(name_split)
    Fobj = open(file_name_infl, "w", encoding="utf-8")
    Fobj.write(str(soup))
    Fobj.close()
Example #10
def buildtable(matrix, section):
    """Return an XML <informaltable> built from a matrix of rows."""
    if section is True:
        soup = BeautifulSoup('''<section>
                                    <title>Insert Title Here</title>
                                    <informaltable>
                                        <tgroup>
                                        </tgroup>
                                    </informaltable>
                                </section>''', "xml")
    else:
        soup = BeautifulSoup('''<informaltable>
                                    <tgroup>
                                    </tgroup>
                                </informaltable>''', "xml")

    # tgroup
    # cols = 'cols="' + str(len(matrix[1])) + '"'
    soup.tgroup['cols'] = str(len(matrix[1]))

    # thead
    thead = soup.new_tag("thead")
    soup.tgroup.append(thead)
    header = matrix.pop(0)
    thead.append(createrow(header))

    # tbody
    tbody = soup.new_tag("tbody")
    soup.tgroup.append(tbody)
    for row in matrix:
        tbody.append(createrow(row))

    return soup.prettify()
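The buildtable snippet above relies on a createrow helper that is not shown. A hedged usage sketch follows, with a guessed DocBook-style stand-in for createrow (the row/entry tag names are an assumption) and everything assumed to live in one module:

# Hedged usage sketch; createrow is a stand-in for the helper the original
# snippet depends on but does not include.
from bs4 import BeautifulSoup

def createrow(cells):
    row_soup = BeautifulSoup("", "xml")
    row = row_soup.new_tag("row")
    for cell in cells:
        entry = row_soup.new_tag("entry")
        entry.string = str(cell)
        row.append(entry)
    return row

matrix = [["Name", "Value"], ["foo", 1], ["bar", 2]]
print(buildtable(matrix, section=False))  # prints the prettified <informaltable> XML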
Example #11
def generate_job_item(job,current_job=None):
    """
    Generates an HTML snippet for a single job as a list item
    in the side navigation bar

    :job: job row
    :current_job: current job, can be None
    :rtype: Generated HTML or None
    """
    job_soup = BeautifulSoup()
    li = job_soup.new_tag("li")
    job_link = job_soup.new_tag("a",
                                href="/apps/marc_batch/jobs/{0}/".format(job.pk))
    job_link.string = job.name
    li.append(job_link)
    if current_job is not None:
        if job.pk == current_job.pk:
            li['class'] = 'active'
            job_tasks = job_soup.new_tag("ul")
            job_tasks["class"] = "nav nav-list"            
            history_li = job_soup.new_tag("li")
            history_link = job_soup.new_tag("a",
                                            href="/apps/marc_batch/jobs/{0}/history/".format(current_job.pk))
            history_link.string = "History"
            history_li.append(history_link)
            job_tasks.append(history_li)
            li.append(job_tasks)
            print(li)
    return mark_safe(str(li))
Example #12
    def modify_html(self, html, source_article_id):
        # we need this in order to plot the heatmap
        soup = Soup(html, 'html.parser')
        head = soup.find('base')
        print soup.find("title")
        if head is not None:
            head.decompose()


        css = soup.find("link", {"rel": "stylesheet"})
        if css is not None:
            css['href'] = 'https:' + css['href']
            headers = {'user-agent': EMAIL}
            r = requests.get(css['href'], headers=headers, stream=True)
            css['href'] = ""
            if r.status_code == 200:
                style = soup.new_tag('style')
                style.string = r.text
                css.insert_after(style)
            else:
                print('FAIL: Cannot load css  for id: "%s" ' % source_article_id)

            css.decompose()

        last_element_on_page_meta = soup.new_tag('meta')
        last_element_on_page_meta['http-equiv'] = "content-type"
        last_element_on_page_meta['content'] = "text/html; charset=utf-8"

        body = soup.find('body')
        #if body is not None:
        last_element_on_page = soup.new_tag('div')
        last_element_on_page['class'] = "pyqt_is_shit"
        body.append(last_element_on_page)
        return soup.prettify(encoding='utf-8')
Example #13
	def dir_links(self, path):
		soup = BeautifulSoup('<ul></ul>', 'lxml')
		sublist = soup.ul
		for root, subFolders, files in os.walk(path):
			link = 'file://' + root
			name = root.split('/')[-1]

			item_tag = soup.new_tag('li')
			a_tag = soup.new_tag('a', href=link)
			a_tag.string = name.split('.')[0]
			item_tag.append(a_tag)
			sublist.append(item_tag)

			sublist_tag = soup.new_tag('ul')
			item_tag.append(sublist_tag)


			for f in files:
				link = 'file://' + root + f

				item_tag = soup.new_tag('li')
				a_tag = soup.new_tag('a', href=link)
				a_tag.string = f
				item_tag.append(a_tag)
				sublist_tag.append(item_tag)

			# If next os.walk iteration is going down a level, go down a level in list
			if subFolders:
				sublist = sublist.ul

		return soup.ul
Example #14
def add_sidebar(content, item):

    page = BeautifulSoup(content)

    # Table of contents.

    table = []

    for i in page.section.find_all('h2'):
        i['id'] = i.string.lower().replace(' ', '-')
        table.append(i)

    title = page.new_tag('h1')
    title.string = 'Table of contents'

    page.header.append(title)
    page.header.append(page.new_tag('ul'))

    for i in table:
        tag = page.new_tag('li')
        a = page.new_tag('a', href='#' + i['id'])
        a.string = i.string
        tag.append(a)
        page.header.find_all('ul')[-1].append(tag)

    return str(page)
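A minimal usage sketch for add_sidebar above; the sample markup and the None passed for the unused item argument are assumptions:

# Hedged usage sketch, not from the original project; assumes add_sidebar and
# its bs4 import are already in scope.
sample = ('<header></header>'
          '<section><h2>Getting Started</h2><h2>Advanced Topics</h2></section>')
print(add_sidebar(sample, None))  # the item argument is never read in the snippet
# Each h2 gains an id such as "getting-started", and the header gains an
# h1 "Table of contents" plus a ul of anchor links pointing at those ids.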
Example #15
def create_new_entry(title_contents, article_contents, link, img_link=''):
    soup = BeautifulSoup("<entry></entry>")
    entry_tag = soup.entry
    id_tag = soup.new_tag('id')
    title_tag = soup.new_tag('title')
    updated_tag = soup.new_tag('updated')
    content_tag = soup.new_tag('content')
    link_tag = soup.new_tag('link')

    entry_tag.append(id_tag)
    entry_tag.append(title_tag)
    entry_tag.append(updated_tag)
    entry_tag.append(content_tag)
    entry_tag.append(link_tag)

    id_tag.string = link
    link_tag['href'] = link
    title_tag.contents = title_contents
    title_tag['type'] = 'xhtml'
    content_tag['type'] = 'xhtml'

    img_tag = soup.new_tag('img')
    img_tag['src'] = img_link
    article_contents.insert(0, img_tag)
    content_tag.contents = article_contents
    updated_tag.string = datetime.datetime.now().isoformat('T')
    return entry_tag
Example #16
    def _get_networks_tag(self):
        bs = BeautifulSoup()
        networks_tag = bs.new_tag('networks')
        for key in self.networks:
            network_tag = bs.new_tag('network')
            network_tag['sourceType'] = self.networks[key][1]['sourceType']
            network_tag['source'] = self.networks[key][1]['source']
            network_tag['targetType'] = self.networks[key][1]['targetType']
            network_tag['target'] = self.networks[key][1]['target']
            network_tag['id'] = key
            network_tag['isDirected'] = dmlpu.unformat_prop(self.networks[key]['isDirected'])
            network_tag['allowSelfLoops'] = dmlpu.unformat_prop(self.networks[key]['allowSelfLoops'])
            network_tag['isBinary'] = dmlpu.unformat_prop(self.networks[key]['isBinary'])

            e_l = self.networks[key].edge_list()
            if self.networks[key]['isBinary']:
                for i in range(len(e_l)):
                    network_tag.append(bs.new_tag('link', source=e_l[i][0], target=e_l[i][1]))
            else:
                for i in range(len(e_l)):
                    network_tag.append(bs.new_tag('link', source=e_l[i][0], target=e_l[i][1],
                                                  value=self.networks[key].es[i]['weight']))

            networks_tag.append(network_tag)

        return networks_tag
Example #17
def toc_from_headers(html_string):
    """make a table of contents from headers"""
    soup = BeautifulSoup(html_string, "html.parser")
    headers = soup.find_all(name=re.compile("h[1-3]"), id=True)
    toc_s = ""
    for h in headers:
        if h.name == "h1":
            toc_level = "level-1"
        elif h.name == "h2":
            toc_level = "level-2"
        else:
            toc_level = "level-3"

        new_a = soup.new_tag("a", href="#"+h["id"])
        if h.string:
            new_a.string = h.string
        else:
            new_a.string = " ".join(h.strings)
        new_li = soup.new_tag("li")
        new_li["class"] = toc_level
        new_li.append(new_a)

        toc_s += str(new_li)+"\n"

    return str(toc_s)
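A small usage sketch for toc_from_headers above; the sample markup is invented, and the function's own re and bs4 imports are assumed to be in scope:

# Hedged usage sketch, not from the original project.
sample = '<h1 id="intro">Intro</h1><h2 id="setup">Setup</h2><p>body text</p>'
print(toc_from_headers(sample))
# Prints, roughly:
# <li class="level-1"><a href="#intro">Intro</a></li>
# <li class="level-2"><a href="#setup">Setup</a></li>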
Example #18
def add_meta_tag(page_dir, index_page):
    google_content = config['WEB-TOOLS']['google']
    bing_content = config['WEB-TOOLS']['bing']

    if not google_content and not bing_content:
        return

    with open('/opt/snare/pages/' + page_dir + "/" + index_page) as main:
        main_page = main.read()
    soup = BeautifulSoup(main_page, 'html.parser')

    if (google_content and
                soup.find("meta", attrs={"name": "google-site-verification"}) is None):
        google_meta = soup.new_tag('meta')
        google_meta.attrs['name'] = 'google-site-verification'
        google_meta.attrs['content'] = google_content
        soup.head.append(google_meta)
    if (bing_content and
                soup.find("meta", attrs={"name": "msvalidate.01"}) is None):
        bing_meta = soup.new_tag('meta')
        bing_meta.attrs['name'] = 'msvalidate.01'
        bing_meta.attrs['content'] = bing_content
        soup.head.append(bing_meta)

    html = soup.prettify("utf-8")
    with open('/opt/snare/pages/' + page_dir + "/" + index_page, "wb") as file:
        file.write(html)
Example #19
            def album2html(raw):
                from bs4 import BeautifulSoup

                soup = BeautifulSoup(raw)
                desc = soup.find(id='album-desc')

                images_orig = soup.find(id='album-images')
                coverid = images_orig.get('data-cover')
                images = soup.new_tag('div', id='album-images')
                for imgtag in images_orig.find_all(['img']):
                    try:
                        idx = imgtag.get('data-id')
                        img = Image.objects.get(idx=idx)
                        self.image_set.add(img)
                        imgtag['src'] = img.thumb_url
                        imgtag['alt'] = img.desc if img.desc else ''
                        imgtag['data-src'] = img.img_url
                        images.append(imgtag)
                    except:  # ignore illegal img
                        pass

                divcover = soup.new_tag('div', id='album-cover')
                try:
                    coverimg = images.find(lambda tag: tag.get('data-id') == coverid)
                    import copy
                    divcover.append(copy.deepcopy(coverimg))
                except:
                    pass

                return '\n'.join(map(lambda div: div.prettify(),
                                     [divcover, desc, images]))
Example #20
def content_process(content, mode):
    content = clone_bs4_elem(content)
    del content['class']
    soup = BeautifulSoup(
        '<html><head></head><body></body></html>')
    soup.body.append(content)
    no_script_list = soup.find_all("noscript")
    for no_script in no_script_list:
        no_script.extract()
    if mode == 'answer':
        img_list = soup.find_all("img", class_=["origin_image", "content_image"])
    elif mode == 'post':
        img_list = soup.find_all("img")
    for img in img_list:
        if mode == 'answer':
            if "content_image" in img['class']:
                img['data-original'] = img['data-actualsrc']
            new_img = soup.new_tag('img', src=PROTOCOL + img['data-original'])
        elif mode == 'post':
            # the original image does not need the replace
            new_img = soup.new_tag('img', src=PIC_PROTOCOL + img['src'].replace('.jpg','_b.jpg'))
        img.replace_with(new_img)
        if img.next_sibling is None:
            new_img.insert_after(soup.new_tag('br'))
    useless_list = soup.find_all("i", class_="icon-external")
    for useless in useless_list:
        useless.extract()
    return soup.prettify()
Example #21
def parse(raw_html, chapter_no):
    soup = BeautifulSoup(raw_html)

    charset_tag = soup.head.meta.extract()
    title_tag = soup.head.title.extract()
    style_tag = soup.new_tag("link", rel="stylesheet",
                        type="text/css", href="../styles.css")
    soup.head.clear()
    soup.head.append(charset_tag)
    soup.head.append(title_tag)
    soup.head.append(style_tag)

    story_text = soup.find("div", id="storytext").extract()
    story_text.attrs = None
    chapters = soup.find("select", id="chap_select")
    chapter = chapters.find("option", value=str(chapter_no)).string

    soup.body.clear()
    soup.body.attrs = None
    chapter_tag = soup.new_tag("h2")
    chapter_tag.string = chapter
    soup.body.append(chapter_tag)
    soup.body.append(story_text)

    html = str(soup)
    html = html.replace("<!DOCTYPE html>", XHTML_TRANSITIONAL, 1)
    return html, chapter_no
Example #22
    def parse_body(self, response):
        """
        解析正文
        :param response: 爬虫返回的response对象
        :return: 返回处理后的html文本
        """
        try:
            soup = BeautifulSoup(response.content, 'html.parser')
            body = soup.find_all(class_="x-wiki-content")[0]

            # add the title, centered on the page
            title = soup.find('h4').get_text()
            center_tag = soup.new_tag("center")
            title_tag = soup.new_tag('h1')
            title_tag.string = title
            center_tag.insert(1, title_tag)
            body.insert(1, center_tag)

            html = str(body)
            # turn relative src paths of img tags in the body into absolute paths
            pattern = "(<img .*?src=\")(.*?)(\")"

            def func(m):
                if not m.group(3).startswith("http"):
                    rtn = "".join([m.group(1), self.domain, m.group(2), m.group(3)])
                    return rtn
                else:
                    return "".join([m.group(1), m.group(2), m.group(3)])

            html = re.compile(pattern).sub(func, html)
            html = html_template.format(content=html)
            html = html.encode("utf-8")
            return html
        except Exception as e:
            logging.error("解析错误", exc_info=True)
Example #23
def buildHtml(year):
	dataFilePath='e:\\patent\\patent-cn-{0}.txt'.format(str(year))
	if os.path.exists(dataFilePath):		
		patents=[]
		with open(dataFilePath,'r') as f:
			for each_line in f:
				patent=eval(each_line)
				if patent['type']=='patent':
					rate=pattern.findall(patent['rate'])
					if int(rate[1])>=9:
						patent['count']=int(rate[0])
						patents.append(patent)
		patents=sorted(patents,key=itemgetter('count'),reverse=True)
		soup=BeautifulSoup('<html><head></head><body><ul></ul></body></html>')
		ul=soup.ul
		for patent in patents[0:20]:
			li=soup.new_tag('li')
			a=soup.new_tag('a',href=patent['url'],target='_blank')
			a.string=patent['title']
			li.append(a)
			li.append(patent['rate'])
			ul.append(li)
		htmlFilePath='e:\\patent\\patent-cn-{0}.html'.format(str(year))
		with open(htmlFilePath,'w') as f:			
			f.write(soup.prettify())
Example #24
    def highlight_syntax(self, soup):
        """
        Highlight code syntax.

        :param soup: bs4 instance
        :return: bs4 instance
        """
        code_tags = soup.find_all('code')

        for code in code_tags:
            if code.has_attr('class'):
                lang = code['class']
                code.parent['class'] = "highlight " + lang[0]
                del code['class']
                code.name = "span"

                in_pre_code = syntax_highlight(lang[0], code.string)
                if self.config.CLIPBOARD:
                    s = BeautifulSoup("<blockquote></blockquote>")
                    blockquote = s.blockquote
                    blockquote['class'] = 'highlight ' + lang[0]
                    p = s.new_tag('p')
                    a = s.new_tag('a', href="#")
                    a['class'] = 'clipboard'
                    a["data-clipboard-text"] = code.string
                    a["data-clipboard-action"] = "copy"
                    a.append("copy")
                    p.append(a)
                    blockquote.append(p)

                    in_pre_code += str(blockquote)

                code.parent.replaceWith(in_pre_code)

        return soup
Example #25
def extract_table(content, rows=None):

    dammit = UnicodeDammit(content, ["utf-8", "latin-1", "iso-8859-1"])
    soup = BeautifulSoup(dammit.unicode_markup)

    table = soup.find("table")
    # removing coordinates
    table.tr.find_next_sibling("tr").extract()
    table.tr.find_next_sibling("tr").extract()
    # moving the link to a new column
    trs = table.tr.find_next_siblings("tr")

    more = trs[2].td.find_next_sibling("td")
    link = trs[3].td.find_next_sibling("td")

    trs[2].td.extract()
    trs[3].td.extract()

    trs[2].decompose()
    trs[3].decompose()

    trs[0].append(more)
    trs[1].append(link)
    # copyright info, if any
    if rows:
        new_td = soup.new_tag("td",colspan=2)
        new_td.string = " — ".join(rows)
        new_tr = soup.new_tag("tr")
        new_tr.append(new_td)
        trs[1].insert_after(new_tr)

    s = unicode(table)

    return u''.join('&%s;' % entities[ord(c)] if ord(c) in entities else c for c in s)
Example #26
def build_bonita_role_xml(uuid,name,description='',label='',dbid='',with_class=False):
    """ Build XML for a Bonita Role information """
    # Build XML body
    soup = BeautifulSoup('','xml')

    tag_role = soup.new_tag('Role')
    if with_class:
        tag_role.attrs['class']='Role'

    tag_uuid = soup.new_tag('uuid')
    tag_name = soup.new_tag('name')
    tag_description = soup.new_tag('description')
    tag_label = soup.new_tag('label')
    tag_dbid = soup.new_tag('dbid')

    tag_uuid.string = uuid
    tag_name.string = name
    tag_description.string = description
    tag_label.string = label
    tag_dbid.string = dbid

    role_tags = [tag_uuid,tag_name,tag_description,tag_label,tag_dbid]

    for tag in role_tags:
        tag_role.append(tag)

    return unicode(tag_role)
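A usage sketch for build_bonita_role_xml above, assuming Python 2 (the snippet calls unicode) and that bs4 with the lxml XML builder is available:

# Hedged usage sketch, not from the original project; Python 2 because of unicode().
xml = build_bonita_role_xml('role-uuid-1', 'admin', description='Administrators')
print(xml)
# Expected shape, give or take whitespace:
# <Role><uuid>role-uuid-1</uuid><name>admin</name>
#   <description>Administrators</description><label></label><dbid></dbid></Role>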
Example #27
def index(request):
    # return HttpResponse('Hello from Python!')
    # return render(request, 'index.html' )
    resultsParser = ResultsParser()
    resultsModel = resultsParser.parse('http://cfrsolo2.com/2016/04-17-16-brooksville_fin.htm')
    # return render(request, 'adrian0.html')
    # r = requests.get('http://httpbin.org/status/418')
    # print r.text
    # return HttpResponse('<pre>' + r.text + '</pre>')
    soup = BeautifulSoup()

    new_img_tag = soup.new_tag("img", style='position: absolute; top: 0; right: 0; border: 0;', src="https://camo.githubusercontent.com/e7bbb0521b397edbd5fe43e7f760759336b5e05f/68747470733a2f2f73332e616d617a6f6e6177732e636f6d2f6769746875622f726962626f6e732f666f726b6d655f72696768745f677265656e5f3030373230302e706e67")
    new_a_tag = soup.new_tag("a", href='https://github.com/orozcoadrian/race-graphs')
    new_a_tag.append(new_img_tag)
    soup.append(new_a_tag)

    years = get_years_from_homepage()

    for year in years:
        new_a_tag = soup.new_tag("a", href=year)
        new_a_tag.string = year
        soup.append(new_a_tag)
        new_a_tag.append(soup.new_tag('br'))
    # self.wfile.write(soup.prettify())
    return HttpResponse(soup.prettify())
Example #28
    def __query_params_file(self, rtag, q):

        # queries are in a dict q
        # build the query-param XML and write it out to disk

        soup = BeautifulSoup("<parameters></parameters>", "xml")

        # float n query tags in the soup

        for num in q:
            T_query = soup.new_tag("query")
            T_type = soup.new_tag("type")
            T_type.string = "indri"
            T_number = soup.new_tag("number")
            T_number.string = num
            T_text = soup.new_tag("text")
            T_text.string = "#combine(" + q[num] + ")"
            T_query.append(T_type)
            T_query.append(T_number)
            T_query.append(T_text)
            soup.parameters.append(T_query)

        o_file = os.path.join(self.path["RUNS"], rtag + ".indri")

        # purge the XML declaration introduced by BeautifulSoup and
        # shape it up for Indri to consume

        with open(o_file, "w") as f:
            f.write(self.__shapeup_xml(soup.prettify().split("\n")[1:]))

        return o_file
Example #29
def compareandsave(sampleparagraphs, textparagraphs, q, filepath):
    res = compareParagraps(sampleparagraphs, textparagraphs, q)

    filesp = os.path.splitext(filepath)

    tex = BeautifulSoup(features='xml')
    tex.append(BeautifulSoup.new_tag(tex, 'name'))
    tex.append(BeautifulSoup.new_tag(tex, 'body'))
    for par in res[0]:
        tex.body.append(par)

    html = tex.prettify('utf-8')
    with open(filesp[0] + '_res' + filesp[1], 'wb') as file:
        file.write(html)

    log = BeautifulSoup(features='xml')
    log.append(BeautifulSoup.new_tag(log, 'missing'))
    log.append(BeautifulSoup.new_tag(log, 'errors'))
    for par in res[1]:
        log.errors.append(par)
    for par in res[2]:
        log.missing.append(par)

    l = log.prettify('utf-8')
    # file = open(filesp[0] + '_log' + filesp[1], 'w', encoding='utf-8')
    with open(filesp[0] + '_log' + filesp[1], 'wb') as file:
        file.write(l)
Example #30
def xmlify(filename):
    """
    create an xml representation of the text files
    :param filename: str name of file
    """
    with codecs.open(filename, 'r', 'utf-8') as infile:
        raw_rambam = infile.read()

    chap_index = [getGematria(i.group(1)) for i in re.finditer(ur'@00\u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})', raw_rambam)]
    chapters = re.split(ur'@00\u05e4\u05e8\u05e7 [\u05d0-\u05ea]{1,2}', raw_rambam)[1:]
    assert len(chap_index) == len(chapters)

    soup = BeautifulSoup(u'<root></root>', 'xml')
    for index, chapter in zip(chap_index, chapters):
        x_chapter = soup.new_tag('chapter', num=unicode(index))
        soup.root.append(x_chapter)

        v_indices = [getGematria(i.group(1)) for i in re.finditer(ur'@22([\u05d0-\u05ea]{1,2})', chapter)]
        verses = re.split(ur'@22[\u05d0-\u05ea]{1,2}', chapter)[1:]
        assert len(v_indices) == len(verses)

        for v_index, verse in zip(v_indices, verses):
            x_verse = soup.new_tag('verse', num=unicode(v_index))
            comments = verse.splitlines()
            for i, comment in enumerate(comments[1:]):
                x_comment = soup.new_tag('comment', num=unicode(i+1))
                x_comment.append(comment)
                x_verse.append(x_comment)

            x_chapter.append(x_verse)
    with codecs.open('./xml/{}'.format(filename.replace('.txt', '.xml')), 'w', 'utf-8') as outfile:
        outfile.write(unicode(soup.prettify()))
Example #31
def update_file_meta(abs_path_to_file):
    """
    This function accepts the absolute path to a file that has already been
    verified as containing learning activity metadata.
    It fills several roles:
         - Parse the XML file to determine the activity's <title> value.
         - Fetch new metadata for this activity from the course_metadata dict.
         - Update metadata in the XML file for 3 key tags: <available_at>,
           <due_at>, and <lock_at>.
    """

    global course_metadata, undefined_activities

    # Open the XML file and instantiate Beautiful Soup parsing.
    xml_file = open(abs_path_to_file, mode="rt+", encoding="utf-8")

    # Snag the first line, which contains the good-practice XML declaration.
    # Beautiful Soup erases it.
    xml_declaration = xml_file.readline()
    raw_xml = xml_file.read()
    soup = BeautifulSoup(raw_xml, "xml")

    # Get the learning activity's title from the previous semester. Use it to
    # look up the new metadata, which is stored as a subdict in the dictionary.
    # If no such entry is found, add it to a list (undefined_activities) to be
    # returned and ultimately printed to the user.
    prev_title = soup.title.string
    try:
        new_metadata = course_metadata[prev_title]
    except KeyError:
        undefined_activities.append(prev_title)

        return

    # If a modified title is specified, update it. Otherwise keep the previous
    # title.
    if new_metadata["new_title"]:
        soup.title.string = new_metadata["new_title"]
    else:
        pass

    # If new available, due, or lock times are specified, update them.
    # If not, delete the times that were copied over from the previous
    # semester.
    if new_metadata["new_avail_datetime"]:
        unlock_at_str = format_datetime(new_metadata["new_avail_datetime"])
    else:
        unlock_at_str = ""

    if new_metadata["new_due_datetime"]:
        due_at_str = format_datetime(new_metadata["new_due_datetime"])
    else:
        due_at_str = ""

    if new_metadata["new_lock_datetime"]:
        lock_at_str = format_datetime(new_metadata["new_lock_datetime"])
    else:
        lock_at_str = ""

    tags_to_update = {
        "unlock_at": unlock_at_str,
        "due_at": due_at_str,
        "lock_at": lock_at_str,
        "all_day_date": ""
    }

    for tag in tags_to_update.keys():
        try:
            exec("soup.{}.string = '{}'".format(tag, tags_to_update[tag]))
        except AttributeError:
            # If tag is not present in the soup, add it, then provide the
            # up-to-date string metadata.
            tag_to_add = soup.new_tag(tag)
            soup.contents[0].insert(3, tag_to_add)
            exec("soup.{}.string = '{}'".format(tag, tags_to_update[tag]))

    for tag in tags_to_update.keys():
        try:
            exec("soup.assignment.{}.string = '{}'".format(
                tag, tags_to_update[tag]))
        except AttributeError:
            pass

    # Write the updated XML back to the file. Note that, weirdly, the soup
    # object is a list that always contains exactly 1 entry -- that is,
    # a "tag" object containing updated XML code. Need to convert it to string.
    xml_file.truncate(0)
    xml_file.seek(0)
    # Give the XML declaration back.
    xml_file.write(xml_declaration)
    xml_file.write(str(soup.contents[0]))
    xml_file.close()

    return
Example #32
0
def addToParentIndex(des, tipe, Xrc):
    """
    adds Xpage or Xbook on parent's index
    """
    title = des.split("/")[-1].replace(".html", "")
    index = des.replace(os.path.basename(des), "index.html")
    with open(index, 'r') as f:
        soup = BeautifulSoup(f, "html.parser")
        f.close()
    with open(index, 'w') as f:
        notebook = "/".join(des.split("/")[2:])
        soup.head.title.string = 'TOC of ' + des.split("/")[-1].split(".")[0]
        if tipe == "Xpage":
            tr = soup.new_tag('tr')
            tr["id"] = title
            tr["onclick"] = "window.location.replace('$LINK$'); updateExplorer_IFrame('$LINK$')".replace(
                "$LINK$", '\\\\' + Xrc["gh_repo_name"] + '/' + notebook)
            tr["style"] = "background-color: rgb(55, 57, 58); width: 100vw; box-shadow: gray 2px 2px 2px;"
            th = soup.new_tag('th')
            th["scope"] = "row"
            th["style"] = "border: none; width: 60vw;"
            th.string = title
            td = soup.new_tag('td')
            td["style"] = "border: none; width: 40vw;"
            td.string = datetime.datetime.fromtimestamp(time.time()).strftime(
                "%H:%M.%S|$MONTH$ %d %Y by Xbooks[bot]").replace(
                    "$MONTH$",
                    chooseMonth(
                        datetime.datetime.fromtimestamp(
                            time.time()).strftime("%m")))
            tr.insert(0, td)
            tr.insert(0, th)
            soup.body.select('table')[1].tbody.insert(0, tr)
        if tipe == "Xbook":
            notebook = notebook + "/index.html"
            shutil.copy2(
                des.replace("docs/", "") + "/card.png", des + "/card.png")
            ccc.note("copied " + des.replace("docs/", "") + "/card.png to" +
                     des + "/card.png")
            td = soup.new_tag('td')
            td["id"] = title
            div_wrapper = soup.new_tag('div')
            div_wrapper[
                "onclick"] = "window.location.replace('$LINK$'); updateExplorer_IFrame(\'$LINK$\');".replace(
                    "$LINK$", '\\\\' + Xrc["gh_repo_name"] + '/' + notebook)
            div_wrapper["class"] = "card bg-light mb-3"
            div_wrapper[
                "style"] = "max-width: 20rem; background-color: rgba(39, 39, 39, 0.819) !important; color: rgb(200, 192, 188) !important; border: none; box-shadow: gray 5px 5px 5px;"
            div_head = soup.new_tag('div')
            div_head["class"] = "card-header"
            div_head[
                "style"] = "background-color: rgba(37, 37, 37, 0.877) !important; border: none; color: white;"
            div_head.string = title
            div_wrapper.insert(0, div_head)
            if os.path.exists(des.replace("docs/", "") + "/card.png"):
                div_body = soup.new_tag('div')
                div_body["class"] = "card-body"
                img = soup.new_tag('img')
                img["style"] = "height: 256px; width: 256px; display: block; filter: saturate(0.7) brightness(0.5);"
                img["src"] = title + "/card.png"
                div_body.insert(0, img)
                div_wrapper.insert(-1, div_body)
            td.insert(0, div_wrapper)
            if len(soup.table.tr.select("td")) == 3:
                tr = soup.new_tag('tr')
                soup.table.insert(0, tr)
            soup.table.tr.insert(0, td)
        f.write(soup.prettify(formatter="html"))
        f.close()
    ccc.success("adding " + des + " to parent index")
Example #33
        if re.search('^\/', form.get('action')):
            form['action'] = stype + '://' + hostget + form.get('action')
        else:
            form['action'] = stype + '://' + hostget + '/' + form.get('action')
    else:
        form['action'] = stype + '://' + hostget + '/' + form.get(
            'action').split('/', 1)[1].split('/', 1)[1].split('/', 1)[1]

    # If the autofill feature was enabled, then open the autofill file
    # and inject each hidden autofill input type into the form
    if args['autofill'] is not None:
        autofile = open(args['autofill'], 'r')

        for line in autofile:
            if re.match('^#', line) is None:
                autoform = soup.new_tag(line)
                form.insert(0, autoform)

        autofile.close()

phishHtml = soup.prettify(formatter="html")

# BeautifulSoup fixes broken HTML, I do not want this to happen for &amp;
phishHtml = re.sub('&amp;', '&', phishHtml)

# Rewrite CSS url(), first look for all matches
urlCSS = re.findall(r'url\((.*)\)', phishHtml)

# Loop through matches and replace
for urls in urlCSS:
    if re.search('\)', urls):
Example #34
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    def __init__(self,
                 namespaceHTMLElements,
                 soup=None,
                 store_line_numbers=True,
                 **kwargs):
        if soup:
            self.soup = soup
        else:
            from bs4 import BeautifulSoup
            # TODO: Why is the parser 'html.parser' here? To avoid an
            # infinite loop?
            self.soup = BeautifulSoup("",
                                      "html.parser",
                                      store_line_numbers=store_line_numbers,
                                      **kwargs)
        # TODO: What are **kwargs exactly? Should they be passed in
        # here in addition to/instead of being passed to the BeautifulSoup
        # constructor?
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

        # This will be set later to an html5lib.html5parser.HTMLParser
        # object, which we can use to track the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers

    def documentClass(self):
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        kwargs = {}
        if self.parser and self.store_line_numbers:
            # This represents the point immediately after the end of the
            # tag. We don't know when the tag started, but we do know
            # where it ended -- the character just before this one.
            sourceline, sourcepos = self.parser.tokenizer.stream.position()
            kwargs['sourceline'] = sourceline
            kwargs['sourcepos'] = sourcepos - 1
        tag = self.soup.new_tag(name, namespace, **kwargs)

        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        from bs4 import BeautifulSoup
        # TODO: Why is the parser 'html.parser' here? To avoid an
        # infinite loop?
        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return treebuilder_base.TreeBuilder.getFragment(self).element

    def testSerializer(self, element):
        from bs4 import BeautifulSoup
        rv = []
        doctype_re = re.compile(
            r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

        def serializeElement(element, indent=0):
            if isinstance(element, BeautifulSoup):
                pass
            if isinstance(element, Doctype):
                m = doctype_re.match(element)
                if m:
                    name = m.group(1)
                    if m.lastindex > 1:
                        publicId = m.group(2) or ""
                        systemId = m.group(3) or m.group(4) or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                  (' ' * indent, name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (' ' * indent, ))
            elif isinstance(element, Comment):
                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
            elif isinstance(element, NavigableString):
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                if element.namespace:
                    name = "%s %s" % (prefixes[element.namespace],
                                      element.name)
                else:
                    name = element.name
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.attrs:
                    attributes = []
                    for name, value in element.attrs.items():
                        if isinstance(name, NamespacedAttribute):
                            name = "%s %s" % (prefixes[name.namespace],
                                              name.name)
                        if isinstance(value, list):
                            value = " ".join(value)
                        attributes.append((name, value))

                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' *
                                                  (indent + 2), name, value))
                indent += 2
                for child in element.children:
                    serializeElement(child, indent)

        serializeElement(element, 0)

        return "\n".join(rv)
Example #35
def main() -> int:
    """
    Entry point for the executable.
    """

    parser = argparse.ArgumentParser(
        description=
        "Build the Standard Ebooks Manual of Style from a set of .rst files.")
    parser.add_argument(
        "source_directory",
        metavar="SOURCE_DIRECTORY",
        help=
        "a directory containing .rst files comprising the Standard Ebooks Manual of Style"
    )
    parser.add_argument("dest_directory",
                        metavar="DEST_DIRECTORY",
                        help="a directory to place the output .php files")
    args = parser.parse_args()

    return_code = 0

    if not os.path.isdir(args.source_directory):
        print(f"Not a directory: `{args.source_directory}`")
        return 1

    if not os.path.isdir(args.dest_directory):
        print(f"Not a directory: `{args.dest_directory}`")
        return 1

    toc = []

    header_path = Path(args.source_directory) / "templates" / "header.html"
    footer_path = Path(args.source_directory) / "templates" / "footer.html"

    try:
        with open(header_path, "r", encoding="utf-8") as file:
            header_html = file.read()
    except:
        print(f"Couldn’t open `{header_path}`")
        return 1

    try:
        with open(footer_path, "r", encoding="utf-8") as file:
            footer_html = file.read()
    except:
        print(f"Couldn’t open `{footer_path}`")
        return 1

    with tempfile.TemporaryDirectory() as work_directory:
        for filename in os.listdir(args.source_directory):
            if not filename.endswith(".rst"):
                continue

            with open(Path(args.source_directory) / filename,
                      "r",
                      encoding="utf-8") as file:
                rst = file.read()

            # Add our special RST roles to the top of the file before processing
            rst = RST_ROLES + rst

            result = subprocess.run(["rst2html5"],
                                    input=rst.encode(),
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    check=False)

            errors = result.stderr.decode().strip()
            if errors:
                print(filename)
                # Because we add the RST roles to the top of the file, we have to subtract those lines to get the
                # *real* line number in the RST file that the error occurs in.
                errors = regex.sub(
                    "<stdin>:([0-9]+)", lambda exp: "\tLine {}".format(
                        int(exp.groups()[0]) - RST_ROLES_LINE_COUNT),
                    errors).rstrip()
                print(errors)
                return_code = 1

            html = result.stdout.decode().strip()

            matches = regex.findall(r"<h1>(.+?)</h1>", html)
            if matches:
                title = matches[0]

            # Remove empty spans
            html = regex.sub(r"<span>[^>]*?</span>",
                             "",
                             html,
                             flags=regex.DOTALL)

            # SE extension: :italics:`abc <def>` will generate a link like so: <i><a href="def">abc</a></i>
            html = regex.sub(r"<em class=\"i\">([^>]+?) &lt;([^<]+?)&gt;</em>",
                             r"""<i><a href="\2">\1</a></i>""", html)

            # SE extension: change <em class="i"> to <i>
            html = regex.sub(r"<em class=\"i\">([^<]+?)</em>", r"<i>\1</i>",
                             html)

            # Change :ws: and :utf: markers to <span>s
            html = regex.sub(r":(ws|utf):`([^`]+?)`",
                             r"""<span class="\1">\2</span>""", html)

            # Remove comments
            html = regex.sub(r"<!--.+?-->", "", html)

            # Pygments doesn't add colors to html that is just a namespaced attribute, like :html:`xml:lang`. Add that here.
            html = regex.sub(
                r"""<code class="html">([a-zA-Z\-:]+?)</code>""",
                r"""<code class="html"><span class="na">\1</span></code>""",
                html)

            root_number = None
            matches = regex.findall(r"^([0-9]+)\-", filename)
            if matches:
                root_number = matches[0]

            # Now we have some cleaned up HTML.
            # Start parsing the various <section> and <ol> elements to number them.
            soup = BeautifulSoup(html, "html.parser")

            if root_number:
                # Set the ID on the top-level manual section
                top_level_section = soup.select("body > section")[0]

                top_level_section["id"] = root_number

                # Do the actual numbering
                process_ids(top_level_section, root_number, 1)

                # Record the number and its h2 children in the ToC
                toc_item = TocItem(root_number, title,
                                   filename.replace(".rst", ""))
                for header in soup.select("h2"):
                    toc_item.items.append(
                        TocItem(header.parent["id"], header.text, None))

                toc.append(toc_item)

            # rst2html5 doesn't wrap the first child of <li> elements in <p>.
            # Try to do that here.
            for li_item in soup.select("li"):
                need_wrapping = []
                for elem in li_item.contents:
                    if isinstance(elem, NavigableString
                                  ) or elem.name not in BLOCK_LEVEL_ELEMENTS:
                        need_wrapping.append(elem)

                    if elem.name in BLOCK_LEVEL_ELEMENTS:
                        break

                if need_wrapping:
                    new_tag = soup.new_tag("p")

                    for elem in need_wrapping:
                        new_tag.append(elem)

                    li_item.insert(0, new_tag)

            # Now that we've got our structure done, insert <aside>s that have the section numbers in them.
            for elem in soup.find_all(
                    "", attrs={"id": regex.compile(r"^[0-9\.]+$")}):
                aside = soup.new_tag("aside")
                aside["class"] = "number"

                # Add a link to the section within the section <aside>, but only if it is not the main section number (like "2" or "8")
                if regex.match(r"^[0-9]$", elem["id"]):
                    aside.string = elem["id"]
                else:
                    link = soup.new_tag("a")
                    link["href"] = f"#{elem['id']}"
                    link.string = elem["id"]
                    aside.insert(0, link)

                elem.insert(0, aside)

            html = str(soup)

            # Now that we've added IDs and <aside>s, remove the now-unnecessary "no-numbering" class
            html = html.replace(" class=\"no-numbering\"", "")

            # Add a <b> around the first word in a bash command, to highlight it.
            html = regex.sub(r"<code class=\"bash\">([a-z]+) ",
                             r"""<code class="bash"><b>\1</b> """, html)

            # Add syntax highlighting around value strings
            html = regex.sub(
                r"<code class=\"value\">([^<]+?)</code>",
                r"""<code class="bash"><span class="s">\1</span></code>""",
                html)

            # Remove everything up to and including the body element so that we can add our own headers and footers
            html = regex.sub(r".+?<body>", "", html, flags=regex.DOTALL)
            html = regex.sub(r"</body>.*", "", html, flags=regex.DOTALL)

            # If we use CSS properties like -epub-hyphens, the colorizer considers them errors and adds error coloring. Remove that here.
            html = regex.sub(
                r"""<span class="err">-</span><span class="n">(.+?)</span>""",
                r"""<span class="k">-\1</span>""", html)

            # Convert spaces to tabs
            html = regex.sub(r"    ", "\t", html)

            # Add PHP headers and footers
            html = header_html + html + footer_html

            # Replace <pre> with <figure>.
            # Do this last, because editing with BS4 and pretty printing can muck up
            # spacing in <pre> elements if the elements are removed early
            html = regex.sub(r"<pre data-language=\"([^\"]+?)\">",
                             r"""<figure><code class="\1 full">""", html)
            html = regex.sub(
                r"<pre class=\"([^\"]+?)\" data-language=\"([^\"]+?)\">",
                r"""<figure class="\1"><code class="\2 full">""", html)
            html = regex.sub(
                r"<pre data-language=\"([^\"]+?)\" class=\"([^\"]+?)\">",
                r"""<figure class="\2"><code class="\1 full">""", html)
            html = regex.sub(r"</pre>", r"</code></figure>", html)

            # Fill in <title> elements
            if filename == "index.rst":
                version = regex.findall(r"\.\. version: (.+)", rst)[0]
                html = regex.sub(r"MANUAL_TITLE", "The Standard Ebooks Manual",
                                 html)
                html = regex.sub(r"<section id=\".+?\"", r"<section", html)
            else:
                html = regex.sub(
                    r"MANUAL_TITLE",
                    f"{root_number}. {title} - The Standard Ebooks Manual",
                    html)

            with open(Path(work_directory) / filename.replace(".rst", ".php"),
                      "w",
                      encoding="utf-8") as file:
                file.write(html)
                file.truncate()

        # Now, generate the ToC
        toc = natsorted(toc, key=lambda x: x.number)
        toc_html = f"<nav><p><a href=\"/manual/{version}\">The Standard Ebooks Manual of Style</a></p><ol>"
        for toc_item in toc:
            toc_html += f"<li><p><a href=\"/manual/{version}/{toc_item.filename}\">{toc_item.number}. {escape(toc_item.title)}</a></p><ol>"
            for sub_item in toc_item.items:
                toc_html += f"<li><p><a href=\"/manual/{version}/{toc_item.filename}#{sub_item.number}\">{sub_item.number} {escape(sub_item.title)}</a></p></li>"
            toc_html += "</ol></li>"
        toc_html += "</ol></nav>"

        # Place the ToC and version number into the final files
        for filename in os.listdir(work_directory):
            if not filename.endswith(".php"):
                continue

            with open(Path(work_directory) / filename, "r",
                      encoding="utf-8") as file:
                html = file.read()
                html = html.replace("VERSION", version)

                if filename != "index.php":
                    html = regex.sub(r"<main(.+?)>", fr"<main\1>{toc_html}",
                                     html)

            # Check if pygments generated any errors (for example, missing quotes in an HTML attribute)
            if "class=\"err\"" in html:
                print(
                    f"Error colorized code in `{filename}`. Search the file for `class=\"err\"`."
                )

            with open(Path(args.dest_directory) / filename,
                      "w",
                      encoding="utf-8") as file:
                file.write(html)
                file.truncate()

    return return_code
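
# A minimal stand-alone sketch (with an assumed sample string) of what the
# <pre data-language="..."> to <figure><code class="... full"> rewrites above do:
import regex
sample = '<pre data-language="css" class="full">p{color:red;}</pre>'
sample = regex.sub(r'<pre data-language="([^"]+?)" class="([^"]+?)">',
                   r'<figure class="\2"><code class="\1 full">', sample)
sample = regex.sub(r'</pre>', r'</code></figure>', sample)
print(sample)  # <figure class="full"><code class="css full">p{color:red;}</code></figure>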
Example #36
0
def genSelectBox(df, session_state):
    """ 
    This function generates select boxes for choosing the school network

    Parameters: 
        df (type): 2019 school census dataframe
        session_state (type): section dataset
        user_analytics (type): user data by amplitude
    """

    st.write(
        f"""
        <div class="main-padding" id="top">
            <div class="subtitle-section"> Selecione sua rede </div>
        </div>
        """,
        unsafe_allow_html=True,
    )
    col1, col2, col3, col4 = st.beta_columns([0.3, 0.5, 0.5, 1])

    with col1:
        session_state.state_id = st.selectbox("Estado",
                                              utils.filter_place(df, "state"))
        session_state.state_name = utils.set_state_name(
            df, session_state.state_id)
    with col2:
        options_city_name = utils.filter_place(df,
                                               "city",
                                               state_id=session_state.state_id)
        options_city_name = pd.DataFrame(data=options_city_name,
                                         columns=["city_name"])
        x = int(options_city_name[options_city_name["city_name"] ==
                                  "Todos"].index.tolist()[0])
        session_state.city_name = st.selectbox("Município",
                                               options_city_name,
                                               index=x)
        import pathlib
        from bs4 import BeautifulSoup
        GA_JS = ("""
        window.dataLayer = window.dataLayer || [];
        function municipio(){dataLayer.push('municipio_value': '%s');}
        """ % session_state.city_name)
        index_path = pathlib.Path(st.__file__).parent / "static" / "index.html"
        soup = BeautifulSoup(index_path.read_text())
        script_tag_loader = soup.new_tag("script")
        script_tag_loader.string = GA_JS
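        # Hypothetical completion (not in the original snippet): the <script> tag is
        # built above but never attached or saved; injecting it would look like this.
        soup.head.append(script_tag_loader)
        index_path.write_text(str(soup))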
    with col3:
        options_adiminlevel = utils.filter_place(
            df,
            "administrative_level",
            state_id=session_state.state_id,
            city_name=session_state.city_name,
        )
        options_adiminlevel = pd.DataFrame(data=options_adiminlevel,
                                           columns=["adiminlevel"])
        y = int(options_adiminlevel[options_adiminlevel["adiminlevel"] ==
                                    "Todos"].index.tolist()[0])
        session_state.administrative_level = st.selectbox(
            "Nível de Administração", options_adiminlevel, index=y)
    with col4:
        st.write(
            f"""
        <div class="container main-padding">
            <br><br>
        </div>
        """,
            unsafe_allow_html=True,
        )
Example #37
0
def cleanup(html_file):
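    # Note: rstrip() strips a *set of characters*, not a suffix; this only yields the
    # node number because the basenames are purely numeric before ".html.zst".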
    node = int(os.path.basename(html_file).rstrip(".html.zst"))
    if node in skipNodes:
        print(f"Skipping ignored node {node}")
        return

    if (html_file.endswith('.zst')):
        with open(html_file, 'rb') as doc:
            soup = BeautifulSoup(zstd.decompress(doc.read()), 'lxml')
    else:
        with open(html_file, 'r') as doc:
            soup = BeautifulSoup(doc.read(), 'lxml')

    # Set title to entry name
    soup.title.string = soup.title.string.split('|')[1].strip()

    # Remove soft-hypens
    soup.title.string = soup.title.string.replace(u'\xad', '')

    # The 'revision' link is correct for all nodes, but 'canonical'
    # is only correct for most of them.
    docpath = soup.find('link', rel='revision')

    if not docpath:
        docpath = soup.find('link', rel='canonical')

    if not docpath:
        print(f'Failed to parse document name: {html_file}')
        return

    url = urllib.parse.unquote(docpath['href'])
    with open('seen-urls.txt', 'a') as f:
        f.write(f'{url} {os.path.basename(html_file)}\n')

    subpath = urllib.parse.urlparse(url).path
    ignore = True
    for path in allowed_paths.keys():
        if path in subpath:
            ignore = False
            break

    if not ignore:
        out = f'{subpath[1:]}.html'
    else:
        with open('ignored-links.txt', 'a') as f:
            f.write(f'{url} - {os.path.basename(html_file)}\n')
        return

    if os.path.exists(out):
        print(f'{out} already exists, renaming to {subpath[1:]}_2.html')
        out = f'{subpath[1:]}_2.html'

    # ads
    [
        div.decompose()
        for div in soup.find_all('aside', class_=lambda x: x != 'rule__note')
    ]

    # Cookie notice crap
    [div.decompose() for div in soup.find_all('div', class_='cookie-notice')]

    # IE stuff
    [
        comment.extract() for comment in soup.findAll(
            text=lambda text: isinstance(text, Comment))
    ]

    # header
    [div.decompose() for div in soup.find_all('div', 'tabloid__masthead')]

    # search-box
    [div.decompose() for div in soup.find_all('div', 'form-asap')]

    # Gizmo?
    [div.decompose() for div in soup.find_all('nav', 'gizmo')]

    # Footer
    [
        div.decompose()
        for div in soup.find_all('div', 'tabloid__footer-top-line')
    ]
    [div.decompose() for div in soup.find_all('div', 'tabloid__footer-top')]
    [div.decompose() for div in soup.find_all('div', 'tabloid__footer-bottom')]

    # all scripts
    [div.decompose() for div in soup.find_all('script')]

    [div.decompose() for div in soup.find_all('form')]
    [div.decompose() for div in soup.find_all('meta')]
    [div.decompose() for div in soup.find_all('style')]

    # Any css and co.
    [div.decompose() for div in soup.find_all('noscript')]
    [div.decompose() for div in soup.find_all('link')]

    # Duden mentor
    [div.decompose() for div in soup.find_all('div', {"id": "block-premium"})]

    # Add UTF-8 charset
    new_tag = soup.new_tag('meta', charset='utf-8')
    soup.head.append(new_tag)

    # Fix view on mobile
    new_tag = soup.new_tag('meta',
                           attrs={
                               'name': 'viewport',
                               'content':
                               'width=device-width, initial-scale=1.0'
                           })
    soup.head.append(new_tag)

    # Disable any referrers
    new_tag = soup.new_tag('meta',
                           attrs={
                               'name': 'referrer',
                               'content': 'no-referrer'
                           })
    soup.head.append(new_tag)

    # Fixup breadcrumb
    for link in soup.find_all('a', class_='breadcrumb__crumb'):
        if link['href'] == '/':
            link.nextSibling.decompose()
            link.decompose()
            break

    # Add bundle.min.css
    new_tag = soup.new_tag('link',
                           href='/css/bundle.min.css',
                           media='all',
                           rel='stylesheet')
    soup.head.append(new_tag)

    for img in soup.find_all('img'):
        dest = urllib.parse.urlparse(img['src'])
        with open('media.txt', 'a') as f:
            f.write(f"{dest.scheme}://{dest.netloc}{dest.path}\n")
            img['src'] = f'{dest.path}'

    contains_audio = False
    contains_notation = False
    for link in soup.find_all('a'):

        if 'href' not in link.attrs:
            # <a id="real, Realität" name="real, Realität">real, Realität</a>
            continue

        dest = urllib.parse.unquote(link['href'])
        dest = urllib.parse.urlparse(dest)
        is_media = False

        if 'data-duden-ref-type' in link.attrs:
            if link['data-duden-ref-type'] in ['audio', 'image']:
                with open('media.txt', 'a') as f:
                    f.write(f"{dest.scheme}://{dest.netloc}{dest.path}\n")
                link['href'] = f'{dest.path}'
                is_media = True

                if link['data-duden-ref-type'] == 'audio':
                    contains_audio = True

        if (not is_media) and (not link['href'].startswith('#')):
            if dest.netloc and ('duden.de' not in dest.netloc):
                with open('external-links.txt', 'a') as f:
                    f.write(f"{link['href']}\n")
                continue

            # This can still be audio, e.g. grammatik-randummern-*
            if ('class' in link.attrs) and ('notation__audio'
                                            in link['class']):
                with open('media.txt', 'a') as f:
                    f.write(f"{dest.scheme}://{dest.netloc}{dest.path}\n")
                link['href'] = f'{dest.path}'
                contains_notation = True
            elif dest.fragment:
                link['href'] = f'{dest.path}.html#{dest.fragment}'
            else:
                link['href'] = f'{dest.path}.html'

    if contains_audio:
        new_tag = soup.new_tag('script', src='/js/pronunciation-guide.js')
        soup.body.append(new_tag)
    elif contains_notation:
        for js in ["notation", "picturefill"]:
            new_tag = soup.new_tag('script', src=f'/js/{js}.js')
            soup.body.append(new_tag)

    with open(out, 'w') as f:
        f.write(str(soup))
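
# Hypothetical driver (not part of the original listing): cleanup() is written to be
# mapped over a dump of *.html.zst files, so an entry point could look like this.
if __name__ == '__main__':
    import glob
    for path in glob.glob('dump/*.html.zst'):  # 'dump/' is an assumed location
        cleanup(path)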
Example #38
0
from bs4 import BeautifulSoup


def get_context(path):
    with open(path, 'r') as file:
        return file.read()


if __name__ == '__main__':
    content = get_context('..\\requests\\econpy.html')
    bs = BeautifulSoup(content, 'html.parser')
    a = bs.new_tag('a', href='https://github.com/gabriel-acuna')
    a.string = 'Github profile'
    new_tag = bs.new_tag('div', title='site-data', id='i001', class_='info')
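    # Note (assumption about bs4 behaviour): unlike the search API, new_tag() does not
    # special-case the class_ keyword, so the line above creates a literal "class_"
    # attribute; passing attrs={'class': 'info'} sets an HTML class unambiguously.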
    new_tag.append('\n')
    new_tag.append(a)
    new_tag.append('\n')
    # append(): add the element at parent element end
    bs.body.append(new_tag)
    #insert()
    bs.body.insert(1, new_tag)
    print(bs.body)
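
# A minimal follow-up sketch (hypothetical markup, reusing the BeautifulSoup import
# above): a Tag object can only live in one place, so append() followed by insert()
# with the same tag moves it rather than duplicating it.
demo = BeautifulSoup('<body><p>first</p></body>', 'html.parser')
extra = demo.new_tag('div', id='i001')
demo.body.append(extra)     # extra becomes the last child of <body>
demo.body.insert(1, extra)  # the same tag is moved, not copied
print(demo.body)            # the <div> appears exactly once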
Example #39
0
async def wiki(message,
               fname,
               url="https://{lang}.wikipedia.org/w/api.php",
               query=None,
               lang=None,
               lurk=False,
               prefix="w",
               **kwargs):
    w = Wikipya(url=url, lang=lang, lurk=lurk, **kwargs)

    try:
        if query is None:
            command, query = message.text.split(maxsplit=1)

        page, image, url = await w.get_all(
            query,
            lurk,
            blocklist=WIKIPYA_BLOCKLIST,
            img_blocklist=kwargs.get("img_blocklist") or (),
            prefix=prefix)
        text = fixWords(page.parsed)

    except NotFound:
        await message.reply(_("errors.not_found"))
        return

    except ValueError:
        await message.reply(_("errors.enter_wiki_query").format(message.text),
                            parse_mode="Markdown")
        return

    soup = BeautifulSoup(text, "lxml")

    i = soup.find_all("i")
    b = soup.find_all("b")

    if len(i) != 0:
        i[0].unwrap()

    if len(b) != 0:
        if url is not None:
            b = b[0]
            b.name = "a"
            b["href"] = url
            b = b.wrap(soup.new_tag("b"))

    text = unbody(soup)

    try:
        if image != -1:
            cropped = cuteCrop(text, limit=1024)

            if cropped == "":
                cropped = text[:1024]

            await bot.send_chat_action(message.chat.id, "upload_photo")
            await message.reply_photo(image,
                                      caption=cropped,
                                      parse_mode="HTML")
        else:
            await message.reply(cuteCrop(text, limit=4096),
                                parse_mode="HTML",
                                disable_web_page_preview=True)

    except Exception as e:
        await message.reply(bold(_("errors.error")) + "\n" + code(e),
                            parse_mode="HTML")
        await message.answer(cuteCrop(text, limit=4096),
                             disable_web_page_preview=True)
Example #40
0
 def Items(self, opts=None):
     """
     生成器,返回一个元组
     对于HTML:section,url,title,content
     对于图片,mime,url,filename,content
     """
     cnt4debug = 0
     decoder = AutoDecoder(False)
     timeout = self.timeout
     for section, url in self.feeds:
         cnt4debug += 1
         if IsRunInLocal and cnt4debug > 1:
             break
         
         opener = URLOpener(self.host, timeout=timeout)
         result = opener.open(url)
         status_code, content = result.status_code, result.content
         if status_code != 200 or not content:
             self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
             continue
         
         if self.page_encoding:
             content = content.decode(self.page_encoding)
         else:
             content = decoder.decode(content,url)
         
         content =  self.preprocess(content)
         soup = BeautifulSoup(content, "lxml")
         
         try:
             title = soup.html.head.title.string
         except AttributeError:
             self.log.warn('object soup invalid!(%s)'%url)
             continue
         
         title = self.processtitle(title)
         
         if self.keep_only_tags:
             body = soup.new_tag('body')
             try:
                 if isinstance(self.keep_only_tags, dict):
                     self.keep_only_tags = [self.keep_only_tags]
                 for spec in self.keep_only_tags:
                     for tag in soup.find('body').find_all(**spec):
                         body.insert(len(body.contents), tag)
                 soup.find('body').replace_with(body)
             except AttributeError: # soup has no body element
                 pass
         
         def remove_beyond(tag, next):
             while tag is not None and getattr(tag, 'name', None) != 'body':
                 after = getattr(tag, next)
                 while after is not None:
                     ns = getattr(tag, next)
                     after.decompose()
                     after = ns
                 tag = tag.parent
         
         if self.remove_tags_after:
             rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after
             for spec in rt:
                 tag = soup.find(**spec)
                 remove_beyond(tag, 'next_sibling')
         
         if self.remove_tags_before:
             tag = soup.find(**self.remove_tags_before)
             remove_beyond(tag, 'previous_sibling')
         
         remove_tags = self.insta_remove_tags + self.remove_tags
         remove_ids = self.insta_remove_ids + self.remove_ids
         remove_classes = self.insta_remove_classes + self.remove_classes
         remove_attrs = self.insta_remove_attrs + self.remove_attrs
         for tag in soup.find_all(remove_tags):
             tag.decompose()
         for id in remove_ids:
             for tag in soup.find_all(attrs={"id":id}):
                 tag.decompose()
         for cls in remove_classes:
             for tag in soup.find_all(attrs={"class":cls}):
                 tag.decompose()
         for attr in remove_attrs:
             for tag in soup.find_all(attrs={attr:True}):
                 del tag[attr]
         for tag in soup.find_all(attrs={"type":"text/css"}):
             tag.decompose()
         for cmt in soup.find_all(text=lambda text:isinstance(text, Comment)):
             cmt.extract()
         
         if self.keep_image:
             self.soupbeforeimage(soup)
             for img in soup.find_all('img',attrs={'src':True}):
                 imgurl = img['src']
                 if img.get('height') in ('1','2','3','4','5') \
                     or img.get('width') in ('1','2','3','4','5'):
                     self.log.warn('img size too small,take away it:%s' % imgurl)
                     img.decompose()
                     continue
                 if not imgurl.startswith('http'):
                     imgurl = urlparse.urljoin(url, imgurl)
                 if self.fetch_img_via_ssl and url.startswith('https://'):
                     imgurl = imgurl.replace('http://', 'https://')
                 if self.isfiltered(imgurl):
                     self.log.warn('img filtered:%s' % imgurl)
                     img.decompose()
                     continue
                 imgresult = opener.open(imgurl)
                 imgcontent = process_image(imgresult.content,opts) if imgresult.status_code==200 else None
                 if imgcontent:
                     imgtype = imghdr.what(None, imgcontent)
                     if imgtype:
                         imgmime = r"image/" + imgtype
                         fnimg = "%d.%s" % (random.randint(10000,99999999), 'jpg' if imgtype=='jpeg' else imgtype)
                         img['src'] = fnimg
                         yield (imgmime, imgurl, fnimg, imgcontent, None)
                     else:
                         img.decompose()
                 else:
                     self.log.warn('fetch img failed(err:%d):%s' % (imgresult.status_code,imgurl))
                     img.decompose()                
         else:
             for img in soup.find_all('img'):
                 img.decompose()
         
         self.soupprocessex(soup)
         content = unicode(soup)
         
          # Take the first part of the article content as the brief (summary)
         brief = u''
         if GENERATE_TOC_DESC:
             body = soup.find('body')
              for h1 in body.find_all('h1'): # remove H1 to avoid duplicating the title
                 h1.decompose()
             for s in body.stripped_strings:
                 brief += unicode(s) + u' '
                 if len(brief) >= TOC_DESC_WORD_LIMIT:
                     brief = brief[:TOC_DESC_WORD_LIMIT]
                     break
         soup = None
         
         content =  self.postprocess(content)
         yield (section, url, title, content, brief)
Example #41
0
import os
import markdown
from bs4 import BeautifulSoup

SOURCE_FILE = os.path.join(os.path.dirname(__file__), '..', 'README.md')
DEST_PATH = os.path.join(os.path.dirname(__file__), '..', 'build')

if not os.path.exists(DEST_PATH):
    os.makedirs(DEST_PATH)

with open(SOURCE_FILE, 'r') as source:
    html = markdown.markdown(source.read())
    soup = BeautifulSoup(html, 'html.parser')
    # Reconstruct title
    new_title = soup.new_tag('p')
    new_title.string = "😱 A dark theme for JetBrains IDEs"
    soup.find('h1').replace_with(new_title)
    # Remove badges
    blockquote_h2 = soup.find('blockquote')
    blockquote_h2.find_next_sibling('p').decompose()
    blockquote_h2.decompose()
    # Set image widths
    for img in soup.find_all('img'):
        img['width'] = '700'
    # Add margin above images
    for img in soup.find_all('img'):
        img.insert_before(soup.new_tag('br'))
    # Remove installation
    installation_h2 = soup.find('h2', text='Installation')
    installation_h2.find_next('ol').decompose()
    installation_h2.decompose()
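
    # Hypothetical continuation (not part of the original snippet): DEST_PATH is
    # created above but nothing is ever written to it; a plausible final step:
    with open(os.path.join(DEST_PATH, 'README.html'), 'w') as dest:  # assumed name
        dest.write(str(soup))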
Example #42
0
def docx_to_html(fonts,jumpiness,word_rotation,width_shift,height_shift,rotace,table_header=True):
    #change all the html files
    for filee in os.listdir("data\\converted\\docx"):
        filee_converted = "data\\converted\\docx\\" + filee
        filee_dest = "data\\done\\" + filee
        #find text and change it
        with open(filee_converted,"r",encoding="utf-8") as f:
            result = f.read()
            result = result.replace("<em>","")
            result = result.replace("</em>","")
            result = result.replace("<strong>","")
            result = result.replace("</strong>","")
            result = result.replace("<li>","<p>")
            result = result.replace("</li>","</p>")
            result = result.replace("</li>","</p>")
            result = result.replace("`","&nbsp;")
            if (not table_header):
                result = result.replace("th","td")
                result = result.replace("<thead>","")
                result = result.replace("</thead>","")
                result = result.replace("<tbody>","")
                result = result.replace("</tbody>","")
                result = result.replace("header","even")

            soup = BeautifulSoup(result,"html.parser")

            #align on lined paper
            soup.append(soup.new_tag('style', type='text/css'))
            soup.style.append('body{margin-left:' + str(margin_left) +'cm; line-height:7mm; color:#000F55; word-spacing: 0.25cm;} p{margin:0px;} td:nth-child(even) {padding-right:80px;} td:nth-child(odd) {padding-right:30px;} th {font-weight: normal;} td {padding-top: 0; padding-bottom: 0;} th:nth-child(even) {padding-right:55px;} th:nth-child(odd) {padding-right:30px;}') # 1 inch top offset in Chrome /// line-height:7.83mm (squared paper) /// line-height:6.83mm (lined paper)

            #style the letters in each paragraph
            for p in soup.find_all("p"):
                p["style"] = "margin:0px 0px {1}cm {0}px;transform:rotate({2}deg);".format(randrange(width_shift[0],width_shift[1]),0,randrange(rotace[0],rotace[1]))
                #randomize the letters
                line = p.decode_contents()
                res = ""
                i = 0
                while i < len(line):    
                    if (line[i:i + 1] == " "):
                        res += line[i:i + 1]
                    elif (unidecode(line[i:i + 1]) == unidecode("")):
                        res += " "
                    elif (line[i:i + 5] == "<span" or line[i:i + 6] == "</span") or (line[i:i + 2] == "<p" or line[i:i + 3] == "</p"):
                        while line[i:i + 1] != ">":
                            res += line[i:i + 1]
                            i += 1
                        res += ">"
                    elif (line[i:i+1] == "^"):
                        res += "<span style='font-family:{0};top:{1}px;font-size:100%;transform:skewY({2}deg)'>{3}</span>".format("mv boli",randrange(jumpiness[0],jumpiness[1]),randrange(word_rotation[0],word_rotation[1]),line[i:i+1])
                    else:
                        word = ["<span style='font-family:{0};top:{1}px;font-size:170%;transform:skewY({2}deg)'>".format(choice(fonts),randrange(jumpiness[0],jumpiness[1]),randrange(word_rotation[0],word_rotation[1])),"</span>"]
                        res += word[0] + line[i:i + 1] + word[1]
                    i += 1
                p.string = res
            
            #style the letters in tables
            for t in soup.find_all("table"):
                #th
                for th in t.find_all("th"):
                    #randomize the letters
                    line = th.decode_contents()
                    res = ""
                    i = 0
                    while i < len(line):
                        if (line[i:i + 1] == " "):
                            res += line[i:i + 1]
                        elif (unidecode(line[i:i + 1]) == unidecode("")):
                            res += " "
                        elif (line[i:i + 5] == "<span" or line[i:i + 6] == "</span"):
                            while line[i:i + 1] != ">":
                                res += line[i:i + 1]
                                i += 1
                            res += ">"
                        else:
                            word = ["<span style='font-family:{0};top:{1}px;font-size:170%;transform:skewY({2}deg)'>".format(choice(fonts),randrange(jumpiness[0],jumpiness[1]),randrange(word_rotation[0],word_rotation[1])),"</span>"]
                            res += word[0] + line[i:i + 1] + word[1]
                        i += 1
                    th.string = res
                
                #td
                for td in t.find_all("td"):
                    #random left offset
                    td["style"] = "padding-left:%spx;" % randrange(0,8)
                    #randomize the letters
                    line = td.decode_contents()
                    res = ""
                    i = 0
                    while i < len(line):
                        if (line[i:i + 1] == " "):
                            res += line[i:i + 1]
                        elif (unidecode(line[i:i + 1]) == unidecode("")):
                            res += " "
                        elif (line[i:i + 5] == "<span" or line[i:i + 6] == "</span"):
                            while line[i:i + 1] != ">":
                                res += line[i:i + 1]
                                i += 1
                            res += ">"
                        else:
                            word = ["<span style='font-family:{0};top:{1}px;font-size:170%;transform:skewY({2}deg)'>".format(choice(fonts),randrange(jumpiness[0],jumpiness[1]),randrange(word_rotation[0],word_rotation[1])),"</span>"]
                            res += word[0] + line[i:i + 1] + word[1]
                        i += 1
                    td.string = res

        #write new file
        with open(filee_dest,"w",encoding="utf-8") as f:
            soup = str(soup).replace("&lt;","<")
            soup = soup.replace("&gt;",">")
            f.write(str(soup))
            print("done")
Example #43
0
	def perform(self,document,sourceHTML,sourceURL,srcPrefix):
		aggregateCSS="";
		if len(srcPrefix) and not srcPrefix.endswith('/'):
			srcPrefix = srcPrefix + '/'

		# retrieve CSS rel links from html pasted and aggregate into one string
		CSSRelSelector = CSSSelector("link[rel=stylesheet],link[rel=StyleSheet],link[rel=STYLESHEET]")
		matching = CSSRelSelector.evaluate(document)
		for element in matching:
			try:
				csspath=element.get("href")
				if len(sourceURL):
					if element.get("href").lower().find("http://",0) < 0:
						parsedUrl=urlparse.urlparse(sourceURL);
						csspath=urlparse.urljoin(parsedUrl.scheme+"://"+parsedUrl.hostname, csspath)
				f=urllib.urlopen(csspath)
				aggregateCSS+=''.join(f.read())
				element.getparent().remove(element)
			except:
				raise IOError('The stylesheet '+element.get("href")+' could not be found')

		#include inline style elements
		print aggregateCSS
		CSSStyleSelector = CSSSelector("style,Style")
		matching = CSSStyleSelector.evaluate(document)
		for element in matching:
			aggregateCSS+=element.text
			element.getparent().remove(element)

		#convert  document to a style dictionary compatible with etree
		styledict = self.getView(document, aggregateCSS)

		#set inline style attribute if not one of the elements not worth styling
		ignoreList=['html','head','title','meta','link','script','repeater','singleline','multiline','br','layout']
		for element, style in styledict.items():
			if element.tag not in ignoreList:
				v = style.getCssText(separator=u'')
				element.set('style', v)

		#convert tree back to plain text html
		self.convertedHTML = etree.tostring(document, method="xml", pretty_print=True,encoding='UTF-8')
		self.convertedHTML= self.convertedHTML.replace('&#13;', '') #tedious raw conversion of line breaks.

		# We've inline styled the CSS, now do the HTML src tags
		soup = BeautifulSoup(self.convertedHTML)
		for img in soup.find_all("img"):
			img['src'] = srcPrefix + img.get('src')

		# Now we would like to set width and min-width on all our tables
		for table in soup.find_all("table"):
			if table.get('width') is not None:
				width = table.get('width')
				if not width.endswith('%'):
					if table.get('style') is None:
						style = []
					else:
						style = table.get('style').split(';')
					style = [x for x in style if x]
					style.append("min-width:" + width + "px")
					style.append("width:" + width + "px")
					table['style'] = ';'.join(style)

		# Might as well go ahead and throw a style tag in the head for iOS fixes
		if soup.html.head is None:
			soup.html.insert(0, soup.new_tag('head'))
		if soup.html.head.style is None:
			soup.html.head.append(soup.new_tag('style', type="text/css"))
		soup.html.head.style.append("""
			a[href^="x-apple-data-detectors:"] {
    color: #000000;
    text-decoration: none;
}
a[href^="tel"], a[href^="sms"], a[href^="mailto"] {
    color: #000000;
    text-decoration: none;
}
""")

		for img in soup.find_all('img'):
			if 'spacer.gif' in img.get('src'):
				classes = img.get('class')
				if classes is not None:
					if 'w' in classes:
						img.parent['width'] = img.get('width')
					if 'h' in classes:
						img.parent['height'] = img.get('height')

		self.convertedHTML = str(soup)

		return self
Example #44
0
    def save_to_html(self, title="", file_name=""):
        """
        title: The <H1> title in the mobi file
        file_name: Save file as <file_name>.mobi
        """
        if self.is_empty():
            raise BriticleException("File is empty")
        if not title:
            title = self.title
        # Generate file name via title if doesn't exist
        if not file_name:
            if title:
                file_name = re.sub(r'[^-\w ]+', '', title).replace(' ', '_')
            else:
                file_name = re.sub(r'[^-\w ]+', '',
                                   self.title).replace(' ', '_')
            if not file_name:
                file_name = "Untitled_Documentation"

        # Save images to local and change the <img> src to new location
        i = 1
        soup = BeautifulSoup(self.html, 'html.parser')
        images = soup.find_all('img')
        print('images: {}'.format(images))
        for img in images:
            if 'src' not in img.attrs:
                continue
            src = img['src']
            image_ext = src.split(".")[-1]
            # Set it as PNG when suffix does not exist
            if len(image_ext) >= 5:
                image_ext = "png"
            image_name = "%03d.%s" % (i, image_ext)
            dir_image = os.path.join(self.save_dir, file_name)
            if not os.path.exists(dir_image):
                os.mkdir(dir_image)
            local_file_name = os.path.join(dir_image, image_name)
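            # download_to_local() is assumed to be a helper defined elsewhere in this
            # project; it fetches `src` and saves it to local_file_name.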
            try:
                download_to_local(src, local_file_name)
            except URLError:
                continue
            except Exception as e:
                if 'timed out' in str(e):
                    continue
                raise

            new_tag = soup.new_tag("img", src=file_name + "/" + image_name)
            img.replace_with(new_tag)
            i += 1

        html_file = os.path.join(self.save_dir, file_name + '.html')
        tags_h1 = soup.find_all('h1')
        h1_exists = True if (tags_h1 and len(tags_h1) == 1) else False
        with open(html_file, 'w') as f:
            html = u""
            if h1_exists:
                hr = soup.new_tag('hr')
                tags_h1[0].insert_after(hr)
            else:
                html = u'<h1>{}</h1>\r\n<hr/>\r\n'.format(title)
            html += '{}'.format(soup)

            # FIXME: netloc is not correct for URLs ending with "xxx.com.cn"
            try:
                netloc = urlparse(self.url).netloc
                netloc = u".".join(netloc.split(".")[-2:])
            except:
                netloc = u"Original URL"
            html += u'<br/>From <a href="%s">%s</a>. ' % (self.url, netloc)
            f.write(html)
        self.html_file = html_file
        return html_file
Example #45
0
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    def __init__(self, namespaceHTMLElements, soup=None):
        if soup:
            self.soup = soup
        else:
            from bs4 import BeautifulSoup
            self.soup = BeautifulSoup("", "html.parser")
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        from bs4 import BeautifulSoup
        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return treebuilder_base.TreeBuilder.getFragment(self).element

    def testSerializer(self, element):
        from bs4 import BeautifulSoup
        rv = []
        doctype_re = re.compile(
            r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

        def serializeElement(element, indent=0):
            if isinstance(element, BeautifulSoup):
                pass
            if isinstance(element, Doctype):
                m = doctype_re.match(element)
                if m:
                    name = m.group(1)
                    if m.lastindex > 1:
                        publicId = m.group(2) or ""
                        systemId = m.group(3) or m.group(4) or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                  (' ' * indent, name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (' ' * indent, ))
            elif isinstance(element, Comment):
                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
            elif isinstance(element, NavigableString):
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                if element.namespace:
                    name = "%s %s" % (prefixes[element.namespace],
                                      element.name)
                else:
                    name = element.name
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.attrs:
                    attributes = []
                    for name, value in list(element.attrs.items()):
                        if isinstance(name, NamespacedAttribute):
                            name = "%s %s" % (prefixes[name.namespace],
                                              name.name)
                        if isinstance(value, list):
                            value = " ".join(value)
                        attributes.append((name, value))

                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' *
                                                  (indent + 2), name, value))
                indent += 2
                for child in element.children:
                    serializeElement(child, indent)

        serializeElement(element, 0)

        return "\n".join(rv)
Example #46
0
    def readability_by_soup(self, article, url, opts=None):
        content = self.preprocess(article)
        soup = BeautifulSoup(content, "lxml")

        try:
            title = soup.html.head.title.string
        except AttributeError:
            self.log.warn('object soup invalid!(%s)' % url)
            return

        title = self.processtitle(title)
        soup.html.head.title.string = title

        if self.keep_only_tags:
            body = soup.new_tag('body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    keep_only_tags = [self.keep_only_tags]
                else:
                    keep_only_tags = self.keep_only_tags
                for spec in keep_only_tags:
                    for tag in soup.find('body').find_all(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replace_with(body)
            except AttributeError:
                pass

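        # remove_beyond() is assumed to be a module-level helper (defined elsewhere in
        # this file) that decomposes every sibling of `tag` in the given direction.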
        for spec in self.remove_tags_after:
            tag = soup.find(**spec)
            remove_beyond(tag, 'next_sibling')

        for spec in self.remove_tags_before:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previous_sibling')

        remove_tags = self.insta_remove_tags + self.remove_tags
        remove_ids = self.insta_remove_ids + self.remove_ids
        remove_classes = self.insta_remove_classes + self.remove_classes
        remove_attrs = self.insta_remove_attrs + self.remove_attrs

        for tag in soup.find_all(remove_tags):
            tag.decompose()
        for id in remove_ids:
            for tag in soup.find_all(attrs={"id": id}):
                tag.decompose()
        for cls in remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            self.soupbeforeimage(soup)
            for img in soup.find_all('img', attrs={'src': True}):
                imgurl = img['src']
                if img.get('height') in ('1','2','3','4','5') \
                 or img.get('width') in ('1','2','3','4','5'):
                    self.log.warn('img size too small,take away it:%s' %
                                  imgurl)
                    img.decompose()
                    continue
                if not imgurl.startswith('http'):
                    imgurl = self.urljoin(url, imgurl)
                    print url
                    print imgurl
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered:%s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(
                    imgresult.content, opts) if imgresult.code == 200 else None
                if imgcontent:
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%s_%d.%s" % (
                            datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                            self.imgindex,
                            'jpg' if imgtype == 'jpeg' else imgtype)
                        img['src'] = fnimg
                        yield (imgmime, imgurl, fnimg, imgcontent, None)
                    else:
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' %
                                  (imgresult.code, imgurl))
                    img.decompose()

            for img in soup.find_all('img'):  # remove links wrapping images
                if img.parent and img.parent.parent and \
                 img.parent.name == 'a':
                    img.parent.replace_with(img)

        else:
            for img in soup.find_all('img'):
                img.decompose()

        # add a content heading if there is none
        t = soup.html.body.find(['h1', 'h2'])
        if not t:
            t = soup.new_tag('h1')
            t.string = title
            soup.html.body.insert(0, t)
        else:
            totallen = 0
            for ps in t.previous_siblings:
                totallen += len(string_of_tag(ps))
                if totallen > 40:  # this h1/h2 appears mid-article, so it is not the title
                    t = soup.new_tag('h1')
                    t.string = title
                    soup.html.body.insert(0, t)
                    break

        self.soupprocessex(soup)
        content = unicode(soup)

        # Take the first part of the article content as the brief (summary)
        brief = u''
        if GENERATE_TOC_DESC:
            body = soup.find('body')
            for h in body.find_all(['h1', 'h2']):  # remove h1/h2 to avoid duplicating the title
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        soup = None

        yield (title, None, None, content, brief)
Example #47
0
if bibtex['ENTRYTYPE'] == "article":
    formattedTags.append("[[Journal Article]]")
elif bibtex['ENTRYTYPE'] == "book":
    formattedTags.append("Book")
elif bibtex['ENTRYTYPE'] == "inproceedings":
    formattedTags.append("[[Conference Paper]]")
elif bibtex['ENTRYTYPE'] == "phdthesis":
    formattedTags.append("Dissertation")
elif bibtex['ENTRYTYPE'] == "mastersthesis":
    formattedTags.append("Thesis")
elif bibtex['ENTRYTYPE'] == "techreport":
    formattedTags.append("[[Technical Report]]")
elif bibtex['ENTRYTYPE'] == "manual":
    formattedTags.append("[[Technical Manual]]")

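# 'bibtex', 'formattedTags', and 'soup' are assumed to be defined earlier in the
# script this fragment was excerpted from; 'soup' is an existing BeautifulSoup object.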
tag = soup.new_tag('div')

authors = bibtex['author'].split('and')
for i in range(0, len(authors)):
    authorSplit = authors[i].split(',')
    author = ""
    if len(authorSplit) > 1:
        author = authorSplit[1].strip() + " " + authorSplit[0].strip()
    else:
        author = authorSplit[0].lstrip()
    authorTag = 'author' + str(i + 1)
    tag.attrs[authorTag] = author
    if len(authorSplit) == 1:
        formattedTags.append(author)
    else:
        formattedTags.append("[[" + author + "]]")
Example #48
0
    def readability(self, article, url, opts=None):
        """ 使用readability-lxml处理全文信息 """
        content = self.preprocess(article)
        #		print '--------------'
        #		print content
        #		print '---------------'
        # extract the main content
        try:
            doc = readability.Document(content)
            summary = doc.summary(html_partial=True)
        except:
            self.log.warn('article is invalid.[%s]' % url)
            return

        title = doc.short_title()
        title = self.processtitle(title)
        #		print '=================='
        #		print summary
        #		print '==================='

        soup = BeautifulSoup(summary, 'lxml')
        #	soup = BeautifulSoup(content,'lxml')
        '''
		# no <head> element: add one
		h = soup.find('head')
		if not h:
			h = soup.new_tag('head')
			t = soup.new_tag('title')
			t.string = title
			h.append(t)
			soup.html.insert(0,h)

		# no h1/h2 heading: add one
		t = soup.html.body.find(['h1','h2'])
		if not t:
			t = soup.new_tag('h1')
			t.string = title
			soup.html.body.insert(0,t)
		else:
			totallen = 0
			for ps in t.previous_siblings:
				totallen += len(string_of_tag(ps))
				if totallen > 40:
					t = soup.new_tag('h1')
					t.string = title
					soup.html.body.insert(0,t)
					break
		'''
        self.soupbeforeimage(soup)

        if self.remove_tags:
            for tag in soup.find_all(self.remove_tags):
                tag.decompose()
        for id in self.remove_ids:
            for tag in soup.find_all(attrs={"id": id}):
                tag.decompose()
        for cls in self.remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in self.remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        if self.keep_image:
            opener = URLOpener(self.host, timeout=self.timeout)
            for img in soup.find_all('img', attrs={'src': True}):
                imgurl = img['src']
                if img.get('height') in ('1','2','3','4','5') \
                 or img.get('width') in ('1','2','3','4','5'):
                    self.log.warn('img size too small,take it away : %s' %
                                  imgurl)
                    img.decompose()
                    continue
                if not imgurl.startswith('http'):
                    imgurl = self.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered : %s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(
                    imgresult.content, opts) if imgresult.code == 200 else None
                if imgcontent:
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%s_%d.%s" % (
                            datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                            self.imgindex,
                            'jpg' if imgtype == 'jpeg' else imgtype)
                        img['src'] = fnimg
                        yield (imgmime, imgurl, fnimg, imgcontent, None)
                    else:
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' %
                                  (imgresult.code, imgurl))
                    img.decompose()
            # remove links wrapping images
            for img in soup.find_all('img'):
                if img.parent and img.parent.parent and \
                 img.parent.name == 'a':
                    img.parent.replace_with(img)
        else:
            for img in soup.find_all('img'):
                img.decompose()

        self.soupprocessex(soup)
        #		print '====-=-=-=-=-=-=-='
        #		print soup
        #		print '-=-=-=-=-=-=-=-=-=-=-'
        cc = soup.body.contents[0]
        #		cc.name = "articleblock"
        #		print cc
        #		print soup.body.renderContents()
        #content = unicode(soup)
        content = unicode(cc)

        #print soup.find('body').contents
        #print soup.body.contents

        # Take the first part of the article content as the brief (summary)
        brief = u''
        if GENERATE_TOC_DESC:
            body = soup.find('body')
            for h in body.find_all(['h1', 'h2']):  # remove h1/h2 to avoid duplicating the title
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break

        soup = None
        yield (title, None, None, content, brief)
Example #49
0
    def Items(self, opts=None):
        decoder = AutoDecoder(False)
        timeout = self.timeout
        for section, url in self.feeds:
            opener = URLOpener(self.host, timeout=timeout)
            result = opener.open(url)
            code, content = result.code, result.content
            if code != 200 or not content:
                self.log.warn('fetch article failed(%d):%s.' % (code, url))
                continue

            if self.page_encoding:
                try:
                    content = content.decode(self.page_encoding)
                except UnicodeDecodeError:
                    content = decoder.decode(content, opener.realurl)
            else:
                content = decoder.decode(content, opener.realurl)

            content = self.preprocess(content)
            soup = BeautifulSoup(content, "lxml")

            h = soup.find('head')
            if not h:
                h = soup.new_tag('head')
                t = soup.new_tag('title')
                t.string = section
                h.append(t)
                soup.html.insert(0, h)
            try:
                title = soup.html.head.title.string
            except AttributeError:
                title = section

            title = self.processtitle(title)

            if self.keep_only_tags:
                body = soup.new_tag('body')
                try:
                    if isinstance(self.keep_only_tags, dict):
                        keep_only_tags = [self.keep_only_tags]
                    else:
                        keep_only_tags = self.keep_only_tags
                    for spec in keep_only_tags:
                        for tag in soup.find('body').find_all(**spec):
                            body.insert(len(body.contents), tag)
                    soup.find('body').replace_with(body)
                except AttributeError:  # soup has no body element
                    pass

            for spec in self.remove_tags_after:
                tag = soup.find(**spec)
                remove_beyond(tag, 'next_sibling')

            for spec in self.remove_tags_before:
                tag = soup.find(**spec)
                remove_beyond(tag, 'previous_sibling')

            remove_tags = self.insta_remove_tags + self.remove_tags
            remove_ids = self.insta_remove_ids + self.remove_ids
            remove_classes = self.insta_remove_classes + self.remove_classes
            remove_attrs = self.insta_remove_attrs + self.remove_attrs
            for tag in soup.find_all(remove_tags):
                tag.decompose()
            for id in remove_ids:
                for tag in soup.find_all(attrs={"id": id}):
                    tag.decompose()
            for cls in remove_classes:
                for tag in soup.find_all(attrs={"class": cls}):
                    tag.decompose()
            for attr in remove_attrs:
                for tag in soup.find_all(attrs={attr: True}):
                    del tag[attr]
            for cmt in soup.find_all(
                    text=lambda text: isinstance(text, Comment)):
                cmt.extract()

            if self.extra_css:
                sty = soup.new_tag('style', type="text/css")
                sty.string = self.extra_css
                soup.html.head.append(sty)

            if self.keep_image:
                self.soupbeforeimage(soup)
                for img in soup.find_all('img', attrs={'src': True}):
                    imgurl = img['src']
                    if img.get('height') in ('1','2','3','4','5') \
                     or img.get('width') in ('1','2','3','4','5'):
                        self.log.warn('img size too small,take away it:%s' %
                                      imgurl)
                        img.decompose()
                        continue
                    if not imgurl.startswith('http'):
                        imgurl = self.urljoin(url, imgurl)
                    if self.fetch_img_via_ssl and url.startswith('https://'):
                        imgurl = imgurl.replace('http://', 'https://')
                    if self.isfiltered(imgurl):
                        self.log.warn('img filtered:%s' % imgurl)
                        img.decompose()
                        continue

                    imgresult = opener.open(imgurl)
                    imgcontent = self.process_image(
                        imgresult.content,
                        opts) if imgresult.code == 200 else None
                    if imgcontent:
                        imgtype = imghdr.what(None, imgcontent)
                        if imgtype:
                            imgmime = r"image/" + imgtype
                            fnimg = "img%s_%d.%s" % (
                                datetime.datetime.now().strftime(
                                    "%Y%m%d_%H%M%S"), self.imgindex,
                                'jpg' if imgtype == 'jpeg' else imgtype)
                            img['src'] = fnimg
                            yield (imgmime, imgurl, fnimg, imgcontent, None)
                        else:
                            img.decompose()
                    else:
                        self.log.warn('fetch img failed(err:%d):%s' %
                                      (imgresult.code, imgurl))
                        img.decompose()

                for img in soup.find_all('img'):
                    if img.parent and img.parent.parent and img.parent.name == 'a':
                        img.parent.replace_with(img)
            else:
                for img in soup.find_all('img'):
                    img.decompose()

            self.soupprocessex(soup)
            content = unicode(soup)

            brief = u''
            if GENERATE_TOC_DESC:
                body = soup.find('body')
                for h in body.find_all(['h1', 'h2']):  # remove h1/h2 to avoid duplicating the title
                    h.decompose()
                for s in body.stripped_strings:
                    brief += unicode(s) + u' '
                    if len(brief) >= TOC_DESC_WORD_LIMIT:
                        brief = brief[:TOC_DESC_WORD_LIMIT]
                        break

            soup = None
            content = self.postprocess(content)
            yield (section, url, title, content, brief)