Example #1
def genReport(url, new_scan, date):
	print "[+] Generating report..."

	report = BeautifulSoup(open("./templates/template.html", 'r').read())
	
	### In future do this for each page in list
	filepath = getFilePath(url, False).split("/")
	report.oldscan.string = date
	report.newscan.string = time.strftime("%y-%m-%d")
	report.target.string = filepath[2]
	report.targetpage.string = filepath[-1]
	
	#Add results
	diff = report.diff
	diff.string = ""
	date, changes = analyzePage(url, new_scan, date)
	diff.append(report.new_string(changes[0]))
	for line in changes[1:]:
		diff.append(report.new_tag('br'))
		diff.append(report.new_string(line))

	#Write changes to compiled report
	filename = "./reports/" + "/".join(filepath[2:4]) + "/report.html"
	open(filename, "w+").write(str(report))
	print "[+] Saved final report in: " + filename
Example #2
    def _select_info(self, html, rules):
        """
        :param html: the page to extract from
        :param rules: the extraction rules (a CSS selector)
        :return: results of the form [item1, item2 ... itemN]
        """
        if html and rules:
            soup = BeautifulSoup(html, self.parser)
            item = soup.select(rules)
            # Append extra URLs to the list:
            # when the rule is the one that collects category links, pull the hrefs and add the custom URLs
            if ADDITIONAL_URL and rules == 'ul.sub-menu > li > ul > li > a':
                item = [url.get('href') for url in item]
                item.extend(ADDITIONAL_URL)

            # temp holds the book download link(s)
            temp = get_down_load_link(soup, down_link_rules)
            if temp:
                dttag = soup.new_tag("dt")
                # add a <dd> tag at the same level as the <dt>
                ddtag = soup.new_tag("dd")

                # set the label text
                new_string = soup.new_string("downloadlink: ")
                dttag.append(new_string)

                # append the book's download link
                new_string = soup.new_string(temp[0].get('href'))
                ddtag.append(new_string)

                item[0].append(dttag)
                item[0].append(ddtag)
            for match in item:
                # yield makes this a generator; iterate it with a for loop
                yield match
Example #3
def get_ip():
    ip = request.headers['X-Forwarded-For'].replace(' ','').split(',')[0]
    #response = requests.get("http://ip-api.com/json/" + ip + "?lang=zh-CN").json()
    response = requests.get("https://www.ipip.net/ip/"+ip+".html",headers=ua).content
    _soup = BeautifulSoup(response, 'html.parser')
    soup = BeautifulSoup(html, 'html.parser')
    '''
    country_name = response['country']
    region_name = response['regionName']
    city = response['city']
    isp = response['isp']
    region_info = country_name + ' ' + region_name + ' ' + city
    '''
    #tables = _soup.find_all(style='clear: both')
    region_info = ''
    isp = ''
    tds = _soup.find_all('td')
    for td in tds:
        if td.string == "地理位置":
            region_info = td.find_next_sibling('td').span.string
        if td.string == "运营商":
            isp = td.find_next_sibling('td').span.string

    soup.body.append(soup.new_tag('br'))
    soup.body.append(soup.new_tag('br'))
    soup.find_all("br")[0].insert_before(soup.new_string(ip))
    soup.find_all("br")[0].insert_after(soup.new_string(region_info))
    soup.find_all("br")[1].insert_after(soup.new_string(isp))
    #print(soup)
    return str(soup)
Example #4
    def generate_html(self, url, status_url, last_checked_time):
        with open("AppView.html") as inf:
            txt = inf.read()
            soup = BeautifulSoup(txt, "html.parser")
            #print(soup.prettify())
        new_tr = soup.new_tag('tr')
        new_td_url = soup.new_tag('td')
        new_td_url.append(soup.new_string(url))
        new_td_status_url = soup.new_tag('td')
        new_td_status_url.append(soup.new_string(status_url))
        new_td_last_checked_time = soup.new_tag('td')
        new_td_last_checked_time.append(soup.new_string(last_checked_time))
        # insert it into the document
        new_tr.append(new_td_url)
        new_tr.append(new_td_status_url)
        new_tr.append(new_td_last_checked_time)

        old_tr = soup.findChildren('tr')
        for tr in old_tr:
            old_td = tr.findChildren('td')
            url_string = old_td[0].getText()
            if url_string != '':
                if url_string == url:
                    soup.table.tr.replaceWith(new_tr)
                else:
                    soup.table.append(new_tr)

            else:
                soup.table.tr.replaceWith(new_tr)


        # save the file again
        with open("AppView.html", "w") as outf:
            outf.write(str(soup))
Example #5
def get_xml_from_dict(params_dict):
    """
    Convert a dict to an XML string
    :param params_dict: the dict to convert
    :return: xml_str
    :rtype: str
    """
    soup = BeautifulSoup(features="xml")
    xml = soup.new_tag("xml")
    for k, v in params_dict.items():
        tag = soup.new_tag(k)
        if isinstance(v, int):
            tag.append(soup.new_string(str(v)))
        elif isinstance(v, (str, unicode)):
            tag.append(CData(v))
        else:
            for k1, v1 in v.items():
                tag1 = soup.new_tag(k1)
                if isinstance(v1, int):
                    tag1.append(soup.new_string(str(v1)))
                elif isinstance(v1, (str, unicode)):
                    tag1.append(CData(v1))
                tag.append(tag1)
        xml.append(tag)
    return str(xml)
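A minimal usage sketch for the helper above (Python 2, given the unicode check); the keys and values here are made-up illustration data, and element order follows dict iteration order:

params = {
    'total_fee': 100,            # ints become plain text nodes
    'body': 'test order',        # strings are wrapped in CDATA sections
    'detail': {'goods_id': 1},   # one level of nesting is walked
}
print(get_xml_from_dict(params))
# e.g. <xml><total_fee>100</total_fee><body><![CDATA[test order]]></body><detail><goods_id>1</goods_id></detail></xml>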
Example #6
def process_references_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup, refs: Dict) -> Dict:
    """
    Process all references in paragraph and generate a dict that contains (type, ref_id, surface_form)
    :param para_el:
    :param sp:
    :param refs:
    :return:
    """
    tokgen = UniqTokenGenerator('REFTOKEN')
    ref_dict = dict()
    for rtag in para_el.find_all('ref'):
        try:
            ref_type = rtag.get('type')
            # skip if citation
            if ref_type == 'bibr':
                continue
            if ref_type == 'table' or ref_type == 'figure':
                ref_id = rtag.get('target')
                if ref_id and normalize_grobid_id(ref_id) in refs:
                    # normalize reference string
                    rtag_string = normalize_grobid_id(ref_id)
                else:
                    rtag_string = None
                # add to ref set
                ref_key = tokgen.next()
                ref_dict[ref_key] = (rtag_string, rtag.text.strip(), ref_type)
                rtag.replace_with(sp.new_string(f" {ref_key} "))
            else:
                # replace with surface form
                rtag.replace_with(sp.new_string(rtag.text.strip()))
        except AttributeError:
            continue
    return ref_dict
Example #7
    def write_index_page(self):
        """
        Using an HTML Template, create an index page that lists
        all of the created calendars.
        """

        LOG.info('Writing index page...')

        # Copy needed files to the destination
        for filename in self.ICONS:
            source = os.path.join("resources", filename)
            if os.path.exists(source):
                shutil.copy(source, self.meta.output)

        with open(self.TEMPLATE, "r") as f:
            template = f.read()

        parser = BeautifulSoup(template, 'lxml')

        # Locate insertion points
        title = parser.find('title')
        header = parser.find('div', {'id': 'header'})
        footer = parser.find('div', {'id': 'footer'})

        # Page title
        title.insert(0, parser.new_string("WBC %s Event Schedule" % self.meta.year))
        header.h1.insert(0, parser.new_string("WBC %s Event Schedule" % self.meta.year))
        footer.p.insert(0, parser.new_string("Updated on %s" % self.meta.now.strftime("%A, %d %B %Y %H:%M %Z")))

        # Tournament event calendars
        tourneys = dict([(k, v) for k, v in self.calendars.items() if k not in self.meta.special])
        ordering = lambda x, y: cmp(tourneys[x]['summary'], tourneys[y]['summary'])
        self.render_calendar_table(parser, 'tournaments', 'Tournament Events', tourneys, ordering)

        # Non-tourney event calendars
        nontourneys = dict([(k, v) for k, v in self.calendars.items() if k in self.meta.special])
        self.render_calendar_list(parser, 'other', 'Other Events', nontourneys)

        # Location calendars
        self.render_calendar_list(parser, 'location', 'Location Calendars', self.locations)

        # Daily calendars
        self.render_calendar_list(parser, 'daily', 'Daily Calendars', self.dailies)

        # Special event calendars
        specials = {
            'all-in-one': self.everything,
            'tournaments': self.tournaments,
        }
        self.render_calendar_list(parser, 'special', 'Special Calendars', specials)

        with codecs.open(os.path.join(self.meta.output, 'index.html'), 'w', 'utf-8') as f:
            f.write(parser.prettify())
Example #8
def MarkOne(fn, note, link):
    soup = None
    with open(fn, "r") as fd:
        soup = BeautifulSoup(fd)

        # Check if the document is already marked.
        if len(soup.select("div#" + DIV_ID)) > 0:
            logging.warning("Document '%s' is already marked.", fn)
            return

        # Build the box containing the note.
        note_div = soup.new_tag("div")
        note_div["id"] = DIV_ID
        note_div["style"] = DIV_STYLE

        note_header = soup.new_tag("p")
        note_header["style"] = HEADER_STYLE
        note_header.string = "Note:"
        note_div.append(note_header)

        note_main = soup.new_tag("p")
        note_main["style"] = NOTE_STYLE
        note_div.append(note_main)

        note_text = soup.new_string(note + " ")
        note_main.append(note_text)

        note_link = soup.new_tag('a')
        note_link["href"] = link
        note_link.string = "Goto the latest version"
        note_main.append(note_link)

        note_final_dot = soup.new_string(".")
        note_main.append(note_final_dot)

        # Selector should lead to a single node.
        done = False
        for selector in SELECTORS:
            node_lst = soup.select(selector)
            if not done and len(node_lst) == 1:
                node_lst[0].insert_after(note_div)
                done = True

        if not done:
            if soup.body:
                soup.body.insert(0, note_div)
            else:
                raise Exception(
                    "Unable to find a place to insert note in '%s'." % fn)

    with codecs.open(fn, "w", "utf-8") as fd:
        fd.write(unicode(soup))
Example #10
 def markup_gloss_abbrs(soup, string):
     for i, abbr in enumerate(string.split('.')):
         if i > 0:
             yield soup.new_string('.')
         atom = abbr.strip().upper()
         m = re.match('(1|2|3)(?P<atom>SG|PL)', atom)
         if atom in ABBRS or m:
             if m:
                 atom = m.group('atom')
             span = soup.new_tag('span', **{'class': 'hint--bottom', 'data-hint': ABBRS[atom]})
             span.string = abbr
             yield span
         else:
             yield soup.new_string(abbr)
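A small sketch of how the generator above might be consumed; ABBRS here is a made-up stand-in for the module's real gloss mapping:

from bs4 import BeautifulSoup

ABBRS = {'SG': 'singular', 'PL': 'plural', 'NOM': 'nominative'}  # illustrative stand-in

soup = BeautifulSoup('', 'html.parser')
target = soup.new_tag('span')
for node in markup_gloss_abbrs(soup, '3SG.NOM'):
    target.append(node)
print(target)
# -> <span><span class="hint--bottom" data-hint="singular">3SG</span>.<span class="hint--bottom" data-hint="nominative">NOM</span></span>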
Example #11
def replace_ref_tokens(sp: BeautifulSoup, el: bs4.element.Tag, ref_map: Dict):
    """
    Replace all references in element with special tokens
    :param sp:
    :param el:
    :param ref_map:
    :return:
    """
    # replace all citations with cite keyword
    for cite in el.find_all('cit'):
        try:
            target = cite.ref.get('target').replace('bid', 'BIBREF')
            cite.replace_with(sp.new_string(f" {target} "))
        except AttributeError:
            print('Attribute error: ', cite)
            continue

    # replace all non citation references
    for rtag in el.find_all('ref'):
        try:
            if rtag.get('target') and not rtag.get('target').startswith('bid'):
                if rtag.get('target').startswith('cid'):
                    target = rtag.get('target').replace('cid', 'SECREF')
                elif rtag.get('target').startswith('uid'):
                    if rtag.get('target').replace('uid', 'FIGREF') in ref_map:
                        target = rtag.get('target').replace('uid', 'FIGREF')
                    elif rtag.get('target').replace('uid',
                                                    'TABREF') in ref_map:
                        target = rtag.get('target').replace('uid', 'TABREF')
                    elif rtag.get('target').replace('uid', 'EQREF') in ref_map:
                        target = rtag.get('target').replace('uid', 'EQREF')
                    elif rtag.get('target').replace('uid',
                                                    'FOOTREF') in ref_map:
                        target = rtag.get('target').replace('uid', 'FOOTREF')
                    elif rtag.get('target').replace('uid',
                                                    'SECREFU') in ref_map:
                        target = rtag.get('target').replace('uid', 'SECREFU')
                    else:
                        target = rtag.get('target').upper()
                else:
                    print('Weird ID!')
                    target = rtag.get('target').upper()
                rtag.replace_with(sp.new_string(f" {target} "))
        except AttributeError:
            print('Attribute error: ', rtag)
            continue

    return el
Example #12
    def postprocess(self, content):
        pn = re.compile(ur'<a href="(\S*?)">本话题在雪球有.*?条讨论,点击查看。</a>', re.I)
        comment = ''
        mt = pn.search(content)
        url = mt.group(1) if mt else None
        if url:
            opener = URLOpener(url, timeout=self.timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    try:
                        comment = result.content.decode(self.feed_encoding)
                    except UnicodeDecodeError:
                        return content

        pn = re.compile(r'SNB.data.goodComments\ =\ ({.*?});', re.S | re.I)
        mt = pn.search(comment)
        if mt:
            comment_json = mt.group(1)
            j = json.loads(comment_json)
            soup = BeautifulSoup(content, "lxml")
            for c in j['comments']:
                u = c['user']['screen_name']
                t = BeautifulSoup('<p>@%s:%s</p>' % (u, c['text']))
                for img in t.find_all('img', alt=True):
                    img.replace_with(t.new_string(img['alt']))
                soup.html.body.append(t.p)

            content = unicode(soup)
        return content
Example #13
def compile_text_to_html(text_str, imgs):
    """
    Compile text file to html. All text will be included in one <p> tag, and any
    images will be appended to the end of the <body> tag. All newlines will be
    replaced with <br /> tags.
    """
    soup = BeautifulSoup()
    soup.append(soup.new_tag('html'))
    body_tag = soup.new_tag('body')
    soup.html.append(body_tag)
    p_tag = soup.new_tag('p')
    body_tag.append(p_tag)

    br_arr = [[soup.new_tag('br'), soup.new_string(line)]
              for line in text_str.split('\n')]
    flattened_arr = [val for sublist in br_arr for val in sublist][1:]

    for el in flattened_arr:
        p_tag.append(el)

    for img in imgs:
        img_tag = soup.new_tag('img',
                               src='cid:%s' % img["tag"],
                               style="max-width: 100%")
        body_tag.append(img_tag)

    return str(soup)
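A quick usage sketch; the image dict shape with a "tag" key is inferred from the loop above, and the value is illustrative:

html = compile_text_to_html("first line\nsecond line", [{"tag": "img001"}])
# roughly: <html><body><p>first line<br/>second line</p>
#          <img src="cid:img001" style="max-width: 100%"/></body></html>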
Example #14
 async def handle_content(self, content):
     soup = BeautifulSoup(content, 'html.parser')
     if self.no_dorks is not True:
         for p_elem in soup.find_all('p'):
             if p_elem.findChildren():
                 continue
             css = None
             if 'style' in p_elem.attrs:
                 css = cssutils.parseStyle(p_elem.attrs['style'])
             text_list = p_elem.text.split()
             p_new = soup.new_tag('p', style=css.cssText if css else None)
             for idx, word in enumerate(text_list):
                 # Fetch dorks if required
                 if len(self.dorks) <= 0:
                     self.dorks = await self.get_dorks()
                 word += ' '
                 if idx % 5 == 0:
                     a_tag = soup.new_tag(
                         'a',
                         href=self.dorks.pop(),
                         style=
                         'color:{color};text-decoration:none;cursor:text;'.
                         format(color=css.color if css
                                and 'color' in css.keys() else '#000000'))
                     a_tag.string = word
                     p_new.append(a_tag)
                 else:
                     p_new.append(soup.new_string(word))
             p_elem.replace_with(p_new)
     content = soup.encode('utf-8')
     return content
Example #15
    def postprocess(self, content):
        pn = re.compile(ur'<a href="(\S*?)">本话题在雪球有.*?条讨论,点击查看。</a>',
                        re.I)
        comment = ''
        mt = pn.search(content)
        url = mt.group(1) if mt else None
        if url:
            opener = URLOpener(url, timeout=self.timeout)
            result = opener.open(url)
            if result.status_code == 200 and result.content:
                if self.feed_encoding:
                    try:
                        comment = result.content.decode(self.feed_encoding)
                    except UnicodeDecodeError:
                        return content

        pn = re.compile(r'SNB.data.goodComments\ =\ ({.*?});', re.S | re.I)
        mt = pn.search(comment)
        if not mt:
            return content
        comment_json = mt.group(1)
        j = json.loads(comment_json)
        soup = BeautifulSoup(content, "lxml")
        for c in j['comments']:
            u = c['user']['screen_name']
            t = BeautifulSoup('<p>@%s:%s</p>' % (u, c['text']))
            for img in t.find_all('img', alt=True):
                img.replace_with(t.new_string(img['alt']))
            soup.html.body.append(t.p)

        content = unicode(soup)
        return content
Example #16
    async def pathfinder(self, ctx, *, spell):
        """ Retrieves information about a spell in pathfinder. """

        spell_data = conn.execute("SELECT * FROM spells WHERE name LIKE ?", (spell,)).fetchone()

        if spell_data:

            desc_soup = BeautifulSoup(spell_data["description_formated"], "html.parser")
            for p in desc_soup.findAll("p"):
                p.insert_after(desc_soup.new_string("\n\n"))
            desc = desc_soup.get_text()


            output = (
                "**{name}**\n\n"

                "**School** {school}; **Level** {spell_level}\n\n"

                "**Casting Time** {casting_time}\n**Components** {components}\n\n"

                "**Range** {range}\n**Target** {targets}\n**Duration** {duration}\n"
                "**Saving Throw** {saving_throw}; **Spell Resistance** {spell_resistence}\n\n".format(**spell_data)
                )

            await ctx.send(output + desc)


        else:
            await ctx.send("I didn't find anything!")
Example #17
 def test_insert_before_something_empty(self):
     soup = BeautifulSoup("")
     tag = soup.new_tag("a")
     string = soup.new_string("")
     self.assertRaises(ValueError, string.insert_before, tag)
     self.assertRaises(NotImplementedError, soup.insert_before, tag)
     self.assertRaises(ValueError, tag.insert_before, tag)
Example #18
 def test_insert_after_something_that_has_no_meaning(self):
     soup = BeautifulSoup("")
     tag = soup.new_tag("a")
     string = soup.new_string("")
     self.assertRaises(ValueError, string.insert_after, tag)
     self.assertRaises(NotImplementedError, soup.insert_after, tag)
     self.assertRaises(ValueError, tag.insert_after, tag)
Example #19
 def handle_html_content(self, content):
     soup = BeautifulSoup(content, 'html.parser')
     for p_elem in soup.find_all('p'):
         css = None
         if 'style' in p_elem.attrs:
             css = cssutils.parseStyle(p_elem.attrs['style'])
         text_list = p_elem.text.split()
         p_new = soup.new_tag('p', style=css.cssText if css else None)
         for idx, word in enumerate(text_list):
             if len(self.dorks) <= 0:
                 self.dorks = yield from self.get_dorks()
             word += ' '
             if idx % 5 == 0:
                 a_tag = soup.new_tag(
                     'a',
                     href=self.dorks.pop(),
                     style='color:{color};text-decoration:none;cursor:text;'.format(
                         color=css.color if css and 'color' in css.keys() else '#000000'
                     )
                 )
                 a_tag.string = word
                 p_new.append(a_tag)
             else:
                 p_new.append(soup.new_string(word))
         p_elem.replace_with(p_new)
     content = soup.encode('utf-8')
     return content
Example #20
def extract_formulas_from_tei_xml(sp: BeautifulSoup) -> None:
    """
    Replace all formulas with the text
    :param sp:
    :return:
    """
    for eq in sp.find_all('formula'):
        eq.replace_with(sp.new_string(eq.text.strip()))
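A tiny sketch of the effect on a made-up TEI-like snippet (the parser choice is only for illustration):

from bs4 import BeautifulSoup

sp = BeautifulSoup('<p>Energy is <formula> E = mc^2 </formula>.</p>', 'html.parser')
extract_formulas_from_tei_xml(sp)
print(sp)
# -> <p>Energy is E = mc^2.</p>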
Example #21
def parse_date_html(html_string):
	"""Takes a string that contains html, and returns (date, date_string,
	content) as a tuple. For now, date is an int that represents the year.
	Negative numbers are B.C. and positive are A.D. years. If there is no date
	that can be parsed, returns None.
	"""

	# preprocess to add newlines after <br />, or else get_text smushes things
	# together
	soup = BeautifulSoup(html_string)
	for el in soup.descendants:
		if el.name == 'br':
			el.insert_after(soup.new_string('\n'))
			el.insert_before(soup.new_string('\n'))

	html_splitter = HtmlSplitter(unicode(soup))
	s = html_splitter.text_string

	content_offset = 0

	# strip out all non-letter/digit characters from the beginning
	m = re.search('^[^\d\w]+', s)
	if m:
		content_offset += m.end()
	if not s:
		return None

	# get the date
	extract = parse_date_text(s[content_offset:])
	if not extract:
		return None
	(date, date_index) = extract
	date_string = html_splitter.get_span(content_offset, date_index + content_offset)

	content_offset += date_index

	# strip out any transition characters between the date and the content
	m = re.search(u'^[\s\-–—:\.]+', s[content_offset:])
	if m:
		content_offset += m.end()

	content = '' if content_offset >= len(s) \
		else html_splitter.get_span(content_offset, len(s))

	return (date, date_string, content)
Example #23
def urlize_html(html, trim_url_limit=40):
    """will urlize html, while ignoring link
    patterns inside anchors, <pre> and <code> tags
    """
    soup = BeautifulSoup(html, 'html5lib')
    extract_nodes = list()
    for node in soup.findAll(text=True):
        parent_tags = [p.name for p in node.parents]
        skip_tags = ['a', 'img', 'pre', 'code']
        if set(parent_tags) & set(skip_tags):
            continue

        #bs4 is weird, so we work around to replace nodes
        #maybe there is a better way though
        urlized_text = urlize(node, trim_url_limit=trim_url_limit)
        if unicode(node) == urlized_text:
            continue

        sub_soup = BeautifulSoup(urlized_text, 'html5lib')
        contents = sub_soup.find('body').contents
        num_items = len(contents)
        for i in range(num_items):
            #there is strange thing in bs4, can't iterate
            #as the tag seemingly can't belong to >1 soup object
            child = contents[0]  #always take first element
            #insure that text nodes are sandwiched by space
            have_string = (not hasattr(child, 'name'))
            if have_string:
                node.insert_before(soup.new_string(' '))
            node.insert_before(child)
            if have_string:
                node.insert_before(soup.new_string(' '))

        extract_nodes.append(node)

    #extract the nodes that we replaced
    for node in extract_nodes:
        node.extract()

    result = unicode(soup.find('body').renderContents(), 'utf8')
    if html.endswith('\n') and not result.endswith('\n'):
        result += '\n'

    return result
Example #25
def format_spaces(text):
    soup = BeautifulSoup(text)
    for tagstring in list(soup.strings):
        value = tagstring.replace(' ', '').replace(SPACE_TAG, ' ')
        new_tag = soup.new_string(value)
        tagstring.replace_with(new_tag)
        if new_tag == '':
            new_tag.extract()

    return soup, tag_to_text(soup.body).replace(' '+SPACE_TAG+' ', ' ')
Example #26
def generate_html():
    """
    Generate the dictionary as html.
    """
    soup = BeautifulSoup()
    table = soup.new_tag('table')
    table['class'] = 'table table-striped'
    with open(dict_file) as f:
        dict = csv.reader(f)
        # headers
        tr = soup.new_tag('tr')

        th = soup.new_tag('th')
        th.append(soup.new_string('英語 / English'))
        th['class'] = 'col-xs-2'
        tr.append(th)

        th = soup.new_tag('th')
        th.append(soup.new_string('日本語 / Japanese'))
        th['class'] = 'col-xs-2'
        tr.append(th)

        th = soup.new_tag('th')
        th.append(soup.new_string('ローマ字 / Rōmaji'))
        th['class'] = 'col-xs-1'
        tr.append(th)

        table.append(tr)
        for words in dict:
            tr = soup.new_tag('tr')
            for word in words:
                td = soup.new_tag('td')
                td.append(soup.new_string(word))
                tr.append(td)
            table.append(tr)
        soup.append(table)
    
    with open('template.html') as f:
        template = f.read()
    
    with open('html/index.html', 'w') as f:
        html = template.format(table=soup.prettify(), size=path.getsize(dict_file) // 1000)
        f.write(html)
Example #27
def urlize_html(html):
    """will urlize html, while ignoring link
    patterns inside anchors, <pre> and <code> tags
    """
    soup = BeautifulSoup(html, "html5lib")
    extract_nodes = list()
    for node in soup.findAll(text=True):
        parent_tags = [p.name for p in node.parents]
        skip_tags = ["a", "img", "pre", "code"]
        if set(parent_tags) & set(skip_tags):
            continue

        # bs4 is weird, so we work around to replace nodes
        # maybe there is a better way though
        urlized_text = urlize(node)
        if unicode(node) == urlized_text:
            continue

        sub_soup = BeautifulSoup(urlized_text, "html5lib")
        contents = sub_soup.find("body").contents
        num_items = len(contents)
        for i in range(num_items):
            # there is strange thing in bs4, can't iterate
            # as the tag seemingly can't belong to >1 soup object
            child = contents[0]  # always take first element
            # insure that text nodes are sandwiched by space
            have_string = not hasattr(child, "name")
            if have_string:
                node.insert_before(soup.new_string(" "))
            node.insert_before(child)
            if have_string:
                node.insert_before(soup.new_string(" "))

        extract_nodes.append(node)

    # extract the nodes that we replaced
    for node in extract_nodes:
        node.extract()

    result = unicode(soup.find("body").renderContents(), "utf8")
    if html.endswith("\n") and not result.endswith("\n"):
        result += "\n"
    return result
Example #28
def typograf(html):
    """
        Remove hanging prepositions
    """
    soup = Soup(html, 'html5lib')
    for tag in soup.findAll(text=True):
        if re_nbsp.search(tag):
            new_tag = soup.new_string(unescape(_typograf_replace(tag)))
            tag.replace_with(new_tag)

    return soup.body.decode_contents().replace('\xa0', '&nbsp;')
Example #29
def get_xml(base_xxx, db_package):
    #psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
    #psycopg2.extensions.register_type(psycopg2.extensions.UNICODEARRAY)
    initiate_threaded_connection_pool(db_package)
    with getconnection() as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT id, nom FROM optin_list WHERE abreviation = %s",
                       (str(base_xxx), ))
        records = cursor.fetchone()
        if records:
            optin_id = records[0]
            nom = records[1]
        else:
            optin_id = '0'
            nom = ""
        cursor.execute(
            "SELECT xml FROM criteo_xml WHERE optin_id = %s AND usage = %s",
            (str(optin_id), 'header'))
        records = cursor.fetchone()[0]
        if records:
            header = records
        else:
            header = ""
        cursor.execute(
            "SELECT xml FROM criteo_xml WHERE optin_id = %s AND usage = %s",
            (str(optin_id), 'footer'))
        records = cursor.fetchone()[0]
        if records:
            footer = records
        else:
            footer = ""
    conn_pool.closeall()
    post_dict = {}
    post_dict['id'] = '1'
    post_dict['nom'] = nom
    post_dict['header'] = header
    post_dict['footer'] = footer
    xml_doc = BeautifulSoup(features='xml')
    xml_doc.append(xml_doc.new_tag("bases"))
    xml_doc.bases.append(xml_doc.new_tag("base"))
    cpt_content = 0
    for key, value in post_dict.iteritems():
        xml_doc.bases.base.append(xml_doc.new_tag(str(key)))
        xml_container = xml_doc.bases.base.contents[cpt_content]
        if key == 'footer':
            xml_formatted_value = "<![CDATA[" + value + "]]>"
        else:
            xml_formatted_value = value
        xml_container.append(xml_doc.new_string(xml_formatted_value))
        cpt_content += 1
    xml_feed = xml_doc.prettify()
    xml_feed = xml_feed.replace("&lt;", "<").replace(
        "&gt;", ">")  #.replace("&lt;p&gt;", "").replace("&lt;/p&gt;", "")
    return xml_feed
Example #30
 def _add_title_tag(soup: BeautifulSoup) -> Tag:
     if title.level == 0:
         new_tag = soup.new_tag(
             ODFXMLTagNames.TEXT_P.value,
             attrs={ODFXMLAttributes.STYLE_NAME.value: 'Title'})
     else:
         new_tag = soup.new_tag(
             ODFXMLTagNames.TEXT_H.value,
             attrs={ODFXMLAttributes.TITLE_LEVEL.value: str(title.level)})
     new_tag.append(soup.new_string(title.text))
     return new_tag
Example #31
 def _add_tag(soup: BeautifulSoup) -> Tag:
     new_tag = soup.new_tag(
         ODFXMLTagNames.TABLE_CELL.value,
         attrs={
             ODFXMLAttributes.TABLE_ROW_SPAN.value: str(cell.rowspan),
             ODFXMLAttributes.TABLE_COL_SPAN.value: str(cell.colspan),
         },
     )
     p_tag = soup.new_tag(ODFXMLTagNames.TEXT_P.value)
     p_tag.append(soup.new_string(cell.content.text))
     new_tag.append(p_tag)
     return new_tag
Example #32
def use_bs4():
    soup = BeautifulSoup(xml, 'lxml')
    from bs4 import Comment
    x = soup.new_string('xxx', Comment)
    soup.a.append(x)
    print soup
    #print soup.a.clear(True)
    soup.root.unwrap()
    print '-' * 10
    print soup
    #help(soup)
    help(soup.a)
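For reference, a minimal standalone sketch of the new_string(..., Comment) idea used above: passing a NavigableString subclass makes the new node render with that subclass's markup.

from bs4 import BeautifulSoup, Comment

soup = BeautifulSoup('<a>link</a>', 'html.parser')
soup.a.append(soup.new_string('xxx', Comment))
print(soup)
# -> <a>link<!--xxx--></a>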
Example #33
 def handle_request(self, request, payload):
     header = {key: value for (key, value) in request.headers.items()}
     data = dict(
         method=request.method,
         path=request.path,
         headers=header
     )
     r = yield from aiohttp.post('http://localhost:8090/event', data=json.dumps(data))
     ret = yield from r.text()
     print(ret)
     response = aiohttp.Response(
         self.writer, 200, http_version=request.version
     )
     base_path = '/'.join(['/opt/snare/pages', self.run_args.page_dir])
     parsed_url = urlparse(unquote(request.path))
     path = '/'.join(
         [base_path, parsed_url.path[1:]]
     )
     path = os.path.normpath(path)
     if os.path.isfile(path) and path.startswith(base_path):
         with open(path, 'rb') as fh:
             content = fh.read()
         content_type = mimetypes.guess_type(path)[0]
         if content_type:
             if 'text/html' in content_type:
                 print(content_type)
                 soup = BeautifulSoup(content, 'html.parser')
                 for p_elem in soup.find_all('p'):
                     text_list = p_elem.text.split()
                     p_new = soup.new_tag('p', style='color:#000000')
                     for idx, word in enumerate(text_list):
                         word += ' '
                         if idx % 5 == 0:
                             a_tag = soup.new_tag(
                                 'a',
                                 href='http://foo.com',
                                 style='color:#000000;text-decoration:none;cursor:text;'
                             )
                             a_tag.string = word
                             p_new.append(a_tag)
                         else:
                             p_new.append(soup.new_string(word))
                     p_elem.replace_with(p_new)
                 content = str(soup).encode('utf-8')
                 # print(repr(content))
             response.add_header('Content-Type', content_type)
         response.add_header('Content-Length', str(len(content)))
         response.send_headers()
         response.write(content)
     else:
         response.status = 404
         response.send_headers()
     yield from response.write_eof()
Example #34
    def output_journal(self, journal: Journal):
        soup = BeautifulSoup('', 'html5lib')

        body = soup.find('body')

        # Add title and distance statement
        self.output_title(soup, body, journal.journal_title)

        self.output_subtitle(soup, body, journal.journal_subtitle)
        self.output_subtitle(soup, body,
                             'By {0}'.format(journal.journal_author))

        locations = [location for location in journal.locales]
        self.output_para(soup, body,
                         'Locations: {0}'.format(', '.join(locations)))

        if journal.cover_image:
            self.output_picture(soup, body, journal.cover_image)

        toc_div = soup.new_tag('div', attrs={'class': 'toc_container'})

        # Iterate over the ToC and process every page
        page_idx = 1
        for toc_item in journal.toc:
            p_tag = soup.new_tag('p')
            toc_tag = p_tag

            if toc_item.page:
                html_filename = self.output_page(toc_item.page, page_idx)
                toc_tag = soup.new_tag('a', attrs={'href': html_filename})
                p_tag.append(toc_tag)
                toc_tag.append(soup.new_string(toc_item.page.title))

                page_idx += 1
            elif toc_item.subtitle:
                toc_tag.append(soup.new_string(toc_item.subtitle))
            toc_div.append(p_tag)

        body.append(toc_div)
        self.output_html(soup, 'index')
Example #35
def put_to_html(articles):
    if not config['template_input_html'] or not config['output_html_path']:
        return False
    path_input = config['template_input_html']
    path_output = config['output_html_path']
    # Load the input HTML template
    fin = open(path_input, 'r', encoding='UTF-8')
    # Open the output file
    fout = open(path_output, 'w+', encoding='UTF-8')
    soup = BeautifulSoup(fin.read(), 'lxml')
    body = soup.find('body')
    # Build a div node for each article
    for article in articles:
        # Article title node
        title_node = soup.new_tag('h2')
        title = soup.new_string(article['title'])
        title_node.append(title)

        # Image node
        div_image_node = soup.new_tag('div', attrs={'class': 'image'})
        # Magnet-link node
        div_magnet_node = soup.new_tag('div', attrs={'class': 'magnet'})
        div_node = soup.new_tag('div', attrs={'class': 'article'})
        for image in article['images']:
            image_node = soup.new_tag('img', src=image)
            div_image_node.append(image_node)
        for magnet in article['magnet']:
            magnet_string = soup.new_string(magnet)
            magnet_node = soup.new_tag('p')
            magnet_node.append(magnet_string)
            div_magnet_node.append(magnet_node)
        # Append each article node to the template page's body
        div_node.append(title_node)
        div_node.append(div_magnet_node)
        div_node.append(div_image_node)
        body.append(div_node)
    # Write the prettified HTML page to the local output file
    fout.write(soup.prettify())
Example #36
def cut_bloc2bs_elt(cut_bloc_res) :
  bs = BeautifulSoup('')
  if cut_bloc_res[0] == 'str' :
    return bs.new_string(cut_bloc_res[1])
  name = cut_bloc_res[0]['name']
  new_bs_elt = bs.new_tag(name)
  new_bs_elt.attrs = {}
  for k,v in cut_bloc_res[0].iteritems() :
    if k == 'name' :
      continue
    new_bs_elt.attrs[k] = v
  for bloc in cut_bloc_res[1] :
    new_bs_elt.append(cut_bloc2bs_elt(bloc))
  return new_bs_elt
Example #37
def format_spaces(content):
    if not isinstance(content, BeautifulSoup):
        soup = BeautifulSoup(content, "lxml")
    else:
        soup = content

    for tagstring in list(soup.strings):
        value = tagstring.replace(' ', '').replace(SPACE_TAG, ' ')
        new_tag = soup.new_string(value)
        tagstring.replace_with(new_tag)
        if new_tag == '':
            new_tag.extract()

    return soup, tag_to_text(soup.body)
Example #38
def process_footnotes_from_text(sp: BeautifulSoup) -> Dict:
    """
    Process footnote marks
    :param sp:
    :return:
    """
    footnote_map = dict()

    for note in sp.find_all('note'):
        try:
            if note.name and note.get('id'):
                # normalize footnote id
                ref_id = note.get('id').replace('uid', 'FOOTREF')
                # remove equation tex
                for eq in note.find_all('texmath'):
                    eq.decompose()
                # replace all xrefs with link
                for xref in note.find_all('xref'):
                    xref.replace_with(sp.new_string(f" {xref.get('url')} "))
                # clean footnote text
                footnote_text = None
                if note.text:
                    footnote_text = note.text.strip()
                    footnote_text = re.sub(r'\s+', ' ', footnote_text)
                    footnote_text = re.sub(r'\s', ' ', footnote_text)
                # form footnote entry
                footnote_map[ref_id] = {
                    "num": note.get('id-text', None),
                    "text": footnote_text,
                    "ref_id": ref_id
                }
                note.replace_with(sp.new_string(f" {ref_id} "))
        except AttributeError:
            continue

    return footnote_map
Example #39
def _wrap_content(title, content_node, wrap):
    if wrap:
        doc = BeautifulSoup(
            '''
            <html>
                <head>
                    <title></title>
                </head>
                <body></body>
            </html>''')
        doc.title.append(doc.new_string(title))
        doc.body.append(content_node)
        return doc
    else:
        return content_node
Example #40
def process_formulas_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup) -> None:
    """
    Process all formulas in paragraph and replace with text and label
    :param para_el:
    :param sp:
    :return:
    """
    for ftag in para_el.find_all('formula'):
        # get label if exists and insert a space between formula and label
        if ftag.label:
            label = ' ' + ftag.label.text
            ftag.label.decompose()
        else:
            label = ''
        ftag.replace_with(sp.new_string(f'{ftag.text.strip()}{label}'))
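A hedged sketch of what this does to one paragraph element; the markup is a made-up stand-in for the XML the function expects, and the parser choice is illustrative:

from bs4 import BeautifulSoup

sp = BeautifulSoup('<p>See <formula>x + y = z<label>(1)</label></formula>.</p>', 'html.parser')
process_formulas_in_paragraph(sp.p, sp)
print(sp)
# -> <p>See x + y = z (1).</p>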
Example #41
def _extract_tags(
    soup: BeautifulSoup, tag_finder: Callable[[BeautifulSoup], List[Tag]]
) -> Tuple[BeautifulSoup, Dict[str, Tag]]:
    soup = _copy_soup(soup)
    tags = tag_finder(soup)
    reference_to_tag: Dict[str, Tag] = {}
    for tag in tags:
        ref = _generate_reference()
        str_ = soup.new_string(ref)
        extracted = tag.replace_with(str_)
        if not extracted:
            raise ValueError('Expecting Tag, not None.')
        reference_to_tag[ref] = extracted
        str_.wrap(soup.new_tag('w:r'))
    return soup, reference_to_tag
Example #42
def writexml(xml):
    # Build a fresh XML document and add a comment describing the record layout
    doc = BeautifulSoup(features='xml')
    root = doc.new_tag('root')
    doc.append(root)
    student = doc.new_tag('students')
    root.append(student)
    comment = doc.new_string('\n学生信息表\n"id":[名字,数学,语文,英文]\n', Comment)
    student.append(comment)
    with open('student.xml', 'w') as file:
        file.write(doc.prettify())
Example #43
    def __call__(self, outdir):
        """
        runs a parser workflow consisting of
        - preprocess
        - refactor
        - postprocess
        writes the results, an html, a css and a json file to disk.
        """
        cssutils_logger = logging.getLogger('CSSUTILS')
        cssutils_logger.setLevel(logging.ERROR)
        print(self.fname.namebase.encode('utf8'))

        with open(self.fname, encoding='utf8') as fp:
            c = fp.read()
        soup = BeautifulSoup(self.preprocess(self._preprocess(c)))

        # extract css from the head section of the HTML doc:
        css = cssutils.parseString('\n')
        for style in soup.find('head').find_all('style'):
            for rule in self.cssrules(style):
                css.add(rule)

        md = dict(outline=[], refs=[], authors=[])
        soup = self.refactor(soup, md)

        # enhance section headings:
        for section, t in tag_and_text(soup.find_all('h3')):
            t = t.split('[Note')[0]
            id_ = 'section-%s' % slug(t)
            md['outline'].append((t, id_))
            section.attrs['id'] = id_
            for s, attrs in [
                (u'\u21eb', {'href': '#top', 'title': 'go to top of the page', 'style': 'vertical-align: bottom'}),
                ('¶', {'class': 'headerlink', 'href': '#' + id_, 'title': 'Permalink to this section'}),
            ]:
                append(section, soup.new_string('\n'), new_tag(soup, 'a', s, **attrs))

        body = self.insert_links(unicode(soup.find('body')), md)

        # write output files:
        with open(outdir.joinpath('%s.html' % self.id), 'w', encoding='utf8') as fp:
            fp.write(self.wrap(self.postprocess(body)))

        with open(outdir.joinpath('%s.css' % self.id), 'wb') as fp:
            fp.write(self.csstext(css))

        md['authors'] = list(self.yield_valid_authors(md['authors']))
        jsondump(md, outdir.joinpath('%s.json' % self.id), indent=4)
Example #44
def ExportAlgoInfo(fileName, algorithm):
    if not os.path.exists('./svgs/infos'):
        os.makedirs('./svgs/infos')

    file = open(
        "./svgs/infos/%s" % os.path.basename(fileName).replace("svg", "html"),
        "w")
    info = algorithm.About()
    soup = BeautifulSoup(info)

    # Link the CSS file
    head = soup.find('head')
    if not head:
        html = soup.find('html')
        html.insert(0, soup.new_tag('head'))
        head = soup.find('head')
    head.insert(
        0,
        soup.new_tag('link',
                     rel='stylesheet',
                     href='../css/info_page_style.css'))

    # Insert a div with background-color found by colordef into the <dt> elements
    for dt in soup.find_all('dt'):
        colordef = dt.find('colordef')
        if not colordef:
            continue
        color = colordef['color']
        colordef.extract()
        color_div = soup.new_tag('div')
        color_div.string = '&nbsp;&nbsp;&nbsp;&nbsp;'
        color_div['style'] = 'background-color: %s' % color
        color_div['class'] = 'color_div'
        dt.append(color_div)

    # Add a div with "clear: both" to make the next row
    for dd in soup.find_all('dd'):
        clear_div = soup.new_tag('div')
        clear_div['class'] = 'clear_div'
        dd.insert_after(clear_div)

    body = soup.find('body')
    if not body.contents:
        # If there isn't any content add "No algorithm info"
        body.append(soup.new_string('No algorithm info'))

    file.write(soup.prettify(formatter=None))
Example #46
def get_pinyin_bs(file_name):
    """
    Return file + Ruby characters.

    *fn* file name to run on
    """
    # open file and parse with bs4, with xml rules
    debug(file_name)
    f = open(file_name, 'r')
    s1 = f.read()
    f.close()
    bs = BeautifulSoup(s1, 'xml')

    # Go through all the 'text' tags and extract only the tags' strings
    # assign to all_of_the_tags
    all_of_the_tags = []

    for tn in GET_TAG_NAMES:
        for t in bs.findAll(tn):
            if t.has_attr("href"):
                t['href'] = htmlLib.escape(t['href'])
            for ts in get_all_tags_text(t):
                all_of_the_tags.append(ts)

    for p in all_of_the_tags:
        debug(p)
        p_el = p['tag']
        p_text = p['str']
        debug(p_text)
        new_p_str = ''

        debug('WORKING ON: ')
        debug("#######{0}#######".format(p_text))
        # pos_tagging will give you the type of word
        # unneccesary in this case
        words = [word for word in jieba.cut(p_text)]
        debug("WORDS: {0}".format(words))
        new_p_str = generate_new_html_for_words(words)
        debug("Parent Element: " + str(p_el.parent))
        debug("{0} will be replaced by {1}".format(p_el, new_p_str))
        p_el.replace_with(bs.new_string(new_p_str))
    return str(add_js_link(bs).decode(formatter=None))
Example #47
		base = os.path.basename(input_file)
		inputdir = path + input_file
		with open (inputdir, "r") as FA:
			FA_string = FA.read().replace('\n', '')
	
		#script = "&lt;script type='text/javascript' src='http://library.albany.edu/angelfish.js'&gt;&lt;/script&gt;&lt;script type='text/javascript'&gt;agf.pageview();&lt;/script&gt;"
		#FA_output = FA_string[:554] + script + FA_string[554:]
		
		input_string = FA_string.replace(u'\xa0', u' ')
		soup = Soup(input_string)

		title = soup.find('title')
		script1 = soup.new_tag('script')
		script1['type'] = "text/javascript"
		script1['src'] = "http://library.albany.edu/angelfish.js"
		if title is None:
			print base
		title.insert_after(script1)
		script2 = soup.new_tag('script')
		script2['type'] = "text/javascript"
		new_string = soup.new_string("agf.pageview();")
		script2.append(new_string)
		title.insert_after(script2)

		
		#prettyHTML=soup.prettify()
		output = str(soup)
		
		output_path = outputdir + base
		file = open(output_path, "w")
		file.write(output)
Example #48
print(new_li_tag.prettify())


#adding/modifying string

new_div_name_tag.string = "phytoplankton"
print(producer_entries.prettify())

#using append

new_div_name_tag.append("producer")
print(soup.prettify()) 

#using new_string

new_string_toappend = soup.new_string("producer")
new_div_name_tag.append(new_string_toappend)

#using insert
new_string_toinsert  = soup.new_string("10000")
new_div_number_tag.insert(0, new_string_toinsert)
print(soup.prettify())


#deleting using decompose

third_producer = soup.find_all("li")[2]
div_name = third_producer.div
div_name.decompose()
print(third_producer.prettify())
Example #49
base_last = os.path.split(base_dir)[-1]
index_name = os.path.join(base_dir+'/../', 'index.html')
if os.path.exists(index_name):
    os.remove(index_name)
files = glob.glob(os.path.join(base_dir, '*.html'))
#ru_files = glob.glob(os.path.join(base_dir, '*.html_ru'))

html_doc = """
<html><head><title>Index</title></head>
<body>
</body>
</html>
"""
soup = BeautifulSoup(html_doc)
for f in files:
    fname = os.path.basename(f)
    tag = soup.new_tag('p')
    soup.body.append(tag)
    a = soup.new_tag('a', href='./'+base_last+'/'+fname, target='_blank')
    a.append(soup.new_string(get_title(f)))
    tag.append(a)
    tag.append(soup.new_tag('br'))
    a = soup.new_tag('a', href='./'+base_last+'/'+fname+'_ru', target='_blank')
    a.append(soup.new_string(get_title(f+'_ru')))
    tag.append(a)
    tag.append(soup.new_tag('br'))
    tag.append(soup.new_tag('br'))

index = open(index_name, 'w+')
index.write(soup.prettify(formatter="html").encode('utf-8'))
Example #50
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from bs4 import  BeautifulSoup
from bs4 import  Comment
from minelibs import  *
import pprint

from html_content import  *
import re


soup = BeautifulSoup("<b>stop</b>", 'html.parser')
tag = soup.new_tag("i")
tag.string = "Don't"
soup.b.string.insert_before(tag)
print_eval('soup.b')
print_eval('soup.b.contents')

print(xgreen('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -'))
soup.b.i.insert_after(soup.new_string(" ever "))
print_eval('soup.b')
print_eval('soup.b.contents')






Example #51
for line in soup.find_all(class_='MsoNormalTable'):
     #print(line.next_sibling.next_sibling)
     p = line.next_sibling.next_sibling
     new_tag_tr = soup.new_tag('tr')
     new_tag_tr['class'] = 'norTr'
     new_tag = soup.new_tag('table')
     new_tag['class'] = 'footTb'
     new_tag.append(new_tag_tr)
     for sub in p.stripped_strings:
          for s in sub.splitlines():
               if len(s.strip()) > 1:
                    #print(s+'#')
                    new_tag_td = soup.new_tag('td', style="width:1cm;")
                    if len(s.strip(':')) > 2:
                         new_tag_td['style'] = "width:15mm;"
                    new_tag_td_ip = soup.new_tag('td', style=new_tag_td['style'])
                    pinyin = Pinyin()
                    s = s.strip(':').strip(':').strip()
                    pys = pinyin.get_init(s) 
                    new_tag_ip = soup.new_tag('input', id= 'ft_'+pys)
                    new_tag_td_ip.append(new_tag_ip)
                    new_tag_td.append(soup.new_string(s+':'))
                    new_tag_tr.append(new_tag_td)
                    new_tag_tr.append(new_tag_td_ip)
     p.replace_with(new_tag)
for line in soup.find_all('p'):
     line.unwrap()
print(soup.prettify())
f = open('prase2.htm','w',encoding='utf-8')
f.write(soup.prettify())
Example #52
class ConfluencePageInflater(object):
    def __init__(self, page_source, page_handle, attach_handle,
                 encoding='utf-8'):
        super(ConfluencePageInflater, self).__init__()
        self.soup = BeautifulSoup(page_source, 'html5lib',
                                  from_encoding=encoding)
        self.page_handle = page_handle
        self.attach_handle = attach_handle
        self.cleaned_up = False

    def filter_image(self):
        for img in self.soup.find_all('img'):
            ac_image = self.soup.new_tag('ac:image')
            src = img.get('src')
            if src and '//' not in src:
                attach = self.attach_handle(src, img.get('title'))
                if attach:
                    ri_resource = self.soup.new_tag('ri:attachment')
                    ri_resource['ri:filename'] = attach['resource_name']
                else:
                    img.decompose()
                    continue
            else:
                ri_resource = self.soup.new_tag('ri:url')
                ri_resource['ri:value'] = src
            ac_image.append(ri_resource)
            if img.has_attr('alt'):
                ac_image['ac:alt'] = img['alt']
            img.replace_with(ac_image)

    def filter_link(self):
        for link in self.soup.find_all('a'):
            href = link.get('href')
            if href and '//' not in href:
                if '?' in href:
                    href = href[:href.index('?')]
                ac_link = self.soup.new_tag('ac:link')
                if '#' in href:
                    ac_link['ac:anchor'] = href[href.index('#') + 1:]
                    href = href[:href.index('#')]
                if href.endswith('.html'):
                    page = self.page_handle(href)
                    if page:
                        ri_resource = self.soup.new_tag('ri:page')
                        ri_resource['ri:content-title'] = page['title']
                    else:
                        link.decompose()
                        continue
                else:
                    attach = self.attach_handle(href, link.get('title'))
                    if attach:
                        ri_resource = self.soup.new_tag('ri:attachment')
                        ri_resource['ri:filename'] = attach['resource_name']
                    else:
                        link.decompose()
                        continue
                ac_link.append(ri_resource)
                children = link.find_all()
                if children:
                    body = self.soup.new_tag('ac:link-body')
                    for child in children:
                        body.append(child)
                elif link.text:
                    body = self.soup.new_tag('ac:plain-text-link-body')
                    body.append(self.soup.new_string(link.text, CData))
                else:
                    link.decompose()
                    continue
                if link.has_attr('title'):
                    ac_link['ac:title'] = link['title']
                ac_link.append(body)
                link.replace_with(ac_link)

    @property
    def title(self):
        title = self.soup.find('title')
        return title and title.encode_contents().strip() or ''

    def filter_dl(self):
        for dl in self.soup.find_all('dl'):
            ul = self.soup.new_tag('ul')
            dts = dl.find_all('dt')
            dds = dl.find_all('dd')
            for dt, dd in zip(dts, dds):
                li = self.soup.new_tag('li')
                dt.name = 'p'
                li.append(dt)
                dd.name = 'p'
                li.append(dd)
                ul.append(li)
            dl.replace_with(ul)

    @property
    def is_home_page(self):
        meta = self.soup.find('meta', attrs={'name': 'homepage'})
        return meta is not None and meta.get('value') == 'true'

    def filter_code(self):
        for pre in self.soup.find_all('pre'):
            code_block = self.soup.new_tag('ac:structured-macro')
            code_block['ac:name'] = 'code'

            if pre.has_attr('data-lang'):
                lang_param = self.soup.new_tag('ac:parameter')
                lang_param['ac:name'] = 'language'
                lang_param.append(pre['data-lang'])
                code_block.append(lang_param)

            plain_text = self.soup.new_tag('ac:plain-text-body')
            plain_text.append(self.soup.new_string(pre.get_text(), CData))
            code_block.append(plain_text)
            pre.replace_with(code_block)

    @property
    def cleaned_src(self):
        if not self.cleaned_up:
            self.cleaned_up = True
            self.filter_image()
            self.filter_link()
            self.filter_dl()
            self.filter_code()
        body = self.soup.find('body')
        return (body and body.encode_contents(formatter='html') or
                self.soup.encode_contents(formatter='html'))
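A minimal usage sketch for ConfluencePageInflater, assuming it lives in the same module as the class (which must import BeautifulSoup and CData from bs4 and have html5lib installed). The exact contract of the two handle callbacks is inferred from how filter_image() and filter_link() call them, so treat it as an assumption:

from bs4 import BeautifulSoup, CData   # also required by the class itself


def page_handle(href):
    # Map a local .html link to an existing Confluence page title.
    return {'title': 'Target Page'}


def attach_handle(src, title=None):
    # Map a local file reference to the name of an uploaded attachment.
    return {'resource_name': src.rsplit('/', 1)[-1]}


html = ('<html><body><p>See <a href="guide.html">the guide</a>.</p>'
        '<pre data-lang="python">print("hi")</pre></body></html>')
inflater = ConfluencePageInflater(html, page_handle, attach_handle)
print(inflater.cleaned_src.decode('utf-8'))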
Ejemplo n.º 53
0
def tugua_download(url, directory="", date=None):
	'''\
	Download the tugua of [date: datetime|str] from [url: str] and store it under [directory: str].
	A new folder named "YYYYmmdd" is created for the converted file, and the original html file is stored in the "src" folder.
	(A usage sketch follows this example.)
	Return: None
	'''
	# prepare source directory
	if (not date):
		date = datetime.date.today()
	if (isinstance(date, datetime.date)):
		date_str = date.strftime("%Y%m%d")
	else:
		date_str = date
	directory = os.path.realpath(os.path.abspath(directory))
	src_dir = os.path.join(directory, config["TUGUA"]["SrcDir"])
	if (not os.path.isdir(src_dir)):
		os.makedirs(src_dir)
	src_path = os.path.join(src_dir, date_str + ".html")
	# download contents
	global urlsrc
	url = url.strip()
	urlsrc = url
	down_url(url, src_path)
	data = None
	with open(src_path, "rb") as src_file:
		data = src_file.read()
	src = parse_html(data)
	dest = BeautifulSoup("", config["TUGUA"]["HtmlParser"])
	# analyze source title and frame
	title_tag_src = src.find("title")
	assert (title_tag_src), "No title found!"
	title = title_tag_src.get_text()
	title_match = re.search(r"【喷嚏图卦(\d{8})】\S.*$", title)
	assert (title_match), "No title found!\n  Title tag is '{}'.".format(title)
	assert (date_str == title_match.group(1)), "Date mismatch!\n  Input is '{}', actual is '{}'.".format(date_str, title_match.group(1))
	title = title_match.group(0).strip()
	start_tag_src = src.find(text=re.compile(r"以下内容,有可能引起内心冲突或愤怒等不适症状。|本文转摘的各类事件,均来自于公开发表的国内媒体报道。引用的个人或媒体评论旨在传播各种声音,并不代表我们认同或反对其观点。"))
	end_tag_src = src.find(text=re.compile(r"广告联系:dapenti#dapenti.com"))
	if (end_tag_src):
		tmp = end_tag_src.find_next(text=re.compile(r"喷嚏网"))
		if (tmp):
			end_tag_src = tmp
		while (not end_tag_src.name or end_tag_src.name == "a"):
			end_tag_src = end_tag_src.parent
	assert (start_tag_src) and (end_tag_src), "No content found!\n  Start is '{}', end is '{}'.".format(start_tag_src, end_tag_src)
	if (not end_tag_src.next_element):
		src.append(dest.new_tag("end"))
	# construct dest frame
	dest.append(dest.new_tag("html"))
	head_tag_dest = dest.new_tag("head")
	charset_tag = dest.new_tag("meta")
	charset_tag["http-equiv"] = "Content-Type"
	charset_tag["content"] = "text/html; charset={}".format(config["TUGUA"]["DestEncoding"])
	head_tag_dest.append(charset_tag)
	if (config["STYLE"]["JqueryFile"]):
		head_tag_dest.append(dest.new_tag("script", type="text/javascript", src=config["STYLE"]["JqueryFile"]))
	if (config["STYLE"]["CssFile"]):
		head_tag_dest.append(dest.new_tag("link", rel="stylesheet", type="text/css", href=config["STYLE"]["CssFile"]))
	if (config["STYLE"]["JsFile"]):
		head_tag_dest.append(dest.new_tag("script", type="text/javascript", src=config["STYLE"]["JsFile"]))
	title_tag_dest = dest.new_tag("title")
	title_tag_dest.string = title
	head_tag_dest.append(title_tag_dest)
	dest.html.append(head_tag_dest)
	body_tag_dest = dest.new_tag("body")
	dest.html.append(body_tag_dest)
	# analyze and convert
	subtitle_regex = re.compile(r"^【(\d{0,2})】(.*)")
	def stop_func(tag):
		if (tag == end_tag_src):
			return True
		elif (not tag) or (not tag.string):
			return False
		elif (subtitle_regex.match(tag.string.strip())):
			return True
		else:
			return False
	(prologue, curr_src) = tugua_analyze(start_tag_src, dest, stop_func=stop_func)
	sections = []
	while True:
		assert (curr_src), "Unsupported Error!\n  Analysis tag suspended."
		(section, curr_src) = tugua_analyze(curr_src, dest, stop_func=stop_func)
		sections.append(section)
		if (curr_src == end_tag_src):
			(last_tag, _) = tugua_analyze(curr_src, dest, search_sibling=False)
			if (last_tag.name == "div"):
				last_tag.name = "p"
			section.append(last_tag)  # a bit tricky, append it into previous section
			break
	# debug
	'''debug_output("0: {}".format(prologue))
	count = 0
	for section in sections:
		count += 1
		debug_output("{}: {}".format(count, section))'''
	# check section number
	number_error = 0
	number_count = 0
	number_delta = 0
	for section in sections:
		number_count = number_count + 1
		subtitle = section
		while (not isinstance(subtitle, NavigableString)):
			subtitle = subtitle.next_element
			assert (subtitle), "Content Error!\n  Expect section '{}' but no text found in '{}'.".format(number_count, section)
		subtitle_match = subtitle_regex.match(subtitle)
		assert (subtitle_match), "Content Error!\n  Expect subtitle '【{}】' but actual is '{}'.".format(number_count, subtitle)
		curr_id = subtitle_match.group(1)
		if (len(curr_id) > 0):
			curr_id = int(curr_id)
		else:
			curr_id = 0
		if (curr_id != number_count and curr_id + number_delta != number_count):
			logger.warn("Subtitle number mismatch, expect '{}' but actual is '{}'.".format(number_count, subtitle_match.group(1)))
			number_error = number_error + 1
			number_delta = number_count - curr_id
		subtitle.replace_with(dest.new_string("【{:02}】{}".format(number_count, subtitle_match.group(2).strip())))
	assert (number_error <= config["CORRECTION"].getint("TitleNumErrorMax")), "Content Error!\n  Too many subtitle number mismatch, totally {} errors.".format(number_error)
	# prepare destination directory
	dest_dir = os.path.join(directory, date_str)
	if (not os.path.isdir(dest_dir)):
		os.makedirs(dest_dir)
	os.chdir(dest_dir)
	# load img_info from tmp file
	tmp_path = os.path.join(src_dir, config["TUGUA"]["TmpFile"])
	if (os.path.isfile(tmp_path)) and (os.path.getsize(tmp_path) > 0):
		with open(tmp_path, "rb") as tmp_file:
			tmp_data = pickle.loads(tmp_file.read())
	else:
		tmp_data = {}
	if (date_str not in tmp_data):
		tmp_data[date_str] = {}
	img_info = tmp_data[date_str]
	img_info["count"] = 0
	# format sections & download images
	try:
		prologue = tugua_format(prologue, dest, img_info=img_info)
		for index in range(len(sections)):
			img_info["count"] = 0
			sections[index] = tugua_format(sections[index], dest, img_info=img_info, section_id="{:02}".format(index+1), has_subtitle=True)
	finally:
		# store img_info into tmp file
		with open(tmp_path, "wb") as tmp_file:
			tmp_data[date_str] = img_info
			tmp_file.write(pickle.dumps(tmp_data))
	# separate extra, ad and epilogue
	tag = sections[-1]
	temp = []
	epi_regex = re.compile(r"^(友情提示:请各位河蟹评论。道理你懂的)|(\s*喷嚏新浪围脖:\s*@\s*喷嚏官微\s*、\s*@\s*喷嚏意图\s*(新浪)\s*)$")
	epi = None
	for child in tag.children:
		ch = child.contents[0]
		if (isinstance(ch, Tag)) and ((ch.name == "img") or (ch.name == "embed")):
			temp.clear()
		elif (epi_regex.match(child.get_text())):
			epi = child
			break
		else:
			temp.append(child)
	assert (epi), "Content Error!\n  No epilogue found in '{}'.".format(tag)
	extra_tag = dest.new_tag("div")
	ad_tag = dest.new_tag("div")
	if (len(temp) > 0):
		ad_tmp = temp[-1].extract()
		if (len(ad_tmp.contents) == 1) and (ad_tmp.contents[0].name == "a") and (ad_tmp.contents[0].string.startswith("http")) and (len(temp) > 1):
			ad_tag.append(temp[-2].extract())
			ad_tag.append(ad_tmp)
			temp = temp[:-2]
		else:
			ad_tag.append(ad_tmp)
			temp = temp[:-1]
		for t in temp:
			extra_tag.append(t.extract())
	epilogue_tag = dest.new_tag("div")
	while(epi):
		next_epi = epi.next_sibling
		epilogue_tag.append(epi.extract())
		epi = next_epi
	prologue["id"] = config["IDENT"]["Prologue"]
	prologue["class"] = config["IDENT"]["Prologue"]
	extra_tag["id"] = config["IDENT"]["Extra"]
	extra_tag["class"] = config["IDENT"]["Extra"]
	ad_tag["id"] = config["IDENT"]["Ad"]
	ad_tag["class"] = config["IDENT"]["Ad"]
	epilogue_tag["id"] = config["IDENT"]["Epilogue"]
	epilogue_tag["class"] = config["IDENT"]["Epilogue"]
	# generate title
	title_tag = dest.new_tag("div")
	title_tag["id"] = config["IDENT"]["Title"]
	title_tag["class"] = config["IDENT"]["Title"]
	title_tag.append(dest.new_tag("p"))
	title_tag.p.append(dest.new_tag("a"))
	title_tag.p.a["href"] = url
	title_tag.p.a.string = title
	# regroup
	body_tag_dest.append(title_tag)
	body_tag_dest.append(prologue)
	for section in sections:
		body_tag_dest.append(section)
	body_tag_dest.append(extra_tag)
	body_tag_dest.append(ad_tag)
	body_tag_dest.append(epilogue_tag)
	#dest_path = os.path.join(dest_dir, "{}.html".format(title))
	dest_path = os.path.join(dest_dir, config["TUGUA"]["DestFile"])
	with open(dest_path, "wb") as dest_file:
		logger.info("Saving file '{}' ...".format(dest_path))
		dest_file.write(dest.prettify().encode(config["TUGUA"]["DestEncoding"]))
	# delete tmp record when complete
	del tmp_data[date_str]
	with open(tmp_path, "wb") as tmp_file:
		tmp_file.write(pickle.dumps(tmp_data))
	urlsrc = None
	return
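A usage sketch matching the call shape described in the docstring above; the URL, directory and date below are placeholders, and note that the function asserts that the date embedded in the page title equals the date argument:

# Usage sketch only: everything tugua_download() needs internally (config,
# down_url, parse_html, tugua_analyze, tugua_format, logger, ...) must come
# from the rest of the original script.
import datetime

article_url = "http://www.dapenti.com/blog/..."   # placeholder tugua article URL
tugua_download(article_url, directory="./tugua", date=datetime.date(2016, 1, 1))
# Result: converted file under ./tugua/20160101/ and the original page saved
# under the configured src directory.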
Ejemplo n.º 54
0
#! /usr/bin/python
#encoding=utf-8

from bs4 import BeautifulSoup

'''Demonstrates how to append a string'''

html_doc = '<b></b>'

soup = BeautifulSoup(html_doc)

tag = soup.b
tag.append("hello")
new_string = soup.new_string(" python")
tag.append(new_string)

print tag           # <b>hello python</b>
print tag.contents  # [u'hello', u' python']
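The snippet above is Python 2; an equivalent Python 3 version, with the parser named explicitly to avoid the parser-guess warning, would be:

# Python 3 equivalent of the demo above.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<b></b>', 'html.parser')
tag = soup.b
tag.append("hello")
tag.append(soup.new_string(" python"))

print(tag)           # <b>hello python</b>
print(tag.contents)  # ['hello', ' python']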
Ejemplo n.º 55
0
File: zdic.py Project: feilong/zdic
# print cjcs.prettify()
for p in cjcs.find_all('p'):
    if p.find('span', class_='yxs'):
        num = int(p.find('span', class_='yxs').string.split('.')[0])
        p.find('span', class_='yxs').decompose()
        print num
        for match in p.find_all('a'):
            match.replaceWithChildren()
        for child in p.children:
            if child.string:
                s = child.string
                li = []
                wrapper = soup.new_tag('div', **{'class':'temp'})
                for i, cont in enumerate(s.split(' ')):
                    if i % 2 == 0:
                        li.append(soup.new_string(cont))
                    else:
                        t = soup.new_tag('div', **{'class':'name'})
                        t.string = cont
                        li.append(t)
                for t in li:
                    wrapper.append(t)
                child.replace_with(wrapper)
            else:
                for i, cont in enumerate(child.children):
                    if i % 2 != 0:
                        t = soup.new_tag('div', **{'class':'name'})
                        t.string = cont
                        cont.replace_with(t)
        for match in p.find_all('div', class_='temp'):
            match.replaceWithChildren()
Ejemplo n.º 56
0
def publish(fname, full=True):
	cxn = sqlite3.connect( os.path.join(config_dir, 'fbk_cache.db') )
	cur = cxn.cursor()
	local_tz = get_localzone() 


	sql_fetch_query = """SELECT `fbk_id`,`message`,`created_timestamp`,`privacy_description` FROM `posts` WHERE `privacy_description`='Public' AND `type`='status' 
	ORDER BY `created_timestamp` DESC"""
	cur.execute(sql_fetch_query)


	soup = BeautifulSoup( """<html><head><title>%s — Wall</title>
			<meta charset="utf-8">
			<link rel="stylesheet" href="style.css" type="text/css">
			</head>
			<body><table id="main"><thead /><tfoot /><tbody /></table></body></html>""" % obj_config['name'], "html.parser")

	body = soup.find('body')

	h1 = soup.new_tag('h1')
	h1.string = obj_config['name']

	if obj_config['tagline']:
		

		span = soup.new_tag('span')
		span['id'] = "tagline"
		span.string = obj_config['tagline']

		h1.append(span)


	body.append(h1)

	main_body = BeautifulSoup( '<div id="content" />', "html.parser" )
	main = main_body.find(id='content')


	for post in cur.fetchall():

		p = transform(post)

		soup_post = BeautifulSoup("""<div class="feedentry hentry" id="fb_%s">
			<span class="author vcard"><span class="fn profile">%s</span></span>
			<span class="entry-title entry-content">%s</span>
	        <div class="timerow">
	        <time class="time published" title="%s" data-date="%s">
	        %s
	        </time>
	        </div>
			</div>""" % (p['fbk_id'], obj_config['name'], p['message'], p['created_timestamp'], p['date'], p['sanitized_timestamp']), "html.parser" )


		main.append(soup_post)

	now = datetime.now(local_tz)
	fbk_util_comment = soup.new_string("Generated by fbk_utils %s" % now.isoformat(), Comment)
	main.append(fbk_util_comment)

	if full:
		body.append(main_body)
		write_outfile( soup.prettify(), '.', 'wall-full.html' )
	else:
		write_outfile( main_body.prettify(), '.', 'wall-posts.html' )

	cxn.close()

	return
Ejemplo n.º 57
0
def xkcdify(content):
    """
    Replace text within a string as specified by the xkcd Substitutions comics.

    This takes an HTML fragment and replaces the text accordingly, wrapping the
    resulting substitutions in span tags.

    :param content: Original content with text to be replaced.
    :returns: Resulting content after xkcd substitutions.
    """

    def sub(matchobj):
        match = matchobj.group()
        key = match.lower().replace("-", " ")
        key1 = re.escape(key)
        # note: rstrip("'s") strips trailing "'" and "s" characters (a character
        # set), not the literal "'s" suffix
        key2 = re.escape(key.rstrip("'s"))

        # First, check if the match has a substitution.
        # If it doesn't, check as if the match were plural or possessive.
        if key1 in subs:
            result = subs[key1]
        elif key2 in subs:
            result = subs[key2]
            # If the pattern encountered a match that's the plural or
            # possessive form of a key, modify the return value accordingly.
            if match.endswith("s"):
                result = result + "s"
            elif match.endswith("'"):
                result = result + "'"
        else:
            return ""

        return result

    # Get all the plain text strings in the document without their tags.
    soup = BeautifulSoup(content, 'html.parser')
    content_strings = [element for element in soup.recursiveChildGenerator() \
                       if type(element) == NavigableString]

    for string in content_strings:
        # Use index to track where the current substring of plain text starts.
        index = 0

        # Use wrapper to string together plain text and span elements.
        wrapper_tag = soup.new_tag('span')

        # Upon each match, write to the wrapper the substitution result and the
        # plain text preceding it. Then update index to the position after the
        # matched substring to mark the start of the next plain text substring.
        for match in pattern.finditer(string):
            wrapper_tag.append(soup.new_string(string[index:match.start()]))
            replacement = soup.new_tag('span',
                                       **{
                                           'class': 'substitution',
                                           'data-tooltip': match.group()
                                       })
            replacement.string = sub(match)
            if replacement.string:
                wrapper_tag.append(replacement)
            else:
                wrapper_tag.append(soup.new_string(match.group()))
            index = match.end()

        # Keep the original plain text unless substitutions were made.
        if wrapper_tag.contents:
            # Only append the rest of the string if substitutions were made,
            # because we would otherwise be left with the full original string.
            wrapper_tag.append(string[index:])
            string.replace_with(wrapper_tag)
            wrapper_tag.unwrap()

    return unicode(soup)
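xkcdify() relies on module-level subs and pattern objects that are not part of this snippet, so the sketch below defines hypothetical ones; it assumes it lives in the same module as the function (Python 2, given the unicode() call):

# Usage sketch with a hypothetical substitution table and pattern; the real
# module builds these from the xkcd Substitutions comics.
import re

subs = {'witnesses': 'these dudes I know'}              # hypothetical
pattern = re.compile(r'\bwitnesses\b', re.IGNORECASE)   # hypothetical

print xkcdify('<p>Both witnesses agreed.</p>')
# -> <p>Both <span class="substitution" data-tooltip="witnesses">these dudes
#    I know</span> agreed.</p>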
Ejemplo n.º 58
0
    def sanitise_html(text, is_html):
        if not is_html:
            # Plain text - generate HTML
            soup = BeautifulSoup("", "html.parser")  # explicit empty document to build the output into
            paras = text.split('\n')
            for para in paras:
                # Skip empty paragraphs
                if re.search(r'\S', para) is None:
                    continue
                
                tag = soup.new_tag("p")
                
                # Attempt to make links and add text to the tag
                while True:
                    mo = re.search(r'http://\S+', para)
                    if mo is None: 
                        # no links found - add remaining text to tag and finish
                        tag.append(soup.new_string(para))
                        break

                    # Add text before link (if any) as string
                    if mo.start() > 0:
                        tag.append(soup.new_string(para[:mo.start()]))

                    # Strip final punctuation off link target, if applicable
                    if re.match(r'.*[.,;/()]$', mo.group(0)) is not None:
                        link_href = para[ mo.start() : mo.end() - 1]
                        para = para[ mo.end() - 1:]
                    else:
                        link_href = mo.group(0)
                        para = para[ mo.end() :]

                    link_tag = soup.new_tag("a", href=link_href)
                    if len(link_href) <= 25:
                        link_tag.append(link_href)
                    else:
                        link_tag.append(link_href[:22] + '...')
                    tag.append(link_tag)

                soup.append(tag)

        else:
            # HTML - store sanitized HTML in the database
            blacklist = ['script', 'style']
            whitelist = { 'a' : ['href'],
                          'p' : None,
                          'div' : None,
                          'span' : None,
                          'br' : None,
                          'table' : None,
                          'tr' : None,
                          'td' : None,
                          'th' : None,
                          'thead' : None,
                          'tbody' : None,
                          'ul' : None,
                          'ol' : None,
                          'li' : None,
                          'b' : None,
                          'strong' : None,
                          'i' : None,
                          'em' : None,
                          'u' : None,
                          'strike' : None,
                        }

            soup = BeautifulSoup(text)

            for tag in soup.findAll():
                if tag.name.lower() in blacklist:
                    # remove including all children
                    tag.extract()
                elif tag.name.lower() not in whitelist:
                    # remove, retaining children
                    tag.unwrap()
                else:
                    # remove disallowed attributes (iterate over a copy of the
                    # keys, since tag.attrs is modified inside the loop)
                    permitted_attrs = whitelist[tag.name.lower()]
                    for attr in list(tag.attrs):
                        if permitted_attrs is None or attr not in permitted_attrs:
                            del tag.attrs[attr]

        return soup.decode(formatter='html')
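A short usage sketch for the two branches above, calling sanitise_html as a plain function (in the original it is presumably a static helper on a larger class, since it takes no self):

# Plain-text input gets paragraph/link markup; HTML input is sanitised against
# the whitelist (script dropped, onclick stripped).
plain = sanitise_html("Docs at http://example.com/guide.\n\nSecond paragraph.", is_html=False)
# -> <p>Docs at <a href="http://example.com/guide">http://example.com/guide</a>.</p><p>Second paragraph.</p>

rich = sanitise_html('<p onclick="steal()">Hi <script>alert(1)</script><b>there</b></p>', is_html=True)
# -> <p>Hi <b>there</b></p>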