Example #1
0
def create_html_tag(tag, is_self_closing=False, attrs=None):
    """Create and return a new tag element named *tag*.

    A self-closing tag is produced by parsing an XML fragment (XML parsers
    emit "<tag/>" for empty elements); otherwise the shared soup factory
    creates a normal tag. When *attrs* is given it replaces the tag's
    attribute dict wholesale.
    """
    assert tag is not None, 'Tham số tag không được là None'
    if is_self_closing:
        # Parsing "<tag>" with the XML parser yields a self-closing element.
        element = BeautifulSoup('<%s>' % tag, features='xml').find(tag)
    else:
        element = _BEAUTIFUL_SOUP.new_tag(tag)
    if attrs is not None:
        element.attrs = attrs
    return element
Example #2
0
    def pre_process(self, content, css):
        """Parse *content*, narrow it to the first node matching *css*, then
        strip ignored tags, comments, all attributes, and content-less tags.

        :param content: raw HTML string.
        :param css: CSS selector; the first matching element becomes the
            working tree (raises IndexError when nothing matches).
        :return: the cleaned BeautifulSoup element.
        """
        soup = BeautifulSoup(content, 'lxml')

        # Select content with CSS selector (first match only)
        soup = soup.select(css)[0]

        # Remove tags listed in self.ignored_tags. find_all() returns a
        # list, so extracting while iterating is safe. Plain for-loops
        # replace the original side-effect-only list comprehensions.
        for tag in soup.find_all(self.ignored_tags):
            tag.extract()

        # Remove comments
        for comment in soup(text=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Remove all attributes
        soup.attrs = {}
        for tag in soup.find_all(True):
            tag.attrs = {}

        # Remove all tags without content (only newlines and tabs do not
        # count as content; plain spaces deliberately do).
        for tag in soup.find_all(True):
            if tag.get_text().replace("\n", "").replace("\t", "") == "":
                tag.extract()

        return soup
Example #3
0
    def translate(self, src_text: str, to_lang: str, from_lang=None) -> str:
        """Translates src_text to `to_lang`.

        If `from_lang` is not valid, auto-detection is performed to find it.

        :param src_text: HTML fragment (or plain text) to translate.
        :param to_lang: target language code.
        :param from_lang: source language code, or None/invalid to let the
            backend auto-detect.
        :return: the translated markup, or ``src_text`` unchanged when no
            translation is needed.
        """
        # Dry-run mode: echo the input back untouched.
        if self._dummy: return src_text  # noqa: E701
        # A ja/zh source containing only ASCII (ignoring whitespace) holds
        # no translatable prose, so pass it through.
        if from_lang in {'ja', 'zh'} and all(
                ord(c) < 127 for c in src_text if not c.isspace()):
            return src_text
        src_soup = BeautifulSoup(src_text, 'html.parser')
        if not src_soup.get_text().strip():
            return src_text  # e.g. a self-closing tag with no text content

        # Consult the caches (current and previous) before translating.
        if from_lang:
            with _lock:
                cache, old_cache = self._get_cache(from_lang, to_lang)
                target_text = cache.get(src_text)
                if not target_text and (src_text in old_cache):
                    # Promote a hit from the old cache into the current one.
                    cache[src_text] = target_text = old_cache[src_text]
                if target_text: return target_text  # noqa: E701

        # When the whole input is a single wrapping tag, translate its
        # children individually and re-emit the wrapper (attributes intact)
        # around the translated pieces.
        if len(src_soup.contents) == 1 and isinstance(src_soup.contents[0],
                                                      Tag):
            src_children = src_soup.contents[0].children
            full_target_str = src_text[:src_text.find('>', 1) +
                                       1]  # attributes are included
            target_tail_str = f'</{src_soup.contents[0].name}>'
        else:
            src_children = src_soup.children
            full_target_str = target_tail_str = ''

        for src_tag in src_children:
            attrs = {}
            if isinstance(src_tag, Tag):
                # Strip attributes before translating; restored below on the
                # translated tag so the backend never sees them.
                attrs = src_tag.attrs
                src_tag.attrs = {}
            target_str = self._do_translation(
                str(src_tag), from_lang, to_lang,
                isinstance(src_tag, NavigableString))
            if not target_str: continue  # noqa: E701
            if attrs:
                target_tag = BeautifulSoup(target_str,
                                           'html.parser').contents[0]
                target_tag.attrs = attrs
                target_str = str(target_tag)
            full_target_str += target_str

        full_target_str += target_tail_str
        return full_target_str
Example #4
0
def cleanarticle(soup: BeautifulSoup) -> BeautifulSoup:
    """Strip attributes, unwanted tags and social-media boilerplate links
    from *soup* and return the cleaned tree."""
    soup.attrs = None
    soup = removeattrs(soup)
    soup = striptags(soup)
    # Anchors pointing at the author's social-media profiles are
    # boilerplate; href=None also matches anchors without an href at all.
    badlinks = [
        None,
        'https://t.me/evo_lutio',
        'https://facebook.com/psychoalchemy.ru/',
        'https://www.youtube.com/channel/UCjl7ABlrO8mrtdNabYGb9bQ',
        'https://www.instagram.com/evo_lutio/',
        'https://vk.com/psychoalchemy',
        'https://twitter.com/evo_lutio',
    ]
    for anchor in soup.find_all('a', href=badlinks):
        anchor.decompose()
    return soup
Example #5
0
    if tag.name == 'sup' or ('href' in tag.attrs and 'cmnt_ref' in tag.attrs['href']):
        return False
    return True
def clean_style(key, val):
    """Return *val* with every junk style declaration removed when *key*
    is 'style'; any other attribute value passes through untouched.
    """
    if key == 'style':
        for junk in junk_styles:
            # Remove the declaration both with and without its trailing
            # semicolon so list position does not matter.
            val = val.replace(junk + ';', '')
            val = val.replace(junk, '')
    return val
    
# Clean every HTML file two directories deep and write the result into
# the "cleaned" directory.
os.makedirs('cleaned', exist_ok=True)  # portable, no shell-out for `mkdir -p`
for fnm in glob('*/*/*.html'):
    with open(fnm, 'r') as f:
        body = BeautifulSoup(f).body

    # Scrub attributes down to cleaned style declarations and drop tags
    # rejected by tagOK().
    body.attrs = {}
    for tag in body.findAll():
        # dict.items(): .iteritems() is Python 2 only and raises
        # AttributeError on Python 3 (which this file's open(..., encoding=)
        # call already requires).
        tag.attrs = {key: clean_style(key, val) for key, val in tag.attrs.items()}
        if not tagOK(tag):
            tag.extract()

    # remove the comments
    for div in body.findAll('div', {'style': 'margin:5px;border:1px solid black'}):
        div.extract()

    with open('cleaned/{}'.format(fnm.split('/')[-1]), 'w+', encoding='utf-8') as f:
        # str is already unicode on Python 3, so the former unicode(...)
        # wrapper is unnecessary (and a NameError).
        out = ''.join(
            [str(tag) for tag in body.contents]
        ).replace(' style=""', '').replace('\t', '')
        # The original opened the output file but never wrote to it.
        f.write(out)
Example #6
0
def transcription(file_tei, config_file):
    """Convert the <body> of a TEI XML file into per-page HTML transcription
    divs, grouped by document part (will / envelope / codicil).

    :param file_tei: path to the TEI source file (UTF-8).
    :param config_file: path to a JSON config whose 'tags' entry maps output
        tag names to lists of TEI tag names.
    :return: dict with keys "will", "envelope" and "codicil", each a list of
        serialized page <div> strings.
    """
    with open(config_file) as json_file:
        config = json.load(json_file)
    with open(file_tei, encoding='utf8') as fp:
        soup = BeautifulSoup(fp, "lxml-xml")
    body = soup.find('body')
    # Rewrite every configured TEI tag into its HTML counterpart.
    for x, y in config['tags'].items():
        for tag in y:
            for node in body.find_all(tag):
                convertTag(node, x, tag, config)

    # for node in body.find_all("p"):
    # 	node.insert(0, " ")
    # 	node.insert(len(node.contents), " ")
    output = {"will": [], "envelope": [], "codicil": []}
    output_div = []
    # Keep only the divs whose @type matches one of the output buckets.
    for item in body.find_all("div"):
        if item.has_attr('type') and item['type'] in output.keys():
            output_div.append(item)

    for item in output_div:
        # Pages are delimited by div.pb markers; everything between two
        # markers accumulates into one "transcription" page div.
        page = item.find("div", {"class": "pb"})
        page_div = BeautifulSoup(features="html.parser").new_tag('div')
        page_div.attrs = {"id": page['id'], "class": "transcription"}
        page_div.append("")
        output_ = []
        tags = []
        prev_tags = []
        for tag in page.next_siblings:
            if tag is not None and tag.name is not None:
                if tag.get('class') == "pb":
                    # Page break: flush accumulated tags as a finished page.
                    page_div.extend(tags)
                    # Handle comment nodes <!-- -->
                    for element in page_div(
                            text=lambda it: isinstance(it, Comment)):
                        element.extract()
                    wrap_ul(page_div)
                    output_.append(str(page_div))
                    tags = []
                    page_div = BeautifulSoup(
                        features="html.parser").new_tag('div')
                    page_div.attrs = {
                        "id": tag['id'],
                        "class": "transcription"
                    }
                elif tag.find("div", {"class": "pb"}) is not None:
                    # Nested page break: delegate to parse_paragraph, which
                    # returns the updated accumulation state.
                    [tags, page_div, output_,
                     prev_tags] = parse_paragraph(tag.next_element, tags,
                                                  page_div, output_,
                                                  "transcription", prev_tags)
                elif tag.name == "p":
                    # tags = list(filter(lambda a: a != '\n', tags))
                    # Wrap pending loose (non-block) nodes in a <p> before
                    # appending the new paragraph.
                    for element in tags:
                        if element not in ['\n', ' ']:
                            if isinstance(
                                    element,
                                    NavigableString) or element.name not in [
                                        "p", "ul", "li"
                                    ]:
                                new_tag = BeautifulSoup(
                                    features="html.parser").new_tag('p')
                                new_tag.extend(tags)
                                tags.clear()
                                tags.append(new_tag)
                                break
                    tags.append(tag)
                else:
                    tags.append(tag)

        page_div.extend(tags)

        # Map TEI @rend on <p> to a CSS class.
        for node_p in page_div.find_all("p"):
            if node_p.has_attr('rend'):
                node_p.attrs = {"class": "p-" + node_p["rend"]}
        # Handle comment nodes <!-- -->
        for element in page_div(text=lambda it: isinstance(it, Comment)):
            element.extract()
            # comment_soup = BeautifulSoup(comment, "html.parser")
            # for x, y in config['tags'].items():
            # y_lower = [x.lower() for x in y]
            # for tag in y_lower:
            # for node in comment_soup.find_all(tag):
            # convertTag(node, x, y[y_lower.index(tag)], config)
            # comment.replace_with(Comment(str(comment_soup)))

        wrap_ul(page_div)
        output_.append(str(page_div))
        output[item['type']] = output_

    return output
Example #7
0
def parse_paragraph(tag, tags, page_div, output, type_class, prev_tags):
    """Walk *tag* and its following siblings, accumulating nodes into *tags*
    and flushing them into *page_div* whenever a page-break marker (div.pb)
    is encountered.

    :param tag: first node to process (typically a ``.next_element``).
    :param tags: nodes accumulated for the current page so far.
    :param page_div: the <div> being filled for the current page.
    :param output: list of serialized page divs; appended to on page breaks.
    :param type_class: CSS class for newly created page divs
        (e.g. "transcription" or "edition").
    :param prev_tags: stack of {parent-tag-name: [children]} dicts holding
        nodes whose original parent was not a <div>; rebuilt into nested
        tags when a page break is reached.
    :return: ``[tags, page_div, output, prev_tags]`` — the updated state.
    """
    # Record the node under its non-div parent so the parent element can be
    # reconstructed later.
    if tag.parent.name and tag.parent.name != "div":
        new_element = dict()
        new_element[tag.parent.name] = []
        if isinstance(tag, NavigableString) and tag.string != "\n":
            new_element[tag.parent.name].append(tag.string)
        prev_tags.insert(0, new_element)
    if tag is not None and tag.name is not None:
        if tag.get('class') == "pb":
            # Rebuild the pending parent chain innermost-first, nesting each
            # level into the next, then append the outermost to tags.
            if len(prev_tags) > 0:
                for i in range(len(prev_tags) - 1):
                    for key in prev_tags[i].keys():
                        new_tag = BeautifulSoup(
                            features="html.parser").new_tag(key)
                        new_tag.extend(prev_tags[i][key])
                        for key_bis in prev_tags[i + 1].keys():
                            prev_tags[i + 1][key_bis].append(new_tag)
                for key in prev_tags[len(prev_tags) - 1].keys():
                    new_tag = BeautifulSoup(
                        features="html.parser").new_tag(key)
                    new_tag.extend(prev_tags[len(prev_tags) - 1][key])
                    tags.append(new_tag)
                    # NOTE(review): prev_tags is reset inside this loop body;
                    # it looks like it was meant to sit one level out —
                    # confirm before changing.
                    prev_tags = []
            # Wrap pending loose (non-block) nodes in a <p>.
            for element in tags:
                if element not in ['\n', ' ']:
                    if isinstance(element,
                                  NavigableString) or element.name not in [
                                      "p", "ul", "li"
                                  ]:
                        new_tag = BeautifulSoup(
                            features="html.parser").new_tag('p')
                        new_tag.extend(tags)
                        tags.clear()
                        tags.append(new_tag)
                        break
            # Flush the finished page and start a fresh page div.
            page_div.extend(tags)
            for element in page_div(text=lambda it: isinstance(it, Comment)):
                element.extract()
            wrap_ul(page_div)
            output.append(str(page_div))
            tags = []
            page_div = BeautifulSoup(features="html.parser").new_tag('div')
            page_div.attrs = {"id": tag.get('id'), "class": type_class}

    # Continue with the node's following siblings.
    for item in tag.next_siblings:
        if item is not None and item.name is not None:
            if item.get('class') == "pb":
                # Same rebuild-and-flush sequence as above, triggered by a
                # page break among the siblings.
                if len(prev_tags) > 0:
                    for i in range(len(prev_tags) - 1):
                        for key in prev_tags[i].keys():
                            new_tag = BeautifulSoup(
                                features="html.parser").new_tag(key)
                            new_tag.extend(prev_tags[i][key])
                            for key_bis in prev_tags[i + 1].keys():
                                prev_tags[i + 1][key_bis].append(new_tag)
                    for key in prev_tags[len(prev_tags) - 1].keys():
                        new_tag = BeautifulSoup(
                            features="html.parser").new_tag(key)
                        new_tag.extend(prev_tags[len(prev_tags) - 1][key])
                        tags.append(new_tag)
                        prev_tags = []
                # p_tag = BeautifulSoup(
                #     features="html.parser").new_tag("p")
                # p_tag.extend(tags)

                for element in tags:
                    if element not in ['\n', ' ']:
                        if isinstance(element,
                                      NavigableString) or element.name not in [
                                          "p", "ul", "li"
                                      ]:
                            new_tag = BeautifulSoup(
                                features="html.parser").new_tag('p')
                            new_tag.extend(tags)
                            tags.clear()
                            tags.append(new_tag)
                            break
                page_div.extend(tags)
                for element in page_div(
                        text=lambda it: isinstance(it, Comment)):
                    element.extract()
                wrap_ul(page_div)
                output.append(str(page_div))
                tags = []
                page_div = BeautifulSoup(features="html.parser").new_tag('div')
                page_div.attrs = {"id": item.get('id'), "class": type_class}
            elif item.find("div", {"class": "pb"}) is not None:
                # Nested page break: recurse into the subtree.
                [tags, page_div, output,
                 prev_tags] = parse_paragraph(item.next_element, tags,
                                              page_div, output, type_class,
                                              prev_tags)
            else:
                # Ordinary node: either accumulate directly or attach it to
                # the innermost pending non-div parent.
                if len(prev_tags) == 0:
                    tags.append(item)
                else:
                    if tag.parent.name in prev_tags[0]:
                        prev_tags[0][tag.parent.name].append(item)

        else:
            if len(prev_tags) == 0:
                tags.append(item)
            else:
                if tag.parent.name in prev_tags[0]:
                    prev_tags[0][tag.parent.name].append(item)

    return [tags, page_div, output, prev_tags]
Example #8
0
def edition(file_tei, config_file):
    """Convert the <body> of a TEI XML file into per-page HTML "edition"
    divs, grouped by document part (will / envelope / codicil).

    Unlike the transcription view, editorial tags (sic, abbr, note, space,
    del) are dropped entirely, and the remaining TEI tags are renamed in
    place with CSS classes derived from the config and node attributes.

    :param file_tei: path to the TEI source file (UTF-8).
    :param config_file: path to a JSON config with 'tags' (output tag name
        -> list of TEI tags) and 'attrs' (attributes folded into the class).
    :return: dict with keys "will", "envelope" and "codicil", each a list of
        serialized page <div> strings.
    """
    # French labels describing where an addition was made (used in @title).
    add_translate = {
        "above": "au dessus",
        "below": "au dessous",
        "marginLeft": "en marge à gauche",
        "marginRight": "en marge à droite",
        "marginBottom": "en marge inférieur",
        "marginTop": "en marge supérieur",
        "inline": "dans la ligne"
    }

    with open(config_file) as json_file:
        config = json.load(json_file)
    with open(file_tei, encoding='utf8') as fp:
        soup = BeautifulSoup(fp, "lxml-xml")
    body = soup.find('body')
    for x, y in config['tags'].items():
        for tag in y:
            for node in body.find_all(tag):
                # Editorial apparatus is dropped from the edition view.
                if node.name in ["sic", "abbr", "note", "space", "del"]:
                    # if node.name in ["lb", "space"]:
                    # 	if node.name == "lb" and isinstance(node.previous_sibling, NavigableString):
                    # 		str_ = node.previous_sibling.strip()
                    # 		if len(str_) == 0 or str_[len(str_)-1] != '-':
                    # 			space = BeautifulSoup(features="html.parser").new_tag('span')
                    # 			space.attrs = {"class": node.name + "-edition"}
                    # 			space.string = "  "
                    # 			node.insert_after(space)
                    # 		else:
                    # 			node.previous_sibling.replace_with(str_.replace('-', ''))
                    node.decompose()
                else:
                    # if node.name in ["persName", "item", "placeName", "date", "addrLine", "surname"]:
                    #     node.insert(0, "")
                    #     # if node.name == "persName" :
                    #     # 	node.next_element.next_element.insert_after(" ")
                    #     next_str = node.parent.next_sibling
                    #     # if next_str is not None and len(next_str) > 1 and next_str[0] not in [".", ","]:
                    #     #     node.insert(len(node.contents), "")

                    # elif node.name in ["expan"] and node.string is not None:
                    #     node.string = "" + node.string + ""
                    # Fold configured attributes and @rend into the class;
                    # keep xml:lang and map @facs onto @id.
                    new_attrs = dict()
                    new_attrs['class'] = tag
                    for old_attr in node.attrs:
                        if old_attr in config['attrs']:
                            new_attrs['class'] += "-" + node.attrs[old_attr]
                        if old_attr == "xml:lang":
                            new_attrs["xml:lang"] = node.get("xml:lang")
                        elif old_attr == "facs":
                            new_attrs["id"] = node.attrs[old_attr]
                    if "rend" in node.attrs:
                        new_attrs['class'] += "-" + node.attrs["rend"]
                    if node.name == "unclear":
                        new_attrs['title'] = "transcription incertaine"
                    if node.name == "add":
                        if 'place' in node.attrs:
                            new_attrs['title'] = "ajout " + \
                                add_translate[node['place']]
                            # new_attrs['class'] = "add-" + node['place']
                            # node.insert(0, "\\")
                            # node.insert(len(node.contents), "/ ")
                    elif node.name == "supplied":
                        node.insert(0, "{")
                        node.insert(len(node.contents), "}")
                    node.name = x
                    node.attrs = new_attrs
                    if len(node.contents) == 0 and node.name == "span":
                        node.string = ""

    # for node in body.find_all("p"):
    # 	node.insert(0, " ")
    # 	node.insert(len(node.contents), " ")
    output = {"will": [], "envelope": [], "codicil": []}
    output_div = []
    # Keep only the divs whose @type matches one of the output buckets.
    for item in body.find_all("div"):
        if item.has_attr('type') and item['type'] in output.keys():
            output_div.append(item)

    for item in output_div:
        # Pages are delimited by div.pb markers; everything between two
        # markers accumulates into one "edition" page div.
        page = item.find("div", {"class": "pb"})
        page_div = BeautifulSoup(features="html.parser").new_tag("div")
        page_div.attrs = {"id": page['id'], "class": "edition"}
        page_div.append("")
        tags = []
        output_ = []
        prev_tags = []
        for tag in page.next_siblings:
            if tag is not None and tag.name is not None:
                if tag.get('class') == "pb":
                    # Page break: flush accumulated tags as a finished page.
                    page_div.extend(tags)
                    # Handle comment nodes <!-- -->
                    for element in page_div(
                            text=lambda it: isinstance(it, Comment)):
                        element.extract()
                    wrap_ul(page_div)
                    output_.append(str(page_div))
                    tags = []
                    page_div = BeautifulSoup(
                        features="html.parser").new_tag('div')
                    page_div.attrs = {"id": tag.get('id'), "class": "edition"}
                elif tag.find("div", {"class": "pb"}) is not None:
                    # Nested page break: delegate to parse_paragraph, which
                    # returns the updated accumulation state.
                    [tags, page_div, output_,
                     prev_tags] = parse_paragraph(tag.next_element, tags,
                                                  page_div, output_, "edition",
                                                  prev_tags)
                elif tag.name == "p":
                    # Wrap pending loose (non-block) nodes in a <p> before
                    # appending the new paragraph.
                    for element in tags:
                        if element not in ['\n', ' ']:
                            if isinstance(
                                    element,
                                    NavigableString) or element.name not in [
                                        "p", "ul", "li"
                                    ]:
                                new_tag = BeautifulSoup(
                                    features="html.parser").new_tag('p')
                                new_tag.extend(tags)
                                tags.clear()
                                tags.append(new_tag)
                                break
                    tags.append(tag)
                else:
                    tags.append(tag)

        page_div.extend(tags)
        # for comment in page_div.find_all(string=lambda text: isinstance(text, Comment)):
        #	comment_soup = BeautifulSoup(comment, "html.parser")
        #	for x, y in config['tags'].items():
        #		y_lower = [x.lower() for x in y]
        #		for tag in y_lower:
        #			for node in comment_soup.find_all(tag):
        #				convertTag(node, x, y[y_lower.index(tag)], config)
        #	comment.replace_with(Comment(str(comment_soup)))

        # Map TEI @rend on <p> to a CSS class.
        for node_p in page_div.find_all("p"):
            if node_p.has_attr('rend'):
                node_p.attrs = {"class": "p-" + node_p["rend"]}

        # Handle comment nodes <!-- -->
        for element in page_div(text=lambda it: isinstance(it, Comment)):
            element.extract()
        wrap_ul(page_div)
        output_.append(str(page_div))
        output[item['type']] = output_
    return output
Example #9
0
def render_table_odt(elem, doc):
    """Render a panflute table element as ODT (OpenDocument) XML.

    Builds the numbered caption paragraph, the column definitions, every
    row/cell (with one automatic cell style per cell so padding and borders
    can vary), and the optional footnote definitions. Generated automatic
    styles are appended to ``doc.auto_styles``; the serialized table markup
    is returned.

    :param elem: panflute element whose first child is the table and whose
        optional second child holds footnote definition items.
    :param doc: panflute Doc; used for table numbering and style collection.
    :return: the ODT table markup as a string.
    """
    table = elem.content[0]
    # Hierarchical table number, e.g. ("2", "1") renders as "2.1".
    table_number = tuple(
        str(i) for i in utils.get_elem_count(doc, pf.Table, register="table"))
    table_name = "Table{}".format("_".join(str(i) for i in table_number))
    #
    table_root = BeautifulSoup("", "xml")

    if hasattr(table, "caption") and table.caption:
        colon = ": "
        caption = "".join(pf.stringify(c) for c in table.caption)
    else:
        colon = ""
        caption = ""

    # Caption paragraph: "Table <auto-numbered sequence><colon><caption>".
    caption_odt = utils.create_nested_tags(
        **{
            "name":
            "text:p",
            "attrs": {
                "text:style-name": "Table"
            },
            "contents": [
                {
                    "name":
                    "text:span",
                    "attrs": {
                        "text:style-name": "Strong_20_Emphasis"
                    },
                    "contents": [
                        "Table ",
                        {
                            "name": "text:sequence",
                            "attrs": {
                                "text:ref-name": f"ref{table_name}",
                                "text:name": "Table",
                                "text:formula": "ooow:Table+1",
                                "style:num-format": "1",
                            },
                            "contents": [".".join(table_number)],
                        },
                        colon,
                    ],
                },
                caption,
            ],
        })

    table_root.contents.append(caption_odt)

    table_odt = utils.create_nested_tags(
        **{
            "name": "table:table",
            "attrs": {
                "table:name": table_name,
                "table:style-name": table_name,
                "table:template-name": "Default Style",
            },
        })

    table_root.contents.append(table_odt)

    # Distribute the width not claimed by explicit column widths evenly
    # over the unspecified (falsy) ones.
    # NOTE(review): if every column specifies a width, unspecified_widths is
    # 0 and this division raises ZeroDivisionError — confirm inputs always
    # leave at least one width unset, or guard the division.
    unoccupied_width = 1 - sum(table.width)
    unspecified_widths = len([w for w in table.width if not w])
    remaining_for_each = unoccupied_width / unspecified_widths

    widths = [w if w else remaining_for_each for w in table.width]

    # We want the table to occupy a maximum width
    widths = map(lambda x: x * table.total_width, widths)

    column_style_names, column_styles, column_definitions = zip(
        *create_column_definitions(widths, table_name))

    pf.debug(column_style_names, column_styles, column_definitions)

    # Automatic styles accumulate here and are registered on the doc below.
    styles = BeautifulSoup("", "xml")
    styles.contents = list(column_styles)

    table_odt.contents.extend(column_definitions)

    for r, row in enumerate(table.content):
        row_odt = Tag(name="table:table-row")
        row_odt.attrs = {
            "table:style-name":
            "{table_name}.{r}".format(table_name=table_name, r=r + 1)
        }

        # One style per cell (None for covered cells) so borders/padding can
        # be adjusted per row further down.
        row_cell_styles = []

        for c, cell in enumerate(row.content):

            if cell.covered:
                # Cell hidden by a span from a previous column.
                cell_odt = Tag(name="table:covered-table-cell")
                row_odt.contents.append(cell_odt)

                row_cell_styles.append(None)
            else:
                cell_odt = Tag(name="table:table-cell")

                cell_style_name = "{column_style}{r}".format(
                    column_style=column_style_names[c], r=r + 1)

                cell_style = Tag(name="style:style")
                cell_style.attrs = {
                    "style:name": cell_style_name,
                    "style:family": "table-cell",
                    "style:writing-mode": "page",
                }
                style_cell_properies = Tag(name="style:table-cell-properties")
                style_cell_properies.attrs = {
                    "fo:padding-left": "0.10cm",
                    "fo:padding-right": "0.10cm",
                    "fo:padding-top": "0.10cm",
                    "fo:padding-bottom": "0.10cm",
                    "style:vertical-align": "bottom",
                }
                style_background_image = Tag(name="style:background-image")
                style_cell_properies.contents.append(style_background_image)
                cell_style.contents.append(style_cell_properies)

                row_cell_styles.append(cell_style)

                cell_odt.attrs = {
                    "table:style-name": cell_style_name,
                    "office:value-type": "string",
                }

                if cell.col_span > 1:
                    cell_odt.attrs[
                        "table:number-columns-spanned"] = cell.col_span

                if cell.content:
                    # Convert the cell's panflute content to ODT and re-style
                    # its paragraphs according to the heading level.
                    cell_content = utils.panflute2output(
                        cell.content, format="opendocument").strip()

                    cell_content = BeautifulSoup(cell_content,
                                                 "lxml").html.body

                    text_p = re.compile("text:p")

                    for t in cell_content.find_all(text_p):
                        if cell.heading == 1:
                            t["text:style-name"] = "Table_20_Heading"
                        elif cell.heading == 2:
                            t["text:style-name"] = "Table_20_Subheading"
                        else:
                            t["text:style-name"] = "Table_20_Contents"

                        if cell.vertical:
                            # Wrap the paragraph contents in a span carrying
                            # the vertical-writing style.
                            t_contents = t.contents
                            t.contents = [
                                utils.create_nested_tags(
                                    **{
                                        "name": "text:span",
                                        "attrs": {
                                            "text:style-name": "Vertical"
                                        },
                                        "contents": t_contents,
                                    })
                            ]
                    cell_odt.contents = cell_content.contents
                else:
                    # Empty cell still needs a paragraph node.
                    cell_content = Tag(name="text:p")
                    cell_content.attrs = {
                        "text:style-name": "Table_20_contents"
                    }
                    cell_odt.contents.append(cell_content)

                row_odt.contents.append(cell_odt)

        # Underlines are (start, stop) 1-based column ranges drawn as a
        # bottom border on the affected cell styles.
        if row.underlines:
            for underline in row.underlines:
                start = underline[0]
                stop = underline[1]

                for i in range(start - 1, stop):
                    cell_style = row_cell_styles[i]

                    if cell_style is None:
                        pass
                    else:
                        cell_style.contents[0].attrs[
                            "fo:border-bottom"] = "0.5pt solid #000000"

        # Extra spacing requested by this row or by the previous row's
        # bottom-space flag is added to the cells' top padding.
        add_top_space = table.content[r - 1].btm_space if r else False

        if row.top_space or add_top_space:
            for cell_style in row_cell_styles:
                if cell_style is not None:
                    padding_top = cell_style.contents[0].attrs[
                        "fo:padding-top"]

                    padding_top = (float(padding_top.strip("cm")) +
                                   0.05 * add_top_space + 0.05 * row.top_space)

                    cell_style.contents[0].attrs[
                        "fo:padding-top"] = f"{padding_top}cm"

        row_cell_styles = [cs for cs in row_cell_styles if cs is not None]
        styles.contents.extend(row_cell_styles)

        table_odt.contents.append(row_odt)

    # Optional footnote block: the second child's first element, when present.
    try:
        footer = elem.content[1].content[0]
    except IndexError:
        footer = None

    if footer is not None:
        for definition_item in footer.content:
            term = "".join(pf.stringify(e) for e in definition_item.term)

            definitions = [
                utils.panflute2output(d.content, format="opendocument")
                for d in definition_item.definitions
            ]
            definitions_parsed = BeautifulSoup("".join(definitions),
                                               "lxml").html.body.contents

            # Demote paragraphs to spans so each definition flows inline
            # after its superscript term.
            for t in definitions_parsed:
                if t.name == "text:p":
                    t.name = "text:span"
                    t.contents.insert(0, NavigableString(" "))

            definition = utils.create_nested_tags(
                **{
                    "name":
                    "text:p",
                    "attrs": {
                        "text:style-name": "Table_20_Legend"
                    },
                    "contents": [{
                        "name": "text:span",
                        "attrs": {
                            "text:style-name": "Superscript"
                        },
                        "contents": [term],
                    }] + definitions_parsed,
                })
            table_root.contents.append(definition)

    styles = "\n".join(c.prettify() for c in styles.contents)
    doc.auto_styles.append(styles)

    table = "\n".join(str(c) for c in table_root.contents)
    # pf.debug(table)

    return table
Example #10
0
def process_tree(html: str) -> str:
    """Strip an HTML document down to its text-bearing structure.

    Removes navigation and boilerplate tags, comments and punctuation-only
    text nodes, trims whitespace in the remaining text, deletes empty tags
    and clears every attribute.

    :param html: raw HTML document.
    :return: the cleaned <body> serialized back to a string.
    """
    to_remove_by_html_parser = ["nav"]
    to_remove_by_lxml = [
        "header",
        "footer",
        "script",
        "link",
        "style",
        "img",
        "svg",
        "i",
        "iframe",
        "input",
        "textarea",
        "a",
        "noscript",
        "label",
        "button",
        "br",
        "hr",
        "video",
        "audio",
        "option",
    ]
    # First pass with the lenient html.parser, then re-parse with lxml,
    # which normalizes the tree differently.
    soup = BeautifulSoup(html, "html.parser").body
    for tag_name in to_remove_by_html_parser:
        remove_all_tags(soup, tag_name)

    html = str(soup)

    soup = BeautifulSoup(html, "lxml").body
    for tag_name in to_remove_by_lxml:
        remove_all_tags(soup, tag_name)

    # find_all() returns a list, so extracting while iterating is safe here.
    for elem in soup.find_all(string=lambda x: isinstance(x, Comment)):
        elem.extract()

    # Drop punctuation-only text nodes; trim whitespace on the rest.
    for elem in soup.find_all(string=lambda x: isinstance(x, NavigableString)):
        text = elem.string.strip()
        if text in [",", ".", "/", "|", "!", ":"]:
            elem.extract()
        else:
            elem.replace_with(NavigableString(text))

    # let's remove empty tags or set attrs to {}
    def remove_empty_tags(node):
        # Iterate over a snapshot: decompose() removes the element from
        # node.contents, and mutating that list while iterating it would
        # silently skip the sibling following each removed element.
        for elem in list(node.contents):
            if isinstance(elem, Tag):
                remove_empty_tags(elem)
                if is_empty(elem):
                    elem.decompose()

    def clean_tags(node):
        for elem in node.contents:
            if isinstance(elem, Tag):
                clean_tags(elem)
                elem.attrs = {}

    remove_empty_tags(soup)
    clean_tags(soup)
    soup.attrs = {}
    return str(soup)
Example #11
0
    else:
        try:
            k = int(x)
        except:
            os.sys.exit("faild convert data-xcols \"%s\" to integer: " % x)

    if k < 1: k = d1.shape[1]
    n, m = divmod(d1.shape[1], k)
    if m > 0: n += 1

    for i in range(n):
        d2 = d1.iloc[:, (i * k):(i * k + k)]
        if d2.shape[1] == 0: continue

        t1 = BeautifulSoup(d2.to_html(na_rep=""), 'html5lib').find("table")
        t1.attrs = {}
        t1.find("th").append(idx)
        el.insert(i + 2, t1)

## process figure.mySlide
for el in main.find_all("figure", {"class": "mySlide"}):
    if el is None: continue
    p, ps = el.attrs.get("data-file"), el.attrs.get("data-file-match")
    if p is None and ps is None: continue
    for e in el.find_all("img"):
        e.decompose()

    fs = []
    if not (p is None) and p != "": fs.extend(p.relpace(",", " ").split())

    if not (ps is None):