def create_html_tag(tag, is_self_closing=False, attrs=None):
    """Create a new tag object.

    :param tag: name of the tag to create; must not be None.
    :param is_self_closing: when True, build the tag through an XML parse of
        ``'<tag>'`` so it serializes in self-closing form; otherwise use the
        shared ``_BEAUTIFUL_SOUP`` factory.
    :param attrs: optional dict of attributes assigned to the new tag.
    :raises ValueError: if ``tag`` is None.
    """
    # FIX: validation via `assert` silently disappears under `python -O`;
    # raise an explicit exception instead (was AssertionError before).
    if tag is None:
        raise ValueError('The tag parameter must not be None')
    if is_self_closing:
        # An XML parse of '<tag>' yields a tag that renders self-closed.
        new_tag = BeautifulSoup('<%s>' % tag, features='xml').find(tag)
    else:
        new_tag = _BEAUTIFUL_SOUP.new_tag(tag)
    if attrs is not None:
        new_tag.attrs = attrs
    return new_tag
def pre_process(self, content, css):
    """Select the node matched by ``css`` from ``content`` and clean it.

    Removes ignored tags, HTML comments, every attribute on the node and
    its descendants, and any tag whose text (ignoring newlines and tabs)
    is empty.

    :param content: HTML markup to parse.
    :param css: CSS selector; the first match becomes the working root.
    :returns: the cleaned node.
    :raises IndexError: if the selector matches nothing.
    """
    soup = BeautifulSoup(content, 'lxml')
    # Narrow to the first node matching the CSS selector.
    soup = soup.select(css)[0]
    # FIX: plain loops instead of list comprehensions built only for their
    # side effects.
    for tag in soup.find_all(self.ignored_tags):
        tag.extract()
    # Remove comments.
    for comment in soup(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    # Remove all attributes on the root and on every descendant.
    soup.attrs = {}
    for tag in soup.find_all(True):
        tag.attrs = {}
    # Remove tags carrying no visible text (only newlines/tabs are treated
    # as empty — plain spaces deliberately keep the tag alive).
    for tag in soup.find_all(True):
        if tag.get_text().replace("\n", "").replace("\t", "") == "":
            tag.extract()
    return soup
def translate(self, src_text: str, to_lang: str, from_lang=None) -> str:
    """Translates src_text to `to_lang`.

    If `from_lang` is not valid, auto-detection is performed to find it.
    Markup is preserved: a single wrapping tag is kept verbatim and only
    its children are sent for translation, with per-child attributes
    stripped before and restored after the call.
    """
    # Dummy mode: pass the text through untouched.
    if self._dummy: return src_text  # noqa: E701
    # CJK source that contains only ASCII is already "translated".
    if from_lang in {'ja', 'zh'} and all(
            ord(c) < 127 for c in src_text if not c.isspace()):
        return src_text
    src_soup = BeautifulSoup(src_text, 'html.parser')
    if not src_soup.get_text().strip():
        return src_text  # self-closed tag that without content
    if from_lang:
        # Consult the (lock-protected) cache; promote hits found in the
        # old cache into the current one.
        with _lock:
            cache, old_cache = self._get_cache(from_lang, to_lang)
            target_text = cache.get(src_text)
            if not target_text and (src_text in old_cache):
                cache[src_text] = target_text = old_cache[src_text]
        if target_text: return target_text  # noqa: E701
    if len(src_soup.contents) == 1 and isinstance(src_soup.contents[0], Tag):
        # Single wrapping tag: keep its opening markup (attributes
        # included) and closing tag literally, translate the children.
        src_children = src_soup.contents[0].children
        full_target_str = src_text[:src_text.find('>', 1) + 1]  # attributes are included
        target_tail_str = f'</{src_soup.contents[0].name}>'
    else:
        src_children = src_soup.children
        full_target_str = target_tail_str = ''
    for src_tag in src_children:
        # Strip attributes before translating; they are restored on the
        # translated tag afterwards so the service never sees them.
        attrs = {}
        if isinstance(src_tag, Tag):
            attrs = src_tag.attrs
            src_tag.attrs = {}
        target_str = self._do_translation(
            str(src_tag), from_lang, to_lang,
            isinstance(src_tag, NavigableString))
        if not target_str: continue  # noqa: E701
        if attrs:
            target_tag = BeautifulSoup(target_str, 'html.parser').contents[0]
            target_tag.attrs = attrs
            target_str = str(target_tag)
        full_target_str += target_str
    full_target_str += target_tail_str
    return full_target_str
def cleanarticle(soup: BeautifulSoup) -> BeautifulSoup:
    """Strip attributes and unwanted tags from an article, then drop
    boilerplate social-media anchors."""
    soup.attrs = None
    soup = removeattrs(soup)
    soup = striptags(soup)
    # Anchors pointing at these destinations are boilerplate.
    # NOTE(review): None in the list presumably also matches <a> tags with
    # no href at all — confirm against bs4's attribute-filter semantics.
    blocked_hrefs = [
        None,
        'https://t.me/evo_lutio',
        'https://facebook.com/psychoalchemy.ru/',
        'https://www.youtube.com/channel/UCjl7ABlrO8mrtdNabYGb9bQ',
        'https://www.instagram.com/evo_lutio/',
        'https://vk.com/psychoalchemy',
        'https://twitter.com/evo_lutio',
    ]
    for anchor in soup.find_all('a', href=blocked_hrefs):
        anchor.decompose()
    return soup
# NOTE(review): the two statements below are the tail of an enclosing
# predicate (referenced later as ``tagOK``) whose ``def`` line is outside
# this chunk; indentation is reconstructed.
    # Reject <sup> tags and anchors that reference comment markers.
    if tag.name == 'sup' or ('href' in tag.attrs and 'cmnt_ref' in tag.attrs['href']):
        return False
    return True


def clean_style(key, val):
    """Filter junk declarations out of a 'style' attribute value.

    Non-'style' attributes pass through unchanged. ``junk_styles`` is a
    module-level list (defined outside this view) of declarations to drop.
    """
    if key != 'style':
        return val
    # Strip each junk declaration, with or without its trailing ';'.
    for st in junk_styles:
        val = val.replace(st + ';', '')
        val = val.replace(st, '')
    return val


# Script body: clean every matched HTML file into the 'cleaned' directory.
# NOTE(review): this is Python 2 code (`iteritems`, `unicode`); it will not
# run under Python 3, and Python 2's builtin open() has no `encoding`
# keyword — presumably io.open was imported, verify upstream.
os.system('mkdir -p cleaned')
for fnm in glob('*/*/*.html'):
    with open(fnm, 'r') as f:
        body = BeautifulSoup(f).body
    body.attrs = {}
    for tag in body.findAll():
        # Scrub junk style declarations from every attribute dict.
        tag.attrs = {key: clean_style(key, val)
                     for key, val in tag.attrs.iteritems()}
        if not tagOK(tag):
            tag.extract()
    # remove the comments
    for div in body.findAll('div', {'style': 'margin:5px;border:1px solid black'}):
        div.extract()
    with open('cleaned/{}'.format(fnm.split('/')[-1]), 'w+', encoding='utf-8') as f:
        # NOTE(review): `out` is built but no write is visible in this
        # chunk — the write statement appears to be truncated away.
        out = unicode(
            ''.join([str(tag) for tag in body.contents])
            .replace(' style=""', '').replace('\t', ''),
            encoding='utf-8')
def transcription(file_tei, config_file):
    """Render the "transcription" view of a TEI document.

    Loads the JSON config (tag-conversion rules), parses the TEI file,
    converts every configured TEI tag via ``convertTag``, then splits the
    body into per-page ``<div class="transcription">`` blocks delimited by
    page-break (``pb``) markers.

    Returns a dict {"will": [...], "envelope": [...], "codicil": [...]}
    whose values are lists of HTML strings, one per page.
    """
    with open(config_file) as json_file:
        config = json.load(json_file)
    with open(file_tei, encoding='utf8') as fp:
        soup = BeautifulSoup(fp, "lxml-xml")
    body = soup.find('body')
    # config['tags'] maps a target tag name (x) to a list of TEI tag
    # names (y); convertTag rewrites each matching node in place.
    for x, y in config['tags'].items():
        for tag in y:
            for node in body.find_all(tag):
                convertTag(node, x, tag, config)
    # for node in body.find_all("p"):
    #     node.insert(0, " ")
    #     node.insert(len(node.contents), " ")
    output = {"will": [], "envelope": [], "codicil": []}
    # Keep only divs whose @type is one of the output categories.
    output_div = []
    for item in body.find_all("div"):
        if item.has_attr('type') and item['type'] in output.keys():
            output_div.append(item)
    for item in output_div:
        # The first page break opens the first page container.
        page = item.find("div", {"class": "pb"})
        page_div = BeautifulSoup(features="html.parser").new_tag('div')
        page_div.attrs = {"id": page['id'], "class": "transcription"}
        page_div.append("")
        output_ = []
        tags = []
        prev_tags = []
        for tag in page.next_siblings:
            if tag is not None and tag.name is not None:
                if tag.get('class') == "pb":
                    # New page break: flush accumulated content into the
                    # current page div and start a fresh one.
                    page_div.extend(tags)
                    # Strip HTML comments <!-- -->
                    for element in page_div(
                            text=lambda it: isinstance(it, Comment)):
                        element.extract()
                    wrap_ul(page_div)
                    output_.append(str(page_div))
                    tags = []
                    page_div = BeautifulSoup(
                        features="html.parser").new_tag('div')
                    page_div.attrs = {"id": tag['id'],
                                      "class": "transcription"}
                elif tag.find("div", {"class": "pb"}) is not None:
                    # Page break nested inside this element: delegate.
                    [tags, page_div, output_, prev_tags] = parse_paragraph(
                        tag.next_element, tags, page_div, output_,
                        "transcription", prev_tags)
                elif tag.name == "p":
                    # tags = list(filter(lambda a: a != '\n', tags))
                    # If the first meaningful pending element is not already
                    # block-level, wrap everything pending in a <p> first.
                    # NOTE(review): nesting reconstructed from a flattened
                    # source — break fires after the first non-blank element.
                    for element in tags:
                        if element not in ['\n', ' ']:
                            if isinstance(
                                    element, NavigableString
                            ) or element.name not in ["p", "ul", "li"]:
                                new_tag = BeautifulSoup(
                                    features="html.parser").new_tag('p')
                                new_tag.extend(tags)
                                tags.clear()
                                tags.append(new_tag)
                            break
                    tags.append(tag)
                else:
                    tags.append(tag)
        # Flush the final page.
        page_div.extend(tags)
        # Map @rend on <p> to a CSS class ("p-<rend>").
        for node_p in page_div.find_all("p"):
            if node_p.has_attr('rend'):
                node_p.attrs = {"class": "p-" + node_p["rend"]}
        # Strip HTML comments <!-- -->
        for element in page_div(text=lambda it: isinstance(it, Comment)):
            element.extract()
        # (commented-out experiment: re-convert tags found inside comments)
        # comment_soup = BeautifulSoup(comment, "html.parser")
        # for x, y in config['tags'].items():
        #     y_lower = [x.lower() for x in y]
        #     for tag in y_lower:
        #         for node in comment_soup.find_all(tag):
        #             convertTag(node, x, y[y_lower.index(tag)], config)
        # comment.replace_with(Comment(str(comment_soup)))
        wrap_ul(page_div)
        output_.append(str(page_div))
        output[item['type']] = output_
    return output
def parse_paragraph(tag, tags, page_div, output, type_class, prev_tags):
    """Handle an element containing a nested page break (``pb``).

    Walks ``tag`` and its following siblings, flushing accumulated content
    into per-page divs each time a page break is met. ``prev_tags`` is a
    stack of {parent-tag-name: [children]} dicts used to rebuild wrapper
    tags whose content straddles a page break.

    Returns the updated ``[tags, page_div, output, prev_tags]``.
    """
    # Remember the (non-div) parent so content split across the page break
    # can be re-wrapped in the same kind of tag afterwards.
    if tag.parent.name and tag.parent.name != "div":
        new_element = dict()
        new_element[tag.parent.name] = []
        if isinstance(tag, NavigableString) and tag.string != "\n":
            new_element[tag.parent.name].append(tag.string)
        prev_tags.insert(0, new_element)
    if tag is not None and tag.name is not None:
        if tag.get('class') == "pb":
            # Rebuild the recorded wrappers innermost-first, nesting each
            # into the next, then append the outermost to the page content.
            if len(prev_tags) > 0:
                for i in range(len(prev_tags) - 1):
                    for key in prev_tags[i].keys():
                        new_tag = BeautifulSoup(
                            features="html.parser").new_tag(key)
                        new_tag.extend(prev_tags[i][key])
                        for key_bis in prev_tags[i + 1].keys():
                            prev_tags[i + 1][key_bis].append(new_tag)
                for key in prev_tags[len(prev_tags) - 1].keys():
                    new_tag = BeautifulSoup(
                        features="html.parser").new_tag(key)
                    new_tag.extend(prev_tags[len(prev_tags) - 1][key])
                    tags.append(new_tag)
                prev_tags = []
            # Wrap pending inline content in a <p> if it is not already
            # block-level (nesting reconstructed from flattened source).
            for element in tags:
                if element not in ['\n', ' ']:
                    if isinstance(element,
                                  NavigableString) or element.name not in [
                                      "p", "ul", "li"
                                  ]:
                        new_tag = BeautifulSoup(
                            features="html.parser").new_tag('p')
                        new_tag.extend(tags)
                        tags.clear()
                        tags.append(new_tag)
                    break
            # Flush the finished page.
            page_div.extend(tags)
            for element in page_div(text=lambda it: isinstance(it, Comment)):
                element.extract()
            wrap_ul(page_div)
            output.append(str(page_div))
            tags = []
            page_div = BeautifulSoup(features="html.parser").new_tag('div')
            page_div.attrs = {"id": tag.get('id'), "class": type_class}
    for item in tag.next_siblings:
        if item is not None and item.name is not None:
            if item.get('class') == "pb":
                # Same flush-and-restart logic for every later page break.
                if len(prev_tags) > 0:
                    for i in range(len(prev_tags) - 1):
                        for key in prev_tags[i].keys():
                            new_tag = BeautifulSoup(
                                features="html.parser").new_tag(key)
                            new_tag.extend(prev_tags[i][key])
                            for key_bis in prev_tags[i + 1].keys():
                                prev_tags[i + 1][key_bis].append(new_tag)
                    for key in prev_tags[len(prev_tags) - 1].keys():
                        new_tag = BeautifulSoup(
                            features="html.parser").new_tag(key)
                        new_tag.extend(prev_tags[len(prev_tags) - 1][key])
                        tags.append(new_tag)
                    prev_tags = []
                # p_tag = BeautifulSoup(
                #     features="html.parser").new_tag("p")
                # p_tag.extend(tags)
                for element in tags:
                    if element not in ['\n', ' ']:
                        if isinstance(element,
                                      NavigableString) or element.name not in [
                                          "p", "ul", "li"
                                      ]:
                            new_tag = BeautifulSoup(
                                features="html.parser").new_tag('p')
                            new_tag.extend(tags)
                            tags.clear()
                            tags.append(new_tag)
                        break
                page_div.extend(tags)
                for element in page_div(
                        text=lambda it: isinstance(it, Comment)):
                    element.extract()
                wrap_ul(page_div)
                output.append(str(page_div))
                tags = []
                page_div = BeautifulSoup(
                    features="html.parser").new_tag('div')
                page_div.attrs = {"id": item.get('id'), "class": type_class}
            elif item.find("div", {"class": "pb"}) is not None:
                # A break deeper down this sibling: recurse into it.
                [tags, page_div, output, prev_tags] = parse_paragraph(
                    item.next_element, tags, page_div, output, type_class,
                    prev_tags)
            else:
                # Ordinary tag sibling: buffer it, either directly or into
                # the pending wrapper being rebuilt.
                if len(prev_tags) == 0:
                    tags.append(item)
                else:
                    if tag.parent.name in prev_tags[0]:
                        prev_tags[0][tag.parent.name].append(item)
        else:
            # Text (NavigableString) sibling: same buffering rule.
            if len(prev_tags) == 0:
                tags.append(item)
            else:
                if tag.parent.name in prev_tags[0]:
                    prev_tags[0][tag.parent.name].append(item)
    return [tags, page_div, output, prev_tags]
def edition(file_tei, config_file):
    """Render the "edition" view of a TEI document.

    Like ``transcription`` but additionally rewrites node attributes:
    builds a CSS class from the TEI tag name plus selected attribute
    values, adds French tooltips for <unclear>/<add>, wraps <supplied>
    content in braces, and drops purely editorial tags (<sic>, <abbr>,
    <note>, <space>, <del>).

    Returns {"will": [...], "envelope": [...], "codicil": [...]} of
    per-page HTML strings.
    """
    # French labels used in the title tooltip of <add> elements.
    add_translate = {
        "above": "au dessus",
        "below": "au dessous",
        "marginLeft": "en marge à gauche",
        "marginRight": "en marge à droite",
        "marginBottom": "en marge inférieur",
        "marginTop": "en marge supérieur",
        "inline": "dans la ligne"
    }
    with open(config_file) as json_file:
        config = json.load(json_file)
    with open(file_tei, encoding='utf8') as fp:
        soup = BeautifulSoup(fp, "lxml-xml")
    body = soup.find('body')
    for x, y in config['tags'].items():
        for tag in y:
            for node in body.find_all(tag):
                if node.name in ["sic", "abbr", "note", "space", "del"]:
                    # (commented-out experiment on <lb>/<space> spacing)
                    # if node.name in ["lb", "space"]:
                    #     if node.name == "lb" and isinstance(node.previous_sibling, NavigableString):
                    #         str_ = node.previous_sibling.strip()
                    #         if len(str_) == 0 or str_[len(str_)-1] != '-':
                    #             space = BeautifulSoup(features="html.parser").new_tag('span')
                    #             space.attrs = {"class": node.name + "-edition"}
                    #             space.string = " "
                    #             node.insert_after(space)
                    #         else:
                    #             node.previous_sibling.replace_with(str_.replace('-', ''))
                    node.decompose()
                else:
                    # (commented-out experiments on persName/expan spacing)
                    # Build the replacement attribute set: class starts as
                    # the TEI tag name and is suffixed with configured
                    # attribute values and @rend.
                    new_attrs = dict()
                    new_attrs['class'] = tag
                    for old_attr in node.attrs:
                        if old_attr in config['attrs']:
                            new_attrs['class'] += "-" + node.attrs[old_attr]
                            # NOTE(review): grouping below reconstructed
                            # from a flattened source — confirm the nesting
                            # of the xml:lang/facs branches against VCS.
                            if old_attr == "xml:lang":
                                new_attrs["xml:lang"] = node.get("xml:lang")
                            elif old_attr == "facs":
                                new_attrs["id"] = node.attrs[old_attr]
                    if "rend" in node.attrs:
                        new_attrs['class'] += "-" + node.attrs["rend"]
                    if node.name == "unclear":
                        new_attrs['title'] = "transcription incertaine"
                    if node.name == "add":
                        if 'place' in node.attrs:
                            new_attrs['title'] = "ajout " + \
                                add_translate[node['place']]
                        # new_attrs['class'] = "add-" + node['place']
                        # node.insert(0, "\\")
                        # node.insert(len(node.contents), "/ ")
                    elif node.name == "supplied":
                        # Editorial additions are shown in braces.
                        node.insert(0, "{")
                        node.insert(len(node.contents), "}")
                    node.name = x
                    node.attrs = new_attrs
                    if len(node.contents) == 0 and node.name == "span":
                        node.string = ""
    # for node in body.find_all("p"):
    #     node.insert(0, " ")
    #     node.insert(len(node.contents), " ")
    output = {"will": [], "envelope": [], "codicil": []}
    # Keep only divs whose @type is one of the output categories.
    output_div = []
    for item in body.find_all("div"):
        if item.has_attr('type') and item['type'] in output.keys():
            output_div.append(item)
    for item in output_div:
        page = item.find("div", {"class": "pb"})
        page_div = BeautifulSoup(features="html.parser").new_tag("div")
        page_div.attrs = {"id": page['id'], "class": "edition"}
        page_div.append("")
        tags = []
        output_ = []
        prev_tags = []
        for tag in page.next_siblings:
            if tag is not None and tag.name is not None:
                if tag.get('class') == "pb":
                    # New page break: flush and restart the page container.
                    page_div.extend(tags)
                    # Strip HTML comments <!-- -->
                    for element in page_div(
                            text=lambda it: isinstance(it, Comment)):
                        element.extract()
                    wrap_ul(page_div)
                    output_.append(str(page_div))
                    tags = []
                    page_div = BeautifulSoup(
                        features="html.parser").new_tag('div')
                    page_div.attrs = {"id": tag.get('id'),
                                      "class": "edition"}
                elif tag.find("div", {"class": "pb"}) is not None:
                    # Page break nested inside this element: delegate.
                    [tags, page_div, output_, prev_tags] = parse_paragraph(
                        tag.next_element, tags, page_div, output_,
                        "edition", prev_tags)
                elif tag.name == "p":
                    # Wrap pending inline content in a <p> if needed.
                    for element in tags:
                        if element not in ['\n', ' ']:
                            if isinstance(
                                    element, NavigableString
                            ) or element.name not in ["p", "ul", "li"]:
                                new_tag = BeautifulSoup(
                                    features="html.parser").new_tag('p')
                                new_tag.extend(tags)
                                tags.clear()
                                tags.append(new_tag)
                            break
                    tags.append(tag)
                else:
                    tags.append(tag)
        page_div.extend(tags)
        # (commented-out experiment: re-convert tags found inside comments)
        # for comment in page_div.find_all(string=lambda text: isinstance(text, Comment)):
        #     comment_soup = BeautifulSoup(comment, "html.parser")
        #     for x, y in config['tags'].items():
        #         y_lower = [x.lower() for x in y]
        #         for tag in y_lower:
        #             for node in comment_soup.find_all(tag):
        #                 convertTag(node, x, y[y_lower.index(tag)], config)
        #     comment.replace_with(Comment(str(comment_soup)))
        # Map @rend on <p> to a CSS class ("p-<rend>").
        for node_p in page_div.find_all("p"):
            if node_p.has_attr('rend'):
                node_p.attrs = {"class": "p-" + node_p["rend"]}
        # Strip HTML comments <!-- -->
        for element in page_div(text=lambda it: isinstance(it, Comment)):
            element.extract()
        wrap_ul(page_div)
        output_.append(str(page_div))
        output[item['type']] = output_
    return output
def render_table_odt(elem, doc):
    """Render a panflute Table element as OpenDocument (ODT) table XML.

    Builds the caption paragraph (with a live ODT sequence field for the
    table number), the table with per-column and per-cell styles, and an
    optional legend from the trailing definition list. Generated cell
    styles are appended to ``doc.auto_styles``; the serialized table XML
    string is returned.
    """
    table = elem.content[0]
    table_number = tuple(
        str(i) for i in utils.get_elem_count(doc, pf.Table, register="table"))
    table_name = "Table{}".format("_".join(str(i) for i in table_number))
    # BUG FIX: this root soup is appended to throughout the function, but
    # its creation had been commented out, leaving `table_root` undefined
    # (NameError on first use).
    table_root = BeautifulSoup("", "xml")
    if hasattr(table, "caption") and table.caption:
        colon = ": "
        caption = "".join(pf.stringify(c) for c in table.caption)
    else:
        colon = ""
        caption = ""
    # Caption paragraph: "Table <n>: <caption>" with an ODT sequence field
    # so numbering stays live inside the document.
    caption_odt = utils.create_nested_tags(
        **{
            "name": "text:p",
            "attrs": {"text:style-name": "Table"},
            "contents": [
                {
                    "name": "text:span",
                    "attrs": {"text:style-name": "Strong_20_Emphasis"},
                    "contents": [
                        "Table ",
                        {
                            "name": "text:sequence",
                            "attrs": {
                                "text:ref-name": f"ref{table_name}",
                                "text:name": "Table",
                                "text:formula": "ooow:Table+1",
                                "style:num-format": "1",
                            },
                            "contents": [".".join(table_number)],
                        },
                        colon,
                    ],
                },
                caption,
            ],
        })
    table_root.contents.append(caption_odt)
    table_odt = utils.create_nested_tags(
        **{
            "name": "table:table",
            "attrs": {
                "table:name": table_name,
                "table:style-name": table_name,
                "table:template-name": "Default Style",
            },
        })
    table_root.contents.append(table_odt)
    # Distribute the width left unspecified evenly over the columns that
    # did not declare one, then scale everything to the total table width.
    unoccupied_width = 1 - sum(table.width)
    unspecified_widths = len([w for w in table.width if not w])
    remaining_for_each = unoccupied_width / unspecified_widths
    widths = [w if w else remaining_for_each for w in table.width]
    # We want the table to occupy a maximum width
    widths = map(lambda x: x * table.total_width, widths)
    column_style_names, column_styles, column_definitions = zip(
        *create_column_definitions(widths, table_name))
    pf.debug(column_style_names, column_styles, column_definitions)
    styles = BeautifulSoup("", "xml")
    styles.contents = list(column_styles)
    table_odt.contents.extend(column_definitions)
    for r, row in enumerate(table.content):
        row_odt = Tag(name="table:table-row")
        row_odt.attrs = {
            "table:style-name": "{table_name}.{r}".format(
                table_name=table_name, r=r + 1)
        }
        row_cell_styles = []
        for c, cell in enumerate(row.content):
            if cell.covered:
                # Cell hidden by a column span from a previous cell.
                cell_odt = Tag(name="table:covered-table-cell")
                row_odt.contents.append(cell_odt)
                row_cell_styles.append(None)
            else:
                cell_odt = Tag(name="table:table-cell")
                cell_style_name = "{column_style}{r}".format(
                    column_style=column_style_names[c], r=r + 1)
                cell_style = Tag(name="style:style")
                cell_style.attrs = {
                    "style:name": cell_style_name,
                    "style:family": "table-cell",
                    "style:writing-mode": "page",
                }
                style_cell_properies = Tag(name="style:table-cell-properties")
                style_cell_properies.attrs = {
                    "fo:padding-left": "0.10cm",
                    "fo:padding-right": "0.10cm",
                    "fo:padding-top": "0.10cm",
                    "fo:padding-bottom": "0.10cm",
                    "style:vertical-align": "bottom",
                }
                style_background_image = Tag(name="style:background-image")
                style_cell_properies.contents.append(style_background_image)
                cell_style.contents.append(style_cell_properies)
                row_cell_styles.append(cell_style)
                cell_odt.attrs = {
                    "table:style-name": cell_style_name,
                    "office:value-type": "string",
                }
                if cell.col_span > 1:
                    cell_odt.attrs[
                        "table:number-columns-spanned"] = cell.col_span
                if cell.content:
                    # Convert the cell's panflute content to ODT and re-style
                    # its paragraphs according to the heading level.
                    cell_content = utils.panflute2output(
                        cell.content, format="opendocument").strip()
                    cell_content = BeautifulSoup(cell_content, "lxml").html.body
                    text_p = re.compile("text:p")
                    for t in cell_content.find_all(text_p):
                        if cell.heading == 1:
                            t["text:style-name"] = "Table_20_Heading"
                        elif cell.heading == 2:
                            t["text:style-name"] = "Table_20_Subheading"
                        else:
                            t["text:style-name"] = "Table_20_Contents"
                        if cell.vertical:
                            # Wrap the paragraph contents in a vertical span.
                            t_contents = t.contents
                            t.contents = [
                                utils.create_nested_tags(
                                    **{
                                        "name": "text:span",
                                        "attrs": {
                                            "text:style-name": "Vertical"
                                        },
                                        "contents": t_contents,
                                    })
                            ]
                    cell_odt.contents = cell_content.contents
                else:
                    # Empty cell still needs an (empty) paragraph node.
                    cell_content = Tag(name="text:p")
                    cell_content.attrs = {
                        "text:style-name": "Table_20_contents"
                    }
                    cell_odt.contents.append(cell_content)
                row_odt.contents.append(cell_odt)
        if row.underlines:
            # Underlines are (start, stop) 1-based column ranges rendered
            # as a bottom border on the affected cell styles.
            for underline in row.underlines:
                start = underline[0]
                stop = underline[1]
                for i in range(start - 1, stop):
                    cell_style = row_cell_styles[i]
                    if cell_style is None:
                        pass
                    else:
                        cell_style.contents[0].attrs[
                            "fo:border-bottom"] = "0.5pt solid #000000"
        # Extra top padding: requested by this row and/or carried over from
        # the previous row's bottom-space flag.
        add_top_space = table.content[r - 1].btm_space if r else False
        if row.top_space or add_top_space:
            for cell_style in row_cell_styles:
                if cell_style is not None:
                    padding_top = cell_style.contents[0].attrs[
                        "fo:padding-top"]
                    padding_top = (float(padding_top.strip("cm")) +
                                   0.05 * add_top_space +
                                   0.05 * row.top_space)
                    cell_style.contents[0].attrs[
                        "fo:padding-top"] = f"{padding_top}cm"
        row_cell_styles = [cs for cs in row_cell_styles if cs is not None]
        styles.contents.extend(row_cell_styles)
        table_odt.contents.append(row_odt)
    # Optional footer: a definition list rendered as a legend paragraph.
    try:
        footer = elem.content[1].content[0]
    except IndexError:
        footer = None
    if footer is not None:
        for definition_item in footer.content:
            term = "".join(pf.stringify(e) for e in definition_item.term)
            definitions = [
                utils.panflute2output(d.content, format="opendocument")
                for d in definition_item.definitions
            ]
            definitions_parsed = BeautifulSoup(
                "".join(definitions), "lxml").html.body.contents
            for t in definitions_parsed:
                if t.name == "text:p":
                    # Legend entries flow inline after the term.
                    t.name = "text:span"
                    t.contents.insert(0, NavigableString(" "))
            definition = utils.create_nested_tags(
                **{
                    "name": "text:p",
                    "attrs": {"text:style-name": "Table_20_Legend"},
                    "contents": [{
                        "name": "text:span",
                        "attrs": {"text:style-name": "Superscript"},
                        "contents": [term],
                    }] + definitions_parsed,
                })
            table_root.contents.append(definition)
    styles = "\n".join(c.prettify() for c in styles.contents)
    doc.auto_styles.append(styles)
    table = "\n".join(str(c) for c in table_root.contents)
    # pf.debug(table)
    return table
def process_tree(html: str) -> str:
    """Reduce an HTML document to its cleaned ``<body>`` subtree.

    Two parser passes are used deliberately: ``html.parser`` first (drops
    <nav>), then ``lxml`` for the bulk of the boilerplate tags. HTML
    comments, punctuation-only text nodes, all attributes, and empty tags
    are removed. Returns the cleaned body serialized to a string.
    """
    to_remove_by_html_parser = ["nav"]
    to_remove_by_lxml = [
        "header",
        "footer",
        "script",
        "link",
        "style",
        "img",
        "svg",
        "i",
        "iframe",
        "input",
        "textarea",
        "a",
        "noscript",
        "label",
        "button",
        "br",
        "hr",
        "video",
        "audio",
        "option",
    ]
    soup = BeautifulSoup(html, "html.parser").body
    for tag_name in to_remove_by_html_parser:
        remove_all_tags(soup, tag_name)
    # Re-serialize and re-parse with lxml before the second removal pass.
    html = str(soup)
    soup = BeautifulSoup(html, "lxml").body
    for tag_name in to_remove_by_lxml:
        remove_all_tags(soup, tag_name)
    for elem in soup.find_all(string=lambda x: isinstance(x, Comment)):
        elem.extract()
    # Strip whitespace from text nodes; drop nodes that are bare punctuation.
    for elem in soup.find_all(string=lambda x: isinstance(x, NavigableString)):
        text = elem.string.strip()
        if text in [",", ".", "/", "|", "!", ":"]:
            elem.extract()
        else:
            elem.replace_with(NavigableString(text))

    # let's remove empty tags or set attrs to {}
    def remove_empty_tags(node):
        # FIX: iterate over a snapshot — decompose() mutates node.contents
        # while it is being traversed, which made the loop skip the sibling
        # immediately following every removed tag.
        for child in list(node.contents):
            if isinstance(child, Tag):
                remove_empty_tags(child)
                if is_empty(child):
                    child.decompose()

    def clean_tags(node):
        # Depth-first: wipe attributes on every descendant tag.
        for child in list(node.contents):
            if isinstance(child, Tag):
                clean_tags(child)
                child.attrs = {}

    remove_empty_tags(soup)
    clean_tags(soup)
    soup.attrs = {}
    return str(soup)
else: try: k = int(x) except: os.sys.exit("faild convert data-xcols \"%s\" to integer: " % x) if k < 1: k = d1.shape[1] n, m = divmod(d1.shape[1], k) if m > 0: n += 1 for i in range(n): d2 = d1.iloc[:, (i * k):(i * k + k)] if d2.shape[1] == 0: continue t1 = BeautifulSoup(d2.to_html(na_rep=""), 'html5lib').find("table") t1.attrs = {} t1.find("th").append(idx) el.insert(i + 2, t1) ## process figure.mySlide for el in main.find_all("figure", {"class": "mySlide"}): if el is None: continue p, ps = el.attrs.get("data-file"), el.attrs.get("data-file-match") if p is None and ps is None: continue for e in el.find_all("img"): e.decompose() fs = [] if not (p is None) and p != "": fs.extend(p.relpace(",", " ").split()) if not (ps is None):