def get_first_three(soup, table):
    """Turn the first three columns of ``table`` into a title plus a list.

    Every ``td`` among the first three cells of each row that contains
    text is renamed to ``span``.  The very first such cell of the whole
    table becomes the title; the others are collected into one ``li`` per
    row, with ``' - '`` appended to every collected cell except the one in
    the third column.  Rows that end up without text are dropped.

    Cells are moved out of ``table`` into the new structure, so ``table``
    is modified.

    Returns a new ``div`` tag holding the title ``span`` (when there is
    one) followed by a ``ul`` with the row items.
    """
    title_pending = True
    items = Tag(soup, "div")
    for tr in table.findAll("tr"):
        li = Tag(soup, "li")
        # Only the first three cells of a row are ever looked at.
        for column, td in enumerate(tr.findAll("td")[:3]):
            try:
                text = ''.join(td.findAll(text=True)).strip()
                if text:
                    td.name = "span"
                    if title_pending:
                        title_pending = False
                        items.append(td)
                    else:
                        # No separator after the third column.
                        if column != 2:
                            td.append(' - ')
                        li.append(td)
            except Exception:
                # Best effort: a cell that cannot be processed is skipped,
                # but KeyboardInterrupt/SystemExit are no longer swallowed.
                pass
        if ''.join(li.findAll(text=True)) != '':
            items.append(li)

    # The first span appended directly to the container is the title.
    # It is None when the table had no cell with text at all.
    title = items.find("span")
    if title is not None:
        title.replaceWith("")
    items.name = "ul"

    div = Tag(soup, "div")
    if title is not None:
        div.append(title)
    div.append(items)
    return div
def linearize_rows_1_cols(soup, table):
    """Replace a ``linearize-rows-1-cols`` table with a list.

    Does nothing unless ``table`` has the id ``linearize-rows-1-cols``.
    Otherwise every row becomes an ``li``: each cell (and every ``p``
    inside it) is renamed to ``span``, the first cell of the row is
    wrapped in ``<b>`` and every other cell is surrounded by ``[`` / ``]``
    spans.  The items are collected in a ``ul`` with class ``center`` that
    replaces ``table`` in its tree.

    Returns ``None``; the change is made in place.
    """
    if table.get('id') != "linearize-rows-1-cols":
        return

    items = Tag(soup, "div")
    items["class"] = "center"
    for tr in table.findAll("tr"):
        li = Tag(soup, "li")
        # The first cell is identified by position: tag equality compares
        # name, attributes and contents, so a later cell with the same
        # markup as the first one would otherwise be treated as the first.
        for position, td in enumerate(tr.findAll("td")):
            for p in td.findAll("p"):
                p.name = "span"
            td.name = "span"
            if position == 0:
                markup = '<b>' + td.prettify() + '</b>'
            else:
                markup = '<span>[</span>' + td.prettify() + '<span>]</span>'
            li.append(BeautifulSoup(markup))
        items.append(li)
    items.name = "ul"
    table.replaceWith(items)
def parse_html(self):
    """Clean up ``self.html`` and derive title, body, template, attachments.

    When ``self.debug_path`` is unset or equals ``self.path``, the body
    markup is normalised in a series of BeautifulSoup passes (comments,
    header divs, layout tables, loose text, deprecated attributes, images,
    attachments, line breaks, paragraphs, headings, internal links) and
    finally flattened into a unicode string.  The soup is re-parsed from
    its prettified output after most passes, presumably so that the next
    pass works on the markup as it would be rendered.  Otherwise the body
    is a fixed placeholder string.

    The title is chosen from, in order of preference: the navigation
    include file, the shorter of content ``h1`` and ``<title>``, the
    ``<title>``, the content ``h1``, and finally the slug.

    Side effects: rewrites ``self.html``; sets ``self.attachments``,
    ``self.title``, ``self.body`` and, for level-0 pages, possibly
    ``self.template``; creates and saves an ``Image`` for every local
    image that exists on disk.

    Raises:
        ImportError: if no body can be found, or no title is determined.
        SystemExit: if the final body cannot be encoded as UTF-8.
    """
    title = None
    body = None
    bodysoup = None
    nav_title = None
    meta_title = None
    content_title = None
    self.attachments = []

    entities = BeautifulStoneSoup.HTML_ENTITIES
    # Matches text that contains at least one non-whitespace character.
    has_text = re.compile(r"[^\s]+", re.U)

    def is_navigable_string(node):
        return isinstance(node, NavigableString)

    def non_sentence(value):
        # A short string (4..29 characters) not ending in a full stop.
        return (value is not None
                and not value.strip().endswith(".")
                and 3 < len(value) < 30)

    if not self.debug_path or self.debug_path == self.path:
        # Remove nbsps
        # NOTE(review): the literal replaced here was a single blank
        # character; as a plain space it would strip every space from the
        # markup.  The entity is targeted explicitly instead -- confirm
        # against real input.
        self.html = self.html.replace("&nbsp;", "")

        # Remove attributes from closing tags (!)
        self.html = re.sub(r"</([a-zA-Z]+) [^>]+>", r"</\1>", self.html)

        m = RE_BODY.search(self.html)
        if m and m.lastindex == 1:
            bodysoup = BeautifulSoup(m.group(1), convertEntities=entities)
        else:
            try:
                bodysoup = BeautifulSoup(
                    self.html, convertEntities=entities).html.body
                if not bodysoup:
                    # No <body> tag: open one right after the head.
                    fixed_html = self.html.replace(
                        "</head>", "</head><body>")
                    bodysoup = BeautifulSoup(
                        fixed_html, convertEntities=entities).html.body
            except AttributeError:
                # Document has no <html> element at all.
                pass

        if not bodysoup:
            raise ImportError("No body")

        if self.debug_path == self.path:
            # Parenthesised single argument: same output on Python 2.
            print("\n\n========= DEBUG =========\n")

        # Remove comments
        for comment in bodysoup.findAll(
                text=lambda text: isinstance(text, Comment)):
            self.debug("Removed comment: <!-- %s -->" % comment)
            comment.extract()
        bodysoup = BeautifulSoup(bodysoup.prettify())

        # Convert header divs into h1, h2
        h1_found = False
        for tag in bodysoup.findAll("div"):
            css_class = tag.get("class")
            if css_class == "BL-otsikko1" and not h1_found:
                h1_found = True
                tag.name = "h1"
                self.debug("Converted into H1: %s" % tag)
            elif css_class in ("BL-otsikko2", "BL-otsikko1"):
                # Second-level heading, or a further first-level heading
                # after the first one was already used as h1.
                tag.name = "h2"
                self.debug("Converted into H2: %s" % tag)
            elif css_class == "BL-leivanmurut":
                tag.extract()
                self.debug("Removed breadcrumbs")
            else:
                tag.hidden = True
        bodysoup = BeautifulSoup(bodysoup.prettify())

        # Remove unwanted elements
        for tag in bodysoup.findAll(["style", "link"]):
            tag.extract()
            self.debug("Removed %s" % tag)
        bodysoup = BeautifulSoup(bodysoup.prettify())

        # Hide unnecessary elements
        for tag in bodysoup.findAll(["span", "div", "body", "font"]):
            self.debug("Set hidden: %s" % tag.name)
            tag.hidden = True
        bodysoup = BeautifulSoup(bodysoup.prettify())

        # Forms are currently left untouched.

        # Hide non-semantic tables
        for table in bodysoup.findAll("table"):
            rows = table.findAll("tr", recursive=False)
            if (table.get("border") != "1" and len(rows) < 100
                    and not has_ancestor(table, "form")):
                table.hidden = True
                for tr in rows:
                    tr.hidden = True
                    for td in tr.findAll(["td", "th"], recursive=False):
                        text = td.find(text=has_text)
                        if text and text.strip() != "":
                            td.name = "p"
                            self.debug("Converted th/td into p: %s" % td)
                        else:
                            td.hidden = True
                self.debug("Hid non-semantic table")
        bodysoup = BeautifulSoup(bodysoup.prettify())

        # Remove orphan th/td/tr
        # NOTE(review): with "or", a tr directly inside a table whose own
        # parent is not a table also counts as an orphan -- confirm that
        # this is intended.
        for el in bodysoup.findAll(["tr", "td", "th"]):
            grandparent = el.parent.parent
            if el.parent.name != "table" or (
                    grandparent and grandparent.name != "table"):
                text = None
                # Cells (td and th) with text are kept as paragraphs.
                if el.name in ("td", "th"):
                    text = el.find(text=has_text)
                if text and text.strip() != "":
                    el.name = "p"
                    self.debug("Converted td/th into p: %s" % el)
                else:
                    el.hidden = True
                    self.debug("Hid orphan %s: %s" % (el.name, el))
        bodysoup = BeautifulSoup(bodysoup.prettify())

        # Wrap NavigableStrings in td into p
        for tag in bodysoup.findAll(text=is_navigable_string):
            if tag.parent.name == "td" and tag.strip() != "":
                p = Tag(bodysoup, "p")
                p.insert(0, "%s" % tag)
                tag.replaceWith(p)
                self.debug("Moved from td into p: %s" % tag)
        bodysoup = BeautifulSoup(bodysoup.prettify())

        # Convert "loose" NavigableStrings into paragraphs
        for tag in bodysoup.findAll(text=is_navigable_string):
            if len(tag.strip()) > 10 and tag.parent.name == "[document]":
                p = Tag(bodysoup, "p")
                p.insert(0, "%s" % tag)
                tag.replaceWith(p)
                self.debug("Moved loose string into p: %s" % tag)
        bodysoup = BeautifulSoup(bodysoup.prettify())

        # Move NavigableStrings after list into p before moving lists
        for ul in bodysoup.findAll("ul"):
            if ul.parent.name == "p":
                following = ul.nextSibling
                if isinstance(following, NavigableString):
                    p = Tag(bodysoup, "p")
                    p.insert(0, "%s" % following)
                    following.replaceWith(p)
                    self.debug(
                        "Moved NavigableString after list into p: %s" % p)
        bodysoup = BeautifulSoup(bodysoup.prettify())

        # Move blocks outside paragraphs
        for block in bodysoup.findAll(["p", "ul", "h1", "h2"]):
            parent = block.parent
            if parent.name == "p":
                container = parent.parent
                if block.name in ("h1", "h2"):
                    container.insert(container.index(parent), block)
                    self.debug("Moved %s before p" % block.name)
                else:
                    container.insert(container.index(parent) + 1, block)
                    self.debug("Moved %s after p" % block.name)

        # Delete deprecated attributes
        for tag in bodysoup.findAll():
            for attr in ("align", "valign", "class", "style", "border",
                         "vspace", "hspace", "cellpadding", "cellspacing"):
                del tag[attr]
            if tag.name != "img":
                for attr in ("width", "height"):
                    del tag[attr]
            if tag.name not in ("td", "tr"):
                for attr in ("colspan", "rowspan"):
                    del tag[attr]
        bodysoup = BeautifulSoup(bodysoup.prettify())

        # Import images
        for tag in bodysoup.findAll("img"):
            src = tag.get("src")
            if not src:
                continue
            if src.endswith(".gif") and src.find("/tyhja-") != -1:
                tag.extract()
            elif not src.startswith("http://"):
                page_dir = os.path.dirname(
                    os.path.join(self.source_dir, self.source_path))
                img_path = page_dir + "/" + src
                if os.path.isfile(img_path):
                    img_title = tag.get("title") or tag.get("alt") or ""
                    img = Image.create_from_file(img_path, img_title[0:100])
                    img.tmp_orig_path = src
                    img.save()
                    tag["src"] = img.file.url
                    self.debug("Imported image: %s" % tag["src"])
        bodysoup = BeautifulSoup(bodysoup.prettify())

        # Import external files into Attachment models
        for a in bodysoup.findAll("a", href=re.compile(".+")):
            href = a.get("href")
            ext = os.path.splitext(href)[1]
            if (not href.startswith("http://") and ext != ""
                    and ext not in (".html", ".shtml", ".php",
                                    ".jpg", ".gif", ".png")):
                if href.startswith("/"):
                    abspath = self.source_dir + href
                else:
                    abspath = (self.source_dir
                               + os.path.dirname(self.source_path)
                               + "/" + href)
                if os.path.exists(abspath):
                    self.debug("Found attachment: %s" % abspath)
                    # store for later import as we don't have page id yet
                    self.attachments.append(abspath)

        # Remove bad linebreaks
        for br in bodysoup.findAll("br"):
            if br.parent.name == "p":
                # A break at either edge of a paragraph, or next to
                # whitespace-only text, is dropped.
                for sib in (br.previousSibling, br.nextSibling):
                    if not sib or (isinstance(sib, NavigableString)
                                   and sib.strip() == ""):
                        self.debug("Removed linebreak at (%s, %s)"
                                   % (br.parent.index(br), br.parent))
                        br.extract()
                        break
            elif br.parent.name == "[document]":
                br.extract()
        bodysoup = BeautifulSoup(bodysoup.prettify())

        # Clean up paragraphs
        for p in bodysoup.findAll("p"):
            # Remove empty
            if p.string and p.string.strip() == "":
                self.debug("Removed empty p at (%s, %s)"
                           % (p.parent.name, p.parent.index(p)))
                p.extract()
            # Hide if contains only tag(s)
            elif not p.findAll(text=has_text):
                self.debug("Hid p with no text: %s" % p)
                p.hidden = True
            # Convert short one-liners into h3
            elif non_sentence(p.string) or (
                    len(p.findAll(text=has_text)) == 1
                    and non_sentence(p.contents[0].string)):
                p.name = "h3"
                self.debug("Converted p into h3: %s" % p)
            # Remove bad styling
            else:
                tags = p.findAll(recursive=False)
                if len(tags) == 1 and tags[0].name in ("b", "u", "i"):
                    # Only when the paragraph has no text of its own
                    # outside the single styling tag.
                    if not p.findAll(text=has_text, recursive=False):
                        tags[0].hidden = True
                        self.debug("Hid %s from p, bad styling: %s"
                                   % (tags[0].name, p))
        bodysoup = BeautifulSoup(bodysoup.prettify())

        # Remove redundant information
        for text in bodysoup.findAll(
                text=re.compile(r"^\s*pdf-tiedosto [0-9]+ KB\s+$")):
            self.debug("Removed text: %s" % text)
            text.extract()
        bodysoup = BeautifulSoup(bodysoup.prettify())

        # Clean up headings
        for h in bodysoup.findAll(["h1", "h2", "h3", "h4", "h5", "h6"]):
            for el in h.findAll():
                # Remove styling elements (u, b, i, etc)
                if isinstance(el, Tag) and el.name != "a":
                    el.hidden = True
                    self.debug("Heading clean-up, hid %s in %s"
                               % (el.name, h))
            try:
                # Move h1 at first
                if h.name == "h1" and h.parent.index(h) != 1:
                    h.parent.insert(1, h)
                    self.debug("Moved %s at first" % h.name)
                # Convert any heading at the beginning of document into h1
                elif (h.name != "h1" and h.parent.name == "[document]"
                        and not h.previousSibling):
                    self.debug("Converted into h1: %s" % h)
                    h.name = "h1"
            except IndexError:
                pass
        bodysoup = BeautifulSoup(bodysoup.prettify())

        # Convert internal links
        for a in bodysoup.findAll("a"):
            href = a.get("href")
            if (href and not href.startswith("http://")
                    and href.endswith(".shtml")):
                a["href"] = (href.replace("/index.shtml", "")
                             .replace(".shtml", "")
                             .replace("_", "-"))
                self.debug("Fixed link: %s -> %s" % (href, a["href"]))
        bodysoup = BeautifulSoup(bodysoup.prettify())

        # Parse content_title text
        h1 = bodysoup.find("h1")
        if h1:
            content_title = " ".join(h1.findAll(text=True))
            content_title = re.sub(r"[\s]+", " ", content_title).strip()

        # Reformat
        body = u"" + bodysoup.prettify().decode("UTF8")
        # NOTE(review): the two substitutions below only normalise the
        # whitespace around a free-standing ">" or "<", and the tag rules
        # further down trim part of it again -- confirm whether escaping
        # to entities was intended here.
        body = re.sub(r"\s+>\s+", " > ", body)
        body = re.sub(r"\s+<\s+", " < ", body)
        body = re.sub(r"[\n\r]+", " ", body)
        body = re.sub(r"[ \t]+", " ", body)
        body = re.sub(r">\s+", ">", body)
        body = re.sub(r"\s+<", "<", body)
        body = re.sub(r"</(p|h1|h2|h3|h4|h5|h6|ul|ol|table|tr)>",
                      r"</\1>\n\n", body)
        body = re.sub(r"</(li|td)>", r"</\1>\n", body)
        body = re.sub(r"<(u|b|i|em|strong)>\s*", r" <\1>", body)
        body = re.sub(r"\s*</(u|b|i|em|strong)>", r"</\1> ", body)
        body = re.sub(r"</a>([^\-])", r"</a> \1", body)
        body = re.sub(r"<a ", " <a ", body)
        body = re.sub(r"\s+(\.|,|:|;|!|\?)", r"\1", body)

        # Is body valid UTF8?
        try:
            body.encode("UTF8")
        except UnicodeError:
            print("DAA")
            raise SystemExit()
    else:
        body = "(debug mode, no content parsed)"

    docsoup = BeautifulSoup(self.html, convertEntities=entities)

    # nav title
    for text in docsoup.findAll(text=re.compile("^#include")):
        m = re.compile('"(.+)"').search(text)
        if not (m and m.lastindex == 1):
            continue
        include_path = m.group(1)
        if include_path.find("valikko") == -1:
            continue
        if not include_path.startswith("/"):
            # Relative include: resolve against the page's directory.
            if self.source_path.endswith("/index.shtml"):
                include_path = os.path.join(self.path, include_path)
            else:
                include_path = os.path.join(
                    "/".join(self.path.split("/")[0:-1]), include_path)
        nav_file = open(self.source_dir + include_path)
        try:
            navsoup = BeautifulSoup(nav_file.read(),
                                    convertEntities=entities)
        finally:
            # Close the include file even if reading or parsing fails.
            nav_file.close()
        for a in navsoup.findAll("a", href="/" + self.source_path):
            if (a.get("class") == "valikon_tekstit"
                    or a.parent.get("class") == "avattu_alavalikko"):
                label = a.find(text=re.compile(r"[^\s]+"))
                if label is None:
                    # Link without any text cannot provide a title.
                    continue
                nav_title = label.strip()
                break

    # meta title
    try:
        meta_title = docsoup.head.title.string
    except AttributeError:
        pass
    if meta_title:
        # Drop the site name and slogan from the " - " separated title.
        valid_meta_title_parts = [
            part for part in (raw.strip() for raw in meta_title.split(" - "))
            if part not in ("BirdLife Suomi", u"Yhdessä lintujen puolesta")
        ]
        meta_title = u" – ".join(valid_meta_title_parts)

    # choose best title
    self.debug("Titles: nav: '%s', meta: '%s', content: '%s'"
               % (nav_title, meta_title, content_title))
    if nav_title:
        title = nav_title
        self.debug("Title choice: nav_title: %s" % title)
    elif (content_title and meta_title
            and len(content_title) < len(meta_title)):
        title = content_title
        self.debug("Title choice: content_title (shorter): %s" % title)
    elif meta_title:
        title = meta_title
        self.debug("Title choice: meta_title: %s" % title)
    elif content_title:
        title = content_title
        self.debug("Title choice: content_title: %s" % title)
    else:
        title = "%s (autogen)" % self.slug.capitalize()
        self.debug("Title choice: autogenerated from slug: %s" % title)

    if not title:
        raise ImportError("No title")

    if self.level == 0:
        if self.slug in ("liity", "suojelu", "lintuharrastus",
                         "julkaisut", "yhdistys"):
            self.template = "osio.html"
        if self.slug == "":
            self.template = "etusivu.html"

    self.title = title
    self.body = body