import lxml.html


def trim(html, prefix_url=None):
    """
    Remove comments; remove javascript (except for 100.daum.net);
    convert \r\n -> \n; convert <br> and <p> in the html to \n;
    collapse multiple blanks, \t, and \n into one.
    :param html:
    :param prefix_url:
    :return:
    """
    html = html.replace('\r\n', '\n')
    convert_dic = {'<br>': '\n', '<br/>': '\n', '<br />': '\n',
                   '<p>': '\n', '<p/>': '\n', '<p />': '\n',
                   '<BR>': '\n', '<BR/>': '\n', '<BR />': '\n',
                   '<P>': '\n', '<P/>': '\n', '<P />': '\n'}
    for _from, _to in convert_dic.items():
        html = html.replace(_from, _to)

    html = HtmlUtil.remove_comments_in_html(html)  # remove html comments.
    doc = lxml.html.document_fromstring(html)  # convert to html element.
    if prefix_url:
        doc.make_links_absolute(prefix_url)  # convert links to absolute links.
    if not prefix_url or '100.daum.net' not in prefix_url:
        # Some HTML becomes partly invisible when javascript is removed (100.daum.net), so skip it there.
        doc = HtmlUtil.remove_javascripts_in_doc(doc)  # remove javascript elements.
    html = lxml.html.tostring(doc, encoding='utf8', include_meta_content_type=True)  # convert to html string.
    html = html.decode('utf8')  # bytes -> string
    html = StringUtil.merge(html)  # replace multiple blanks with one blank.
    return html.strip()
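# Usage sketch (assumption: the project-local HtmlUtil and StringUtil helpers are
# importable; the sample markup and prefix URL below are made up for illustration):
raw = '<html><body><!-- ad --><p>Hello</p><br/>world\r\n<a href="/next">next</a></body></html>'
print(trim(raw, prefix_url='http://example.com/'))
# -> comment removed, <p>/<br> turned into newlines, /next rewritten as an absolute link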
import bs4


def abrhtml(html, length=85):
    """Abbreviate html so its visible text fits in `length` characters, appending '...'."""
    soup = bs4.BeautifulSoup(html.strip(), 'html.parser')
    size = len(soup.getText())
    toRemove = size - length
    if toRemove > 0:
        # Truncate only the last text node; assumes it is long enough to absorb the cut.
        lastString = soup.find_all(string=True)[-1]
        lastString.replace_with(lastString[:-toRemove] + "...")
        return True, soup.prettify()
    else:
        return False, soup.prettify()
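# Illustrative call (the input string is made up); with 200 visible characters the
# text exceeds length=85, so the last text node is truncated and '...' appended:
truncated, out = abrhtml('<div><p>' + 'x' * 200 + '</p></div>', length=85)
print(truncated)  # True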
import re


def _extractRawText(self):
    self.rawVars = {}
    self.raw_text = html.strip(self.content)  # html.strip: tag-stripping helper, not the stdlib html module

    # See if we can get the title. We grab anything until the first tag,
    # because there aren't supposed to be other tags inside the title.
    title_regex = re.compile("<title>([^<]*)<", re.IGNORECASE | re.DOTALL | re.MULTILINE)
    match = title_regex.search(self.content)
    if match:  # guard: a page without a <title> would otherwise raise AttributeError
        self.rawVars[dc.title] = match.group(1)
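# Standalone check of just the title regex (dc and the tag-stripping html helper are
# external to this snippet, so only the pattern is exercised; the page is made up):
import re
title_regex = re.compile("<title>([^<]*)<", re.IGNORECASE | re.DOTALL | re.MULTILINE)
match = title_regex.search("<html><head><title>My Page</title></head></html>")
print(match.group(1))  # -> My Page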
from bs4 import BeautifulSoup


def break_into_sentences(elem):
    simple_html = simplified_inner_html(elem)
    split_html = SENT_RE.findall(simple_html)

    sentences = []
    for html in split_html:
        shtml = html.strip()
        if not shtml:
            continue
        text = BeautifulSoup(shtml, 'html.parser').get_text()  # hacky but works!
        sentences.append({
            'html': shtml,
            'text': text,
            'chars': count_meaty_chars(text),
        })
    return sentences
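# Illustrative call; simplified_inner_html, SENT_RE, and count_meaty_chars are defined
# elsewhere, and `elem` would be a parsed element of the page being segmented:
for sent in break_into_sentences(elem):
    print(sent['chars'], sent['text'])  # each dict also carries the raw 'html' slice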
import re

reg = re.compile(r'<p> <a name="(\d+)"><b>([^<]+)</b></a>.*?<blockquote>(.*?)</blockquote>', re.S)
reg2 = re.compile(r'<p> <a name="(\d+)"><b>([^<]+)</b></a>(.*?)<blockquote>', re.S)
m = reg.findall(doc)
if not m:  # findall never raises, so fall back when the stricter pattern finds nothing
    m = reg2.findall(doc)

print("Content-type: application/xml")
print()
print("""
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  xmlns:dc="http://purl.org/dc/elements/1.1/"
  xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
  xmlns:content="http://purl.org/rss/1.0/modules/content/"
  xmlns="http://purl.org/rss/1.0/">
<channel rdf:about="http://www.aaronsw.com/2002/zooko">
<title>""" + person + """'s Advogato Diary</title>
<link>""" + zurl + """</link>
<language>en-US</language>
<items><rdf:Seq>""")
for item in m:
    zuri = zurl + "?start=" + item[0]
    print('  <rdf:li rdf:resource="' + zuri + '" />')
print("""</rdf:Seq></items>
</channel>""")
for item in m:
    zuri = zurl + "?start=" + item[0]
    print('<item rdf:about="' + zuri + '">')
    print("  <link>" + zuri + "</link>")
    print("  <title>" + html.strip(item[1]) + "</title>")  # html.strip: project helper that strips tags
    print("  <description>" + html.escape(item[2]) + "</description>")
    print("</item>")
print("</rdf:RDF>")
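# The fragment above assumes doc, person, zurl, and an html helper are bound earlier in
# the CGI script; one plausible setup (the person name is illustrative only). Note the
# stdlib html module provides escape() but not strip(), so a tag-stripping strip() must
# come from a project-local helper:
import html
from urllib.request import urlopen

person = "zooko"
zurl = "http://www.advogato.org/person/" + person + "/diary.html"
doc = urlopen(zurl).read().decode("utf-8", errors="replace")  # fetch the diary page to scrape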