Example no. 1
0
    def trim(html, prefix_url=None):
        """
        Normalize an HTML string for text extraction.

        - removes HTML comments and (usually) javascript elements; javascript
          is kept for 100.daum.net pages, where stripping it hides content
        - converts '\r\n' to '\n'
        - converts <br>/<p> tag variants (both cases) to '\n'
        - collapses runs of whitespace/tabs/newlines into one

        :param html: raw HTML string
        :param prefix_url: base URL used to make links absolute; also decides
                           the 100.daum.net javascript exception
        :return: cleaned, stripped HTML string
        """
        html = html.replace('\r\n', '\n')
        # Turn explicit break/paragraph tags into newlines before parsing.
        # .upper() covers the <BR>/<P> spellings of each lowercase variant.
        for tag in ('<br>', '<br/>', '<br />', '<p>', '<p/>', '<p />'):
            html = html.replace(tag, '\n')
            html = html.replace(tag.upper(), '\n')
        html = HtmlUtil.remove_comments_in_html(html)  # remove html comments.
        doc = lxml.html.document_fromstring(html)  # convert to html element.

        if prefix_url:
            doc.make_links_absolute(prefix_url)  # convert links to absolute links.

        # Some pages (100.daum.net) partially disappear when javascript is
        # removed, so javascript is kept for them.
        keep_javascript = bool(prefix_url) and '100.daum.net' in prefix_url
        if not keep_javascript:
            doc = HtmlUtil.remove_javascripts_in_doc(doc)  # remove javascript elements.

        html = lxml.html.tostring(doc, encoding='utf8', include_meta_content_type=True)  # convert to html string.
        html = html.decode('utf8')  # bytes -> string
        html = StringUtil.merge(html)  # replace multiple blanks to one blank.
        return html.strip()
Example no. 2
0
def abrhtml(html, length=85):
    """Abbreviate an HTML fragment's text content to at most *length* chars.

    Returns a ``(truncated, pretty_html)`` pair: *truncated* is True when the
    text exceeded *length* and the final text node was cut and suffixed with
    "...", False when the fragment was returned unshortened (prettified).
    """
    soup = bs4.BeautifulSoup(html.strip(), 'html.parser')
    overflow = len(soup.getText()) - length
    if overflow <= 0:
        return False, soup.prettify()
    # Cut the excess off the last text node and mark the truncation point.
    tail = soup.find_all(string=True)[-1]
    tail.replace_with(tail[:-overflow] + "...")
    return True, soup.prettify()
Example no. 3
0
    def _extractRawText(self):
        """
        Strip markup from ``self.content`` and record the document title.

        Side effects:
          - resets ``self.rawVars`` to a fresh dict
          - sets ``self.raw_text`` to the stripped content
          - stores the <title> text under ``self.rawVars[dc.title]`` when a
            <title> tag is present

        Fix: ``search`` returns ``None`` on pages without a <title>, which
        previously made ``match.group(1)`` raise AttributeError; such pages
        now simply leave ``dc.title`` unset.
        """
        self.rawVars = {}
        # NOTE(review): `html` here appears to be a project helper module
        # (stdlib html has no strip()) -- confirm.
        self.raw_text = html.strip(self.content)

        # see if we can get the title
        # We get anything until the first tag, cause there aren't supposed to be other tags inside the title
        title_regex = re.compile("<title>([^<]*)<",
                      re.IGNORECASE|re.DOTALL|re.MULTILINE)

        match = title_regex.search(self.content)
        if match is not None:  # pages without a <title> no longer crash
            self.rawVars[dc.title] = match.group(1)
Example no. 4
0
def break_into_sentences(elem):
    """Split an element's simplified inner HTML into per-sentence records.

    Each record is a dict with keys 'html' (the sentence markup), 'text'
    (its plain text), and 'chars' (its "meaty" character count). Fragments
    that are empty after stripping are skipped.
    """
    fragments = SENT_RE.findall(simplified_inner_html(elem))

    sentences = []
    for fragment in fragments:
        markup = fragment.strip()
        if not markup:
            continue
        # hacky but works!
        plain = BeautifulSoup(markup, 'html.parser').get_text()
        sentences.append({
            'html': markup,
            'text': plain,
            'chars': count_meaty_chars(plain),
        })

    return sentences
Example no. 5
0
# Diary-entry pattern: anchor id, bold title, body inside <blockquote>.
# NOTE(review): patterns are not raw strings; '\d' only survives because it
# is not a recognized string escape -- prefer r'...' if this is ever edited.
reg = re.compile('<p> <a name="(\d+)"><b>([^<]+)</b></a>.*?<blockquote>(.*?)</blockquote>', re.S)
# Fallback pattern: entry text appears before the <blockquote>.
reg2 = re.compile('<p> <a name="(\d+)"><b>([^<]+)</b></a>(.*?)<blockquote>', re.S)
# NOTE(review): re.findall returns [] on no match rather than raising, so
# this except branch likely never fires for "no entries" -- confirm intent.
try: 
 	m = reg.findall(doc)
except:
	m = reg2.findall(doc)

# CGI response header (Python 2 script): serve the feed as XML.
print "Content-type: application/xml"
print

# RSS 1.0 / RDF channel preamble; `person`, `zurl`, and `doc` are defined
# earlier in the script, outside this excerpt.
print """
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:sy="http://purl.org/rss/1.0/modules/syndication/" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns="http://purl.org/rss/1.0/">
<channel rdf:about="http://www.aaronsw.com/2002/zooko">
  <title>""" + person + """'s Advogato Diary</title>
  <link>""" + zurl + """</link>
  <language>en-US</language>
  <items><rdf:Seq>"""
# Table of contents: one <rdf:li> per entry, keyed by its start offset.
for item in m:
	zuri = zurl + "?start=" + item[0]
	print '    <rdf:li rdf:resource="' + zuri + '" />'
print """  </rdf:Seq></items>
</channel>"""

# One <item> per entry: item[0] = anchor id, item[1] = title, item[2] = body.
# NOTE(review): `html` here appears to be a project helper module, not the
# stdlib (stdlib html has no strip()) -- confirm.
for item in m:
	zuri = zurl + "?start=" + item[0]
	print '<item rdf:about="' + zuri + '">'
	print "  <link>"+zuri+"</link>"
	print "  <title>" + html.strip(item[1]) + "</title>"
	print "  <description>" + html.escape(item[2]) + "</description>"
	print "</item>"
print "</rdf:RDF>"