def ssdut_news_parse(raw): ''' parse the raw page src, store all result in a Storage object. all strings are unicode result.soup BeautifulSoup object result.raw raw page src result.hash sha1 hash of the page result.title title result.source 来源 result.date_str - date in string result.date - date object result.body html src of the news body result.clean_body unescaped src of the news body, result.publisher 发表人 ''' soup = bsoup(raw) result = Storage() # raw page / hash result.raw = raw result.soup = soup # title s = soup.find(attrs={'class': re_compile('title')}) result.title = s.text # source text = soup.find(text=re_compile(r"^http://ssdut.dlut.edu.cn")) r = re_compile(ur"(\d+-\d+-\d+)\u3000\u3000\u6765\u6e90:(.+)\u5173\u6ce8:") res = r.findall(text)[0] result.source = res[1].rstrip() # date result.date_str = res[0] result.date = datetime.date(*[int(n) for n in result.date_str.split('-')]) # content (body) c = soup.find(attrs={'class': re_compile('content')}) result.body = unicode(c) # content (body) unescaped texts = c.findAll(text=True) all_texts = '\n'.join(texts) result.clean_body = html_parser.unescape(all_texts) # publisher (could be find at the bottom of page) s = soup.find( attrs={ "style": "font-size:14px;float:left;text-align:right;width:80%" }) r = re_compile(ur"\u53d1\u8868\u4eba\uff1a(.+)") #logging.debug("publisher string = %r " % s) try: name = r.findall(s.text)[0] except: logging.warn(" %s has no publisher " % result.title) name = "" # no publisher: like this: index.php/News/8692.html result.publisher = name.rstrip().lstrip() # use utf-8 encoding for k in ['title', 'source', 'body', 'clean_body', 'publisher']: result[k] = result[k].encode('utf-8') hash_src = result.body + result.title + result.publisher if isinstance(hash_src, str): hash_src = unicode(hash_src, "utf-8", "ignore") elif isinstance(hash_src, unicode): pass else: pass result.sha1 = sha1(hash_src.encode("utf-8")).hexdigest() result.search_text = ''.join([ result.title, result.source, result.clean_body, result.publisher, result.sha1 ]) return result
def ssdut_news_parse(raw): ''' parse the raw page src, store all result in a Storage object. all strings are unicode result.soup BeautifulSoup object result.raw raw page src result.hash sha1 hash of the page result.title title result.source 来源 result.date_str - date in string result.date - date object result.body html src of the news body result.clean_body unescaped src of the news body, result.publisher 发表人 ''' soup = bsoup(raw) result = Storage() # raw page / hash result.raw = raw result.soup = soup # title s = soup.find(attrs={'class': re_compile('title')}) result.title = s.text # source text = soup.find(text=re_compile(r"^http://ssdut.dlut.edu.cn")) r = re_compile( ur"(\d+-\d+-\d+)\u3000\u3000\u6765\u6e90:(.+)\u5173\u6ce8:") res = r.findall(text)[0] result.source = res[1].rstrip() # date result.date_str = res[0] result.date = datetime.date(*[int(n) for n in result.date_str.split('-')]) # content (body) c = soup.find(attrs={'class': re_compile('content')}) result.body = unicode(c) # content (body) unescaped texts = c.findAll(text=True) all_texts = '\n'.join(texts) result.clean_body = html_parser.unescape(all_texts) # publisher (could be find at the bottom of page) s = soup.find( attrs={ "style": "font-size:14px;float:left;text-align:right;width:80%" }) r = re_compile(ur"\u53d1\u8868\u4eba\uff1a(.+)") #logging.debug("publisher string = %r " % s) try: name = r.findall(s.text)[0] except: logging.warn(" %s has no publisher " % result.title) name = "" # no publisher: like this: index.php/News/8692.html result.publisher = name.rstrip().lstrip() # use utf-8 encoding for k in ['title', 'source', 'body', 'clean_body', 'publisher']: result[k] = result[k].encode('utf-8') hash_src = result.body + result.title + result.publisher if isinstance(hash_src, str): hash_src = unicode(hash_src, "utf-8", "ignore") elif isinstance(hash_src, unicode): pass else: pass result.sha1 = sha1(hash_src.encode("utf-8")).hexdigest() result.search_text = ''.join([result.title, result.source, result.clean_body, result.publisher, result.sha1]) return result