# -*- coding: utf-8 -*-
# parse_html variants for a set of news sites (Python 2). Shared imports
# for all of the parsers below; Article is the project's own model class,
# imported from wherever it is defined.
import json
import re
import time
import urllib2

from bs4 import BeautifulSoup


def parse_html(self, a_url, a_time, a_category, subcategory):
    try:
        html = urllib2.urlopen(a_url, timeout=30).read()
        soup = BeautifulSoup(html, "lxml")
        div = soup.find(name="div", class_="hl_content")
        # Title
        a_title = div.find(name="div", class_="hl_c_title").h2.string.encode("utf-8")
        # Author, time (skip the 3-character label before the author name)
        a_author = div.find(name="div", class_="hl_c_twid").span.string[3:].encode("utf-8")
        # Body text
        a_text = ""
        plist = div.find(name="div", class_="hl_body").find_all(name="p")
        for p in plist:
            for string in p.stripped_strings:
                a_text += string.encode("utf-8")
            a_text += "\n"
        # Tags: the subcategory doubles as the tag here
        a_tags = subcategory
        article = Article(a_title=a_title, a_text=a_text, a_time=a_time,
                          a_author=a_author, a_url=a_url, a_tags=a_tags,
                          a_category=a_category)
        return article
    except urllib2.HTTPError as e:
        print "HTTPError:", e.code, e.reason

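# Every parser here catches only urllib2.HTTPError, so a DNS failure or a
# timeout (urllib2.URLError, socket.timeout) propagates to the caller. If
# that is not intentional, a wrapper could centralize the guard. This is a
# sketch, not part of the original code; the name fetch is hypothetical.
import socket

def fetch(url, timeout=30):
    """Return the page body, or None on any fetch error."""
    try:
        return urllib2.urlopen(url, timeout=timeout).read()
    except (urllib2.URLError, socket.timeout) as e:  # HTTPError subclasses URLError
        print "fetch failed:", url, e
        return None
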
def parse_html(self, a_url, a_time, a_category, a_tag):
    try:
        html = urllib2.urlopen(a_url, timeout=30).read()
        soup = BeautifulSoup(html, "lxml")
        # Named post rather than article so it does not shadow the Article built below
        post = soup.find(name="article", class_="post-article")
        # Title
        a_title = post.header.h1.string.encode("utf-8")
        # Author, time
        a_author = post.header.a.span.string.encode("utf-8")
        # Body text: only direct <p> children of div.post-con
        a_text = ""
        plist = post.find(name="div", class_="post-con").find_all(name="p", recursive=False)
        for p in plist:
            for string in p.stripped_strings:
                a_text += string.encode("utf-8")
            a_text += "\n"
        # Tags: the caller supplies the tag
        article = Article(a_title=a_title, a_text=a_text, a_time=a_time,
                          a_author=a_author, a_url=a_url, a_tags=a_tag,
                          a_category=a_category)
        return article
    except urllib2.HTTPError as e:
        print "HTTPError:", e.code, e.reason

def parse_html(a_url, a_time):
    # time.sleep(5)
    try:
        html = urllib2.urlopen(a_url, timeout=30).read()
        soup = BeautifulSoup(html, "lxml")
        div_post_content_main = soup.find(name="div", class_="post_content_main")
        # Title
        a_title = div_post_content_main.h1.string.encode("utf-8")
        # Author, time (a_time is supplied by the caller)
        a_author = div_post_content_main.div.a.string.encode("utf-8")
        # Body text
        a_text = ""
        plist = div_post_content_main.find(name="div", class_="post_text").find_all("p")
        for p in plist:
            for string in p.stripped_strings:
                a_text += string.encode("utf-8")
            a_text += "\n"
        # Tags: none on this page
        a_tags = ""
        article = Article(a_title=a_title, a_text=a_text, a_time=a_time,
                          a_author=a_author, a_url=a_url, a_tags=a_tags)
        return article
    except urllib2.HTTPError as e:
        print "HTTPError:", e.code, e.reason

def parse_html(a_url):
    # time.sleep(5)
    try:
        html = urllib2.urlopen(a_url, timeout=30).read()
        soup = BeautifulSoup(html, "lxml")
        article_wrap = soup.find(name="div", class_="article-wrap")
        # Title
        a_title = article_wrap.h1.string.encode("utf-8")
        # Author, time
        article_author = article_wrap.find(name="div", class_="article-author")
        a_author = article_author.span.a.string.encode("utf-8")
        a_time = article_author.find(
            name="span", class_="article-time").string.encode("utf-8")
        # Body text
        a_text = ""
        plist = article_wrap.find(
            name="div", class_="article-content-wrap").find_all(name="p")
        for p in plist:
            for string in p.stripped_strings:
                a_text += string.encode("utf-8")
            a_text += "\n"
        # Tags (note the trailing space in the "tag-box " selector)
        a_tags = ""
        div_tag_box = article_wrap.find(name="div", class_="tag-box ")
        if div_tag_box is not None:
            alist = div_tag_box.ul.find_all(name="a")
        else:
            alist = []
        if alist:
            a_tags = " ".join(a.li.string.encode("utf-8") for a in alist)
        article = Article(a_title=a_title, a_text=a_text, a_time=a_time,
                          a_author=a_author, a_url=a_url, a_tags=a_tags)
        return article
    except urllib2.HTTPError as e:
        print "HTTPError:", e.code, e.reason

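# The guard-then-join pattern above recurs in most of the tag blocks below.
# A compact equivalent, as a sketch: the helper name is hypothetical, and
# each site still needs its own lookup for the tag container.
def join_tag_strings(tag_box):
    """Space-join the anchor strings under tag_box; return "" if absent."""
    if tag_box is None:
        return ""
    return " ".join(a.string.encode("utf-8")
                    for a in tag_box.find_all(name="a") if a.string)
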
def parse_html(self, a_url, a_time, a_category):
    try:
        html = urllib2.urlopen(a_url, timeout=30).read()
        soup = BeautifulSoup(html, "lxml")
        divs = soup.find(name="div", class_="center-research-t").div.find_all(
            name="div", recursive=False)
        div_txt = divs[0]
        div_foot = divs[1]
        # Title
        a_title = div_txt.h1.string.encode("utf-8")
        # Author, time
        a_author = div_txt.find(
            name="div", class_="t11_info").div.a.string.encode("utf-8")
        # Body text
        a_text = ""
        plist = div_txt.find(
            name="div", class_="t11_mlblk t11_contentarea").find_all(name="p")
        for p in plist:
            for string in p.stripped_strings:
                a_text += string.encode("utf-8")
            a_text += "\n"
        # Tags (find_all returns a list, never None, so test for emptiness)
        a_tags = ""
        alist = div_foot.div.find_all(name="a", recursive=False)
        if alist:
            a_tags = " ".join(a.string.encode("utf-8") for a in alist)
        article = Article(a_title=a_title, a_text=a_text, a_time=a_time,
                          a_author=a_author, a_url=a_url, a_tags=a_tags,
                          a_category=a_category)
        return article
    except urllib2.HTTPError as e:
        print "HTTPError:", e.code, e.reason

def parse_html(self, a_url, a_time):
    try:
        html = urllib2.urlopen(a_url, timeout=30).read()
        soup = BeautifulSoup(html, "lxml")
        # The article lives in a <script> tag as "var props={...},locationnal=..."
        scriptstr = soup.find(name="script",
                              text=re.compile("var props")).string.encode("utf-8")
        # Strip the leading "var props=" (10 characters) and everything
        # from ",locationnal" on, leaving bare JSON
        scriptstr = scriptstr[10:scriptstr.find(",locationnal")]
        json_obj = json.loads(scriptstr)
        data = json_obj["detailArticle|post"]
        # Title
        a_title = data["title"].encode("utf-8")
        # Author, time (a_time is supplied by the caller)
        a_author = data["user"]["name"].encode("utf-8")
        # Body text: the "content" field is itself HTML
        a_text = ""
        soup = BeautifulSoup(data["content"], "lxml")
        for p in soup.find_all(name="p"):
            for string in p.stripped_strings:
                a_text += string.encode("utf-8")
            a_text += "\n"
        # Tags, e.g. "[[\"早期项目\",\"zaoqixiangmu\",1],[\"信息安全\",\"xinxianquan\",2]]";
        # keep only the Chinese tag names
        a_tags = ""
        tags = data["extraction_tags"]
        if tags is not None:
            tags = re.findall(u"[\u4e00-\u9fa5]+", tags)
            if tags:
                a_tags = u" ".join(tags).encode("utf-8")
        article = Article(a_title=a_title, a_text=a_text, a_time=a_time,
                          a_author=a_author, a_url=a_url, a_tags=a_tags)
        return article
    except urllib2.HTTPError as e:
        print "HTTPError:", e.code, e.reason

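# A self-contained check of the slicing above: "var props=" is exactly 10
# characters, so scriptstr[10:scriptstr.find(",locationnal")] isolates the
# JSON object. The sample data below is invented; only the
# "detailArticle|post" key mirrors the real page.
def _demo_script_json():
    scriptstr = 'var props={"detailArticle|post":{"title":"demo"}},locationnal={}'
    payload = scriptstr[10:scriptstr.find(",locationnal")]
    data = json.loads(payload)["detailArticle|post"]
    assert data["title"] == "demo"
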
def parse_html(self, a_url, a_time):
    try:
        html = urllib2.urlopen(a_url, timeout=30).read()
        soup = BeautifulSoup(html, "lxml")
        div_hl_content = soup.find(name="div", class_="hl_content")
        # Title: contents[0] because the <h2> holds more than one child
        a_title = div_hl_content.find(
            name="div", class_="hl_c_title").h2.contents[0].encode("utf-8")
        # Author, time: author extraction is not implemented yet;
        # the commented line is the abandoned attempt
        # a_author = div_hl_content.find(name="div", class_="hl_c_twid").a.stripped_string
        a_author = "暂未提取"  # placeholder, "not extracted yet"
        # Body text
        a_text = ""
        plist = div_hl_content.find(name="div", class_="hl_body").find_all(name="p")
        for p in plist:
            for string in p.stripped_strings:
                a_text += string.encode("utf-8")
            a_text += "\n"
        # Tags (default to an empty list so the length check below is safe)
        a_tags = ""
        div_hl_c_tagl = div_hl_content.find(name="div", class_="hl_c_tagl")
        if div_hl_c_tagl is not None:
            lis = div_hl_c_tagl.ul.find_all(name="li")
        else:
            lis = []
        if lis:
            a_tags = " ".join(li.a.string.encode("utf-8") for li in lis)
        article = Article(a_title=a_title, a_text=a_text, a_time=a_time,
                          a_author=a_author, a_url=a_url, a_tags=a_tags)
        return article
    except urllib2.HTTPError as e:
        print "HTTPError:", e.code, e.reason

def parse_html(self, a_url, a_time):
    time.sleep(2)
    try:
        html = urllib2.urlopen(a_url, timeout=30).read()
        soup = BeautifulSoup(html, "lxml")
        header_top_section = soup.find(name="header", class_="top-section")
        section_main_content = soup.find(name="section", class_="main-content")
        # Title
        a_title = header_top_section.h1.string.encode("utf-8")
        # Author, time (a_time is supplied by the caller)
        a_author = header_top_section.div.a.span.string.encode("utf-8")
        # Body text
        a_text = ""
        plist = section_main_content.find(
            name="div", class_="article-content").find_all(name="p")
        for p in plist:
            for string in p.stripped_strings:
                a_text += string.encode("utf-8")
            a_text += "\n"
        # Tags
        a_tags = ""
        section_tags = section_main_content.section
        if section_tags is not None:
            alist = section_tags.find_all(name="a")
        else:
            alist = []
        if alist:
            a_tags = " ".join(a.string.encode("utf-8") for a in alist)
        article = Article(a_title=a_title, a_text=a_text, a_time=a_time,
                          a_author=a_author, a_url=a_url, a_tags=a_tags)
        return article
    except urllib2.HTTPError as e:
        print "HTTPError:", e.code, e.reason

def parse_html(self, a_url, a_time, a_category):
    try:
        html = urllib2.urlopen(a_url, timeout=30).read()
        soup = BeautifulSoup(html, "lxml")
        div_post = soup.find(name="div", id="post_content")
        # Title
        a_title = div_post.find(name="div", id="post_title").string.encode("utf-8")
        # Author, time
        a_author = (div_post.find(name="div", id="post_info")
                    .div.find(name="div", id="post_author").string.encode("utf-8"))
        # Body text: only direct <p> children
        a_text = ""
        plist = div_post.find(name="div", id="post_description").find_all(
            name="p", recursive=False)
        for p in plist:
            for string in p.stripped_strings:
                a_text += string.encode("utf-8")
            a_text += "\n"
        # Tags: skip anchors without a string; find_all returns a list, never None
        a_tags = ""
        alist = (div_post.find(name="div", class_="article_info_box")
                 .find(name="div", class_="article_info_box_right").find_all(name="a"))
        if alist:
            a_tags = " ".join(a.string.encode("utf-8")
                              for a in alist if a.string is not None)
        article = Article(a_title=a_title, a_text=a_text, a_time=a_time,
                          a_author=a_author, a_url=a_url, a_tags=a_tags,
                          a_category=a_category)
        return article
    except urllib2.HTTPError as e:
        print "HTTPError:", e.code, e.reason

def parse_html(self, a_url, a_time, a_category):
    time.sleep(1)
    try:
        html = urllib2.urlopen(a_url, timeout=30).read()
        soup = BeautifulSoup(html, "lxml")
        div = soup.find(name="div", id="page")
        # Title (the <h1> may contain several strings)
        a_title = ""
        for string in div.h1.stripped_strings:
            a_title += string.encode("utf-8")
        # Author, time: the page shows no author, and its timestamp omits
        # the year, so prepend the current year before normalizing
        a_author = ""
        a_time = div.div.div.span.string.encode("utf-8")
        a_time = time.strftime("%Y", time.localtime(time.time())) + "年" + a_time
        a_time = self.time_normalize(a_time, "%Y年%m月%d日 %H:%M")
        # Body text
        a_text = ""
        plist = div.find(name="div", class_="article-detail").find_all(
            name="p", recursive=False)
        for p in plist:
            for string in p.stripped_strings:
                a_text += string.encode("utf-8")
            a_text += "\n"
        # Tags: the second direct child <div> holds the tag anchors
        a_tags = ""
        div_tags = div.div.find_all(name="div", recursive=False)[1]
        alist = div_tags.find_all(name="a")
        if alist:
            a_tags = " ".join(a.string.encode("utf-8") for a in alist)
        article = Article(a_title=a_title, a_text=a_text, a_time=a_time,
                          a_author=a_author, a_url=a_url, a_tags=a_tags,
                          a_category=a_category)
        return article
    except urllib2.HTTPError as e:
        print "HTTPError:", e.code, e.reason

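# time_normalize is defined elsewhere on this class. A plausible sketch,
# assuming it reparses the string with the given format and re-emits it in
# one canonical form; the output format here is an assumption.
def time_normalize(self, timestr, fmt):
    """Reformat timestr (parsed with fmt) into a canonical timestamp."""
    parsed = time.strptime(timestr, fmt)
    return time.strftime("%Y-%m-%d %H:%M:%S", parsed)
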
def parse_html(self, a_url, a_time, a_category):
    try:
        html = urllib2.urlopen(a_url, timeout=30).read()
        soup = BeautifulSoup(html, "lxml")
        div = soup.find(name="div", class_="main-content")
        # Title
        a_title = div.div.string.encode("utf-8")
        # Author, time
        a_author = div.find(name="div", class_="author").find(
            name="span", class_="name").string.encode("utf-8")
        # Body text: some pages wrap the paragraphs in div.left523
        a_text = ""
        div_var = div.find(name="div", class_="left523")
        if div_var is not None:
            plist = div_var.find_all(name="p", recursive=False)
        else:
            plist = div.find_all(name="p", recursive=False)
        for p in plist:
            for string in p.stripped_strings:
                a_text += string.encode("utf-8")
            a_text += "\n"
        # Tags
        a_tags = ""
        div_tags = div.find(name="div", class_="fl tags")
        if div_tags is not None:
            alist = div_tags.find_all(name="span")
        else:
            alist = []
        if alist:
            a_tags = " ".join(span.string.encode("utf-8") for span in alist)
        article = Article(a_title=a_title, a_text=a_text, a_time=a_time,
                          a_author=a_author, a_url=a_url, a_tags=a_tags,
                          a_category=a_category)
        return article
    except urllib2.HTTPError as e:
        print "HTTPError:", e.code, e.reason

def parse_html(self, a_url, a_time):
    # time.sleep(1)
    try:
        req = urllib2.Request(a_url, headers=self.headers)
        html = urllib2.urlopen(req).read()
        soup = BeautifulSoup(html, "lxml")
        article_left = soup.find(name="div", class_="article-left lph-left")
        # Title
        a_title = article_left.div.h1.string.encode("utf-8")
        # Author, time (a_time is supplied by the caller)
        a_author = article_left.find(
            name="div", class_="pi-author").a.string.encode("utf-8")
        # Body text (note the trailing space in the class string)
        a_text = ""
        plist = article_left.find(
            name="div", class_="pageCont lph-article-comView ").find_all(name="p")
        for p in plist:
            for string in p.stripped_strings:
                a_text += string.encode("utf-8")
            a_text += "\n"
        # Tags: none on this page
        a_tags = ""
        article = Article(a_title=a_title, a_text=a_text, a_time=a_time,
                          a_author=a_author, a_url=a_url, a_tags=a_tags)
        return article
    except urllib2.HTTPError as e:
        print "HTTPError:", e.code, e.reason

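# The paragraph-walking loop is repeated verbatim in all twelve parsers
# above. A shared helper could replace it; this is a sketch and the name
# join_paragraphs is hypothetical. It reproduces the loops exactly: the
# stripped strings of each <p> are concatenated, one paragraph per line.
def join_paragraphs(plist):
    """Return UTF-8 text, one newline-terminated line per <p> in plist."""
    return "".join(
        "".join(s.encode("utf-8") for s in p.stripped_strings) + "\n"
        for p in plist)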