def crawl(self):
    """Crawl one WeChat article page (reached via sogou) and export it.

    ``self.key`` is the article URL; ``self.data`` carries scheduler
    metadata (region fields, optional ``publisher`` and ``key``).  When
    the task originated from a keyword search (``data['key']`` present)
    the record is exported as a SearchArticleModel, otherwise as a
    WeixinArticleModel.
    """
    homepage = self.key
    data = self.data
    html_stream = _get_url(homepage)
    soup = HandleContent.get_BScontext(html_stream)
    # The article body is held by one of these two wrapper divs.
    content = soup.find_all('div',
                            class_=['rich_media_content',
                                    'rich_media_thumb_wrp'])
    # XPaths written as adjacent string literals so no stray
    # continuation-line whitespace leaks into the expression.
    xp_title = ("//div[@class='rich_media_area_primary']"
                "/h2[@class='rich_media_title']/text()")
    xp_putime = ("//div/em[@class='rich_media_meta rich_media_meta_text']"
                 "/text()")
    xp_author = ("//div/em[@class='rich_media_meta rich_media_meta_text']"
                 "[2]/text()")
    xp_publisher = "//div/a[@id='post-user']/text()"
    title = HandleContent.get_title(html_stream, xpath=xp_title)
    pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
    author = HandleContent.get_author(html_stream, xpath=xp_author)
    publisher = HandleContent.get_author(html_stream, xpath=xp_publisher)
    # Normalize the extracted markup (relative links resolved against the
    # page URL), then keep a plain-text copy in the comment payload.
    content = clear_label(content, root=homepage)
    text = HandleContent.get_BScontext(content, text=True).text
    comment = {'content': clear_space(text)}
    date = new_time()
    crawl_data = {
        'province': data.get('province', ''),
        'city': data.get('city', ''),
        'district': data.get('district', ''),
        'url': homepage,
        'title': title,
        'content': content,
        'pubtime': pubtime,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'source': 'sogou',
        'author': author,
        # Scheduler-supplied publisher wins over the one scraped from the page.
        'publisher': data.get('publisher', publisher),
        'origin_source': u'微信公共账号',
        'type': u'微信',
        'comment': comment,
    }
    if data.get('key'):
        # Keyword-search task: merge search metadata into the record.
        crawl_data.update(data)
        model = SearchArticleModel(crawl_data)
    else:
        model = WeixinArticleModel(crawl_data)
    export(model)
def crawl(self):
    """Crawl one WeChat article page (reached via sogou) and export it.

    ``self.key`` is the article URL; ``self.data`` carries scheduler
    metadata (region fields, optional ``publisher`` and ``key``).  When
    the task originated from a keyword search (``data['key']`` present)
    the record is exported as a SearchArticleModel, otherwise as a
    WeixinArticleModel.
    """
    homepage = self.key
    data = self.data
    html_stream = _get_url(homepage)
    soup = HandleContent.get_BScontext(html_stream)
    # The article body is held by one of these two wrapper divs.
    content = soup.find_all('div',
                            class_=['rich_media_content',
                                    'rich_media_thumb_wrp'])
    # XPaths written as adjacent string literals so no stray
    # continuation-line whitespace leaks into the expression.
    xp_title = ("//div[@class='rich_media_area_primary']"
                "/h2[@class='rich_media_title']/text()")
    xp_putime = ("//div/em[@class='rich_media_meta rich_media_meta_text']"
                 "/text()")
    xp_author = ("//div/em[@class='rich_media_meta rich_media_meta_text']"
                 "[2]/text()")
    xp_publisher = "//div/a[@id='post-user']/text()"
    title = HandleContent.get_title(html_stream, xpath=xp_title)
    pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
    author = HandleContent.get_author(html_stream, xpath=xp_author)
    publisher = HandleContent.get_author(html_stream, xpath=xp_publisher)
    # Normalize the extracted markup (relative links resolved against the
    # page URL), then keep a plain-text copy in the comment payload.
    content = clear_label(content, root=homepage)
    text = HandleContent.get_BScontext(content, text=True).text
    comment = {'content': clear_space(text)}
    date = new_time()
    crawl_data = {
        'province': data.get('province', ''),
        'city': data.get('city', ''),
        'district': data.get('district', ''),
        'url': homepage,
        'title': title,
        'content': content,
        'pubtime': pubtime,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'source': 'sogou',
        'author': author,
        # Scheduler-supplied publisher wins over the one scraped from the page.
        'publisher': data.get('publisher', publisher),
        'origin_source': u'微信公共账号',
        'type': u'微信',
        'comment': comment,
    }
    if data.get('key'):
        # Keyword-search task: merge search metadata into the record.
        crawl_data.update(data)
        model = SearchArticleModel(crawl_data)
    else:
        model = WeixinArticleModel(crawl_data)
    export(model)
def crawl(self):
    """Scrape one Zhejiang quality-supervision (zjbts) article and export it.

    ``self.key`` is the article URL.  The cleaned article markup, its
    plain-text rendering, and XPath-extracted title/pubtime/author are
    packed into a ZjldArticleModel record.
    """
    page_url = self.key
    stream = _get_url(page_url)
    document = HandleContent.get_BScontext(stream)
    # Article body lives in the 'contaner_nr' div; normalize its markup
    # relative to the page URL.
    body = clear_label(document.find_all('div', 'contaner_nr'), root=page_url)
    plain = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain)}

    title_xp = "//div[@class='contaner']/div[@class='contaner_bt']/text()"
    meta_xp = "//div[@class='contaner']/div[@class='contaner_ly']/text()"
    heading = HandleContent.get_title(stream, xpath=title_xp)
    published = HandleContent.get_pubtime(stream, xpath=meta_xp)
    writer = HandleContent.get_author(stream, xpath=meta_xp)
    stamp = new_time()

    crawl_data = {
        'url': page_url,
        'province': u'浙江',
        'title': heading,
        'content': body,
        'pubtime': published,
        'crtime_int': stamp.get('crtime_int'),
        'crtime': stamp.get('crtime'),
        'source': u'zjbts',
        'publisher': u'浙江质监局',
        'source_type': u'质监局',
        'author': writer,
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Scrape one Zhejiang quality-supervision (zjbts) article and export it.

    ``self.key`` is the article URL.  The cleaned article markup, its
    plain-text rendering, and XPath-extracted title/pubtime/author are
    packed into a ZjldArticleModel record.
    """
    page_url = self.key
    stream = _get_url(page_url)
    document = HandleContent.get_BScontext(stream)
    # Article body lives in the 'contaner_nr' div; normalize its markup
    # relative to the page URL.
    body = clear_label(document.find_all('div', 'contaner_nr'), root=page_url)
    plain = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain)}

    title_xp = "//div[@class='contaner']/div[@class='contaner_bt']/text()"
    meta_xp = "//div[@class='contaner']/div[@class='contaner_ly']/text()"
    heading = HandleContent.get_title(stream, xpath=title_xp)
    published = HandleContent.get_pubtime(stream, xpath=meta_xp)
    writer = HandleContent.get_author(stream, xpath=meta_xp)
    stamp = new_time()

    crawl_data = {
        'url': page_url,
        'province': u'浙江',
        'title': heading,
        'content': body,
        'pubtime': published,
        'crtime_int': stamp.get('crtime_int'),
        'crtime': stamp.get('crtime'),
        'source': u'zjbts',
        'publisher': u'浙江质监局',
        'source_type': u'质监局',
        'author': writer,
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Scrape one Shandong quality-supervision (sdqts) article and export it.

    ``self.key`` is the article URL.  The cleaned article markup, its
    plain-text rendering, and XPath-extracted title/pubtime/author are
    packed into a ZjldArticleModel record.
    """
    page_url = self.key
    stream = _get_url(page_url)
    document = HandleContent.get_BScontext(stream)
    # Article body sits in the 'conzt' table cell; normalize its markup
    # relative to the page URL.
    body = clear_label(document.find_all('td', 'conzt'), root=page_url)
    plain = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain)}

    title_xp = "//tr/td/p[@class='sub_title']/preceding-sibling::h1/text()"
    # pubtime and author share the same metadata cell.
    meta_xp = "//table[@class='normal']/tbody/tr[3]/td/text()"
    heading = HandleContent.get_title(stream, xpath=title_xp)
    published = HandleContent.get_pubtime(stream, xpath=meta_xp)
    writer = HandleContent.get_author(stream, xpath=meta_xp,
                                      xp_text=u'来源:')
    stamp = new_time()

    crawl_data = {
        'url': page_url,
        'province': u'山东',
        'title': heading,
        'content': body,
        'pubtime': published,
        'crtime_int': stamp.get('crtime_int'),
        'crtime': stamp.get('crtime'),
        'source': u'sdqts',
        'publisher': u'山东质监局',
        'source_type': u'质监局',
        'author': writer,
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))
def crawl(self):
    """Scrape one Shandong quality-supervision (sdqts) article and export it.

    ``self.key`` is the article URL.  The cleaned article markup, its
    plain-text rendering, and XPath-extracted title/pubtime/author are
    packed into a ZjldArticleModel record.
    """
    page_url = self.key
    stream = _get_url(page_url)
    document = HandleContent.get_BScontext(stream)
    # Article body sits in the 'conzt' table cell; normalize its markup
    # relative to the page URL.
    body = clear_label(document.find_all('td', 'conzt'), root=page_url)
    plain = HandleContent.get_BScontext(body, text=True).text
    comment = {'content': clear_space(plain)}

    title_xp = "//tr/td/p[@class='sub_title']/preceding-sibling::h1/text()"
    # pubtime and author share the same metadata cell.
    meta_xp = "//table[@class='normal']/tbody/tr[3]/td/text()"
    heading = HandleContent.get_title(stream, xpath=title_xp)
    published = HandleContent.get_pubtime(stream, xpath=meta_xp)
    writer = HandleContent.get_author(stream, xpath=meta_xp,
                                      xp_text=u'来源:')
    stamp = new_time()

    crawl_data = {
        'url': page_url,
        'province': u'山东',
        'title': heading,
        'content': body,
        'pubtime': published,
        'crtime_int': stamp.get('crtime_int'),
        'crtime': stamp.get('crtime'),
        'source': u'sdqts',
        'publisher': u'山东质监局',
        'source_type': u'质监局',
        'author': writer,
        'type': u'文章',
        'comment': comment,
    }
    export(ZjldArticleModel(crawl_data))