def initIframe_contentPage(website_url, content_url):
    """
    @summary: Initialize the subscribed content page (inject the required JS).
    :param website_url: URL of the website
    :param content_url: URL of the content page
    :return: page source
    """
    script_code = """
    <script src="/static/Subpage/js/common.js"></script>
    <script src="/static/Subpage/js/filterXpath.js"></script>
    <script src="/static/Subpage/js/contentDiscriminate.js"></script>
    """
    # Fetch the page through headless Chrome or plain urllib, depending on config.
    if global_Chrome:
        html = Spider().chromedriver(content_url)
    else:
        html = Spider().urllib(content_url)
    if not html:
        return "404"
    # Inject a no-referrer policy and the subscription stylesheet right after <head>.
    p = re.compile(r'(<head>|<head .*?>)')
    html = p.subn(
        r"\1%s" % ('<meta name="referrer" content="never">'
                   '<link rel="stylesheet" type="text/css" href="/static/Subpage/css/sub.css">'),
        html)[0]
    # Append the scripts and a hidden element carrying the subscription info before </body>.
    p = re.compile(r'(</body>)')
    extract_html = "%s<input id='sub_info' type='hidden' website_url='%s' content_url='%s'>" % (
        script_code, website_url, content_url)
    html = p.subn(r"%s\1" % extract_html, html)[0]
    return html

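
# A minimal usage sketch: serving the instrumented content page over HTTP.
# Assumes a Flask app; the route path and query-parameter names are hypothetical,
# not taken from the project.
from flask import Flask, request

app = Flask(__name__)

@app.route('/subpage/content')
def content_page():
    website_url = request.args.get('website_url', '')
    content_url = request.args.get('content_url', '')
    # Returns the instrumented page source, or the literal string "404" on failure.
    return initIframe_contentPage(website_url, content_url)
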
def get(self):
    args = parser.parse_args()
    if 'page_now' not in args:
        return err_res(code=0, msg='no page_now')
    if 'page_size' not in args:
        return err_res(code=0, msg='no page_size')
    page_now = args['page_now']
    page_size = args['page_size']
    if page_now * page_size > 250:
        # Only the top 250 movies are served ("无更多电影" = "no more movies").
        return err_res(code=0, msg='无更多电影')
    # Serve from the local cache when the requested page is already stored.
    cached_movies = session.query(Movie).filter(
        Movie.id.between((page_now - 1) * page_size + 1,
                         page_now * page_size)).all()
    if len(cached_movies):
        return success_res(code=1000, data=cached_movies, msg='success')
    try:
        # Cache miss: crawl the source site, persist the results, then re-query.
        spider = Spider()
        movies = spider.get_movies(const.BASE_URL)
        for movie in movies:
            create_movie(movie)
        cached_movies = session.query(Movie).filter(
            Movie.id.between((page_now - 1) * page_size + 1,
                             page_now * page_size)).all()
        return success_res(code=1000, data=cached_movies, msg='success')
    except Exception:
        return err_res(code=0, msg='err')

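
# The response helpers used above are not shown in this file; a plausible shape,
# assuming they wrap payloads in a flat JSON envelope (field names are guesses):
def success_res(code, data, msg):
    # data is assumed to be JSON-serializable here; SQLAlchemy models would need
    # an explicit to_dict()/marshalling step in practice.
    return {'code': code, 'data': data, 'msg': msg}


def err_res(code, msg):
    return {'code': code, 'msg': msg}
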
def initWebsite(website_type, website_url, content_url=''):
    """
    @summary: Initialize the subscribed website page (inject the required JS).
    :param website_type: website type (0: ordinary website URL; 1: official-account name)
    :param website_url: website URL (or official-account name when website_type is 1)
    :param content_url: content URL
    :return: dict with the init result and the instrumented page source
    """
    if int(website_type) == 0 and not urlLegal(website_url):
        return {"res": InitResType.inlegalUrl, "code": ""}
    oa_name = ""
    if int(website_type) == 1:
        # Resolve the official-account name to its article-list URL.
        oa_name = website_url
        website_url = getOAUrl(website_url)
        if website_url is None:
            return {"res": InitResType.OAnotfound, "code": ""}
    script_code = """
    <script src="/static/Subpage/js/common.js"></script>
    <script src="/static/Subpage/js/filterXpath.js"></script>
    <script src="/static/Subpage/js/websiteDiscriminate.js"></script>
    """
    html = Spider().chromedriver(website_url)
    # Inject a no-referrer policy and the subscription stylesheet right after <head>.
    head = re.findall("(<head.*?>)", html)
    if len(head) > 0:
        head = head[0]
        html = html.replace(
            head,
            "%s%s" % (head,
                      '<meta name="referrer" content="never">'
                      '<link rel="stylesheet" type="text/css" href="/static/Subpage/css/sub.css">'))
    # Append the scripts and a hidden element carrying the subscription info before </body>.
    html = html.replace(
        "</body>",
        "%s<input id='sub_info' type='hidden' detail='%s' website_url='%s' content_url='%s'></body>" % (
            script_code, oa_name, website_url, content_url))
    return {"res": InitResType.success, "code": html}

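
# A caller-side sketch, assuming the consumer dispatches on the returned "res"
# field; the example URL is a placeholder.
result = initWebsite(website_type="0", website_url="https://example.com/news")
if result["res"] == InitResType.success:
    page_html = result["code"]   # instrumented page source, ready to render
elif result["res"] == InitResType.inlegalUrl:
    pass                         # reject: not a valid website URL
elif result["res"] == InitResType.OAnotfound:
    pass                         # reject: official-account name could not be resolved
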
def get_bookinfo(self, html):
    doc = pq(html)
    # Remove unwanted elements configured for the book's info page.
    if self.settings['page']['rm_eles']:
        for cur in self.settings['page']['rm_eles']:
            doc(cur).remove()
    introduction = doc(self.settings['page']['introduction']).html()
    self.bookname = doc(self.settings['page']['name']).text().replace(
        u'《', '').replace(u'》', '').strip()
    self.creator = doc(self.settings['page']['creator']).text().strip()
    if not self.bookname:
        raise Exception('抓取网页失败!')  # failed to fetch the page
    print("《%s》开始抓取" % self.bookname)  # start crawling this book
    self.create_path()
    # Keep only non-empty introduction lines, re-joined with <br/>.
    _list = introduction.split('<br/>')
    for item in _list:
        if not item:
            continue
        item = item.strip()
        if not item:
            continue
        self.introduction += item + '<br/>'
    cover = doc(self.settings['page']['cover']).attr('src')
    Spider.download_image(cover, os.path.join(self.path, self.bookname),
                          'cover')
    # Collect the chapter list; stop at the first entry missing a title or link.
    list_chapter = doc(self.settings['page']['chapters']).items()
    index = 0
    for chapter in list_chapter:
        title = chapter('a').text()
        href = chapter('a').attr('href')
        if not title or not href:
            break  # continue
        index = index + 1
        self.chapters.append({
            'index': index,
            'title': title,
            'href': href
        })

def getOAUrl(name):
    """
    @summary: Get the link to an official account's article page.
    :param name: official-account name
    :return: URL of the official-account article page, or None
    """
    try:
        # Search aiweibang for the account and build its article-list URL from the first hit.
        oa_json = Spider().requests("http://top.aiweibang.com/user/getsearch",
                                    "POST", {'Kw': name})
        oa_data = json.loads(oa_json)["data"]
        oa_id = oa_data['data'][0]['Id']
        url = "http://top.aiweibang.com/article/%s" % oa_id
        return url
    except Exception:
        return None

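
# Spider.requests is not shown here; a plausible minimal implementation, assuming
# it is a thin wrapper over the requests library that returns the response text
# (hypothetical -- the real method may differ):
import requests

class _SpiderSketch:
    def requests(self, url, method, data=None):
        # POST with a form body, GET with query params; for illustration only.
        if method.upper() == "POST":
            resp = requests.post(url, data=data, timeout=10)
        else:
            resp = requests.get(url, params=data, timeout=10)
        return resp.text
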
def get_chapter_content(self, index, url):
    _url = url
    try:
        bookdirpath = os.path.join(self.path, self.bookname)
        file_name = '%05d' % (index + 1)
        file_name = 'chapter_' + file_name + '.xhtml'
        # Skip chapters that have already been downloaded.
        folder = os.path.exists(os.path.join(bookdirpath, file_name))
        if folder:
            self.mutex.acquire()
            self.num += 1
            percent = self.num * 100.0 / len(self.chapters)
            _str = '%s [%.2f%%] (%d/%d) %d 已存在!' % (
                self.bookname, percent, self.num, len(self.chapters),
                index)  # "已存在" = already exists
            # _str = '%s [%.2f%%] %s 已存在!' % (self.bookname, percent, self.chapters[index]["title"])
            print('\r%s' % _str, end='')
            sys.stdout.flush()
            self.mutex.release()
            return
        if self.settings['page']['link_concat']:
            # Chapter links are relative; prefix them with the site's home URL.
            _url = self.settings['home'] + url
        html = Spider.get_content(_url)
        if self.settings['chapter']['gzip']:
            html = zlib.decompress(html, zlib.MAX_WBITS | 16)
        html = html.decode(self.settings['decode'], 'ignore')
    except Exception as e:
        # On any failure, log the URL and error, wait a second, then retry.
        self.mutex.acquire()
        # print '\r%s %s ' % (_url, e.message),
        print('%s %s' % (_url, str(e)))
        sys.stdout.flush()
        self.mutex.release()
        time.sleep(1)
        self.get_chapter_content(index, url)
        return
    # Strip the xhtml namespace so pyquery can use plain CSS selectors.
    html = html.replace('xmlns="http://www.w3.org/1999/xhtml" /', '').replace(
        'xmlns="http://www.w3.org/1999/xhtml"', '')
    doc = pq(html)
    if self.settings['chapter']['rm_eles']:
        for cur in self.settings['chapter']['rm_eles']:
            doc(cur).remove()
    self.create_chapter(index, doc(self.settings['chapter']['content']).html())

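
# The scraper is driven by a settings dict; this sketch only reflects the keys
# read by get_bookinfo/get_chapter_content above. The selector values and URL
# are illustrative placeholders, not the project's real configuration.
settings = {
    'home': 'https://example-novel-site.com',  # prepended when link_concat is set
    'decode': 'utf-8',                         # charset used to decode chapter pages
    'page': {
        'rm_eles': ['script', '.ad'],          # elements stripped from the info page
        'introduction': '#intro',
        'name': '#bookname',
        'creator': '#author',
        'cover': '#cover img',
        'chapters': '#chapterlist li',
        'link_concat': True,                   # chapter hrefs are relative to 'home'
    },
    'chapter': {
        'gzip': False,                         # set when chapter bodies arrive gzip-compressed
        'rm_eles': ['script'],
        'content': '#content',
    },
}
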
def get_html(self):
    # Fetch the weather page, save a local copy, then parse it with pyquery.
    html = Spider.get_content(self.url).decode('utf-8', 'ignore')
    with open('file/weather.html', mode='w', encoding='utf-8') as f:
        f.write(html)
    doc = pq(html)

from selenium import webdriver

from comman.expert_csv import ExpertCSV
from common.spider import Spider
from model.expert import Expert

url_list = ['https://blog.csdn.net/weixin_43570367?t=1']
# CSDN lists at most 40 article titles per page.
csdn_one_page_title_count = 40


def get_writer(url):
    # Extract the author id from the blog URL (last path segment, query stripped).
    return url.split('/')[-1].split('?')[0]


if __name__ == '__main__':
    spider = Spider(
        webdriver.Chrome(executable_path='../asset/chromedriver11'))
    driver = spider.driver()
    count = 1
    csv = ExpertCSV()
    try:
        for url in url_list:
            driver.get(url)
            while True:
                for i in range(1, csdn_one_page_title_count + 1):
                    # model object
                    expert = Expert()
                    # Grab the title: .article-item-box:nth-child(1) > h4 > a
                    title = spider.find_ele_by_css(
                        f'.article-item-box:nth-child({i}) > h4 > a')
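
# spider.find_ele_by_css is a Spider wrapper not shown here; a plausible minimal
# version, assuming it delegates to the WebDriver returned by spider.driver()
# (hypothetical -- the real implementation lives in common/spider.py):
from selenium.webdriver.common.by import By

def find_ele_by_css(driver, selector):
    # Return the first element matching the CSS selector, or None if absent.
    try:
        return driver.find_element(By.CSS_SELECTOR, selector)
    except Exception:
        return None
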