def __init__(self):
    """Wire up the crawler's collaborators and start an empty error log."""
    self.urls = url_manager.UrlManager()                # URL bookkeeping
    self.downloader = html_downloader.HtmlDownLoader()  # page fetcher
    self.parser = html_parser.HtmlParser()              # page parser
    self.output = html_output.HtmlOutPut()              # result writer
    self.errorList = []                                 # failures collected during the crawl
def __init__(self):
    """Set up crawler components plus a proxy pool fetched over one shared session."""
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownLoader()
    self.parser = html_parser.HTMLParser()
    # One session is created first and then reused to populate the proxy pool.
    self.session = Session()
    self.ip_pool = ip_pool.IPPool()
    self.ip_pools = self.ip_pool.get_ip_pools(self.session)
def get_goals(top_url):
    """Fetch the Baidu buzz board (百度风云榜) page and return the celebrity names.

    Parameters
    ----------
    top_url : str
        URL of the board page, e.g. 'http://top.baidu.com/buzz?b=3'.

    Returns
    -------
    list[str]
        Text of every board-entry link; empty list when the download fails.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.100 Safari/537.36"
    }
    downloader = html_downloader.HtmlDownLoader()
    try:
        html_content = downloader.download(top_url, retry_count=2, headers=headers)
    except Exception:
        # BUG FIX: the original bare `except:` printed a message and then fell
        # through, guaranteeing a NameError on `html_content` below.  Report
        # and return an empty result instead.
        print('get top list failed')
        return []
    # Detect the charset declared in the page's <meta>; fall back to UTF-8.
    html_encode = 'utf-8'
    encodem = re.search(r'.+?charset=(\w+)"', html_content.decode(html_encode, errors='ignore'))
    if encodem is not None:
        html_encode = encodem.group(1)
    soup = BeautifulSoup(html_content.decode(html_encode, errors='ignore'), "html.parser")
    # Board entries are anchors with class "list-title" and an http(s) href.
    links = soup.find_all("a", {'class': "list-title"}, href=re.compile('http.+?'))
    linksname = [link.get_text() for link in links]
    return linksname
def __init__(self, seed_url, user_agent='', managerQueue=None):
    """Build a crawler that honours the seed site's robots.txt.

    seed_url     -- site whose robots.txt governs crawling
    user_agent   -- UA string sent with every request
    managerQueue -- optional queue backing the FIFO URL manager
    """
    self.managerQueue = managerQueue
    self.urls = url_manager.UrlManagerFIFO(managerQueue)
    self.downloader = html_downloader.HtmlDownLoader()
    self.parser = html_parser.HtmlParser()
    self.out_put = jpg_output.jpgOutPut()
    # Parsed robots.txt rules for the seed site.
    self.rp = get_robots(seed_url)
    self.headers = {"User-Agent": user_agent}
def __init__(self):
    """Create the three crawler collaborators."""
    # NOTE(review): the "paser" spelling mirrors the project's html_paser
    # module — presumably intentional; confirm before renaming.
    self.paser = html_paser.HtmlPaser()
    self.downloader = html_downloader.HtmlDownLoader()
    self.urls = url_manager.UrlManager()
def __init__(self):
    """Assemble URL manager, downloader, parser and output writer."""
    self.out_put = html_output.HtmlOutput()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_downloader.HtmlDownLoader()
    self.urls = url_manager.UrlManager()
def __init__(self):
    """Assemble the crawl pipeline plus a MySQL connection for persistence."""
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownLoader()
    self.parser = html_parser.HtmlParser()
    self.out_put = html_output.HtmlOutput()
    # Results are additionally written to MySQL through this connector.
    self.conn_mysql = connectmysql.ConnectMysql()
#coding=utf-8 #设置编码 import sys reload(sys) sys.setdefaultencoding('utf-8') #获得系统编码格式 type = sys.getfilesystemencoding() #引入相关模块 import url_manager, html_downloader, html_parser, html_outputer #实例化相关模块 urls = url_manager.UrlManager() downloader = html_downloader.HtmlDownLoader() parser = html_parser.htmlParser() outputer = html_outputer.htmlOutputer() class SpiderMain(object): def __init__(self): #初始化数据 self.urls = urls self.downloader = downloader self.parser = parser self.outputer = outputer def craw(self, root_url): #爬虫调度函数 count = 1 self.urls.add_new_url(root_url) #将主地址加入到新的地址库里面 while self.urls.has_new_url(): #判断是否有新的url地址 try: new_url = self.urls.get_new_url() #获取新的地址 print 'craw %d : %s' % (count, new_url)
def __init__(self):
    """Instantiate the four crawler components."""
    # NOTE(review): attribute is "parse" (not "parser") — kept as-is because
    # other methods of this class may read it by that name.
    self.parse = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
    self.downloader = html_downloader.HtmlDownLoader()
    self.urls = url_manager.UrlManager()  # URL manager
def __init__(self):
    """Create the crawler's collaborators."""
    self.urls = url_manager.UrlManager()                # URL manager
    self.downloader = html_downloader.HtmlDownLoader()  # downloader
    self.parser = html_parser.HtmlParser()              # parser
    self.outputer = html_outputer.HtmlOutputer()        # output writer
def __init__(self):
    """Assemble manager, downloader, parser and output components."""
    self.urlManager = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownLoader()
    # NOTE(review): class names "HtmpParser" / "HtmlOutpter" look like typos
    # but are kept verbatim — presumably they match the project modules.
    self.parser = html_parser.HtmpParser()
    self.output = html_outputer.HtmlOutpter()
class SpiderMain(object):
    """Crawler facade wiring together the four standard components."""

    def __init__(self):
        # BUG FIX: the module was referenced as "url_manmager" (a NameError at
        # runtime); every sibling spider in this codebase uses "url_manager".
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownLoader()
        self.parser = html_parser.HtmlParser()
        # NOTE(review): "HmtlOutputer" looks like a transposition of the
        # "HtmlOutputer" class the sibling spiders use — kept verbatim;
        # TODO confirm against the html_outputer module.
        self.outputer = html_outputer.HmtlOutputer()
# coding:utf8