def __init__(self):
    """Initialise the parent class and build the crawler's collaborators.

    The news-saving endpoint is stored on the instance and also handed to
    the ``TsData`` object.
    """
    super(SpiderMain, self).__init__()
    save_news_endpoint = 'http://127.0.0.1:8000/api/lol/save_news'
    self.ts_url = save_news_endpoint
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.ts = ts_data.TsData(save_news_endpoint)
def __init__(self):
    """Create one instance of every helper object this class uses."""
    # URL bookkeeping and page download.
    self.url_manager = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    # Google fetcher and page-rank calculator.
    self.google_fetcher = html_google_fetcher.HtmlGoogleParser()
    self.pr_calculator = page_rank_util.PRCalculator()
    # Parsing and output.
    self.url_parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self):
    """Initialise the parent class, then create the four crawler parts.

    The parts are a ``UrlManager``, an ``HtmlDownloader``, an
    ``HtmlParser`` and an ``HtmlOutputer``.
    """
    super(SpiderMain, self).__init__()
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self):
    """Instantiate the objects this class refers to."""
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownLoader()
    self.output = html_output.HtmlOutPut()
    self.parser = html_parser.HtmlParser()
    # Starts out empty; populated elsewhere.
    self.errorList = []
def __init__(self):
    """Set the crawl limit and create the helper objects."""
    # Maximum number of data items to crawl.
    self.maxcount = 1000
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    # PDF and picture download helpers.
    self.pdf = pdf_download.PdfDownload()
    self.pic = download.PicDowload()
def __init__(self, path=""):
    """Create the crawler's helper objects.

    Args:
        path: Value forwarded to ``UrlManager`` and stored on the
            instance. An empty value (the default) marks the instance
            as new.
    """
    self.urls = url_manager.UrlManager(path)
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
    self.root_url = "http://baike.baidu.com/view/21087.htm"
    self.path = path
    # A truthiness test replaces ``True if len(path)==0 else False``: same
    # result for strings, and it no longer raises when ``path`` is None.
    self.new = not path
def __init__(self):
    """Create the crawler parts and set the target keywords."""
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
    # An earlier keyword set was: 'B-box', 'beatbox', 'bbox', 'Beatbox'.
    self.targetKeywords = ['三峡']
def __init__(self):
    """Create the crawler parts, a session, and fetch the IP pools."""
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownLoader()
    self.parser = html_parser.HTMLParser()
    self.ip_pool = ip_pool.IPPool()
    session = Session()
    self.session = session
    # The IP pools are obtained from the pool object using the new session.
    self.ip_pools = self.ip_pool.get_ip_pools(session)
def __init__(self):
    """Create the URL manager, page downloader and page parser."""
    # URL manager.
    self.urls = url_manager.UrlManager()
    # Web page downloader (class name spelled as in the original call).
    self.downloader = html_downloader.HtmlDowmloader()
    # Web page parser.
    self.parser = html_parser.HtmlParser()
def __init__(self):
    """Read ``config.conf``, prepare the database and build the crawler parts.

    The ``start`` section of the config file supplies ``project_id``,
    ``root_url`` and ``number``. The content table name is the third
    column of the first ``project`` row followed by ``_content``.
    """
    # Load the configuration.
    config = ConfigParser.ConfigParser()
    config.read("config.conf")
    self.projectid = '%s' % config.get("start", "project_id")
    self.root_url = '%s' % config.get("start", "root_url")
    self.number = '%s' % config.get("start", "number")

    # Open the database connection and initialise the project's tables.
    database = mysqldbhand()
    database.dbconnect()
    database.init_tables(self.projectid)
    project_rows = database.FindAll(
        'project', '*', where='id= %s' % (self.projectid))
    field_rows = database.FindAll(
        'project_field', '*', where='pid= %s' % (self.projectid))
    self.tablename = project_rows[0][2] + '_content'

    # URL manager.
    self.urls = url_manager.UrlManager(self.tablename)
    # Downloader.
    self.downloader = html_downloader.HtmlDownloader()
    # Page parser.
    self.parse = html_parser.HtmlParser(
        self.tablename, project_rows, field_rows)
    # Component that stores results in the database.
    self.outputer = html_outputer.HtmlOutputer(self.tablename)
def __init__(self):
    """Initialise by creating instances of the four modules."""
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self):
    """Create the crawler parts and the list of download names."""
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
    self.download_list = [
        'tv',
        'korean_tv',
        'china_tv',
        'hongkong_tv',
        'cartoon',
        'jilu',
    ]
def __init__(self):
    """Create the crawler parts and the list of download names.

    The downloader here is an ``html_request.HtmlRequests`` instance.
    """
    self.urls = url_manager.UrlManager()
    self.downloader = html_request.HtmlRequests()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
    self.download_list = [
        'gq_movie',
        'hao6_mj',
        'china_tv',
        'hongkong_tv',
        'cartoon',
        'jilu',
    ]
def __init__(self):
    """Create the crawler parts and the threading-related state."""
    self.urlManager = url_manager.UrlManager()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_downloader.HtmlDownloader()
    self.collector = data_collector.DataCollector()
    # Thread lock.
    self.lock = threading.Lock()
    # Thread-local object so that each thread has its own data.
    self.local_crawed = threading.local()
    # Global count of crawled pages.
    self.count = 0
def __init__(self):
    """Create the crawler parts and the request headers dictionary."""
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.output = html_output.HtmlOutput()
    browser_signature = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.100 Safari/537.36"
    # NOTE(review): the key is spelled "User_Agent"; the standard HTTP
    # header name is "User-Agent". Confirm how this dict is consumed
    # before changing the key.
    self.headers = {"User_Agent": browser_signature}
def __init__(self, root_url):
    """Store the root URL and create the crawler parts.

    Args:
        root_url: Value stored unchanged as ``self.url``.
    """
    self.url = root_url
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_output.HtmlOutputer()
def __init__(self):
    """Create the objects used by ``craw()``.

    These are the URL manager, the page downloader, the page parser and
    the outputer.
    """
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self, isuse, connection):
    """Store the connection and create the crawler parts.

    Args:
        isuse: Forwarded to ``UrlManager`` as its second argument.
        connection: Stored on the instance and forwarded to
            ``UrlManager`` as its first argument.
    """
    # NOTE(review): ``config`` is not a parameter of this method; it is
    # presumably a module-level name. Confirm it is in scope.
    self.config = config
    self.connection = connection
    self.urls = url_manager.UrlManager(connection, isuse)
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParse()
    self.outputer = html_outputer.HtmlOutputer()
    self.imgdownloader = img_downloader.ImgDownloader()
def __init__(self):
    """Create the URL manager, downloader, parser and ``MakeDir`` helper."""
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownload()
    self.parser = html_parser.HtmlParser()
    self.mkdirs = test_mkdir.MakeDir()
def __init__(self):
    """Create the four crawler components."""
    # URL manager.
    self.urls = url_manager.UrlManager()
    # Downloader.
    self.downloader = html_downloader.HtmlDownloader()
    # Parser.
    self.parser = html_parser.HtmlParse()
    # Outputer.
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self):
    """Create the URL manager, downloader, parser and file outputer."""
    # URL manager.
    self.urls = url_manager.UrlManager()
    # Web page downloader (class name spelled as in the original call).
    self.downloader = html_downloader.HtmlDowmloader()
    # Web page parser.
    self.parser = html_parser.HtmlParser()
    # File read/write tool.
    self.outputer = file_outputer.Outputer()
def __init__(self):
    """Create the URL manager and the HTML downloader, parser and outputer."""
    # URL manager.
    self.urls = url_manager.UrlManager()
    # HTML downloader.
    self.downloader = html_downloader.HtmlDownloader()
    # HTML parser.
    self.parser = html_parser.HtmlParser()
    # HTML outputer.
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self):
    """Initialise the crawler's manager, outputer, parser and downloader."""
    # The crawler's URL manager.
    self.manager = url_manager.UrlManager()
    # Outputer.
    self.outputer = html_outputer.Outputer()
    # Parser.
    self.parser = html_parser.Parser()
    # Downloader.
    self.downloader = html_downloader.Downloader()
def __init__(self):
    """Create the manager, downloader, parser and outputer objects."""
    # ``urls`` acts as the manager.
    self.urls = url_manager.UrlManager()
    # ``downloader`` acts as the downloader.
    self.downloader = html_downloader.HtmlDownloader()
    # ``parser`` acts as the parser.
    self.parser = html_parser.HtmlParser()
    # ``outputer`` writes the processed data out to an HTML page.
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self):
    """Create the URL manager, downloader, parser and outputer.

    The notes on each attribute are carried over from the original
    author's comments; the described methods are defined in the
    respective modules, not here.
    """
    # URL manager: manages two sets holding the already-crawled URLs and
    # the URLs waiting to be crawled. Provides four methods:
    # has_new_url, get_new_url, add_new_url, add_new_urls.
    self.urls = url_manager.UrlManager()
    # Downloader: provides one method, download(url), which returns a
    # string for the given URL.
    self.downloader = html_downloader.HtmlDownloader()
    # HTML page parser: provides one method, parse(new_url, html_cont),
    # which returns the URLs and data parsed from the page.
    self.parser = html_parser.HtmlParser()
    # Outputer.
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self):
    """Create the URL manager, downloader, parser and output device."""
    # Manager.
    self.urls = url_manager.UrlManager()
    # Downloader.
    self.downloader = html_downloader.HtmlDownloader()
    # Parser.
    self.parser = html_parser.HtmlParser()
    # Output device.
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self):
    """Obtain the URL manager, downloader, parser and data outputer."""
    # URL manager.
    self.urls = url_manager.UrlManager()
    # Web page downloader.
    self.downloader = html_downloader.HtmlDownloader()
    # Web page parser.
    self.parser = html_parser.HtmlParser()
    # Data outputer.
    self.output = html_outputer.HtmlOutput()
def __init__(self):
    """Initialise the parent class and create the helper objects.

    The PDF dealer is constructed with the same ``LocalDataManager``
    instance that is stored on ``self.localDataManager``.
    """
    super(SpiderMain, self).__init__()
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
    self.uploader = data_uploader.DataUploader()
    local_data = local_data_manager.LocalDataManager()
    self.localDataManager = local_data
    self.pdfDealer = fang_main.PDF_Data_Dealer(local_data)
def __init__(self):
    """Create the crawling, analysis, plotting and speech helper objects."""
    # URL manager.
    self.urls = url_manager.UrlManager()
    # Web page downloader. The attribute name ``downloder`` is kept as
    # spelled, since it is part of this object's interface.
    self.downloder = html_downloader.HtmlDownloader()
    # 51JOB web page parser.
    self.parser = html_parser.HtmlParser()
    # Data analysis.
    self.dataanalyse = data_analyse.DataAnalyse()
    # Data visualisation.
    self.datapicture = data_picture.DataPicture()
    # Data visualisation (pie chart).
    self.datapicturepie = data_picture_pie.ShowJodSalary()
    # Voice broadcast.
    self.yy = yuyin.YuYin()
    # A voice-cache cleaner (del_huancun.DelHuancun, attribute ``delect``)
    # was disabled in the original and is not created here.
    # Random request header.
    self.user_agent = user_agent.Random_user_agent()
def __init__(self):
    """Initialise each of the crawler's modules."""
    # URL manager object.
    self.urls = url_manager.UrlManager()
    # HTML downloader.
    self.downloader = html_downloader.HtmlDownloader()
    # HTML parser.
    self.parser = html_parser.HtmlParser()
    # HTML outputer.
    self.outputer = html_outputer.HtmlOutputer()