def __init__(self, parseClass):
    """Wire up the crawler's collaborators and choose a download delay.

    parseClass: an already-constructed parser object; its class name
    (via str()) selects the per-site politeness delay below.
    """
    self.urls = url_manager.UrlManager()    # page URLs to crawl
    self.comms = url_manager.UrlManager()   # second URL pool — confirm its role with callers
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = parseClass
    self.outputer = html_outputer.HtmlOutputer()
    self.data_stat = data_stat.DataStat()
    self.count = 1
    self.total = 0
    self.quantity_of_raw_datas = 0
    # self.hp = hpy()
    self.quantity_of_dupli = 0
    self.quantity_of_datas = 0
    # Stop after this many consecutive pages yield no data.
    self.nodata = 0
    self.nodata_pages_stop = 5
    # Stop after this many consecutive 404/forbidden pages.
    self.forbidden = 0
    self.forbidden_pages_stop = 2
    # Per-parser delay (seconds). First matching marker wins, exactly
    # mirroring the original if/elif chain's evaluation order.
    parser_name = str(parseClass)
    for marker, seconds in (
        ('AjkParser', 3),
        ('GjParser', 3),
        ('LjParser', 3),
        ('WBParser', 0),
        ('LejuParser', 3),
    ):
        if marker in parser_name:
            self.delay = seconds
            break
    else:
        self.delay = 0
def __init__(self):
    """Create the spider's collaborators and empty result buffers."""
    # NOTE(review): module is 'url_manage' here while sibling spiders use
    # 'url_manager' — confirm the module name matches the project.
    self.urls = url_manage.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
    # Accumulators presumably filled during crawling — verify against callers.
    self.titles = []
    self.pictures = []
    self.links = []
def get_pic_url_list(self, key_words, pages):
    """Collect thumbnail URLs from Baidu image search for each keyword.

    key_words: iterable of search terms.
    pages: iterable of per-keyword page counts, zipped with key_words
        (extra elements of the longer iterable are ignored).
    Returns a pair (key_word_list, urls_list) where urls_list[i] holds the
    'thumbURL' values gathered for key_word_list[i].
    """
    url = "http://image.baidu.com/search/acjson"
    downloader = html_downloader.HtmlDownloader()
    urls_list = []
    key_word_list = []
    for key_word, page in zip(key_words, pages):
        urls = []
        # Baidu pages results 30 at a time via the 'pn' offset.
        # NOTE(review): range starts at 30, so offset 0 (the first 30
        # results) is never requested — confirm this is intentional.
        for offset in range(30, 30 * page + 30, 30):
            param = {
                'tn': 'resultjson_com', 'ipn': 'rj', 'ct': 201326592,
                'is': '', 'fp': 'result', 'queryWord': key_word,
                'cl': 2, 'lm': -1, 'ie': 'utf-8', 'oe': 'utf-8',
                'adpicid': '', 'st': -1, 'z': '', 'ic': 0,
                'word': key_word, 's': '', 'se': '', 'tab': '',
                'width': '', 'height': '', 'face': 0, 'istype': 2,
                'qc': '', 'nc': 1, 'fr': '', 'pn': offset, 'rn': 30,
                'gsm': '1e', '1488942260214': ''
            }
            resp = downloader.get_with_params(url, param)
            # 'data' may be absent or null in an error response; guard with
            # `or []` so we don't raise TypeError iterating over None.
            for json_data in json.loads(resp).get('data') or []:
                thumb = json_data.get('thumbURL')
                if thumb is not None:
                    urls.append(thumb)
        urls_list.append(urls)
        key_word_list.append(key_word)
    return key_word_list, urls_list
def __init__(self, root_url, proxy_pool, threads):
    """Assemble the scan pipeline rooted at root_url.

    root_url: starting URL; its hostname scopes the HTML parser.
    proxy_pool: shared proxy source handed to every network-facing component.
    threads: size of the worker thread pool.
    """
    # NOTE(review): class is spelled 'UrlManger' (sic) — presumably matches
    # the project's own declaration; verify before renaming anywhere.
    self.manager = url_manager.UrlManger()
    self.downloader = html_downloader.HtmlDownloader(proxy_pool)
    # Restrict parsing to links on the root host.
    self.parser = html_parser.HtmlParser(
        urlparse.urlparse(root_url).hostname)
    self.outputer = html_outputer.HtmlOutputer()
    # self.proxy_pool = proxy_pool
    self.dir = dir_scan.DirScan(proxy_pool,
                                self.manager.set_protocol(root_url))
    self.cms = cms_scan.CMSScan(proxy_pool)
    self.sqli = sqli_scan.SqliScan(proxy_pool)
    self.xss = xss_scan.XSSScan(proxy_pool)
    self.pool = ThreadPool(threads)
def __init__(self, proxy_pool=None, url=None):
    """Hold a downloader (optionally proxied) and an optional target URL."""
    self.downloader = html_downloader.HtmlDownloader(proxy_pool)
    self.url = url
#coding:utf-8 from spider import html_downloader, AJK_parser, mytools # from bs4 import BeautifulSoup from lxml import etree import re import sys reload(sys) sys.setdefaultencoding('utf-8') downloader = html_downloader.HtmlDownloader() parser = AJK_parser.AjkParser() url = 'http://xm.58.com/ershoufang/pn1/' html_cont = downloader.download(url, False, True) # 使用lxml解析 sel = etree.HTML(html_cont.encode('utf-8')) # 解析页码 pages = sel.xpath('//div[@class="pager"]/a/@href') # for page in pages: # print(page) # soup = BeautifulSoup(html_cont,'lxml',from_encoding='utf-8') titles = sel.xpath('//h2[@class="title"]/a') prices = sel.xpath('//p[@class="sum"]/b') houses = sel.xpath('//div[@class="list-info"]') i = 1 for title, price, house in zip(titles, prices, houses): each_data = {
def __init__(self):
    """Create the picture-spider pipeline: download, parse, output, OCR."""
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = blog_sina_parser_by_xpath.PicParserByXpatch()
    self.outputer = spider_outputer.PicOutputer()
    self.ocrer = baidu_aip_ocr.PicOcrer()
def __init__(self):
    """Create the movie-spider pipeline: download, parse, output."""
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = movie_parser_by_xpath.MovieParserByXpatch()
    self.outputer = movie_outputer.MovieOutputer()
def __init__(self):
    """Create the crawler components, using the XPath-based parser."""
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    # self.parser = html_parser.HtmlParser()
    self.parser = html_parser_by_xpath.HtmlParserByXpath()
    self.outputer = spider_outputer.HtmlOutputer()
def __init__(self):
    """Create the Baidu picture-spider pipeline: download, parse, output."""
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = baidu_pic_parser.PicParserByXpatch()
    self.outputer = spider_outputer.PicOutputer()
def __init__(self): self.urls=url_manager.UrlManger() #初始化url管理器 self.downloader=html_downloader.HtmlDownloader()#初始化url下载器 self.parser=html_parser.HtmlParser()#初始化html解析器 self.outputer=html_output.HtmlOutputer()#初始化html输出器
def __init__(self):  # initialization
    """Create one instance of each pipeline component."""
    self.urls = url_manager.UrlManager()  # create the URL-manager instance
    self.downloader = html_downloader.HtmlDownloader()  # create the downloader instance
    self.parser = html_parser.HtmlParser()  # create the parser instance
    self.outputer = html_outputer.HtmlOutputer()  # create the data-output instance
def __init__(self):  # initialize each component object
    """Create the image-crawler components."""
    # NOTE: attribute is 'url' (singular) here, unlike the 'urls' used by
    # sibling spiders — callers must reference self.url.
    self.url = url_manager.UrlManager()
    self.parser = html_parser.HtmlParser()
    self.downloader = html_downloader.HtmlDownloader()
    self.outputer = img_outputer.ImgOutputer()
def __init__(self):
    """Create crawler components plus a MySQL connection helper."""
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
    # NOTE(review): 'Conenct' looks like a typo for 'Connect', but it must
    # match the project's declared class name — do not "fix" it here alone.
    self.connect = connect_mysql.Conenct()
def __init__(self):
    """Create the standard crawler pipeline: URL manager, downloader, parser, outputer."""
    self.urls = url_manager.UrlManager()
    self.downloader = html_downloader.HtmlDownloader()
    self.parser = html_parser.HtmlParser()
    self.outputer = html_outputer.HtmlOutputer()
def __init__(self,proxy_pool):
    """Hold a proxied downloader and a URL manager.

    proxy_pool: proxy source forwarded to the downloader.
    """
    self.downloader = html_downloader.HtmlDownloader(proxy_pool)
    # NOTE(review): class is spelled 'UrlManger' (sic) — presumably matches
    # the project's declaration; verify.
    self.manager = url_manager.UrlManger()
def __init__(self): self.urls = url_manager.UrlManager( ) #类的方法也是属性,函数名是一个指向函数的变量,函数赋值给变量,那么变量就指向函数 self.downloader = html_downloader.HtmlDownloader() self.parser = html_parser.HtmlParser() self.outputer = html_outputer.HtmlOutputer()