def developParser():
    """Example of developing a new site-parsing rule."""
    test_parser = {
        'webname': '佛山市人民政府门户网站',
        'urls': ['http://www.foshan.gov.cn/'],
        'type': 'regular',
        'pattern': r"<li class=[\s\S]*?href='([\s\S]*?)' title='([\s\S]*?)'[\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})",
        'position': {'title': 1, 'href': 0, 'time': 2}
    }
    html_parser = Html_Parser()
    r = Html_Downloader.download(test_parser['urls'][0])
    print(r)
    info_list, info_node = html_parser.parse(r, test_parser)
    for infoList in info_list:
        print(infoList)
    print('=============')
    for infoNode in info_node:
        print(infoNode)
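# How a 'regular' parser config like test_parser is presumably applied: a minimal
# sketch, NOT the project's actual Html_Parser implementation. It assumes the
# 'regular' type means re.findall with the configured pattern, and that the
# 'position' map selects which capture group feeds each named field.
import re

def apply_regular_parser_sketch(html, parser):
    # Find every match of the configured pattern in the downloaded page.
    matches = re.findall(parser['pattern'], html)
    pos = parser['position']
    results = []
    for groups in matches:
        # Map capture-group indices onto named fields per the 'position' config.
        results.append({
            'title': groups[pos['title']],
            'href': groups[pos['href']],
            'time': groups[pos['time']],
        })
    return results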
def crawl(self, parser):
    """
    Crawl every URL configured for the given site parser.
    :param parser:
    :return:
    """
    html_parser = Html_Parser()
    for url in parser['urls']:
        response = Html_Downloader.download(url)
        if response is not None:
            proxy_list = html_parser.parse(response, parser)
            if proxy_list is not None:
                # Check the crawled proxies and keep only the ones not seen before
                count, new = 0, 0
                for proxy in proxy_list:
                    count += 1
                    proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                    if proxy_str not in self.proxies_set:
                        self.proxies_set.add(proxy_str)
                        new += 1
                        self.sqlhelper.insert(proxy)
                self.url_count += 1
                logger.info('%d/%d -- <%s> fetched %d, %d not previously recorded'
                            % (self.url_count, self.url_total, url, count, new))
            else:
                self.url_count += 1
                logger.warning('%d/%d -- <%s> failed to parse data' % (self.url_count, self.url_total, url))
        else:
            self.url_count += 1
            logger.warning('%d/%d -- <%s> failed to download page' % (self.url_count, self.url_total, url))
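# The crawl() above relies on instance state initialized elsewhere. A minimal
# sketch of that setup; the names only mirror what the method references, and the
# real constructor and SqlHelper wiring in the project may differ.
class Crawler_sketch(object):
    def __init__(self, sqlhelper, parsers):
        self.sqlhelper = sqlhelper   # storage backend assumed to expose insert(proxy)
        self.proxies_set = set()     # 'ip:port' strings already recorded, used for dedup
        self.url_total = sum(len(p['urls']) for p in parsers)  # total URLs to visit
        self.url_count = 0           # URLs processed so far, reported in the log lines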
def crawl(self, parser):
    proxys = []
    html_parser = Html_Parser()
    for url in parser['urls']:
        response = Html_Downloader.download(url)
        if response is not None:
            proxylist = html_parser.parse(response, parser)
            if proxylist is not None:
                proxys.extend(proxylist)
    return proxys
def crawl(self, parser):
    proxys = []
    html_parser = Html_Parser()
    for url in parser['urls']:
        response = Html_Downloader.download(url)
        if response is not None:
            proxylist = html_parser.parse(response, parser)
            if proxylist is not None:
                proxys.extend(proxylist)
    return proxys
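# Usage sketch for the two list-returning variants above: a hypothetical driver
# that feeds each site's parser config (dicts like test_parser) to crawl() and
# collects the results. Names here are placeholders, not the project's real API.
def run_all_sketch(crawler, parsers):
    all_proxies = []
    for parser in parsers:
        all_proxies.extend(crawler.crawl(parser))
    return all_proxies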
def crawl(self, parser):
    html_parser = Html_Parser()
    for url in parser['urls']:
        response = Html_Downloader.download(url)
        if response is not None:
            proxylist = html_parser.parse(response, parser)
            if proxylist is not None:
                for proxy in proxylist:
                    proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                    if proxy_str not in self.proxies:
                        self.proxies.add(proxy_str)
                        self.queue.put(proxy)
def crawl(self, parser):
    html_parser = Html_Parser()
    for url in parser['urls']:
        response = Html_Downloader.download(url)
        if response is not None:
            proxylist = html_parser.parse(response, parser)
            if proxylist is not None:
                for proxy in proxylist:
                    proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                    if proxy_str not in self.proxies:
                        self.proxies.add(proxy_str)
                        while True:
                            if self.queue.full():
                                time.sleep(0.1)
                            else:
                                self.queue.put(proxy)
                                break
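# The queue-based variants above act as producers; a consumer is assumed to drain
# self.queue elsewhere (e.g. a validator worker). A minimal, hypothetical consumer
# loop, only to show the hand-off; validation logic and names are placeholders.
def validate_worker_sketch(queue, sqlhelper):
    while True:
        proxy = queue.get()        # blocks until the crawler has queued a proxy
        # ... verify here that the proxy actually works ...
        sqlhelper.insert(proxy)    # persist it, mirroring the sqlhelper used above
# Design note: queue.put() already blocks when a bounded queue is full, so the
# full()/sleep(0.1) polling loop mainly avoids sitting in a single blocking put();
# the plain self.queue.put(proxy) in the earlier variant behaves much the same.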
def crawl(self, parser):
    html_parser = Html_Parser()
    for url in parser['urls']:
        print('crawl URL:', url)
        response = Html_Downloader.download(url)
        if response is not None:
            proxylist = html_parser.parse(response, parser)
            if proxylist is not None:
                for proxy in proxylist:
                    proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                    if proxy_str not in self.proxies:
                        # self.proxies.add(proxy_str)  # seems to have no effect
                        # print('newly crawled proxy IP')
                        # print(proxy)
                        while True:
                            if self.queue.full():
                                time.sleep(0.1)
                            else:
                                # print('put the newly crawled proxy IP into the queue')
                                self.queue.put(proxy)
                                break