def crawl(self, parser): """ 爬取 :param parser: :return: """ html_parser = Html_Parser() for url in parser['urls']: response = Html_Downloader.download(url) if response is not None: proxy_list = html_parser.parse(response, parser) if proxy_list is not None: # 检查爬取到的proxy count, new = 0, 0 for proxy in proxy_list: count += 1 proxy_str = '%s:%s' % (proxy['ip'], proxy['port']) if proxy_str not in self.proxies_set: self.proxies_set.add(proxy_str) new += 1 self.sqlhelper.insert(proxy) self.url_count += 1 logger.info( '%d/%d -- <%s> 获取%d, 未记录的%d' % (self.url_count, self.url_total, url, count, new)) else: self.url_count += 1 logger.warning('%d/%d -- <%s> 解析数据错误' % (self.url_count, self.url_total, url)) else: self.url_count += 1 logger.warning('%d/%d -- <%s> 下载页面错误' % (self.url_count, self.url_total, url))
def developParser():
    '''
    Example of developing a parser rule
    '''
    test_parser = {
        'webname': '佛山市人民政府门户网站',
        'urls': ['http://www.foshan.gov.cn/'],
        'type': 'regular',
        'pattern': r"<li class=[\s\S]*?href='([\s\S]*?)' title='([\s\S]*?)'[\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})",
        'position': {'title': 1, 'href': 0, 'time': 2}
    }
    html_parser = Html_Parser()
    r = Html_Downloader.download(test_parser['urls'][0])
    print(r)
    info_list, info_node = html_parser.parse(r, test_parser)
    for infoList in info_list:
        print(infoList)
    print('=============')
    for infoNode in info_node:
        print(infoNode)
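# A minimal standalone sketch of how a 'regular'-type rule is presumably applied:
# the pattern's capture groups are mapped to named fields through 'position'.
# apply_regular_rule, re.findall, and the sample HTML below are illustrative
# assumptions, not Html_Parser's actual implementation.
import re

def apply_regular_rule(html, rule):
    items = []
    for groups in re.findall(rule['pattern'], html):
        # Each match is a tuple of capture groups; 'position' maps a field name
        # to the group index that holds it.
        items.append({field: groups[index] for field, index in rule['position'].items()})
    return items

sample_rule = {
    'pattern': r"<li class=[\s\S]*?href='([\s\S]*?)' title='([\s\S]*?)'[\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})",
    'position': {'title': 1, 'href': 0, 'time': 2},
}
sample_html = "<li class='news'><a href='/zwgk/1.html' title='Sample notice'>Sample notice</a> 2023-01-01</li>"
print(apply_regular_rule(sample_html, sample_rule))
# [{'title': 'Sample notice', 'href': '/zwgk/1.html', 'time': '2023-01-01'}]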
def crawl_alone(parser):
    # Crawl a single, hard-coded page and print the proxies parsed from it.
    html_parser = Html_Parser()
    url = 'http://www.66ip.cn/areaindex_1/1.html'
    res = Html_Downloader.download(url)
    if res:
        proxylist = html_parser.parse(res, parser)
        print(proxylist)
        for proxy in proxylist:
            proxy_str = 'ip=%s:port=%s' % (proxy['ip'], proxy['port'])
            print(proxy_str)
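# crawl_alone() receives its rule from the caller, which the snippet does not show.
# Below is a hypothetical config following the same shape as the rule in
# developParser(), with 'position' keys matching the 'ip'/'port' fields read above.
# The pattern itself is illustrative only, not the project's actual 66ip.cn rule.
example_66ip_parser = {
    'webname': '66ip',
    'urls': ['http://www.66ip.cn/areaindex_1/1.html'],
    'type': 'regular',
    'pattern': r"<tr><td>(\d+\.\d+\.\d+\.\d+)</td><td>(\d+)</td>",
    'position': {'ip': 0, 'port': 1},
}
# crawl_alone(example_66ip_parser)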
def crawl(self, parser):
    html_parser = Html_Parser()
    for url in parser['urls']:
        response = Html_Downloader.download(url)
        if response is not None:
            proxylist = html_parser.parse(response, parser)
            if proxylist is not None:
                for proxy in proxylist:
                    proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                    if proxy_str not in self.proxies:
                        self.proxies.add(proxy_str)
                        self.queue.put(proxy)
def crawl(parser):
    html_parser = Html_Parser()
    for url in parser['urls']:
        response = Html_Downloader.download(url)
        print("Downloading URL page: {}".format(url))
        if response is not None:
            proxylist = html_parser.parse(response, parser)
            print("Proxy list: {}".format(proxylist))
            if proxylist is not None:
                for proxy in proxylist:
                    proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                    print(proxy_str)
def crawl(self, parser):
    html_parser = Html_Parser()
    for url in parser['urls']:
        response = Html_Downloader.download(url)
        if response is not None:
            proxylist = html_parser.parse(response, parser)
            if proxylist is not None:
                for proxy in proxylist:
                    proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                    if proxy_str not in self.proxies:
                        self.proxies.add(proxy_str)
                        # Back off briefly while the bounded queue is full, then enqueue.
                        while True:
                            if self.queue.full():
                                time.sleep(0.1)
                            else:
                                self.queue.put(proxy)
                                break
def crawl(self, parser):
    html_parser = Html_Parser()
    for url in parser['urls']:
        print('crawl URL:', url)
        response = Html_Downloader.download(url)
        if response is not None:
            proxylist = html_parser.parse(response, parser)
            if proxylist is not None:
                for proxy in proxylist:
                    proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                    if proxy_str not in self.proxies:
                        # self.proxies.add(proxy_str)  # seems to have no effect here
                        # print('Newly crawled proxy IP')
                        # print(proxy)
                        while True:
                            if self.queue.full():
                                time.sleep(0.1)
                            else:
                                # print('Put the newly crawled proxy into the queue')
                                self.queue.put(proxy)
                                break
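# A minimal standalone sketch of the queue-full backoff loop used in the two
# crawl() variants above, assuming the shared queue is a bounded queue.Queue
# consumed by another thread. The produce/consume names, maxsize, and sample
# proxies are illustrative only; the class snippets do not show what self.queue is.
import queue
import threading
import time

def produce(q, items):
    for item in items:
        # Same pattern as crawl(): poll until the bounded queue has room,
        # instead of blocking indefinitely on q.put().
        while True:
            if q.full():
                time.sleep(0.1)
            else:
                q.put(item)
                break

def consume(q, total):
    for _ in range(total):
        print(q.get())

if __name__ == '__main__':
    q = queue.Queue(maxsize=2)
    proxies = ['1.1.1.1:80', '2.2.2.2:8080', '3.3.3.3:3128']
    t = threading.Thread(target=consume, args=(q, len(proxies)))
    t.start()
    produce(q, proxies)
    t.join()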
from spider.HtmlDownloader import Html_Downloader

url = 'http://www.66ip.cn/'
res = Html_Downloader.download(url)
print(res)