Code Example #1
def crawl(self, parser):
    """
    Crawl proxy pages.
    :param parser: parser config with 'urls' and parse rules
    :return:
    """
    html_parser = Html_Parser()
    for url in parser['urls']:
        response = Html_Downloader.download(url)
        if response is not None:
            proxy_list = html_parser.parse(response, parser)
            if proxy_list is not None:
                # Check the crawled proxies
                count, new = 0, 0
                for proxy in proxy_list:
                    count += 1
                    proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                    if proxy_str not in self.proxies_set:
                        self.proxies_set.add(proxy_str)
                        new += 1
                        self.sqlhelper.insert(proxy)
                self.url_count += 1
                logger.info(
                    '%d/%d -- <%s> fetched %d, %d new' %
                    (self.url_count, self.url_total, url, count, new))
            else:
                self.url_count += 1
                logger.warning('%d/%d -- <%s> failed to parse data' %
                               (self.url_count, self.url_total, url))
        else:
            self.url_count += 1
            logger.warning('%d/%d -- <%s> failed to download page' %
                           (self.url_count, self.url_total, url))
Code Example #2
File: develop.py  Project: dalaomai/InfoPool
def developParser():
    '''
    Example of parser-rule development.
    '''

    test_parser = {
        'webname': '佛山市人民政府门户网站',
        'urls': ['http://www.foshan.gov.cn/'],
        'type': 'regular',
        'pattern':
        r"<li class=[\s\S]*?href='([\s\S]*?)' title='([\s\S]*?)'[\s\S]*?([0-9]{4}-[0-9]{2}-[0-9]{2})",
        'position': {
            'title': 1,
            'href': 0,
            'time': 2
        }
    }
    html_parser = Html_Parser()
    r = Html_Downloader.download(test_parser['urls'][0])
    print(r)
    info_list, info_node = html_parser.parse(r, test_parser)
    for info in info_list:
        print(info)
    print('=============')
    for node in info_node:
        print(node)
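
The 'regular' parser type above pairs a regex with a position map that tells the parser which capture group feeds each field. A minimal sketch of how such a config might be applied, assuming the parser uses re.findall (the actual Html_Parser here also returns node info, which this sketch omits):

import re

def parse_regular(html, parser):
    # Hypothetical helper: run the config's regex over the page and
    # map each capture group to its field via the 'position' dict.
    matches = re.findall(parser['pattern'], html)
    return [{field: groups[idx] for field, idx in parser['position'].items()}
            for groups in matches]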
Code Example #3
def crawl_alone(parser):
    html_parser = Html_Parser()
    url = 'http://www.66ip.cn/areaindex_1/1.html'
    res = Html_Downloader.download(url)
    if res:
        proxylist = html_parser.parse(res, parser)
        print(proxylist)
        for proxy in proxylist:
            proxy_str = 'ip=%s:port=%s' % (proxy['ip'], proxy['port'])
            print(proxy_str)
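
crawl_alone hard-codes the URL but still needs a parser config for the parse step. A hypothetical config for 66ip.cn, assuming the 'regular' (regex) parser type from Code Example #2; the real config used by these projects is not shown in this listing:

parser = {
    'urls': ['http://www.66ip.cn/areaindex_1/1.html'],
    'type': 'regular',  # assumed; the project's actual type value is not shown
    'pattern': r'<tr><td>(\d+\.\d+\.\d+\.\d+)</td><td>(\d+)</td>',  # placeholder regex
    'position': {'ip': 0, 'port': 1},
}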
Code Example #4
File: ProxyCrawl.py  Project: mhfh/IPProxyPool
def crawl(self, parser):
    html_parser = Html_Parser()
    for url in parser['urls']:
        response = Html_Downloader.download(url)
        if response is not None:
            proxylist = html_parser.parse(response, parser)
            if proxylist is not None:
                for proxy in proxylist:
                    proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                    if proxy_str not in self.proxies:
                        self.proxies.add(proxy_str)
                        self.queue.put(proxy)
Code Example #5
def crawl(parser):
    html_parser = Html_Parser()
    for url in parser['urls']:
        response = Html_Downloader.download(url)
        print("Downloading URL page: {}".format(url))
        if response is not None:
            proxylist = html_parser.parse(response, parser)
            print("Proxy list: {}".format(proxylist))
            if proxylist is not None:
                for proxy in proxylist:
                    proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                    print(proxy_str)
Code Example #6
def crawl(self, parser):
    html_parser = Html_Parser()
    for url in parser['urls']:
        response = Html_Downloader.download(url)
        if response is not None:
            proxylist = html_parser.parse(response, parser)
            if proxylist is not None:
                for proxy in proxylist:
                    proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                    if proxy_str not in self.proxies:
                        self.proxies.add(proxy_str)
                        while True:
                            if self.queue.full():
                                time.sleep(0.1)
                            else:
                                self.queue.put(proxy)
                                break
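
The busy-wait on queue.full() above is unnecessary: a standard queue.Queue.put() already blocks until a slot frees up. A minimal sketch, assuming self.queue is a queue.Queue:

if proxy_str not in self.proxies:
    self.proxies.add(proxy_str)
    # put() blocks while the queue is full, so no polling loop is needed
    self.queue.put(proxy)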
Code Example #7
def crawl(self, parser):
    html_parser = Html_Parser()
    for url in parser['urls']:
        print('crawl URL:', url)
        response = Html_Downloader.download(url)
        if response is not None:
            proxylist = html_parser.parse(response, parser)
            if proxylist is not None:
                for proxy in proxylist:
                    proxy_str = '%s:%s' % (proxy['ip'], proxy['port'])
                    if proxy_str not in self.proxies:
                        # self.proxies.add(proxy_str)  # seems to have no effect
                        # print('newly crawled proxy IP')
                        # print(proxy)
                        while True:
                            if self.queue.full():
                                time.sleep(0.1)
                            else:
                                # print('put the newly crawled proxy IP into the queue')
                                self.queue.put(proxy)
                                break
Code Example #8
from spider.HtmlDownloader import Html_Downloader

url = 'http://www.66ip.cn/'
res = Html_Downloader.download(url)
print(res)
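
Every example above treats a None return from Html_Downloader.download as a failed fetch. The downloader itself never appears in this listing; a minimal sketch of what such a helper might look like, assuming it wraps requests and returns the page text on success:

import requests

class Html_Downloader(object):
    # Hypothetical sketch of the downloader these examples depend on;
    # the real implementation is not shown in this listing.
    @staticmethod
    def download(url):
        try:
            r = requests.get(url, timeout=10)
            if r.status_code == 200:
                return r.text
        except requests.RequestException:
            pass
        return None  # callers check for None to detect failure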