Code example #1
    def _crawl_site_6(self):
        '''
        Zhandaye proxy (ip.zdaye.com): crawl the proxies from the three most recent updates
        :return:
        '''
        main_url = 'http://ip.zdaye.com/dayProxy.html'

        resp = requests.get(main_url, headers=self.header)
        soup = BeautifulSoup(resp.content, 'lxml')

        urls = [u.find('a').attrs['href'] for u in soup.find_all('h3', class_='thread_title')]

        header = copy(self.header)
        header['referer'] = main_url
        for url in urls[0:3]:
            resp = requests.get('http://ip.zdaye.com' + url, headers=header)
            soup = BeautifulSoup(resp.content, 'lxml')
            text = soup.find('div', class_='cont').text
            # NOTE: HTTPS must come before HTTP in the alternation, otherwise the
            # regex engine stops after matching 'HTTP' and never consumes the 'S'.
            pattern = r'((\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])):(\d*)@(HTTPS|HTTP)'

            ip_list = re.findall(pattern, text)
            for ip_items in ip_list:
                item = {
                    'ip': ip_items[0],
                    'port': ip_items[5],
                    'type': ip_items[-1]
                }
                # print(item)
                ProxyManager.feed_pool(json.dumps(item))
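
In the pattern above, group 1 is the whole IP address, groups 2-5 are its octets, group 6 is the port, and group 7 is the scheme, which is why the loop reads ip_items[0], ip_items[5] and ip_items[-1]. A standalone check against a made-up sample line:

import re

PATTERN = r'((\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])):(\d*)@(HTTPS|HTTP)'

sample = '1.2.3.4:8080@HTTPS 5.6.7.8:3128@HTTP'  # hypothetical page text
for groups in re.findall(PATTERN, sample):
    print(groups[0], groups[5], groups[-1])
# -> 1.2.3.4 8080 HTTPS
# -> 5.6.7.8 3128 HTTP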
Code example #2
    def _crawl_site_5(self):
        '''
        Xiaoshu proxy (xsdaili.com): crawl the proxies updated in the last two days
        :return:
        '''
        main_url = 'http://www.xsdaili.com'
        resp = requests.get(main_url, headers=self.header)
        soup = BeautifulSoup(resp.text, 'lxml')

        urls = [u.find('a').attrs['href'] for u in soup.find_all('div', class_='title')]

        for url in urls[0:2]:
            resp = requests.get(main_url + url, headers=self.header)
            soup = BeautifulSoup(resp.content, 'lxml')

            text = soup.find('div', class_='cont').text
            # NOTE: HTTPS must come before HTTP in the alternation, otherwise the
            # regex engine stops after matching 'HTTP' and never consumes the 'S'.
            pattern = r'((\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])\.(\d{1,2}|1\d\d|2[0-4]\d|25[0-5])):(\d*)@(HTTPS|HTTP)'

            ip_list = re.findall(pattern, text)
            for ip_items in ip_list:
                item = {
                    'ip': ip_items[0],
                    'port': ip_items[5],
                    'type': ip_items[-1]
                }
                ProxyManager.feed_pool(json.dumps(item))
Code example #3
    def run(self):
        try:
            pm = ProxyManager()
            proxies = set()
            tasks = []
            # Run every crawler as a fetch task and wait for all of them.
            self.__gen_fetch_tasks(tasks, proxies)
            self.__wait_fetch(tasks)
            logger.info('Fetched %d proxies' % len(proxies))
            # Only verify proxies that are not already in the pool.
            proxies = self.__remove_exist_proxies(pm, proxies)
            if proxies:
                verify_tasks = self.__gen_gevent_tasks(proxies)
                self.__wait_for_gevent_tasks(verify_tasks)
                self.__write_verify_result(pm, proxies)
            pm.close()
            gc.collect()
            logger.info('Proxy fetch finished, wait for 10 min')
        except Exception as e:
            logger.exception(e)
Code example #4
    def _crawl_site_3(self):
        '''
        Crawl the data5u free proxies; there are only 10
        :return:
        '''
        url = 'http://www.data5u.com/'
        resp = requests.get(url, headers=self.header)
        soup = BeautifulSoup(resp.text, 'lxml')

        rows = soup.find_all('ul', class_='l2')
        for row in rows:
            tds = row.find_all('li')
            item = {
                'ip': tds[0].text,
                'port': tds[1].text,
                'type': tds[3].text
            }
            ProxyManager.feed_pool(json.dumps(item))
Code example #5
    def _crawl_site_7(self):
        '''
        Mimvp proxy; guests can only see the first page
        :return:
        '''
        url = 'https://proxy.mimvp.com/freeopen.php'
        resp = requests.get(url, headers=self.header)
        soup = BeautifulSoup(resp.text, 'lxml')

        rows = soup.find('div', class_='free-list').find_all('tr')
        for row in rows[1:]:
            tds = row.find_all('td')
            item = {
                'ip': tds[0].text,
                'port': tds[1].text,  # the port is an image; OCR is needed to turn it into text
                'type': tds[3].text
            }
            ProxyManager.feed_pool(json.dumps(item))
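
As the comment above notes, mimvp renders the port number as a small image, so tds[1].text comes back empty. A sketch of one way to recover it with pytesseract; the ocr_port helper name and the image URL layout are assumptions, and the Tesseract binary itself must be installed:

import io

import pytesseract
import requests
from PIL import Image


def ocr_port(img_url, headers=None):
    # Download the port image and OCR it as a single line of digits.
    resp = requests.get(img_url, headers=headers)
    img = Image.open(io.BytesIO(resp.content))
    text = pytesseract.image_to_string(
        img, config='--psm 7 -c tessedit_char_whitelist=0123456789')
    return text.strip()

Inside the row loop this would replace the tds[1].text lookup, e.g. port = ocr_port('https://proxy.mimvp.com/' + tds[1].find('img')['src'], headers=self.header).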
Code example #6
    def _crawl_site_4(self):
        '''
        Yun proxy (ip3366.net): crawl 10 pages
        :return:
        '''
        for i in range(1, 11):
            url = 'http://www.ip3366.net/?stype=1&page={0}'.format(i)
            resp = requests.get(url, headers=self.header)
            soup = BeautifulSoup(resp.text, 'lxml')

            rows = soup.find('div', id='list').find_all('tr')
            for row in rows[1:]:
                tds = row.find_all('td')
                item = {
                    'ip': tds[0].text,
                    'port': tds[1].text,
                    'type': tds[3].text
                }
                ProxyManager.feed_pool(json.dumps(item))
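
Most of the table-style sources on this page (data5u, ip3366, xici, superfastip, wndaili, kuaidaili) differ only in the URL, the container holding the rows, and which <td> holds each field. A hedged refactor sketch; _crawl_table is a made-up helper name, not part of the project:

    def _crawl_table(self, url, select_rows, idx=(0, 1, 3)):
        # select_rows: callable that returns the data rows of the parsed page
        # idx: positions of the ip / port / type cells inside each row
        resp = requests.get(url, headers=self.header)
        soup = BeautifulSoup(resp.text, 'lxml')
        for row in select_rows(soup):
            tds = row.find_all('td')
            item = {
                'ip': tds[idx[0]].text.strip(),
                'port': tds[idx[1]].text.strip(),
                'type': tds[idx[2]].text.strip()
            }
            ProxyManager.feed_pool(json.dumps(item))

With that in place, _crawl_site_4 would reduce to a loop that calls self._crawl_table(url, lambda s: s.find('div', id='list').find_all('tr')[1:]) for each page.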
Code example #7
    def _crawl_site_0(self):
        '''
        Crawl the IPs from the first four pages of Xici proxy
        :return:
        '''
        for i in range(1, 5):
            url = 'https://www.xicidaili.com/nn/{0}'.format(i)

            resp = requests.get(url, headers=self.header)
            soup = BeautifulSoup(resp.text, 'lxml')

            rows = soup.find('table', id='ip_list').find_all('tr')
            for row in rows[1:]:
                tds = row.find_all('td')
                item = {
                    'ip': tds[1].text,
                    'port': tds[2].text,
                    'type': tds[5].text
                }
                ProxyManager.feed_pool(json.dumps(item))
Code example #8
    def _crawl_site_2(self):
        '''
        Crawl the first 10 pages of Superfast proxy (superfastip.com)
        :return:
        '''
        for i in range(1, 11):
            url = 'http://www.superfastip.com/welcome/freeip/{0}'.format(i)

            resp = requests.get(url, headers=self.header)
            soup = BeautifulSoup(resp.text, 'lxml')

            rows = soup.find_all('div', class_='row clearfix')[2].find_all('tr')
            for row in rows[1:]:
                tds = row.find_all('td')
                item = {
                    'ip': tds[0].text,
                    'port': tds[1].text,
                    'type': tds[3].text
                }
                ProxyManager.feed_pool(json.dumps(item))
Code example #9
    def _crawl_site_10(self):
        '''
        Wanneng proxy (wndaili.cn): crawl the first ten pages
        :return:
        '''
        for i in range(1, 11):
            url = 'http://wndaili.cn/?page={0}'.format(i)
            resp = requests.get(url, headers=self.header)
            soup = BeautifulSoup(resp.text, 'lxml')

            rows = soup.find('div', id='list').find_all('tr')
            for row in rows[1:]:
                tds = row.find_all('td')
                item = {
                    'ip': tds[0].text,
                    'port': tds[1].text,
                    'type': tds[3].text
                }
                # print(item)
                ProxyManager.feed_pool(json.dumps(item))
Code example #10
    def _crawl_site_1(self):
        '''
        Crawl the IPs from the first three pages of Kuaidaili
        :return:
        '''
        for i in range(1, 4):
            url = 'https://www.kuaidaili.com/free/inha/{0}'.format(i)

            resp = requests.get(url, headers=self.header)
            soup = BeautifulSoup(resp.text, 'lxml')

            rows = soup.find('div', id='list').find_all('tr')
            for row in rows[1:]:
                tds = row.find_all('td')
                item = {
                    'ip': tds[0].text,
                    'port': tds[1].text,
                    'type': tds[3].text
                }
                ProxyManager.feed_pool(json.dumps(item))

            time.sleep(2)
Code example #11
    def _crawl_site_8(self):
        '''
        Xila proxy (xiladaili.com)
        :return:
        '''
        url = 'http://www.xiladaili.com'
        resp = requests.get(url, headers=self.header)
        soup = BeautifulSoup(resp.text, 'lxml')

        tables = soup.find_all('table', class_='fl-table')
        # crawl both the HTTP and the HTTPS tables
        for t in tables[1:3]:
            for row in t.find_all('tr')[2:]:
                tds = row.find_all('td')
                ip, port = tds[0].text.split(':')
                item = {
                    'ip': ip,
                    'port': port,
                    'type': tds[2].text
                }
                # print(item)
                ProxyManager.feed_pool(json.dumps(item))
Code example #12
    def _crawl_site_9(self):
        '''
        Nima proxy (nimadaili.com)
        :return:
        '''
        url = 'http://www.nimadaili.com'
        resp = requests.get(url, headers=self.header)
        soup = BeautifulSoup(resp.text, 'lxml')

        tables = soup.find_all('div', id='overflow')
        # crawl both the HTTP and the HTTPS sections
        for t in tables[2:]:
            for row in t.find_all('tr')[1:-1]:
                tds = row.find_all('td')
                ip, port = tds[0].text.split(':')
                item = {
                    'ip': ip,
                    'port': port,
                    'type': tds[2].text
                }
                # print(item)
                ProxyManager.feed_pool(json.dumps(item))
Code example #13
    def run(self):
        try:
            proxy_manager = ProxyManager()
            logger.info("Start proxy verify")
            while True:
                # Re-check proxies last verified more than 30 minutes ago.
                proxies = proxy_manager.proxy_verified_before(minutes=30,
                                                              limit=1000)
                if not proxies:
                    logger.info(
                        'No proxy needs to be verified! Sleep [ 5 ] minutes.')
                    proxy_manager.close()
                    break
                verifier = ProxyGeventVerifier()
                start = time.time()
                tasks = verifier.generate_tasks(proxies)
                logger.info('Created %d verify tasks' % len(tasks))
                gevent.joinall(tasks)
                logger.info('Proxy verify took %.1f sec.' %
                            (time.time() - start))
                passed, failed = 0, 0
                for proxy in proxies:
                    if proxy.usable:
                        passed += 1
                        proxy_manager.verify_passed(proxy)
                    else:
                        failed += 1
                        proxy_manager.verify_failed(proxy)
                proxy_manager.commit()
                logger.info('Verification complete! %d passed / %d failed' %
                            (passed, failed))
                proxy_manager.remove_bad_proxy()
        except Exception as e:
            logger.exception(e)
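
ProxyGeventVerifier itself is not shown on this page. A minimal sketch of the shape such a gevent-based verifier could take, assuming the proxy model exposes ip, port, type, and a writable usable flag; the probe URL and timeout are made up:

from gevent import monkey
monkey.patch_all()  # patch sockets early so requests yields between greenlets

import gevent
import requests

CHECK_URL = 'http://httpbin.org/ip'  # assumed probe endpoint


class ProxyGeventVerifier:
    def generate_tasks(self, proxies):
        # One greenlet per proxy; each sets proxy.usable as a side effect.
        return [gevent.spawn(self._verify, p) for p in proxies]

    @staticmethod
    def _verify(proxy):
        addr = '%s://%s:%s' % (proxy.type.lower(), proxy.ip, proxy.port)
        try:
            resp = requests.get(CHECK_URL,
                                proxies={'http': addr, 'https': addr},
                                timeout=10)
            proxy.usable = resp.status_code == 200
        except requests.RequestException:
            proxy.usable = False

gevent.joinall(tasks) in run() then simply blocks until every greenlet has finished.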
Code example #14
from manager import ProxyManager
from model import ProxyModel

if __name__ == '__main__':
    # Config.Base.metadata.drop_all(Config.engine)
    # Config.Base.metadata.create_all(Config.engine)
    pm = ProxyManager()
    pm.add_proxy(ProxyModel.instance('http://27.208.25.190:8060'))
Code example #15
import time
import traceback
from configparser import ConfigParser

from manager import ProxyManager
from utils import Util

#  refresh available ip proxy pool

log_file = 'refresh.log'
Util.log_to_file(
    'Refresh job is starting up at {0}.'.format(
        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))), 0,
    log_file)
while True:
    try:
        Util.log_to_file('Start refreshing proxy pool.', 0, log_file)
        result = ProxyManager.refresh_proxy_pool()
        Util.log_to_file('Refresh finished.', 0, log_file)
        Util.log_to_file(result, 0, log_file)

        cp = ConfigParser()
        cp.read('config', encoding='utf-8')
        interval = cp.get('scheduler', 'refresh_interval')

        Util.log_to_file('Refresh job begin to sleep.', 0, log_file)
        time.sleep(int(interval) * 60)

        Util.log_to_file('Refresh job wake up and start next refresh.', 0,
                         log_file)
    except Exception:
        Util.log_to_file(traceback.format_exc(), 1, log_file)
        Util.log_to_file(
            'Refresh job failed running, this job will be shut down.', 0,
            log_file)
        break
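
The loop above expects a file named config with a [scheduler] section next to the script. A hedged example of the minimal layout ConfigParser needs here; the interval value is made up and is interpreted as minutes:

# contents of ./config (INI format)
[scheduler]
refresh_interval = 30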
Code example #16
# coding:utf-8
import sys
from datetime import datetime

from flask import Flask, render_template, json, request

sys.path.append('../')

from utils import Config
from manager import ProxyManager

app = Flask(__name__)
app.config['JSONIFY_PRETTYPRINT_REGULAR'] = True
proxy_manager = ProxyManager()


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/api/proxy', methods=['POST'])
def api_proxy():
    args = request.json or {}
    start = int(args.get('start', 0))
    length = int(args.get('length', 10))
    draw = int(args.get('draw', 1))
    order = args.get('order', [])
    if order:
        column_name = args['columns'][order[0]['column']]['name']
        print('Order by', column_name)
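
The draw / start / length / order / columns parameters are the jQuery DataTables server-side protocol, so the handler presumably ends by returning the matching response envelope. A hedged sketch of such an ending, with jsonify added to the existing flask import; get_proxies() and to_dict() are assumed names, not the project's actual API:

    # ... continuing api_proxy()
    rows, total = proxy_manager.get_proxies(start, length)  # assumed query helper
    return jsonify({
        'draw': draw,              # echoed back so DataTables pairs request and response
        'recordsTotal': total,     # row count before filtering
        'recordsFiltered': total,  # row count after filtering
        'data': [r.to_dict() for r in rows]
    })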
Code example #17
File: main.py  Project: ArP2018/WebCrawling
import time

from manager import ProxyManager

s = time.time()
ProxyManager.validate()
e = time.time()

print(e - s)  # elapsed seconds
Code example #18
File: validation_job.py  Project: ArP2018/WebCrawling
import time
import traceback

from manager import ProxyManager
from utils import Util

#  validate the ip proxies queued in the pool

log_file = 'validation.log'
Util.log_to_file(
    'Validation job is starting up at {0}.'.format(
        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))), 0,
    log_file)
while True:
    try:
        Util.log_to_file('Begin validating ip proxy queue.', 0, log_file)
        result = ProxyManager.validate()
    except Exception:
        Util.log_to_file(traceback.format_exc(), 1, log_file)
        Util.log_to_file(
            'Validation job failed running, this job will be shut down.', 0,
            log_file)

        break

    Util.log_to_file('Validation job complete.', 0, log_file)
    time.sleep(600)

Util.log_to_file(
    'Validation job is ending at {0}.'.format(
        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))), 0,
    log_file)
Code example #19
File: test_sqlalchemy.py  Project: WINNY1/Proxy-Pool
from model import ProxyModel
from manager import ProxyManager

if __name__ == '__main__':
    # Config.Base.metadata.drop_all(Config.engine)
    # Config.Base.metadata.create_all(Config.engine)
    pm = ProxyManager()
    pm.add_proxy(ProxyModel.from_url('http://27.208.25.190:8060'))