コード例 #1
0
ファイル: new_proxy.py プロジェクト: emilymwang8/fang-broker
def worker(ftype, queue):
    """Worker loop: drain *queue* and dispatch each task according to *ftype*.

    Runs until the queue is empty, then returns.

    :param ftype: task kind -- ``'fetch_proxy'`` (download a proxy-list page)
                  or ``'proxy_test'`` (speed-test a single proxy).
    :param queue: multiprocessing queue holding the tasks for this worker.
    :return: None
    """
    while True:
        # Queue drained -- this worker is done.
        if queue.empty():
            LazyFW.log('''TaskEmpty: break''')
            break

        try:
            task = queue.get_nowait()

            if ftype == 'fetch_proxy':
                fetch_proxy(task)
            elif ftype == 'proxy_test':
                proxy, speed = LazyFW.test_proxy(task, PROXY_TIMEOUT, 'http://esf.sh.fang.com/agenthome/', '搜房网','gbk')
                # test_proxy presumably yields None for a failing proxy -- only
                # store the ones that passed the check.
                if proxy is not None:
                    proxy_insert(proxy, speed)
        # "except Exception as e" replaces the Python-2-only comma syntax;
        # it is valid on Python 2.6+ and Python 3.
        except Exception as e:
            LazyFW.log('''TaskError(%s)''' % (e,))
コード例 #2
0
ファイル: new_proxy.py プロジェクト: emilymwang8/fang-broker
def _run_workers(ftype, queue, count):
    """Spawn *count* daemon worker processes over *queue* and join them all."""
    procs = []
    for _ in range(count):
        p = Process(target=worker, args=(ftype, queue))
        p.daemon = True
        p.start()
        procs.append(p)
    for p in procs:
        p.join()


def main():
    """Entry point: top up today's proxy table if low, then speed-test it.

    1. Count usable proxies (speed above PROXY_TIMEOUT) in today's table.
    2. If fewer than 10, harvest fresh proxies from the known list sites.
    3. Re-read the table and speed-test every stored proxy.
    """
    proxy_queue = Queue()
    proxy_hosts = Queue()

    create_db()

    # How many usable proxies do we already have for today?
    # NOTE(review): table suffix and threshold come from trusted config
    # (CURR_DATE / PROXY_TIMEOUT), not user input, so %-interpolation into
    # the SQL text is acceptable here.
    sql = r'''SELECT count(*) as `cnt` FROM `proxys_%s` where `speed` > %d;''' % (CURR_DATE, PROXY_TIMEOUT, )
    DB_CONN = get_conn()
    c = DB_CONN.cursor()
    LazyFW.log(sql)
    c.execute(sql)
    proxys = c.fetchone()
    c.close()

    # Fewer than 10 usable proxies -> harvest fresh ones first.
    if proxys[0] < 10:
        for url in get_proxy_urls():
            proxy_queue.put_nowait(url)
        _run_workers('fetch_proxy', proxy_queue, PROXY_THREAD_FETCH_MAX)
    DB_CONN.commit()
    DB_CONN.close()

    # Re-read the (possibly refreshed) proxy list.
    sql = r'''SELECT `host`,`port` FROM `proxys_%s` where `speed` > %d;''' % (CURR_DATE, PROXY_TIMEOUT, )
    DB_CONN = get_conn()
    LazyFW.log(sql)
    c = DB_CONN.cursor()
    c.execute(sql)
    for row in c.fetchall():
        proxy_hosts.put_nowait(row)
    c.close()
    DB_CONN.commit()
    DB_CONN.close()

    # Speed-test every stored (host, port) pair.
    _run_workers('proxy_test', proxy_hosts, PROXY_THREAD_TEST_PROXY_MAX)
コード例 #3
0
ファイル: jiandan.py プロジェクト: mylukin/Creeper
    def FetchLinksFromSource(self, url, htmlSource):
        """Parse a comment-list page and download the popular sinaimg JPGs.

        :param url: page URL (passed to LazyFW helpers for link resolution).
        :param htmlSource: raw HTML of the page.
        """
        html = LazyFW.format_url(url, htmlSource)
        html = LazyFW.clear_space(html)
        # The comment list sits between these two HTML marker comments.
        list_body = LazyFW.mid(html, '<!-- begin comments -->', '<!-- end comments -->')

        # Wrap every <li id="comment-NNN"><div> ... </li> item in
        # [BEGIN]/[END] sentinels so the items survive html2text conversion.
        list_body = re.sub(r'''(<li\s+id="comment\-[\d]+">\s*<div>)''', r'''\1<h1>[BEGIN]</h1>''', list_body,
                           flags=re.I)
        list_body = re.sub(r'''(</li>)''', r'''<h1>[END]</h1>\1''', list_body, flags=re.I)

        list_text = LazyFW.html2text(url, list_body, {
            'ignore_links': True,
            'ignore_images': False,
        })

        # re.findall always returns a list (possibly empty), never None, so
        # the original "!= None" guard was dead code -- iterate directly.
        for block in re.findall(r'''\[BEGIN\](.+?)\[END\]''', list_text, re.S):
            # "oo [N]" / "xx [N]" -- presumably up/down vote counts as
            # rendered in the page text (TODO confirm against the site).
            oo = int(LazyFW.mid(block, 'oo [', ']'))
            xx = int(LazyFW.mid(block, 'xx [', ']'))
            avg = (oo + xx) / 2
            # Keep only clearly-popular items.
            if oo > xx and oo > avg:
                for image in re.findall(r'''(http\://[^\.]+\.sinaimg\.cn/(.+?)\.jpg)''', block, re.I):
                    # image is a (full_url, path_stem) tuple; download the URL.
                    self.download_file(image[0])
コード例 #4
0
ファイル: new_proxy.py プロジェクト: emilymwang8/fang-broker
def fetch_proxy(url):
    """Download one proxy-list page and store every (host, port) found.

    Each supported proxy-list site gets its own scraping strategy; every
    extracted pair is stored via proxy_insert() with a placeholder speed.

    :param url: proxy-list page URL.
    :return: True on success (even if nothing matched), False on any error.
    """
    LazyFW.log("Fetch URL: %s" % (url))
    try:
        matches = None
        urls = urlparse(url)
        r = requests.get(url, timeout=PROXY_TIMEOUT, headers={
            'User-Agent': LazyFW.USER_AGENT,
            'Referer': 'http://%s' % (urls.hostname)
        })

        if r.status_code == 200 or r.status_code == 304:
            html = r.text
            html = LazyFW.clear_space(html)

            # www.cz88.net: plain <tr><td>ip</td><td>port</td> rows.
            if urls.hostname == 'www.cz88.net':
                matches = re.compile(r'''<tr><td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>''', re.I).findall(html)
            # www.cnproxy.com: the port digits are obfuscated as JS variables
            # (a="8";b="0";...) concatenated with document.write -- rebuild
            # the variable table, then resolve each port string.
            elif urls.hostname == 'www.cnproxy.com':
                matches = []
                script = LazyFW.mid(html, '<SCRIPT type="text/javascript">', '</SCRIPT>')
                block = LazyFW.mid(html, '<div id="proxylisttb">', '<div class="proxylistnav">')
                tmp_vars = script.split(';')
                js_vars = {}
                for line in tmp_vars:
                    if line != '':
                        var = re.compile(r'''(^[a-z])\="([^"]+)"''', re.I).findall(line)
                        js_vars[var[0][0]] = var[0][1]

                host_lists = re.compile(r'''<td>([^<]+)<SCRIPT[^>]*>document\.write\("\:"([^)]+)\)</SCRIPT></td>''',
                                        re.I).findall(block)
                for line in host_lists:
                    # line[1] looks like "+a+b+c": split into variable names
                    # and map each one back to its digit.
                    tmp_arr = line[1].lstrip('+').split('+')
                    ports = []
                    for k in tmp_arr:
                        ports.append(js_vars[k])

                    match = (line[0], ''.join(ports))
                    matches.append(match)
            # www.xici.net.co: <td>119.233.255.24</td> <td>80</td> ... <td>HTTP</td>
            elif urls.hostname == 'www.xici.net.co':
                matches = re.compile(r'''<td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>.+?<td>HTTP<\/td>''',
                                     re.I).findall(html)
            # proxy.com.ru: <td>41.222.196.52</td><td>8080</td>
            elif urls.hostname == 'proxy.com.ru':
                matches = re.compile(r'''<td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>''',
                                     re.I).findall(html)

            # free-proxy.cz (the original comment mislabeled this branch):
            # </div> 117.166.75.36</td><td><span class="fport">8123</span></td><td><small>HTTP</small></td>
            elif urls.hostname == 'free-proxy.cz':
                matches = re.compile(
                    r'''</div>\s*([\w\d\.]+?)<\/td><td><span\s+class="fport">(\d+)<\/span><\/td><td><small>HTTP<\/small><\/td>''',
                    re.I).findall(html)

            # checkerproxy.net: bare "122.227.8.190:80" lines.
            elif urls.hostname == 'checkerproxy.net':
                matches = re.compile(
                    r'''([\w\d\.]+?)\:(\d+)''',
                    re.I).findall(html)

        # Store whatever we scraped; 999999 is a "not yet speed-tested" marker
        # (TODO confirm against proxy_insert's contract).
        if matches is not None:
            length = 0
            for proxy in matches:
                insertOk = proxy_insert(proxy, 999999)
                if insertOk:
                    length += 1

            LazyFW.log("Add Proxy: %d/%d %s" % (length, len(matches), url))

    except Exception as e:
        # Best-effort scraper: keep the old swallow-and-return-False
        # behavior, but at least log what went wrong instead of hiding it.
        LazyFW.log('''FetchError(%s)''' % (e,))
        return False

    return True
コード例 #5
0
ファイル: new_proxy.py プロジェクト: emilymwang8/fang-broker
import os
import random
import time
from urlparse import urlparse
import re
import sys
import LazyFW
import MySQLdb as mysql
from multiprocessing import Process
from multiprocessing.queues import Queue
import requests

__author__ = 'Lukin'

# Number of worker processes that download proxy-list pages.
PROXY_THREAD_FETCH_MAX = int(LazyFW.config('Proxy', 'FETCH_THREAD_MAX'))
# Number of worker processes that speed-test proxies.
PROXY_THREAD_TEST_PROXY_MAX = int(LazyFW.config('Proxy', 'TEST_THREAD_MAX'))
# Proxy timeout; also used as the speed threshold in the SQL queries.
# NOTE(review): unit (seconds/ms) is defined by the config -- confirm there.
PROXY_TIMEOUT = int(LazyFW.config('Proxy', 'TIMEOUT'))

# Today's date as 'YYYYMMDD'; used as the per-day proxy table-name suffix.
CURR_DATE = LazyFW.t2date(time.time(), '%Y%m%d')

# Database connection settings, all read from the shared config.
DB_HOST = str(LazyFW.config('DB', 'HOST'))
DB_USER = str(LazyFW.config('DB', 'USER'))
DB_PASS = str(LazyFW.config('DB', 'PASS'))
DB_NAME = str(LazyFW.config('DB', 'NAME'))
DB_PORT = int(LazyFW.config('DB', 'PORT'))