コード例 #1
0
ファイル: jiandan.py プロジェクト: mylukin/Creeper
    def FetchLinksFromSource(self, url, htmlSource):
        """Download images from comments whose ``oo`` count beats ``xx``.

        The markup between ``<!-- begin comments -->`` and
        ``<!-- end comments -->`` is split into one text block per
        ``<li id="comment-N">`` item.  For every block whose ``oo [n]``
        counter is greater than its ``xx [n]`` counter, each
        ``http://*.sinaimg.cn/...jpg`` URL found in the block is passed to
        ``self.download_file``.

        Args:
            url: Address of the page; forwarded to ``LazyFW.format_url`` and
                ``LazyFW.html2text``.
            htmlSource: HTML text of the page.

        Returns:
            None.
        """
        html = LazyFW.format_url(url, htmlSource)
        html = LazyFW.clear_space(html)
        list_body = LazyFW.mid(html, '<!-- begin comments -->', '<!-- end comments -->')

        # Surround every comment item (e.g. `<li id="comment-2792510"> <div>`)
        # with [BEGIN]/[END] headings so the items can still be separated
        # after the markup has been converted to plain text.
        list_body = re.sub(r'''(<li\s+id="comment\-[\d]+">\s*<div>)''', r'''\1<h1>[BEGIN]</h1>''', list_body,
                           flags=re.I)
        list_body = re.sub(r'''(</li>)''', r'''<h1>[END]</h1>\1''', list_body, flags=re.I)

        list_text = LazyFW.html2text(url, list_body, {
            'ignore_links': True,
            'ignore_images': False,
        })

        # re.findall always returns a list, so no None check is required.
        for block in re.findall(r'''\[BEGIN\](.+?)\[END\]''', list_text, re.S):
            try:
                oo = int(LazyFW.mid(block, 'oo [', ']'))
                xx = int(LazyFW.mid(block, 'xx [', ']'))
            except (TypeError, ValueError):
                # A block without readable counters is skipped instead of
                # aborting the processing of every block that follows it.
                continue

            # `oo > xx` already implies `oo > (oo + xx) / 2`, so the single
            # comparison is equivalent to the former two-part condition.
            if oo <= xx:
                continue

            # Each match is (full_url, path_without_extension); only the
            # full URL is needed for the download.
            for image in re.findall(r'''(http\://[^\.]+\.sinaimg\.cn/(.+?)\.jpg)''', block, re.I):
                self.download_file(image[0])
コード例 #2
0
ファイル: new_proxy.py プロジェクト: emilymwang8/fang-broker
# Regular expressions for the plain listing pages, keyed by host name.
# Each pattern captures (host, port) and is applied case-insensitively.
_PROXY_LIST_PATTERNS = {
    # www.cz88.net
    'www.cz88.net': r'''<tr><td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>''',
    # www.xici.net.co, e.g. <td>119.233.255.24</td> <td>80</td>
    'www.xici.net.co': r'''<td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>.+?<td>HTTP<\/td>''',
    # proxy.com.ru, e.g. <td>41.222.196.52</td><td>8080</td>
    'proxy.com.ru': r'''<td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>''',
    # free-proxy.cz, e.g.
    # </div> 117.166.75.36</td><td><span class="fport">8123</span></td><td><small>HTTP</small></td>
    'free-proxy.cz': r'''</div>\s*([\w\d\.]+?)<\/td><td><span\s+class="fport">(\d+)<\/span><\/td><td><small>HTTP<\/small><\/td>''',
    # checkerproxy.net, e.g. 122.227.8.190:80
    'checkerproxy.net': r'''([\w\d\.]+?)\:(\d+)''',
}


def _parse_cnproxy(html):
    """Return the (host, port) pairs listed on a www.cnproxy.com page.

    The page writes each port as ``document.write(":"+a+b...)`` where the
    letters are variables assigned in an inline script, so the variable
    table is read first and then used to rebuild every port string.
    """
    script = LazyFW.mid(html, '<SCRIPT type="text/javascript">', '</SCRIPT>')
    block = LazyFW.mid(html, '<div id="proxylisttb">', '<div class="proxylistnav">')

    js_vars = {}
    for statement in script.split(';'):
        # Fragments that are not a `x="..."` assignment (blank or
        # whitespace-only pieces, for instance) are skipped; indexing the
        # empty match list used to raise and discard the whole page.
        assignment = re.search(r'''(^[a-z])\="([^"]+)"''', statement.strip(), re.I)
        if assignment is not None:
            js_vars[assignment.group(1)] = assignment.group(2)

    host_lists = re.compile(r'''<td>([^<]+)<SCRIPT[^>]*>document\.write\("\:"([^)]+)\)</SCRIPT></td>''',
                            re.I).findall(block)
    pairs = []
    for host, port_expr in host_lists:
        # port_expr looks like `+a+b+c`; an unknown variable raises KeyError,
        # which the caller reports as a failed fetch.
        port = ''.join([js_vars[name] for name in port_expr.lstrip('+').split('+')])
        pairs.append((host, port))
    return pairs


def fetch_proxy(url):
    """Fetch a proxy listing page and store every proxy found on it.

    The page is downloaded with ``requests``; when the response status is
    200 or 304 and the host is one of the supported listing sites, the
    (host, port) pairs are extracted and handed one by one to
    ``proxy_insert``.  A summary line with the number of successful inserts
    is logged through ``LazyFW.log``.

    Args:
        url: Address of the listing page.

    Returns:
        False when any exception occurred while fetching, parsing or
        inserting; True otherwise (including for an unsupported host or a
        non-200/304 response, in which case nothing is inserted).
    """
    LazyFW.log("Fetch URL: %s" % (url))
    try:
        matches = None
        urls = urlparse(url)
        r = requests.get(url, timeout=PROXY_TIMEOUT, headers={
            'User-Agent': LazyFW.USER_AGENT,
            'Referer': 'http://%s' % (urls.hostname)
        })

        if r.status_code in (200, 304):
            html = LazyFW.clear_space(r.text)

            if urls.hostname == 'www.cnproxy.com':
                matches = _parse_cnproxy(html)
            elif urls.hostname in _PROXY_LIST_PATTERNS:
                matches = re.compile(_PROXY_LIST_PATTERNS[urls.hostname], re.I).findall(html)

        if matches is not None:
            length = 0
            for proxy in matches:
                # NOTE(review): the meaning of 999999 and the return type of
                # proxy_insert are not visible here, so the original equality
                # test is kept rather than loosened to plain truthiness.
                if proxy_insert(proxy, 999999) == True:  # noqa: E712
                    length += 1

            LazyFW.log("Add Proxy: %d/%d %s" % (length, len(matches), url))

    except Exception as exc:
        # Still best-effort, but leave a trace instead of failing silently.
        LazyFW.log("Fetch URL failed: %s (%r)" % (url, exc))
        return False

    return True