def FetchLinksFromSource(self, url, htmlSource):
    """Download the sinaimg.cn JPEGs of comments whose 'oo' count beats 'xx'.

    The section of the page between the ``<!-- begin comments -->`` and
    ``<!-- end comments -->`` markers is split into one text block per
    ``<li id="comment-N">`` element. For each block the integers following
    ``oo [`` and ``xx [`` are read (presumably the up/down vote counters --
    confirm against the target site) and, when ``oo`` is the larger one,
    every matching image URL in the block is passed to
    ``self.download_file``.

    :param url: address of the page; forwarded to ``LazyFW.format_url`` and
        ``LazyFW.html2text``.
    :param htmlSource: raw HTML of the page.
    :raises ValueError: if a comment block has no integer after ``oo [`` or
        ``xx [`` (propagated from ``int``).
    """
    html = LazyFW.format_url(url, htmlSource)
    html = LazyFW.clear_space(html)
    list_body = LazyFW.mid(html, '<!-- begin comments -->', '<!-- end comments -->')

    # A comment starts with e.g.: <li id="comment-2792510"> <div>
    # Tag the start and end of every comment with [BEGIN]/[END] headings so
    # the individual comments can be cut apart again after the text
    # conversion below.
    list_body = re.sub(r'''(<li\s+id="comment\-[\d]+">\s*<div>)''',
                       r'''\1<h1>[BEGIN]</h1>''', list_body, flags=re.I)
    list_body = re.sub(r'''(</li>)''', r'''<h1>[END]</h1>\1''', list_body, flags=re.I)

    list_text = LazyFW.html2text(url, list_body, {
        'ignore_links': True,
        'ignore_images': False,
    })

    # re.findall always returns a list (possibly empty), never None.
    for block in re.findall(r'''\[BEGIN\](.+?)\[END\]''', list_text, re.S):
        oo = int(LazyFW.mid(block, 'oo [', ']'))
        xx = int(LazyFW.mid(block, 'xx [', ']'))
        # oo > xx already implies that oo exceeds the mean of the two
        # counters, so a single comparison is sufficient.
        if oo <= xx:
            continue
        images = re.findall(r'''(http\://[^\.]+\.sinaimg\.cn/(.+?)\.jpg)''', block, re.I)
        # Each match is (full_url, path_without_extension); only the full
        # URL is needed for the download.
        for image in images:
            self.download_file(image[0])
# Listing pages that can be scraped with one regex yielding (host, port).
_PROXY_ROW_PATTERNS = {
    'www.cz88.net': re.compile(
        r'''<tr><td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>''', re.I),
    # <td>119.233.255.24</td> <td>80</td> ... <td>HTTP</td>
    'www.xici.net.co': re.compile(
        r'''<td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>.+?<td>HTTP<\/td>''', re.I),
    # <td>41.222.196.52</td><td>8080</td>
    'proxy.com.ru': re.compile(
        r'''<td>([\w\d\.]+?)<\/td>\s*<td>(\d+)<\/td>''', re.I),
    # </div> 117.166.75.36</td><td><span class="fport">8123</span></td><td><small>HTTP</small></td>
    'free-proxy.cz': re.compile(
        r'''</div>\s*([\w\d\.]+?)<\/td><td><span\s+class="fport">(\d+)<\/span><\/td><td><small>HTTP<\/small><\/td>''',
        re.I),
    # 122.227.8.190:80
    'checkerproxy.net': re.compile(r'''([\w\d\.]+?)\:(\d+)''', re.I),
}

# One JavaScript assignment of the form  x="1"
_CNPROXY_JS_VAR = re.compile(r'''(^[a-z])\="([^"]+)"''', re.I)
# One table cell: host followed by document.write(":"+a+b+c)
_CNPROXY_ROW = re.compile(
    r'''<td>([^<]+)<SCRIPT[^>]*>document\.write\("\:"([^)]+)\)</SCRIPT></td>''', re.I)


def _parse_cnproxy(html):
    """Return (host, port) tuples from a www.cnproxy.com listing page.

    The page writes each port with ``document.write(":"+a+b+...)`` where the
    single-letter variables are assigned in a leading script block; the port
    is rebuilt by substituting those variables.
    """
    script = LazyFW.mid(html, '<SCRIPT type="text/javascript">', '</SCRIPT>')
    block = LazyFW.mid(html, '<div id="proxylisttb">', '<div class="proxylistnav">')

    js_vars = {}
    for statement in script.split(';'):
        found = _CNPROXY_JS_VAR.findall(statement)
        # Fragments that are not a plain x="..." assignment (empty pieces,
        # stray whitespace, other statements) are ignored instead of
        # aborting the whole page with an IndexError.
        if found:
            name, value = found[0]
            js_vars[name] = value

    matches = []
    for host, port_expr in _CNPROXY_ROW.findall(block):
        tokens = port_expr.lstrip('+').split('+')
        try:
            port = ''.join([js_vars[token] for token in tokens])
        except KeyError:
            # Unknown variable in the port expression: skip this row only.
            continue
        matches.append((host, port))
    return matches


def fetch_proxy(url):
    """Scrape a proxy listing page and store every proxy found on it.

    The page is fetched with ``requests``; the parser is chosen from the
    URL's hostname. Every ``(host, port)`` pair found is handed to
    ``proxy_insert(proxy, 999999)`` and a summary line is logged.

    :param url: address of a supported proxy listing page.
    :returns: ``False`` if any exception occurred while fetching, parsing or
        inserting; ``True`` otherwise -- including when the response status
        is neither 200 nor 304 or the hostname is not a supported one.
    """
    LazyFW.log("Fetch URL: %s" % (url))
    try:
        urls = urlparse(url)
        r = requests.get(url, timeout=PROXY_TIMEOUT, headers={
            'User-Agent': LazyFW.USER_AGENT,
            'Referer': 'http://%s' % (urls.hostname)
        })
        if r.status_code not in (200, 304):
            return True

        html = LazyFW.clear_space(r.text)
        hostname = urls.hostname
        if hostname == 'www.cnproxy.com':
            matches = _parse_cnproxy(html)
        elif hostname in _PROXY_ROW_PATTERNS:
            matches = _PROXY_ROW_PATTERNS[hostname].findall(html)
        else:
            # Unsupported site: nothing to parse.
            return True

        length = 0
        for proxy in matches:
            # Strict comparison kept on purpose: proxy_insert's return type
            # is not visible here, so plain truthiness could count more.
            if proxy_insert(proxy, 999999) == True:
                length += 1
        LazyFW.log("Add Proxy: %d/%d %s" % (length, len(matches), url))
    except Exception as exc:
        # Best-effort scraper: report the failure instead of hiding it, then
        # signal it to the caller through the return value.
        LazyFW.log("Fetch proxy failed: %s (%s)" % (url, exc))
        return False
    return True