import re
import threading

import requests
from bs4 import BeautifulSoup as bs


def fetch_proxies_org():
    """Source: http://proxies.org/2014/05/ (and the sibling monthly archives)."""
    proxy_list = []

    def url_to_proxies(url, results, i):
        plist = []
        data = requests.get(url).text
        # Match "ip:port" pairs anywhere in the page body.
        items = re.findall(r"(?:\d+\.){3}\d+:\d+", data)
        if not items:
            results[i] = []
            return
        for item in items:
            ip, port = item.split(":")
            protocol = find_protocol(ip, port)
            plist.append((ip, port, protocol, "UN", "1001"))
        results[i] = plist

    # Fetch the seven archive pages (2014/02 through 2014/08) in parallel.
    threads = [None] * 7
    results = [None] * 7
    for i in range(7):
        url = "http://proxies.org/2014/0%s/" % (i + 2)
        threads[i] = threading.Thread(target=url_to_proxies, args=(url, results, i))
        threads[i].start()
    for i in range(7):
        threads[i].join()
    for i in range(7):
        proxy_list += results[i]

    # Deduplicate by IP, keeping the first occurrence. (The original removed
    # items from proxy_list while iterating over it, which skips elements;
    # building a new list avoids that bug.)
    seen = set()
    deduped = []
    for item in proxy_list:
        if item[0] not in seen:
            seen.add(item[0])
            deduped.append(item)
    return filter_fetch(deduped)
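
# find_protocol is called above but defined elsewhere in the project. A
# minimal, hedged stand-in is sketched below, assuming it classifies a proxy
# by relaying a probe request through it; the probe URL, timeout, and return
# labels are illustrative assumptions, not the project's actual implementation.
def find_protocol(ip, port):
    """Guess whether ip:port relays HTTPS, HTTP, or neither ("UN")."""
    proxies = {
        "http": "http://%s:%s" % (ip, port),
        "https": "http://%s:%s" % (ip, port),
    }
    try:
        # A proxy that can tunnel an HTTPS request is reported as HTTPS.
        requests.get("https://example.com/", proxies=proxies, timeout=5)
        return "HTTPS"
    except requests.RequestException:
        pass
    try:
        requests.get("http://example.com/", proxies=proxies, timeout=5)
        return "HTTP"
    except requests.RequestException:
        return "UN"
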
def fetch_letushide():
    """Scrape proxies from the paginated listings at http://letushide.com/."""
    proxy_list = []
    start_soup = bs(requests.get("http://letushide.com/").text, "html.parser")
    # The last link in the pager gives the total number of listing pages.
    num = int(start_soup.find(id="page").find_all("a")[-1].string)

    def soup_to_proxies(soup):
        plist = []
        trs = soup.find_all("tr", id="data")
        for tr in trs:
            tds = tr.find_all("td")
            # Materialize the cell strings; under Python 3, map() returns a
            # lazy iterator that cannot be indexed.
            items = [td.string for td in tds]
            plist.append([items[x] for x in (1, 2, 3, 4, 5)])
        return plist

    def url_to_proxies(url, results, i):
        soup = bs(requests.get(url).text, "html.parser")
        results[i] = soup_to_proxies(soup)

    # Page 1 is start_soup; fetch pages 2..num in parallel.
    threads = [None] * (num - 1)
    results = [None] * (num - 1)
    for i in range(len(threads)):
        url = "http://letushide.com/%d/list_of_free_proxy_servers" % (i + 2)
        threads[i] = threading.Thread(target=url_to_proxies, args=(url, results, i))
        threads[i].start()
    for i in range(len(threads)):
        threads[i].join()

    proxy_list += soup_to_proxies(start_soup)
    for i in range(len(threads)):
        proxy_list += results[i]
    return filter_fetch(proxy_list)
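
# filter_fetch is shared by both fetchers but defined elsewhere in the
# project. A hedged sketch, assuming it merely drops rows whose IP or port
# field is missing or malformed (the real checks may differ):
def filter_fetch(proxy_list):
    valid = []
    for item in proxy_list:
        ip, port = item[0], item[1]
        if ip and port and \
                re.match(r"^(?:\d{1,3}\.){3}\d{1,3}$", str(ip)) and \
                str(port).isdigit():
            valid.append(item)
    return valid


# Usage sketch (requires network access to the sites scraped above):
if __name__ == "__main__":
    proxies = fetch_proxies_org() + fetch_letushide()
    print("fetched %d proxies" % len(proxies))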