Example #1
import re
import threading

import requests


def fetch_proxies_org():
    """http://proxies.org/2014/05/"""

    proxy_list = []

    def url_to_proxies(url, results, i):
        # Each worker scrapes one monthly archive page and stores the
        # proxies it finds in results[i].
        plist = []
        data = requests.get(url).text
        items = re.findall(r"(?:\d+\.){3}\d+:\d+", data)
        for item in items:
            host, port = item.split(":")
            protocol = find_ptotocol(host, port)
            plist.append((host, port, protocol, "UN", "1001"))
        # Always store a list (possibly empty) so the aggregation below
        # never trips over a None entry.
        results[i] = plist

    # One worker thread per monthly archive page (2014/02 .. 2014/08).
    threads = [None] * 7
    results = [None] * 7
    for i in range(7):
        url = "http://proxies.org/2014/0%s/" % (i + 2)
        threads[i] = threading.Thread(
            target=url_to_proxies, args=(url, results, i))
        threads[i].start()
    for i in range(7):
        threads[i].join()

    # Collect every worker's results into a single list.
    for i in range(7):
        proxy_list += results[i]

    # Deduplicate by IP address. Removing items from a list while iterating
    # over it skips entries, so build a fresh list instead.
    seen_ips = set()
    deduped = []
    for item in proxy_list:
        if item[0] not in seen_ips:
            seen_ips.add(item[0])
            deduped.append(item)
    proxy_list = deduped

    #print "proxies org len:", len(proxy_list)
    return filter_fetch(proxy_list)
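
Both examples call two helpers that are defined elsewhere in the source module and do not appear in these listings: find_ptotocol (which guesses the protocol of a host/port pair) and filter_fetch (which validates the scraped proxies). To run the snippet in isolation, minimal stand-ins such as the following could be used; the names match the calls above, but the bodies here are only assumptions, not the original implementations.

# Hypothetical stand-ins so the example runs on its own; the real helpers
# live elsewhere in the source module and may behave differently.
def find_ptotocol(host, port):
    # Assumption: treat the common TLS ports as HTTPS, everything else as HTTP.
    return "https" if port in ("443", "8443") else "http"


def filter_fetch(proxy_list):
    # Assumption: the real function probes each proxy; here we only drop
    # entries whose port is obviously invalid.
    return [p for p in proxy_list if p[1].isdigit() and 0 < int(p[1]) <= 65535]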
Example #2
import threading

import requests
from bs4 import BeautifulSoup as bs


def fetch_letushide():
    """http://letushide.com/"""

    #print "start fetch_letushide"
    proxy_list = []

    start_soup = bs(requests.get("http://letushide.com/").text)
    num = int(start_soup.find(id="page").find_all("a")[-1].string)

    def soup_to_proxies(soup):
        # Each <tr id="data"> row holds one proxy; columns 1-5 carry the
        # fields we keep (host, port, protocol, anonymity, speed).
        plist = []
        trs = soup.find_all("tr", id="data")
        for tr in trs:
            tds = tr.find_all("td")
            items = [td.string for td in tds]
            plist.append(items[1:6])
        return plist

    def url_to_proxies(url, results, i):
        soup = bs(requests.get(url).text, "html.parser")
        results[i] = soup_to_proxies(soup)

    # Page 1 was already fetched above; spawn one thread per remaining page.
    threads = [None] * (num - 1)
    results = [None] * (num - 1)

    for i in range(len(threads)):
        url = "http://letushide.com/" + \
            str(i + 2) + "/list_of_free_proxy_servers"
        threads[i] = threading.Thread(
            target=url_to_proxies, args=(url, results, i))
        threads[i].start()
    for i in range(len(threads)):
        threads[i].join()

    # Combine page 1 (already parsed) with the pages fetched by the workers.
    proxy_list += soup_to_proxies(start_soup)
    for i in range(len(threads)):
        proxy_list += results[i]

    return filter_fetch(proxy_list)
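
Both fetchers follow the same pattern: scrape pages in parallel threads, aggregate the rows into proxy_list, then hand everything to filter_fetch. A minimal sketch of how the two might be driven together, assuming the stand-in filter_fetch shown after Example #1 (or the module's real one) is available; the fetch_all name and the error handling around the network calls are assumptions, not part of the original module.

# Sketch of running both fetchers and merging their output.
def fetch_all():
    proxies = []
    for fetch in (fetch_proxies_org, fetch_letushide):
        try:
            proxies += fetch()
        except Exception as exc:  # either site may be down or have changed layout
            print("skipping %s: %s" % (fetch.__name__, exc))
    return proxies


if __name__ == "__main__":
    print("total proxies fetched:", len(fetch_all()))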