Code Example #1
def main():
    """
	if something is wrong, I should not crawl at start from scratch.
	Change the variable 'start' then I can begin from 'start'.
	And I try to make every page in visible status.
	"""
    start = 1
    # every group of 500 pages is crawled with a different, freshly fetched set of proxies
    for group in range(0, 500):
        i = group * 500 + 1
        while i <= group * 500 + 500:
            # resume support: skip this whole group if 'start' lies beyond it,
            # otherwise fast-forward 'i' up to 'start'
            if start > group * 500 + 500:
                break
            while i < start:
                i += 1
            try:
                urla = urls + str(i)
                html = etree.HTML(url_to_str(urla))
                title = get_title(html)
                # if the gallery does not exist, the page title is '404 - 妹子图'
                if '404' in title:
                    print('404 at ' + str(i))
                    i += 1
                    continue
                title = clear_dir(title)
                last_page = get_last_page(html)
                # start at page 1 and end at page 'last_page'
                for j in range(1, last_page + 1):
                    urlb = urla + '/' + str(j)
                    src = get_img_src(etree.HTML(url_to_str(urlb)))
                    download(src, title, urlb)
                print('succeed at ' + str(i))
            # the next three error types are logged and skipped; they do not
            # stop the crawl, even though proper handling would be nicer :)
            except FileExistsError:
                print('FileExistsError at ' + str(i))
            except AttributeError:
                print('AttributeError at ' + str(i))
            except IndexError:
                print('IndexError at ' + str(i))
            except Exception as e:
                print('Error at ' + str(i))
                raise e
            i += 1
        # fetch a fresh proxy list (through one of the current proxies) before the next group
        global proxy
        proxy = get_proxy(random.sample(proxy, 1)[0])
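The resume logic above (skip whole groups that lie before 'start', then fast-forward 'i' to 'start') is easy to misread, so here is a minimal standalone sketch of the same index arithmetic; the function name and generator style are illustrative assumptions, not code from the project.

# Hypothetical illustration of main()'s grouping/resume arithmetic:
# pages are numbered 1..groups*group_size, and everything before 'start'
# is skipped without being fetched.
def page_groups(start=1, groups=500, group_size=500):
    for group in range(groups):
        first = group * group_size + 1
        last = first + group_size - 1
        if last < start:                 # this whole group was crawled already
            continue
        for i in range(max(first, start), last + 1):
            yield group, i               # the caller rotates proxies after each group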
Code Example #2
def url_to_str(url, times=0):
    """
	I will give ten chances to you.
	If you fail still, I..I will cry.
	"""
    global proxy
    try:
        # requests takes the proxy via its 'proxies' argument, not the headers
        proxy_addr = random.sample(proxy, 1)[0]
        html = requests.get(url,
                            headers={'User-agent': user_agent},
                            proxies={'http': 'http://' + proxy_addr,
                                     'https': 'http://' + proxy_addr})
        if html.status_code != 200:
            raise ConnectionError(url)
        return html.text
    except Exception as e:
        if times >= 10:
            raise e
        time.sleep(1)
        try:
            # refresh the proxy list through one of the current proxies
            proxy = get_proxy(random.sample(proxy, 1)[0])
        except Exception:
            pass
        return url_to_str(url, times + 1)
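A hedged usage sketch (the caller and URL below are assumptions, not part of the project): after ten failed attempts url_to_str re-raises the last exception, so a caller that wants to keep crawling has to catch it itself.

# Hypothetical caller showing how the re-raised exception can be absorbed.
try:
    page = url_to_str('https://example.com/gallery/123')
except Exception as exc:
    print('giving up on this page after 10 retries:', exc)
    page = None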
Code Example #3
import os
import random
import requests
from time import clock
from lxml import etree
from pxy import get_proxy
from bs4 import BeautifulSoup

user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
proxy = get_proxy()


def url_to_soup(url):
    # requests takes the proxy via its 'proxies' argument, not the headers
    proxy_addr = random.sample(proxy, 1)[0]
    html = requests.get(url,
                        headers={'User-agent': user_agent},
                        proxies={'http': 'http://' + proxy_addr,
                                 'https': 'http://' + proxy_addr})
    return BeautifulSoup(html.text, 'lxml')


def url_to_str(url):
    soup = url_to_soup(url)
    return soup.prettify()


def find_file_name(url):
    """
	find the file name from the url
	for example:
		url: https://www.baidu.com/robots.txt
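The listing is cut off above. For orientation only, a minimal sketch of what a helper matching that docstring might look like (an assumption, not the project's actual find_file_name):

from urllib.parse import urlparse

def find_file_name_sketch(url):
    # e.g. 'https://www.baidu.com/robots.txt' -> 'robots.txt'
    return urlparse(url).path.rsplit('/', 1)[-1]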
Code Example #4
File: test.py  Project: ConanYu/MyCrawl
from pxy import get_proxy
arr = get_proxy()
for ele in arr:
    print(ele)
"""
output:
82.202.68.26:8095
182.165.175.46:8151
142.93.199.216:8162
35.235.85.91:8939
159.203.58.149:8624
178.128.111.246:8349
181.112.34.222:8956
172.104.118.230:8499
200.196.240.169:8304
159.89.126.130:8293
190.186.59.22:8363
24.124.113.53:8602
121.152.17.96:8867
124.158.4.3:8931
203.153.109.34:8427
190.46.95.54:8578
206.189.222.146:9001
112.27.129.54:8942
181.196.17.50:8304
131.117.215.19:8580

time:
2018-08-31 20:00
"""