def main():
    """
    If something goes wrong, I should not have to crawl from scratch again.
    Change the variable 'start' and the crawl resumes from 'start'.
    I also try to keep every page's status visible in the output.
    """
    global proxy
    start = 1
    # for every group of 500 pages, try a different array of proxies
    for group in range(0, 500):
        i = group * 500 + 1
        while i <= group * 500 + 500:
            # skip ahead until 'i' reaches 'start'
            if start > group * 500 + 500:
                break
            while i < start:
                i += 1
            try:
                urla = urls + str(i)
                html = etree.HTML(url_to_str(urla))
                title = get_title(html)
                # if the image is not found, the title is '404 - 妹子图'
                if '404' in title:
                    print('404 at ' + str(i))
                    i += 1
                    continue
                title = clear_dir(title)
                last_page = get_last_page(html)
                # start at page 1 and end at page 'last_page'
                for j in range(1, last_page + 1):
                    urlb = urla + '/' + str(j)
                    src = get_img_src(etree.HTML(url_to_str(urlb)))
                    download(src, title, urlb)
                print('succeed at ' + str(i))
            # the next three types of error can be ignored and do not matter,
            # although I would like to handle them properly :)
            except FileExistsError:
                print('FileExistsError at ' + str(i))
            except AttributeError:
                print('AttributeError at ' + str(i))
            except IndexError:
                print('IndexError at ' + str(i))
            except Exception as e:
                print('Error at ' + str(i))
                raise e
            i += 1
        # switch to a fresh proxy pool before the next group
        proxy = get_proxy(random.sample(proxy, 1)[0])
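# A minimal sketch of the resume arithmetic used in main() above, assuming the
# same group size of 500. The helper name 'pages_in_group' is illustrative and
# not part of the original script; it only shows which page numbers a group
# visits once 'start' is taken into account.
def pages_in_group(group, start, size=500):
    first = group * size + 1   # first page of this group
    last = (group + 1) * size  # last page of this group
    return range(max(first, start), last + 1)

# e.g. pages_in_group(0, start=42) yields pages 42..500,
#      pages_in_group(1, start=42) yields pages 501..1000.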
def url_to_str(url, times=0):
    """
    I will give you ten chances. If you still fail, I... I will cry.
    """
    global proxy
    try:
        # pick one proxy from the pool; requests expects it in 'proxies',
        # not in the request headers
        addr = 'http://' + random.sample(proxy, 1)[0]
        html = requests.get(url,
                            headers={'User-Agent': user_agent},
                            proxies={'http': addr, 'https': addr},
                            timeout=10)
        if html.status_code != 200:
            raise ConnectionError(url)
        return html.text
    except Exception as e:
        if times >= 10:
            raise e
        time.sleep(1)
        try:
            # refresh the proxy pool through one of the remaining proxies
            proxy = get_proxy(random.sample(proxy, 1)[0])
        except Exception:
            pass
        return url_to_str(url, times + 1)
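# Hedged usage sketch for url_to_str(): once the ten retries are exhausted the
# last exception is re-raised, so a caller that must keep running can wrap the
# call like this. 'fetch_or_none' is an illustrative helper, not part of the
# original crawler.
def fetch_or_none(url):
    try:
        return url_to_str(url)
    except Exception as e:
        print('giving up on ' + url + ': ' + repr(e))
        return None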
import os
import random
import requests
from time import clock
from lxml import etree
from pxy import get_proxy
from bs4 import BeautifulSoup

user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
proxy = get_proxy()


def url_to_soup(url):
    # the proxy belongs in 'proxies', not in the request headers
    addr = 'http://' + random.sample(proxy, 1)[0]
    html = requests.get(url,
                        headers={'User-Agent': user_agent},
                        proxies={'http': addr, 'https': addr})
    return BeautifulSoup(html.text, 'lxml')


def url_to_str(url):
    soup = url_to_soup(url)
    return soup.prettify()


def find_file_name(url):
    """
    find the file name from the url
    for example:
        url: https://www.baidu.com/robots.txt
from pxy import get_proxy

arr = get_proxy()
for ele in arr:
    print(ele)

"""
output:
82.202.68.26:8095
182.165.175.46:8151
142.93.199.216:8162
35.235.85.91:8939
159.203.58.149:8624
178.128.111.246:8349
181.112.34.222:8956
172.104.118.230:8499
200.196.240.169:8304
159.89.126.130:8293
190.186.59.22:8363
24.124.113.53:8602
121.152.17.96:8867
124.158.4.3:8931
203.153.109.34:8427
190.46.95.54:8578
206.189.222.146:9001
112.27.129.54:8942
181.196.17.50:8304
131.117.215.19:8580

time: 2018-08-31 20:00
"""
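# Hedged sketch of how one "ip:port" entry from get_proxy() is handed to
# requests: the entries carry no scheme, so the 'http://' prefix below is an
# assumption, and 'fetch_via_random_proxy' is an illustrative helper rather
# than part of the pxy module.
import random
import requests

def fetch_via_random_proxy(url, proxy_pool):
    addr = 'http://' + random.choice(proxy_pool)
    return requests.get(url, proxies={'http': addr, 'https': addr}, timeout=10)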