import logging
import random
import time

import requests
from bs4 import BeautifulSoup

# spider_retry_num, spider_timeout, crawl_interval_mintime, crawl_interval_maxtime,
# _proxy(), get_proxy_headers() and get_headers() come from the project's
# config / helper modules (not shown in this snippet).


def retry_crawl(url, is_proxy):
    response = None
    logging.error('Crawl failed! Retrying page {}'.format(url))
    for i in range(spider_retry_num):
        logging.error('Retry attempt {}'.format(i + 1))
        try:
            if is_proxy:
                proxy = _proxy()
                print('Using proxy {} to crawl page {}'.format(proxy, url))
                response = requests.get(url, headers=get_proxy_headers(proxy),
                                        proxies=proxy, timeout=spider_timeout)
            else:
                response = requests.get(url, headers=get_headers(), timeout=spider_timeout)
        except requests.exceptions.ProxyError as e:
            # logging.exception(e)
            logging.error(e)
            continue
        except requests.exceptions.ConnectTimeout as e:
            # logging.exception(e)
            logging.error(e)
            continue
        soup = BeautifulSoup(response.text, 'lxml')
        com_all_info = soup.find_all(class_='m_srchList')
        _response = response.text
        if len(com_all_info) > 0:
            break
        # elif '<script>window.location.href=' in _response:  # verification link shown when requests are too frequent
        #     verify_url = re.findall("<script>window.location.href='(.*?)';</script>", _response)[0]
        #     print('Qichacha flagged the frequent requests as a crawler; open this link manually to verify: {}'.format(verify_url))
        #     # verify(verify_url)
        #     time.sleep(20)
        else:
            logging.error('================= unexpected response =================')
            logging.error(response.text)
        time.sleep(random.randint(crawl_interval_mintime, crawl_interval_maxtime))
    return response
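# get_headers() in the snippet above is expected to return a plain dict of
# request headers. Its implementation is not shown anywhere in these snippets,
# so the following is only a minimal sketch of such a helper, rotating through
# a made-up User-Agent list; the real project may do something different.
import random

_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
]


def get_headers():
    # Pick a different User-Agent on every call so requests look less uniform.
    return {"User-Agent": random.choice(_USER_AGENTS)}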
def __init__(self, name=None, mode='rb', fileobj=None):
    if mode != 'rb':
        raise NotImplementedError("currently the only supported mode is 'rb'")
    self._fileobj = fileobj or io.open(name, mode)
    self._header_range, self._headers = get_headers(self._fileobj)
    self._ownes_fd = fileobj is not None
def __init__(self, format_url, num_articles, start=1, step=1):
    logging.basicConfig(filename='log', filemode='w', level=logging.INFO)
    self.logger = logging.getLogger(__name__)

    # URL with {} in place of the page number parameter,
    # e.g. ...&page=3 becomes ...&page={}
    self._format_url = format_url
    # Which page to start on
    self._start = start
    # How much to step the URL parameter by each time (usually 1)
    self._step = step
    # The number of articles to download before stopping
    self._num_articles = num_articles
    # Get headers with a randomly chosen user agent.
    self.headers = get_headers()
    # Set the initial Referer to the homepage of the scraped site.
    self.headers['Referer'] = 'http://www.{}'.format(
        urlparse.urlparse(format_url).hostname)
import json
import os
import re

import requests

# Imports above cover this function and download_one() below; get_headers()
# (and the commented-out get_success_proxies()) come from the project's
# helper modules, which are not shown in this snippet.


def get_images_set_and_save_ctime(uid: int) -> set:
    images_set = set()
    images_doc_list_api = "https://api.vc.bilibili.com/link_draw/v1/doc/doc_list"
    headers = get_headers()
    # proxies = get_success_proxies()

    # Load the ctime (creation time) of the newest image seen on the last run, if any.
    if os.path.exists(f"{uid}.ctime"):
        with open(f"{uid}.ctime", 'r') as f:
            ctime = f.read()
    else:
        ctime = 0
    ctime = int(ctime) if ctime else 0

    # The first page is handled separately: it carries the newest ctime.
    current_page_num = 0
    while True:
        params = {
            "uid": uid,
            "page_num": current_page_num,
            "page_size": 30,
            "biz": "all",
        }
        html = requests.get(images_doc_list_api, headers=headers, params=params)
        html.encoding = 'utf-8'
        if html.status_code != 200:
            print("Unexpected status code while fetching the image list!", html.status_code)
            return None
        result = json.loads(html.text)
        items = result['data']['items']
        # An empty page means there is nothing left to fetch.
        if not items:
            return images_set
        if current_page_num == 0:
            new_ctime = items[0]['ctime']
            new_ctime = int(new_ctime)
            if new_ctime > ctime:
                with open(f"{uid}.ctime", 'w') as f:
                    f.write(str(new_ctime))
            else:
                print("Images from a previous run already exist; no need to fetch the image set!")
                return None
        current_page_num = current_page_num + 1
        for item in items:
            # Skip items that were already saved on a previous run.
            current_ctime = int(item['ctime'])
            if current_ctime < ctime:
                continue
            pictures = item['pictures']
            for picture in pictures:
                images_set.add(picture['img_src'])
    return images_set
def download_one(url: str, uid: int):
    headers = get_headers()
    # Extract the file name and extension from the image URL.
    result = re.search(r"([\w]+)\.(bmp|jpg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|image)", url)
    if not result:
        print("Failed to match the image format!", url)
        return None
    file_name = result.group(1)
    ext = result.group(2)
    file_name = f"{file_name}.{ext}"
    html = requests.get(url, headers=headers)
    if html.status_code != 200:
        print("Image download failed!", html.status_code)
        return None
    if not os.path.exists(f"images/{uid}/"):
        os.makedirs(f"images/{uid}/")
    with open(f"images/{uid}/{file_name}", 'wb') as f:
        f.write(html.content)
    print("Image download finished", url)
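# A hedged usage sketch tying the two functions above together: collect the
# image URLs for a user, then download each one. download_all() and the uid
# value below are illustrative names, not part of the original project.
def download_all(uid: int):
    images = get_images_set_and_save_ctime(uid)
    if not images:
        print("Nothing new to download.")
        return
    for img_url in images:
        download_one(img_url, uid)


# Example: download_all(123456)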
import numpy as np

from headers import get_headers

## This is to get the connection to the NBA endpoints working correctly
## (hot fix until they push changes).
headers = get_headers()

# Make an API call for players across all seasons for a particular franchise.
from nba_api.stats.endpoints import franchiseplayers

## Get all players (list_teams and team_to_id are defined elsewhere in the project):
nba2019_2020players = {}
for team in list_teams:
    fp = franchiseplayers.FranchisePlayers(team_id=team_to_id[team], headers=headers)
    players = fp.get_data_frames()[0]
    active_players = players.loc[players['ACTIVE_WITH_TEAM'] == 1]
    ids_per_team = active_players['PERSON_ID'].values
    player_names = active_players['PLAYER'].values
    players_per_team = dict(zip(ids_per_team, player_names))
    nba2019_2020players.update(players_per_team)


## Determine whether the condition holds, or return True if it cannot be
## determined (no values to average).
def dc(values, sign, threshold):
    if len(values) > 0:
        if sign == 'l':
            return np.mean(values) < threshold
        elif sign == "g":
            return np.mean(values) > threshold
    else:
        return True


def classify_players(players):
    three_and_d_players = []
    facilitators = []
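# Illustrative calls to dc() above; the sample values are made up. With no
# samples the helper falls back to True, otherwise it compares the mean of the
# values against the threshold in the direction given by sign.
print(dc([], 'g', 10))        # True  -- no data, so the condition cannot be checked
print(dc([12, 14], 'g', 10))  # True  -- mean 13 is greater than 10
print(dc([12, 14], 'l', 10))  # False -- mean 13 is not less than 10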
import requests, time, random
from headers import get_headers
import data_selector

headers_getter = get_headers()
proxy_pool = ["23.83.243.75:8989", "172.93.33.29:8989", "138.128.199.17:8989"]


# Pick a random proxy from the proxy pool.
def random_proxy():
    index = random.randint(0, len(proxy_pool) - 1)
    return {"http": proxy_pool[index], "https": proxy_pool[index]}


# Random delay between requests.
def delay():
    delay_time = random.randint(7, 25)
    # print(delay_time)
    time.sleep(delay_time)


# Pre-browse the site to obtain request headers that pass the anti-crawling check.
def pre_requests():
    delay()
    session = requests.session()
    session.headers.update(headers_getter.get_headers())
    r = session.get("https://angel.co/companies", proxies=random_proxy())
    csrf_token = data_selector.csrf_token(r.text)
    session.headers.update({
        "referer": "https://angel.co/companies",
        "csrf-token": csrf_token[0],
error_data_list = []
i += 1
start_url = base_url + str(name)
# print(start_url)
try:
    print("Crawling company No. {} =========================== {}".format(i, name))
    if is_proxy:
        proxy = _proxy()
        print('Using proxy {} to crawl page {}'.format(proxy, start_url))
        try:
            response = requests.get(start_url, headers=get_proxy_headers(proxy), proxies=proxy,
                                    timeout=spider_timeout)
        except Exception as e:
            response = retry_crawl(start_url, is_proxy)
    else:
        try:
            response = requests.get(start_url, headers=get_headers(), timeout=spider_timeout)
        except Exception as e:
            response = retry_crawl(start_url, is_proxy)
    # if response.status_code != 200:
    #     error_data_list.append(name)
    #     print("Crawling page {} returned {}; likely blocked by Qichacha's anti-crawling!".format(start_url, response.status_code))
    #     continue

    # Get the link with the company's filtered details.
    if response is not None:
        search_url = get_detail_url(start_url, response, is_proxy)
        if search_url is None:
            # print('Requests to Qichacha were too frequent and got blocked by its anti-crawler; wait a while before retrying.')
            # error_data_list.append(name)
            # break
            raise RuntimeError('Requests to Qichacha were too frequent and got blocked by its anti-crawler; wait a while before retrying.')