def validate_ip(proxy):
    """Validate a proxy by making a request through it."""
    headers = {'User-Agent': get_random_useragent()}
    # Proxy settings; register the proxy for both schemes, otherwise the
    # https validation request below would bypass the proxy entirely
    proxy_handler = urllib.request.ProxyHandler({'http': proxy, 'https': proxy})
    opener = urllib.request.build_opener(proxy_handler)
    urllib.request.install_opener(opener)
    # URL used for validation
    validate_url = 'https://www.baidu.com'
    req = urllib.request.Request(url=validate_url, headers=headers)
    # Throttle before sending the request
    time.sleep(4)
    # Check the result
    try:
        res = urllib.request.urlopen(req)
        # Throttle before reading the response
        time.sleep(2)
        content = res.read()
        # Record working proxies to a file
        if content and res.status == 200:
            print('%s is ok' % proxy)
            write('./proxy.txt', proxy)
        else:
            # Proxy failed validation
            print('%s is not ok' % proxy)
    except urllib.error.URLError as e:
        print('%s error %s' % (proxy, e.reason))
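# validate_ip() and the functions below rely on a get_random_useragent()
# helper, and validate_ip() also on a write() helper, neither of which is
# shown in this file. A minimal sketch of what they might look like (both
# implementations are assumptions, not the original code):

import random

USER_AGENTS = [
    # Assumed sample pool; extend with real UA strings as needed
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
]

def get_random_useragent():
    """Pick a random User-Agent string from the pool."""
    return random.choice(USER_AGENTS)

def write(path, line):
    """Append a single line to the given file."""
    with open(path, 'a', encoding='utf-8') as fw:
        fw.write(line + '\n')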
def main():
    target_url = 'https://www.jianshu.com/c/yD9GAd?utm_medium=index-collections&utm_source=desktop'
    server_url = 'https://www.jianshu.com'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'www.jianshu.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': get_random_useragent()
    }
    session = requests.session()
    # Warm up the session so it carries the site's cookies
    session.get('https://www.jianshu.com', headers=headers)
    print(session.cookies.get_dict())
    resp = session.get(target_url, headers=headers)
    print(resp.cookies.get_dict())
    title_dict = get_title_url(resp.text)
    for title, url in title_dict.items():
        r = session.get(server_url + url, headers=headers)
        print(r.cookies.get_dict())
        content = get_content(r.text)
        with open('./' + title + '.txt', 'w', encoding='utf-8') as fw:
            fw.write(content)
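# main() above relies on two parser helpers, get_title_url() and
# get_content(), that are not defined here. A minimal sketch, assuming
# BeautifulSoup and guessing at Jianshu's markup (both CSS selectors are
# assumptions):

from bs4 import BeautifulSoup

def get_title_url(html):
    """Map article titles to their relative URLs on the collection page."""
    soup = BeautifulSoup(html, 'html.parser')
    title_dict = {}
    # 'a.title' is an assumed selector for article links
    for a in soup.select('a.title'):
        title_dict[a.get_text(strip=True)] = a.get('href')
    return title_dict

def get_content(html):
    """Extract the article body as plain text."""
    soup = BeautifulSoup(html, 'html.parser')
    # 'div.show-content' is an assumed selector for the article body
    node = soup.select_one('div.show-content')
    return node.get_text('\n', strip=True) if node else ''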
def download(url, referer):
    try:
        headers = {'User-Agent': get_random_useragent(), 'Referer': referer}
        resp = requests.get(url, headers=headers)
        if resp.status_code == 200:
            return resp
        else:
            return None
    except BaseException as e:
        print(e)
        return None
def main():
    headers = {
        'User-Agent': get_random_useragent(),
        'Host': 'www.lagou.com',
        'Referer': 'www.lagou.com',
        'Upgrade-Insecure-Requests': '1',
    }
    target_url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E6%88%90%E9%83%BD&needAddtionalResult=false'
    # Grab the session cookies before hitting the Ajax endpoint
    session = requests.session()
    session.get('https://www.lagou.com', headers=headers)
    headers['Referer'] = 'https://www.lagou.com/jobs/list_python?city=%E6%88%90%E9%83%BD&cl=false&fromSearch=true&labelWords=&suginput='
    headers['X-Anit-Forge-Code'] = '0'
    headers['X-Anit-Forge-Token'] = 'None'
    headers['X-Requested-With'] = 'XMLHttpRequest'
    positions = []
    for x in range(1, 8):
        data = {
            'first': 'true',
            'pn': x,
            'kd': 'python'
        }
        resp = session.post(target_url, headers=headers, data=data)
        json_data = resp.json()
        page_position = json_data['content']['positionResult']['result']
        for position in page_position:
            position_dict = {
                'positionId': position['positionId'],
                'positionName': position['positionName'],
                'positionDesc': crawl_positiondesc(session, position['positionId']),
                'education': position['education'],
                'city': position['city'],
                'createTime': position['createTime'],
                'companyShortName': position['companyShortName'],
                'financeStage': position['financeStage'],
                'salary': position['salary'],
                'industryField': position['industryField'],
                'district': position['district'],
                'positionAdvantage': position['positionAdvantage'],
                'companySize': position['companySize'],
                'companyLabelList': position['companyLabelList'],
                'workYear': position['workYear'],
                'positionLables': position['positionLables'],
                'companyFullName': position['companyFullName'],
                'firstType': position['firstType'],
                'secondType': position['secondType'],
                'subwayline': position['subwayline'],
                'stationname': position['stationname']
            }
            positions.append(position_dict)
        # print(positions)
        # Throttle between pages to avoid triggering anti-crawler checks
        time.sleep(10)
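# main() above accumulates `positions` but never persists the list. A
# minimal sketch of saving the results once the loop finishes (the helper
# name and output path are assumptions):

import json

def save_positions(positions, path='./positions.json'):
    """Dump the scraped position dicts to a JSON file."""
    with open(path, 'w', encoding='utf-8') as fw:
        json.dump(positions, fw, ensure_ascii=False, indent=2)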
def crawl_positiondesc(session, p_id):
    url = 'https://www.lagou.com/jobs/%s.html' % p_id
    headers = {
        'User-Agent': get_random_useragent(),
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        'Upgrade-Insecure-Requests': '1'
    }
    resp = session.get(url=url, headers=headers)
    # The body is mis-decoded as ISO-8859-1 by default, so re-decode as UTF-8.
    # Return the text so main() can store it as positionDesc (the original
    # only printed it, which left positionDesc as None).
    return resp.text.encode('iso-8859-1').decode('utf-8')
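# crawl_positiondesc() returns the full detail-page HTML; in practice one
# usually extracts just the description block. A minimal sketch, assuming
# BeautifulSoup and a 'dd.job_bt' container (the selector is an assumption):

from bs4 import BeautifulSoup

def extract_desc(html):
    """Pull the plain-text job description out of a detail page."""
    soup = BeautifulSoup(html, 'html.parser')
    node = soup.select_one('dd.job_bt')
    return node.get_text('\n', strip=True) if node else ''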
def download(self, url):
    try:
        headers = {
            'User-Agent': get_random_useragent(),
        }
        resp = requests.get(url, headers=headers)
        if resp.status_code == 200:
            return resp.text
        else:
            return None
    except BaseException as e:
        print(e)
        return None
def download(self, url):
    """
    Download the page at the given url.
    :param url: url
    :return: html string
    """
    try:
        headers = {'User-Agent': get_random_useragent()}
        resp = requests.get(url, headers=headers)
        return resp.text
    except BaseException as e:
        print(e)
        return None
def download(self, url, referer):
    headers = {
        'Host': 'www.mmjpg.com',
        'Referer': referer,
        'User-Agent': get_random_useragent()
    }
    try:
        resp = requests.get(url=url, headers=headers)
        if resp.status_code == 200:
            return resp.text
        return None
    except BaseException as e:
        print(e)
        return None
def download(url):
    headers = {
        'Host': 'www.xicidaili.com',
        'Referer': 'http://www.xicidaili.com/',
        'User-Agent': get_random_useragent()
    }
    try:
        resp = requests.get(url=url, headers=headers)
        if resp.status_code == 200:
            return resp.text
        return None
    except BaseException as e:
        print(e)
        return None
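# download() above fetches the xicidaili proxy-list page; feeding the result
# into validate_ip() requires parsing the HTML table first. A minimal sketch,
# assuming BeautifulSoup and the site's <tr>/<td> table layout (the column
# positions are assumptions):

from bs4 import BeautifulSoup

def parse_and_validate(html):
    """Extract ip:port pairs from the proxy table and validate each one."""
    soup = BeautifulSoup(html, 'html.parser')
    for tr in soup.select('table tr')[1:]:  # skip the header row
        tds = tr.find_all('td')
        if len(tds) >= 3:
            proxy = '%s:%s' % (tds[1].get_text(strip=True),
                               tds[2].get_text(strip=True))
            validate_ip(proxy)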
def downloader(url):
    """Download the page."""
    headers = {
        'Host': 'maoyan.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': get_random_useragent()
    }
    try:
        resp = requests.get(url=url, headers=headers)
        if resp.status_code == 200:
            return resp.text
        return None
    except RequestException as e:  # requires: from requests.exceptions import RequestException
        print('Request for %s failed' % url)
        print(e)
        return None
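# downloader() is presumably paired with a parser elsewhere. A minimal regex
# sketch for pulling (ranking, title) pairs off a Maoyan TOP100 board page
# (the HTML structure it assumes is a guess, not taken from this file):

import re

def parse_board(html):
    """Yield (ranking, title) tuples scraped from the board page."""
    pattern = re.compile(
        r'<i class="board-index[^"]*">(\d+)</i>.*?'
        r'<p class="name"><a[^>]*title="([^"]+)"', re.S)
    for index, title in pattern.findall(html):
        yield index, title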
def main():
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'www.jianshu.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': get_random_useragent()
    }
    session = requests.session()
    # The first request establishes the session cookies
    session.get('https://www.jianshu.com', headers=headers)
    print(session.cookies.get_dict())
    # Subsequent requests in the same session reuse those cookies
    resp1 = session.get('https://www.jianshu.com/p/cc3754e1a761', headers=headers)
    resp2 = session.get('https://www.jianshu.com/p/1f9663d82b58', headers=headers)
    print(resp1.cookies.get_dict())
    print(resp2.cookies.get_dict())
def download(self, session, url, referer=None):
    print(session.cookies.get_dict())
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Host': 'www.qiushibaike.com',
        'User-Agent': get_random_useragent()
    }
    if referer:
        headers['Referer'] = referer
    try:
        resp = session.get(url, headers=headers)
        return resp.text
    except BaseException as e:
        print('Request failed! Crawl failed!', e)
        return None
def downloader(url):
    """
    Downloader: fetch the page at the given url.
    :param url: url
    :return: the downloaded page as a string
    """
    headers = {
        'Host': 'www.weather.com.cn',
        'Referer': 'http://www.weather.com.cn/forecast/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': get_random_useragent()
    }
    try:
        resp = requests.get(url, headers=headers)
        if resp.status_code == 200:
            # The site serves UTF-8 but requests falls back to ISO-8859-1; re-decode
            return resp.text.encode('iso-8859-1').decode('utf-8')
    except BaseException as e:
        print(e)
    return None
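# A quick usage sketch for downloader(); the URL pattern matches
# weather.com.cn's city forecast pages, but the specific city code
# (101270101, believed to be Chengdu) is given only as an example:

if __name__ == '__main__':
    html = downloader('http://www.weather.com.cn/weather/101270101.shtml')
    if html:
        print(html[:200])  # show the first 200 characters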