def get_bing():
    """
    Get the Bing picture-of-the-day URL
    :return:
    """
    data = {'format': 'js', 'idx': 0, 'n': 1}
    try:
        response = HttpUtil.get_json(
            url='http://cn.bing.com/HPImageArchive.aspx', data=data)
        logging.debug(response)
    except Exception as e:
        logging.error("网络请求错误: %s", e)
        time.sleep(120)
        # Retry and return, so execution does not continue without a response
        return get_bing()
    images = response["images"]
    url = "http://cn.bing.com" + images[0]["url"].split("&")[0]
    # Build the target directory path
    directory = os.path.join(Constants.APP_DIRECTORY, "images")
    image_name = url.split("=")[1]
    # Build the absolute file path
    image_path = os.path.join(directory, image_name)
    # Download the image
    HttpUtil.download_file(url, directory, image_name)
    # Split off the extension (splitext keeps the dot); convert if it is not .bmp
    if os.path.splitext(image_name)[1] != ".bmp":
        image_path = FileUtil.image_to_bmp(image_path)
def download_chromedriver():
    """
    Download the chrome driver
    http://chromedriver.storage.googleapis.com/index.html
    :return:
    """
    # Fetch the list of version numbers
    url = "http://chromedriver.storage.googleapis.com/"
    result = BeautifulSoup(HttpUtil.get(url, {
        "delimiter": "/",
        "prefix": ""
    }).text, features="lxml")
    prefix = result.find_all("prefix")
    # Filter
    # info = [s.extract() for s in prefix('prefix')]
    local_version = get_local_version(prefix)
    # Fetch the list of files under that version
    driver_list = BeautifulSoup(HttpUtil.get(url, {
        "delimiter": "/",
        "prefix": local_version
    }).text, features="lxml")
    filename_list = driver_list.find_all("key")
    for s in filename_list:
        s = s.text
        # If the platform name appears in the file name
        if s.find(sys.platform) != -1:
            filename = s[len(local_version):]
            # Download the file
            HttpUtil.download_file(url + s, None, filename)
            FileUtil.zip_extract(filename, None)
def download_taobao_chromedriver():
    """
    Download chromedriver from the taobao mirror
    http://npm.taobao.org/mirrors/chromedriver
    :return:
    """
    # Fetch the list of version numbers
    url = "http://npm.taobao.org/mirrors/chromedriver/"
    result = BeautifulSoup(HttpUtil.get(url).text, features="lxml")
    prefix = result.find("pre").find_all("a")
    # Filter
    # info = [s.extract() for s in prefix('prefix')]
    local_version_url = url + get_local_version(prefix)
    # Fetch the list of files under that version
    driver_list = BeautifulSoup(HttpUtil.get(local_version_url).text,
                                features="lxml")
    filename_list = driver_list.find_all("a")
    for s in filename_list:
        s = s.text
        # If the platform name appears in the file name
        if s.find(sys.platform) != -1:
            # Download the file
            HttpUtil.download_file(local_version_url + s, None, s)
            FileUtil.zip_extract(s, None)
def search(self):
    url = 'http://www.wyl.cc/'
    home_soup = BeautifulSoup(HttpUtil.request(url))
    target_url = home_soup.find(attrs={"class": "entry-title"}).find('a')['href'].encode('utf-8')
    target_soup = BeautifulSoup(HttpUtil.request(target_url))
    content_soup = target_soup.find(attrs={"class": "single-content"}).findAll('p')
    text = content_soup[0].text
    url = content_soup[1].find('a')['href'].encode('utf-8')
    self.__send(text, url)
def sample(self):
    page = 1
    url = 'http://search.51job.com/list/070200%252C00,000000,0000,00,9,99,java,0,1.html?lang=c&stype=2&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&list_type=0&confirmdate=9&dibiaoid=0'
    opener = HttpUtil.init_opener()
    HttpUtil.opener_request(url, opener)
    while page < 50:
        print('第%s页' % page)
        self.__research(page, opener)
        page = page + 1
def space_code_provision(self, new_space_code):
    r = """/vb.htm?IntelligentInfo={"version":1.1,"method":"get","devicetype":1}"""
    for host in self.ssh.hosts:
        data = HttpUtil.httpget(host[0], r)
        json = data.split(" ")[-1]
        print("get intelligentinfo = %s" % json)
        output = re.sub(""""pointcode":"[0-9]*""",
                        '''"pointcode":"''' + new_space_code, json)
        newdata = "/vb.htm?language=ie&IntelligentInfo=" + urllib.parse.quote(output)
        HttpUtil.httpget(host[0], newdata)
def install_openvpn(self, client_key, client_crt, firmware=''):
    # Note: both branches currently install the same patch, so the firmware
    # argument is effectively unused.
    if firmware == '':
        self.upgrade('openvpn_patch_20160426113637_A1.bin')
    else:
        self.upgrade('openvpn_patch_20160426113637_A1.bin')
    self.ssh.do_scp_put(client_key, '/opt/ipnc/openvpn/client.key')
    self.ssh.do_scp_put(client_crt, '/opt/ipnc/openvpn/client.crt')
    for host in self.ssh.hosts:
        HttpUtil.httpget(host[0], '/cgi-bin/net_adv.cgi?openvpn_enable=1')
def set_ntp(self, ntp_server, ntp_interval, ntp_type):
    for host in self.ssh.hosts:
        HttpUtil.httpcfg(
            host[0],
            "language=ie&datestampenable3=" + str(ntp_type) + "&sntpip=" +
            ntp_server + "&ntpinterval=" + str(ntp_interval),
        )
def get_360_image(images):
    for image in images:
        url = image["url"]
        # Build the target directory path
        directory = os.path.join(Constants.APP_DIRECTORY, "images")
        # Create the directory if it does not exist
        if not os.path.exists(directory):
            os.makedirs(directory)
        urls = url.split("/")
        # Build the absolute file path
        image_path = os.path.join(directory, urls[len(urls) - 1])
        HttpUtil.download_file(image_path, url)
def get_myssl_ip():
    """
    Query DNS through myssl
    :return:
    """
    new_hosts = delete_dns(Constants.GITHUB_DOMAIN)
    for domain in Constants.GITHUB_DOMAIN:
        try:
            data = {"qtype": 1, "host": domain, "qmode": -1}
            response = HttpUtil.get(url=Constants.MYSSL_DNS, data=data)
            jsp = json.loads(response.text)
            if jsp["code"] == 0 and jsp["error"] is None:
                result_data = jsp["data"]
                addr_us = result_data["01"][0]["answer"]["records"]
                # addr_hk = result_data["852"][0]["answer"]["records"]
                # addr_cn = result_data["86"][0]["answer"]["records"]
                # Build the hosts entries
                for us in addr_us:
                    new_hosts.append(us["value"] + " " + domain + "\n")
        except Exception as e:
            print("错误:", e)
    update_hosts(new_hosts)
def get_user_info(cls, openid, access_token, lang='zh_CN'):
    """
    Fetch the user info; this step is not needed for the snsapi_base scope
    :return: user info dict
    """
    params = {'access_token': access_token, 'openid': openid, 'lang': lang}
    return HttpUtil.get(cls._USERINFO_URL, params)
def refresh_access_token(cls, appid, refresh_token):
    """
    Refresh the auth access_token
    :return: dict, keys: access_token, expires_in, refresh_token, openid, scope
    """
    params = {'appid': appid, 'grant_type': 'refresh_token',
              'refresh_token': refresh_token}
    return HttpUtil.get(cls._REFRESH_TOKEN_UEL, params)
def get_360_category():
    data = {'c': 'WallPaper', 'a': 'getAllCategoriesV2', 'from': '360chrome'}
    category = HttpUtil.get(url='http://wallpaper.apc.360.cn/index.php',
                            data=data)
    if category["errno"] != "0":
        raise MsgException('请求接口错误', category["errmsg"])
    return category["data"]
def jian_shu_article_retrive(url):
    html = HttpUtil.request(url)
    content_soup = BeautifulSoup(
        html, fromEncoding="utf-8").find(attrs={"class": "article"})
    title = content_soup.find(attrs={"class": "title"}).text.encode('utf-8')
    content = content_soup.find(attrs={"class": "show-content"})
    return (title, content)
def get_access_token(cls, appid, appsecret, code):
    """
    Exchange the code for a web-auth access_token
    :return: dict, keys: access_token, expires_in, refresh_token, openid, scope
    """
    params = {'appid': appid, 'secret': appsecret, 'code': code,
              'grant_type': 'authorization_code'}
    return HttpUtil.get(cls._ACCESS_TOKEN_URL, params)
def lin_shi_you_xiang_list(prefix):
    """
    Get the mailbox mail list
    :param prefix: mailbox prefix
    :return:
    """
    url = Constants.LIN_SHI_YOU_XIANG + "/api/v1/mailbox/" + prefix
    return HttpUtil.get_json(url=url, data=None)
def chick_search(self):
    url = 'http://www.59xihuan.cn/'
    home_soup = BeautifulSoup(HttpUtil.request(url))
    content_soup = home_soup.find(attrs={"class": "pic_text1"})
    text = content_soup.find('p').text
    url = url + content_soup.find('img')['bigimagesrc'].encode('utf-8')
    self.__send(text, url)
    print('')
def query_view_by_url(author_url, blog_url):
    url = 'http://www.jianshu.com/' + author_url
    soup = BeautifulSoup(HttpUtil.request(url))
    blogs_soup = soup.find(attrs={"class": "article-list latest-notes"}).findAll('li')
    for blog_soup in blogs_soup:
        blog = service.parse_blog(blog_soup)
        if blog.url == blog_url:
            return blog
def short_time_mail_list(last_id):
    """
    Query the mail list
    :param last_id:
    :return:
    """
    url = Constants.SHORT_TIME_MAIL + "/mail/list"
    return HttpUtil.get_json(url=url, data={"last_id": last_id})
def short_time_get_mail(id):
    """
    Query the mail content
    :param id: mail id
    :return:
    """
    url = Constants.SHORT_TIME_MAIL + "/zh-Hans/mail/detail"
    return HttpUtil.get_json(url=url, data={"id": id})
def lin_shi_you_xiang_apply(prefix):
    """
    Get a mailbox
    :param prefix: mailbox prefix
    :return:
    """
    url = Constants.LIN_SHI_YOU_XIANG + "/api/v1/mailbox/keepalive"
    data = {"force_change": 1, "mailbox": prefix,
            "_ts": round(time.time() / 1000)}
    return HttpUtil.get_json(url=url, data=data)
def send_weibo_pics(text, urls, browser):
    browser.get('http://m.weibo.cn/mblog')
    time.sleep(3)
    browser.find_element_by_id('txt-publisher').send_keys(text.decode('utf-8'))
    for i, url in enumerate(urls):
        if i == 0:
            open(r'/home/1.jpg', 'wb').write(HttpUtil.request(url))
            pic_browser = browser.find_element_by_class_name('picupload')
            pic_browser.send_keys(r'/home/1.jpg')
            time.sleep(8)
        else:
            open(r'/home/1.jpg', 'wb').write(HttpUtil.request(url))
            pic_browser = browser.find_element_by_name('pic')
            pic_browser.send_keys(r'/home/1.jpg')
            time.sleep(8)
        os.remove(r'/home/1.jpg')
    browser.find_element_by_link_text('发送').click()
    print('发送成功')
def publish_articles():
    url = 'http://www.jianshu.com'
    html = BeautifulSoup(HttpUtil.request(url))
    articles_soup = [
        parse_article_soup(article) for article in
        html.find(attrs={"class": "article-list thumbnails"}).findAll('li')
    ]
    sorted_articles = sorted(articles_soup,
                             key=lambda articl_soup: articl_soup[0],
                             reverse=True)
    article_body = service.jian_shu_article_retrive(url + sorted_articles[0][1])
    browser = LoginController.get_browser()
    passwd_dict = PasswdUtil.get_passwd_dict()
    send_text = '《%s》 又到每天推荐文章的时候到了,这些都是精选的枕边读物哦,希望大家喜欢@当幸福敲不开门[害羞][害羞][害羞] ' % article_body[0]
    service.send_weibo(
        send_text,
        service.txt_to_pic(article_body[1], browser),
        LoginController.mobile_login(browser,
                                     passwd_dict[Const.WEIBO_USERNAME],
                                     passwd_dict[Const.WEIBO_PASSWD]))
def lin_shi_you_xiang_delete(prefix, id):
    """
    Delete a mail (DELETE request)
    :param id: mail id
    :param prefix: mailbox prefix
    :return:
    """
    url = Constants.LIN_SHI_YOU_XIANG + "/api/v1/mailbox/" + prefix + "/" + id
    res = HttpUtil.delete(url=url, data=None)
    res_json = json.loads(res.text)
    return res_json
def request_access_token(appid, appsecret):
    """
    Request an access_token over the network
    :param appid:
    :param appsecret:
    :return: {'access_token': '', 'expires_in': 3600}
    """
    url = WxApi.BASE_URL + "/cgi-bin/token?grant_type=client_credential"
    params = {'appid': appid, 'secret': appsecret}
    return HttpUtil.get(url, params)
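# Usage sketch (not part of the original code): request_access_token returns a
# dict with 'access_token' and 'expires_in', so a caller would normally cache the
# token and only re-request it shortly before expiry. The cache variables, the
# helper name and the 60-second refresh margin below are assumptions.
_token_cache = {"access_token": None, "expires_at": 0}

def get_cached_access_token(appid, appsecret):
    # Re-request the token only when the cached one is missing or about to expire
    if not _token_cache["access_token"] or time.time() > _token_cache["expires_at"] - 60:
        result = request_access_token(appid, appsecret)
        _token_cache["access_token"] = result["access_token"]
        _token_cache["expires_at"] = time.time() + result["expires_in"]
    return _token_cache["access_token"]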
def short_time_mail_apply():
    """
    Apply for a random shorttimemail.com mailbox
    :return: mailbox address
    """
    prefix = StringUtil.random_lowercase_alphanumeric(9)
    suffix = "@shorttimemail.com"
    data = {"prefix": prefix, "suffix": suffix}
    # POST /mail/continue: extend the mailbox by 30 minutes
    # POST /mail/destory: destroy the mailbox
    # POST /mail/delete: delete mails, params: { ids: ids.join('|') }, ids joined with '|'
    res = HttpUtil.get_json(url=Constants.SHORT_TIME_MAIL + "/mail/apply",
                            data=data)
    if res.code != 200:
        raise MsgException(res.msg)
    return prefix + suffix
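# Usage sketch (not part of the original code): tie the shorttimemail helpers
# together — apply for a mailbox, poll the mail list, then read the first mail.
# The response structure (a list of mails, each with an "id" field) and the
# polling interval are assumptions about the API.
def wait_for_short_time_mail():
    address = short_time_mail_apply()
    print("mailbox:", address)
    while True:
        mails = short_time_mail_list(last_id=0)
        if mails:
            # Read the first mail in the list (assumed "id" field)
            return short_time_get_mail(mails[0]["id"])
        time.sleep(10)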
def _common_post(self, url, ddata, verify=None):
    """
    :return: (ret_code, dict_data/err_msg)
    """
    if self.BASE_URL not in url:
        url = self.BASE_URL + url
    results = HttpUtil.post(url, ddata, ctype='xml', verify=verify)
    if results.get('return_code', '') == 'SUCCESS':
        assert results.get('sign') == self._generate_sign(**results), 'sign error, not from wechat pay server'
        if results.get('result_code', '') == 'SUCCESS':
            return 0, results
        else:
            return 1, results.get('err_code_des', '')
    else:
        return 1, results.get('return_msg', '')
def user_info(user_url):
    url = 'http://www.jianshu.com' + user_url
    soup = BeautifulSoup(HttpUtil.request(url))
    user = User()
    base_info_soup = soup.find(attrs={"class": "basic-info"})
    user.name = base_info_soup.find('h3').text.encode('utf-8')
    signed_soup = base_info_soup.find(attrs={"class": "signed_author"})
    if signed_soup:
        user.is_signed = 0
    nums_soup = soup.find(attrs={"class": "user-stats"}).findAll('b')
    user.focus = int(nums_soup[0].text.encode('utf-8'))
    user.fans = int(nums_soup[1].text.encode('utf-8'))
    user.blog_nums = int(nums_soup[2].text.encode('utf-8'))
    user.word_nums = int(nums_soup[3].text.encode('utf-8'))
    user.like_nums = int(nums_soup[4].text.encode('utf-8'))
    return user
def _common_post(self, url, ddata, cert=None):
    """
    :return: (ret_code, dict_data/err_msg)
    """
    if self.BASE_URL not in url:
        url = self.BASE_URL + url
    results = HttpUtil.post(url, ddata, ctype='xml', cert=cert)
    if results.get('return_code', '') == 'SUCCESS':
        assert results.get('sign') == self._generate_sign(**results), 'sign error, not from wechat pay server'
        if results.get('result_code', '') == 'SUCCESS':
            return 0, results
        else:
            return 1, results.get('err_code_des', '')
    else:
        return 1, results.get('return_msg', '')
def get_360_update_image():
    # order: sort field; start: offset for paging; count: items per request, max 200
    data = {
        'c': 'WallPaper',
        'a': 'getAppsByOrder',
        'order': 'create_time',
        'start': 0,
        'count': 200,
        'from': '360chrome'
    }
    response = HttpUtil.get(url='http://wallpaper.apc.360.cn/index.php',
                            data=data)
    if response["errno"] != "0":
        raise MsgException('请求接口错误', response["errmsg"])
    images = response["data"]
    get_360_image(images)
def get_360_category_image():
    category = get_360_category()
    logging.debug(category)
    # cid: category ID; start: offset for paging; count: items per request, max 200
    data = {
        'c': 'WallPaper',
        'a': 'getAppsByCategory',
        'cid': '36',
        'start': 0,
        'count': 200,
        'from': '360chrome'
    }
    response = HttpUtil.get(url='http://wallpaper.apc.360.cn/index.php',
                            data=data)
    if response["errno"] != "0":
        raise MsgException('请求接口错误', response["errmsg"])
    images = response["data"]
    get_360_image(images)
    logging.debug(images)
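# Usage sketch (not part of the original code): walk every category returned by
# get_360_category() instead of the hard-coded cid '36'. The "id" field name on
# each category entry is an assumption about the 360 wallpaper API response.
def get_360_all_category_images():
    for category in get_360_category():
        data = {
            'c': 'WallPaper',
            'a': 'getAppsByCategory',
            'cid': category["id"],  # assumed field name
            'start': 0,
            'count': 200,
            'from': '360chrome'
        }
        response = HttpUtil.get(url='http://wallpaper.apc.360.cn/index.php',
                                data=data)
        if response["errno"] == "0":
            get_360_image(response["data"])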
def get_short_time_mail_dns():
    """
    Query DNS through shorttimemail.com
    :return:
    """
    new_hosts = delete_dns(Constants.GITHUB_DOMAIN)
    for domain in Constants.GITHUB_DOMAIN:
        try:
            data = {"server": "8.8.8.8", "rrtype": "A", "domain": domain}
            response = HttpUtil.get(url=Constants.SHORT_TIME_MAIL_DNS, data=data)
            jsp = json.loads(response.text)
            if jsp["code"] == 0:
                # Build the hosts entries
                for us in jsp["data"]:
                    new_hosts.append(us["value"] + " " + domain + "\n")
        except Exception as e:
            print("错误:", e)
    update_hosts(new_hosts)
def txt_to_pic(txt, browser):
    url = 'http://www.changweibo.com/'
    js = """document.getElementById('ueditor_0').contentWindow.document.getElementsByClassName('view')[0].innerHTML='%s'; """ % str(txt).replace('\n', '<br>').replace('\'', '"')
    browser.get(url)
    time.sleep(5)
    browser.execute_script(js)
    time.sleep(3)
    browser.find_element_by_xpath(
        "//a[@class='btn btn-success btn-lg']").click()
    time.sleep(10)
    browser.switch_to_frame('ueditor_0')
    html = browser.page_source
    soup = BeautifulSoup(html)
    data = {"reserve_check": 1, "text": "", "html": soup.find('body')}
    url = 'http://www.changweibo.com/convert_changweibo_com.php'
    response = HttpUtil.request_post(url, data)
    img_url = json.loads(response)['image_url']
    print(img_url)
    return img_url
def get_tag(page):
    """
    Get tags
    :param page: page number
    :return:
    """
    html = BeautifulSoup(
        HttpUtil.get(f"https://wallhaven.cc/tags?page={str(page)}").text,
        features="lxml")
    tags_html = html.find_all("a", {"class": "sfw"})
    for tag_html in tags_html:
        url = tag_html.attrs["href"]
        tag_id = url[url.rfind("/") + 1:]
        tag_text = tag_html.text
        print(tag_id, tag_text)
    # Find all tags that carry the given attribute
    page_all = html.find_all(lambda tag: tag.has_attr('original-title'))
    # Cast to int so the page comparison below works
    page_total = int(page_all[len(page_all) - 1].text)
    # If this is not the last page, continue with the next page
    if page != page_total:
        get_tag(page + 1)
def find_blogs(url):
    soup = BeautifulSoup(HttpUtil.request(url))
    blogs = map(lambda blog_soup: parse_blog(blog_soup),
                soup.find(attrs={"class": "article-list thumbnails"}).findAll('li'))
    return filter(lambda blog: blog is not None, blogs)
def __research(self, page, opener):
    html = 'http://search.51job.com/jobsearch/search_result.php?jobarea=070200%2C00&keyword=java&curr_page=' + str(page)
    response = HttpUtil.opener_request(html, opener)
    soup = BeautifulSoup(response)
    job_list = soup.find(id='resultList').findAll('div', attrs={"class": "el"})[1:]
    self.__insert_jobs(map(self.__get_51job_detail, job_list))
def lin_shi_you_xiang_get_mail(prefix, id):
    url = Constants.LIN_SHI_YOU_XIANG + "/mailbox/" + prefix + "/" + id + "/source"
    return HttpUtil.get(url=url, data=None).text
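# Usage sketch (not part of the original code): combine the temporary-mailbox
# helpers above — apply for a mailbox, poll its mail list, read the raw source of
# the first mail, then delete it. The "id" field name on the list entries and the
# polling interval are assumptions about the API response.
def lin_shi_you_xiang_demo(prefix):
    lin_shi_you_xiang_apply(prefix)
    while True:
        mails = lin_shi_you_xiang_list(prefix)
        if mails:
            mail_id = str(mails[0]["id"])  # assumed field name
            source = lin_shi_you_xiang_get_mail(prefix, mail_id)
            lin_shi_you_xiang_delete(prefix, mail_id)
            return source
        time.sleep(10)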
def download_images(url, page, directory):
    """
    Download images
    :param url: link
    :param page: page number
    :param directory: directory where the files are stored
    :return:
    """
    try:
        SystemUtil.restart_process(os.path.abspath(__file__))
        html = BeautifulSoup(HttpUtil.get(url + str(page)).text, features="lxml")
        figure = html.find_all("figure")
        # Find all tags that carry the given attribute
        page_all = html.find_all(lambda tag: tag.has_attr('original-title'))
        page_total = int(page_all[len(page_all) - 1].text)
        print(page, len(figure), page_total)
        if page > page_total:
            page = 1
            raise ValueError("page超出范围")
        for label in figure:
            image_id = label.attrs["data-wallpaper-id"]
            # Image detail page
            info_html = BeautifulSoup(
                HttpUtil.get("https://wallhaven.cc/w/" + image_id).text,
                features="lxml")
            tags_html = info_html.find_all("a", {"class": "tagname", "rel": "tag"})
            # Tags of the image
            tags = ",".join([tag_html.text for tag_html in tags_html]).replace("'", "")
            if len(tags) > 0 and tags != "":
                tags = TranslationUtil.translate_google(tags).replace(",", ",")
                tags = re.sub(r"[^a-z,\u4e00-\u9fa5]+|^,|,$", "", tags).replace(",,", ",")
            download_url = info_html.find("img", {"id": "wallpaper"}).attrs["src"]
            if len(download_url) <= 0 or download_url == "":
                raise ConnectionError("获取下载链接失败")
            s3.execute_commit(f"""
                INSERT OR IGNORE INTO images(image_id,suffix,url,type,page,tags)
                VALUES('{image_id}','{download_url[download_url.rfind(".") + 1:]}',
                '{download_url}','latest','{page}','{tags}')
            """)
            image_name = download_url.split("/")
            image_name = image_name[len(image_name) - 1]
            # Check whether the file already exists
            # if not os.path.exists(name):
            if not os.path.isfile(os.path.join(directory, image_name)):
                # Download each image with its own worker
                # done = ThreadPool.pool.submit(HttpUtil.download_file, download_url, directory, image_name)
                # done.add_done_callback(ThreadPool.thread_call_back)
                asyncio.run(
                    HttpUtil.download_one_async(download_url, directory, image_name))
        global run_count
        run_count += 1
        # If pages were found, this is not the last page, and memory usage stays below 80%
        if len(page_all) > 0 and page <= page_total and run_count <= 10:
            download_images(url, page + 1, directory)
        else:
            if len(page_all) > 0:
                page += 1
            if page > page_total:
                page = 1
            run_count = 0
    except Exception as e:
        print(e)
    finally:
        print("当前活跃线程数:", threading.activeCount())
        time.sleep(400)
        download_images(url, page, directory)
def _get(self, url, params=None):
    return HttpUtil.get(self._final_url(url), params)
def _post(self, url, ddata):
    final_url = self._final_url(url)
    return HttpUtil.post(final_url, ddata, ctype='json')
def _final_url(self, url):
    new_url = WxApi.BASE_URL + url
    final_url = HttpUtil.url_update_query(new_url, access_token=self.access_token)
    return final_url
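# Usage sketch (not part of the original code): API wrapper methods can build on
# _get/_post, which route through _final_url so the stored access_token is
# appended to the query string automatically. The menu endpoint below is only an
# illustration and is not defined anywhere in this codebase.
def get_menu(self):
    return self._get('/cgi-bin/menu/get')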
def check_access_token(cls, openid, access_token):
    """ Check whether the auth credential (access_token) is still valid """
    params = {'access_token': access_token, 'openid': openid}
    return HttpUtil.get(cls._CHECK_TOKEN_URL, params)
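# Usage sketch (not part of the original code): the classmethods above
# (get_access_token, check_access_token, refresh_access_token, get_user_info)
# form the usual web-auth sequence. "WxOAuthApi" is a hypothetical name for the
# class that holds them, and the sketch assumes HttpUtil.get returns the parsed
# dict, as the docstrings above suggest.
def fetch_wechat_user(appid, appsecret, code):
    token = WxOAuthApi.get_access_token(appid, appsecret, code)
    openid = token['openid']
    # If the token no longer validates, refresh it with the refresh_token
    if WxOAuthApi.check_access_token(openid, token['access_token']).get('errcode'):
        token = WxOAuthApi.refresh_access_token(appid, token['refresh_token'])
    return WxOAuthApi.get_user_info(openid, token['access_token'])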
def download_latest_images_selenium(page, directory):
    """
    Fetch using selenium
    :param page:
    :param directory:
    :return:
    """
    SystemUtil.restart_process(os.path.abspath(__file__))
    driver = ReptileUtil.selenium_driver(
        "https://www.pexels.com/new-photos?page=" + str(page))
    try:
        articles = driver.find_elements_by_tag_name("article")
        next_page = True
        try:
            driver.find_element_by_xpath(
                "/html/body/section/div[4]/div/a[@rel='next']")
        except Exception as e:
            next_page = False
        # Keep the handle of the current (main) window
        main_window = driver.current_window_handle
        print(articles)
        for article in articles:
            # Image id
            image_id = article.get_attribute("data-photo-modal-medium-id")
            info_url = "https://www.pexels.com/photo/" + image_id
            # Open a new tab via js and visit the url
            driver.execute_script(f"window.open('{info_url}')")
            driver.switch_to.window(driver.window_handles[-1])
            tags = ""
            if driver.title.find("500") == -1:
                tags = driver.find_element_by_xpath(
                    "//meta[@name='keywords']").get_attribute("content")
                tags = TranslationUtil.translate_google(tags).replace(",", ",")
                tags = re.sub(r"[^a-z,\u4e00-\u9fa5]+|^,|,$", "", tags).replace(",,", ",")
            # Close the current tab.
            driver.close()
            # Switch back to the main window after closing the new tab;
            # this step is required, otherwise an error is raised
            driver.switch_to.window(main_window)
            # Image download link
            download_url = f"https://images.pexels.com/photos/{image_id}/pexels-photo-{image_id}.jpeg?dl={image_id}.jpg"
            s3.execute_commit(f"""
                INSERT OR IGNORE INTO images(image_id,suffix,url,type,page,tags)
                VALUES('{image_id}','jpg','{download_url}','latest','{page}','{tags}')
            """)
            image_name = f"pexels-photo-{image_id}.jpg"
            # Check whether the file already exists
            if not os.path.exists(os.path.join(directory, image_name)):
                asyncio.run(
                    HttpUtil.download_one_async(download_url, directory, image_name))
        global run_count
        run_count += 1
        # If pages were found and this is not the last page
        if next_page and run_count <= 10:
            download_latest_images(page + 1, directory)
        else:
            if next_page:
                page += 1
            else:
                page = 1
            run_count = 0
    except Exception as e:
        print(e)
    finally:
        # Close the current window.
        driver.close()
        # Close the browser and terminate the chromedriver process
        driver.quit()
        print("当前活跃线程数:", threading.activeCount())
        time.sleep(400)
        download_latest_images(page, directory)
def download_latest_images(page, directory):
    try:
        SystemUtil.restart_process(os.path.abspath(__file__))
        html = BeautifulSoup(
            HttpUtil.get("https://www.pexels.com/zh-cn/new-photos?page=" + str(page)).text,
            features="lxml")
        articles = html.find_all("article")
        pages_html = BeautifulSoup(str(
            html.find("div", {"class": "pagination"})),
            features="lxml").find_all("a")
        page_total = int(pages_html[len(pages_html) - 2].text)
        print(page, len(articles), page_total)
        if page > page_total:
            page = 1
            raise ValueError("page超出范围")
        for article in articles:
            # Image id
            image_id = article["data-photo-modal-medium-id"]
            # Original image size
            # image_org_size = article["data-photo-modal-download-value-original"]
            # Image download link
            download_url = article["data-photo-modal-image-download-link"]
            image_name = f"pexels-photo-{image_id}.jpg"
            info_html = BeautifulSoup(
                HttpUtil.get("https://www.pexels.com/zh-cn/photo/" + image_id).text,
                features="lxml")
            tags = info_html.find("meta", {"name": "keywords"}).attrs["content"]
            if len(tags) > 0 and tags != "":
                # Convert traditional Chinese to simplified
                tags = zhconv.convert(tags[:len(tags) - 7], 'zh-cn')
                tags = re.sub(r"[^a-z,\u4e00-\u9fa5]+|^,|,$", "", tags).replace(",,", ",")
            s3.execute_commit(f"""
                INSERT OR IGNORE INTO images(image_id,suffix,url,type,page,tags)
                VALUES('{image_id}','{download_url[download_url.rfind(".") + 1:]}',
                '{download_url}','latest','{page}','{tags}')
            """)
            # dl = info_html.find(lambda tag: tag.has_attr('data-id') and tag.has_attr('href')).attrs["href"]
            # dl = info_html.find(lambda tag: tag.has_attr('data-id') and tag.has_attr('data-url')).attrs["data-url"]
            # Check whether the file already exists
            if not os.path.exists(os.path.join(directory, image_name)):
                # Download each image with its own worker
                # done = ThreadPool.pool.submit(HttpUtil.download_file, download_url, directory, image_name)
                # done.add_done_callback(ThreadPool.thread_call_back)
                asyncio.run(
                    HttpUtil.download_one_async(download_url, directory, image_name))
        global run_count
        run_count += 1
        # If pages were found and this is not the last page
        if page_total > 0 and page <= page_total and run_count <= 10:
            download_latest_images(page + 1, directory)
        else:
            if len(pages_html) > 0 and page <= page_total:
                page += 1
            if page > page_total:
                page = 1
            run_count = 0
    except Exception as e:
        print(e)
    finally:
        print("当前活跃线程数:", threading.activeCount())
        time.sleep(400)
        download_latest_images(page, directory)