def run(self):
    headers = my_user_agent.get_user_agent()
    # Keep pulling from url_queue until it is empty, then stop.
    while not self.url_queue.empty():
        response = requests.get(self.url_queue.get(), headers=headers)
        if response.status_code == 200:
            self.html_queue.put(response.text)
            # print(response.text)
            print("*" * 60)
            print(response.url)
            print("*" * 60)
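# A minimal wiring sketch, assuming run() above belongs to a Thread subclass
# (called Crawl here purely for illustration) that takes the two queues:
#
#     url_queue = Queue()
#     html_queue = Queue()
#     url_queue.put("https://example.com/page1")  # hypothetical seed URL
#     crawler = Crawl(url_queue, html_queue)
#     crawler.start()
#     crawler.join()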
def get_video(file_name, url):
    '''
    Fetch the downloadable video URL and save the video to disk.
    :param file_name: name of the file to write
    :param url: download link
    :return:
    '''
    # Issue the request with requests and download the response body.
    response = requests.get(url, stream=True, headers=get_user_agent())
    if response.status_code == 200:
        # Write the received video data; with stream=True the body should be
        # consumed in chunks rather than all at once via response.content.
        with open(file_name, 'ab') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
            # Flush the buffer.
            f.flush()
        print("Download succeeded")
    else:
        print("Video not found...")
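# Example call (both arguments are hypothetical placeholders; a real,
# directly downloadable video URL is required):
#
#     get_video("demo.mp4", "https://example.com/video.mp4")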
import requests
from my_user_agent import get_user_agent
from selenium import webdriver
import time
from lxml import etree
import json
from queue import Queue
from gevent import monkey
import gevent
import random

# Needed when there are time-consuming (blocking) operations:
monkey.patch_all()  # swap the blocking calls used in the program for gevent's own implementations

# Xigua food channel
headers = get_user_agent()
file_name = input("Enter the file name to save as: ")
path = "C:\\Users\\Administrator\\Desktop\\youtube\\"  # escaped backslashes (the r prefix would keep them doubled)
print(path)


def get_source(browser):
    browser.implicitly_wait(10)
    for i in range(3):
        # Drag the scrollbar to the bottom so lazily loaded content appears.
        browser.execute_script(
            "window.scrollTo(0, document.body.scrollHeight); var lenOfPage=document.body.scrollHeight; return lenOfPage"
        )
        time.sleep(1)
    source = browser.page_source
    return source
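# A minimal usage sketch, assuming Chrome and a matching chromedriver are
# installed; the URL below is a placeholder, not taken from this project:
#
#     browser = webdriver.Chrome()
#     browser.get("https://www.ixigua.com/")  # hypothetical channel page
#     source = get_source(browser)
#     html = etree.HTML(source)  # parse the fully scrolled page with lxml
#     browser.quit()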
import os
import re
from queue import Queue
from threading import Thread
import requests
import my_user_agent
import json
# from gevent import monkey
import gevent
import random

# Needed when there are time-consuming (blocking) operations:
# monkey.patch_all()  # swap the blocking calls used in the program for gevent's own implementations

path = "C:\\Users\\Administrator\\Desktop\\youtube\\youku\\"  # escaped backslashes (the r prefix would keep them doubled)
headers = my_user_agent.get_user_agent()


# Crawler thread (fetches json_url).
class CrawlInfo(Thread):
    def __init__(self, url_q, info_q):
        Thread.__init__(self)
        self.url_q = url_q
        self.info_q = info_q

    def run(self):
        # params = {
        #     'page_size': 10,
        #     'next_offset': str(num),
        #     'tag': '今日热门',
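# A hedged wiring sketch for CrawlInfo (its run() is truncated above, so the
# exact queue contract is assumed: url_q holds json_url strings, info_q
# receives the parsed results; the seed URL is a placeholder):
#
#     url_q = Queue()
#     info_q = Queue()
#     url_q.put("https://example.com/api?page_size=10&next_offset=0")
#     crawler = CrawlInfo(url_q, info_q)
#     crawler.start()
#     crawler.join()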