Example #1
from collections import namedtuple
from urllib import request

from fake_useragent import UserAgent as UA
from lxml import etree


def get_proxy_list():
    request_obj = request.Request(url='http://www.xicidaili.com/',
                                  headers={'User-Agent': UA().random})
    response = request.urlopen(request_obj)
    html = str(response.read(), encoding='utf-8')

    xml = etree.HTML(html)
    # '@class="odd" or @class' matches any <tr> that has a class attribute at all.
    ip_list = xml.xpath('//tr[@class="odd" or @class]/td[2]/text()')
    port_list = xml.xpath('//tr[@class="odd" or @class]/td[3]/text()')
    type_list = xml.xpath('//tr[@class="odd" or @class]/td[6]/text()')
    endure_list = xml.xpath('//tr[@class="odd" or @class]/td[7]/text()')
    last_check_list = xml.xpath('//tr[@class="odd" or @class]/td[8]/text()')

    proxy_list = []
    proxy = namedtuple('proxy', ['ip', 'port', 'type', 'endure', 'last_check'])
    for i in range(len(ip_list)):
        proxy_list.append(
            proxy(ip_list[i], port_list[i], type_list[i],
                  get_time(endure_list[i]), get_time(last_check_list[i])))
    # Sort by last-check time so the proxies most likely to still work come first
    # (get_time above is a time-parsing helper defined elsewhere in the original module).
    proxy_list.sort(key=lambda item: item.last_check)
    return proxy_list
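
A minimal sketch of how the returned list might be consumed, assuming the requests library; that the namedtuple's type field holds 'HTTP' or 'HTTPS' is an assumption based on the site's table:

import requests

candidates = get_proxy_list()
best = candidates[0]  # most recently verified proxy, per the sort above
proxy_url = '{}://{}:{}'.format(best.type.lower(), best.ip, best.port)
resp = requests.get('http://httpbin.org/ip',
                    proxies={'http': proxy_url, 'https': proxy_url},
                    timeout=5)
print(resp.json())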
Example #2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

from fake_useragent import UserAgent as UA

# Override the default request headers:
# Note: UA().random is evaluated only once, when the settings module is loaded,
# so every request shares the same User-Agent (see the middleware sketch below).
DEFAULT_REQUEST_HEADERS = {
    'Accept':
    'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    'User-Agent': UA().random
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'tencent_positions.middlewares.TencentPositionsSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'tencent_positions.middlewares.TencentPositionsDownloaderMiddleware': 543,
#}

# Enable or disable extensions
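
Because UA().random in the settings is fixed at load time, rotating the User-Agent per request takes a downloader middleware. A minimal sketch (the class name RandomUserAgentMiddleware is an assumption; the module path mirrors the commented template above):

# middlewares.py
from fake_useragent import UserAgent

class RandomUserAgentMiddleware:
    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        # Assign a fresh random User-Agent to every outgoing request.
        request.headers['User-Agent'] = self.ua.random

# settings.py
# DOWNLOADER_MIDDLEWARES = {
#     'tencent_positions.middlewares.RandomUserAgentMiddleware': 543,
# }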
Example #3
# -*- coding: utf-8 -*-
import requests
import random
from selenium import webdriver as web
from fake_useragent import UserAgent as UA
from lxml import etree
import urllib.request as ur
import time
ua = UA()

chromePath = "d:/chromedriver.exe"
# Selenium 3-style constructor; Selenium 4 expects a Service object instead.
driver = web.Chrome(chromePath)
url = 'https://www.pornhub.com/view_video.php?viewkey=ph5cf931e03d669'
driver.get(url)
# tag = driver.find_element_by_class_name("mhp1138_btn mhp1138_volume-low mhp1138_icon mhp1138_icon-volume-low")
# print(tag)
driver.minimize_window()
print('-' * 30, 'starting download')

tag = driver.find_element_by_class_name("mhp1138_videoWrapper")
downloadFileURL = tag.find_element_by_tag_name("source").get_attribute("src")
driver.quit()

# Get the size of the file to download
response_file = requests.get(url=downloadFileURL, stream=True)
downFileSize = int(response_file.headers['content-length'])  # parse Content-Length as an integer
print('File size:', downFileSize)
print('Type:', type(downFileSize))

# Start downloading the file
size = 0
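
The download loop is cut off above; a plausible continuation using requests' streaming API (the output filename and chunk size are assumptions):

with open('video.mp4', 'wb') as f:  # output filename is an assumption
    for chunk in response_file.iter_content(chunk_size=64 * 1024):
        if chunk:
            f.write(chunk)
            size += len(chunk)
            # Progress based on the Content-Length read above.
            print('\rdownloaded: {:.1f}%'.format(size / downFileSize * 100), end='')
print()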
Example #4
    if("方式" in sstr):
        tmp_rent.rent_type = sstr.split(':')[1]
def getdate(beforeOfDay):
    # Get the date 1 or N days back: beforeOfDay=1 -> yesterday; beforeOfDay=N -> N days ago.
    today = datetime.datetime.now()
    # Compute the offset.
    offset = datetime.timedelta(days=-beforeOfDay)
    # Format the target date.
    re_date = (today + offset).strftime('%Y-%m-%d')
    return re_date
headers = {
    'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
    'Connection': 'Keep-Alive',
    'User-Agent': UA().random  # get a random User-Agent
}

# ua = UserAgent()
# posC = "C:\Download\small App\WebDriver\chromedriver.exe"
# s = Session(webdriver_path=posC,
#             browser='chrome',
#             default_timeout=15,
#             webdriver_options={'arguments': ['headless','--no-sandbox','--disable-gpu']})
# # 'arguments': ['headless','--no-sandbox','--disable-gpu']
# # response=s.get('http://www.yeeyi.com/bbs/forum.php?mod=viewthread&tid=4523663',headers={'User-Agent':ua.chrome}).text
# pages=10
# url = "http://www.yeeyi.com/bbs/forum.php?mod=viewthread&tid=4521587"
# print(1)
# s.driver.get(url)
# print(1)
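
A short usage sketch combining getdate() and the headers dict above, assuming the requests library (the URL is a placeholder):

import requests

date_str = getdate(3)  # the date three days ago, e.g. '2019-06-01'
url = 'http://example.com/listings?date=' + date_str  # placeholder URL
resp = requests.get(url, headers=headers, timeout=10)
print(resp.status_code, len(resp.text))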
Example #5
import os
from urllib.request import urlretrieve

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from fake_useragent import UserAgent as UA
from PIL import Image

from .. import items  # the project's items module; exact path depends on the layout


class TopMoviesSpider(CrawlSpider):
    name = 'douban'
    allowed_domains = [
        'www.douban.com', 'movie.douban.com', 'accounts.douban.com'
    ]
    start_urls = [
        'https://movie.douban.com/top250?start=0',
    ]
    # Generate a random User-Agent.
    headers = {'User-Agent': UA().random}

    # Define the pagination link-extraction rule.
    page_extractor = LinkExtractor(allow=(r'start=\d*', ))
    rules = [Rule(page_extractor, follow=True, callback='parse_movie_info')]

    # Start processing requests by issuing the login request first.
    def start_requests(self):
        return [
            scrapy.FormRequest(url="https://accounts.douban.com/login",
                               headers=self.headers,
                               meta={"cookiejar": 1},
                               callback=self.login_douban)
        ]

    # Extract the captcha-id from the login page response; it is one of the
    # required parameters of the subsequent POST request.
    def login_douban(self, response):
        # Get the captcha-id and the captcha image URL.
        captcha_id = response.xpath(
            '//input[@name="captcha-id"]/@value').extract()

        # Prepare the basic formdata.
        formdata = {
            'source': 'None',
            'form_email': input('Enter username: '),
            'form_password': input('Enter password: '),
            'login': '******'
        }
        if captcha_id:
            captcha_id = captcha_id[0]
            captcha_url = response.xpath(
                '//img[@id="captcha_image"]/@src').extract()[0]
            # If a captcha image URL exists, download the captcha image locally.
            captcha_path = os.path.dirname(
                os.path.dirname(__file__)) + '/captcha-images/captcha.jpg'
            urlretrieve(captcha_url, captcha_path)
            # Make sure the captcha image was downloaded successfully.
            try:
                image = Image.open(captcha_path)
                image.show()
                captcha_solution = input('Enter the captcha: ')
                formdata['captcha-id'] = captcha_id
                formdata['captcha-solution'] = captcha_solution
            except FileNotFoundError:
                pass
        # After logging in, carry the cookie to the pages to be crawled.
        return scrapy.FormRequest.from_response(
            response,
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            formdata=formdata,
            callback=self.start_parse)

    # Start crawling the movie pages; whenever a page matches the rule, its
    # callback processes it and yields items.
    def start_parse(self, response):
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_movie_info(self, response):
        movies_in_page = response.xpath('//div[@class="info"]')
        for each in movies_in_page:
            item = items.DoubanMovieItem()
            item['title'] = each.xpath(
                './/span[@class="title"][1]/text()').extract()[0].replace(
                    '\n', '').replace('\xa0', '').strip()
            item['info'] = each.xpath(
                './div[@class="bd"]/p/text()').extract()[0]
            item['score'] = each.xpath(
                './/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract()[0]
            quote = each.xpath('.//p[@class="quote"]/span/text()').extract()
            if quote:
                item['quote'] = quote[0].replace('\xa0', '').strip()
            else:
                item['quote'] = ''
            yield item
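
The spider fills items.DoubanMovieItem, which is not shown. A minimal sketch of that Item class, inferred from the four fields assigned in parse_movie_info (the definition itself is an assumption):

# items.py
import scrapy

class DoubanMovieItem(scrapy.Item):
    title = scrapy.Field()
    info = scrapy.Field()
    score = scrapy.Field()
    quote = scrapy.Field()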