Example #1
    def process_request(self, request, spider):
        k = PyKeyboard()
        self.driver.get(request.url)
        x = request.url[:]
        #request.headers["Referer"] = "https://antirobot.tianyancha.com/captcha/verify?return_url=https%3A%2F%2Fwww.tianyancha.com%2Fcompany%" + str(request.url[35:]) + "&rnd="
        #print(request.headers["Referer"])
        time.sleep(random.random())
        try:
            denglu = self.driver.find_element_by_xpath(
                '//div[@id="J_TLoginInfoHd"]/a[1]')
            if not denglu:
                pass
            else:
                denglu.click()
                time.sleep(random.random())
                self.driver.find_element_by_xpath(
                    '//a[@class="forget-pwd J_Quick2Static"]').click()
                time.sleep(0.3)
                self.driver.find_element_by_xpath(
                    '//input[@id="TPL_username_1"]').send_keys('15123358380')
                self.driver.find_element_by_xpath(
                    '//input[@id="TPL_password_1"]').send_keys('a135792468')
                self.driver.find_element_by_xpath(
                    '//button[@id="J_SubmitStatic"]').click()
                #win32api.SetCursorPos([random.randint(1160,1170),random.randint(590,595)])
                #win32api.mouse_event(win32con.MOUSEEVENTF_LEFTUP | win32con.MOUSEEVENTF_LEFTDOWN, 0, 0, 0, 0)
                #self.mima()
                # try:
                #     while True:
                #         huakuai = self.driver.find_element_by_xpath('//*[@id="nc_1_n1z"]')
                #         if not huakuai:
                #             break
                #         else:
                #             x = random.randint(975,1010)
                #             y = random.randint(505, 515)
                #             win32api.SetCursorPos([x, y])
                #             time.sleep(0.1)
                #             win32api.mouse_event(win32con.MOUSEEVENTF_LEFTDOWN, 0, 0, 0, 0)
                #             sum = x
                #             for i in range(9):
                #                 sum += random.randint(30, random.randint(40, 65))
                #                 win32api.SetCursorPos([sum, y])  # set the mouse cursor position
                #                 time.sleep(0.1)
                #             win32api.mouse_event(win32con.MOUSEEVENTF_LEFTUP, 0, 0, 0, 0)
                #             win32api.SetCursorPos([random.randint(1000, 1200), random.randint(560, 565)])
                #             win32api.mouse_event(win32con.MOUSEEVENTF_LEFTUP | win32con.MOUSEEVENTF_LEFTDOWN, 0, 0, 0,0)
                #             time.sleep(random.random())
                #             self.driver.refresh()
                #             time.sleep(5)
                #             try:
                #                 win32api.SetCursorPos([random.randint(1160, 1170), random.randint(590, 595)])
                #                 win32api.mouse_event(win32con.MOUSEEVENTF_LEFTUP | win32con.MOUSEEVENTF_LEFTDOWN, 0, 0,0, 0)
                #             except:
                #                 pass
                #             self.mima()
                # except:
                #     pass
                time.sleep(10)
                self.driver.get(x)  # navigate back to the originally requested URL
        except:
            pass
        time.sleep(0.5 + random.random())
        page_source = self.driver.page_source  # don't shadow the built-in str
        a = 0

        month_sell = etree.HTML(page_source).xpath(
            '//ul[@class="info-list"]/li[1]/em/text()')

        while (month_sell == ['-'] or month_sell == []):
            if request.url == 'https://www.fliggy.com/dujia/?spm=181.11358650.0.0.78d5223eYq1rts':
                break
            # self.driver.switch_to.frame('sufei-dialog-content')
            try:
                y = random.randint(325, 335)
                x = random.randint(835, 845)
                win32api.SetCursorPos([x, y])
                time.sleep(0.3 + random.random())
                win32api.mouse_event(win32con.MOUSEEVENTF_LEFTDOWN, 0, 0, 0, 0)
                sum = x
                for i in range(9):
                    sum += random.randint(random.randint(25, 30),
                                          random.randint(40, 65))
                    win32api.SetCursorPos([sum, y])  # set the mouse cursor position
                    time.sleep(random.random() * 0.1)
                win32api.mouse_event(win32con.MOUSEEVENTF_LEFTUP, 0, 0, 0, 0)

                # self.driver.switch_to.default_content()

                page_source = self.driver.page_source
                month_sell = etree.HTML(page_source).xpath(
                    '//ul[@class="info-list"]/li[1]/em/text()')
                print(month_sell)
                if (month_sell == ['-'] or month_sell == []):
                    time.sleep(random.random())
                    self.driver.refresh()
                    a += 1
                    time.sleep(random.random())
                if (a >= 7):
                    self.cursor.execute(self.sql, (request.url,))  # parameters must be a sequence
                    self.conn.commit()
                    break
            except:
                pass
        source = self.driver.page_source
        response = HtmlResponse(url=self.driver.current_url,
                                body=source,
                                request=request,
                                encoding='utf-8')
        return response
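The win32api calls above drive the slider via absolute Windows screen coordinates. A more portable alternative is Selenium's ActionChains; the helper below is only a sketch, and the slider XPath and step sizes are assumptions mirroring the commented-out block above, not values verified against the site:

def drag_slider(driver, slider_xpath='//*[@id="nc_1_n1z"]', steps=9):
    # Sketch: drag a slider captcha with ActionChains instead of win32api.
    # The XPath and offsets are assumptions taken from the commented-out code above.
    import random
    from selenium.webdriver.common.action_chains import ActionChains
    slider = driver.find_element_by_xpath(slider_xpath)
    actions = ActionChains(driver)
    actions.click_and_hold(slider)
    for _ in range(steps):
        actions.move_by_offset(random.randint(30, 45), 0)  # small, human-like increments
    actions.release()
    actions.perform()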
Example #2
 def test_empty_page(self):
     spider, page, _ = open_spider_page_and_results(
         'cars.com_nested.json')
     page = HtmlResponse(page.url, body=u'', encoding='utf-8')
     items = [i for i in spider.parse(page) if not isinstance(i, Request)]
     self.assertEqual(items, [])
Example #3
 def get_response_object(self, url):
     path_to_file = url.replace(FILE_SYSTEM_PREFIX, '')
     with open(path_to_file, 'rb') as f:
         body = f.read()
     return HtmlResponse(url, 200, self.generate_response_headers(), body, None, Request(url), encoding='utf-8')
Example #4
    def collect_all_tagets(self, driver, starturl: str):
        """
        从入口starturl开始,遍历每一页收集所有标的url,该函数执行前务必已经`driver.get(starturl)`
        """
        try:
            total = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     'div.pagination.J_Pagination  .page-total')))
        except TimeoutException:
            self.logger.debug("Pagination info missing, failed to get navigation: {}".format(starturl))
            return
        # Parse the pagination info
        max_page_num = int(total.text)

        try:
            nav = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, 'div.pagination.J_Pagination a')))
        except TimeoutException:
            self.logger.debug("Pagination links missing, failed to get navigation: {}".format(starturl))
            return
        # Jump to the first page (the second nav link is page 1)
        driver.execute_script("arguments[0].click();", nav[1])

        # Walk every page starting from the first: collect all item URLs, then move to the next page.
        self.logger.info("Will collect {} pages, starturl {}".format(max_page_num, starturl))

        max_retry_times = 5  # up to 5 attempts
        retrys = max_retry_times
        for i in range(max_page_num):
            # progress logging
            if i < max_page_num - 1 and i % 50 == 0:
                self.logger.info("Start collecting page {}".format(i))
            elif i == max_page_num - 1:
                self.logger.info("Collected {} pages".format(i))

            try:
                _ = WebDriverWait(driver, 20).until(
                    EC.presence_of_all_elements_located(
                        (By.CSS_SELECTOR, ".sf-content .sf-item-list li a")))
                # In practice the live DOM keeps changing, so snapshot the page source and parse the snapshot
                response = HtmlResponse(driver.current_url,
                                        body=str.encode(driver.page_source),
                                        encoding='utf-8')
                yield list(
                    set(
                        map(
                            lambda x: response.urljoin(x),
                            response.css(
                                ".sf-content .sf-item-list li a::attr(href)").
                            getall())))

            except TimeoutException:
                self.logger.info("Failed to collect item URLs on page {}".format(i))

            # On the last page there is no need to jump any further
            if i < max_page_num - 1:
                # Jump to the next page
                if self._to_next_page(driver) == -1:
                    retrys -= 1
                    if retrys <= 0:
                        break  # give up and end the loop after repeated failed jumps
                else:
                    retrys = max_retry_times
        # return urls
        return
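Since the docstring requires the caller to run `driver.get(starturl)` before invoking this generator, and each iteration yields one list of item URLs per results page, a caller could look roughly like this sketch (the surrounding spider, the `scrapy` import and the `parse_item` callback are assumptions, not part of the example):

    def crawl_listing(self, driver, starturl):
        # Hypothetical caller: load the entry page first, then consume the
        # per-page URL lists yielded by collect_all_tagets.
        driver.get(starturl)
        for page_urls in self.collect_all_tagets(driver, starturl):
            for url in page_urls:
                yield scrapy.Request(url, callback=self.parse_item)  # parse_item is assumed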
Example #5
    def process_request(self, request, spider):
        if spider.name == 'sohucaijing_Spider':
            request.headers['User-Agent'] = random_user_agent.give_a_head()

            # self.driver = Firefox(executable_path='geckodriver', firefox_options=self.options)
            # If geckodriver is on PATH the first argument can be omitted; otherwise pass its absolute path
            # wait = WebDriverWait(self.driver, timeout=10)

            # Before the engine takes a request from the scheduler and hands it to the
            # downloader, this downloader middleware runs first. Selenium issues the
            # request, fetches the dynamically rendered page, and the result is returned
            # to the spider object for parsing.
            # The URL comes from the spider's request.

            # self.driver.get(request)
            # wait.until(expected.visibility_of_element_located((By.NAME, 'q'))).send_keys(
            #     'headless firefox' + Keys.ENTER)
            # wait.until(expected.visibility_of_element_located((By.CSS_SELECTOR, '#ires a'))).click()
            # print(self.driver.page_source)

            # Integer: number of news items to crawl; ideally a multiple of 20
            news_amount = 1000

            # Integer: number of extra data packets to fetch (one packet = 20 news items).
            # Set this to 0 if only the initial 20 items are needed. Losses caused by a
            # flaky network are not compensated for.
            ex_packages_amount = int(news_amount / 20) - 1

            url = request.url
            if 'https://' in request.url:
                url = request.url[len('https://'):]  # strip the scheme

            print("URL requested in the middleware: " + url)
            self.driver.get(url)

            if 'mp.sohu.com/profile?xpt=c29odWNqeWMyMDE3QHNvaHUuY29t' in url:
                for temp in range(0, ex_packages_amount):
                    # for x in range(1, 12, 2):
                    #     i = (float(x) / 11)/(temp+1) + temp/(temp+1)
                    #     # scrollTop is the scroll distance from the top
                    #     # print("middleware: about to run this scroll js")
                    #     js = 'document.body.scrollTop=document.body.scrollHeight * %f' % i
                    #     time.sleep(1)
                    #     self.driver.execute_script(js)
                    self.driver.execute_script(
                        "document.documentElement.scrollTop=document.body.scrollHeight"
                    )
                    time.sleep(1)
                    print("Current feed-item count: " + str(
                        len(
                            self.driver.find_elements_by_class_name(
                                "feed-item"))))

                    # float_list = []
                    # for _ in range(0, 3):
                    #     float_list.append(random.random())
                    # float_list.sort()
                    # print("random list:" + str(float_list))
                    # for i in float_list:
                    #     js = 'document.body.scrollTop=document.body.scrollHeight * %f' % i
                    #     time.sleep(3)
                    #     spider.driver.execute_script(js)
                    #     time.sleep(3)

            else:
                for x in range(1, 6, 2):
                    i = float(x) / 5
                    # scrollTop is the scroll distance from the top
                    # print("middleware: about to run this scroll js")
                    js = 'document.body.scrollTop=document.body.scrollHeight * %f' % i
                    time.sleep(1)
                    self.driver.execute_script(js)

            response = HtmlResponse(url=url,
                                    body=self.driver.page_source,
                                    encoding='utf-8',
                                    request=request)
            # print("middleware: about to return this response")
            # Only a Response object should be returned here. When a Response is returned,
            # the download step is skipped entirely: the response goes straight to the
            # engine, which passes it on to the spider for parsing.
            return response
Example #6
 def process_request(self, request, spider):
     self.driver.get(request.url)
     self.driver.implicitly_wait(50)
     source = self.driver.page_source
     response = HtmlResponse(url=self.driver.current_url, body=source, request=request, encoding='utf-8')
     return response
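A downloader middleware like this only runs once it is registered in the project settings; a minimal sketch, where the module path and priority value are placeholders rather than values taken from the example:

# settings.py (sketch; the dotted path and 543 are placeholders)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.SeleniumMiddleware': 543,
}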
Example #7
 def test_display_response_without_return(self):
     response = HtmlResponse(body=b'<html>test</html>',
                             url='http://some.url',
                             status=200)
     resp = display(response)
     self.assertIsNone(resp)
Example #8
 def process_request(self, request, spider):
     self.driver.get(request.url)
     time.sleep(1)
     source = self.driver.page_source
     response = HtmlResponse(url=self.driver.current_url, body=source, request=request, encoding='utf-8')
     return response
Example #9
def cached_page(site, url_path, spider_name='toc'):
    handle_client_ip()

    site = base64.standard_b64decode(site.encode()).decode()
    url_path = base64.standard_b64decode(url_path.encode()).decode()
    url_site = SiteSchemas.get(site).get(SSK.URL)
    url = url_site + url_path
    origin_encoding = SiteSchemas.get(site).get(SSK.ENCODING, 'utf-8')
    aid = request.args.get('aid', default=None, type=int)

    from moltspider.consts import Schemas
    from moltspider.parser import iter_items
    from scrapy.utils.misc import load_object
    from scrapy.utils.project import get_project_settings
    from scrapy.http.request import Request
    from scrapy.http.response.html import HtmlResponse
    from scrapy.utils.gz import gunzip
    from scrapy.downloadermiddlewares.httpcompression import ACCEPTED_ENCODINGS
    try:
        import brotli
    except ImportError:
        brotli = None
    import zlib
    settings = get_project_settings()
    storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)

    body = None
    spider_req = Request(url)
    if spider_name == Spiders.META:
        from moltspider.spiders.meta import MetaSpider
        spider = MetaSpider()
        schema_name = Schemas.META_PAGE
    elif spider_name == Spiders.TOC:
        from moltspider.spiders.toc import TocSpider
        spider = TocSpider
        schema_name = Schemas.TOC_PAGE
    else:
        raise Exception('No support for spider "%s"\'s cache page' %
                        spider_name)

    cachedresponse = storage.retrieve_response(spider, spider_req)
    if cachedresponse:
        body = cachedresponse.body
        content_encoding = cachedresponse.headers.getlist('Content-Encoding')
        if content_encoding:
            encoding = content_encoding.pop()
            if encoding == b'gzip' or encoding == b'x-gzip':
                body = gunzip(body)
            elif encoding == b'deflate':
                try:
                    body = zlib.decompress(body)
                except zlib.error:
                    # ugly hack to work with raw deflate content that may
                    # be sent by microsoft servers. For more information, see:
                    # http://carsten.codimi.de/gzip.yaws/
                    # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
                    # http://www.gzip.org/zlib/zlib_faq.html#faq38
                    body = zlib.decompress(body, -15)
            elif encoding == b'br' and b'br' in ACCEPTED_ENCODINGS:
                body = brotli.decompress(body)

    if body:
        if spider_name == Spiders.TOC and aid:
            sb = []
            colspan = 4
            i = 0
            scrapy_resp = HtmlResponse(url)
            scrapy_resp = scrapy_resp.replace(body=body,
                                              encoding=origin_encoding)
            sb.append('<table width="1000px" align="center"><tr>')
            for item in iter_items(spider, scrapy_resp, [
                    site,
            ], schema_name):
                if i % colspan == 0:
                    sb.append('</tr><tr>')
                item['_'] = url_site
                sb.append('<td><a href="%(_)s%(url)s">%(name)s</a></td>' %
                          item)
                del item['_']
                i += 1
            sb.append('</tr></table>')
            body = '\n'.join(sb)
            body = render_template_string(template_page, content=body)
        else:
            body = body.decode(encoding=origin_encoding)
    else:
        body = '%s (%s) not found in cache.' % (url, origin_encoding)

    resp = make_response(body)
    resp.headers['Content-Type'] = 'text/html; charset=utf-8'
    return resp
Example #10
def load_response(url, filename):
    input_path = os.path.join(os.path.dirname(__file__), '_tests', filename)
    with open(input_path, 'rb') as input_file:
        return HtmlResponse(url, body=input_file.read())
Example #11
    def process_request(self, request, spider):
        if 'origin' in request.meta or spider.errback_status is True:
            while True:
                # Reset the proxy to None every 10 minutes
                now = time.time()
                if now - self.run_time > 60 * 10:
                    self.close_err_driver(spider, 'set proxy is None')
                    self.proxy = None
                    self.run_time = now
                # Initialise the browser
                if not self.driver:
                    self.init_browser(spider)
                try:
                    # Load the initial page in the browser
                    self.driver.get(request.url)

                    if 'err504' in self.driver.current_url:
                        self.close_err_driver(spider, 'page err504')
                        # Switch to a new proxy
                        self.proxy = _get_proxy(spider)
                        continue
                    # Wait for the data to appear
                    WebDriverWait(self.driver, 40).until(
                        EC.presence_of_element_located(
                            (By.XPATH, '//*[@id="availabilityForm"]')))
                except:
                    traceback.print_exc()
                    # Switch to a new proxy
                    self.proxy = _get_proxy(spider)
                    self.close_err_driver(
                        spider, 'browser error, 3 seconds after the restart.')
                else:
                    break

            # Copy the browser cookies onto the request
            self.cookies = self.driver.get_cookies()
            request.cookies = self.cookies
            request.meta['proxy'] = self.proxy

            # Clear the browser cookies
            spider.log('delete_all_cookies', 20)
            self.driver.delete_all_cookies()

            # Build a custom Response object
            current_url = self.driver.current_url
            body = self.driver.page_source.encode('utf-8')
            response = HtmlResponse(current_url, body=body, request=request)

            # # Commented out: the browser is no longer closed periodically
            # if self.num % 6 == 0:
            #     spider.log('browser closed.', 20)
            #     self.driver.close()
            #     self.driver = None
            #
            # self.num += 1

            spider.errback_status = False
            return response
        else:
            request.meta['proxy'] = self.proxy
            request.cookies = self.cookies
Example #12
    def process_request(self, request, spider):
        if spider.name == 'uo' and 'origin' in request.meta and request.meta[
                'origin'] == 1:

            meta = request.meta
            _from = meta['_from']
            _to = meta['_to']

            while True:
                # Initialise the browser
                if not self.driver:
                    self.init_browser(spider)
                try:
                    # Load the initial page in the browser
                    self.driver.get(request.url)

                    # Handle the slider captcha
                    if u"Please slide to verify that you're not a robot" in self.driver.page_source:
                        self.driver.maximize_window()
                        action = ActionChains(self.driver)
                        slider = self.driver.find_element_by_id('nc_1_n1z')
                        action.drag_and_drop_by_offset(slider, 233.51,
                                                       0).perform()

                        spider.log(
                            "Please slide to verify that you're not a robot",
                            40)
                        self.driver.delete_all_cookies()
                        try:
                            self.driver.close()
                            self.driver = None
                        except:
                            self.driver = None
                        time.sleep(8)
                        continue

                    else:
                        # Ajax request
                        comment = '''var data=%s;
                           $.post("#", data, function(){
                           window.location.href = "/en-US/select?origin=%s&destination=%s"
                           })''' % (request.body, _from, _to)
                        self.driver.execute_script(comment)

                        # Wait for the data to appear
                        WebDriverWait(self.driver, 50).until(
                            EC.presence_of_element_located(
                                (By.XPATH,
                                 '//*[@name="__RequestVerificationToken"]')))
                        break
                except:
                    traceback.print_exc()
                    spider.log('browser error, 8 seconds after the restart.',
                               40)
                    try:
                        self.driver.close()
                        self.driver = None
                    except:
                        self.driver = None
                    time.sleep(8)

            # Build a custom Response object
            current_url = self.driver.current_url
            body = self.driver.page_source.encode('utf-8')
            response = HtmlResponse(current_url, body=body)

            # Set cookies from the browser
            request.cookies = self.driver.get_cookies()

            # Set the Referer header
            request.headers.setdefault('Referer', current_url)

            # Comment this out to stop closing the browser periodically
            if self.num % 10 == 0:
                spider.log('browser closed.', 20)
                self.driver.close()
                self.driver = None

            self.num += 1

            # Clear the browser cookies
            spider.log('delete_all_cookies', 20)
            self.driver.delete_all_cookies()
            return response
Example #13
    def process_request(self, request, spider):
        if spider.name == 'lx' and 'origin' in request.meta:
            while True:
                # Initialise the browser
                if not self.driver:
                    self.init_browser(spider)
                try:
                    # Load the initial page in the browser
                    self.driver.get(request.url)

                    if 'Schedule' in self.driver.current_url:
                        print('This route is operated together with other airlines.')
                        return None
                    if 'False' in self.driver.current_url:
                        print('No flights could be found for the selected route.')
                        return None
                    if 'distil_r_captcha' in self.driver.current_url:
                        spider.log('Captcha appeared, switching IP......')
                        if spider.proxy:
                            self.proxy = _get_proxy(spider)
                    # Wait for the data to appear
                    WebDriverWait(self.driver, 50).until(
                        EC.presence_of_element_located((By.ID, 'frm-matrix')))
                except:
                    spider.log('browser error, 8 seconds after the restart.',
                               40)
                    spider.log('exception, changing proxy....')
                    if spider.proxy:
                        self.proxy = _get_proxy(spider)
                    # traceback.print_exc()
                    try:
                        self.driver.close()
                        self.driver = None
                    except:
                        self.driver = None
                    time.sleep(8)
                else:
                    break

            # Build a custom Response object
            current_url = self.driver.current_url
            body = self.driver.page_source.encode('utf-8')
            response = HtmlResponse(current_url, body=body)

            # Set the proxy and cookies
            if spider.proxy:
                request.meta["proxy"] = self.proxy
            request.cookies = self.driver.get_cookies()

            # Set the Referer header
            request.headers.setdefault('Referer', current_url)

            # Clear the browser cookies
            self.driver.delete_all_cookies()
            spider.log('delete_all_cookies', 20)
            # Pause the crawler engine
            # spider.crawler.engine.pause()

            # Comment this out to stop closing the browser periodically
            if self.num % 6 == 0:
                spider.log('browser closed.', 20)
                self.driver.close()
                self.driver = None
            self.num += 1

            return response
        elif not spider.is_ok and spider.proxy:
            self.proxy = _get_proxy(spider)
Example #14
    def process_request(self, request, spider):
        # ua = UserAgent()
        # agent = ua.chrome
        # f = faker.Faker(locale='zh_cn')
        # agent = f.user_agent()
        agent = random.choice(useragent)
        same = get_lxsdk_cuid(agent)
        cook = '_lxsdk_cuid={}; _lxsdk={}; _hc.v={}; _lxsdk_s={}'.format(
            same, same, get_hc(), get_lxsdk_s())
        # cook = '_lxsdk_cuid={}; _lxsdk={}; _hc.v={}; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_s={}'.format(same,same,get_hc(),get_lxsdk_s())
        cook1 = 'cy=1236; cityid=1236; cye=huixian; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=165c6b8e911c8-0383cc5ec3114e-37664109-144000-165c6b8e91289; _lxsdk=165c6b8e911c8-0383cc5ec3114e-37664109-144000-165c6b8e91289; _hc.v=0c84e8b5-c945-5c86-bb54-94e4936012e5.1536637332; s_ViewType=10; cye=beijing; _lxsdk_s=165cb7d7e23-268-18-f1%7C%7C87'
        # print(cook)
        headers = {
            'Host': 'www.dianping.com',
            'Upgrade-Insecure-Requests': '1',
            'Cookie': cook,
            'User-Agent': agent,
            # 'Proxy-Connection': 'keep-alive'
        }
        proxyHost = "http-dyn.abuyun.com"
        proxyPort = "9020"
        # Proxy tunnel authentication credentials
        proxyUser = "******"
        proxyPass = "******"
        proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxyHost,
            "port": proxyPort,
            "user": proxyUser,
            "pass": proxyPass,
        }

        proxies = {
            # "http": proxyMeta,
            "https": proxyMeta,
        }
        proxiess = {
            "https": "http://140.255.6.45:5649",
            # "https": "http://118.79.54.90:6996",
            # "https": "http://117.42.201.221:6214",

        }

        import requests
        #s = requests.Session()
        #base = 'https://www.dianping.com/'
        try:
            # start_url = requests.get(base, headers=headers, proxies=proxies, timeout=15)
            # print(start_url.text)
            res = requests.get(request.url, headers=headers, proxies=proxies, timeout=15)
            if res.status_code != 200 or len(res.text) < 560:
                if res.status_code == 403 or res.status_code == 404:
                    content = 'page not accessible'
                else:
                    content = res.text
                print('URL: {}, status code: {}, content: {}'.format(request.url, res.status_code, content))
                key = getattr(spider, 'redis_key')
                db = RedisClient()
                print('URL {} needs to be re-queued'.format(request.url))
                db.add_value(key, request.url)
                raise IgnoreRequest
            else:
                from scrapy.http.response.html import HtmlResponse
                body = res.content.decode('utf-8')
                # print(body)
                response = HtmlResponse(url=request.url, body=body, encoding="utf-8", request=request)
                return response
        except Exception as e:
            print('An error occurred: {}'.format(e.args))
            key = getattr(spider, 'redis_key')
            db = RedisClient()
            print('URL {} needs to be re-queued'.format(request.url))
            db.add_value(key, request.url)
            raise IgnoreRequest
Example #15
 def test_display_selector_without_return(self):
     selector = HtmlResponse(body=b'<html>test</html>',
                             url='http://some.url',
                             status=200).xpath('//html')
     resp = display(selector)
     self.assertIsNone(resp)
Example #16
def open_page(name):
    return HtmlResponse(url=name, body=open_spec(name), encoding='utf-8')
Example #17
    def build_links(self):
        """ Build a complete list of links from html in elasticsearch """
        def binary_search(array, key, low, high):
            """ Fast search in a sorted array """
            if low > high:  # termination case
                return -1
            middle = (low + high) // 2  # integer (floor) division so it can be used as an index
            if array[middle] == key:   # if the middle is our key
                return middle
            elif key < array[middle]:  # our key might be in the left sub-array
                return binary_search(array, key, low, middle-1)
            else:                      # our key might be in the right sub-array
                return binary_search(array, key, middle+1, high)

        es_obj = ElasticSearchPipeline.from_crawler(self.crawler).es
        new_links = []
        hashes = sorted([
            url['_id'] for url in scan(
                es_obj,
                query={
                    "query": {
                        "exists": {
                            "field": "url"
                        }
                    }
                },
                index=self.es_index,
                doc_type=self.settings['ELASTICSEARCH_TYPE'],
                _source_exclude=["*"])
        ])
        urls_iter = scan(
            es_obj,
            query={
                "query": {
                    "exists": {
                        "field": "content"
                    }
                }
            },
            index=self.es_index,
            doc_type=self.settings['ELASTICSEARCH_TYPE'],
            _source_include=["content", "url"]
        )

        for hit in urls_iter:
            id_ = hit['_id']
            url = hit['_source']['url']
            content = hit['_source']['content']
            try:
                #response = HtmlResponse(url, encoding="utf-8", body=content)
                response = HtmlResponse(url, body=content)
                for request in self._requests_to_follow(response):
                    hash_target = hashlib.sha1(request.url.encode('utf-8')).hexdigest()
                    if binary_search(hashes, hash_target, 0, len(hashes)-1) < 0:
                        continue
                    new_links.append((id_,
                                      hash_target))
            except TypeError:
                pass

        return new_links
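The recursive binary_search above is a hand-rolled membership test over the sorted hash list; the standard-library bisect module expresses the same lookup more compactly. An equivalent sketch (not part of the original code):

import bisect

def hash_in_sorted(hashes, key):
    # Membership test on a sorted list, equivalent to the recursive binary_search above.
    i = bisect.bisect_left(hashes, key)
    return i < len(hashes) and hashes[i] == key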
Example #18
def body2hxs(ctx, encoding='utf-8', url='http://localhost'):
    r = HtmlResponse(url, body=ctx, encoding=encoding)
    return HtmlXPathSelector(r)
Example #19
    def parse(self, response):
        # meta = response.meta
        # _from = meta.get('_from')
        # _to = meta.get('_to')
        # _date = meta.get('_date')

        res = json.loads(response.text)
        if 'origin' in response.meta:
            multiDayAvailabilityOutbound = res['multiDayAvailabilityOutbound']
            r = HtmlResponse(url=self.start_urls[0],
                             body=multiDayAvailabilityOutbound.encode('utf-8'))
            __RequestVerificationToken = r.xpath(
                '//div[@class="animation-container"]//input[@name="__RequestVerificationToken"]/@value'
            ).extract_first()
            li_days = r.xpath(
                '//*[@class="HV-gc bulletless days"]/li/div[@class="day day-with-availability"]'
            )
            for li in li_days:
                date_date = li.xpath('@data-date').extract_first()
                body = {
                    'selectSingleDayAvailability.JourneyType':
                    'OutboundFlight',
                    'selectSingleDayAvailability.Date.DateToParse':
                    date_date[:10],
                    'selectSingleDayAvailability.AutoSelect': False,
                    '__RequestVerificationToken': __RequestVerificationToken
                }
                yield scrapy.Request(
                    self.select_url,
                    method='POST',
                    body=parse.urlencode(body),
                    headers=response.request.headers,
                    cookies=response.request.cookies,
                )

        else:
            SingleDayOutbound = res['SingleDayOutbound']
            html = HtmlResponse(url='', body=SingleDayOutbound.encode('utf-8'))
            buttons = html.xpath('//button[@class="flight-result-button"]')
            for button in buttons:
                # Airports
                button_value = button.xpath('@value').extract_first()
                dep_airport, arr_airport = re.findall(r'~(\w{3})~',
                                                      button_value)[:2]
                fromCity = self.city_airport.get(dep_airport, dep_airport)
                toCity = self.city_airport.get(arr_airport, arr_airport)
                # Times
                div_times = button.xpath('div[@class="times"]')
                departure = div_times.xpath(
                    'time[@class="departure"]/@datetime').extract_first()
                departure_time = div_times.xpath(
                    'time[@class="departure"]/text()').extract_first().strip()
                dep_date = "%s %s:00" % (departure[:10], departure_time)
                arrival = div_times.xpath(
                    'time[@class="arrival"]/@datetime').extract_first()
                arrival_time = div_times.xpath(
                    'time[@class="arrival"]/text()').extract_first().strip()
                arr_date = "%s %s:00" % (arrival[:10], arrival_time)
                # Flight number
                details = button.xpath('div[@class="details"]')
                flight_number_list = details.xpath(
                    'ul/li[@class="flight-number"]/text()').extract()
                flight_number = flight_number_list[1].strip()
                # Price
                actions = button.xpath('div[@class="actions"]')
                price_div = actions.xpath('div[contains(@class, "price")]')
                currency = price_div.xpath(
                    'span[@class="currency"]/text()').extract_first().strip()
                currency = self.currency_cache.get(currency, currency)
                price = price_div.xpath('text()[2]').extract_first().strip()

                item = FlightsItem()
                item.update(
                    dict(
                        flightNumber=flight_number,  # flight number
                        depTime=int(
                            time.mktime(
                                time.strptime(dep_date,
                                              "%d/%m/%Y %H:%M:%S"))),  # departure time
                        arrTime=int(
                            time.mktime(
                                time.strptime(arr_date,
                                              "%d/%m/%Y %H:%M:%S"))),  # arrival time
                        fromCity=fromCity,  # departure city
                        toCity=toCity,  # arrival city
                        depAirport=dep_airport,  # departure airport
                        arrAirport=arr_airport,  # arrival airport
                        currency=currency,  # currency type
                        adultPrice=float(price),  # adult fare
                        adultTax=0,  # tax
                        netFare=float(price),  # net fare
                        maxSeats=3,  # bookable seats
                        cabin='E',  # cabin class
                        carrier=flight_number[:2],  # carrier code
                        isChange=1,  # transfer flag: 1 = direct, 2 = transfer
                        segments="NULL",  # per-segment flight info for transfers
                        getTime=int(time.time()),
                    ))

                yield item