Example #1
 def video_page(self, response: HtmlResponse):
     video_title = response.css('h1.title').css('span::text').get()
     video_channel = response.css('div.video-actions-container').css(
         'div.usernameWrap.clearfix').css('a::text').get()
     js = response.css('div.video-wrapper').css('#player').css(
         'script').get()
     data_video_id = response.css('div.video-wrapper').css(
         '#player::attr(data-video-id)').get()
     prepare_js = js.split('<script type="text/javascript">')[1].split(
         'loadScriptUniqueId')[0]
     exec_js = '{0}\nqualityItems_{1};'.format(prepare_js, data_video_id)
     js_result = js2py.eval_js(exec_js)  # type: js2py.base.JsObjectWrapper
     quality_items = js_result.to_list()  # type: list
     quality = quality_items[-1]['text'].split('p')[0]
     if int(quality) >= 720:
         video_url = quality_items[-1]['url']
         self.logger.info('parse [%s] success, url: %s', video_title,
                          video_url)
         if self.settings.get('ENABLE_SQL'):
             result = self.data_base.select_all_by_title_my_follow(
                 video_title)
             if len(result) != 0:
                 for line in result:
                     self.logger.error('has duplicate record: %s', line)
             else:
                 self.data_base.save_my_follow(video_title, video_channel,
                                               video_url, response.url)
         yield PornhubItem(file_urls=video_url,
                           file_name=video_title,
                           file_channel=video_channel)
Example #2
    def get_desc(cls, value):
        try:
            # Build the description API URL for this course value.
            url_desc = ingram.desc_api % value

            # Fetch the page and wrap it in an HtmlResponse so XPath can be used on it.
            data = urllib.urlopen(url_desc)
            response = HtmlResponse(url=url_desc, body=data.read())

            data = None
            all_xpath = [
                "substring-before(//div[@class='training_details_content'],'Language')",
                "substring-before(//div[@class='training_details_content'],'Please bring your')",
                "//div[@class='training_details_content']/p/text()",
                "//div[@class='training_details_content']/div[1]/text()",
                "//div[@class='training_details_content']/span/text()",
                "//div[@class='training_details_content']//text()"
            ]

            for xpath in all_xpath:
                data = response.xpath(xpath).extract()
                desc = html_to_text(data)
                if desc:
                    return desc

            return None

        except Exception:
            # Swallow any fetching/parsing error and fall back to returning nothing.
            return None
Example #3
    def most_popular_page(self, response: HtmlResponse):
        description_list = response.css('div.descriptionContainer')
        for item in description_list:
            title = item.css('a::text').extract_first()
            sub_link = item.css('a::attr(href)').extract_first()
            channel_url = response.urljoin(sub_link)
            self.logger.warning('get channel:{0} ,link is:{1}'.format(
                title, channel_url))
            yield scrapy.Request(channel_url,
                                 callback=self.channel_page_see_all)

        # determine has next page
        next_page_li = response.css('li.page.next.wl-page')
        if next_page_li:
            next_page_sub_link = next_page_li.css(
                'a::attr(href)').extract_first()
            page_number = int(next_page_sub_link.split('page=')[1])
            page_number_start = self.settings.get('PAGE_NUMBER_START')
            page_number_end = self.settings.get('PAGE_NUMBER_END')
            if page_number_end is not None:
                if page_number_start < page_number <= page_number_end:
                    next_page_url = response.urljoin(next_page_sub_link)
                    self.logger.warning(
                        'has next page, url is:{0}'.format(next_page_url))
                    yield scrapy.Request(next_page_url,
                                         callback=self.most_popular_page)
                else:
                    self.logger.warning('has next page, but is in limit')
            else:
                next_page_url = response.urljoin(next_page_sub_link)
                self.logger.warning(
                    'has next page, url is:{0}'.format(next_page_url))
                yield scrapy.Request(next_page_url,
                                     callback=self.most_popular_page)
Example #4
 def test_extract_repeated_field(self):
     sample = {
         'plugins': {'annotations-plugin': {}},
         'url': 'https://stackoverflow.com',
         'original_body': re.sub(
             'data-scrapy-annotate=".*"', '', html_page._body),
         'scrapes': 'default',
         'version': '0.13.0'
     }
     data = _open_spec('so_annotations.json')
     annos, items, results = data['annos'], data['items'], data['results']
     sample['plugins']['annotations-plugin']['extracts'] = annos
     spider = IblSpider('so', _spider(sample=sample),
                        items, {}, Settings())
     page = HtmlResponse('http://url', body=sample['original_body'],
                         encoding='utf-8')
     items = [i for i in spider.parse(page) if not isinstance(i, Request)]
     keys = {(u'_index', u'_template', u'_type', u'answered', u'tags',
              u'title', 'url')}
     self.assertEqual({tuple(sorted(i.keys())) for i in items}, keys)
     self.assertEqual(items[0], results[0])
     self.assertEqual(items[52], results[1])
     self.assertEqual(items[-1], results[2])
     self.assertEqual(len(items), 96)
     data = _open_spec('autoevolution.json')
     schemas = data['schemas']
     results = data['results']
     page = HtmlResponse('http://url', body=data['original_body'],
                         encoding='utf-8')
     spider = IblSpider('ae', _spider(sample=data), schemas, {}, Settings())
     items = [i for i in spider.parse(page) if not isinstance(i, Request)]
     self.assertEqual(items, results)
Example #5
    def process_request(self, request, spider):
        try:
            self.browser.get(request.url)
            time.sleep(1)
            if request.url == "http://www.jianshu_selenium.com/":
                # Scroll down in steps so lazily loaded content gets rendered.
                for i in range(20):
                    js = 'window.scrollTo(0,%s)' % (i * 300)
                    self.browser.execute_script(js)
                    time.sleep(0.5)

            # while self.browser.execute_script('alert("To Bottom")'):
            #     self.browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            #     self.browser.execute_script('alert("To Bottom")')
            #     time.sleep(1)

            try:
                # Keep clicking the "load more" button; find_element_by_class_name
                # raises once the button is gone, which ends the loop via the
                # except clause below.
                while True:
                    show_more = self.browser.find_element_by_class_name('load-more')
                    show_more.click()
                    time.sleep(1)
            except Exception:
                pass
            return HtmlResponse(url=request.url, body=self.browser.page_source, request=request, encoding='utf-8',
                                status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)
Example #6
    def page(cls, response: HtmlResponse):

        current = url_query(response.url)
        current_page = int(current['p'])
        tag = current['word']
        item_type = response.meta['item_type']
        _search = demjson.decode(response.text)['body'][item_type['type']]
        _pages = math.ceil(_search['total'] / item_type['page_count'])
        cls.spider_log.info(
            "Search :%s Type:%s Total :%s Pages: %s Current :%s" %
            (tag, item_type['type'], _search['total'], _pages, current_page))

        _datas = _search['data']
        response.meta['word'] = tag
        _cls = cls
        _space = cls.settings().get('FILES_STORE')

        def _filter(id):
            _database = os.path.join(_space, response.meta['group'],
                                     '%s_main.db' % cls.script_name())
            cls.space.set(_database, MainSpace.space(_database))
            _has = _cls.space.get(_database).skip_complete({'id': id})
            if _has is True:
                _cls.spider_log.info("Skip Item :%s" % str(id))
            return _has

        for _data in _datas:

            if _filter(_data['id']):
                continue

            if item_type['type'] in ['manga', 'illust']:
                artworks = "https://www.pixiv.net/ajax/illust/%s" % _data['id']
                referer = 'https://www.pixiv.net/artworks/%s' % _data['id']
                cls.spider_log.info("Illust Title :%s" % _data['title'])
                author_item = AuthorItem()

                author_item['id'] = _data['userId']
                author_item['name'] = _data['userName']

                response.meta['author'] = author_item
                yield Request(url=artworks,
                              callback=cls.illust_detail,
                              meta=response.meta,
                              headers={'Referer': referer})

            if item_type['type'] in ['novel']:
                _novel_url = "https://www.pixiv.net/ajax/novel/%s" % _data['id']
                cls.spider_log.info("Novel Title :%s" % _data['title'])
                author_item = AuthorItem()
                author_item['id'] = _data['userId']
                author_item['name'] = _data['userName']
                response.meta['author'] = author_item
                yield Request(url=_novel_url,
                              callback=cls.novels_metas,
                              meta=response.meta)
        if current_page < _pages:
            _item_url = "https://www.pixiv.net/ajax/search/%s/%s?word=%s&order=date_d&mode=all&p=%s&s_mode=s_tag_full&lang=zh" % (
                item_type['url'], tag, tag, current_page + 1)
            yield Request(url=_item_url, callback=cls.page, meta=response.meta)
Example #7
File: gov.py Project: MRPTY1/Khala
 def parse(self, response: HtmlResponse, **kwargs):
     url = 'https://www.news.gov.hk/jsp/NewsArticle.jsp'
     new_url = 'https://sc.news.gov.hk/TuniS/www.news.gov.hk/jsp/NewsArticle.jsp'
     category_list = [
         'finance', 'school_work', 'health', 'environment', 'law_order',
         'infrastructure', 'admin', 'city_life', 'record'
     ]
     language_list = ['eng', 'chi']
     params = {
         'language': 'chi',
         'category': 'finance',
         'date': '',
     }
     for date in get_date():
         for category in category_list:
             if date == '202102':
                 break
             for language in language_list:
                 params['date'] = date
                 params['language'] = language
                 params['category'] = category
                 yield response.follow(url=url + '?' + urlencode(params),
                                       callback=self.get_news_list)
             params['language'] = 'chi'
             yield response.follow(url=new_url + '?' + urlencode(params),
                                   callback=self.get_news_list)
Example #8
 def model_page(self, response: HtmlResponse):
     video_sum_element = response.css('div.showingInfo').css(
         'span.totalSpan')
     # some performers do not show a total video count
     page_number = 1
     if video_sum_element:
         video_sum = video_sum_element.css('::text').get()
         sum_number = int(video_sum)
         page_number = math.ceil(sum_number / 40)
     # if the url contains 'page', all videos are already listed; likewise if there is only one page: start parsing
     if 'page' in response.url or page_number == 1:
         li_list = response.css('div.videoUList').css('ul').css('li')
         for li_tag in li_list:  # type: SelectorList
             a_tag = li_tag.css('span.title').css('a')
             video_title = a_tag.css('::text').get()
             video_url = a_tag.css('::attr(href)').get()
             real_url = 'https://www.pornhubpremium.com' + video_url
             self.logger.info('send [%s] ,url: %s', video_title, video_url)
             yield scrapy.Request(real_url,
                                  callback=self.video_page,
                                  priority=100)
     else:
         # the url has no 'page' parameter and there is more than one page: request the paged url so all videos are listed
         new_link = '{0}?page={1}'.format(response.url, page_number)
         yield scrapy.Request(new_link,
                              callback=self.model_page,
                              priority=10)
Example #9
    def _create_product_data_dictionary(
        self,
        response: HtmlResponse,
        name: str,
        brand: Optional[str] = None,
        model_number: Optional[str] = None,
        upc: Optional[str] = None,
        data: Optional[Dict] = None,
    ) -> Dict:
        breadcrumbs = response.css('ul.nav.breadcrumb \
                > li[itemtype="http://data-vocabulary.org/Breadcrumb"] \
                > a[itemprop="url"] \
                > span[itemprop="title"]::text').getall()

        item = product_data_item_loader \
            .ProductDataItemLoader(response=response) \
            .add_language_data(
                response=response,
                brand=brand,
                images=response.css(
                    'meta[property="og:image"]::attr(content)'
                ).extract(),
                name=name,
                url=response.url,
                breadcrumbs=breadcrumbs
            ).add_sku(sku=upc) \
            .add_upc(response=response, upc=upc) \
            .add_store_id(store_id=self.store_id) \
            .add_sold_by(sold_by=self.sold_by) \
            .add_version(version=self.version) \
            .load_item()

        return item.get_dictionary()
Example #10
    def parse_task(self, response: HtmlResponse, subsection='empty'):
        # Source
        task_name = response.css('table.viewingtable div.componentboxheader::text').extract_first().strip()
        source = TaskSourceItem()
        source['name'] = f'{task_name} (problems.ru)'
        source['url'] = response.url

        content = response.css('table.viewingtable .componentboxcontents')

        # Themes
        info = content.css('table.problemdetailscaptiontable')
        themes = [theme.strip() for theme in info.css('.problemdetailssubject .problemdetailssubjecttablecell a.componentboxlink::text').extract()]

        # Grades
        _, grades = info.css('.problemdetailsdifficulty nobr::text').extract()
        grades = list(map(lambda n: int(n), re.findall(r'\d+', grades)))

        # Task
        task_dict, image_urls, tex_used = self.extract_task(content, response)

        yield ParseResultItem(
            source=source,
            themes=themes,
            grades=grades,
            task=task_dict,
            section=SECTION,
            subsection=subsection,
            image_urls=image_urls,
            tex_used=tex_used
        )
Example #11
    def parse_region(self, response: HtmlResponse):
        """Parse regions.

        Nordbayern -> Frankenjura Nord

        Example: https://www.frankenjura.com/klettern/region/2
        """
        item = SectorItem()
        item["name"] = response.meta["region_name"]
        item["fk_sector"] = response.meta["parent"]
        item["source"] = response.url
        item["description"] = response.css(
            'div[class="location-head"]+p ::text').get()
        yield item

        region = item.django_model.objects.get(**item)

        sub_regions = response.css('div[class="column"]').css(
            'a[href*="region"]')
        for sub_region in sub_regions:
            meta = {
                "sub_region_name": sub_region.css("::text").get(),
                "parent": region
            }
            yield response.follow(sub_region, self.parse_sub_region, meta=meta)
Example #12
    def detail_xpath(response: HtmlResponse):
        data = JobDetail()
        url = response.url
        job_top_detail = response.xpath("//div[@class='cn']")
        job_name = job_top_detail.xpath("./h1/@title").extract()[0]
        job_salary = job_top_detail.xpath("./strong//text()").extract()[0]
        job_company = job_top_detail.xpath(
            "./p[@class='cname']/a/@title").extract()[0]
        job_tag = job_top_detail.xpath(
            "./p[contains(@class,'msg')]/@title").extract()[0]

        job_position_information = response.xpath(
            "//div[contains(@class,'bmsg job_msg')]/p//text()").extract()
        job_position_information: str = ''.join(job_position_information)

        company_detail = response.xpath("//div[@class='com_tag']")
        company_category = company_detail.xpath("./p[1]/@title").extract()[0]
        company_number_of_people = company_detail.xpath(
            "./p[2]/@title").extract()[0]
        company_tag = company_detail.xpath("./p[3]/@title").extract()[0]

        data['url'] = url
        data['job_name'] = job_name.replace('\xa0', '')
        data['job_salary'] = job_salary.replace('\xa0', '')
        data['job_company'] = job_company.replace('\xa0', '')
        data['job_tag'] = job_tag.replace('\xa0', '')
        data['job_position_information'] = job_position_information.replace(
            '\r\n', '').replace('\xa0', '')
        data['company_category'] = company_category.replace('\xa0', '')
        data['company_number_of_people'] = company_number_of_people.replace(
            '\xa0', '')
        data['company_tag'] = company_tag.replace('\xa0', '')
        return data
Example #13
    def parse(self, response):
        '''Extract the total number of pages in each category and dispatch a request for every page.'''
        #print "[url: %s || status: %s]"%(response.url,response.status)
        retitem = ExporterItem()
        urlprefix = "http://mp.aiweibang.com/asyn/categoryarticleList?uid=311487&cid=75967&pageindex="
        page_num = 1

        driver = webdriver.PhantomJS()
        #essay_urls=[]

        while True:
            _url = urlprefix + str(page_num)
            driver.get(_url)
            resp = HR("", 200, {}, driver.page_source.encode(
                "utf8"))  # wrap the fetched content in an HtmlResponse purely to reuse its XPath support
            dic = eval(resp.xpath("//pre/text()").extract()[0])
            if len(dic["list"]) == 0:
                break
            else:
                for i in dic["list"]:
                    essay = i["url"]
                    if essay.split("&sn")[0] not in self.all_urls:
                        retitem.set_record(essay)
            page_num += 1

        return retitem
Example #14
    def parse(self, response: HtmlResponse):
        log.info(f"Parsing {response.url}")

        title = response.xpath("//title/text()").extract_first()
        log.info(f"Visiting {title}")

        start_url = TestSpider.start_urls[0]
        p_url = f"{start_url}?p="

        for link_href in response.xpath("//link[@href]"):
            url = link_href.xpath("@href").extract_first()

            if p_url in url:
                log.info(f"Recording page ID URL: {url}")
                yield {
                    "title": title,
                    "long_url": response.url,
                    "short_url": url
                }

        # https://github.com/dkmiller/tidbits/blob/graph-algorithms/2020/2020-12-15_graph-algorithms/Graph.Algorithms/Web.cs
        for link in response.xpath("//a[@href]"):
            url = link.xpath("@href").extract_first()

            if start_url in url and ("#comment-" not in url) and ("mailto:"
                                                                  not in url):
                log.info(f"Queuing {url} to visit.")
                yield scrapy.Request(url, callback=self.parse)
Example #15
    def parse_wall(self, response: HtmlResponse):
        """Parse walls.

        ... -> Region Wattendorf -> Falkenwand

        Example: https://www.frankenjura.com/klettern/poi/21
        """
        item = SectorItem()
        item["name"] = response.meta["wall_name"]
        item["fk_sector"] = response.meta["parent"]
        item["source"] = response.url
        item["internal_rating"] = _parse_stars(response)
        item["max_height_in_m"] = _parse_wall_max_height(response)
        item["rain_protected"] = _parse_rain_protected(response)
        item["child_friendly"] = _parse_child_friendly(response)
        item["description"] = _parse_wall_description(response)
        item["approach"] = _parse_wall_approach(response)
        item["approach_road"] = _parse_wall_approach_road(response)
        item["fk_orientation"] = _parse_orientation(response)
        item["latitude"], item["longitude"] = _parse_lat_lon(response)
        yield item

        wall = item.django_model.objects.get(name=item["name"],
                                             fk_sector=item["fk_sector"])

        routes = response.css('div[class="poi-link-container"]').css("a")
        for route in routes:
            meta = {"route_name": route.css("::text").get(), "parent": wall}
            yield response.follow(route, self.parse_route, meta=meta)
Example #16
    def parse(self, response: HtmlResponse):
        print(type(response), '+++++++++++++++++++++++++')
        print(response.encoding)
        print(response.status)

        with open('books.html', 'w', encoding='utf8') as f:
            f.write(response.text)

        subjects = response.xpath('//li[@class="subject-item"]')
        for subject in subjects:
            item = DoubanbookItem()

            title = subject.xpath('.//h2/a/text()').extract_first()
            item['title'] = title.strip()

            rate = subject.xpath(
                './/span[@class="rating_nums"]/text()').extract_first()
            item['rate'] = rate

            publish = subject.xpath(
                './/div[@class="pub"]/text()').extract_first()
            item['publish'] = publish.strip()
            yield item

        for i in range(2):
            next_pag = response.xpath(
                '//div[@class="paginator"]/a/@href').extract_first()
            url = response.urljoin(next_pag)
            yield scrapy.Request(url=url, callback=self.parse)
Example #17
    def parse_category(self, response: HtmlResponse) -> HtmlResponse:
        """
            List category and traverse product pages.
        """
        products_query = response.css(
            "section#bc-sf-filter-products > div.product-grid-item")
        if not products_query:
            raise IgnoreRequest('Product items not found')
        self.logger.info(
            f'parse product_categories len: {len(products_query)}')

        for pdp in products_query.css('div.product-grid-item'):
            item_loader = ProductLoader(item=UrgeItem(), selector=pdp)
            item_loader.add_css('product_name',
                                'div.product-text > p.title::text')
            item_loader.add_css('product_brand',
                                'div.product-text > h2.vendor.h5::text')
            # get regular product price through OR (,).
            item_loader.add_css(
                'product_price',
                'div.product-text p.price s::text , span[itemprop="price"]::text'
            )
            item_loader.add_css(
                'product_sale_price',
                'div.product-text p.sale span[itemprop="price"]::text')
            if 'href' in pdp.css('a').attrib:
                product_url = pdp.css('a').attrib['href']
                yield response.follow(product_url,
                                      callback=self.product_page,
                                      meta={'item': item_loader.load_item()})
Example #18
    def parse_video_page(self, response: HtmlResponse):
        self.logger.warn('start parsing the real video page: {0}'.format(response.url))
        title = response.css('#viewvideo-title::text').extract_first().strip()
        author = response.css('a[href*="uprofile.php"]').css(
            'span::text').extract_first().strip()
        # some videos share the same title and author; only the viewkey in the url differs
        view_key = response.url.split('viewkey=')[1].split('&')[0]
        # a '/' in the title would be treated as a directory separator, so strip it
        if '/' in title:
            title = title.replace('/', '')

        encrypted_url = response.css('video').extract_first().split(
            'strencode("')[1].split('"))')[0]
        first_encrypted = encrypted_url.split('"')[0]
        second_encrypted = encrypted_url.split('"')[2]
        video_link = ParseRealUrl.get_url(first_encrypted, second_encrypted)

        if video_link:
            # normalise urls like http://185.38.13.130//mp43/2998... that contain a doubled slash
            video_link_list = video_link.split('//')
            real_video_link = video_link_list[0] + '//' + video_link_list[
                1] + '/' + video_link_list[2]
            self.logger.warn('got the download link, pushing it into the download queue')
            down_file_name = title + '-' + author + '-' + view_key
            yield DownloadVideoItem(file_urls=real_video_link,
                                    file_name=down_file_name)
            self.logger.warn('queued for download, updating the database')
            yield UpdateMovieLinkItem(movie_page_url=response.url,
                                      movie_real_url=real_video_link)
        else:
            self.logger.warn('failed to get the video download url, page: {0}'.format(response.url))
Example #19
 def ajax_model_page(self, response: HtmlResponse):
     model_info_list = response.css('li.pcVideoListItem')
     for item in model_info_list:  # type: SelectorList
         video_url = item.css('span.title').css('a::attr(href)').get()
         yield scrapy.Request(response.urljoin(video_url),
                              callback=self.video_page,
                              priority=100)
Example #20
    def process_request(self, request, spider):
        url = request.url
        print(
            "1. process_request(): " + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') + " -> " + url)
        self.driver.get(url)
        source = self.driver.page_source

        if str('currentPage') not in url:
            print("3. if finish process_request(): " + datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S.%f') + " -> " + url)
            # if str('REPORT_NDOC_006051') in url or str('REPORT_NDOC_006010') in url:
            #     print(">>> debug: " + url)
            #     print(source)

            response = HtmlResponse(url=url, body=source, request=request, encoding="utf-8")
            return response

        else:
            next_page = self.driver.find_element_by_xpath(
                "//*[@id='4864']/table/tbody/tr/td/table/tbody/tr/td[8]/a")
            url = str(next_page.find_element_by_xpath("./a").get_attribute('href'))

            print("3. else finish process_request(): " + datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S.%f') + " -> " + url)

            response = HtmlResponse(url=url, body=source, request=request, encoding="utf-8")
            return response
Example #21
 def channel_page(self, response: HtmlResponse):
     video_css = response.css('span.title')
     for item in video_css:
         video_sub_link = item.css('a::attr(href)').extract_first()
         video_url = response.urljoin(video_sub_link)
         self.logger.warning(
             'send to parse real video, url is:{0}'.format(video_url))
         yield scrapy.Request(video_url, callback=self.video_page)
Example #22
    def _create_product_dictionary(
        self,
        response: HtmlResponse,
        data: Optional[Dict] = None,
    ) -> product.Product:
        try:
            upc = (universal_product_code.UniversalProductCode(
                upc=data.get('ProductId').replace('_', ''))).value
        except Exception:
            # TODO: Log issue and return nothing.
            return None

        title1 = response.css('meta[property="og:title"]::attr(content)'
                              ).extract()[0].split('|')[0]
        title2 = response.css('title::text').get()
        name = title1 or title2

        if not name:
            pass  # TODO: Log error and return none.
        elif name == 'Grocery Product' or name == 'Produit épicerie en ligne':
            pass  # TODO: Log error and return none.

        brand = data.get('BrandName')

        if not name:
            pass  # TODO: Log error and return none.

        item_loader = product_item_loader.ProductItemLoader(
            response=response
        ).add_name(
            response=response,
            name=name, # TODO: What about if it's none.
            language=self.language,
        ).add_brand(
            response=response,
            brand=brand, # TODO: What about if it's none.
            language=self.language,
        ).add_upc(response=response, upc=upc) \
        .add_product_data_dictionary(
            product_data_dictionary=self._create_product_data_dictionary(
                response=response,
                data=data,
                name=name,
                brand=brand,
                upc=upc,
            ),
        ).add_offer_dictionary(
            offer_dictionary=self._create_offer_dictionary(
                response=response,
                data=data,
            ),
        ).add_store_dictionary(
            store_dictionary=self._create_store_dictionary(
                response=response,
            ),
        ).add_supported_language(language=self.language)

        return item_loader.load_item()
Example #23
def parse_chapter(response: HtmlResponse):
    title = response.xpath("//div[@class='bookname']/h1//text()").getall()[0].split()
    text = response.xpath("//div[@id='content']//text()").getall()

    chapter_index = re.findall("\\d+", title[0])[0]
    chapter_title = title[1]
    chapter_content = "".join([x.strip() for x in text]).strip()

    yield BookItem(index=chapter_index, title=chapter_title, content=chapter_content)
Example #24
 def get_image_url(cls, response: HtmlResponse) -> Union[str, None]:
     """Extract image url from html response"""
     image_p = response.css("p > img")
     image_figure = response.css("figure > img")
     image_selectors = image_p if image_p else image_figure
     images_re = image_selectors.re(r'src="(http.*?)\"')
     images = [img for img in images_re if img.split(".")[-1] != "svg"]
     sorted_by_length = sorted(images, key=len, reverse=True)
     return sorted_by_length[0] if sorted_by_length else None
Example #25
 def parse_list_of_tasks(self, response: HtmlResponse, max_number=0, next_number=0, step=5, subsection:str = ''):
     task_urls = response.css('.problemsmallnumber .componentboxlink::attr(href)').extract()
     for task_url in task_urls:
         callback = partial(self.parse_task, subsection=subsection)
         yield response.follow(response.urljoin(task_url), callback=callback)
     if next_number < max_number:
         url = set_get_parameter(response.url, 'start', next_number)
         callback = partial(self.parse_list_of_tasks, max_number=max_number, next_number=next_number + step, subsection=subsection)
         yield response.follow(url, callback=callback)
Example #26
 def video_parse(self, response: HtmlResponse, category):
     title = response.css('h2.title.big::text').get()
     for item in response.css('ul.video-downloads-buttons').css('li'):
         if '1080p' in item.css('a::text').get().strip():
             link = item.css('a::attr(href)').get()
             req_cookie = response.request.headers.get('Cookie').decode()
             resp_cookie = response.headers.get('Set-Cookie').decode().split(';')[0]
             yield ArtPornItem(name=title, link=link, category=category,
                               cookie='{0};{1}'.format(req_cookie, resp_cookie))
Example #27
 def parse_next_link(self, response: HtmlResponse) -> str:
     next_page_tag = response.css(
         'a[href*="?category=long&viewtype=basic"]')
     next_link = None
     for item in next_page_tag:
         if '»' == item.css('a::text').extract_first():
             ori_link = item.css('a::attr(href)').extract_first()
             next_link = response.urljoin(ori_link)
     return next_link
Example #28
 def video_parse(self, response: HtmlResponse, category):
     link = response.urljoin(response.css("a.full_download_link[onclick*='mp43000']::attr(href)").get())
     title = ''
     for i in response.css('div.title_bar::text').getall():
         i = i.strip()
         if i:
             title = i
             break
     if link != 'http://www.hotwiferio.com/members/':
         yield HotItem(name=title, link=link, category=category)
Example #29
 def get_contents_list(self, response: HtmlResponse):
     meta = response.meta
     contents_list = response.json().get('list')
     with open('Khala/spider_params/lenovo/language.txt',
               'r+') as languages:
         for language in languages:
             language = language.replace('\n', '')
             for contents in contents_list:
                 url = f'https://pcsupport.lenovo.com/us/{language}/products/{meta["model"]}/solutions/{contents["docid"]}'
                 yield response.follow(url=url, callback=self.out_item)
Example #30
def get_response(url,
                 headers=None,
                 cookies=None,
                 delay=30,
                 response_type="html"):
    num_retries = 0
    response = None
    if cookies is None:
        cookies = {}
    while num_retries < MAX_NUM_RETRY:
        try:
            response = None
            if headers is not None:
                response = requests.get(url,
                                        headers=headers,
                                        timeout=delay,
                                        verify=False,
                                        cookies=cookies)
            else:
                response = requests.get(url,
                                        timeout=delay,
                                        verify=False,
                                        cookies=cookies)
            num_retries += 1
            if response.status_code >= 200:
                if response_type == "html":
                    ret_obj = HtmlResponse(url,
                                           status=response.status_code,
                                           body=response.content,
                                           encoding='utf-8')
                    return ret_obj
                elif response_type == "xml":
                    ret_obj = XmlResponse(url,
                                          status=response.status_code,
                                          body=response.content,
                                          encoding='utf-8')
                    return ret_obj
                else:
                    raise Exception("Invalid response type")

        except Exception as e:
            logging.error("Exception %s" % e)
            num_retries += 1
    logging.error("Could not fetch the url")
    if response_type == "html":
        err_obj = HtmlResponse(url,
                               status=110,
                               body="<html><body>Failure</body></html>",
                               encoding='utf-8')
    else:
        err_obj = XmlResponse(url,
                              status=110,
                              body="<html><body>Failure</body></html>",
                              encoding='utf-8')
    return err_obj
Example #31
    def parse_country_links(self, response: HtmlResponse) -> Request:
        table = response.css(self.config.countries_table)
        all_link_tags = table.css('a')
        country_links = [link.attrib['href'] for link in all_link_tags]

        for country_link in country_links:
            full_country_link = response.urljoin(country_link)
            current_country = country_link.split('/')[1]
            yield scrapy.Request(full_country_link,
                                 callback=self.parse_country,
                                 cb_kwargs={"country": current_country})
Example #32
def test_attrib():
    with open('test.html') as f:
        test_response = HtmlResponse("http://example.com", body=f.read(), encoding='utf-8')
    print(test_response.xpath('/html/body/div[@id="001"]/p/text()').extract())
    print(test_response.xpath('/html/body/div[@id="A001"]/p/text()').extract())
Example #33
def test_basic():
    with open('test.html') as f:
        test_response = HtmlResponse('http://www.example.com', body=f.read(), encoding='utf-8')
    print(test_response.xpath('/html/body/div[1]/p/text()').extract())
    print(test_response.xpath('/html/body1/div[1]/p/text()').extract())
def get_response(filepath, encoding='utf-8'):
    with open(filepath, 'r') as f:
        body = f.read()
    # Response.body is read-only, so pass the body to the constructor instead of assigning it afterwards.
    return HtmlResponse('test', body=body, encoding=encoding)