Example #1
    def most_popular_page(self, response: HtmlResponse):
        description_list = response.css('div.descriptionContainer')
        for item in description_list:
            title = item.css('a::text').extract_first()
            sub_link = item.css('a::attr(href)').extract_first()
            channel_url = response.urljoin(sub_link)
            self.logger.warning('got channel: {0}, link: {1}'.format(
                title, channel_url))
            yield scrapy.Request(channel_url,
                                 callback=self.channel_page_see_all)

        # determine whether there is a next page
        next_page_li = response.css('li.page.next.wl-page')
        if next_page_li:
            next_page_sub_link = next_page_li.css(
                'a::attr(href)').extract_first()
            page_number = int(next_page_sub_link.split('page=')[1])
            page_number_start = self.settings.get('PAGE_NUMBER_START')
            page_number_end = self.settings.get('PAGE_NUMBER_END')
            if (page_number_end is None
                    or page_number_start < page_number <= page_number_end):
                next_page_url = response.urljoin(next_page_sub_link)
                self.logger.warning(
                    'has next page, url is: {0}'.format(next_page_url))
                yield scrapy.Request(next_page_url,
                                     callback=self.most_popular_page)
            else:
                self.logger.warning(
                    'has next page, but it is outside the configured range')
Example #2
    def parse_task(self, response: HtmlResponse, subsection='empty'):
        # Source
        task_name = response.css('table.viewingtable div.componentboxheader::text').extract_first().strip()
        source = TaskSourceItem()
        source['name'] = f'{task_name} (problems.ru)'
        source['url'] = response.url

        content = response.css('table.viewingtable .componentboxcontents')

        # Themes
        info = content.css('table.problemdetailscaptiontable')
        themes = [theme.strip() for theme in info.css('.problemdetailssubject .problemdetailssubjecttablecell a.componentboxlink::text').extract()]

        # Grades
        _, grades = info.css('.problemdetailsdifficulty nobr::text').extract()
        grades = [int(n) for n in re.findall(r'\d+', grades)]

        # Task
        task_dict, image_urls, tex_used = self.extract_task(content, response)

        yield ParseResultItem(
            source=source,
            themes=themes,
            grades=grades,
            task=task_dict,
            section=SECTION,
            subsection=subsection,
            image_urls=image_urls,
            tex_used=tex_used
        )
Example #3
 def video_page(self, response: HtmlResponse):
     video_title = response.css('h1.title').css('span::text').get()
     video_channel = response.css('div.video-actions-container').css(
         'div.usernameWrap.clearfix').css('a::text').get()
     js = response.css('div.video-wrapper').css('#player').css(
         'script').get()
     data_video_id = response.css('div.video-wrapper').css(
         '#player::attr(data-video-id)').get()
     prepare_js = js.split('<script type="text/javascript">')[1].split(
         'loadScriptUniqueId')[0]
     exec_js = '{0}\nqualityItems_{1};'.format(prepare_js, data_video_id)
     js_result = js2py.eval_js(exec_js)  # type: js2py.base.JsObjectWrapper
     quality_items = js_result.to_list()  # type: list
     quality = quality_items[-1]['text'].split('p')[0]
     if int(quality) >= 720:
         video_url = quality_items[-1]['url']
         self.logger.info('parse [%s] success, url: %s', video_title,
                          video_url)
         if self.settings.get('ENABLE_SQL'):
             result = self.data_base.select_all_by_title_my_follow(
                 video_title)
             if len(result) != 0:
                 for line in result:
                     self.logger.error('has duplicate record: %s', line)
             else:
                 self.data_base.save_my_follow(video_title, video_channel,
                                               video_url, response.url)
         yield PornhubItem(file_urls=video_url,
                           file_name=video_title,
                           file_channel=video_channel)
Example #4
    def parse_region(self, response: HtmlResponse):
        """Parse regions.

        Nordbayern -> Frankenjura Nord

        Example: https://www.frankenjura.com/klettern/region/2
        """
        item = SectorItem()
        item["name"] = response.meta["region_name"]
        item["fk_sector"] = response.meta["parent"]
        item["source"] = response.url
        item["description"] = response.css(
            'div[class="location-head"]+p ::text').get()
        yield item

        region = item.django_model.objects.get(**item)

        sub_regions = response.css('div[class="column"]').css(
            'a[href*="region"]')
        for sub_region in sub_regions:
            meta = {
                "sub_region_name": sub_region.css("::text").get(),
                "parent": region
            }
            yield response.follow(sub_region, self.parse_sub_region, meta=meta)
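
The request above hands the freshly stored region to the next callback through meta. A hypothetical sketch of the receiving side (only the "sub_region_name" and "parent" meta keys come from the example; the body is an assumption):

    def parse_sub_region(self, response: HtmlResponse):
        # hypothetical receiver: unpack the meta dict built in parse_region
        name = response.meta["sub_region_name"]
        parent = response.meta["parent"]
        self.logger.debug("sub-region %s (parent: %s)", name, parent)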
Example #5
    def _create_product_data_dictionary(
        self,
        response: HtmlResponse,
        name: str,
        brand: Optional[str] = None,
        model_number: Optional[str] = None,
        upc: Optional[str] = None,
        data: Optional[Dict] = None,
    ) -> Dict:
        breadcrumbs = response.css('ul.nav.breadcrumb \
                > li[itemtype="http://data-vocabulary.org/Breadcrumb"] \
                > a[itemprop="url"] \
                > span[itemprop="title"]::text').getall()

        item = product_data_item_loader \
            .ProductDataItemLoader(response=response) \
            .add_language_data(
                response=response,
                brand=brand,
                images=response.css(
                    'meta[property="og:image"]::attr(content)'
                ).extract(),
                name=name,
                url=response.url,
                breadcrumbs=breadcrumbs
            ).add_sku(sku=upc) \
            .add_upc(response=response, upc=upc) \
            .add_store_id(store_id=self.store_id) \
            .add_sold_by(sold_by=self.sold_by) \
            .add_version(version=self.version) \
            .load_item()

        return item.get_dictionary()
Example #6
    def parse_video_page(self, response: HtmlResponse):
        self.logger.warning('start parsing real video at {0}'.format(response.url))
        title = response.css('#viewvideo-title::text').extract_first().strip()
        author = response.css('a[href*="uprofile.php"]').css(
            'span::text').extract_first().strip()
        # some videos share the same title and author; only the viewkey in the URL differs
        view_key = response.url.split('viewkey=')[1].split('&')[0]
        # titles containing '/' would create extra directories when saving, so strip it
        if '/' in title:
            title = title.replace('/', '')

        encrypted_url = response.css('video').extract_first().split(
            'strencode("')[1].split('"))')[0]
        first_encrypted = encrypted_url.split('"')[0]
        second_encrypted = encrypted_url.split('"')[2]
        video_link = ParseRealUrl.get_url(first_encrypted, second_encrypted)

        if video_link:
            # normalize urls like http://185.38.13.130//mp43/2998... (double slash after the host)
            video_link_list = video_link.split('//')
            real_video_link = video_link_list[0] + '//' + video_link_list[
                1] + '/' + video_link_list[2]
            self.logger.warning('got the download link, adding it to the download queue')
            down_file_name = title + '-' + author + '-' + view_key
            yield DownloadVideoItem(file_urls=real_video_link,
                                    file_name=down_file_name)
            self.logger.warning('queued for download, now updating the database')
            yield UpdateMovieLinkItem(movie_page_url=response.url,
                                      movie_real_url=real_video_link)
        else:
            self.logger.warning('failed to get the video download link, page: {0}'.format(response.url))
Example #7
 def model_page(self, response: HtmlResponse):
     video_sum_element = response.css('div.showingInfo').css(
         'span.totalSpan')
     # some p**n stars do not show a video count
     page_number = 1
     if video_sum_element:
         video_sum = video_sum_element.css('::text').get()
         sum_number = int(video_sum)
         page_number = math.ceil(sum_number / 40)
     # a 'page' param in the url means all videos are listed; likewise when there is only one page
     if 'page' in response.url or page_number == 1:
         li_list = response.css('div.videoUList').css('ul').css('li')
         for li_tag in li_list:  # type: SelectorList
             a_tag = li_tag.css('span.title').css('a')
             video_title = a_tag.css('::text').get()
             video_url = a_tag.css('::attr(href)').get()
             real_url = 'https://www.pornhubpremium.com' + video_url
             self.logger.info('send [%s], url: %s', video_title, video_url)
             yield scrapy.Request(real_url,
                                  callback=self.video_page,
                                  priority=100)
     else:
         # no 'page' param and more than one page: request the last page to load all videos
         new_link = '{0}?page={1}'.format(response.url, page_number)
         yield scrapy.Request(new_link,
                              callback=self.model_page,
                              priority=10)
Example #8
    def parse(self, response):
        driver = WebDriver()  # the WebDriver class must be instantiated, not just assigned
        driver.get(response.url)
        content = driver.page_source.encode('utf-8')
        response = HtmlResponse(response.url, encoding='utf-8', body=content)
        pageStr = response.css(
            ".dw_page .p_box .p_wp .p_in >span::text")[0].extract()
        # pageStr looks like "共N页" ("N pages in total"); slice out the digits
        pageNum = int(pageStr[1:pageStr.find("页", 1, len(pageStr))])
        if pageNum - 1 > 0:
            url_groups = []
            # jump through the remaining pages (2..pageNum) and collect links
            for i in range(pageNum - 1):
                js = "document.getElementById('jump_page').value=%d" % (i + 2)
                driver.execute_script(js)
                driver.find_element_by_css_selector(".p_in .og_but").click()
                content = driver.page_source.encode('utf-8')
                sub_response = HtmlResponse(url=response.url,
                                            encoding='utf-8',
                                            body=content)

                main = sub_response.css("div#resultList div.el")
                urls = main.css(".t2 a::attr(href)").extract()
                url_groups.append(urls)

            for index, urls in enumerate(url_groups):
                for url in urls:
                    try:
                        for obj in self.__parse_by_webdriver(url):
                            yield obj
                    except Exception as error:
                        print("[{0}] spider error: {1}".format(index, error))
                        continue
Example #9
    def _create_product_dictionary(
        self,
        response: HtmlResponse,
        data: Optional[Dict] = None,
    ) -> product.Product:
        try:
            upc = (universal_product_code.UniversalProductCode(
                upc=data.get('ProductId').replace('_', ''))).value
        except Exception:
            # TODO: Log issue and return nothing.
            return None

        title1 = response.css(
            'meta[property="og:title"]::attr(content)').get('').split('|')[0]
        title2 = response.css('title::text').get()
        name = title1 or title2

        if not name:
            pass  # TODO: Log error and return none.
        elif name == 'Grocery Product' or name == 'Produit épicerie en ligne':
            pass  # TODO: Log error and return none.

        brand = data.get('BrandName')

        item_loader = product_item_loader.ProductItemLoader(
            response=response
        ).add_name(
            response=response,
            name=name,  # TODO: what if it's None?
            language=self.language,
        ).add_brand(
            response=response,
            brand=brand,  # TODO: what if it's None?
            language=self.language,
        ).add_upc(response=response, upc=upc) \
        .add_product_data_dictionary(
            product_data_dictionary=self._create_product_data_dictionary(
                response=response,
                data=data,
                name=name,
                brand=brand,
                upc=upc,
            ),
        ).add_offer_dictionary(
            offer_dictionary=self._create_offer_dictionary(
                response=response,
                data=data,
            ),
        ).add_store_dictionary(
            store_dictionary=self._create_store_dictionary(
                response=response,
            ),
        ).add_supported_language(language=self.language)

        return item_loader.load_item()
Example #10
 @classmethod
 def get_image_url(cls, response: HtmlResponse) -> Union[str, None]:
     """Extract an image URL from the HTML response."""
     image_p = response.css("p > img")
     image_figure = response.css("figure > img")
     image_selectors = image_p if image_p else image_figure
     images_re = image_selectors.re(r'src="(http.*?)\"')
     images = [img for img in images_re if img.split(".")[-1] != "svg"]
     sorted_by_length = sorted(images, key=len, reverse=True)
     return sorted_by_length[0] if sorted_by_length else None
Example #11
 def video_parse(self, response: HtmlResponse, category):
     title = response.css('h2.title.big::text').get()
     for item in response.css('ul.video-downloads-buttons').css('li'):
         link_text = item.css('a::text').get() or ''  # guard against missing text
         if '1080p' in link_text:
             link = item.css('a::attr(href)').get()
             req_cookie = response.request.headers.get('Cookie').decode()
             resp_cookie = response.headers.get('Set-Cookie').decode().split(';')[0]
             yield ArtPornItem(name=title, link=link, category=category,
                               cookie='{0};{1}'.format(req_cookie, resp_cookie))
Example #12
 def video_parse(self, response: HtmlResponse, category):
     link = response.urljoin(response.css("a.full_download_link[onclick*='mp43000']::attr(href)").get())
     title = ''
     for i in response.css('div.title_bar::text').getall():
         i = i.strip()
         if i:
             title = i
             break
     if link != 'http://www.hotwiferio.com/members/':
         yield HotItem(name=title, link=link, category=category)
Example #13
    def categories_parse(self, response: HtmlResponse, category):
        next_url_list = response.css('a.button.prev::attr(href)').getall()
        if next_url_list:
            # when both prev and next buttons exist, the second link is "next"
            next_url = next_url_list[1] if len(next_url_list) > 1 else next_url_list[0]
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.categories_parse,
                                 cb_kwargs={'category': category})

        for item in response.css('div.thumb-video.cf').css('a.thumb-video-link::attr(href)').getall():
            yield scrapy.Request(url=item, callback=self.video_parse, cb_kwargs={'category': category})
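
Both requests above propagate category via cb_kwargs, which Scrapy passes back into the callback as a keyword argument. A minimal sketch of how such a crawl could be seeded (the URL and category value are placeholders, not taken from the example):

    def start_requests(self):
        # placeholder seed; Scrapy will call categories_parse(response, category=...)
        yield scrapy.Request(url='https://example.com/categories/some-category',
                             callback=self.categories_parse,
                             cb_kwargs={'category': 'some-category'})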
Example #14
    def _create_product_dictionary(
        self,
        response: HtmlResponse,
        data: Optional[Dict] = None,
    ) -> product.Product:
        try:
            upc = (universal_product_code.UniversalProductCode(
                upc=response.css('span[itemprop="sku"]::text').get())).value
        except Exception as exception:
            logging.exception(msg='Unable to get UPC.', exc_info=exception)
            return None

        name1 = response.css(
            "div.product-info.item-addToCart > a.invisible-text::text"
        ).get()
        name2 = response.css('title::text').get('').split('|')[0]
        name = name1 or name2

        if not name:
            pass  # TODO: Log error and return none.

        brand = response.css('div[itemtype="http://schema.org/Product"] \
            > span[itemprop="brand"]::text').get()
        item_loader = product_item_loader.ProductItemLoader(
            response=response
        ).add_name(
            response=response,
            name=name,
            language=self.language,
        ).add_brand(
            response=response,
            brand=brand,
            language=self.language,
        ).add_upc(response=response, upc=upc) \
        .add_product_data_dictionary(
            product_data_dictionary=self._create_product_data_dictionary(
                response=response,
                data=data,
                name=name,
                brand=brand,
                upc=upc,
            ),
        ).add_offer_dictionary(
            offer_dictionary=self._create_offer_dictionary(
                response=response,
                data=data,
            ),
        ).add_store_dictionary(
            store_dictionary=self._create_store_dictionary(
                response=response,
            ),
        ).add_supported_language(language=self.language)

        return item_loader.load_item()
Example #15
    def _create_offer_dictionary(
        self,
        response: HtmlResponse,
        data: Dict,
    ) -> Dict:
        offers = response.css('div[itemprop="offers"]')

        if not offers:
            pass  # TODO: Throw error.

        offer_objects = []

        for o in offers:
            price = o.css('span[itemprop="price"]::text').get()
            valid_through = o.css('span[itemprop="validThrough"]::text').get()
            offer_objects.append(
                offer.Offer(price=float(price), valid_until=valid_through))

        # Sort ascending so the first offer holds the (lowest) sale price.
        offer_objects.sort(key=lambda x: x.price)

        amount = offer_objects[0].price
        valid_until = offer_objects[0].valid_until  # TODO: Add valid until.
        item = offer_item_loader.OfferItemLoader(response=response) \
            .add_store_id(store_id=self.store_id) \
            .add_sold_by(sold_by=self.sold_by) \
            .add_amount(
                amount=str(amount),
            ).add_currency(currency=curreny.Currency.CAD.value) \
            .add_availability(
                availability=availability.Availability.IN_STOCK.value,
            ).add_condition(condition=condition.Condition.NEW.value) \
            .load_item()

        return item.get_dictionary()
Example #16
def get_comment_links(response: HtmlResponse, spider: ColgSpider):
    urls = []
    lastPageUrls = response.css('.tps a:last-child').xpath('@href').extract()
    for lastPageUrl in lastPageUrls:
        urls.extend(
            get_commentlistpage_urls_near_endof_lastpage(lastPageUrl, spider))
    return urls
Example #17
    def _find_json_data(self, response: HtmlResponse) -> Optional[Dict]:
        css_path = "div.product-details.js-ga-productdetails > " + \
           "div.relative::attr(data-product)"

        product_data = response.css(css_path).extract()

        if not product_data:
            logging.error('Unable to load JSON data.')  # TODO: Log URL.
            return None

        try:
            return json.loads(product_data[0])
        except json.JSONDecodeError:
            pass  # fall through and retry with normalized quotes

        # try:
        #     return ast.literal_eval(product_data[0])
        # except:
        #     pass

        try:
            data = product_data[0].replace("'", '"')
            return json.loads(data)
        except json.JSONDecodeError:
            logging.error('Unable to load JSON data.')
            return None
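
The second json.loads attempt above targets attribute values that are JSON-like but single-quoted. A quick illustration of the input that fallback is meant to repair (the data is made up):

    import json

    raw = "{'sku': '123', 'price': '9.99'}"  # single quotes: not valid JSON
    fixed = raw.replace("'", '"')            # the naive repair used above
    print(json.loads(fixed))                 # {'sku': '123', 'price': '9.99'}

Note that the blanket quote replacement breaks on values containing apostrophes, which is presumably why a failure of this last attempt is logged rather than raised.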
Example #18
    def parse_category(self, response: HtmlResponse):
        """
            List category and traverse product pages.
        """
        products_query = response.css(
            "section#bc-sf-filter-products > div.product-grid-item")
        if not products_query:
            raise IgnoreRequest('Product items not found')
        self.logger.info(
            f'parse product_categories len: {len(products_query)}')

        # the selectors in products_query already are the grid items
        for pdp in products_query:
            item_loader = ProductLoader(item=UrgeItem(), selector=pdp)
            item_loader.add_css('product_name',
                                'div.product-text > p.title::text')
            item_loader.add_css('product_brand',
                                'div.product-text > h2.vendor.h5::text')
            # get the regular product price via the CSS OR (comma) selector.
            item_loader.add_css(
                'product_price',
                'div.product-text p.price s::text , span[itemprop="price"]::text'
            )
            item_loader.add_css(
                'product_sale_price',
                'div.product-text p.sale span[itemprop="price"]::text')
            if 'href' in pdp.css('a').attrib:
                product_url = pdp.css('a').attrib['href']
                yield response.follow(product_url,
                                      callback=self.product_page,
                                      meta={'item': item_loader.load_item()})
Example #19
def _parse_stars(response: HtmlResponse):
    """Parse count of stars given for walls and routes."""
    stars_selector = response.css("img[class*=stars]")
    if stars_selector:
        return int(stars_selector[0].attrib["class"].split("stars")[1])
    else:
        return 0
Example #20
 def parse(self, response: HtmlResponse):
     list_channel = response.css('div.listchannel')
     for item in list_channel:
         link = item.css('a::attr(href)').extract_first()
         title = item.css('a::attr(title)').extract_first()
         self.logger.warning('got video: {0}'.format(title))
         yield scrapy.Request(url=link, callback=self.real_video_parse)
Example #21
    def parse_wall(self, response: HtmlResponse):
        """Parse walls.

        ... -> Region Wattendorf -> Falkenwand

        Example: https://www.frankenjura.com/klettern/poi/21
        """
        item = SectorItem()
        item["name"] = response.meta["wall_name"]
        item["fk_sector"] = response.meta["parent"]
        item["source"] = response.url
        item["internal_rating"] = _parse_stars(response)
        item["max_height_in_m"] = _parse_wall_max_height(response)
        item["rain_protected"] = _parse_rain_protected(response)
        item["child_friendly"] = _parse_child_friendly(response)
        item["description"] = _parse_wall_description(response)
        item["approach"] = _parse_wall_approach(response)
        item["approach_road"] = _parse_wall_approach_road(response)
        item["fk_orientation"] = _parse_orientation(response)
        item["latitude"], item["longitude"] = _parse_lat_lon(response)
        yield item

        wall = item.django_model.objects.get(name=item["name"],
                                             fk_sector=item["fk_sector"])

        routes = response.css('div[class="poi-link-container"]').css("a")
        for route in routes:
            meta = {"route_name": route.css("::text").get(), "parent": wall}
            yield response.follow(route, self.parse_route, meta=meta)
Example #22
 def ajax_model_page(self, response: HtmlResponse):
     model_info_list = response.css('li.pcVideoListItem')
     for item in model_info_list:  # type: SelectorList
         video_url = item.css('span.title').css('a::attr(href)').get()
         yield scrapy.Request(response.urljoin(video_url),
                              callback=self.video_page,
                              priority=100)
Example #23
    def parse(self, response: HtmlResponse):
        """Parse webpage to extract important recipe information"""
        if response.css(".wprm-recipe-ingredients-container"):
            data = {
                "name": response.css(".title::text").get(),
                "source_id": self.sid,
                "url": response.url,
                "image": self.get_image_url(response),
                "ingredients": self.get_ingredients(response),
            }
            if all(val is not None for val in data.values()):
                resp = requests.post(self.endpoint, json=data)
                if resp.status_code == 400:
                    raise CloseSpider("Recipe already exists")

        for anchor_tag in response.css(".nav-previous a"):
            yield response.follow(anchor_tag, callback=self.parse)
Example #24
 def _retrieve_table_field(
     response: HtmlResponse  # pylint: disable=C0330
 ) -> Generator[Selector, None, None]:
     """
     Yield single table cell.
     """
     for field in response.css('.apexir_WORKSHEET_DATA td'):
         yield field
Example #25
 def channel_page(self, response: HtmlResponse):
     video_css = response.css('span.title')
     for item in video_css:
         video_sub_link = item.css('a::attr(href)').extract_first()
         video_url = response.urljoin(video_sub_link)
         self.logger.warning(
             'send to parse real video, url is: {0}'.format(video_url))
         yield scrapy.Request(video_url, callback=self.video_page)
Example #26
 def _parse_documents(response: HtmlResponse):
     """
     Parse principal's documents.
     """
     data = response.meta['data']
     data['exhibit_url'] = response.css(
         'td[headers=DOCLINK] ::attr(href)').extract_first()
     yield data
Example #27
def _parse_child_friendly(response: HtmlResponse):
    """Parse optional element 'child friendly'."""
    selector = response.css('th:contains("Kinder") + td ::text')
    if selector:
        value = selector.get()
        # match in lowercase: "Ungeeignet" could never appear in value.lower()
        return "ungeeignet" not in value.lower()
    else:
        return False
Example #28
def _parse_rain_protected(response: HtmlResponse):
    """Parse optional element 'rain protected'."""
    selector = response.css('th:contains("Regensicher") + td ::text')
    if selector:
        value = selector.get()
        return value.lower() != "nein"
    else:
        return False
Example #29
 def parse_next_link(self, response: HtmlResponse) -> str:
     next_page_tag = response.css(
         'a[href*="?category=long&viewtype=basic"]')
     next_link = None
     for item in next_page_tag:
         if '»' == item.css('a::text').extract_first():
             ori_link = item.css('a::attr(href)').extract_first()
             next_link = response.urljoin(ori_link)
     return next_link
Example #30
 def porn_star_page(self, response: HtmlResponse):
     # p**n star pages need no page number: requesting page=2 directly does not list all videos
     li_list = response.css('div.videoUList').css('ul').css('li')
     for li_tag in li_list:  # type: SelectorList
         video_url = li_tag.css('span.title').css('a::attr(href)').get()
         yield scrapy.Request(response.urljoin(video_url),
                              callback=self.video_page,
                              priority=100)
     # check whether a next page button exists
     page_element = response.css('div.pagination3')
     if page_element:
         # on the last page, the page_next element does not exist
         next_element = page_element.css('li.page_next')
         if next_element:
             next_url = next_element.css('a::attr(href)').get()
             yield scrapy.Request(response.urljoin(next_url),
                                  callback=self.porn_star_page,
                                  priority=10)
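
Every callback in these examples takes an HtmlResponse, so each can be exercised offline by constructing a response by hand, just as Example #8 does with the Selenium page source. A minimal harness sketch (the fixture HTML, URL, and spider instance are assumptions):

    from scrapy.http import HtmlResponse, Request

    url = 'https://example.com/fixture'
    body = b'<html><body><div class="listchannel"></div></body></html>'
    response = HtmlResponse(url=url, body=body, encoding='utf-8',
                            request=Request(url=url))
    # assuming `spider` is an instance of the spider from Example #20
    for result in spider.parse(response):
        print(result)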