Example #1
    def get_businesses(self, response: scrapy.http.Response):
        print(response.css('h1 span span::text').get())
        # Assumes `import urllib.parse as urlparse` and `from urllib.parse import parse_qs`.
        parsed = urlparse.urlparse(response.url)
        self.token = parse_qs(parsed.query)['id'][0]
        script = response.xpath('//script[contains(., "viewInstanceKey:")]/text()').get()
        script2 = response.xpath('//script[contains(., "containerNodeId:")]/text()').get()

        searchOperator = response.css('.registerItemSearch-tabs-criteriaAndButtons-criteria-itemNameSearchOperatorBox-itemNameSearchOperatorSelector::attr(id)').get()
        advance = response.css('.Attribute-Advanced::attr(id)').get()
        # Pull viewInstanceKey and guid out of the inline scripts.
        vi_pattern = r"viewInstanceKey:'([A-Za-z0-9_\./\\-]*)'"
        vi_match = re.search(vi_pattern, script)
        vikey = vi_match.group(1)

        guid_pattern = r'guid:([0-9]*)'
        guid_match = re.search(guid_pattern, script)
        guid = guid_match.group(1)

        frag_node_pattern = "containerNodeId:'([A-Za-z0-9]*)'"
        frag_match = re.search(frag_node_pattern, script2)
        frag_node = frag_match.group(1)

        node = response.css('.appSearchButton::attr(id)').get()[4:]  # drop the 4-char id prefix

        self.data[f'{searchOperator}-ItemNameSearchOperator'] = 'StartsWith'
        self.data[f'{advance}-Advanced'] = 'N'
        self.data['_VIKEY_'] = vikey
        self.data['_CBHTMLFRAGID_'] = str(guid)
        self.data['_CBHTMLFRAGNODEID_'] = str(frag_node)
        self.data['_CBNODE_'] = node
        yield scrapy.http.FormRequest(url=f'https://www.mtsosfilings.gov/mtsos-master/viewInstance/update.html?id={self.token}',
                                      formdata=self.data, headers=self.headers, callback=self.pagination)
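get_businesses relies on spider state (self.data, self.headers) that is not shown in this snippet. A minimal sketch of how that state might be initialised; the start URL and header values here are placeholders, not the original spider's:

    def start_requests(self):
        # Placeholder initialisation; the real spider defines these elsewhere.
        self.data = {}
        self.headers = {'X-Requested-With': 'XMLHttpRequest'}  # placeholder
        yield scrapy.Request('https://www.mtsosfilings.gov/mtsos-master/',  # placeholder URL
                             callback=self.get_businesses)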
    def parseCity(self, response: scrapy.http.Response):
        # example page: https://www.tripadvisor.in/Attractions-g186338-Activities-London_England.html#FILTERED_LIST

        attractionBoxes = response.css(
            'div.attraction_list.attraction_list_short > div.attraction_element > div > div > div > div > div.listing_title'
        )

        # Skip listings whose names contain digits (tour-set listings rather
        # than individual attractions).
        tourSetRegChecker = re.compile(r".+[0-9]+.*")

        for attraction in attractionBoxes:
            pointName = attraction.css('a::text').extract_first()
            if not tourSetRegChecker.match(pointName):
                attractionUrl = response.urljoin(
                    attraction.css('a::attr(href)').extract_first())
                response.meta['rank'] += 1
                yield response.follow(url=attractionUrl,
                                      callback=self.parseAttractionsPage,
                                      meta=response.meta)

        nextPageLink = response.css(
            'div.al_border.deckTools.btm > div > div.unified.pagination > a.nav.next.rndBtn.ui_button.primary.taLnk::attr(href)'
        )
        if nextPageLink:
            nextPageLink = response.urljoin(nextPageLink.extract_first())
            self.log("nextpage: " + nextPageLink)
            if response.meta['rank'] < 100:
                yield response.follow(nextPageLink,
                                      callback=self.parseCity,
                                      meta=response.meta)
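parseCity both reads and increments response.meta['rank'], so the request that first enters it must seed that counter. A sketch of such a seed request (the helper name is hypothetical):

    def startCityRequest(self, cityUrl):
        # Hypothetical helper: seed the rank counter before entering parseCity.
        return scrapy.Request(cityUrl, callback=self.parseCity, meta={'rank': 0})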
    def parseCityAttractionsListPage(self, response: scrapy.http.Response):
        # example page:  https://www.viator.com/Mumbai/d953

        print(
            'PARSING ATTRACTION LIST ####################################################################################'
        )
        print(response.url)

        self.incrementRequestCount()
        hrefs = response.css('div.ptm *> h2 > a')
        for href in hrefs:
            pointURL = href.css('::attr(href)').extract_first().strip()
            pointName = href.css('::text').extract_first().strip()
            yield response.follow(pointURL,
                                  callback=self.parseAttractionsPage,
                                  meta={
                                      'countryName':
                                      response.meta['countryName'],
                                      'cityName': response.meta['cityName'],
                                      'pointName': pointName
                                  })

        nextPageLink = response.css(
            'div.ptm > div:nth-child(1) > div:nth-child(2) > p > a:last-child::attr(href)'
        ).extract_first()
        if nextPageLink:
            yield response.follow(nextPageLink,
                                  callback=self.parseCityAttractionsListPage,
                                  meta=response.meta)
Example #4
 def parse_listing(self, response: scrapy.http.Response):
     item = {}
     item['url'] = response.url
     item['expire'] = response.xpath(
         '//a[@class="expire"]/span/text()').extract_first()
     item['job-title'] = response.css('span#main-job-title *::text').extract()
     item['main'] = response.css('div#main-lang-block *::text').extract()
     item['job-details'] = response.css('div.jobdetails *::text').extract()
     return item
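Each *::text query above returns a list of fragments. If single strings are wanted downstream, a small normalisation helper (a sketch, not part of the original spider) can join and clean them:

def _join_text(fragments):
    # Collapse extracted text fragments into one whitespace-normalised string.
    return ' '.join(' '.join(fragments).split())

# e.g.: item['main'] = _join_text(response.css('div#main-lang-block *::text').extract())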
Example #5
    def parse(self, response: scrapy.http.Response):

        # Extract every link to a landing page:
        for title in response.css('.document-row > h3 > a'):
            yield response.follow(title, self.parse_landing_page)

        # Extract the link to the next page of results:
        for next_page in response.css('.next > a'):
            yield response.follow(next_page, self.parse)
Example #6
    def parse_posts_list(self, response: scrapy.http.Response):
        # Fetch the posts
        for href in response.css("#posts a::attr(href)"):
            if href.get().startswith("/p"):
                yield response.follow(href, self.parse_thread)

        # Fetch all pages
        for href in response.css(".pagination a::attr(href)"):
            yield response.follow(href, self.parse_posts_list)
def form_title_colors_brand(response: scrapy.http.Response):
    brand_and_name = response.css("div.brand-and-name.j-product-title")
    brand = brand_and_name.css("span.brand::text").get()
    name = brand_and_name.css("span.name::text").get()
    colors = response.css("div.color.j-color-name-container").css(
        "span.color::text").get()
    if colors:
        title = f"{brand} / {name}, {colors}"
    else:
        title = f"{brand} / {name}"
        colors = ""
    return title, colors, brand
Example #8
    def parse(self, response: scrapy.http.Response):
        comic = ManhuaItem()
        comic['comic_id'] = response.url.split('/')[-1]
        comic['url'] = response.url  # urljoin of the URL with itself is a no-op
        comic['thumbnail'] = response.urljoin(
            response.css('div.cover img::attr(src)').get())
        comic['title'] = response.css('td.comic-titles::text').get()
        comic['original_title'] = response.css(
            'td.comic-original-titles::text').get()
        comic['author'] = response.css('ul.creators li a::text').getall()
        comic['summary'] = response.css('p.comic_story::text').get()
        comic['state'] = response.css('a.comic-pub-state::text').get()
        comic['duration'] = '-'.join(
            response.css('td.pub-duration a::text').getall())
        comic['vols'] = dict()

        res = []
        vol_urls = response.css(
            'ol.links-of-books.num_div li a::attr(href)').getall()
        for i, url in enumerate(vol_urls, start=1):
            comic['vols']['vol-%s' % i] = response.urljoin(url)
            request = scrapy.Request(response.urljoin(url),
                                     callback=self.parse_vol)
            request.meta['comic_id'] = comic['comic_id']
            request.meta['id'] = 'vol-%s' % i
            res.append(request)
        yield comic

        for request in res:
            yield request
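The parse_vol callback is not shown. A hypothetical sketch that forwards the identifiers in the shape parse_page (Example #26) expects; the page-list selector is an assumption:

    def parse_vol(self, response: scrapy.http.Response):
        # Hypothetical: schedule one request per page image, forwarding the ids.
        for page, href in enumerate(
                response.css('div.pages a::attr(href)').getall(), start=1):
            yield response.follow(href, callback=self.parse_page,
                                  meta={'comic_id': response.meta['comic_id'],
                                        'vol_id': response.meta['id'],
                                        'page': page})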
    def parseCountryAttractionsListPage(self, response: scrapy.http.Response):
        # example page:  https://www.viator.com/Netherlands/d60

        self.incrementRequestCount()
        hrefs = response.css('div.ptm *> h2 > a::attr(href)').extract()
        for href in hrefs:
            yield response.follow(href, callback=self.parseAttractionsPage)

        nextPageLink = response.css(
            'div.ptm > div:nth-child(1) > div:nth-child(2) > p > a:last-child::attr(href)'
        ).extract_first()
        if nextPageLink:
            yield response.follow(
                nextPageLink, callback=self.parseCountryAttractionsListPage)
Example #10
    def parse_item_page(self, response: scrapy.http.Response):
        thumb_urls = response.css('.alt-images__thumb img::attr(src)').getall()
        first_image_url = response.css(
            '.view-product-image-print::attr(src)').get()

        image_urls = generate_image_urls(thumb_urls, first_image_url)

        item = HouzzProductItem()
        item['url'] = response.url
        item['title'] = response.css('.view-product-title::text').get()
        item['keywords'] = response.css(
            '.product-keywords__word::text').getall()
        item['images'] = image_urls[:2]

        yield item
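generate_image_urls is defined elsewhere in the project. A plausible sketch, assuming it puts the main image first and de-duplicates while preserving order:

def generate_image_urls(thumb_urls, first_image_url):
    # Assumed behaviour: main image first, then thumbnails, no duplicates.
    urls = [first_image_url] + list(thumb_urls)
    seen = set()
    return [u for u in urls if u and not (u in seen or seen.add(u))]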
def form_price(response: scrapy.http.Response) -> item.Price:
    current_price_raw: str = response.css("span.final-cost::text").get()
    original_price_raw: str = response.css("del.c-text-base::text").get()
    current_price = price_to_float(current_price_raw)
    if original_price_raw is not None:
        original_price = price_to_float(original_price_raw)
        ratio = current_price / original_price
        sale = 1 - ratio
        sale_tag = f"Скидка {int(100 * sale)}%"  # "Скидка" = "Discount"
    else:
        original_price = current_price
        sale_tag = ""
    return item.Price(current=current_price,
                      original=original_price,
                      sale_tag=sale_tag)
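price_to_float is an assumed helper. A sketch that strips currency symbols and spaces (including non-breaking ones) before parsing:

def price_to_float(raw: str) -> float:
    # Hypothetical helper: "1 299 ₽" -> 1299.0, "1,5" -> 1.5
    digits = ''.join(ch for ch in raw if ch.isdigit() or ch in '.,')
    return float(digits.replace(',', '.'))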
    def parse(self, response: scrapy.http.Response):
        issue_list = []
        for li_item in response.css("section.authors-list ol li"):
            issue_list += li_item.css("a::attr(href)").extract()
        issue_list = [x for x in issue_list if "suppl" not in x]

        article_list = []
        for url in issue_list:
            json_url = str(url) + "/JSON"
            # Blocking fetch; assumes `import json` and `import urllib.request`.
            res = urllib.request.urlopen(json_url)
            res_body = res.read()
            d = json.loads(res_body.decode("utf-8"))
            collection_index = []
            for key in d.keys():
                try:
                    collection_index.append(int(key))
                except ValueError:
                    pass
            for index in sorted(collection_index):
                for article in d[str(index)]['articles']:
                    article_list.append(article['articleUrl'])

        with open("article_list.txt", "w") as file:
            file.write("\n".join(article_list))
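The urllib calls above block Scrapy's event loop while each JSON document downloads. A non-blocking variant would yield the JSON pages as requests and collect articles in a callback; a sketch (the callback name is hypothetical):

    def parse(self, response: scrapy.http.Response):
        for li_item in response.css("section.authors-list ol li"):
            for url in li_item.css("a::attr(href)").extract():
                if "suppl" not in url:
                    yield scrapy.Request(url + "/JSON",
                                         callback=self.parse_issue_json)

    def parse_issue_json(self, response: scrapy.http.Response):
        # Walk the numeric collection keys, as the blocking version does.
        d = json.loads(response.text)
        for key in sorted((k for k in d if k.isdigit()), key=int):
            for article in d[key]['articles']:
                yield {'article_url': article['articleUrl']}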
    def parseCountryPage(self, response: scrapy.http.Response):
        # example page:  https://www.viator.com/India/d723-ttd

        self.incrementRequestCount()

        breadcrumbs = response.css('div.crumbler *> span::text').extract()
        countryName = breadcrumbs[1].strip()

        countryListing = CountryListing(crawler=self.name,
                                        sourceURL=response.url,
                                        crawlTimestamp=getCurrentTime(),
                                        countryName=countryName)
        yield countryListing.jsonify()

        if skipNonRequired:
            if processName(countryName) not in processedRequiredCountries:
                # do not process this country's cities
                print('Skipping country: ', countryName)
                return
        countryId = response.url.split('/')[-1].split('-')[0][1:]
        cityListingURL = 'https://www.viator.com/pascities.jspa?country={}'.format(
            countryId)
        yield response.follow(cityListingURL,
                              callback=self.parseCountryCities,
                              meta={'countryName': countryName})
def form_product(response: scrapy.http.Response) -> item.Product:
    url = response.request.url
    article = url_to_article(url)

    title, colors, brand = form_title_colors_brand(response)

    price = form_price(response)

    assets = form_assets(response)

    metadata = form_metadata(response)

    return item.Product(
        RPC=article,
        url=url,
        title=title,
        marketing_tags=response.css("li.about-advantages-item::text").getall(),
        brand=brand,
        section=response.meta["section"],
        price_data=price,
        stock=item.Stock(
            in_stock=True,  # no stock info found on the site
            count=-1,  # no stock info found on the site
        ),
        assets=assets,
        metadata=metadata,
        variants=len(colors.split()))  # rough count: whitespace-separated colour tokens
Example #15
 def parse_docs(self, response: scrapy.http.Response):
     pdfs: List[str] = []
     for url in response.css('a::attr(href)'):
         full = response.urljoin(url.extract())
         if full.endswith('.pdf'):
             pdfs.append(full)
     yield {'from': response.url, 'file_urls': pdfs}
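Yielding file_urls only triggers downloads when Scrapy's FilesPipeline is enabled, e.g. in settings.py:

ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}
FILES_STORE = '/path/to/downloaded/files'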
def form_assets(response: scrapy.http.Response):
    # assets
    zoomed_image = response.css("img.MagicZoomFullSizeImage::attr(src)").get()
    images_set = response.css("a.j-carousel-image::attr(href)").getall()
    view_3d_list = []

    view_3d_base_raw = response.css(
        "div.j-3d-container.three-d-container::attr(data-path)").get()
    if view_3d_base_raw:
        for i in range(1, VIEW360_IMAGES_COUNT + 1):
            url = f"http:{view_3d_base_raw}/{i}.jpg"
            view_3d_list.append(url)

    return item.Assets(main_image=zoomed_image,
                       set_images=images_set,
                       view360=view_3d_list,
                       video=[])
 def parse(self, response: scrapy.http.Response):
     # example page:  https://www.viator.com/Amsterdam/d525-ttd
     countryMenuBox = response.css(
         '#countryMenuBox > div.menu-dropdown-box.small > div > div:nth-child(1)'
     )
     hrefs = countryMenuBox.css('a::attr(durl)').extract()
     for href in hrefs:
         yield response.follow(href, callback=self.parseCountryPage)
Example #18
    def parse(self, response: scrapy.http.Response, **kwargs):
        race_name = response.css('dl.racedata h1::text').get()
        race_conditions = re.split(
            r'\xa0/?\xa0',
            response.css('dl.racedata diary_snap_cut span::text').get())

        race_data = response.meta['race_data']
        race_data['race_name'] = race_name
        race_data['race_condition'] = {
            'has_turf': '芝' in race_conditions[0],
            'has_dirt': 'ダ' in race_conditions[0],
            'has_obstacle': '障' in race_conditions[0],
        }

        if (distance_match := re.match(r'\D*\s*0*(?P<distance>\d+)m',
                                       race_conditions[0])) is None:
            race_data['race_distance'] = None
        else:
            # The named group holds the course distance in metres.
            race_data['race_distance'] = int(distance_match.group('distance'))
Example #19
 def parse(self, response: scrapy.http.Response):
     self._scraped_indexes = self._scraped_in_past
     # extract url from main article in img
     spotted_event = response.css('.main-news')[0]
     path = spotted_event.xpath('div/div/a/@href').extract_first()
     yield from self._yield_request(path)
     # extract urls from list
     yield from self._yield_requests_from_response(response)
Example #20
def get_links(response: scrapy.http.Response) -> List[Tuple[Optional[str], str]]:
    """
    Get links in the page.
    :param response: A scrapy.http.Response that contains the page
    :return: A list of (title, url) tuples
    """
    link_list = [(a_node.xpath('.//text()').get(), a_node.attrib['href'])  # Make a tuple of title, href
                 for a_node in response.css('a[href]')]
    return filter_links(link_list)
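filter_links is assumed above. One plausible implementation that drops anchors without text and non-navigable hrefs:

def filter_links(link_list):
    # Hypothetical filter; the original project's rules may differ.
    return [(title, url) for title, url in link_list
            if title and title.strip()
            and not url.startswith(('#', 'javascript:', 'mailto:'))]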
Example #21
 def __call__(self, to_provide, response: scrapy.http.Response,
              spider: scrapy.Spider):
     assert isinstance(spider, scrapy.Spider)
     ret: List[Any] = []
     if Price in to_provide:
         ret.append(Price(response.css(".price::text").get()))
     if Html in to_provide:
         ret.append(Html("Price Html!"))
     return ret
Example #22
 def __call__(self, to_provide, response: scrapy.http.Response,
              settings: Settings):
     assert isinstance(settings, Settings)
     ret: List[Any] = []
     if Name in to_provide:
         ret.append(Name(response.css(".name::text").get()))
     if Html in to_provide:
         ret.append(Html("Name Html!"))
     return ret
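These __call__ signatures match scrapy-poet input providers. A minimal complete provider, assuming scrapy-poet's PageObjectInputProvider base class and the Name/Html types used above, looks roughly like:

from scrapy_poet.page_input_providers import PageObjectInputProvider

class NameProvider(PageObjectInputProvider):
    provided_classes = frozenset({Name, Html})

    def __call__(self, to_provide, response: scrapy.http.Response):
        ret = []
        if Name in to_provide:
            ret.append(Name(response.css(".name::text").get()))
        if Html in to_provide:
            ret.append(Html("Name Html!"))
        return ret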
Example #23
 def parse_landing_page(self, response: scrapy.http.Response):
     # On a landing page, we can extract all the documents, or infer the JSON link and use that.
     if response.css('.publication'):
         # This is a publication, so let's infer the API link:
         lp_url = list(urlsplit(response.url))
         lp_url[2] = "/api/content%s" % lp_url[2]
         api_json_url = urlunsplit(lp_url)
         yield response.follow(api_json_url, self.parse_content_api_json)
    def parseReviewsPage(self, response: scrapy.http.Response):

        box = response.css('div.rating.reviewItemInline')

        content = response.css(
            'div.entry > p.partial_entry::text').extract_first()
        ratingDate = box.css('span::attr(title)').extract_first()
        rating = float(
            box.css('span::attr(class)').extract_first().split('_')[-1]) / 10
        rating = scaleRating(rating, 1, 5)
        yield Review(crawler=self.name,
                     sourceURL=response.url,
                     crawlTimestamp=getCurrentTime(),
                     countryName=response.meta['countryName'],
                     cityName=response.meta['cityName'],
                     pointName=response.meta['pointName'],
                     content=content,
                     rating=rating,
                     date=ratingDate).jsonify()
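scaleRating, getCurrentTime and Review come from the surrounding project. A plausible sketch of scaleRating, assuming it clamps the parsed value into the expected [minRating, maxRating] band:

def scaleRating(rating, minRating, maxRating):
    # Hypothetical: keep the rating inside the expected band.
    return max(minRating, min(maxRating, rating))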
Example #25
 def get_next_vimeo_overview_page(self, response: scrapy.http.Response):
     """
     if there is a "next"-button at the bottom of the vimeo-user's overview page:
     grabs the url from it and yields it
     """
     # next_vimeo_overview_page = response.xpath('//*[@id="pagination"]/ol/li[9]').get()
     next_vimeo_overview_page = response.css(
         '#pagination > ol > li.pagination_next a::attr(href)').get()
     if next_vimeo_overview_page is not None:
         yield response.follow(next_vimeo_overview_page, self.parse)
Example #26
 def parse_page(self, response: scrapy.http.Response):
     image_url = response.css(
         'div#all div.text-center img.img-fluid::attr(src)').get()
     image_url = response.urljoin(image_url)
     image = ImageItem()
     image['comic_id'] = response.meta['comic_id']
     image['vol_id'] = response.meta['vol_id']
     image['page'] = response.meta['page']
     image['url'] = image_url
     yield image
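ImageItem is defined elsewhere; under the usual Scrapy conventions it would be a plain Item with one Field per key used above:

class ImageItem(scrapy.Item):
    comic_id = scrapy.Field()
    vol_id = scrapy.Field()
    page = scrapy.Field()
    url = scrapy.Field()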
Example #27
    def parse(self, response: scrapy.http.Response):
        url = response.url
        abstract = response.css("article.abstract")
        num_abstract_sections = len(abstract.css("p span::text"))
        out = {
            abstract.css("p span::text")[i].extract()[:-2].lower():
                abstract.css("p::text")[i].extract()
            for i in range(num_abstract_sections)
        }

        out.update(url=url)

        main_article = response.css("article.main-article")
        items = main_article.xpath("./*/text()").extract()
        whole_string = ""
        for i in items:
            whole_string += i
            whole_string += "\n"

        whole_string = re.sub(
            r"(\[\n(.*?)\])|(\[(.*?)\n\])|(\[\n(.*?)\n\])|(\[(.*?)\])", "",
            whole_string)
        saved_string = whole_string  # temp
        whole_string = re.sub(r"\n", " ", whole_string)  # temp
        out.update(article_text=whole_string)

        references = {}
        reference_text = []
        reference_url = []
        ol = response.xpath('//div[@class="footnotes"]/ol')
        for span in ol.xpath('./li/span'):
            reference_text.append(span.xpath('./text()').extract())
            try:
                reference_url.append(
                    span.xpath('./a[@target="_blank"]/@href').extract())
            except Exception:
                reference_url.append("")

        for text, ref_url in zip(reference_text, reference_url):
            references[str(text)] = str(ref_url)
        out.update(references=references)

        self.logger.warning('Out: %s', out)
        yield out
    def parse(self, response: scrapy.http.Response, **kwargs):
        section: list = response.css("ul.bread-crumbs").css(
            "span::text").getall()

        for product in response.css('div.dtList.i-dtList.j-card-item'):
            product_ref = product.css(
                "a.ref_goods_n_p.j-open-full-product-card::attr(href)").get()
            prod_card_url = response.urljoin(product_ref)
            yield common_request(url=prod_card_url,
                                 callback=self.parse_product_card,
                                 meta={'section': section})
            if self.small_sample:
                break

        next_page_ref = response.css("a.pagination-next::attr(href)").get()
        if next_page_ref is not None:
            next_page_url = response.urljoin(next_page_ref)
            yield common_request(url=next_page_url,
                                 callback=self.parse,
                                 meta={'section': section})
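common_request is a project helper, presumably a thin wrapper over scrapy.Request that attaches shared options. A sketch under that assumption:

def common_request(url, callback, meta=None):
    # Hypothetical wrapper: one place to set headers every request needs.
    return scrapy.Request(url, callback=callback, meta=meta or {},
                          headers={'User-Agent': 'Mozilla/5.0'})  # placeholder UA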
    def parseX(self, response: scrapy.http.Response):
        #"https://www.trip.skyscanner.com/bangkok/things-to-do

        hrefs = response.css('div.items_list *> h2 > a::attr(href)').extract()
        for href in hrefs:
            self.log("visiting: " + href)
            response.meta['rank'] += 1
            yield response.follow(href,
                                  callback=self.parseAttractionsPage,
                                  meta=response.meta)

        nextPageLink = response.css(
            'div.items_list > div:nth-child(2) > ul > li.next.next_page > a::attr(href)'
        ).extract_first()
        if nextPageLink:
            self.log("nextpage: " + nextPageLink)
            if response.meta['rank'] < 100:
                yield response.follow(nextPageLink,
                                      callback=self.parseX,
                                      meta=response.meta)
    def parseCountryCities(self, response: scrapy.http.Response):
        # example page: https://www.viator.com/pascities.jspa?country=723

        self.incrementRequestCount()

        hrefs = response.css(
            'div.unit.size-pas-cities *> a::attr(durl)').extract()
        for href in hrefs:
            yield response.follow(href,
                                  callback=self.parseCityPage,
                                  meta=response.meta)