Example #1
    def emailtrack_errback(self, failure):
        # on failure, retry with the next URL in the list or fall back to the search page
        meta = failure.request.meta
        web_url = meta['web_url']
        web_type = meta['web_type']
        index = meta['index']

        if (len(web_url) > 1):
            web_url.pop(0)
            url = web_url[0]
            yield SeleniumRequest(url=url,
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.emailtrack,
                                  errback=self.emailtrack_errback,
                                  meta={
                                      'index': index,
                                      'web_name': web_url,
                                      'web_type': web_type
                                  },
                                  dont_filter=True)
        else:
            yield SeleniumRequest(url='http://isearchfrom.com/',
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.parse,
                                  errback=self.parse_errback,
                                  meta={'index': index},
                                  dont_filter=True)
Example #2
    def parse_pages(self, response):
        # from scrapy.shell import inspect_response
        # inspect_response(response, self)
        base_url = "https://c4dcenter.com/material-library/page/"
        # print(self.mongo_uri)
        results = []
        articles = response.css('ul.products li')
        for article in articles:
            url = article.css('a::attr(href)').get()
            title = article.css('h2::text').get().strip()
            preview_img_url = article.css('img').attrib.get('src')
            result = self.db[self.collection].find_one({'url': url})
            if not result:
                yield SeleniumRequest(
                    url=url,
                    callback=self.parse,
                    cb_kwargs={
                        'preview_img_url': preview_img_url,
                        'title': title
                    },
                    wait_until=EC.presence_of_element_located(
                        (By.ID, 'somdn-form-submit-button')),
                    wait_time=30,
                )
            results.append(result)

        # if [result for result in results if not result] == len(results):
        next_page = response.css('a.next').attrib.get('href')
        if next_page:
            yield SeleniumRequest(url=next_page,
                                  callback=self.parse_pages,
                                  wait_time=30,
                                  wait_until=EC.presence_of_element_located(
                                      (By.ID, 'jupiterx-primary')))
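The EC and By names used with wait_until above (and in several later examples) are standard Selenium helpers, imported as:

    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC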
Example #3
    async def handle_auth_user(self, response):
        user = response.meta["user"]

        if user.get("classes", None) == None:
            yield SeleniumRequest(
                url=user["url"] + "/HomeAccess/Classes/Schedule",
                callback=self.parse_courses,
                dont_filter=True,
                # wait_time=30,
                # wait_until=EC.text_to_be_present_in_element(
                #     (By.ID, "plnMain_pageTitle"), "2020 - 2021 Schedule"
                # ),
                meta={
                    "cookiejar": response.meta["cookiejar"],
                    "user": user
                },
            )
        yield SeleniumRequest(
            url=user["url"] + "/HomeAccess/Classes/Classwork",
            callback=self.parse_assignments,
            dont_filter=True,
            # wait_until=EC.text_to_be_present_in_element(
            #    (By.ID, "plnMain_lblForRCardRun"), "Report Card Run"
            # ),
            # wait_time=30,
            meta={
                "cookiejar": response.meta["cookiejar"],
                "user": user
            },
        )
Example #4
    def parse(self, response):

        top_item = response.xpath('//div[@class="top-item"]//h2/a').css(
            '::attr(href)').get()
        if top_item:
            yield SeleniumRequest(url=top_item, callback=self.parse_post)

        scroll_news = response.xpath(
            '//div[@id="top-news-scroll"]/ul/li//a[@class="font-16"]').css(
                '::attr(href)')
        if scroll_news:
            for new in scroll_news:
                time.sleep(1)
                yield SeleniumRequest(url=new.extract(),
                                      callback=self.parse_post)

        posts = response.xpath('//ul[@class="feed"]/li/h2/a').css(
            '::attr(href)')

        for post in posts:
            url_extract = post.extract()
            yield SeleniumRequest(url=url_extract, callback=self.parse_post)

        next_page = response.xpath(
            '//a[@class="btn btn-xs font-14 btn-primary"]')

        if next_page:
            next_url = response.xpath(
                '//a[@class="btn btn-xs font-14 btn-primary"]').css(
                    '::attr(href)').get()
            yield SeleniumRequest(url=next_url)
Example #5
    def errback_scrapepages_all(self, failure):
        # on failure, retry the next page URL or fall back to the Yelp home page
        meta = failure.request.meta
        print('\n'*2)
        print('in errback_scrapepages_all')
        print()
        page = meta['page']
        if len(page) != 0:
            print()
            print('near in errback_scrapepages_all')
            print()
            a = page[0]
            page.pop(0)
            yield SeleniumRequest(
                url=a,
                wait_time=1000,
                screenshot=True,
                callback=self.scrapepages,
                errback=self.errback_scrapepages_all,
                meta=meta,
                dont_filter=True
            )

        else:
            print()
            print('parse in errback_scrapepages_all')
            print()
            yield SeleniumRequest(
                url="https://www.yelp.com/",
                wait_time=1000,
                screenshot=True,
                callback=self.parse,
                errback=self.error_yelp,
                meta={'index': meta['index']},
                dont_filter=True
            )
Example #6
    def parse(self, response):
        products = response.xpath(
            "//li[@class='fpGridBox grid altDeal hasPrice']")
        for product in products:
            name = product.xpath(
                ".//a[contains(@class,'itemTitle')]/text()").get()
            link = product.xpath(
                ".//a[@class='itemTitle bp-p-dealLink bp-c-link']/@href").get(
                )
            absolute = f"https://slickdeals.net{link}"
            yield SeleniumRequest(url=absolute,
                                  wait_time=2,
                                  callback=self.product_comment,
                                  meta={'Name': name})

        next_page = response.xpath("//a[@data-role='next-page']/@href").get()
        if next_page:
            abs_url = f"https://slickdeals.net{next_page}"
            print(abs_url)
            print("***********************")
            yield SeleniumRequest(
                url=abs_url,
                wait_time=2,
                callback=self.parse,
                headers={
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
                })
Example #7
    def parse(self, response):
        """
        @with_selenium
        @url https://charityvillage.com/search/#results/5f4583ff061c57fc640eb1dc?job_type=-Unpaid+Volunteer+Position&page_num=1&kw=
        @returns items 20 20
        @scrape_not_none url title date_posted apply_before organisation location
        """
        page_jobs = []
        """ Iterating through the result of get_jobs_list()"""

        jobs_div_list = self.get_jobs_list(response)
        for div in jobs_div_list:

            # Calling abstract method get_job_dict()
            job_dict = self.get_job_dict(div)

            page_jobs.append(job_dict)
            """
            Load full job page only if:
            - load_full_jobs=Yes
            """
            if self.load_full_jobs:
                # Call parse_full_job_page() with job URL
                yield SeleniumRequest(url=job_dict['url'],
                                      callback=self.parse_full_job_page,
                                      cb_kwargs=dict(job_dict=job_dict),
                                      wait_time=self.selenium_wait_time,
                                      script=SCROLL_DOWN)

            else:
                yield Job(job_dict)
        """ Just printing """
        if self.load_full_jobs:
            print("Scraping {} jobs from {}...".format(len(page_jobs),
                                                       response.url))
        else:
            if not self.load_all_pages:
                print(
                    "Scraped {} jobs from {}. load_all_pages=False and load_full_jobs=False, some new job postings and job information might be missing"
                    .format(len(page_jobs), response.url))
            else:
                print(
                    "Scraped {} jobs from {}. load_full_jobs=False, some information might be missing"
                    .format(len(page_jobs), response.url))
        """
        Scrape next page if
         - load_all_pages=True and get_next_page_url() is not None
        """
        if self.load_all_pages:
            next_page_url = self.get_next_page_url(response)
            if next_page_url is not None:
                # Loading next page...
                yield SeleniumRequest(url=next_page_url,
                                      callback=self.parse,
                                      wait_time=self.selenium_wait_time,
                                      script=SCROLL_DOWN,
                                      dont_filter=True)
            else:
                print("No more pages to load")
Example #8
    def emailtrack(self, response):
        driver = response.meta['driver']
        html = driver.page_source
        response_obj = Selector(text=html)
        page = response.meta['page']
        category = response.meta['category']
        index = response.meta['index']
        find = response.meta['find']
        near = response.meta['near']
        catg = response.meta['catg']
        duplicateurl = response.meta['duplicateurl']
        links = LxmlLinkExtractor(allow=()).extract_links(response)
        Finallinks = [str(link.url) for link in links]
        links = []
        for link in Finallinks:
            if ('Contact' in link or 'contact' in link or 'About' in link
                    or 'about' in link or 'home' in link or 'Home' in link
                    or 'HOME' in link or 'CONTACT' in link or 'ABOUT' in link):
                links.append(link)

        links.append(str(response.url))

        if (len(links) > 0):
            l = links[0]
            links.pop(0)
            uniqueemail = set()
            yield SeleniumRequest(url=l,
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.finalemail,
                                  dont_filter=True,
                                  meta={
                                      'links': links,
                                      'page': page,
                                      'category': category,
                                      'index': index,
                                      'find': find,
                                      'near': near,
                                      'catg': catg,
                                      'duplicateurl': duplicateurl,
                                      'uniqueemail': uniqueemail
                                  })
        else:
            yield SeleniumRequest(url='https://www.google.com/',
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.scrapepages,
                                  dont_filter=True,
                                  meta={
                                      'page': page,
                                      'category': category,
                                      'index': index,
                                      'find': find,
                                      'near': near,
                                      'catg': catg,
                                      'duplicateurl': duplicateurl
                                  })
Example #9
    def parse_movie_list(self, response, **kwargs):
        offset = response.meta['offset']
        # yield SeleniumRequest(url=self.movie_api.format(offset), meta={'offset': offset}, callback=self.parse_movie_list)
        # selector = 'li' if offset > 100 else '#results > li'
        if offset < 100:
            for movie in response.css('#results > li')[:5]:
                data = dict(
                    title=movie.css('span.display-title > a::text').get(),
                    url=movie.css('span.display-title > a::attr(href)').get(),
                    country=movie.css('ul.title > li:contains("Country")::text').get('').split(',')[0].split(':')[
                                -1].strip() or
                            movie.css('ul.title > li:contains("Country")::text').get('').split(',')[0].split(':')[
                                -1].strip(),
                    genre=', '.join(movie.css('ul.title > li:contains("Genre")::text').extract()).strip().split(':')[
                        -1],
                    year=movie.css('span.year::text').get('').replace('(', '').replace(')', ''),
                    gross=movie.css('span.us_gross::text').get('').strip(),
                    budget=movie.css('span.budget_usd::text').get('').strip()
                )
                data['path'] = urlparse(data.get('url')).path
                yield data
                # yield SeleniumRequest(url=f'https://www.imdb.com{data["path"]}fullcredits',
                #                       callback=self.parse_movie_details, dont_filter=True,
                #                       meta={'data': data})
            offset += 100
            yield SeleniumRequest(url=self.movie_api.format(offset), meta={'offset': offset},
                                  callback=self.parse_movie_list)

        else:
            movie = False
            for movie in response.css('li'):
                if movie.css('ul li span.display-title > a::text').get():
                    data = dict(
                        title=movie.css('ul li span.display-title > a::text').get(),
                        url=movie.css('ul li span.display-title > a::attr(href)').get(),
                        genre=
                        ', '.join(movie.css('ul li:last-child::text').extract()).strip().split(':')[-1],
                        year=movie.css('span.year::text').get('').replace('(', '').replace(')', ''),
                        country=
                        movie.css('ul.title > li > span > span:contains("Country")::text').get('').split(',')[0].split(
                            ':')[-1].strip() or
                        movie.css('ul.title > li > span > span:contains("Countries")::text').get('').split(',')[
                            0].split(':')[-1].strip(),
                        gross=movie.css('span.us_gross::text').get('').strip(),
                        budget=movie.css('span.budget_usd::text').get('').strip(),
                    )
                    data['path'] = urlparse(data.get('url')).path
                    yield data
                    # yield SeleniumRequest(url=f'https://www.imdb.com{data["path"]}fullcredits',
                    #                       callback=self.parse_movie_details,
                    #                       dont_filter=True, meta={'data': data})

            if movie:
                offset += 100
                yield SeleniumRequest(url=self.movie_api.format(offset), meta={'offset': offset},
                                      callback=self.parse_movie_list)
Example #10
    def start_requests(self):

        for asin in self.asinList:

            request = SeleniumRequest(
                url="https://www.amazon.in/dp/{}".format(asin),
                callback=self.parseData,
                errback=self.asinsNotScraped)
            request.meta['asin'] = asin
            request.meta['proxy'] = "172.16.115.110:25008"
            yield request
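The asinsNotScraped errback is not shown in this example; a minimal sketch of what it might do, assuming it only records the ASIN of the failed request:

    def asinsNotScraped(self, failure):
        # hypothetical errback: log the ASIN whose product page could not be fetched
        asin = failure.request.meta.get('asin')
        self.logger.warning('Could not scrape ASIN %s: %r', asin, failure.value)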
Example #11
    def emailtrack(self, response):
        driver = response.meta['driver']
        index = response.meta['index']
        web_name = response.meta['web_name']
        web_type = response.meta['web_type']

        html = driver.page_source
        response_obj = Selector(text=html)

        links = LxmlLinkExtractor(allow=()).extract_links(response)
        Finallinks = [str(link.url) for link in links]
        links = []
        for link in Finallinks:
            if ('Contact' in link or 'contact' in link or 'About' in link
                    or 'about' in link or 'CONTACT' in link
                    or 'ABOUT' in link):
                links.append(link)

        links.append(str(response.url))

        if (len(links) > 0):
            l = links[0]
            links.pop(0)
            uniqueemail = set()

            yield SeleniumRequest(url=l,
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.finalemail,
                                  errback=self.errback_finalemail,
                                  meta={
                                      'index': index,
                                      'web_name': web_name,
                                      'web_type': web_type,
                                      'uniqueemail': uniqueemail,
                                      'links': links
                                  },
                                  dont_filter=True)
        else:
            finalemail = []
            yield SeleniumRequest(url='https://www.google.com/',
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.parse_page,
                                  errback=self.errback_google,
                                  meta={
                                      'index': index,
                                      'web_name': web_name,
                                      'web_type': web_type,
                                      'finalemail': finalemail,
                                      'links': links
                                  },
                                  dont_filter=True)
Example #12
 def parse(self, response):
     product_link = response.xpath(
         '//div[@class="product_name"]/a/@href').getall()
     next_page = response.xpath(
         '(//a[@class="pageNavLink pageNavNext"])[2]/@href').get()
     print(next_page)
     for product in product_link:
         product_url = response.urljoin(product)
         yield SeleniumRequest(url=product_url, callback=self.parse_item_2)
     if next_page:
         print('following')
         yield SeleniumRequest(url=next_page, callback=self.parse)
Example #13
    def emailtrack(self, response):
        driver = response.meta['driver']
        html = driver.page_source
        response_obj = Selector(text=html)
        page = response.meta['page']
        # category = response.meta['category']
        index = response.meta['index']
        find = response.meta['find']
        near = response.meta['near']
        # catg = response.meta['catg']
        # duplicateurl = response.meta['duplicateurl']
        links = LxmlLinkExtractor(allow=()).extract_links(response)
        Finallinks = [str(link.url) for link in links]
        linkscheck = []
        for link in Finallinks:
            if ('Contact' in link or 'contact' in link or 'About' in link
                    or 'about' in link or 'CONTACT' in link or 'ABOUT' in link):
                linkscheck.append(link)

        links = []
        for link in linkscheck:
            if ('facebook' not in link and 'instagram' not in link
                    and 'youtube' not in link and 'twitter' not in link
                    and 'wiki' not in link and 'linkedin' not in link):
                links.append(link)
        links.append(str(response.url))

        if (len(links) > 0):
            l = links[0]
            links.pop(0)
            uniqueemail = set()
            yield SeleniumRequest(
                url=l,
                wait_time=1000,
                screenshot=True,
                callback=self.finalemail,
                errback=self.errback_finalemail,
                dont_filter=True,
                meta={'links': links, 'page': page, 'index': index, 'find': find, 'near': near,
                      'uniqueemail': uniqueemail}
            )
        else:
            finalemail = []
            driver = response.meta['driver']
            yield SeleniumRequest(
                url=driver.current_url,
                wait_time=1000,
                screenshot=True,
                callback=self.data_save,
                errback=self.error_google,
                dont_filter=True,
                meta={'page': page, 'index': index, 'find': find, 'near': near, 'finalemail': finalemail}
            )
Example #14
    def likes_follow(self, response):
        self.channel_type = []
        self.pc = []
        self.l_f = []

        #       getting some data from the home page
        if response.xpath('//div[@class="_4bl9"]/ div//text()').getall()[0:2]:
            self.l_f = response.xpath(
                '//div[@class="_4bl9"]/ div//text()').getall()[0:2]

        if response.xpath(
                '//div[@class="_3qn7 _61-0 _2fyi _3qnf _2pi9 _3-95"]/span/text()'
        ).getall():
            self.pc = response.xpath(
                '//div[@class="_3qn7 _61-0 _2fyi _3qnf _2pi9 _3-95"]/span/text()'
            ).getall()

        if response.xpath('//a[@class="_81ge"]/text()').getall():
            self.path1 = response.xpath('//a[@class="_81ge"]/text()').getall()
            self.channel_type = [self.path1[len(self.path1) - 1]]

        try:
            #           put '/' in the url path where there is no '/' at the end and create a about page url
            set_url = ''
            set_url = self.driver.current_url
            if not str(set_url).endswith('/'):
                set_url += '/'
            self.aboutc = urlparse(
                self.driver.current_url).scheme + "://" + urlparse(
                    self.driver.current_url).netloc + "/pg" + urlparse(
                        self.driver.current_url
                    ).path + "about/?ref=page_internal"

        except Exception as e:
            print(e)
            # fall back to a neutral page and stop, so the about-page request
            # below is not built from a stale URL
            yield SeleniumRequest(url='https://www.google.com/',
                                  callback=self.parse,
                                  wait_time=10,
                                  screenshot=True,
                                  dont_filter=True)
            return

        self.driver.save_screenshot('image2.png')
        print(self.aboutc)

        #       send request to about page
        yield SeleniumRequest(url=self.aboutc,
                              callback=self.get_about,
                              wait_time=10,
                              screenshot=True,
                              dont_filter=True)
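self.driver is not set anywhere in this snippet; with scrapy-selenium the WebDriver instance is exposed on the response meta, so an earlier callback presumably stored it with something like:

    def parse(self, response):
        # scrapy-selenium exposes the underlying WebDriver on the response meta
        self.driver = response.meta['driver']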
Example #15
    def handle_product_listings(self, response):
        urls = response.xpath(
            '//ul[@class="productLister gridView"]//li[@class="gridItem"]//h3/a/@href'
        ).extract()
        for url in urls:
            if url.startswith('http'):
                yield SeleniumRequest(url=url,
                                      callback=self.parse_product_detail)

        next_page = response.xpath(
            '//ul[@class="pages"]/li[@class="next"]/a/@href').extract()
        if next_page:
            yield SeleniumRequest(url=next_page[0],
                                  callback=self.handle_product_listings)
Example #16
    def scrape_product(self, response):
        """
        scrape product information
        :param response:
        :return:
        """

        for href in response.css(
                'ul.a-pagination li.a-last ::attr(href)').getall():
            self.logger.info('following to next page: %s', href)
            print('{}/{}'.format(self.amazon_base_url, href))
            # yield response.follow(href, self.scrape_product)
            yield SeleniumRequest(url='{}/{}'.format(self.amazon_base_url,
                                                     href),
                                  callback=self.scrape_product,
                                  errback=self.failure,
                                  dont_filter=True,
                                  wait_time=0.0001,
                                  wait_until=EC.element_to_be_clickable(
                                      (By.CLASS_NAME, 's-search-results')))

        result_xpath = '//div[@id="search"]//span[@data-component-type="s-search-results"]//div[@data-asin]'
        for product in response.xpath(result_xpath):
            asin = product.xpath('./@data-asin').get()
            product_info = {
                'asin':
                asin,
                'product_url':
                '{}/dp/{}'.format(self.amazon_base_url, asin),
                'product_name':
                product.xpath(
                    './/span[@class="a-size-base-plus a-color-base a-text-normal"]/text()'
                ).get(),
                'lowest_price':
                product.xpath('.//span[@class="a-price"]/span/text()').get(),
                'prime':
                product.xpath(
                    './/i[@aria-label="Amazon Prime"]/@aria-label').get(),
                'start_rating':
                product.xpath('.//span[@class="a-icon-alt"]/text()').get(),
            }
            yield SeleniumRequest(url=product_info['product_url'],
                                  callback=self.scrape_product_details,
                                  meta=dict(product_info=product_info),
                                  errback=self.failure,
                                  dont_filter=True,
                                  wait_time=0.0001,
                                  wait_until=EC.element_to_be_clickable(
                                      (By.ID, 'productTitle')))
Example #17
 def parse(self, response):
     for url in response.css(
             "header.c-eventSummary-header > a::attr('href')").extract():
         yield SeleniumRequest(
             wait_time=5,
             wait_until=EC.visibility_of_element_located(
                 (By.CSS_SELECTOR, "div.c-eventSummary-stats")),
             url=urljoin(self.root_url, url),
             callback=self.parse_event,
         )
     next_page = response.css(
         'li.page > a[rel="next"]::attr("href")').extract_first()
     if next_page:
         yield SeleniumRequest(url=urljoin(self.root_url, next_page),
                               callback=self.parse)
Example #18
 def _follow_link(self, response):
     driver = self.get_driver(response)
     for element in driver.find_elements_by_xpath(
             '//*[@id="div1"]/table/tbody/tr/td/div/a'):
         self.log('find link {}'.format(element.text), level=logging.INFO)
         yield SeleniumRequest(response.urljoin(
             element.get_attribute('href')),
                               callback=self.parse)
     for element in driver.find_elements_by_xpath(
             '//div[@class="hot-ent-box"]/div[contains(@class, "hot fl")]/ul/li/a'
     ):
         self.log('find link {}'.format(element.text), level=logging.INFO)
         yield SeleniumRequest(response.urljoin(
             element.get_attribute('href')),
                               callback=self.parse)
Example #19
 def start_requests(self):
     yield SeleniumRequest(
         url='https://slickdeals.net/computer-deals/',
         wait_time=10,
         screenshot=True,
         callback=self.parse
     )
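With screenshot=True, scrapy-selenium attaches the PNG data of the rendered page to the response meta; a small sketch of a callback that saves it (the filename is an assumption):

    def parse(self, response):
        # the screenshot bytes are exposed as response.meta['screenshot']
        with open('screenshot.png', 'wb') as image_file:
            image_file.write(response.meta['screenshot'])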
Example #20
 def start_requests(self):
     for url in self.start_urls:
         # Auto scroll down
         yield SeleniumRequest(url=url,
                               callback=self.parse,
                               wait_time=self.selenium_wait_time,
                               script=SCROLL_DOWN)
Example #21
    def parse(self, response):
        products = response.xpath(
            "//ul[@class='dealTiles categoryGridDeals']/li")
        for product in products:
            yield {
                'name':
                product.xpath(".//a[@class='itemTitle']/text()").get(),
                'link':
                product.xpath(".//a[@class='itemTitle']/@href").get(),
                'store_name':
                self.remove_characters(
                    product.xpath(
                        "normalize-space(.//span[@class='itemStore']/text())").
                    get()),
                'price':
                product.xpath(
                    "normalize-space(.//div[@class='itemPrice  wide ']/text())"
                ).get()
            }

        next_page = response.xpath("//a[@data-role='next-page']/@href").get()
        if next_page:
            absolute_url = f"https://slickdeals.net{next_page}"
            yield SeleniumRequest(url=absolute_url,
                                  wait_time=3,
                                  callback=self.parse)
Example #22
 def start_requests(self):
     yield SeleniumRequest(
         url="https://www.flyuia.com/ua/en/home",
         wait_time=5,
         screenshot=True,
         callback=self.parse,
     )
Example #23
 def start_requests(self):
     yield SeleniumRequest(
         url="https://www.amazon.in",
         wait_time=3,
         screenshot=True,
         callback=self.selenium_tasks
     )
Example #24
 def start_requests(self):
     yield SeleniumRequest(
         url="https://www.myanimelist.net",
         callback=self.link_to_login_form,
         wait_time=10,
         wait_until=EC.element_to_be_clickable((By.ID, 'malLogin'))
     )
Example #25
    def parse(self, response, discipline, page_number):

        results = []
        title_page = response.css('title::text').get()
        if not re.search('Error', title_page):  # no error page is shown
            articles = response.css('ol#results-list li')
            for article in articles:
                article_link = article.css('a').attrib.get('href')
                title = article.css('a::text').get()
                article_type = article.css('p.content-type::text').get().strip()
                # if the article is not scraped
                if not self.db[self.collection].find_one({'article_link': article_link}):

                    yield response.follow(url=article_link,
                                          callback=self.parse_item,
                                          cb_kwargs={'title': title,
                                                     'discipline': discipline,
                                                     'article_type': article_type,
                                                     'page_number': page_number})
                results.append(article_link)
                if len([result for result in results if result is not None]) == len(results):
                    break

        elif re.search('Error', title_page):  ## error page is shown
            time.sleep(3)
            url = response.url
            yield SeleniumRequest(url=url,
                                  callback=self.parse,
                                  wait_time=60,
                                  # wait_until=EC.presence_of_element_located((By.XPATH, '//a[@class=""]')),
                                  cb_kwargs={'discipline': discipline,
                                             'page_number': page_number
                                             }
                                  )
Example #26
 def start_requests(self):
     yield SeleniumRequest(
         url="https://kathmandupost.com",
         wait_time=3,
         screenshot=True,
         callback=self.parse,
         headers={'User-Agent': TheKathmanduPostSpider.ua.random})
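The ua attribute is not part of this snippet; it presumably comes from the fake-useragent package, defined at class level along the lines of:

    import scrapy
    from fake_useragent import UserAgent

    class TheKathmanduPostSpider(scrapy.Spider):
        name = 'kathmandupost'   # assumption
        ua = UserAgent()         # referenced above as TheKathmanduPostSpider.ua.random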
Example #27
 def start_requests(self):
     yield SeleniumRequest(
         url='https://duckduckgo.com',
         wait_time=3,
         screenshot=True,
         callback=self.parse
     )
Example #28
    def start_requests(self):
        start_urls = ['https://dashboard.vidtao.com/login']

        yield SeleniumRequest(url=start_urls[0],
                              callback=self.parse,
                              wait_time=10,
                              screenshot=True)
Example #29
 def start_requests(self):
     yield SeleniumRequest(
         url="https://www.lazada.com.my/shop-software/?spm=a2o4k.pdp_revamp.breadcrumb.3.54b05c1aRaQ9f5",
         callback=self.parse,
         headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'},
         dont_filter=True,
     )
Example #30
    def app_listings(self, response):
        driver = response.meta['driver']
        cntr = True
        while cntr:
            time.sleep(3)
            html = driver.page_source
            response = Selector(text=html)

            app_listings = response.xpath(
                "//div[@id='searchresult']/div[@class='result']")
            for apps in app_listings:
                url = self.gen_app_url(
                    apps.xpath(".//a[@class='search']/@href").get())
                applicant = apps.xpath(".//div/text()").getall()
                yield scrapy.Request(url=url,
                                     callback=self.parse,
                                     meta={'applicant': applicant})

            next_page = response.xpath("//a[@class='next']")
            if next_page:
                np = driver.find_element_by_xpath("//a[@class='next']")
                np.click()
            else:
                cntr = False

        for url in self.urls[1:]:
            yield SeleniumRequest(url=url,
                                  wait_time=8,
                                  callback=self.app_listings)