Example #1
 def start_requests(self):
     for url in self.start_urls:
         yield SeleniumRequest(url=url,
                               dont_filter=True,
                               callback=self.parse_list,
                               errback=self.handle_failure_selenium)
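Example #1 references an errback, handle_failure_selenium, that the snippet does not include. A minimal sketch of such a handler, assuming it only needs to log the failure and let the crawl continue (the logging behaviour is an assumption; the method name comes from the example):

 def handle_failure_selenium(self, failure):
     # Scrapy calls the errback with a twisted Failure; the originating
     # request is available on it, so log the URL and move on.
     self.logger.error('SeleniumRequest failed: %s', failure.request.url)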
Example #2
 def create_request(self, item):
     yield SeleniumRequest(url=item.link, callback=self.parse, meta={'item': item})
Example #3
 def start_requests(self):
     yield SeleniumRequest(
         url="https://www.wish.com/feed/tag_53dc186321a86318bdc87ef8",
         wait_time=6,
         callback=self.parse)
Example #4
    def parse_links(self, response):
        """Parse links to specific offers using the JSON returned by the API."""

        html_data = response.xpath(
            '//pre/text()').get()  # raw JSON embedded in the HTML
        json_data = json.loads(html_data)  # parse the JSON string
        pages_100_handler = 0  # counts how many offers have been requested

        # Search the offers according to the user-given parameters (pages_100,
        # localization, salary range) and pass each match to parse_offers via
        # the callback. The scraper only keeps offers with a stated salary;
        # the salary-condition logic is taken directly from the webpage. In
        # the authors' opinion this is a bit of a distortion, but it keeps the
        # scraper consistent with the Selenium part of the project.

        for i in json_data:
            if i["salary_from"] is None or i["salary_to"] is None:
                continue
            if (self.localization_choice != "ALL"
                    and self.localization_choice != i["city"]):
                continue
            salary_from = int(i["salary_from"])
            salary_to = int(i["salary_to"])
            # Keep the offer if either endpoint of its salary range falls
            # inside the requested range, or if its range fully contains the
            # requested one.
            if (self.salary_expectations_lower <= salary_from <=
                    self.salary_expectations_upper
                    or self.salary_expectations_lower <= salary_to <=
                    self.salary_expectations_upper
                    or (salary_from < self.salary_expectations_lower
                        and salary_to > self.salary_expectations_upper)):
                if pages_100_handler >= self.pages_100:
                    break
                offers_url = "https://justjoin.it/offers/" + i["id"]
                pages_100_handler += 1
                yield SeleniumRequest(
                    url=offers_url,
                    callback=self.parse_offers,
                    wait_time=10,
                    wait_until=EC.element_to_be_clickable(
                        (By.CSS_SELECTOR, 'div.css-1xc9aks')))
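The three-way salary condition in Example #4 is easier to test when factored into a predicate. A sketch under the same logic (the function name salary_overlaps is hypothetical):

 def salary_overlaps(salary_from, salary_to, lower, upper):
     # True when [salary_from, salary_to] intersects [lower, upper]:
     # either endpoint lies inside the requested range, or the offer's
     # range fully contains it. This mirrors the webpage's own logic.
     return (lower <= salary_from <= upper
             or lower <= salary_to <= upper
             or (salary_from < lower and salary_to > upper))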
Example #5
 def parse(self, response, **kwargs):
     self.cookies = self.login(response)
     yield SeleniumRequest(url=self.movie_api.format(0), callback=self.parse_movie_list, meta={'offset': 0})
Example #6
    def parse(self, response):
        driver = response.meta['driver']
        index = response.meta['index']

        # Stagger the delay between requests; the sleep lengths were picked
        # for manual checking.
        if (index % 3 == 0):
            time.sleep(9)
        elif (index % 3 == 1):
            time.sleep(17)
        else:
            time.sleep(3)

        if (index > 0):
            driver.close()
            driver.switch_to.window(driver.window_handles[0])

        # Input files that hold the remaining search queues.
        country_path = os.path.join(os.path.abspath(os.curdir), 'countryfour.txt')
        keyword_path = os.path.join(os.path.abspath(os.curdir), 'keywordfour.txt')
        city_path = os.path.join(os.path.abspath(os.curdir), 'cityfour.txt')

        if (os.stat(country_path).st_size != 0
                and os.stat(keyword_path).st_size != 0
                and os.stat(city_path).st_size != 0):
            with open(country_path, "r") as f:
                country_name = f.read().splitlines()

            with open(keyword_path, "r") as f:
                keyword_name = f.read().splitlines()

            with open(city_path, "r") as f:
                city_name = f.read().splitlines()

            WebDriverWait(driver, 100).until(
                EC.presence_of_element_located((By.ID, "searchbutton")))

            self.website_type.clear()
            self.website_name.clear()
            self.duplicate.clear()
            self.country = ''
            self.keyword = ''
            self.city = ''

            length = len(country_name)
            print("\n" * 2)
            print(index, length)
            print("\n" * 2)

            print("\n" * 2)
            print('passed values')
            print("\n" * 2)

            self.country = country_name[0]
            self.keyword = keyword_name[0]
            self.city = city_name[0]
            index += 1

            if (self.country == '' or self.keyword == '' or self.city == ''):

                country_name.pop(0)
                keyword_name.pop(0)
                city_name.pop(0)

            if (len(country_name) > 0 and len(keyword_name) > 0
                    and len(city_name) > 0):
                self.country = country_name[0]
                self.keyword = keyword_name[0]
                self.city = city_name[0]

                driver.find_element(By.XPATH, '//*[@id="countrytags"]').clear()
                search_input1 = driver.find_element(By.XPATH,
                                                    '//*[@id="countrytags"]')
                search_input1.send_keys(country_name[0])

                driver.find_element(By.XPATH, '//*[@id="searchinput"]').clear()
                search_input2 = driver.find_element(By.XPATH,
                                                    '//*[@id="searchinput"]')
                search_input2.send_keys(keyword_name[0])

                driver.find_element(By.XPATH, '//*[@id="city"]').clear()
                search_input3 = driver.find_element(By.XPATH, '//*[@id="city"]')
                search_input3.send_keys(city_name[0])
                search_button = driver.find_element(By.XPATH,
                                                    '//*[@id="searchbutton"]')
                search_button.click()
                time.sleep(4)
                driver.switch_to.window(driver.window_handles[1])

                #for check
                time.sleep(11)

                print('\n' * 2)
                print('Above yield to parse_page')
                print('\n' * 2)

                country_name.pop(0)
                keyword_name.pop(0)
                city_name.pop(0)

                if (len(country_name) > 0 and len(keyword_name) > 0
                        and len(city_name) > 0):

                    # Write the remaining queues back to their files, one
                    # entry per line.
                    with open(country_path, 'w') as f:
                        f.write('\n'.join(country_name) + '\n')

                    with open(city_path, 'w') as f:
                        f.write('\n'.join(city_name) + '\n')

                    with open(keyword_path, 'w') as f:
                        f.write('\n'.join(keyword_name) + '\n')

                    yield SeleniumRequest(url=driver.current_url,
                                          wait_time=1000,
                                          screenshot=True,
                                          callback=self.parse_page,
                                          errback=self.errback_parse_page,
                                          meta={'index': index},
                                          dont_filter=True)
                else:
                    # All queues are exhausted: truncate the three input files.
                    for path in (keyword_path, country_path, city_path):
                        with open(path, 'w') as f:
                            f.write('')

                    yield SeleniumRequest(url=driver.current_url,
                                          wait_time=1000,
                                          screenshot=True,
                                          callback=self.parse_page,
                                          errback=self.errback_parse_page,
                                          meta={'index': index},
                                          dont_filter=True)
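Example #6 reads and rewrites the same three queue files in several places. A hedged helper pair that captures the pattern (the function names are hypothetical; the file layout, one entry per line, comes from the example):

 import os

 def read_queue(filename):
     # Return the lines of a queue file located in the working directory.
     path = os.path.join(os.path.abspath(os.curdir), filename)
     with open(path, 'r') as f:
         return f.read().splitlines()

 def write_queue(filename, items):
     # Overwrite the queue file with the remaining items, one per line.
     path = os.path.join(os.path.abspath(os.curdir), filename)
     with open(path, 'w') as f:
         f.write('\n'.join(items) + ('\n' if items else ''))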
Example #7
 def start_requests(self):
     yield SeleniumRequest(url='http://quotes.toscrape.com/js/',
                           callback=self.parse)
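quotes.toscrape.com/js renders its quotes with JavaScript, so a plain HTTP response carries no quote markup; by the time the SeleniumRequest callback runs, the DOM has been rendered and ordinary Scrapy selectors work. A sketch of such a callback (the CSS classes are assumptions based on the public demo site's markup):

 def parse(self, response):
     # The response body is the JavaScript-rendered DOM.
     for quote in response.css('div.quote'):
         yield {
             'text': quote.css('span.text::text').get(),
             'author': quote.css('small.author::text').get(),
         }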
Example #8
 def start_requests(self):
     yield SeleniumRequest(
         url="https://bulldogjob.com/companies/jobs?page=1",
         wait_time=5,
         callback=self.parse)
Example #9
 def start_requests(self):
     yield SeleniumRequest(
         url="https://rodeosusa.com/rodeos/",
         wait_time=10,
         callback=self.parse
     )
Example #10
 def start_requests(self):
     yield SeleniumRequest(
         url=
         "https://webdevelopment111.com/directory/wp-admin/options-general.php?page=grw",
         wait_time=10,
         callback=self.parse)
Example #11
 def start_requests(self):
     yield SeleniumRequest(
         url="http://da.ballina.nsw.gov.au/Home/Disclaimer",
         wait_time=5,
         callback=self.app_listings
     )
Example #12
 def start_requests(self):
     yield SeleniumRequest(url="https://www.udr.com/search-apartments",
                           wait_time=5,
                           callback=self.parse)
Example #13
 def start_requests(self):
     yield SeleniumRequest(url='https://duckduckgo.com',
                           wait_time=3,
                           screenshot=True,
                           callback=self.parse)
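With screenshot=True, scrapy-selenium attaches the PNG bytes of the rendered page to the request meta (per the scrapy-selenium README). A sketch of a callback that saves it (the file name is an arbitrary choice):

 def parse(self, response):
     screenshot = response.meta.get('screenshot')
     if screenshot:
         # The value is the raw PNG produced by the Selenium driver.
         with open('duckduckgo.png', 'wb') as f:
             f.write(screenshot)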
Example #14
 def start_requests(self):
     yield SeleniumRequest(
         url="https://www.walmart.com/cp/Musical-Instruments/7796869?&povid=4171+%7C+contentZone20+%7C+2017-02-09+%7C+1+%7C+Musical+Flyout",
         wait_time=6,
         callback=self.parse
     )
Example #15
 def start_requests(self):
     yield SeleniumRequest(url="https://justjoin.it/",
                           wait_time=3,
                           callback=self.parse)
Example #16
 def start_requests(self):
     yield SeleniumRequest(
         url=
         "https://eservices.moreland.vic.gov.au/ePathway/Production/Web/GeneralEnquiry/EnquiryLists.aspx?ModuleCode=LAP&js=-1768304710",
         wait_time=5,
         callback=self.parse)
Example #17
    def parse_page(self, response):
        Searchdetails_item = SearchdetailItem()
        driver = response.meta['driver']
        index = response.meta['index']
        # traffic_url = response.meta['traffic_url']

        if (response.url == 'https://www.google.com/'):
            finalemail = response.meta['finalemail']
            web_name = response.meta['web_name']
            web_type = response.meta['web_type']
            Searchdetails_item['url'] = web_name
            Searchdetails_item['email'] = '-'
            Searchdetails_item['country'] = self.country
            Searchdetails_item['keyword'] = self.keyword
            Searchdetails_item['city'] = self.city
            Searchdetails_item['type'] = web_type

            if (len(finalemail) == 0):
                yield Searchdetails_item
            else:
                if (len(finalemail) < 5):
                    length = len(finalemail)
                else:
                    length = 5
                for i in range(0, length):
                    Searchdetails_item['email'] = finalemail[i]
                    yield Searchdetails_item

        else:
            print('\n' * 2)
            print(driver.current_url)
            print('\n' * 2)
            # time.sleep(8)

            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "tvcap")))
            except Exception:
                print('\n' * 2)
                print('tvcap except', driver.current_url)
                print('\n' * 2)

            details = response.xpath('//div[@id="tvcap"]/div/div/ol/li/div[1]')

            for detail in details:

                name = detail.xpath(
                    './/div/div/div[1]/w-visurl/div/span[2]/text()').get()
                if name is None:
                    print('\n' * 2)
                    print('gone for second xpath')
                    print('\n' * 2)
                    name = detail.xpath('.//div/cite/text()').get()
                print('\n' * 2)
                print('name', name)
                print('\n' * 2)
                if name is not None:
                    name_list = name.split('/')
                    # Normalise the scheme; without the else branch,
                    # name_added could be left over from a previous iteration.
                    if 'http://' not in name_list[0]:
                        name_added = 'http://' + name_list[0]
                    else:
                        name_added = name_list[0]
                    if (name_added not in self.duplicate
                            and 'www.' in name_added):
                        self.duplicate.append(name_added)
                        self.website_name.append(name_added)
                        self.website_type.append('organic')

            print('\n' * 2)
            print(self.website_name)
            print('\n' * 2)

            details = response.xpath(
                '//div[@id="bottomads"]/div/div/ol/li/div[1]')

            for detail in details:

                name = detail.xpath(
                    './/div/div/div[1]/w-visurl/div/span[2]/text()').get()
                if name is None:
                    print('\n' * 2)
                    print('gone for second xpath')
                    print('\n' * 2)
                    name = detail.xpath('.//div/cite/text()').get()
                print('\n' * 2)
                print('name', name)
                print('\n' * 2)
                if name is not None:
                    name_list = name.split('/')
                    if 'http://' not in name_list[0]:
                        name_added = 'http://' + name_list[0]
                    else:
                        name_added = name_list[0]
                    if (name_added not in self.duplicate
                            and 'www.' in name_added):
                        self.duplicate.append(name_added)
                        self.website_name.append(name_added)
                        self.website_type.append('organic')

            print('\n' * 2)
            print(self.website_name)
            print('\n' * 2)

            print('\n' * 2)
            print('check wait')
            print('\n' * 2)

            #some try
            # time.sleep(60)
            details = response.xpath('//div[@class="X2Dase irChYb"]')
            for detail in details:

                name = detail.xpath('.//span[2]/span/text()').get()
                if name is None:
                    print('\n' * 2)
                    print('gone for second xpath')
                    print('\n' * 2)
                    name = detail.xpath('.//div/cite/text()').get()
                print('\n' * 2)
                print('name', name)
                print('\n' * 2)
                if name is not None:
                    name_list = name.split('/')
                    if 'http://' not in name_list[0]:
                        name_added = 'http://' + name_list[0]
                    else:
                        name_added = name_list[0]
                    if (name_added not in self.duplicate
                            and 'www.' in name_added):
                        self.duplicate.append(name_added)
                        self.website_name.append(name_added)
                        self.website_type.append('organic')

            try:
                WebDriverWait(driver, 100).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "i0vbXd")))
            except Exception:
                print('\n' * 2)
                print('i0vbXd except', driver.current_url)
                print('\n' * 2)

            time.sleep(3)
            print('\n' * 2)
            print('check xpath')
            print('\n' * 2)

            try:
                click_moreads = driver.find_element(
                    By.XPATH,
                    '//*[@id="rso"]/div/div/div/div/div/div/div/div/a/div/span[contains(text(),"More")]'
                )
                click_moreads.click()
            except Exception:
                print('\n' * 2)
                print('rso except', driver.current_url)
                print('\n' * 2)

            driver = response.meta['driver']
            print('\n' * 2)
            print(driver.current_url)
            print('\n' * 2)

            time.sleep(4)
            html = driver.page_source
            response_obj = Selector(text=html)

            details = response_obj.xpath('//span[@class="VqFMTc p8AiDd"]')
            print(len(details))
            # print('\n' * 3)
            for idx in range(1, len(details) + 1):

                search_ads = driver.find_element(
                    By.XPATH, "(//span[@class='VqFMTc p8AiDd'])[{}]".format(idx))
                search_ads.click()
                time.sleep(3)
                html = driver.page_source
                response_obj = Selector(text=html)
                name = response_obj.xpath(
                    '//a[@class="CL9Uqc ab_button" ][contains(text(),"Website")]/@href'
                ).get()
                print('\n' * 2)
                print('name', name)
                print('\n' * 2)
                if name is not None:
                    # Prepend a scheme when the link lacks one.
                    if 'http://' not in name and 'https://' not in name:
                        name = 'http://' + name

                    if (name not in self.duplicate and 'www.' in name):
                        self.duplicate.append(name)
                        self.website_name.append(name)
                        self.website_type.append('map')


            print('\n' * 2)
            print(self.website_name)
            print(self.website_type)
            print('\n' * 2)

        if (len(self.website_type) != 0):
            web_url = self.website_name[0]
            web_type = self.website_type[0]
            self.website_name.pop(0)
            self.website_type.pop(0)
            print('\n' * 2)
            print('ready to emailtrack')
            print(web_url)
            print(web_type)
            print('\n' * 2)
            yield SeleniumRequest(url=web_url,
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.emailtrack,
                                  errback=self.emailtrack_errback,
                                  meta={
                                      'index': index,
                                      'web_name': web_url,
                                      'web_type': web_type
                                  },
                                  dont_filter=True)
        else:
            print('\n' * 2)
            print('back to isearch')
            print('\n' * 2)
            yield SeleniumRequest(url='http://isearchfrom.com/',
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.parse,
                                  errback=self.parse_errback,
                                  meta={'index': index},
                                  dont_filter=True)
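Examples #17 and #19 keep self.duplicate as a list and probe it with `in`, which is a linear scan per lookup. A hedged alternative is to initialise it as a set (the initialisation site shown is an assumption; the append() call sites would become add()):

 def __init__(self, *args, **kwargs):
     super().__init__(*args, **kwargs)
     # A set makes the "seen this site already?" check O(1) instead of O(n).
     self.duplicate = set()
     self.website_name = []
     self.website_type = []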
Example #18
 def start_requests(self):
     for url in self.start_urls:
         yield SeleniumRequest(url=url)
Example #19
    def finalemail(self, response):
        links = response.meta['links']
        driver = response.meta['driver']
        html = driver.page_source
        response_obj = Selector(text=html)
        index = response.meta['index']
        web_name = response.meta['web_name']
        web_type = response.meta['web_type']

        uniqueemail = response.meta['uniqueemail']

        flag = 0
        bad_words = [
            'facebook', 'instagram', 'youtube', 'twitter', 'wiki', 'linkedin'
        ]
        for word in bad_words:
            if word in str(response.url):
                # return
                flag = 1
        if (flag != 1):
            html_text = str(response.text)
            # Loose pattern: matches anything shaped like user@host.tld.
            mail_list = set(re.findall(r'\w+@\w+\.\w+', html_text))
            for mail in mail_list:
                if mail not in uniqueemail:
                    uniqueemail.add(mail)
                    print('\n' * 2)
                    print(uniqueemail)
                    print('\n' * 2)

        if (len(links) > 0 and len(uniqueemail) < 5):
            print('\n' * 2)
            print('hi', len(links))
            print('\n' * 2)
            link = links.pop(0)
            yield SeleniumRequest(url=link,
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.finalemail,
                                  errback=self.errback_finalemail,
                                  dont_filter=True,
                                  meta={
                                      'web_name': web_name,
                                      'web_type': web_type,
                                      'uniqueemail': uniqueemail,
                                      'links': links,
                                      'index': index
                                  })
        else:
            print('\n' * 2)
            print('hello')
            print('\n' * 2)
            emails = list(uniqueemail)
            finalemail = []
            discard = ['*****@*****.**']
            for email in emails:
                if ('.in' in email or '.com' in email or 'info' in email
                        or '.org' in email):
                    for dis in discard:
                        if (dis not in email):
                            finalemail.append(email)
            print('\n' * 2)
            print('final', finalemail)
            print('\n' * 2)
            yield SeleniumRequest(url='https://www.google.com/',
                                  wait_time=1000,
                                  screenshot=True,
                                  callback=self.parse_page,
                                  errback=self.errback_google,
                                  dont_filter=True,
                                  meta={
                                      'web_name': web_name,
                                      'web_type': web_type,
                                      'links': links,
                                      'finalemail': finalemail,
                                      'index': index
                                  })
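The email harvesting in Example #19 combines a loose regex with ad-hoc filtering. A compact helper expressing the same idea (the function name extract_emails is hypothetical; the placeholder address in discard comes from the example):

 import re

 EMAIL_RE = re.compile(r'\w+@\w+\.\w+')

 def extract_emails(html_text, discard=('*****@*****.**',)):
     # Deduplicate matches and drop placeholder addresses. The pattern is
     # deliberately loose, so it can also match strings that merely look
     # like addresses (e.g. image@2x.png).
     return {m for m in EMAIL_RE.findall(html_text) if m not in discard}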
Example #20
 def start_requests(self):
     yield SeleniumRequest(url="https://www.lightinthebox.com",
                           wait_time=6,
                           callback=self.parse)
Example #21
 def start_requests(self):
     yield SeleniumRequest(url='https://trends.google.com/trends/?geo=ES',
                           wait_time=1,
                           callback=self.parse,
                           dont_filter=True,
                           headers={'User-Agent': self.user_agent})
Example #22
 def start_requests(self):
     yield SeleniumRequest(
         url="https://www.familydollar.com",
         wait_time=3,
         callback=self.parse
     )
Example #23
 def start_requests(self):
     yield SeleniumRequest(
         url='https://secure.imdb.com/ap/signin?openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.imdb.com%2Fregistration%2Fap-signin-handler%2Fimdb_pro_us&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.assoc_handle=imdb_pro_us&openid.mode=checkid_setup&siteState=eyJvcGVuaWQuYXNzb2NfaGFuZGxlIjoiaW1kYl9wcm9fdXMiLCJyZWRpcmVjdFRvIjoiaHR0cHM6Ly9wcm8uaW1kYi5jb20vIn0&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0')
Example #24
 def start_requests(self):
     yield SeleniumRequest(url="https://www.walmart.com/receipt-lookup",
                           wait_time=5,
                           callback=self.parse)
Example #25
 def start_requests(self):
     yield SeleniumRequest(url="", callback=self.parse)
Example #26
 def start_requests(self):
     yield SeleniumRequest(
         url=
         'https://www.aliexpress.com/category/100003070/men-clothing.html',
         wait_time=100,
         callback=self.parse)