def get_next_page_url(self):
    Logger.log_it("Retrieving next page url...")
    next_url = self.driver.find_element_by_css_selector(
        'div.ui_pagination a.next').get_attribute("href")
    # Render None as the string "None" for logging.
    f = lambda x: "None" if x is None else x
    Logger.log_it("Next url: " + f(next_url))
    return next_url
def save_to_file(reviews, location_name, current_page, last_page):
    filename = 'scraped_data/data_reviews/selenium_reviews-%s-%s-%s.csv' % (
        location_name, current_page, last_page)
    with open(filename, 'w') as f:
        f.write(Review.get_csv_header())
        for review in reviews:
            f.write(review.get_csv_line())
    Logger.log_it('Saved file %s' % filename)
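# save_to_file relies on a Review class exposing a CSV header and a
# per-row serializer, neither of which appears in this file. A minimal
# sketch of that assumed interface follows; the field names are
# illustrative assumptions, not the project's actual schema.
class Review:
    def __init__(self, title, rating, text):
        self.title = title
        self.rating = rating
        self.text = text

    @staticmethod
    def get_csv_header():
        # One header row, newline-terminated, matching get_csv_line().
        return "title,rating,text\n"

    def get_csv_line(self):
        # Quote the free-text fields so embedded commas do not break rows.
        return '"%s",%s,"%s"\n' % (self.title, self.rating, self.text)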
def next_page(self):
    try:
        if not self.is_all_languages_selected():
            Logger.log_it(
                "All languages not selected... selecting all languages")
            self.select_all_languages()
        self.driver.find_element_by_css_selector(
            'div.ui_pagination a.next').click()
    except WebDriverException:
        Logger.log_it("There are no more pages!")
        self.driver.implicitly_wait(2)
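# next_page depends on is_all_languages_selected and select_all_languages,
# which are not shown here. The sketch below is heavily hypothetical: the
# CSS selectors are placeholders and TripAdvisor's real markup differs.
def is_all_languages_selected(self):
    radio = self.driver.find_element_by_css_selector(
        'div.language input[value="ALL"]')  # placeholder selector
    return radio.is_selected()

def select_all_languages(self):
    self.driver.find_element_by_css_selector(
        'div.language input[value="ALL"]').click()  # placeholder selector
    self.driver.implicitly_wait(2)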
def __init__(self, url):
    Logger.log_it("##########################################")
    self.timer = Timer()
    self.timer.start_timer()
    # self.driver = gecko_utils.get_gecko_driver()
    self.driver = webdriver.Firefox()
    # driver.add_cookie({'name': 'TALanguage', 'value': 'ALL'})
    self.driver.get(url)
    self.driver.implicitly_wait(2)
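# The commented-out add_cookie line above hints at forcing TALanguage=ALL
# via a cookie. Selenium rejects add_cookie for a domain the browser has
# not loaded yet, so the cookie must be set after the first get() and
# followed by a reload. A sketch, assuming it were re-enabled (the method
# name is hypothetical):
def _force_all_languages(self, url):
    self.driver.get(url)
    self.driver.add_cookie({'name': 'TALanguage', 'value': 'ALL'})
    self.driver.refresh()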
def wrapper(*args, **kwargs):
    # Retry the wrapped call up to three times before giving up.
    counter = 3
    while counter != 0:
        try:
            result = f(*args, **kwargs)
            return result
        except StaleElementReferenceException:
            Logger.log_it("Stale element... retrying")
            counter -= 1
        except WebDriverException:
            Logger.log_it("Web driver exception... retrying")
            counter -= 1
    return None
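# wrapper is clearly the inner function of a retry decorator, but the
# enclosing definition is not part of this file. A minimal sketch of what
# it presumably looks like; the name "retry" is an assumption.
import functools

from selenium.common.exceptions import (StaleElementReferenceException,
                                        WebDriverException)

def retry(f):
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        counter = 3
        while counter != 0:
            try:
                return f(*args, **kwargs)
            except StaleElementReferenceException:
                Logger.log_it("Stale element... retrying")
                counter -= 1
            except WebDriverException:
                Logger.log_it("Web driver exception... retrying")
                counter -= 1
        return None
    return wrapper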
def has_next_review_page(self):
    Logger.log_it("Checking if next page exists...")
    return self.get_next_page_url() is not None
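# Together, has_next_review_page and next_page suggest the driving loop of
# the spider. A hypothetical sketch; scrape_current_page is a placeholder
# for whatever extracts Review objects from the currently loaded page.
def scrape_all_pages(self):
    reviews = []
    while True:
        reviews.extend(self.scrape_current_page())
        if not self.has_next_review_page():
            break
        self.next_page()
    return reviews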
def stop_spider(self):
    Logger.log_it("-------------------------------------------")
    self.driver.close()
    self.timer.stop_timer()
    Logger.log_it(self.timer.print_time())
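# stop_spider assumes a Timer exposing start_timer, stop_timer and a
# print_time that returns (rather than prints) a string, since its result
# is passed to the logger. A minimal sketch of that assumed utility:
import time

class Timer:
    def start_timer(self):
        self.start = time.time()

    def stop_timer(self):
        self.end = time.time()

    def print_time(self):
        return "Elapsed time: %.2f s" % (self.end - self.start)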
def refresh_page(self):
    Logger.log_it("Refreshing")
    self.driver.refresh()
import os
import sys

sys.path.append("..")

from os import listdir

from scrapy import cmdline

from masters.utils.logger_utils import Logger
from masters.utils.file_utils import location_scraped

print("Scraper started...")

# cmdline.execute("scrapy crawl locations".split())
# exit(47)
# cmdline.execute("scrapy crawl reviews -a location=/Attraction_Review-g1887526-d11849033-Reviews-OLIMPIJCI_IZ_CRNE_NA_KOROSKEM_Olympians_from_Crna_na_Koroskem-Crna_na_Koroskem_.html".split())

root = "scraped_data/data_attractions/"
for attraction in listdir(root):
    # Each attraction file holds one location URL per line.
    with open(root + attraction, 'r') as f:
        while True:
            location = f.readline().rstrip()
            if not location:
                break
            if not location_scraped(location):
                os.system("scrapy crawl reviews -a location=" + location)
                Logger.log_location(location)
            else:
                print("location already scraped: " + location)
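# The runner pairs location_scraped with Logger.log_location: a location
# is skipped once it has been logged as done. One plausible sketch of the
# masters.utils.file_utils helper; the log file name and format are
# assumptions, not taken from the project.
SCRAPED_LOG = "scraped_data/scraped_locations.log"

def location_scraped(location):
    # A location counts as scraped once log_location has recorded it.
    try:
        with open(SCRAPED_LOG, 'r') as log:
            return location in (line.rstrip() for line in log)
    except IOError:
        return False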