def _default_scraping_routine(self, page_source):
    ##
    # Extracts the hotel offers from the default layout of the result page.
    #
    # Every "#resultsContainer .hotel.listing" element that has a hotel name
    # yields one result dictionary. The price text is split into amount and
    # currency sign and normalized with the CurrencyConverter helpers.
    #
    # @param {string} page_source - HTML source of the result page.
    #
    # @return {list} One dictionary per hotel with the keys name, location,
    #                price, currency, price_norm, number_of_nights,
    #                rating_value, rating_unit, access_time and debug.
    #
    hotel_results = []
    soup = bs4.BeautifulSoup(page_source, 'html.parser')

    hotellist_items = soup.select("#resultsContainer .hotel.listing")
    for div in hotellist_items:
        hotelname = price_text = number_of_nights_text = rating_value = \
            rating_unit = location = None

        # Extract the name of the hotel.
        strong_name = div.select(".hotelTitle > .hotelName")
        if strong_name:
            hotelname = strong_name[0].string.encode("utf-8").strip()

        # Extract the location; only the first text fragment is used.
        location_tag = div.select(".hotel-info .neighborhood")
        if location_tag:
            location = list(
                location_tag[0].strings)[0].encode("utf-8").strip()

        # Extract the number of stars.
        span_stars = div.select(
            "li.starRating strong.star-rating > span:nth-of-type(2)")
        if span_stars:
            # get("class") returns None for a span without class attribute;
            # fall back to an empty list so the rating simply stays unset.
            star_classes = span_stars[0].get("class") or []
            for star_class in star_classes:
                if "icon-stars" in star_class:
                    # The last three characters hold the rating with "-" as
                    # decimal separator, e.g. "3-5" -> 3.5.
                    rating_value = float(star_class[-3:].replace("-", "."))
                    rating_unit = "stars"
                    break
        else:
            # No star element at all: report an explicit zero rating.
            rating_unit = None
            rating_value = 0.0

        # Extract the price of the hotel (last text fragment of the tag).
        price_tag = div.select(".hotel-price .actualPrice")
        if price_tag:
            price_text = list(
                price_tag[0].strings)[-1].encode("utf-8").strip()

        # Extract the text describing the number of nights the price stands
        # for. It is only stored as debug information.
        li_nights = div.select("li.avgPerNight.priceType")
        if li_nights:
            number_of_nights_text = list(
                li_nights[0].strings)[-1].encode("utf-8").strip()

        if hotelname is not None:
            # The price is always stored as a price for one night here.
            number_of_nights = 1

            # split price and currency.
            # NOTE(review): price_text is still None when no price element
            # was found - confirm that split_price_and_currency accepts that.
            price, currency = \
                CurrencyConverter.split_price_and_currency(price_text=price_text)
            currency_code = \
                CurrencyConverter.get_currency_code_of_sign(currency_sign=currency)

            # get the normalized price from the api function
            price_norm = CurrencyConverter.get_normalized_price(
                price=price, currency_code=currency_code)

            hotel_results.append({
                "name": hotelname,
                "location": location,
                "price": price,
                "currency": currency_code,
                "price_norm": price_norm,
                "number_of_nights": number_of_nights,
                "rating_value": rating_value,
                "rating_unit": rating_unit,
                "access_time": time.strftime("%d-%m-%Y %H:%M:%S", time.gmtime()),
                "debug": {
                    "price_text": price_text,
                    "number_of_nights_text": number_of_nights_text,
                },
            })

    return hotel_results
def _scraping_routine(self, page_source):
    ##
    # Extracts the hotel offers of a result page together with the search
    # parameters shown on that page.
    #
    # A hotel is only stored when its name, a price text and the number of
    # nights the price stands for could all be extracted. The normalized
    # price is converted to a price per night.
    #
    # @param {string} page_source - HTML source of the result page.
    #
    # @return {list} One dictionary per hotel with the keys name, location,
    #                price, currency, price_norm, number_of_nights,
    #                rating_value, rating_unit, access_time and debug.
    #
    regex_price_per_nights = re.compile("[0-9]+")

    hotel_results = []
    soup = bs4.BeautifulSoup(page_source, 'html.parser')

    # Extract info about the search values. For debug.
    # NOTE: the [0] lookups raise IndexError when the page does not contain
    # the ".dates-occupancy" block or one of its three children.
    div_search_info = soup.select(".dates-occupancy")
    search_dates = div_search_info[0].select(".search-dates")[0].string.encode("utf-8").strip()
    search_nights = div_search_info[0].select(".search-nights")[0].string.encode("utf-8").strip()
    search_rooms = div_search_info[0].select(".search-rooms")[0].string.encode("utf-8").strip()
    search_info = "{dates}, {nights}, {rooms}".format(
        dates=search_dates, nights=search_nights, rooms=search_rooms)

    # Start with the extraction of hotel information.
    hotellist_items = soup.select("div#listings > ol.listings > li.hotel")
    for div in hotellist_items:
        hotelname = price = price_text = number_of_nights = \
            number_of_nights_text = currency_code = price_norm = \
            rating_value = rating_unit = location = None

        # Extract the hotel name.
        a_name = div.select("h3.p-name a")
        if a_name:
            hotelname = a_name[0].string.encode("utf-8").strip()

        # Extract location: all text fragments of the address element
        # except the last one.
        address_element = div.select(".contact .p-adr")
        if address_element:
            location = "".join(list(address_element[0].strings)[:-1]).encode('utf-8').strip()

        # Extract the stars rating.
        span_star_rating = div.select("span.star-rating.widget-star-rating-overlay")
        if span_star_rating:
            rating_value = float(span_star_rating[0].get("data-star-rating"))
            rating_unit = "stars"
        else:
            rating_value = 0.0

        # try to extract the normal price.
        b_price = div.select(".price b")
        if b_price:
            price_text = b_price[0].string.encode("utf-8").strip()
        else:
            # if price was reduced (red colored price)
            ins_price = div.select(".price span.old-price-cont ins")
            if ins_price:
                price_text = ins_price[0].string.encode("utf-8").strip()

        # determine the number of nights the extracted price stands for.
        span_price_info = div.select(".price-breakdown > .price-info")
        if span_price_info:
            number_of_nights_text = span_price_info[0].string.encode("utf-8").strip()
            match = regex_price_per_nights.search(number_of_nights_text)
            if match is not None:
                number_of_nights = int(match.group())
            else:
                # Fallback, because if the price is per night, there is not
                # always a 1 in the string.
                number_of_nights = 1

        # save hotel in results, if all information were extracted.
        if hotelname is not None and price_text is not None \
                and number_of_nights is not None:
            # split price and currency.
            price, currency = \
                CurrencyConverter.split_price_and_currency(price_text=price_text)
            currency_code = \
                CurrencyConverter.get_currency_code_of_sign(currency_sign=currency)

            # get the normalized price from the api function
            price_norm = CurrencyConverter.get_normalized_price(
                price=price,
                currency_code=currency_code
            )

            if price_norm is not None:
                # calc price for one night
                price_norm = round(price_norm / number_of_nights, 2)

            hotel_results.append({
                "name": hotelname,
                "location": location,
                "price": price,
                "currency": currency_code,
                "price_norm": price_norm,
                "number_of_nights": number_of_nights,
                "rating_value": rating_value,
                "rating_unit": rating_unit,
                "access_time": time.strftime("%d-%m-%Y %H:%M:%S", time.gmtime()),
                "debug": {
                    "price_text": price_text,
                    "number_of_nights_text": number_of_nights_text,
                    "search_info": search_info,
                },
            })

    return hotel_results
def _alternative_scraping_routine(self, page_source):
    ##
    # Extracts the hotel offers from the alternative ("slim") layout of the
    # result page.
    #
    # Every ".hotelSlimResultsModuleMod > div > div.hotel-result" element
    # that has a hotel name yields one result dictionary. This layout
    # provides neither location nor rating information.
    #
    # @param {string} page_source - HTML source of the result page.
    #
    # @return {list} One dictionary per hotel with the keys name, price,
    #                currency, price_norm, number_of_nights, access_time
    #                and debug.
    #
    hotel_results = []
    soup = bs4.BeautifulSoup(page_source, 'html.parser')

    hotellist_items = soup.select(
        ".hotelSlimResultsModuleMod > div > div.hotel-result")
    for div in hotellist_items:
        hotelname = price_text = number_of_nights_text = None

        # Extract the hotel name.
        a_name = div.select("h2.hotel-result-title > a")
        if a_name:
            hotelname = a_name[0].string.encode("utf-8").strip()

        # Extract the price text.
        b_price = div.select(".primary-price strong")
        if b_price:
            price_text = b_price[0].string.encode("utf-8").strip()

        # per night info (only stored as debug information)
        div_nights = div.select(".rate-choice-msg")
        if div_nights:
            number_of_nights_text = div_nights[0].string.encode(
                "utf-8").strip()

        if hotelname is not None:
            # The price is always stored as a price for one night here.
            number_of_nights = 1

            # split price and currency.
            # NOTE(review): price_text is still None when no price element
            # was found - confirm that split_price_and_currency accepts that.
            price, currency = \
                CurrencyConverter.split_price_and_currency(price_text=price_text)
            currency_code = \
                CurrencyConverter.get_currency_code_of_sign(currency_sign=currency)

            # get the normalized price from the api function
            price_norm = CurrencyConverter.get_normalized_price(
                price=price, currency_code=currency_code)

            hotel_results.append({
                "name": hotelname,
                "price": price,
                "currency": currency_code,
                "price_norm": price_norm,
                "number_of_nights": number_of_nights,
                "access_time": time.strftime("%d-%m-%Y %H:%M:%S", time.gmtime()),
                "debug": {
                    "price_text": price_text,
                    "number_of_nights_text": number_of_nights_text,
                },
            })

    return hotel_results
def _default_scraping_routine(self, page_source):
    ##
    # Collects the car offers listed on the default layout of the result
    # page.
    #
    # An offer is stored only if its total price, currency, car class and
    # car model were all found; the transmission is optional.
    #
    # @param {string} page_source - HTML source of the result page.
    #
    # @return {list} One dictionary per car with the keys car_model,
    #                car_class, transmission, price_total, price_norm_total,
    #                currency and access_time.
    #
    offers = []
    document = bs4.BeautifulSoup(page_source, 'html.parser')

    for entry in document.select("#vehPresentation .listOfVehicles .carView"):
        price_total_text = None
        car_class = None
        car_model = None
        currency_total = None
        transmission = None

        # Class of the car.
        class_nodes = entry.select(".brandName h2")
        if class_nodes:
            car_class = class_nodes[0].string.encode("utf-8").strip()

        # Model of the car: second text fragment, tabs removed and line
        # breaks turned into blanks.
        model_nodes = entry.select(".brandName .moreDet")
        if model_nodes:
            model_fragments = list(model_nodes[0].strings)
            raw_model = model_fragments[1]
            car_model = raw_model.replace("\t", "").replace("\n", " ").strip()

        # Transmission, stored in lower case.
        transmission_nodes = entry.select(
            ".featureList li:nth-of-type(2) p:nth-of-type(1)")
        if transmission_nodes:
            raw_transmission = transmission_nodes[0].string.encode("utf-8")
            transmission = raw_transmission.strip().lower()

        # Total price (pay-later column).
        price_nodes = entry.select(".colHalf_payLater .pricePD .price")
        if price_nodes:
            price_total_text = price_nodes[0].string.encode("utf-8").strip()

        # Currency sign of the total price.
        currency_nodes = entry.select(
            ".colHalf_payLater .pricePD span.setTop")
        if currency_nodes:
            currency_total = currency_nodes[0].string.encode("utf-8").strip()

        # Skip offers with incomplete information.
        if (price_total_text is None or car_class is None or
                currency_total is None or car_model is None):
            continue

        # Turn the price text into a number: decimal comma -> point and
        # no-break spaces (UTF-8 bytes "\xc2\xa0") removed.
        cleaned_price = price_total_text.replace(",", ".")
        cleaned_price = cleaned_price.replace("\xc2\xa0", "").strip()
        price_total = float(cleaned_price)

        currency_code_total = CurrencyConverter.get_currency_code_of_sign(
            currency_sign=currency_total)

        # Normalized total price from the api function.
        price_norm_total = CurrencyConverter.get_normalized_price(
            price=price_total, currency_code=currency_code_total)

        offer = {
            "car_model": car_model,
            "car_class": car_class,
            "transmission": transmission,
            "price_total": price_total,
            "price_norm_total": price_norm_total,
            "currency": currency_code_total,
            "access_time": time.strftime("%d-%m-%Y %H:%M:%S", time.gmtime()),
        }
        offers.append(offer)

    return offers
def _mobile_scraping_routine(self, page_source):
    ##
    # Extracts the car offers from the mobile layout of the result page.
    #
    # The page groups the cars in sections (".car-selector > section").
    # A car is stored only if its total price, currency, car class and car
    # model were all found; the transmission is optional.
    #
    # @param {string} page_source - HTML source of the result page.
    #
    # @return {list} One dictionary per car with the keys car_model,
    #                transmission, car_class, price_total, price_norm_total,
    #                currency and access_time.
    #
    car_results = []
    soup = bs4.BeautifulSoup(page_source, 'html.parser')

    car_sections = soup.select(".car-selector > section")
    for car_section in car_sections:
        # Determine the car items of the current section.
        car_items = car_section.select(".cars-list .car")
        for car_item in car_items:
            price_total_text = car_class = car_model = currency_total = \
                transmission = None

            # Determine the class of the car.
            car_class_element = car_item.select(".status-label")
            if car_class_element:
                car_class = car_class_element[0].string.encode(
                    "utf-8").strip()

            # Determine the model of the car.
            car_model_element = car_item.select(".car-class-desc")
            if car_model_element:
                car_model = car_model_element[0].string.encode(
                    "utf-8").strip()

            # Determine transmission from the icon's css classes.
            car_transmission_element = car_item.select(".transmission i")
            if car_transmission_element:
                # get("class") returns None for an icon without class
                # attribute; treat that like "no known transmission".
                trans_class = car_transmission_element[0].get("class") or []
                if "icon-automatic" in trans_class:
                    transmission = "automatic"
                elif "icon-manual" in trans_class:
                    transmission = "manual"

            # Determine the price of the car.
            price_total_element = car_item.select("span.price")
            if price_total_element:
                price_total_text = price_total_element[0].string.encode(
                    "utf-8").strip()

            # Determine the currency of the price.
            currency_total_element = car_item.select("span.currency")
            if currency_total_element:
                currency_total = currency_total_element[0].string.encode(
                    "utf-8").strip()

            # Check if all information of the current car are available.
            if price_total_text is not None and car_class is not None and \
                    currency_total is not None and car_model is not None:
                # Convert the total price text into a number: decimal comma
                # -> point, no-break spaces (UTF-8 "\xc2\xa0") removed.
                price_total = float(
                    price_total_text.replace(",", ".").replace("\xc2\xa0", "").strip())
                currency_code_total = \
                    CurrencyConverter.get_currency_code_of_sign(currency_sign=currency_total)

                # get the normalized total price from the api function
                price_norm_total = CurrencyConverter.get_normalized_price(
                    price=price_total, currency_code=currency_code_total)

                # Store all the received information in the result list.
                car_results.append(
                    {
                        "car_model": car_model,
                        "transmission": transmission,
                        "car_class": car_class,
                        "price_total": price_total,
                        "price_norm_total": price_norm_total,
                        "currency": currency_code_total,
                        "access_time": time.strftime("%d-%m-%Y %H:%M:%S", time.gmtime()),
                    },
                )

    return car_results
def _scraping_routine(self, page_source):
    ##
    # Extracts the car offers from the "#search-results .listing-wrapper"
    # elements of the result page.
    #
    # A car is stored only if daily price, total price, car class, company
    # name and car model were all found. Text values are converted with
    # self._encode_and_strip, which is defined elsewhere in the class.
    #
    # @param {string} page_source - HTML source of the result page.
    #
    # @return {list} One dictionary per car with the keys company_name,
    #                car_model, car_class, price_daily, price_norm_daily,
    #                price_total, price_norm_total, currency and access_time.
    #
    orbitz_car_results = []
    bsObj = bs4.BeautifulSoup(page_source, 'html.parser')

    car_listings = bsObj.select("#search-results .listing-wrapper")
    for listing in car_listings:
        price_daily_text = price_total_text = car_class = company_name = \
            car_model = None

        # Determine the daily price of the current car.
        price_daily_element = listing.find_all("div", {"class": "full-price"})
        if price_daily_element:
            price_daily_text = self._encode_and_strip(price_daily_element[0])

        # Determine the total price of the current car.
        price_total_element = listing.find_all("div", {"class": "total"})
        if price_total_element:
            # TODO split "total" string from price.
            price_total_text = self._encode_and_strip(price_total_element[0])

        # Determine the class of the car.
        car_class_element = listing.select("div.fullName span")
        if car_class_element:
            car_class = self._encode_and_strip(car_class_element[0])

        # Determine the company name of the current car. It is taken from
        # the alt text of the vendor image.
        company_name_element = listing.select("div.vendor-image-box img")
        if company_name_element:
            # An image without alt attribute must not abort the whole run
            # with a KeyError; the car is then skipped as incomplete.
            alt_text = company_name_element[0].get("alt")
            if alt_text is not None:
                company_name = self._encode_and_strip(alt_text)

        # Determine the model of the current car.
        car_model_element = listing.select(".car-model")
        if car_model_element:
            car_model = self._encode_and_strip(car_model_element[0])

        # Check if all information of the current car are available.
        if price_daily_text is not None and price_total_text is not None and \
                car_class is not None and company_name is not None and \
                car_model is not None:
            # Normalize the daily and total price for the current car.
            #
            # split price and currency of daily price.
            price_daily, currency_daily = \
                CurrencyConverter.split_price_and_currency(price_text=price_daily_text)
            currency_code_daily = \
                CurrencyConverter.get_currency_code_of_sign(currency_sign=currency_daily)

            # get the normalized daily price from the api function
            price_norm_daily = CurrencyConverter.get_normalized_price(
                price=price_daily,
                currency_code=currency_code_daily
            )

            # split price and currency of total price.
            price_total, currency_total = \
                CurrencyConverter.split_price_and_currency(price_text=price_total_text)
            currency_code_total = \
                CurrencyConverter.get_currency_code_of_sign(currency_sign=currency_total)

            # get the normalized total price from the api function
            price_norm_total = CurrencyConverter.get_normalized_price(
                price=price_total,
                currency_code=currency_code_total
            )

            # Store all the received information in the result list.
            # The stored currency is the one of the daily price.
            orbitz_car_results.append({
                "company_name": company_name,
                "car_model": car_model,
                "car_class": car_class,
                "price_daily": price_daily,
                "price_norm_daily": price_norm_daily,
                "price_total": price_total,
                "price_norm_total": price_norm_total,
                "currency": currency_code_daily,
                "access_time": time.strftime("%d-%m-%Y %H:%M:%S", time.gmtime()),
            })

    return orbitz_car_results
def _touch_scraping_routine(self, page_source):
    ##
    # Extracts the hotel offers from the touch layout of the result page
    # ("#resultList .listItem" elements).
    #
    # A hotel is stored only if its name, price and normalized price are
    # available. The rating defaults to 0.0 without unit when no star class
    # is found.
    #
    # @param {string} page_source - HTML source of the result page.
    #
    # @return {list} One dictionary per hotel with the keys name, price,
    #                currency, price_norm, number_of_nights, rating_value,
    #                rating_unit and access_time.
    #
    hotel_results = []
    soup = bs4.BeautifulSoup(page_source, 'html.parser')

    hotellist_items = soup.select("#resultList .listItem")
    for div in hotellist_items:
        hotelname = price = currency_code = price_norm = None
        # Reset the rating for every hotel, so that a hotel without star
        # information neither fails nor inherits the previous hotel's rating.
        rating_value = 0.0
        rating_unit = None

        # Extract hotel name.
        div_name = div.select(".hotelData > .labeled")
        if div_name:
            hotelname = div_name[0].string.encode("utf-8").strip()

        # Extract rating information (stars)
        stars_element = div.select(".hotelData .smaller .stars")
        if stars_element:
            # get("class") returns None when the class attribute is missing.
            stars_element_classes = stars_element[0].get("class") or []
            # Search the sX class which indicates the number of stars.
            for star_class in stars_element_classes:
                if star_class[-1].isdigit():
                    # Everything after the first character is the rating.
                    rating_value = float(star_class[1:])
                    rating_unit = "stars"
                    # Stop here, so that a later class without a digit
                    # cannot discard the rating again.
                    break

        # Extract hotel price.
        span_price = div.select(".hotelData > .priceInfo span.price")
        if span_price:
            price_text = span_price[0].string.encode("utf-8").strip()

            # Extract the price and currency from the price string.
            price, currency = \
                CurrencyConverter.split_price_and_currency(price_text=price_text)

            # Get the currency code for the extracted currency.
            currency_code = \
                CurrencyConverter.get_currency_code_of_sign(currency_sign=currency)

            # get the normalized price from the api function
            price_norm = CurrencyConverter.get_normalized_price(
                price=price, currency_code=currency_code)

        if hotelname is not None and price is not None \
                and price_norm is not None:
            # this information is not available on the website.
            # it seems to be always the price per night
            number_of_nights = 1

            hotel_results.append({
                "name": hotelname,
                "price": price,
                "currency": currency_code,
                "price_norm": price_norm,
                "number_of_nights": number_of_nights,
                "rating_value": rating_value,
                "rating_unit": rating_unit,
                "access_time": time.strftime("%d-%m-%Y %H:%M:%S", time.gmtime()),
            })

    return hotel_results
def _scraping_routine(self, page_source):
    ##
    # Extracts the hotel offers from the
    # "div#containerAllHotels > .hotelTeaserContainer" elements of the
    # result page.
    #
    # A hotel is stored only if its name and a price were found. The rating
    # is 0.0 without unit when no star information is available.
    #
    # @param {string} page_source - HTML source of the result page.
    #
    # @return {list} One dictionary per hotel with the keys name, price,
    #                currency, price_norm, number_of_nights, rating_value,
    #                rating_unit and access_time.
    #
    hotel_results = []
    soup = bs4.BeautifulSoup(page_source, 'html.parser')

    hotellist_items = soup.select(
        "div#containerAllHotels > .hotelTeaserContainer")
    for div in hotellist_items:
        hotelname = price = currency_code = price_norm = \
            rating_value = rating_unit = None

        # Extract hotel name.
        a_hotelname = div.select(".hotelname > a:nth-of-type(1)")
        if a_hotelname:
            hotelname = a_hotelname[0].string.encode("utf-8").strip()

        # Extract rating information (stars)
        stars_element = div.select(".hotelname > span:nth-of-type(1)")
        stars_element_classes = None
        if stars_element:
            # get("class") returns None when the class attribute is missing.
            stars_element_classes = stars_element[0].get("class")
        if stars_element_classes:
            # The rating follows a five character prefix in the first class.
            stars_class = stars_element_classes[0]
            rating_value = float(stars_class[5:])
            rating_unit = "stars"
        else:
            rating_value = 0.0
            rating_unit = None

        # Extract price of hotel. If several price elements exist, the
        # values of the last one are kept.
        for strong_price in div.select(
                ".priceContainer > .standardPrice > strong"):
            contents = strong_price.contents

            # Integer part: drop no-break spaces (UTF-8 "\xc2\xa0") and
            # the "." separators.
            big_money = contents[0].encode("utf-8").replace(
                "\xc2\xa0", "").replace(".", "").strip()
            # Fractional part, taken from the <sup> element.
            small_money = strong_price.find_all("sup")[0].string.encode(
                "utf-8").strip()
            if len(small_money) < 2:
                small_money = "00"

            # Remove all non-numeric values from big_money
            big_money = re.sub("[^0-9]", "", big_money)
            price = float("{0}.{1}".format(big_money, small_money))

            # The currency is either wrapped in a <span> or is the third
            # child node of the price element.
            span_currency = strong_price.find_all("span")
            if span_currency:
                currency = span_currency[0].string.encode("utf-8").strip()
            else:
                currency = contents[2].encode("utf-8").strip()

            currency_code = CurrencyConverter.get_currency_code_of_sign(
                currency)

            # get the normalized price from the api function
            price_norm = CurrencyConverter.get_normalized_price(
                price=price, currency_code=currency_code)

        if hotelname is not None and price is not None:
            # this information is not available on the website.
            # it seems to be always the price per night
            number_of_nights = 1

            hotel_results.append({
                "name": hotelname,
                "price": price,
                "currency": currency_code,
                "price_norm": price_norm,
                "number_of_nights": number_of_nights,
                "rating_value": rating_value,
                "rating_unit": rating_unit,
                "access_time": time.strftime("%d-%m-%Y %H:%M:%S", time.gmtime()),
            })

    return hotel_results