Beispiel #1
0
    def _default_scraping_routine(self, page_source):
        ##
        #   Parses the default hotel result page and builds one result entry
        #   for every listing that exposes a hotel name.
        #
        #   @param {string} page_source - HTML markup of the result page.
        #
        #   @return {list} One dict per hotel with the keys name, location,
        #       price, currency, price_norm, number_of_nights, rating_value,
        #       rating_unit, access_time and debug.
        #

        document = bs4.BeautifulSoup(page_source, 'html.parser')
        hotel_results = []

        for listing in document.select("#resultsContainer .hotel.listing"):
            hotelname = location = None
            price_text = number_of_nights_text = None
            rating_value = rating_unit = None

            # Name of the hotel.
            name_nodes = listing.select(".hotelTitle > .hotelName")
            if name_nodes:
                hotelname = name_nodes[0].string.encode("utf-8").strip()

            # Location: only the first text fragment of the tag is used.
            neighborhood_nodes = listing.select(".hotel-info .neighborhood")
            if neighborhood_nodes:
                fragments = list(neighborhood_nodes[0].strings)
                location = fragments[0].encode("utf-8").strip()

            # Star rating: the value is taken from the last three characters
            # of the first class containing "icon-stars" ("-" becomes ".").
            star_nodes = listing.select(
                "li.starRating strong.star-rating > span:nth-of-type(2)")
            if not star_nodes:
                rating_value = 0.0
            else:
                icon_class = next(
                    (name for name in star_nodes[0].get("class")
                     if "icon-stars" in name),
                    None)
                if icon_class is not None:
                    rating_value = float(icon_class[-3:].replace("-", "."))
                    rating_unit = "stars"

            # Price: the last text fragment of the price tag.
            price_nodes = listing.select(".hotel-price .actualPrice")
            if price_nodes:
                price_fragments = list(price_nodes[0].strings)
                price_text = price_fragments[-1].encode("utf-8").strip()

            # Text describing the number of nights the price stands for.
            night_nodes = listing.select("li.avgPerNight.priceType")
            if night_nodes:
                night_fragments = list(night_nodes[0].strings)
                number_of_nights_text = \
                    night_fragments[-1].encode("utf-8").strip()

            # Listings without a name are ignored.
            if hotelname is None:
                continue

            # NOTE(review): price_text can still be None at this point;
            # whether split_price_and_currency accepts None is not visible
            # from this method - confirm against CurrencyConverter.
            price, currency = CurrencyConverter.split_price_and_currency(
                price_text=price_text)
            currency_code = CurrencyConverter.get_currency_code_of_sign(
                currency_sign=currency)

            # Normalized price as returned by the converter.
            price_norm = CurrencyConverter.get_normalized_price(
                price=price, currency_code=currency_code)

            hotel_results.append({
                "name": hotelname,
                "location": location,
                "price": price,
                "currency": currency_code,
                "price_norm": price_norm,
                # Fixed to 1; the scraped nights text is only stored in the
                # debug entry below.
                "number_of_nights": 1,
                "rating_value": rating_value,
                "rating_unit": rating_unit,
                "access_time": time.strftime(
                    "%d-%m-%Y %H:%M:%S", time.gmtime()),
                "debug": {
                    "price_text": price_text,
                    "number_of_nights_text": number_of_nights_text,
                },
            })

        return hotel_results
Beispiel #2
0
    def _scraping_routine(self, page_source):
        ##
        #   Parses the hotel listing page and returns every hotel for which
        #   a name, a price text and a number of nights could be extracted.
        #
        #   @param {string} page_source - HTML markup of the result page.
        #
        #   @return {list} One dict per hotel with the keys name, location,
        #       price, currency, price_norm (per night, rounded to two
        #       decimals when available), number_of_nights, rating_value,
        #       rating_unit, access_time and debug.
        #

        nights_pattern = re.compile("[0-9]+")

        soup = bs4.BeautifulSoup(page_source, 'html.parser')
        hotel_results = []

        # Search values of the request; only stored as debug information.
        occupancy = soup.select(".dates-occupancy")[0]

        def search_field(selector):
            # Encoded and stripped text of one search value element.
            return occupancy.select(selector)[0].string.encode(
                "utf-8").strip()

        search_info = "{dates}, {nights}, {rooms}".format(
            dates=search_field(".search-dates"),
            nights=search_field(".search-nights"),
            rooms=search_field(".search-rooms"))

        # Extraction of the individual hotels.
        for entry in soup.select("div#listings > ol.listings > li.hotel"):
            hotelname = location = price_text = None
            number_of_nights = number_of_nights_text = None
            rating_unit = None

            # Hotel name.
            name_links = entry.select("h3.p-name a")
            if name_links:
                hotelname = name_links[0].string.encode("utf-8").strip()

            # Location: all text fragments of the address except the last.
            address_nodes = entry.select(".contact .p-adr")
            if address_nodes:
                address_parts = list(address_nodes[0].strings)
                location = "".join(
                    address_parts[:-1]).encode('utf-8').strip()

            # Star rating from the data attribute.
            rating_nodes = entry.select(
                "span.star-rating.widget-star-rating-overlay")
            if rating_nodes:
                rating_value = float(
                    rating_nodes[0].get("data-star-rating"))
                rating_unit = "stars"
            else:
                rating_value = 0.0

            # Regular price first; fall back to the reduced price markup.
            price_nodes = entry.select(".price b")
            if not price_nodes:
                price_nodes = entry.select(".price span.old-price-cont ins")
            if price_nodes:
                price_text = price_nodes[0].string.encode("utf-8").strip()

            # Number of nights the extracted price stands for.
            info_nodes = entry.select(".price-breakdown > .price-info")
            if info_nodes:
                number_of_nights_text = info_nodes[0].string.encode(
                    "utf-8").strip()
                found = nights_pattern.search(number_of_nights_text)
                if found is None:
                    # A per-night price does not always contain a "1".
                    number_of_nights = 1
                else:
                    number_of_nights = int(found.group())

            # Only complete hotels are stored.
            if hotelname is None or price_text is None \
                    or number_of_nights is None:
                continue

            # Split price and currency.
            price, currency = CurrencyConverter.split_price_and_currency(
                price_text=price_text)
            currency_code = CurrencyConverter.get_currency_code_of_sign(
                currency_sign=currency)

            # Normalized price as returned by the converter.
            price_norm = CurrencyConverter.get_normalized_price(
                price=price, currency_code=currency_code)
            if price_norm is not None:
                # Reduce the normalized price to one night.
                price_norm = round(price_norm / number_of_nights, 2)

            hotel_results.append({
                "name": hotelname,
                "location": location,
                "price": price,
                "currency": currency_code,
                "price_norm": price_norm,
                "number_of_nights": number_of_nights,
                "rating_value": rating_value,
                "rating_unit": rating_unit,
                "access_time": time.strftime(
                    "%d-%m-%Y %H:%M:%S", time.gmtime()),
                "debug": {
                    "price_text": price_text,
                    "number_of_nights_text": number_of_nights_text,
                    "search_info": search_info,
                },
            })

        return hotel_results
Beispiel #3
0
    def _alternative_scraping_routine(self, page_source):
        ##
        #   Parses the alternative ("slim") hotel result markup and builds
        #   one result entry for every hotel result that exposes a name.
        #
        #   @param {string} page_source - HTML markup of the result page.
        #
        #   @return {list} One dict per hotel with the keys name, price,
        #       currency, price_norm, number_of_nights, access_time and
        #       debug.
        #

        hotel_results = []
        soup = bs4.BeautifulSoup(page_source, 'html.parser')

        hotellist_items = soup.select(
            ".hotelSlimResultsModuleMod > div > div.hotel-result")

        for div in hotellist_items:
            hotelname = price_text = number_of_nights_text = None

            # Hotel name.
            a_name = div.select("h2.hotel-result-title > a")
            if a_name:
                hotelname = a_name[0].string.encode("utf-8").strip()

            # Price text (amount and currency in one string).
            b_price = div.select(".primary-price strong")
            if b_price:
                price_text = b_price[0].string.encode("utf-8").strip()

            # Per night info; only kept for debugging.
            div_nights = div.select(".rate-choice-msg")
            if div_nights:
                number_of_nights_text = div_nights[0].string.encode(
                    "utf-8").strip()

            # Results without a name are ignored.
            if hotelname is None:
                continue

            # number_of_nights is fixed to 1; the scraped nights text is
            # only stored in the debug entry.
            number_of_nights = 1

            # NOTE(review): price_text can still be None at this point;
            # whether split_price_and_currency accepts None is not visible
            # from this method - confirm against CurrencyConverter.
            price, currency = \
                CurrencyConverter.split_price_and_currency(
                    price_text=price_text)

            currency_code = \
                CurrencyConverter.get_currency_code_of_sign(
                    currency_sign=currency)

            # Normalized price as returned by the converter.
            price_norm = CurrencyConverter.get_normalized_price(
                price=price, currency_code=currency_code)

            hotel_results.append({
                "name": hotelname,
                "price": price,
                "currency": currency_code,
                "price_norm": price_norm,
                "number_of_nights": number_of_nights,
                "access_time": time.strftime(
                    "%d-%m-%Y %H:%M:%S", time.gmtime()),
                "debug": {
                    "price_text": price_text,
                    "number_of_nights_text": number_of_nights_text,
                },
            })

        return hotel_results
Beispiel #4
0
    def _default_scraping_routine(self, page_source):
        ##
        #   Parses the default vehicle list and returns every car for which
        #   class, model, price and currency could be extracted.
        #
        #   @param {string} page_source - HTML markup of the result page.
        #
        #   @return {list} One dict per car with the keys car_model,
        #       car_class, transmission, price_total, price_norm_total,
        #       currency and access_time.
        #

        def encoded_text(node):
            # UTF-8 encoded, whitespace-stripped text of a single tag.
            return node.string.encode("utf-8").strip()

        document = bs4.BeautifulSoup(page_source, 'html.parser')
        car_results = []

        for car_item in document.select(
                "#vehPresentation .listOfVehicles .carView"):
            car_class = car_model = transmission = None
            price_total_text = currency_total = None

            # Class of the car.
            class_nodes = car_item.select(".brandName h2")
            if class_nodes:
                car_class = encoded_text(class_nodes[0])

            # Model of the car: second text fragment, tabs removed and
            # line breaks turned into spaces.
            model_nodes = car_item.select(".brandName .moreDet")
            if model_nodes:
                model_fragment = list(model_nodes[0].strings)[1]
                car_model = model_fragment.replace("\t", "")
                car_model = car_model.replace("\n", " ").strip()

            # Transmission, lower-cased.
            transmission_nodes = car_item.select(
                ".featureList li:nth-of-type(2) p:nth-of-type(1)")
            if transmission_nodes:
                transmission = encoded_text(transmission_nodes[0]).lower()

            # Total price text.
            price_nodes = car_item.select(".colHalf_payLater .pricePD .price")
            if price_nodes:
                price_total_text = encoded_text(price_nodes[0])

            # Currency sign of the price.
            currency_nodes = car_item.select(
                ".colHalf_payLater .pricePD span.setTop")
            if currency_nodes:
                currency_total = encoded_text(currency_nodes[0])

            # Skip cars with incomplete information (transmission is
            # optional).
            required = (price_total_text, car_class, currency_total,
                        car_model)
            if any(value is None for value in required):
                continue

            # Decimal comma becomes a point; "\xc2\xa0" is the UTF-8
            # encoded non-breaking space.
            cleaned_price = price_total_text.replace(",", ".")
            cleaned_price = cleaned_price.replace("\xc2\xa0", "").strip()
            price_total = float(cleaned_price)

            currency_code_total = CurrencyConverter.get_currency_code_of_sign(
                currency_sign=currency_total)

            # Normalized total price as returned by the converter.
            price_norm_total = CurrencyConverter.get_normalized_price(
                price=price_total, currency_code=currency_code_total)

            car_results.append({
                "car_model": car_model,
                "car_class": car_class,
                "transmission": transmission,
                "price_total": price_total,
                "price_norm_total": price_norm_total,
                "currency": currency_code_total,
                "access_time": time.strftime(
                    "%d-%m-%Y %H:%M:%S", time.gmtime()),
            })

        return car_results
Beispiel #5
0
    def _mobile_scraping_routine(self, page_source):
        ##
        #   Parses the mobile car selector and returns every car for which
        #   class, model, price and currency could be extracted.
        #
        #   @param {string} page_source - HTML markup of the result page.
        #
        #   @return {list} One dict per car with the keys car_model,
        #       transmission, car_class, price_total, price_norm_total,
        #       currency and access_time.
        #

        def encoded_text(node):
            # UTF-8 encoded, whitespace-stripped text of a single tag.
            return node.string.encode("utf-8").strip()

        # Icon class -> transmission label, checked in this order.
        transmission_icons = (
            ("icon-automatic", "automatic"),
            ("icon-manual", "manual"),
        )

        document = bs4.BeautifulSoup(page_source, 'html.parser')
        car_results = []

        for section in document.select(".car-selector > section"):
            # Cars of the current section.
            for car_item in section.select(".cars-list .car"):
                car_class = car_model = transmission = None
                price_total_text = currency_total = None

                # Class of the car.
                class_nodes = car_item.select(".status-label")
                if class_nodes:
                    car_class = encoded_text(class_nodes[0])

                # Model of the car.
                model_nodes = car_item.select(".car-class-desc")
                if model_nodes:
                    car_model = encoded_text(model_nodes[0])

                # Transmission, derived from the icon classes.
                icon_nodes = car_item.select(".transmission i")
                if icon_nodes:
                    icon_classes = icon_nodes[0].get("class")
                    for icon_name, label in transmission_icons:
                        if icon_name in icon_classes:
                            transmission = label
                            break

                # Total price text.
                price_nodes = car_item.select("span.price")
                if price_nodes:
                    price_total_text = encoded_text(price_nodes[0])

                # Currency sign of the price.
                currency_nodes = car_item.select("span.currency")
                if currency_nodes:
                    currency_total = encoded_text(currency_nodes[0])

                # Skip cars with incomplete information (transmission is
                # optional).
                required = (price_total_text, car_class, currency_total,
                            car_model)
                if any(value is None for value in required):
                    continue

                # Decimal comma becomes a point; "\xc2\xa0" is the UTF-8
                # encoded non-breaking space.
                cleaned_price = price_total_text.replace(",", ".")
                cleaned_price = cleaned_price.replace("\xc2\xa0", "").strip()
                price_total = float(cleaned_price)

                currency_code_total = \
                    CurrencyConverter.get_currency_code_of_sign(
                        currency_sign=currency_total)

                # Normalized total price as returned by the converter.
                price_norm_total = CurrencyConverter.get_normalized_price(
                    price=price_total, currency_code=currency_code_total)

                car_results.append({
                    "car_model": car_model,
                    "transmission": transmission,
                    "car_class": car_class,
                    "price_total": price_total,
                    "price_norm_total": price_norm_total,
                    "currency": currency_code_total,
                    "access_time": time.strftime(
                        "%d-%m-%Y %H:%M:%S", time.gmtime()),
                })

        return car_results
Beispiel #6
0
    def _scraping_routine(self, page_source):
        ##
        #   Parses the car search results and returns every car for which
        #   daily price, total price, class, company name and model could
        #   be extracted.
        #
        #   @param {string} page_source - HTML markup of the result page.
        #
        #   @return {list} One dict per car with the keys company_name,
        #       car_model, car_class, price_daily, price_norm_daily,
        #       price_total, price_norm_total, currency and access_time.
        #

        soup = bs4.BeautifulSoup(page_source, 'html.parser')
        orbitz_car_results = []

        for listing in soup.select("#search-results .listing-wrapper"):
            price_daily_text = price_total_text = None
            car_class = company_name = car_model = None

            # Daily price of the current car.
            daily_nodes = listing.find_all("div", {"class":"full-price"})
            if daily_nodes:
                price_daily_text = self._encode_and_strip(daily_nodes[0])

            # Total price of the current car.
            total_nodes = listing.find_all("div", {"class":"total"})
            if total_nodes:
                # TODO split "total" string from price.
                price_total_text = self._encode_and_strip(total_nodes[0])

            # Class of the car.
            class_nodes = listing.select("div.fullName span")
            if class_nodes:
                car_class = self._encode_and_strip(class_nodes[0])

            # Company name, read from the alt attribute of the vendor image.
            vendor_images = listing.select("div.vendor-image-box img")
            if vendor_images:
                # TODO Test whether the alt content is found or not.
                company_name = self._encode_and_strip(vendor_images[0]["alt"])

            # Model of the current car.
            model_nodes = listing.select(".car-model")
            if model_nodes:
                car_model = self._encode_and_strip(model_nodes[0])

            # Skip cars with incomplete information.
            required = (price_daily_text, price_total_text, car_class,
                        company_name, car_model)
            if any(value is None for value in required):
                continue

            # Daily price: split, map the currency sign, normalize.
            price_daily, currency_daily = \
                CurrencyConverter.split_price_and_currency(
                    price_text=price_daily_text)
            currency_code_daily = CurrencyConverter.get_currency_code_of_sign(
                currency_sign=currency_daily)
            price_norm_daily = CurrencyConverter.get_normalized_price(
                price=price_daily, currency_code=currency_code_daily)

            # Total price: split, map the currency sign, normalize.
            price_total, currency_total = \
                CurrencyConverter.split_price_and_currency(
                    price_text=price_total_text)
            currency_code_total = CurrencyConverter.get_currency_code_of_sign(
                currency_sign=currency_total)
            price_norm_total = CurrencyConverter.get_normalized_price(
                price=price_total, currency_code=currency_code_total)

            # The stored currency is the one of the daily price.
            orbitz_car_results.append({
                "company_name": company_name,
                "car_model": car_model,
                "car_class": car_class,
                "price_daily": price_daily,
                "price_norm_daily": price_norm_daily,
                "price_total": price_total,
                "price_norm_total": price_norm_total,
                "currency": currency_code_daily,
                "access_time": time.strftime(
                    "%d-%m-%Y %H:%M:%S", time.gmtime()),
            })

        return orbitz_car_results
Beispiel #7
0
    def _touch_scraping_routine(self, page_source):
        ##
        #   Parses the touch result list and returns every hotel for which
        #   a name, a price and a normalized price could be determined.
        #
        #   @param {string} page_source - HTML markup of the result page.
        #
        #   @return {list} One dict per hotel with the keys name, price,
        #       currency, price_norm, number_of_nights, rating_value,
        #       rating_unit and access_time.
        #

        hotel_results = []

        soup = bs4.BeautifulSoup(page_source, 'html.parser')

        hotellist_items = soup.select("#resultList .listItem")

        for div in hotellist_items:
            hotelname = price = currency_code = price_norm = None

            # Reset the rating for every item. Without this, a stars element
            # whose classes contain no star count left rating_value unbound
            # (first item) or carried over the value of the previous hotel.
            rating_value = 0.0
            rating_unit = None

            # Extract hotel name.
            div_name = div.select(".hotelData > .labeled")
            if div_name:
                hotelname = div_name[0].string.encode("utf-8").strip()

            # Extract rating information (stars).
            stars_element = div.select(".hotelData .smaller .stars")
            if stars_element:
                # Search the sX class which indicates the number of stars;
                # the last class ending in a digit wins. The [-1:] slice
                # keeps an empty class token from raising.
                for star_class in stars_element[0].get("class"):
                    if star_class[-1:].isdigit():
                        rating_value = float(star_class[1:])
                        rating_unit = "stars"

            # Extract hotel price.
            span_price = div.select(".hotelData > .priceInfo span.price")
            if span_price:
                price_text = span_price[0].string.encode("utf-8").strip()

                # Extract the price and currency from the price string.
                price, currency = \
                    CurrencyConverter.split_price_and_currency(
                        price_text=price_text)

                # Get the currency code for the extracted currency.
                currency_code = \
                    CurrencyConverter.get_currency_code_of_sign(
                        currency_sign=currency)

                # Normalized price as returned by the converter.
                price_norm = CurrencyConverter.get_normalized_price(
                    price=price, currency_code=currency_code)

            # Only complete hotels are stored.
            if hotelname is None or price is None or price_norm is None:
                continue

            # this information is not available on the website.
            # it seems to be always the price per night
            number_of_nights = 1

            hotel_results.append({
                "name": hotelname,
                "price": price,
                "currency": currency_code,
                "price_norm": price_norm,
                "number_of_nights": number_of_nights,
                "rating_value": rating_value,
                "rating_unit": rating_unit,
                "access_time": time.strftime(
                    "%d-%m-%Y %H:%M:%S", time.gmtime()),
            })

        return hotel_results
Beispiel #8
0
    def _scraping_routine(self, page_source):
        ##
        #   Parses the hotel teaser list and returns every hotel for which
        #   a name and a price could be extracted.
        #
        #   @param {string} page_source - HTML markup of the result page.
        #
        #   @return {list} One dict per hotel with the keys name, price,
        #       currency, price_norm, number_of_nights, rating_value,
        #       rating_unit and access_time.
        #

        document = bs4.BeautifulSoup(page_source, 'html.parser')
        hotel_results = []

        teasers = document.select(
            "div#containerAllHotels > .hotelTeaserContainer")

        for teaser in teasers:
            hotelname = price = currency_code = price_norm = None

            # Hotel name.
            name_links = teaser.select(".hotelname > a:nth-of-type(1)")
            if name_links:
                hotelname = name_links[0].string.encode("utf-8").strip()

            # Star rating: the text after the first five characters of the
            # span's first class is parsed as the number of stars.
            star_spans = teaser.select(".hotelname > span:nth-of-type(1)")
            if star_spans:
                first_class = star_spans[0].get("class")[0]
                rating_value = float(first_class[5:])
                rating_unit = "stars"
            else:
                rating_value = 0.0
                rating_unit = None

            # Price: if several price elements match, the last one wins.
            price_nodes = teaser.select(
                ".priceContainer > .standardPrice > strong")
            for price_node in price_nodes:
                parts = price_node.contents

                # Integer part; "\xc2\xa0" is the UTF-8 encoded
                # non-breaking space.
                whole = parts[0].encode("utf-8")
                whole = whole.replace("\xc2\xa0", "").replace(".", "")
                whole = whole.strip()

                # Fractional part from the <sup> element.
                fraction = price_node.find_all("sup")[0].string
                fraction = fraction.encode("utf-8").strip()
                if len(fraction) < 2:
                    fraction = "00"

                # Remove all non-numeric values from the integer part.
                whole = re.sub("[^0-9]", "", whole)

                price = float("{0}.{1}".format(whole, fraction))

                # Currency from a <span> if present, otherwise from the
                # third child of the price element.
                currency_spans = price_node.find_all("span")
                if currency_spans:
                    currency = currency_spans[0].string.encode(
                        "utf-8").strip()
                else:
                    currency = parts[2].encode("utf-8").strip()

                currency_code = CurrencyConverter.get_currency_code_of_sign(
                    currency)

                # Normalized price as returned by the converter.
                price_norm = CurrencyConverter.get_normalized_price(
                    price=price, currency_code=currency_code)

            # Only hotels with name and price are stored.
            if hotelname is None or price is None:
                continue

            hotel_results.append({
                "name": hotelname,
                "price": price,
                "currency": currency_code,
                "price_norm": price_norm,
                # this information is not available on the website.
                # it seems to be always the price per night
                "number_of_nights": 1,
                "rating_value": rating_value,
                "rating_unit": rating_unit,
                "access_time": time.strftime(
                    "%d-%m-%Y %H:%M:%S", time.gmtime()),
            })

        return hotel_results