Beispiel #1
0
    def from_string(cls, url):
        """Build an instance from a listing URL.

        The path segments (everything between the first and the last ``/``)
        are classified one by one: a known transaction, a known category, and
        otherwise the first unknown segment becomes the city and any later
        unknown segment the street. The literal ``nieruchomosci`` segment is
        skipped and is also the default category.

        Filters are read from the query string by cutting it on the text
        ``ps``; each piece after the first must contain exactly one ``=``
        once it has been cleaned, otherwise the unpacking raises ValueError.

        :param url: URL to parse
        :return: result of ``cls(category, city, street, transaction, filters)``
        """
        parts = urlparse(url)

        transaction = city = street = None
        category = 'nieruchomosci'
        for segment in parts.path.split('/')[1:-1]:
            if segment == 'nieruchomosci':
                continue
            if segment in POSSIBLE_TRANSACTIONS:
                transaction = segment
            elif segment in POSSIBLE_CATEGORIES:
                category = segment
            elif city:
                # A city was already found, so this segment is the street.
                street = segment
            else:
                city = segment

        # Percent-encoded brackets are restored and '&' separators dropped.
        cleanup = {
            '%5B': '[',
            '%5D': ']',
            '&': ''
        }
        filters = {}
        for fragment in parts.query.split('ps')[1:]:
            name, value = replace_all(fragment, cleanup).split('=')
            filters[name] = value

        return cls(category, city, street, transaction, filters)
Beispiel #2
0
def get_city_for_offer(item, *args, **kwargs):
    """ Extract the city name from an offer element

    The element text is split on double newlines and the fifth chunk
    (index 4) is passed through ``replace_all`` with a space -> empty
    string mapping.

    :param item: object exposing the offer text as ``item.text``
    :param args: unused
    :param kwargs: unused
    :return: name of city
    :rtype: str
    """
    sections = item.text.split('\n\n')
    city_raw = sections[4]
    return replace_all(city_raw, {' ': ''})
Beispiel #3
0
def get_voivodeship_for_offer(item, *args, **kwargs):
    """ Extract the voivodeship name from an offer element

    The element text is split on double newlines and the fourth chunk
    (index 3) is passed through ``replace_all`` with a space -> empty
    string mapping.

    :param item: object exposing the offer text as ``item.text``
    :param args: unused
    :param kwargs: unused
    :return: name of voivodeship
    :rtype: str
    """
    sections = item.text.split('\n\n')
    voivodeship_raw = sections[3]
    return replace_all(voivodeship_raw, {' ': ''})
Beispiel #4
0
def encode_text_to_html(text):
    """ Change text to lower case, replace Polish characters with their simplified versions
    and replace spaces with dashes

    :param text: text to encode
    :type text: str
    :return: encoded text which can be used in url
    :rtype: str
    """
    # Work on a copy: updating POLISH_CHARACTERS_MAPPING itself would add the
    # space -> dash rule to the shared module-level mapping for every other
    # user of that constant.
    replace_dict = dict(POLISH_CHARACTERS_MAPPING)
    replace_dict[' '] = '-'
    return replace_all(text.lower(), replace_dict)
Beispiel #5
0
def get_floor_for_offer(item, *args, **kwargs):
    """ Extract the floor number from an offer detail element

    The text of the first ``td`` in the enclosing ``tr`` is cleaned of
    newlines and spaces, and only the part before the first ``/`` is used.
    The word ``parter`` is reported as floor 0.

    :param item: element located inside the table row holding the floor
    :param args: unused
    :param kwargs: unused
    :return: number of floor, or None when ``item`` is falsy
    :rtype: int
    """
    if not item:
        return None
    cell_text = item.find_parent('tr').find('td').text
    cleaned = replace_all(cell_text, {'\n': '', ' ': ''})
    floor_token = cleaned.split('/')[0]
    if floor_token == 'parter':
        return 0
    return int(floor_token)
Beispiel #6
0
def get_offer_apartment_details(html_parser):
    """
    Collect the apartment details shown on an offer page into a dictionary.

    Starts at the first element with class ``oferta`` and then moves through
    its following sibling ``div`` elements, reading each one as a section.
    Sections containing ``li`` items contribute one entry per item; other
    sections contribute one entry keyed by their ``h4`` heading, except the
    section headed "Opis dodatkowy", which is skipped.

    :param html_parser: a BeautifulSoup object
    :rtype: dict
    :return: A dictionary full of details. If a "Wolne od" entry is present,
        its value is replaced by the result of ``parse_date_to_timestamp``.
    """
    raw_data = html_parser.find(class_="oferta")
    details_dict = {}
    # Mapping handed to replace_all for every collected value.
    replace_dict = {"\xa0": "", "Negocjuj cenę": "", "\n": ", "}
    # There is no explicit end condition: the loop stops on the first
    # AttributeError raised anywhere in the body (presumably when raw_data
    # becomes None after the last sibling div - TODO confirm).
    while True:
        try:
            if raw_data.find_all("li"):
                # List-style section: key is the first child of the item's
                # <span>, value is the cleaned text of the item's <div>.
                item_list = raw_data.find_all("li")
                for detail in item_list:
                    details_dict[detail.span.contents[0]] = replace_all(
                        detail.div.text.strip("\n"), replace_dict)
            else:
                # Heading-style section: the "Opis dodatkowy" section is
                # skipped by moving straight to the next sibling div.
                if raw_data.h4.contents[0] == "Opis dodatkowy":
                    raw_data = raw_data.find_next_sibling("div")
                    continue
                item_list = raw_data.find_all("p")
                for detail in item_list:
                    # The first paragraph creates the entry for this heading;
                    # later paragraphs are appended with no separator.
                    # NOTE(review): membership is tested with h4.text while the
                    # key used is h4.contents[0] - confirm these always match.
                    if raw_data.h4.text not in details_dict:
                        details_dict[raw_data.h4.contents[0]] = replace_all(
                            detail.text.strip("\n"), replace_dict)
                    else:
                        details_dict[raw_data.h4.contents[0]] += replace_all(
                            detail.text.strip("\n"), replace_dict)
            raw_data = raw_data.find_next_sibling("div")
        except AttributeError:
            break
    # Convert the "Wolne od" value, when present and non-empty.
    available_from_date = details_dict.get("Wolne od")
    if available_from_date:
        details_dict["Wolne od"] = parse_date_to_timestamp(available_from_date)
    return details_dict
Beispiel #7
0
def city_name(city):
    """ Convert a city name into the form used in OLX urls

    The name is lower-cased, run through ``replace_all`` with
    ``POLISH_CHARACTERS_MAPPING``, and its spaces are turned into dashes.
    On interpreters older than Python 3.3 the result is returned encoded
    as UTF-8.

    :param city: City name not in OLX url format
    :type city: str
    :return: Valid OLX url city name
    :rtype: str

    :Example:

    >> city_name("Ruda Śląska")
    "ruda-slaska"
    """
    lowered = city.lower()
    simplified = replace_all(lowered, POLISH_CHARACTERS_MAPPING)
    slug = simplified.replace(" ", "-")
    if sys.version_info >= (3, 3):
        return slug
    return slug.encode('utf-8')
Beispiel #8
0
def test_replace_all(text, dic, expected_value):
    """Check that ``utils.replace_all(text, dic)`` yields ``expected_value``."""
    actual = utils.replace_all(text, dic)
    assert actual == expected_value
Beispiel #9
0
def get_offer_information(url, context=None):
    """
    Download an OtoDom offer page and collect its details into a dictionary.

    :param url: a string containing a link to the offer
    :param context: a dictionary(string, string) taken straight from the :meth:`scrape.category.get_category`;
        when given, its ``offer_id`` entry is used for the phone-number lookup

    :returns: A dictionary containing the scraped offer details
    """
    # The raw body and the parsed tree are both needed further down.
    response = get_response_for_url(url)
    content = response.content
    html_parser = BeautifulSoup(content, "html.parser")

    if not context:
        # Without a context the phone-number lookup is skipped and the
        # meta fields stay empty.
        cookie, csrf_token, phone_numbers = "", "", ""
        context = {}
    else:
        cookie = get_cookie_from(response)
        try:
            csrf_token = get_csrf_token(content)
            offer_id = context['offer_id']
        except AttributeError:
            csrf_token, offer_id = '', ''

        try:
            raw_numbers = get_offer_phone_numbers(offer_id, cookie,
                                                  csrf_token)
        except KeyError:
            # offer was not present any more
            raw_numbers = []

        # Each raw entry is cleaned and then split on "." into separate numbers.
        cleanup = {u'\xa0': "", " ": "", "-": "", "+48": ""}
        phone_numbers = [
            piece
            for number in raw_numbers
            for piece in replace_all(number, cleanup).split(".")
        ]

    ninja_pv = get_offer_ninja_pv(content)
    meta = {
        'cookie': cookie,
        'csrf_token': csrf_token,
        'context': context
    }
    result = {
        'title': get_offer_title(html_parser),
        'address': get_offer_address(html_parser),
        'poster_name': get_offer_poster_name(html_parser),
        'poster_type': ninja_pv.get("poster_type"),
        'price': ninja_pv.get("ad_price"),
        'currency': ninja_pv.get("price_currency"),
        'city': ninja_pv.get("city_name"),
        'district': ninja_pv.get("district_name", ""),
        'voivodeship': ninja_pv.get("region_name"),
        'geographical_coordinates':
        get_offer_geographical_coordinates(html_parser),
        'phone_numbers': phone_numbers,
        'description': get_offer_description(html_parser),
        'offer_details': get_offer_details(html_parser),
        'photo_links': get_offer_photos_links(html_parser),
        'video_link': get_offer_video_link(html_parser),
        'facebook_description': get_offer_facebook_description(html_parser),
        'meta': meta
    }

    # Flat-specific fields are merged in only when at least one is truthy.
    extra = get_flat_data(html_parser, ninja_pv)
    if any(extra.values()):
        result.update(extra)
    return result