Beispiel #1
0
def parse_offer(url):
    """ Downloads a single offer page and extracts its details

    The page is fetched with ``get_content_for_url`` and parsed with BeautifulSoup. The ``<body>`` markup feeds the
    detail helpers, the ``<head>`` markup feeds ``parse_tracking_data``.

    :param url: Url of current offer page
    :type url: str
    :return: Dictionary with all offer details or None if offer is not available anymore
    :rtype: dict, None
    """
    # The url is echoed to stdout on every call.
    print(url)
    page = get_content_for_url(url)
    soup = BeautifulSoup(page.content, "html.parser")
    body_markup = str(soup.body)
    poster_name = get_poster_name(body_markup)
    price, currency, add_id = parse_tracking_data(str(soup.head))

    # An offer without an id or without a poster name is treated as withdrawn.
    if not (add_id and poster_name):
        log.info("Offer {0} is not available anymore.".format(url))
        return None

    # parse_region yields either (city, voivodeship, district) or just (city, voivodeship).
    location = parse_region(body_markup)
    district = None
    if len(location) == 3:
        city, voivodeship, district = location
    else:
        city, voivodeship = location

    gpt_data = get_gpt_script(body_markup)
    offer = {
        "title": get_title(body_markup),
        "add_id": add_id,
        "price": price,
        "currency": currency,
        "city": city,
        "district": district,
        "voivodeship": voivodeship,
        "gps": get_gps(body_markup),
        "description": parse_description(body_markup),
        "poster_name": poster_name,
        "url": url,
        "date_added": get_date_added(body_markup),
        "images": get_img_url(body_markup),
        "private_business": gpt_data.get("private_business"),
    }

    # Flat-specific fields are merged only when at least one of them carries a value.
    flat_details = parse_flat_data(body_markup, gpt_data)
    if not flat_details or not any(flat_details.values()):
        return offer
    offer.update(flat_details)
    return offer
Beispiel #2
0
def get_offers_for_page(page,
                        main_category=None,
                        sub_category=None,
                        detail_category=None,
                        region=None,
                        search_query=None,
                        url=None,
                        **filters):
    """ Parses offers for one specific page of given category with filters.

    :param page: Page number
    :param url: User defined url for OLX page with offers. It overrides category parameters and applies search filters.
    :param main_category: Main category
    :param sub_category: Sub category
    :param detail_category: Detail category
    :param region: Region of search
    :param search_query: Additional search query
    :param filters: See :meth category.get_category for reference
    :type page: int
    :type url: str, None
    :type main_category: str, None
    :type sub_category: str, None
    :type detail_category: str, None
    :type region: str, None
    :type search_query: str, None
    :type filters: dict
    :return: List of all offers for given page and parameters. Empty list when the page has no offers.
    :rtype: list
    """
    city = city_name(region) if region else None
    if url is None:
        url = get_url(main_category,
                      sub_category,
                      detail_category,
                      city,
                      search_query,
                      page=page,
                      **filters)
    else:
        # A user supplied url replaces the category parameters; page and filters are still applied to it.
        url = get_url(page=page, user_url=url, **filters)
    response = get_content_for_url(url)
    log.info("Loaded page {0} of offers".format(page))
    offers = parse_available_offers(response.content)
    if offers is None:
        # The parser may signal "no offers" with None; normalise to an empty list so that len() below
        # does not raise TypeError and callers always receive the documented list type.
        offers = []
    log.info("Loaded {0} offers".format(str(len(offers))))
    return offers
Beispiel #3
0
def get_page_count_for_filters(main_category=None,
                               sub_category=None,
                               detail_category=None,
                               region=None,
                               search_query=None,
                               url=None,
                               **filters):
    """ Reads total page number for given search filters

    The count is taken from the first comma separated fragment containing ``page_count`` with at least one digit,
    read from the text of the third sibling after the first ``<script>`` tag in the page ``<head>``. When the page
    does not have that layout, or no usable number is found, a warning is logged and 1 is returned.

    :param url: User defined url for OLX page with offers. It overrides category parameters and applies search filters.
    :param main_category: Main category
    :param sub_category: Sub category
    :param detail_category: Detail category
    :param region: Region of search
    :param search_query: Additional search query
    :param filters: See :meth category.get_category for reference
    :type url: str, None
    :type main_category: str, None
    :type sub_category: str, None
    :type detail_category: str, None
    :type region: str, None
    :type search_query: str, None
    :type filters: dict
    :return: Total page number
    :rtype: int
    """
    city = city_name(region) if region else None
    if url is None:
        url = get_url(main_category, sub_category, detail_category, city,
                      search_query, **filters)
    response = get_content_for_url(url)
    html_parser = BeautifulSoup(response.content, "html.parser")
    try:
        script_text = html_parser.head.script.next_sibling.next_sibling.next_sibling.text
    except AttributeError:
        # Missing <head>, <script> or one of the expected siblings: not the expected page layout,
        # so fall through to the warning below instead of crashing.
        script_text = ""
    for element in script_text.split(","):
        if "page_count" not in element:
            continue
        # The value follows the last colon of the fragment; keep only its digits.
        digits = "".join(char for char in element.split(":")[-1] if char.isdigit())
        if digits:
            return int(digits)
    log.warning(
        "Error no page number found. Please check if it's valid olx page.")
    return 1
Beispiel #4
0
def get_category(main_category=None,
                 sub_category=None,
                 detail_category=None,
                 region=None,
                 search_query=None,
                 url=None,
                 **filters):
    """ Collects offers from every result page of given category

    The first request is used only to read the number of pages. Pages are then loaded one by one, starting from
    page 0, until the page count is reached or a page yields no offers.

    :param url: User defined url for OLX page with offers. It overrides category parameters and applies search filters.
    :param main_category: Main category
    :param sub_category: Sub category
    :param detail_category: Detail category
    :param region: Region of search
    :param search_query: Additional search query
    :param filters: Dictionary with additional filters. Following example dictionary contains every possible filter
    with examples of its values.

    :Example:

    input_dict = {
        "[filter_float_price:from]": 2000, # minimal price
        "[filter_float_price:to]": 3000, # maximal price
        "[filter_enum_floor_select][0]": 3, # desired floor, enum: from -1 to 11 (10 and more) and 17 (attic)
        "[filter_enum_furniture][0]": True, # furnished or unfurnished offer
        "[filter_enum_builttype][0]": "blok", # valid build types:
        #                                             blok, kamienica, szeregowiec, apartamentowiec, wolnostojacy, loft
        "[filter_float_m:from]": 25, # minimal surface
        "[filter_float_m:to]": 50, # maximal surface
        "[filter_enum_rooms][0]": 2 # desired number of rooms, enum: from 1 to 4 (4 and more)
    }

    :type url: str, None
    :type main_category: str, None
    :type sub_category: str, None
    :type detail_category: str, None
    :type region: str, None
    :type search_query: str, None
    :type filters: dict
    :return: List of all offers for given parameters
    :rtype: list
    """
    # Remember whether the caller supplied a url; it decides how every page url is built.
    user_url = url
    city = city_name(region) if region else None
    if user_url is None:
        url = get_url(main_category, sub_category, detail_category, city,
                      search_query, **filters)

    # Initial request: only the page count is taken from it.
    first_response = get_content_for_url(url)
    page_max = get_page_count(first_response.content)

    offers_per_page = []
    page = 0
    while page < page_max:
        if user_url is not None:
            url = get_url(page=page, user_url=user_url, **filters)
        else:
            url = get_url(main_category, sub_category, detail_category, city,
                          search_query, page, **filters)
        log.debug(url)
        page_response = get_content_for_url(url)
        log.info("Loaded page {0} of offers".format(page))
        page_offers = parse_available_offers(page_response.content)
        if page_offers is None:
            # No offers on this page: stop crawling.
            break
        offers_per_page.append(page_offers)
        page += 1

    all_offers = list(flatten(offers_per_page))
    log.info("Loaded {0} offers".format(str(len(all_offers))))
    return all_offers