def parse_offer(url):
    """ Parses data from offer page url

    :param url: Url of current offer page
    :type url: str
    :return: Dictionary with all offer details or None if offer is not
        available anymore
    :rtype: dict, None
    """
    # debug-level trace of the page being parsed (was a leftover print())
    log.debug(url)
    html_parser = BeautifulSoup(
        get_content_for_url(url).content, "html.parser")
    offer_content = str(html_parser.body)
    poster_name = get_poster_name(offer_content)
    # tracking data lives in the <head>, not the body
    price, currency, add_id = parse_tracking_data(str(html_parser.head))
    # missing ad id or poster name means the offer page no longer exists
    if not all([add_id, poster_name]):
        log.info("Offer {0} is not available anymore.".format(url))
        return
    region = parse_region(offer_content)
    # region is either (city, voivodeship, district) or (city, voivodeship)
    if len(region) == 3:
        city, voivodeship, district = region
    else:
        city, voivodeship = region
        district = None
    data_dict = get_gpt_script(offer_content)
    result = {
        "title": get_title(offer_content),
        "add_id": add_id,
        "price": price,
        "currency": currency,
        "city": city,
        "district": district,
        "voivodeship": voivodeship,
        "gps": get_gps(offer_content),
        "description": parse_description(offer_content),
        "poster_name": poster_name,
        "url": url,
        "date_added": get_date_added(offer_content),
        "images": get_img_url(offer_content),
        "private_business": data_dict.get("private_business"),
    }
    # merge flat-specific details only when at least one field was found
    flat_data = parse_flat_data(offer_content, data_dict)
    if flat_data and any(flat_data.values()):
        result.update(flat_data)
    return result
def get_offers_for_page(page, main_category=None, sub_category=None,
                        detail_category=None, region=None, search_query=None,
                        url=None, **filters):
    """ Parses offers for one specific page of given category with filters.

    :param page: Page number
    :param url: User defined url for OLX page with offers. It overrides
        category parameters and applies search filters.
    :param main_category: Main category
    :param sub_category: Sub category
    :param detail_category: Detail category
    :param region: Region of search
    :param filters: See :meth category.get_category for reference
    :type page: int
    :type url: str, None
    :type main_category: str, None
    :type sub_category: str, None
    :type detail_category: str, None
    :type region: str, None
    :type search_query: str, None
    :type filters: dict
    :return: List of all offers for given page and parameters
    :rtype: list
    """
    city = city_name(region) if region else None
    # a user-supplied url wins over the category parameters
    target = (
        get_url(page=page, user_url=url, **filters)
        if url is not None
        else get_url(main_category, sub_category, detail_category, city,
                     search_query, page=page, **filters)
    )
    response = get_content_for_url(target)
    log.info("Loaded page {0} of offers".format(page))
    offers = parse_available_offers(response.content)
    log.info("Loaded {0} offers".format(str(len(offers))))
    return offers
def get_page_count_for_filters(main_category=None, sub_category=None,
                               detail_category=None, region=None,
                               search_query=None, url=None, **filters):
    """ Reads total page number for given search filters

    :param url: User defined url for OLX page with offers. It overrides
        category parameters and applies search filters.
    :param main_category: Main category
    :param sub_category: Sub category
    :param detail_category: Detail category
    :param region: Region of search
    :param search_query: Additional search query
    :param filters: See :meth category.get_category for reference
    :type url: str, None
    :type main_category: str, None
    :type sub_category: str, None
    :type detail_category: str, None
    :type region: str, None
    :type search_query: str, None
    :return: Total page number (1 when it cannot be determined)
    :rtype: int
    """
    city = city_name(region) if region else None
    if url is None:
        url = get_url(main_category, sub_category, detail_category, city,
                      search_query, **filters)
    response = get_content_for_url(url)
    html_parser = BeautifulSoup(response.content, "html.parser")
    try:
        # page_count is embedded in an inline script in <head>; the sibling
        # chain depends on exact markup, so guard against layout changes
        script = html_parser.head.script.next_sibling.next_sibling \
            .next_sibling.text.split(",")
    except AttributeError:
        script = []
    for element in script:
        if "page_count" in element:
            # keep only the digits after the last colon, e.g. '"page_count":12'
            digits = "".join(
                char for char in element.split(":")[-1] if char.isdigit())
            # guard: int("") would raise ValueError on malformed entries
            if digits:
                return int(digits)
    log.warning(
        "Error no page number found. Please check if it's valid olx page.")
    return 1
def get_category(main_category=None, sub_category=None, detail_category=None,
                 region=None, search_query=None, url=None, **filters):
    """ Parses available offer urls from given category from every page

    :param url: User defined url for OLX page with offers. It overrides
        category parameters and applies search filters.
    :param main_category: Main category
    :param sub_category: Sub category
    :param detail_category: Detail category
    :param region: Region of search
    :param search_query: Additional search query
    :param filters: Dictionary with additional filters. Following example
        dictionary contains every possible filter with examples of it's values.

    :Example:

    input_dict = {
        "[filter_float_price:from]": 2000,  # minimal price
        "[filter_float_price:to]": 3000,  # maximal price
        "[filter_enum_floor_select][0]": 3,  # desired floor,
        # enum: from -1 to 11 (10 and more) and 17 (attic)
        "[filter_enum_furniture][0]": True,  # furnished or unfurnished offer
        "[filter_enum_builttype][0]": "blok",  # valid build types:
        # blok, kamienica, szeregowiec, apartamentowiec, wolnostojacy, loft
        "[filter_float_m:from]": 25,  # minimal surface
        "[filter_float_m:to]": 50,  # maximal surface
        "[filter_enum_rooms][0]": 2  # desired number of rooms,
        # enum: from 1 to 4 (4 and more)
    }

    :type url: str, None
    :type main_category: str, None
    :type sub_category: str, None
    :type detail_category: str, None
    :type region: str, None
    :type search_query: str, None
    :type filters: dict
    :return: List of all offers for given parameters
    :rtype: list
    """
    city = city_name(region) if region else None
    # remember whether the caller supplied an explicit url; it then
    # overrides the category parameters for every page request
    start_url = url
    if url is None:
        url = get_url(main_category, sub_category, detail_category, city,
                      search_query, **filters)
    response = get_content_for_url(url)
    page_max = get_page_count(response.content)
    collected = []
    for page in range(page_max):
        if start_url is None:
            url = get_url(main_category, sub_category, detail_category, city,
                          search_query, page, **filters)
        else:
            url = get_url(page=page, user_url=start_url, **filters)
        log.debug(url)
        response = get_content_for_url(url)
        log.info("Loaded page {0} of offers".format(page))
        offers = parse_available_offers(response.content)
        # None signals an unparsable page — stop paging early
        if offers is None:
            break
        collected.append(offers)
    result = list(flatten(collected))
    log.info("Loaded {0} offers".format(str(len(result))))
    return result