Example #1
# Assumes a simple_get() HTTP helper, defined elsewhere in the project, that
# returns the raw page content on success and a falsy value on failure.
from bs4 import BeautifulSoup


def get_content(num):
    # Kijiji "for sale" listings for St. John's, split so the page number can
    # be spliced into the URL.
    part_1 = "https://www.kijiji.ca/b-for-sale/st-johns/"
    part_2 = "c30353001l1700113?ad=offering"

    # Page 1 has no "page-N" segment in its URL.
    if num == 1:
        url = part_1 + part_2
    else:
        url = part_1 + "page-" + str(num) + "/" + part_2
    webpage_content = simple_get(url)
    if webpage_content:
        soup = BeautifulSoup(webpage_content, "html.parser")
        rent_houses = soup.find_all("div", {"class": "search-item"})
        return rent_houses
    else:
        return []
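
Both examples call a simple_get() helper that is not shown above. A minimal sketch, assuming it is a thin wrapper around the requests library that returns the page body on success and None on any failure:

# Hypothetical simple_get(), assumed by both examples: fetch a URL with
# requests and return the body, or None if the request fails or is not HTML.
import requests


def simple_get(url):
    try:
        response = requests.get(url, timeout=10)
        content_type = response.headers.get("Content-Type", "").lower()
        if response.status_code == 200 and "html" in content_type:
            return response.content
        return None
    except requests.RequestException:
        return None
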
Example #2
# Assumes simple_get(), get_content(), get_info() and a shared data_queue
# (e.g. a queue.Queue) are defined at module level.
import time

from bs4 import BeautifulSoup


def web_scraper(number, size=0):
    # Fetch all ad containers on results page `number`, then visit each ad.
    content = get_content(number)
    time.sleep(2)
    for ad in content:
        item_url = "https://www.kijiji.ca" + ad.a.get("href")
        item_content = simple_get(item_url)
        if not item_content:
            continue
        item_soup = BeautifulSoup(item_content, "html.parser")
        try:
            item_address = item_soup.find(
                "span",
                {"class": "address-3617944557"}).string.replace("\n", "")
        except Exception as e:
            print(f"Address is not found: {item_url}")
            continue

        try:
            latitude = float(
                item_soup.find("meta", {
                    "property": "og:latitude"
                }).get("content"))
            longitude = float(
                item_soup.find("meta", {
                    "property": "og:longitude"
                }).get("content"))
        except Exception as e:
            latitude = None
            longitude = None

        try:
            item_price = item_soup.find("span", {
                "class": "currentPrice-2842943473"
            }).string.replace("\n", "")
        except Exception as e:
            item_price = "Not available"
            print(f"Price is not found: {item_url}")

        try:
            item_title = item_soup.find(
                "h1", {"class": "title-2323565163"}).text.replace("\n", "")
        except Exception as e:
            item_title = "No title"
            print(f"Title is not found: {item_url}")

        try:
            labels = item_soup.find_all("dt",
                                        {"class": "attributeLabel-240934283"})
            values = item_soup.find_all("dd",
                                        {"class": "attributeValue-2574930263"})
            info_list = [
                label.string + ": " + value.string
                for label, value in zip(labels, values)
            ]
            item_info = " *** ".join(info_list)

            # Fall back to get_info() when no attribute pairs were found.
            if not item_info:
                item_info = " *** ".join(get_info(item_soup))
        except Exception as e:
            item_info = "Not available"
            print(f"Info is not found: {item_url}")

        try:
            des_list = [
                string for string in item_soup.find("h3", {
                    "class": "title-1536205785"
                }).next_sibling.strings
            ]
            des_list = [string.replace("\n", " ") for string in des_list]
            description = "".join(des_list)
        except Exception as e:
            description = "Not available"
            print(f"Description is not found: {item_url}")

        data = [
            item_title, item_url, item_address, latitude, longitude,
            item_price, item_info, description
        ]
        data_queue.put(data)
        size += 1
        print(f"Completed scraping from {item_url}")
        time.sleep(2)

    print(f"Thread #{number} scrapes {size} ads")