import os
import re
import time

import pandas as pd
from bs4 import BeautifulSoup

# Site-specific helpers used throughout (get_hrefs, get_name, get_price,
# get_xy, get_year, get_info, get_area, get_type_buildyear, YhcThread, ...)
# come from elsewhere in the project; hedged sketches of a few of them appear
# between the examples below.


def run():
    cities = ["hz", "cd", "bj", "sh", "hui", "nb"]
    maxpage = [100, 100, 21, 100, 39, 35]
    for city, max_pageid in zip(cities, maxpage):
        house_data = pd.DataFrame(columns=("name", "price", "year", "x", "y",
                                           "house_type_num",
                                           "house_structure_area"))
        csv_path = "./data/newhouse_lianjia_new_" + city + ".csv"
        head_url = "https://" + city + ".fang.lianjia.com"
        for page_id in range(1, max_pageid + 1):
            url = head_url + "/loupan/nhs1pg" + str(page_id)
            try:
                html = get_html(url)
                hrefs = get_hrefs(html, url_head=head_url)  # per-city host, not a hardcoded "hz"
            except Exception:
                print("-------------------big error-------------------")
                print(url)
                time.sleep(60)
                continue
            for href in hrefs:
                try:
                    house_html = get_html(href)
                    name = get_name(house_html)
                    price = get_price(house_html)
                    xy = get_xy(house_html)
                    structure = get_structure(house_html, url_head=head_url)
                    year = get_year(house_html)
                    data_row = {
                        "name": name,
                        "price": price,
                        "year": year,
                        "x": xy[0],
                        "y": xy[1],
                        "house_type_num": len(structure),
                        "house_structure_area": structure
                    }
                except Exception:
                    print("----------------error----------------")
                    print(href)
                    time.sleep(10)
                    continue
                print(data_row)
                # DataFrame.append was removed in pandas 2.0; concat a one-row frame instead
                house_data = pd.concat([house_data, pd.DataFrame([data_row])],
                                       ignore_index=True)
            house_data.to_csv(csv_path)  # checkpoint after every page
            print(
                city + " " + str(page_id) +
                ":------------------------------saved--------------------------------------"
            )
        print(
            city +
            ":-----------------------------finished-------------------------------"
        )
    print(
        "-----------------------------finished-------------------------------")
Example #2
def run():
    cities = ["hui", "nb"]
    maxpage = [100, 100]
    # cities = ["hz", "cd", "bj", "sh", "hui", "nb"]
    # maxpage = [100, 100, 100, 100, 100, 100]
    for city, maxpagei in zip(cities, maxpage):
        house_data = pd.DataFrame(columns=("name", "month_price",
                                           "house_structure", "area", "x",
                                           "y"))
        csv_path = "./data/rent_lianjia_" + city + ".csv"
        for page_id in range(1, maxpagei + 1):  # +1 so the last page is included, matching the first example
            url = "https://" + city + ".lianjia.com/zufang/pg" + str(page_id)
            try:
                html = get_html(url)
                hrefs = get_hrefs(html)
            except Exception:
                print("-------------------big error-------------------")
                print(url)
                time.sleep(60)
                continue
            for href in hrefs:
                try:
                    house_html = get_html(href)
                    name = get_name(house_html)
                    price = get_price(house_html)
                    xy = get_xy(house_html)
                    structure, area = get_info(house_html)
                    data_row = {
                        "name": name,
                        "month_price": price,
                        "house_structure": structure,
                        "area": area,
                        "x": xy[0],
                        "y": xy[1]
                    }
                except Exception:
                    print("----------------error----------------")
                    print(href)
                    time.sleep(10)
                    continue
                house_data = pd.concat([house_data, pd.DataFrame([data_row])],
                                       ignore_index=True)
                print(data_row)
            house_data.to_csv(csv_path)
            print(
                city + " " + str(page_id) +
                ":------------------------------saved--------------------------------------"
            )
        print(
            city +
            ":-----------------------------finished-------------------------------"
        )
    print(
        "-----------------------------finished-------------------------------")
def get_structure(house_html, url_head):
    structures = []
    soup = BeautifulSoup(house_html, "html.parser")  # explicit parser avoids bs4's guessing warning
    try:
        href = url_head + soup.find(attrs={
            "class": "h2-flow"
        }).find("a").attrs["href"]
        structure_html = get_html(href)
        structure_soup = BeautifulSoup(structure_html, "html.parser")
        list_soup = structure_soup.find(attrs={
            "class": "main-wrap huxingtu"
        }).find_all(attrs={"class": "huxing-item"})
        for li in list_soup:
            info = li.find(attrs={"class": "info clear"})
            lis = info.find("ul").find_all("li")
            structure = lis[0].text.split(":")[-1].strip()
            # areas can be fractional ("89.5"), so keep them as floats; int() would raise
            area = re.search(r"[0-9.]+", lis[1].text).group()
            total_price = li.find(attrs={"class": "price"}).find("i").text
            data_row = [{
                "structure": structure,
                "area": float(area),
                "total_price": int(total_price)
            }]
            structures += data_row
        return structures
    except Exception:
        return structures  # return whatever was parsed before the failure
def get_info_from_house_html(href):
    try:
        try:
            house_html = get_redirect_html(href)
            soup = BeautifulSoup(house_html, "html.parser")
            name = get_name(soup)
            price = get_price(soup)
            structure = get_structure(soup)
            area = get_area(soup)
            xy = get_xy(house_html)
        except Exception:
            house_html = get_html(href)
            soup = BeautifulSoup(house_html, "html.parser")
            name = get_name(soup)
            price = get_price(soup)
            structure = get_structure(soup)
            area = get_area(soup)
            xy = get_xy(house_html)
        data_row = {
            "name": name,
            "month_price": price,
            "house_structure": structure,
            "area": area,
            "x": xy[0],
            "y": xy[1]
        }
        return data_row
    except Exception:
        return None
Example #5
def get_xy(html_house):
    soup_house_html = BeautifulSoup(html_house, "html.parser")
    # the page embeds a Baidu map iframe whose data-src URL carries the coordinates
    href_map = 'https:' + soup_house_html.find(id="iframeBaiduMap").attrs['data-src']
    html_map = get_html(href_map)
    pattern = r'_vars\.cityx = "([0-9.]+)";_vars\.cityy = "([0-9.]+)"'
    match = re.search(pattern, html_map)
    # same ordering as the original index arithmetic: cityy first, then cityx
    return [float(match.group(2)), float(match.group(1))]
def run():
    cities = ["hz", "cd", "bj", "sh", "huizhou", "nb"]
    page_nums = [100, 100, 100, 100, 100, 100]
    for ind, city in enumerate(cities):
        house_data = pd.DataFrame(columns=("name", "month_price",
                                           "house_structure", "area", "x",
                                           "y"))
        csv_path = "./data1207/rent_fang_" + city + ".csv"
        if os.path.exists(csv_path):
            house_data = pd.read_csv(csv_path, index_col=0)
        url_head = "https://" + city + ".zu.fang.com"
        if city == "bj":
            url_head = "https://zu.fang.com/"
        for page_id in range(1, page_nums[ind] + 1):
            url = url_head + "/house/i3" + str(page_id) + "/"
            try:
                html = get_redirect_html(url)
                hrefs = get_hrefs(html, url_head)
            except Exception:
                try:
                    html = get_html(url)
                    hrefs = get_hrefs(html, url_head)
                except Exception:
                    print(
                        "--------------------------------big error---------------------------"
                    )
                    print(url)
                    time.sleep(60)
                    continue
            thread_list = []
            for href in hrefs:
                thread_list += [
                    YhcThread(get_info_from_house_html, args=(href, ))
                ]
            for t in thread_list:
                t.start()
            for t in thread_list:
                t.join(10)
                result = t.get_result()
                if result is not None:
                    print(result)
                    house_data = pd.concat([house_data, pd.DataFrame([result])],
                                           ignore_index=True)
                else:
                    print("error....")
                    print(t.get_args())
            house_data.to_csv(csv_path)
            print(city + " " + str(page_id) + ":----saved to " + csv_path +
                  "--------------")
        print(
            city +
            "----------------------------finished---------------------------------"
        )
    print(
        "---------------------------------all finished-----------------------------------------"
    )
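# YhcThread is project-specific, but its interface can be read off the call
# sites above: it remembers the target's arguments and exposes its return
# value after join(). A plausible reconstruction, not the project's code:
import threading


class YhcThread(threading.Thread):
    def __init__(self, func, args=()):
        super().__init__()
        self.func = func
        self.args = args
        self.result = None

    def run(self):
        self.result = self.func(*self.args)

    def get_result(self):
        # None means the target returned None, raised, or was still
        # running when join(timeout) expired
        return self.result

    def get_args(self):
        return self.args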
Example #7
def get_data_row(href):
    try:
        if href.find("?") == -1:
            href = get_redirect_url(href)
        house_html = get_html(href)
        name = get_name(house_html)
        price = get_price(house_html)
        structure, area, ave_price = get_info(house_html)
        xy = get_xy(house_html)
        type_str, build_year = get_type_buildyear(house_html)
        data_row = {"name": name, "type": type_str, "build_year": build_year, "year": None,
                    "total_price": price, "average_price": ave_price, "house_structure": structure,
                    "area": area, "x": xy[0], "y": xy[1]}
        return data_row
    except Exception:
        return None
Example #8
def run():
    cities = ["hz", "cd", "bj", "sh", "huizhou", "nb"]
    for city in cities:
        house_data = pd.DataFrame(columns=(
            "name", "type", "build_year", "year", "total_price", "average_price", "house_structure", "area", "x", "y"))
        csv_path = "./data1207/second_hard_house_fang_" + city + ".csv"
        if os.path.exists(csv_path):
            house_data = pd.read_csv(csv_path, index_col=0)
        url_head = "https://" + city + ".esf.fang.com"
        for page_id in range(1, 101):
            url = "https://" + city + ".esf.fang.com/house/i3" + str(page_id) + "/"
            try:
                url = get_redirect_url(url)
                html = get_html(url)
                hrefs = get_hrefs(html, url_head)
            except Exception:
                print("-------------------big error-------------------")
                print(url)
                time.sleep(60)
                continue
            thread_list = []
            for href in hrefs:
                thread_list += [YhcThread(get_data_row, args=(href,))]
            for t in thread_list:
                t.start()
            for t in thread_list:
                t.join(10)
                result = t.get_result()
                if result is not None:
                    print(result)
                    house_data = pd.concat([house_data, pd.DataFrame([result])],
                                           ignore_index=True)
                else:
                    print("error....")
                    print(t.get_args())
            house_data.to_csv(csv_path)
            print(city + " " + str(page_id) + ":---------saved to " + csv_path + "---------------------")
        print(city + ":--------------------finished----------------------")
    print("---------------------all finished-------------------------")
Example #9
def get_redirect_url(url):
    html = get_html(url)
    pattern = "t3='.*?'"
    part_url_str = re.search(pattern, html).group()
    part_url = part_url_str.replace("t3='", "").replace("'", "")
    return url + "?" + part_url
def get_redirect_html(href):
    href = get_redirect_url(href)
    house_html = get_html(href)
    return house_html
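# How the two helpers above fit together: fang.com appears to serve an
# interstitial page whose inline JavaScript carries a token like  t3='...' ,
# and appending it as a query string unlocks the real listing page. A usage
# sketch (the token value is illustrative, not real):
#
#   get_redirect_url("https://hz.esf.fang.com/house/i31/")
#   -> "https://hz.esf.fang.com/house/i31/?<value of t3>"
#
# This is also why get_data_row in Example #7 only resolves the redirect when
# the URL does not already contain a "?".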