def get_communityinfo_by_url(url):
    source_code = crawlcore.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    communityinfos = soup.findAll("div", {"class": "xiaoquInfoItem"})
    # Map the site's Chinese field labels to database column names (built once,
    # outside the loop)
    key_type = {
        u"建筑年代": "year",
        u"建筑类型": "housetype",
        u"物业费用": "cost",
        u"物业公司": "service",
        u"开发商": "company",
        u"楼栋总数": "building_num",
        u"房屋总数": "house_num",
    }
    res = {}
    for info in communityinfos:
        try:
            key = info.find("span", {"class": "xiaoquInfoLabel"})
            value = info.find("span", {"class": "xiaoquInfoContent"})
            key_info = key_type[key.get_text().strip()]
            value_info = value.get_text().strip()
            res.update({key_info: value_info})
        except Exception:
            # Skip entries whose label is unknown or whose markup is incomplete
            continue

    return res
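
# A minimal usage sketch for get_communityinfo_by_url, assuming BASE_URL points
# at the city site; the community path below is purely illustrative:
#
#     info = get_communityinfo_by_url(BASE_URL + "xiaoqu/123456/")
#     # info is a dict keyed by the mapped column names, e.g.
#     # {'year': '...', 'housetype': '...', 'cost': '...', 'company': '...'}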
def GetHouseByRegionlist():
    redis_server.set(REDIS_ERSHOUFANG_DONE_COUNT_KEY, 0)
    starttime = datetime.datetime.now()
    url = BASE_URL + "ershoufang"
    source_code = crawlcore.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    region_div = soup.find('div', {'data-role': 'ershoufang'})
    if region_div is None:
        raise RuntimeError('region list not found for url: %s' % url)
    region_list = region_div.findAll('a')
    for regionlink in region_list:
        logging.info("Get Onsale House Infomation in %s" %
                     regionlink.get_text())
        try:
            # get_house_perregion(regionlink.get('href'))
            for i in range(REGION_THREAD_PAGE_NUMBER):
                _thread.start_new_thread(
                    get_house_perregion,
                    ('thread-%s-%d' %
                     (regionlink.get_text(), i), i, regionlink.get('href')))
        except Exception as e:
            logging.error(e)
    # Block until every worker thread has bumped the Redis completion counter
    while int(redis_server.get(REDIS_ERSHOUFANG_DONE_COUNT_KEY)
              ) != len(region_list) * REGION_THREAD_PAGE_NUMBER:
        time.sleep(1)
    endtime = datetime.datetime.now()
    logging.info("ershoufang Run time: " + str(endtime - starttime))
def GetRentByRegionlist():
    starttime = datetime.datetime.now()
    redis_server.set(REDIS_RENT_DONE_COUNT_KEY, 0)
    url = BASE_URL + "zufang"
    source_code = crawlcore.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    region_list = soup.find('dd', {'data-index': '0'}).findAll('a')

    for regionlink in region_list:
        if regionlink.get_text() != '不限':
            logging.info("Get Rent House Infomation in %s" %
                         regionlink.get_text())
            try:
                for i in range(REGION_THREAD_PAGE_NUMBER):
                    # get_rent_perregion(regionlink.get('href'))
                    _thread.start_new_thread(get_rent_perregion,
                                             ('thread-%s-%d' %
                                              (regionlink.get_text(), i), i,
                                              regionlink.get('href')))
            except Exception as e:
                logging.error(e)
    # Block until every worker thread has bumped the Redis completion counter
    # ('不限' is skipped above, hence len(region_list) - 1)
    while int(redis_server.get(REDIS_RENT_DONE_COUNT_KEY)) != (
            len(region_list) - 1) * REGION_THREAD_PAGE_NUMBER:
        time.sleep(1)
    endtime = datetime.datetime.now()
    logging.info("Run time: " + str(endtime - starttime))
def GetHouseByRegionlist():
    starttime = datetime.datetime.now()
    url = BASE_URL + "ershoufang"
    source_code = crawlcore.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    region_div = soup.find('div', {'data-role': 'ershoufang'})
    if region_div is None:
        raise RuntimeError('region list not found for url: %s' % url)
    region_list = region_div.findAll('a')
    for regionlink in region_list:
        logging.info("Get Onsale House Infomation in %s" %
                     regionlink.get_text())
        try:
            get_house_perregion(regionlink.get('href'))
        except Exception as e:
            logging.error(e)
    endtime = datetime.datetime.now()
    logging.info("Run time: " + str(endtime - starttime))
def GetRentByRegionlist():
    starttime = datetime.datetime.now()

    url = BASE_URL + "zufang"
    source_code = crawlcore.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    region_list = soup.find('dd', {'data-index': '0'}).findAll('a')

    for regionlink in region_list:
        if regionlink.get_text() != '不限':
            logging.info("Get Rent House Infomation in %s" %
                         regionlink.get_text())
            try:
                get_rent_perregion(regionlink.get('href'))
            except Exception as e:
                logging.error(e)
    endtime = datetime.datetime.now()
    logging.info("Run time: " + str(endtime - starttime))
def get_house_perregion(link):
    url = BASE_URL + link
    source_code = crawlcore.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    total_pages = crawlcore.get_total_pages(soup)
    if total_pages is None:
        raise RuntimeError('total_pages not found for url: %s' % url)

    for page in range(1, total_pages + 1):
        if page > 1:
            url_page = url + u"/pg%d/" % page
            source_code = crawlcore.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        logging.info("GetHouseByRegionlist district:%s page:%s totalpage:%s" %
                     (link, page, total_pages))
        data_source = []
        hisprice_data_source = []
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update({'title': housetitle.get_text().strip()})
                    info_dict.update({'link': housetitle.a.get('href')})
                    houseID = housetitle.a.get('data-housecode')
                    if houseID is None:
                        houseID = housetitle.a.get(
                            'data-lj_action_housedel_id')
                    info_dict.update({'houseID': houseID})

                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('|')

                    # logging.info('houseID: %s houseinfo: %s' % (houseID,info))

                    info_dict.update({'community': info[0]})
                    info_dict.update({'housetype': info[1]})
                    info_dict.update({'square': info[2]})
                    info_dict.update({'direction': info[3]})
                    info_dict.update({'decoration': info[4]})

                    housefloor = name.find("div", {"class": "positionInfo"})
                    info_dict.update({'years': housefloor.get_text().strip()})
                    info_dict.update({'floor': housefloor.get_text().strip()})

                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict.update(
                        {'followInfo': followInfo.get_text().strip()})

                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree is None:
                        info_dict.update({"taxtype": ""})
                    else:
                        info_dict.update(
                            {"taxtype": taxfree.get_text().strip()})

                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict.update(
                        {'totalPrice': totalPrice.span.get_text()})

                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict.update(
                        {'unitPrice': unitPrice.get("data-price")})

                    info_dict.update({'version': setting.DB_VERSION})
                except Exception:
                    # Skip listings whose markup is missing expected fields
                    continue
                # Houseinfo insert into mysql
                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID": info_dict["houseID"],
                    "totalPrice": info_dict["totalPrice"],
                    'version': setting.DB_VERSION
                })

        with model.database.atomic():
            model.Houseinfo.insert_many(data_source).execute()
            model.Hisprice.insert_many(hisprice_data_source).execute()
        time.sleep(1)
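
# model.Houseinfo / model.Hisprice are assumed to be peewee models: insert_many
# inside model.database.atomic() writes one listing page per transaction, and
# the one-second sleep above throttles requests between pages.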
def get_community_from_default_all(thread_name, part_no):
    starttime = datetime.datetime.now()
    logging.info(
        '[%s] get_community_from_default_all: fetching community info from '
        'the full xiaoqu listing' % thread_name)
    url = BASE_URL + "xiaoqu/?from=rec"
    source_code = crawlcore.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    total_pages = crawlcore.get_total_pages(soup)
    i = 0
    if total_pages is None:
        raise RuntimeError('total_pages not found for url: %s' % url)

    start_no = 0
    end_no = 0
    start_no = int(total_pages / ALL_THREAD_NUMBER) * part_no + 1
    if part_no == ALL_THREAD_NUMBER - 1:
        end_no = total_pages + 1
    else:
        end_no = int(total_pages / ALL_THREAD_NUMBER) * (part_no + 1) + 1

    for page in range(start_no, end_no):
        if page > 1:
            sub_url = BASE_URL + "xiaoqu/pg%s" % page
            source_code = crawlcore.get_source_code(sub_url)
            soup = BeautifulSoup(source_code, 'lxml')

        name_list = soup.findAll('li', {'class': 'clear xiaoquListItem'})
        logging.info(
            '[%s] get_community_from_default_all page:%s total_page:%s' %
            (thread_name, page, total_pages))
        for name in name_list:
            info_dict = {}
            try:
                communitytitle = name.find("div", {"class": "title"})
                title = communitytitle.get_text().strip('\n')

                if redis_server.sismember(REDIS_COMMUNITY_SET_NAME, title):
                    continue
                i = i + 1
                redis_server.sadd(REDIS_COMMUNITY_SET_NAME, title)
                link = communitytitle.a.get('href')
                info_dict.update({'title': title})
                info_dict.update({'link': link})

                district = name.find("a", {"class": "district"})
                info_dict.update({'district': district.get_text()})

                bizcircle = name.find("a", {"class": "bizcircle"})
                info_dict.update({'bizcircle': bizcircle.get_text()})

                tagList = name.find("div", {"class": "tagList"})
                info_dict.update({'tagList': tagList.get_text().strip('\n')})

                onsale = name.find("a", {"class": "totalSellCount"})
                info_dict.update(
                    {'onsale': onsale.span.get_text().strip('\n')})

                onrent = name.find("a", {"title": title + u"租房"})
                info_dict.update(
                    {'onrent': onrent.get_text().strip('\n').split(u'套')[0]})

                info_dict.update({'communityid': name.get('data-housecode')})

                price = name.find("div", {"class": "totalPrice"})
                info_dict.update({'price': price.span.get_text().strip('\n')})

                info_dict.update({'version': setting.DB_VERSION})

                communityinfo = get_communityinfo_by_url(link)
                for key, value in communityinfo.items():
                    info_dict.update({key: value})
                with model.database.atomic():
                    model.Community.insert(info_dict).execute()
                time.sleep(1)
            except Exception:
                # Skip communities whose markup is missing expected fields
                continue
    endtime = datetime.datetime.now()
    redis_server.incr(REDIS_COMMUNITY_DONE_COUNT_KEY, 1)
    logging.info("[%s] " % thread_name +
                 "get_community_from_default_all Run time: " +
                 str(endtime - starttime) + " Total:" + str(i))
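
# ObjectForNone is defined elsewhere in this project but used heavily below.
# A minimal sketch of the assumed behavior, for reference only (not the
# project's real helper): evaluate an attribute expression on a possibly-None
# BeautifulSoup tag and fall back to a default when the tag is missing or the
# lookup fails.
def ObjectForNone(obj, info_dict, key, expression, default):
    if obj is None:
        info_dict.update({key: default})
        return
    try:
        # e.g. expression 'get_text().strip()' -> obj.get_text().strip()
        info_dict.update({key: eval('obj.' + expression)})
    except Exception:
        info_dict.update({key: default})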
def get_house_perregion(thread_name, part_no, link):
    url = BASE_URL + link
    source_code = crawlcore.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    total_pages = crawlcore.get_total_pages(soup)
    if total_pages is None:
        raise RuntimeError('total_pages not found for url: %s' % url)

    start_no = 0
    end_no = 0
    start_no = int(total_pages / REGION_THREAD_PAGE_NUMBER) * part_no + 1
    if part_no == REGION_THREAD_PAGE_NUMBER - 1:
        end_no = total_pages + 1
    else:
        end_no = int(
            total_pages / REGION_THREAD_PAGE_NUMBER) * (part_no + 1) + 1

    for page in range(start_no, end_no):
        if page > 1:
            url_page = url + u"/pg%d/" % page
            source_code = crawlcore.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        logging.info(
            "[%s]GetHouseByRegionlist district:%s page:%s totalpage:%s" %
            (thread_name, link, page, total_pages))
        data_source = []
        hisprice_data_source = []
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update({'title': housetitle.get_text().strip()})
                    info_dict.update({'link': housetitle.a.get('href')})
                    houseID = housetitle.a.get('data-housecode')
                    if houseID is None:
                        houseID = housetitle.a.get(
                            'data-lj_action_housedel_id')
                    info_dict.update({'houseID': houseID})

                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('|')

                    # logging.info('houseID: %s houseinfo: %s' % (houseID,info))

                    info_dict.update({'community': info[0]})
                    info_dict.update({'housetype': info[1]})
                    info_dict.update({'square': info[2]})
                    info_dict.update({'direction': info[3]})
                    info_dict.update({'decoration': info[4]})

                    housefloor = name.find("div", {"class": "positionInfo"})
                    # info_dict.update({'years':housefloor.get_text().strip()})
                    # info_dict.update({'floor':housefloor.get_text().strip()})
                    ObjectForNone(housefloor, info_dict, 'years',
                                  'get_text().strip()', '')
                    ObjectForNone(housefloor, info_dict, 'floor',
                                  'get_text().strip()', '')

                    followInfo = name.find("div", {"class": "followInfo"})
                    # info_dict.update({'followInfo':followInfo.get_text().strip()})
                    ObjectForNone(followInfo, info_dict, 'followInfo',
                                  'get_text().strip()', '')

                    taxfree = name.find("span", {"class": "taxfree"})
                    # if taxfree == None:
                    #     info_dict.update({"taxtype":""})
                    # else:
                    #     info_dict.update({"taxtype":taxfree.get_text().strip()})
                    ObjectForNone(taxfree, info_dict, 'taxtype',
                                  'get_text().strip()', '')

                    totalPrice = name.find("div", {"class": "totalPrice"})
                    # info_dict.update({'totalPrice':totalPrice.span.get_text()})
                    ObjectForNone(totalPrice, info_dict, 'totalPrice',
                                  'span.get_text()', '0')

                    unitPrice = name.find("div", {"class": "unitPrice"})
                    # info_dict.update({'unitPrice':unitPrice.get("data-price")})
                    ObjectForNone(unitPrice, info_dict, 'unitPrice',
                                  'get("data-price")', '0')

                    info_dict.update({'version': setting.DB_VERSION})
                except Exception:
                    # Skip listings whose markup is missing expected fields
                    continue
                # Houseinfo insert into mysql
                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID": info_dict["houseID"],
                    "totalPrice": info_dict["totalPrice"],
                    'version': setting.DB_VERSION
                })

        with model.database.atomic():
            model.Houseinfo.insert_many(data_source).execute()
            model.Hisprice.insert_many(hisprice_data_source).execute()
        time.sleep(1)

    # Thread finished; bump the completion counter in Redis
    redis_server.incr(REDIS_ERSHOUFANG_DONE_COUNT_KEY, 1)
def get_community_from_default_all():
    starttime = datetime.datetime.now()
    logging.info(
        'get_community_from_default_all: fetching community info from the '
        'full xiaoqu listing')
    url = BASE_URL + "xiaoqu/?from=rec"
    source_code = crawlcore.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    total_pages = crawlcore.get_total_pages(soup)

    if total_pages is None:
        raise RuntimeError('total_pages not found for url: %s' % url)

    for page in range(1, total_pages + 1):
        if page > 1:
            sub_url = BASE_URL + "xiaoqu/pg%s" % page
            source_code = crawlcore.get_source_code(sub_url)
            soup = BeautifulSoup(source_code, 'lxml')

        name_list = soup.findAll('li', {'class': 'clear xiaoquListItem'})
        logging.info('get_community_from_default_all page:%s total_page:%s' %
                     (page, total_pages))
        for name in name_list:
            info_dict = {}
            try:
                communitytitle = name.find("div", {"class": "title"})
                title = communitytitle.get_text().strip('\n')

                if title in Community_Key_Pool:
                    continue
                Community_Key_Pool.append(title)
                link = communitytitle.a.get('href')
                info_dict.update({'title': title})
                info_dict.update({'link': link})

                district = name.find("a", {"class": "district"})
                info_dict.update({'district': district.get_text()})

                bizcircle = name.find("a", {"class": "bizcircle"})
                info_dict.update({'bizcircle': bizcircle.get_text()})

                tagList = name.find("div", {"class": "tagList"})
                info_dict.update({'tagList': tagList.get_text().strip('\n')})

                onsale = name.find("a", {"class": "totalSellCount"})
                info_dict.update(
                    {'onsale': onsale.span.get_text().strip('\n')})

                onrent = name.find("a", {"title": title + u"租房"})
                info_dict.update(
                    {'onrent': onrent.get_text().strip('\n').split(u'套')[0]})

                info_dict.update({'communityid': name.get('data-housecode')})

                price = name.find("div", {"class": "totalPrice"})
                info_dict.update({'price': price.span.get_text().strip('\n')})

                info_dict.update({'version': setting.DB_VERSION})

                communityinfo = get_communityinfo_by_url(link)
                for key, value in communityinfo.items():
                    info_dict.update({key: value})
                with model.database.atomic():
                    model.Community.insert(info_dict).execute()
                time.sleep(1)
            except Exception:
                # Skip communities whose markup is missing expected fields
                continue
    endtime = datetime.datetime.now()
    logging.info("get_community_from_default_all Run time: " +
                 str(endtime - starttime) + " Total:" +
                 len(Community_Key_Pool))
def get_rent_perregion(link):
    url = BASE_URL + link
    source_code = crawlcore.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    total_pages = crawlcore.get_total_pages(soup)
    if total_pages is None:
        raise RuntimeError('total_pages not found for url: %s' % url)

    for page in range(1, total_pages + 1):
        if page > 1:
            url_page = url + u"/pg%d/" % page
            source_code = crawlcore.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        logging.info("GetRentByRegionlist district:%s page:%s totalpage:%s" %
                     (link, page, total_pages))
        data_source = []

        for ultag in soup.findAll("ul", {"class": "house-lst"}):
            for name in ultag.find_all('li'):

                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "info-panel"})
                    info_dict.update(
                        {'title': housetitle.h2.a.get_text().strip()})

                    info_dict.update({'link': housetitle.a.get("href")})

                    houseID = name.get("data-housecode")
                    info_dict.update({'houseID': houseID})

                    region = name.find("span", {"class": "region"})
                    info_dict.update({'region': region.get_text().strip()})

                    zone = name.find("span", {"class": "zone"})
                    info_dict.update({'zone': zone.get_text().strip()})

                    meters = name.find("span", {"class": "meters"})
                    info_dict.update({'meters': meters.get_text().strip()})

                    other = name.find("div", {"class": "con"})
                    info_dict.update({'other': other.get_text().strip()})

                    subway = name.find("span", {"class": "fang-subway-ex"})
                    if subway is None:
                        info_dict.update({'subway': ""})
                    else:
                        info_dict.update(
                            {'subway': subway.span.get_text().strip()})

                    decoration = name.find("span", {"class": "decoration-ex"})
                    if decoration is None:
                        info_dict.update({'decoration': ""})
                    else:
                        info_dict.update(
                            {'decoration': decoration.span.get_text().strip()})

                    heating = name.find("span", {"class": "heating-ex"})
                    if heating is None:
                        info_dict.update({'heating': ""})
                    else:
                        info_dict.update(
                            {'heating': heating.span.get_text().strip()})

                    price = name.find("div", {"class": "price"})
                    info_dict.update(
                        {'price': int(float(price.span.get_text().strip()))})

                    pricepre = name.find("div", {"class": "price-pre"})
                    info_dict.update({'pricepre': pricepre.get_text().strip()})

                    info_dict.update({'version': setting.DB_VERSION})

                except Exception:
                    # Skip listings whose markup is missing expected fields
                    continue
                # Rentinfo insert into mysql
                data_source.append(info_dict)

        with model.database.atomic():
            model.Rentinfo.insert_many(data_source).execute()
        time.sleep(1)
def get_rent_perregion(thread_name, part_no, link):
    url = BASE_URL + link
    source_code = crawlcore.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    total_pages = crawlcore.get_total_pages(soup)
    if total_pages is None:
        raise RuntimeError('total_pages not found for url: %s' % url)

    start_no = 0
    end_no = 0
    start_no = int(total_pages / REGION_THREAD_PAGE_NUMBER) * part_no + 1
    if part_no == REGION_THREAD_PAGE_NUMBER - 1:
        end_no = total_pages + 1
    else:
        end_no = int(
            total_pages / REGION_THREAD_PAGE_NUMBER) * (part_no + 1) + 1

    for page in range(start_no, end_no):
        if page > 1:
            url_page = url + u"/pg%d/" % page
            source_code = crawlcore.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        logging.info(
            "[%s]GetRentByRegionlist district:%s page:%s totalpage:%s" %
            (thread_name, link, page, total_pages))
        data_source = []

        for ultag in soup.findAll("ul", {"class": "house-lst"}):
            for name in ultag.find_all('li'):

                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "info-panel"})
                    info_dict.update(
                        {'title': housetitle.h2.a.get_text().strip()})

                    info_dict.update({'link': housetitle.a.get("href")})

                    houseID = name.get("data-housecode")
                    info_dict.update({'houseID': houseID})

                    region = name.find("span", {"class": "region"})
                    # info_dict.update({'region':region.get_text().strip()})
                    ObjectForNone(region, info_dict, 'region',
                                  'get_text().strip()', '')

                    zone = name.find("span", {"class": "zone"})
                    # info_dict.update({'zone':zone.get_text().strip()})
                    ObjectForNone(zone, info_dict, 'zone',
                                  'get_text().strip()', '')

                    meters = name.find("span", {"class": "meters"})
                    # info_dict.update({'meters':meters.get_text().strip()})
                    ObjectForNone(meters, info_dict, 'meters',
                                  'get_text().strip()', '')

                    other = name.find("div", {"class": "con"})
                    # info_dict.update({'other':other.get_text().strip()})
                    ObjectForNone(other, info_dict, 'other',
                                  'get_text().strip()', '')

                    subway = name.find("span", {"class": "fang-subway-ex"})
                    if subway is None:
                        info_dict.update({'subway': ""})
                    else:
                        info_dict.update(
                            {'subway': subway.span.get_text().strip()})

                    decoration = name.find("span", {"class": "decoration-ex"})
                    if decoration is None:
                        info_dict.update({'decoration': ""})
                    else:
                        info_dict.update(
                            {'decoration': decoration.span.get_text().strip()})

                    heating = name.find("span", {"class": "heating-ex"})
                    if heating is None:
                        info_dict.update({'heating': ""})
                    else:
                        info_dict.update(
                            {'heating': heating.span.get_text().strip()})

                    price = name.find("div", {"class": "price"})
                    info_dict.update(
                        {'price': int(float(price.span.get_text().strip()))})

                    pricepre = name.find("div", {"class": "price-pre"})
                    #info_dict.update({'pricepre':pricepre.get_text().strip()})
                    ObjectForNone(pricepre, info_dict, 'pricepre',
                                  'get_text().strip()', '0')

                    info_dict.update({'version': setting.DB_VERSION})

                except Exception:
                    # Skip listings whose markup is missing expected fields
                    continue
                # Rentinfo insert into mysql
                data_source.append(info_dict)

        with model.database.atomic():
            model.Rentinfo.insert_many(data_source).execute()
        time.sleep(1)

    # Thread finished; bump the completion counter in Redis
    redis_server.incr(REDIS_RENT_DONE_COUNT_KEY, 1)
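
# A minimal entry-point sketch, assuming the threaded variants above are the
# ones wired up. The function names come from this file; the sequencing is an
# assumption, not the project's real CLI, so it is left commented out:
#
#     if __name__ == '__main__':
#         GetHouseByRegionlist()   # second-hand listings, fanned out per region
#         GetRentByRegionlist()    # rentals, fanned out per region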