Exemple #1
0
def get_rent_perregion(district):
    """Scrape rental (zufang) listings of one district and bulk-insert them.

    Fetches the district index page, walks every result page (old 'd%d'
    pagination scheme), parses each info-panel entry into a dict and
    upserts the whole page batch inside a single transaction.

    :param district: district slug appended to the zufang URL.
    :raises RuntimeError: when the total page count cannot be determined.
    """
    url = BASE_URL + u"zufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):  # anti-crawler page detected -> give up silently
        return
    total_pages = misc.get_sh_total_pages(url)
    if total_pages is None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # Page 0 reuses the soup fetched above.
            url_page = BASE_URL + u"zufang/%s/d%d/" % (district, page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        log_progress("GetRentByRegionlist", district, page + 1, total_pages)
        data_source = []
        for name in soup.findAll("div", {"class": "info-panel"}):
            info_dict = {}
            try:
                info = name.find("a", {"name": "selectDetail"})
                info_dict.update({u'title': info.get('title')})
                info_dict.update({u'link': info.get('href')})
                info_dict.update({u'houseID': info.get('key')})

                where = name.find("div", {"class": "where"})
                wheres = where.find_all("span")
                info_dict.update({u'region': wheres[0].get_text().strip()})
                info_dict.update({u'zone': wheres[1].get_text().strip()})
                info_dict.update({u'meters': wheres[2].get_text().strip()})

                other = name.find("div", {"class": "con"})
                info_dict.update({u'other': "".join(other.get_text().split())})

                # Fields that do not exist in this listing layout.
                info_dict.update({u'subway': ""})
                info_dict.update({u'decoration': ""})
                info_dict.update({u'heating': ""})

                price = name.find("div", {"class": "price"})
                info_dict.update(
                    {u'price': int(price.span.get_text().strip())})

                pricepre = name.find("div", {"class": "price-pre"})
                info_dict.update(
                    {u'pricepre': "".join(pricepre.get_text().split())})
            except Exception:
                # Skip entries whose markup does not match expectations.
                continue
            data_source.append(info_dict)

        with model.database.atomic():
            model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)  # throttle between pages to avoid being blocked
Exemple #2
0
def get_rent_perregion(city, district):
    """Scrape rental listings of *district* in *city* (content__list layout).

    Region/zone come from the list entry; the remaining fields come from the
    detail page via get_detail_info.  Each page batch is upserted inside one
    transaction.

    :param city: lianjia city sub-domain, e.g. 'bj'.
    :param district: district slug.
    :raises RuntimeError: when the total page count cannot be determined.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"zufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):  # anti-crawler page detected
        return
    total_pages = misc.get_total_pages(url)
    print(total_pages)
    if total_pages is None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            url_page = baseUrl + u"zufang/%s/pg%d/" % (district, page)
            print(url_page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        log_progress("GetRentByRegionlist", district, page + 1, total_pages)
        data_source = []

        for ultag in soup.findAll("div", {"class": "content__list"}):
            for name in ultag.find_all('div'):
                info_dict = {}
                try:
                    housetitle = name.find(
                        "p", {"class": "content__list--item--title"})

                    regionZone = name.find(
                        "p", {"class": "content__list--item--des"})
                    region = regionZone.a.get_text().strip()
                    zone = regionZone.a.next_sibling.next_sibling.get_text(
                    ).strip()

                    # Detail page supplies most fields; overlay region/zone.
                    info_dict = get_detail_info(city, housetitle.a.get("href"))
                    info_dict.update({u'region': region})
                    info_dict.update({u'zone': zone})
                except Exception:
                    # Fixed: was a Python 2 print statement (SyntaxError on
                    # Python 3); message text unchanged.
                    print('traceback.format_exc():\n%s' %
                          traceback.format_exc())
                    continue
                data_source.append(info_dict)

        with model.database.atomic():
            if data_source:
                model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)  # throttle between pages
Exemple #3
0
def get_community_perregion(regionname=u'xicheng'):
    """Scrape community (xiaoqu) listings of one region into MySQL.

    Each parsed community row is upserted individually, with a 1s pause
    after every row to avoid being blocked.

    :param regionname: region slug, defaults to u'xicheng'.
    :raises RuntimeError: when the total page count cannot be determined.
    """
    url = BASE_URL + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages is None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # NOTE: pagination here is 1-based (pg2 for page index 1).
            url_page = BASE_URL + u"xiaoqu/" + regionname + "/pg%d/" % (
                page + 1, )
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        for name in soup.findAll("li", {"class": "clear"}):  # per community
            info_dict = {}
            try:
                communitytitle = name.find("div", {"class": "title"})
                info_dict.update(
                    {u'title': communitytitle.get_text().strip('\n')})
                info_dict.update({u'link': communitytitle.a.get('href')})

                district = name.find("a", {"class": "district"})
                info_dict.update({u'district': district.get_text()})

                bizcircle = name.find("a", {"class": "bizcircle"})
                info_dict.update({u'bizcircle': bizcircle.get_text()})

                tagList = name.find("div", {"class": "tagList"})
                info_dict.update({u'tagList': tagList.get_text().strip('\n')})

                onsale = name.find("a", {"class": "totalSellCount"})
                info_dict.update(
                    {u'onsale': onsale.span.get_text().strip('\n')})

                info_dict.update({u'id': name.get('data-housecode')})
            except Exception:
                # Skip rows whose markup does not match.
                continue
            # communityinfo insert into mysql
            model.Community.insert(**info_dict).upsert().execute()
            time.sleep(1)
Exemple #4
0
def get_house_by_url(url):
    """Fetch a house detail page and extract its labelled intro fields.

    Returns a dict keyed by the English field names below; returns None
    when the page is an anti-crawler block page.
    """
    soup = BeautifulSoup(misc.get_source_code(url), 'lxml')
    if check_block(soup):
        return

    field_names = {
        u'配备电梯': 'elevator',
        u'产权所属': 'propertytype',
        u'建筑结构': 'buildingstructure',
        u'建筑类型': 'buildingtype',
        u'梯户比例': 'elevatorratio',
        u'交易权属': 'transactionownership'
    }
    result = {}
    intro = soup.find(id="introduction")
    for label in intro.find_all(has_span_with_class_label):
        spans = label.find_all("span")
        if not spans:
            continue
        key = spans[0].string.strip()
        # With two spans the value lives in the second span; otherwise
        # fall back to the label's full text.
        if len(spans) > 1:
            value = spans[1].string.strip()
        else:
            value = label.get_text()
        if key in field_names:
            result[field_names[key]] = value
    return result
Exemple #5
0
def getSeltInfoLastMonth(url):
    """Scrape last-month deal details from a sold-house page.

    Returns a dict with total/unit price, deal date, transaction cycle,
    visit/follower/page-view counters and basic building attributes.

    NOTE(review): parsing relies on fixed element positions (CSS selector
    label indexes, character offsets like [2:-3] and [4:]); it breaks
    silently if the page layout changes -- confirm against a live page.
    """
    html = misc.get_source_code(url)
    soup = BeautifulSoup(html, 'lxml')

    info_dict = {}

    # Header: total price (drop trailing unit char), unit price and date.
    totalPrice = soup.find('span', class_='record_price').get_text()[:-1]
    unitPrice = soup.find(
        'p', class_='record_detail').get_text().split(',')[0][2:-3]
    dealdate = soup.find('p', class_='record_detail').get_text().split(',')[2]

    info_dict.update({'totalPrice': totalPrice})
    info_dict.update({'unitPrice': unitPrice})
    info_dict.update({'dealdate': dealdate})

    # Statistics labels; indexes 0 (listing price) and 2 (price
    # adjustments) are intentionally unused.
    msgSplit = soup.select(
        'body > section.wrapper > div.overview > div.info.fr > div.msg > span > label'
    )
    info_dict.update({'transactionCycle': msgSplit[1].get_text()})
    info_dict.update({'numberOfVisits': msgSplit[3].get_text()})
    info_dict.update({'followers': msgSplit[4].get_text()})
    info_dict.update({'pageView': msgSplit[5].get_text()})

    # Building attributes: [4:] strips the Chinese label prefix.
    baseInfo = soup.find('div', class_='content').findAll('li')
    info_dict.update({'buildyears': baseInfo[7].get_text()[4:].strip()})
    info_dict.update({'warmStyle': baseInfo[10].get_text()[4:].strip()})
    info_dict.update({'propertyRight': baseInfo[12].get_text()[4:].strip()})

    return info_dict
Exemple #6
0
def get_communityinfo_by_url(url):
    """Scrape a lianjia community (xiaoqu) page into a field dict.

    Returns a dict mapping English field names (year, housetype, cost,
    service, company, building_num, house_num) to their text values; an
    empty dict when the page is blocked; None after an unexpected error
    (which is printed with a traceback).
    """
    try:
        source_code = misc.get_source_code(url)
        soup = BeautifulSoup(source_code, 'lxml')

        res = {}
        if check_block(soup):
            return res

        # Chinese label -> English column name; built once, not per item.
        key_type = {
            u"建筑年代": u'year',
            u"建筑类型": u'housetype',
            u"物业费用": u'cost',
            u"物业公司": u'service',
            u"开发商": u'company',
            u"楼栋总数": u'building_num',
            u"房屋总数": u'house_num',
        }
        for info in soup.findAll("div", {"class": "xiaoquInfoItem"}):
            try:
                key = info.find("span", {"xiaoquInfoLabel"})
                value = info.find("span", {"xiaoquInfoContent"})
                key_info = key_type[key.get_text().strip()]
                value_info = value.get_text().strip()
                res.update({key_info: value_info})
            except Exception:
                # Unmapped labels (e.g. u"附近门店") are skipped on purpose.
                continue
        return res
    except Exception as e:
        print(e, traceback.print_exc())
Exemple #7
0
def get_communityinfo_by_url(url):
    """Scrape a community (xiaoqu) page into a field dict.

    Returns a dict mapping English field names to their text values, or
    None when the page is an anti-crawler block page.
    """
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return

    # Chinese label -> English column name; hoisted out of the loop
    # (previously rebuilt on every iteration).
    key_type = {
        u"建筑年代": u'year',
        u"建筑类型": u'housetype',
        u"物业费用": u'cost',
        u"物业公司": u'service',
        u"开发商": u'company',
        u"楼栋总数": u'building_num',
        u"房屋总数": u'house_num',
    }
    res = {}
    for info in soup.findAll("div", {"class": "xiaoquInfoItem"}):
        try:
            key = info.find("span", {"xiaoquInfoLabel"})
            value = info.find("span", {"xiaoquInfoContent"})
            key_info = key_type[key.get_text().strip()]
            value_info = value.get_text().strip()
            res.update({key_info: value_info})
        except Exception:
            # Labels outside key_type are skipped on purpose.
            continue
    return res
Exemple #8
0
def get_communityinfo_by_url(url):
    """Scrape the col-2 info panel of a community page (BASE_URL-relative).

    Returns a dict with housetype/year/cost/service/company fields;
    panels whose markup does not match are skipped.  Returns None when
    the page is blocked.
    """
    source_code = misc.get_source_code(BASE_URL + url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return

    res = {}
    for info in soup.findAll("div", {"class": "col-2 clearfix"}):
        try:
            infos = info.findAll("li")
            housetype = infos[0].find("span", {"class": "other"})
            year = infos[1].find("span", {"class": "other"})
            cost = infos[2].find("span", {"class": "other"})
            # service/company: keep only the direct text node of the span,
            # not nested children.
            service = infos[3].span.find(text=True, recursive=False)
            company = infos[4].span.find(text=True, recursive=False)
            res.update({'housetype': housetype.get_text().strip()})
            res.update({'year': year.get_text().strip()})
            res.update({'cost': cost.get_text().strip()})
            res.update({'service': service.strip()})
            res.update({'company': company.strip()})
        except Exception:
            continue
    return res
Exemple #9
0
def GetSellByCommunitylist():
	"""Scrape 5i5j sold-house records for each community listed in community_id.txt.

	Each line of the file is '<code> <name>'; every result page of
	https://bj.5i5j.com/sold/<code> is parsed and the rows bulk-upserted
	into Sellinfo, best-effort (insert errors are swallowed).

	NOTE(review): indentation in this block mixes tabs and spaces; the
	final 'with model.database.atomic()' / logging / sleep lines appear to
	sit outside the per-line loop level -- confirm the intended scope
	before reformatting.
	"""
	with open('community_id.txt') as f:
	    for line in f.readlines():
	    	data_source = []
	    	code = line.split(' ')[0]
	    	communityinfo = line.split(' ')[1]
	    	pages = get_totalpage("https://bj.5i5j.com/sold/%s" % code)
	    	for page in range(1,pages+1):
		    	source_code = misc.get_source_code("https://bj.5i5j.com/sold/%s/n%d/" % (code,page))
		    	soup = BeautifulSoup(source_code, 'lxml')
		    	content = soup.find('ul', class_="pList zu")
		    	try:
		    		lists = content.find_all('li')
		    	except:
		    		continue

		    	for each in lists:
		    		info_dict = {}
		    		sTit = each.find("p", {"class":"sTit"})
		    		title = sTit.strong.get_text().strip()
		    		community = title.split(' ')[0]
		    		listCon = each.find("div", {"class":"listCon"})
		    		plist = listCon.find_all("p")
		    		housetype = plist[1].get_text().strip().split(u'·')[0]
		    		square = plist[1].get_text().strip().split(u'·')[1]
		    		direction = plist[1].get_text().strip().split(u'·')[2]
		    		dealdate = plist[2].get_text().strip().split(u':')[1]
		    		jiage = each.find("div", {"class":"jiage"})
		    		totalPrice = jiage.strong.get_text().strip()
		    		unitPrice = find_between_r(jiage.p.get_text().strip(),u'价',u'元')
		    		source=u"我爱我家"
		    		status=u"暂无信息"
		    		floor=u"暂无信息"
		    		years=u"暂无信息"
		    		link = "https://bj.5i5j.com%s" % each.a.get("href") 
		    		houseID = "5i5j%s" % find_between_r(each.a.get("href"),'/','.')
		    		info_dict.update({u'title':title})
		    		info_dict.update({u'houseID':houseID})
		    		info_dict.update({u'link':link})
		    		info_dict.update({u'community':community})
		    		info_dict.update({u'years':years})
		    		info_dict.update({u'housetype':housetype})
		    		info_dict.update({u'square':square})
		    		info_dict.update({u'direction':direction})
		    		info_dict.update({u'floor':floor})
		    		info_dict.update({u'status':status})
		    		info_dict.update({u'source':source})
		    		info_dict.update({u'totalPrice':totalPrice })
		    		info_dict.update({u'unitPrice':unitPrice })
		    		info_dict.update({u'dealdate':dealdate })
		    		data_source.append(info_dict)

	        with model.database.atomic():
	        	try:
	        		model.Sellinfo.insert_many(data_source).upsert().execute()
	        	except:
	        		pass
	        logging.info("%s finish" % communityinfo)
	        time.sleep(1)
Exemple #10
0
def get_totalpage(url):
    """Return the number of result pages for a 5i5j listing URL.

    Falls back to 1 when the pagination widget ('pageSty rf') is absent,
    i.e. the results fit on a single page.
    """
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    info = soup.find("div", {"class": "pageSty rf"})
    if info is None:
        return 1
    alist = info.find_all("a")
    # The second anchor of the pagination widget holds the last page number.
    page = int(alist[1].get_text().strip())
    return page
Exemple #11
0
def get_district_of_city(city):
    """Return the district slugs of *city*, parsed from its chengjiao page.

    Returns None (after printing nothing) when the page is blocked.
    """
    logging.info("Get District Infomation")
    url = "https://%s.lianjia.com/chengjiao/" % (city)
    soup = BeautifulSoup(misc.get_source_code(url), 'lxml')
    if check_block(soup):
        return
    wrapper = soup.find_all("div", {"data-role": "ershoufang"})[0]
    # The slug is the third path segment of each anchor's href.
    districts = [anchor.get("href").split('/')[2]
                 for anchor in wrapper.find_all("a")]
    print(districts)
    return districts
Exemple #12
0
def getRentInfoFromCommunity(search_community, rid, hid):
    """Fetch same-community deal records via the housestat JSON endpoint.

    Upserts each record as a Rentinfo row (decoration field marked with
    u'同小区成交记录' to flag its origin), then appends *rid* to
    *search_community* and returns the mutated list.  Communities already
    in the list are skipped up front.

    NOTE(review): network/JSON/DB errors are logged and swallowed, yet rid
    is still marked as searched -- presumably intentional best-effort.
    """
    if rid in search_community:
        return search_community

    url = 'https://bj.lianjia.com/zufang/housestat?hid=%s&rid=%s' % (hid, rid)

    try:
        source_code = misc.get_source_code(url)
        json_obj = json.loads(source_code)

        rentData = json_obj['data']['resblockSold']
        data_source = []

        logging.info("Progress: %s: %s" % ('getRentInfoFromCommunity', url))

        for rentInfo in rentData:
            info_dict = {}

            info_dict.update({
                u'title':
                rentInfo['resblockName'] + u' ' + rentInfo['title']
            })
            info_dict.update({u'link': rentInfo['house_url']})
            info_dict.update({u'houseID': rentInfo['houseId']})
            info_dict.update({u'regionid': rid})
            info_dict.update({u'region': rentInfo['resblockName']})
            info_dict.update({u'zone': rentInfo['title']})
            info_dict.update({u'meters': rentInfo['area']})
            info_dict.update({
                u'other':
                rentInfo['floor'] + '/' + rentInfo['totalFloor'] + u'层 ' +
                rentInfo['orientation'] + ' ' + rentInfo['decoration']
            })
            info_dict.update({u'subway': ''})
            info_dict.update({u'decoration': u'同小区成交记录'})
            info_dict.update({u'heating': ''})
            info_dict.update({u'price': rentInfo['price']})
            info_dict.update({u'pricepre': rentInfo['transDate'] + u' 成交'})

            data_source.append(info_dict)

        with model.database.atomic():
            model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)

    except Exception as e:
        logging.error(e)

    search_community.append(rid)
    return search_community
Exemple #13
0
def get_house_detail(url_page):
    """Scrape the 'introduction' ul/li pairs of a house page into a dict.

    Keys come from the module-level translate_1 mapping applied to each
    li's first span text; values are the remaining li text as UTF-8 bytes.
    Returns None when the page is blocked.

    NOTE(review): a label missing from translate_1 raises KeyError here.
    """

    ret = {}
    source_code = misc.get_source_code(url_page)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    intro_wrapper = soup.find(id='introduction')
    for ultag in intro_wrapper.findAll('ul'):
        for litag in ultag.find_all('li'):
            key = litag.find_all('span')[0]
            ret_key = translate_1[key.text]
            # print(ret_key, ret_key in translate_1)
            # Value = the li text with the leading label text removed.
            ret[ret_key] = litag.text[len(key.text):].strip().encode("utf-8")
    # print(ret)
    return ret
Exemple #14
0
def get_subregion_of_city(city):
    """Collect the sub-district slugs of every district in *city*.

    Returns the sorted slug list, or None when a page is blocked.
    """

    districts = get_district_of_city(city)
    subregions = []
    for district in districts:
        logging.info("Get Sub-District Infomation %s" % (district))
        url = "https://%s.lianjia.com/xiaoqu/%s/" % (city, district)
        soup = BeautifulSoup(misc.get_source_code(url), 'lxml')
        if check_block(soup):
            return
        wrapper = soup.find_all("div", {"data-role": "ershoufang"})[0]
        for anchor in wrapper.find_all("a"):
            slug = anchor.get("href").split('/')[2]
            # Top-level districts also appear as links; keep only subs.
            if slug not in districts:
                subregions.append(slug)
        print(subregions)
    return sorted(subregions)
Exemple #15
0
def get_img():
    """Download the first thumbnail URL of each known house into House_img.

    Iterates every Houseinfo row, scrapes its listing page and inserts
    the first thumbnail's data-src; insert errors are printed and the
    row skipped.
    """
    for data in model.Houseinfo.select(model.Houseinfo.houseID,
                                       model.Houseinfo.link):
        # Fixed: Python 2 print statements converted to print() calls.
        print(data.link)
        source_code = misc.get_source_code(data.link)
        soup = BeautifulSoup(source_code, 'html.parser')
        li = soup.select("#thumbnail2 li")
        if len(li) > 0:
            img = li[0].get('data-src')
            print(img)
            try:
                model.House_img.insert({
                    model.House_img.house_id: data.houseID,
                    model.House_img.img: img
                }).execute()
            except Exception as e:
                print(e)
                continue
Exemple #16
0
def getRegionByArea(areaList):
    """Collect region slugs from each area's ershoufang sub-navigation.

    :param areaList: iterable of area slugs.
    :return: list of region slugs, or None when a page is blocked.
    """
    regionList = []

    for area in areaList:
        try:
            url = BASE_URL + u"ershoufang/%s/" % area
            source_code = misc.get_source_code(url)
            soup = BeautifulSoup(source_code, 'lxml')
            if check_block(soup):
                return
            for regionInfo in soup.find('div', {
                    "class": "sub_sub_nav"
            }).find_all('a'):
                # Slug is whatever follows '/ershoufang/', trailing '/' off.
                region = regionInfo.get('href').strip().split(
                    '/ershoufang/')[1].rstrip('/')
                regionList.append(region)
        except Exception as e:
            # Best-effort: log and continue with the next area.
            logging.error(e)
    return regionList
Exemple #17
0
def get_sell_perhouseID(houseID):
    """Scrape one house's deal record and update its Monthsellinfo row.

    Parses total price, unit price and deal date from the chengjiao page's
    record_list, then applies them with a single UPDATE inside a
    transaction.  All failures are logged.
    """
    url_page = BASE_URL + u"chengjiao/" + houseID + ".html"
    source_code = misc.get_source_code(url_page)
    soup = BeautifulSoup(source_code, 'lxml')
    log_progress("GetSellByHouseID", houseID, 1, 1)
    info_dict = {}
    for name in soup.findAll("ul", {"class": "record_list"}):
        try:
            totalPrice = name.find("span", {"class": "record_price"})
            # Some layouts nest the figure in an inner span.
            if totalPrice.span is None:
                totalPrice = totalPrice.get_text().strip().split(u'万')
            else:
                totalPrice = totalPrice.span.get_text().strip().split(u'万')

            info_dict.update({u'totalPrice': totalPrice[0]})

            detail = name.find("p", {"class": "record_detail"}).get_text().split(',')
            info_dict.update({u'unitPrice': detail[0].replace(u'单价', '').replace(u'元/平', '')})
            info_dict.update({u'dealdate': detail[1].replace('.', '-')})
        except Exception as e:
            logging.error(e)
            # Fixed: 'name' is a bs4 Tag; concatenating it directly with a
            # str raised TypeError here and masked the original error.
            logging.info("name:" + str(name) + "Fail")
            continue
    try:
        with model.database.atomic():
            model.Monthsellinfo.update(
                totalPrice=info_dict[u'totalPrice'],
                unitPrice=info_dict[u'unitPrice'],
                dealdate=info_dict[u'dealdate']).where(
                    model.Monthsellinfo.houseID == houseID).execute()
        time.sleep(1)
    except Exception as e:
        logging.error(e)
        logging.info(houseID + "house info Fail")
Exemple #18
0
def get_communityinfo_by_url(url):
    """Scrape a community page into a dict with a guaranteed schema.

    Unlike the other variants, every expected English key is present in
    the result: fields missing from the page are filled with None.
    Returns None when the page is blocked.
    """
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return

    # Chinese label -> English column name.
    key_type = {
        u"建筑年代": u'year',
        u"建筑类型": u'housetype',
        u"物业费用": u'cost',
        u"物业公司": u'service',
        u"开发商": u'company',
        u"楼栋总数": u'building_num',
        u"房屋总数": u'house_num',
    }
    res = {}
    for info in soup.findAll("div", {"class": "xiaoquInfoItem"}):
        try:
            key = info.find("span", {"xiaoquInfoLabel"})
            value = info.find("span", {"xiaoquInfoContent"})
            key_chinese = key.get_text().strip()
            if key_chinese not in key_type:
                # Unmapped labels are ignored on purpose.
                continue
            res[key_type[key_chinese]] = value.get_text().strip()
        except Exception:
            logging.exception(info)
            continue
    # Guarantee a complete schema: fill unseen fields with None.
    for english in key_type.values():
        if english not in res:
            res[english] = None
    return res
Exemple #19
0
def get_sellInfo_by_url(url):
    """Scrape the listing statistics ('msg' box) of a sold-house page.

    Returns a dict with listing_price, cycle, adjust_num, view_num,
    attention_num and browse_num (empty strings when the box is absent),
    or None when the page is blocked.
    """
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return

    # Label position (1-based) -> result key.
    key_type = {
        1: u'listing_price',
        2: u'cycle',
        3: u'adjust_num',
        4: u'view_num',
        5: u'attention_num',
        6: u'browse_num',
    }
    res = {}
    sellInfoMsg = soup.find("div", {"class": "msg"})
    # Bug fix: soup.find returns None (never []) when the box is missing,
    # and the empty-value branch previously referenced six undefined bare
    # names (cycle, listing_price, ...) -> NameError at runtime.
    if sellInfoMsg is None:
        for field in key_type.values():
            res[field] = ''
    else:
        num = 0
        for info in sellInfoMsg.findAll("label"):
            try:
                num += 1
                res[key_type[num]] = info.get_text().strip()
            except Exception:
                # Extra labels beyond the six known slots are ignored.
                continue
    return res
Exemple #20
0
def get_house_perregion(district):
    """Scrape ershoufang (resale) listings of one district into MySQL.

    Walks every result page, parses each li entry, then bulk-upserts the
    page batch into Houseinfo and the (houseID, totalPrice) pairs into
    Hisprice inside one transaction.

    :param district: district slug appended to the ershoufang URL.
    :raises RuntimeError: when the total page count cannot be determined.
    """
    url = BASE_URL + u"ershoufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages == None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            url_page = BASE_URL + u"ershoufang/%s/pg%d/" % (district, page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetHouseByRegionlist", district, page + 1, total_pages)
        data_source = []
        hisprice_data_source = []
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            for name in ultag.find_all('li'):
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update(
                        {u'title': housetitle.a.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    # NOTE(review): 'data_analysis-housecode' looks mangled;
                    # lianjia list pages normally use 'data-housecode' --
                    # confirm against live markup.
                    houseID = housetitle.a.get('data_analysis-housecode')
                    info_dict.update({u'houseID': houseID})

                    # Field separator differs between Beijing and other
                    # cities' list layouts.
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    if CITY == 'bj':
                        info = houseinfo.get_text().split('/')
                    else:
                        info = houseinfo.get_text().split('|')
                    info_dict.update({u'community': info[0]})
                    info_dict.update({u'housetype': info[1]})
                    info_dict.update({u'square': info[2]})
                    info_dict.update({u'direction': info[3]})
                    info_dict.update({u'decoration': info[4]})

                    # years and floor both store the full positionInfo text.
                    housefloor = name.find("div", {"class": "positionInfo"})
                    info_dict.update({u'years': housefloor.get_text().strip()})
                    info_dict.update({u'floor': housefloor.get_text().strip()})

                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict.update(
                        {u'followInfo': followInfo.get_text().strip()})

                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree == None:
                        info_dict.update({u"taxtype": ""})
                    else:
                        info_dict.update(
                            {u"taxtype": taxfree.get_text().strip()})

                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict.update(
                        {u'totalPrice': totalPrice.span.get_text()})

                    # NOTE(review): same suspicion -- likely 'data-price'.
                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict.update(
                        {u'unitPrice': unitPrice.get("data_analysis-price")})
                except:
                    continue

                # Houseinfo insert into mysql
                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID":
                    info_dict["houseID"],
                    "totalPrice":
                    info_dict["totalPrice"]
                })
                # model.Houseinfo.insert(**info_dict).upsert().execute()
                # model.Hisprice.insert(houseID=info_dict['houseID'], totalPrice=info_dict['totalPrice']).upsert().execute()

        with model.database.atomic():
            model.Houseinfo.insert_many(data_source).upsert().execute()
            model.Hisprice.insert_many(hisprice_data_source).upsert().execute()
        time.sleep(1)
Exemple #21
0
def get_rent_perregion(district):
    """Scrape rental listings (house-lst layout) of one district into MySQL.

    Walks every result page, parses each li entry and bulk-upserts the page
    batch inside one transaction.

    :param district: district slug appended to the zufang URL.
    :raises RuntimeError: when the total page count cannot be determined.
    """
    url = BASE_URL + u"zufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            url_page = BASE_URL + u"zufang/%s/pg%d/" % (district, page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        log_progress("GetRentByRegionlist", district, page + 1, total_pages)
        data_source = []
        for ultag in soup.findAll("ul", {"class": "house-lst"}):
            for name in ultag.find_all('li'):
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "info-panel"})
                    info_dict.update(
                        {u'title': housetitle.h2.a.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get("href")})
                    # NOTE(review): 'data_analysis-housecode' looks mangled;
                    # lianjia markup normally uses 'data-housecode'.
                    houseID = name.get("data_analysis-housecode")
                    info_dict.update({u'houseID': houseID})

                    region = name.find("span", {"class": "region"})
                    info_dict.update({u'region': region.get_text().strip()})

                    zone = name.find("span", {"class": "zone"})
                    info_dict.update({u'zone': zone.get_text().strip()})

                    meters = name.find("span", {"class": "meters"})
                    info_dict.update({u'meters': meters.get_text().strip()})

                    other = name.find("div", {"class": "con"})
                    info_dict.update({u'other': other.get_text().strip()})

                    subway = name.find("span", {"class": "fang-subway-ex"})
                    if subway is None:
                        info_dict.update({u'subway': ""})
                    else:
                        info_dict.update(
                            {u'subway': subway.span.get_text().strip()})

                    decoration = name.find("span", {"class": "decoration-ex"})
                    if decoration is None:
                        info_dict.update({u'decoration': ""})
                    else:
                        info_dict.update({
                            u'decoration':
                            decoration.span.get_text().strip()
                        })

                    heating = name.find("span", {"class": "heating-ex"})
                    # Bug fix: this branch previously re-tested 'decoration'
                    # instead of 'heating', so an entry with one span present
                    # and the other absent hit AttributeError and was dropped
                    # by the except below.
                    if heating is None:
                        info_dict.update({u'heating': ""})
                    else:
                        info_dict.update(
                            {u'heating': heating.span.get_text().strip()})

                    price = name.find("div", {"class": "price"})
                    info_dict.update(
                        {u'price': int(price.span.get_text().strip())})

                    pricepre = name.find("div", {"class": "price-pre"})
                    info_dict.update(
                        {u'pricepre': pricepre.get_text().strip()})
                except Exception:
                    # Skip entries whose markup does not match.
                    continue
                data_source.append(info_dict)

        with model.database.atomic():
            model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)  # throttle between pages
Exemple #22
0
def get_sell_percommunity(communityname):
    """Crawl completed-sale ("chengjiao") records for one community and
    bulk-insert them into the Sellinfo table.

    communityname -- community name (unicode); URL-quoted into the search URL.

    Raises RuntimeError when the pager cannot be parsed (total_pages is None),
    reporting how many rows were stored so far.  Returns None; the side effect
    is database insertion via model.Sellinfo.
    """
    url = BASE_URL + u"chengjiao/rs" + urllib2.quote(
        communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    # Stop quietly if the site served an anti-crawler block page.
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages == None:
        row = model.Sellinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        # Page 0 reuses the soup fetched above; later pages hit the pg<N> URL.
        if page > 0:
            url_page = BASE_URL + u"chengjiao/pg%drs%s/" % (
                page, urllib2.quote(communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetSellByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        for ultag in soup.findAll("ul", {"class": "listContent"}):
            for name in ultag.find_all('li'):  # one <li> per deal card
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update({u'title': housetitle.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    # houseID is the basename of the detail URL (".../<id>.html").
                    houseID = housetitle.a.get('href').split("/")[-1].split(
                        ".")[0]
                    info_dict.update({u'houseID': houseID.strip()})

                    # Title text is "community housetype square" space-separated.
                    house = housetitle.get_text().strip().split(' ')
                    info_dict.update({u'community': house[0].strip()})
                    info_dict.update({u'housetype': house[1].strip()})
                    info_dict.update({u'square': house[2].strip()})

                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('|')
                    info_dict.update({u'direction': info[0].strip()})
                    info_dict.update({u'status': info[1].strip()})

                    housefloor = name.find("div", {"class": "positionInfo"})
                    floor_all = housefloor.get_text().strip().split(' ')
                    info_dict.update({u'floor': floor_all[0].strip()})
                    info_dict.update({u'years': floor_all[-1].strip()})

                    followInfo = name.find("div", {"class": "source"})
                    info_dict.update(
                        {u'source': followInfo.get_text().strip()})

                    # Price may live in a nested <span> or directly in the div.
                    totalPrice = name.find("div", {"class": "totalPrice"})
                    if totalPrice.span is None:
                        info_dict.update(
                            {u'totalPrice': totalPrice.get_text().strip()})
                    else:
                        info_dict.update({
                            u'totalPrice':
                            totalPrice.span.get_text().strip()
                        })

                    unitPrice = name.find("div", {"class": "unitPrice"})
                    if unitPrice.span is None:
                        info_dict.update(
                            {u'unitPrice': unitPrice.get_text().strip()})
                    else:
                        info_dict.update(
                            {u'unitPrice': unitPrice.span.get_text().strip()})

                    # Normalize "2017.01.01" style dates to "2017-01-01".
                    dealDate = name.find("div", {"class": "dealDate"})
                    info_dict.update({
                        u'dealdate':
                        dealDate.get_text().strip().replace('.', '-')
                    })

                except:
                    # Malformed card: skip this listing, keep the page going.
                    continue
                # Sellinfo insert into mysql
                data_source.append(info_dict)
                # model.Sellinfo.insert(**info_dict).upsert().execute()

        # Flush the whole page in one transaction, then throttle.
        with model.database.atomic():
            model.Sellinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
Exemple #23
0
def get_community_perregion(regionname):
    """Crawl the community ("xiaoqu") index for one region and bulk-insert
    the results into the Community table.

    regionname -- region slug appended to BASE_URL (e.g. u'pudong').

    Each community's detail page is fetched too (get_communityinfo_by_url)
    and its fields are merged into the row.  Raises RuntimeError when the
    pager cannot be parsed (total_pages is None).
    """
    url = BASE_URL + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    # Stop quietly if the site served an anti-crawler block page.
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages == None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        # Page 0 reuses the soup fetched above; later pages hit the pg<N> URL.
        if page > 0:
            url_page = BASE_URL + u"xiaoqu/" + regionname + "/pg%d/" % page
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        nameList = soup.findAll("li", {"class": "clear"})
        i = 0
        log_progress("GetCommunityByRegionlist", regionname, page + 1,
                     total_pages)
        data_source = []
        for name in nameList:  # Per house loop
            i = i + 1
            info_dict = {}
            try:
                communitytitle = name.find("div", {"class": "title"})
                title = communitytitle.get_text().strip('\n')
                link = communitytitle.a.get('href')
                info_dict.update({u'title': title})
                info_dict.update({u'link': link})

                district = name.find("a", {"class": "district"})
                info_dict.update({u'district': district.get_text()})

                bizcircle = name.find("a", {"class": "bizcircle"})
                info_dict.update({u'bizcircle': bizcircle.get_text()})

                tagList = name.find("div", {"class": "tagList"})
                info_dict.update({u'tagList': tagList.get_text().strip('\n')})

                onsale = name.find("a", {"class": "totalSellCount"})
                info_dict.update(
                    {u'onsale': onsale.span.get_text().strip('\n')})

                # The rent anchor is located by its title "<community>租房".
                onrent = name.find("a", {"title": title + u"租房"})
                info_dict.update(
                    {u'onrent': onrent.get_text().strip('\n').split(u'套')[0]})

                # NOTE(review): other variants read 'data-housecode';
                # confirm 'data_analysis-housecode' is really the attribute
                # this page emits, otherwise id will always be None.
                info_dict.update({u'id': name.get('data_analysis-housecode')})

                price = name.find("div", {"class": "totalPrice"})
                info_dict.update({u'price': price.span.get_text().strip('\n')})

                # Merge extra fields from the community detail page.
                # iteritems() is Python-2-only, matching this variant's urllib2.
                communityinfo = get_communityinfo_by_url(link)
                for key, value in communityinfo.iteritems():
                    info_dict.update({key: value})

            except:
                # Malformed card: skip this community, keep the page going.
                continue
            # communityinfo insert into mysql
            data_source.append(info_dict)
            # model.Community.insert(**info_dict).upsert().execute()

        # Flush the whole page in one transaction, then throttle.
        with model.database.atomic():
            model.Community.insert_many(data_source).upsert().execute()
        time.sleep(1)
Exemple #24
0
def get_rent_percommunity(city, communityname):
    """Crawl rental ("zufang") listings for one community on
    http://<city>.lianjia.com and bulk-insert them into Rentinfo.

    city          -- lianjia city subdomain, e.g. u'bj'.
    communityname -- community name (unicode); URL-quoted into the search URL.

    Raises RuntimeError when the pager cannot be parsed (total_pages is None).
    Returns None; the side effect is database insertion via model.Rentinfo.
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"zufang/rs" + \
        urllib.parse.quote(communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    # Stop quietly if the site served an anti-crawler block page.
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages is None:
        row = model.Rentinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        if page > 0:
            # BUGFIX: pages after the first previously used u"rent/..." while
            # page 0 used u"zufang/..."; use the same listing path throughout.
            url_page = baseUrl + \
                u"zufang/pg%drs%s/" % (page,
                                       urllib.parse.quote(communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetRentByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        for ultag in soup.findAll("ul", {"class": "house-lst"}):
            for name in ultag.find_all('li'):  # one <li> per listing card
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "info-panel"})
                    info_dict.update({u'title': housetitle.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    # houseID is the basename of the detail URL (".../<id>.html").
                    houseID = housetitle.a.get('href').split("/")[-1].split(
                        ".")[0]
                    info_dict.update({u'houseID': houseID})

                    region = name.find("span", {"class": "region"})
                    info_dict.update({u'region': region.get_text().strip()})

                    zone = name.find("span", {"class": "zone"})
                    info_dict.update({u'zone': zone.get_text().strip()})

                    meters = name.find("span", {"class": "meters"})
                    info_dict.update({u'meters': meters.get_text().strip()})

                    other = name.find("div", {"class": "con"})
                    info_dict.update({u'other': other.get_text().strip()})

                    # Optional tags below fall back to "" when absent.
                    subway = name.find("span", {"class": "fang-subway-ex"})
                    if subway is None:
                        info_dict.update({u'subway': ""})
                    else:
                        info_dict.update(
                            {u'subway': subway.span.get_text().strip()})

                    decoration = name.find("span", {"class": "decoration-ex"})
                    if decoration is None:
                        info_dict.update({u'decoration': ""})
                    else:
                        info_dict.update({
                            u'decoration':
                            decoration.span.get_text().strip()
                        })

                    # BUGFIX: heating was dereferenced unconditionally, so any
                    # listing without the heating tag raised AttributeError and
                    # was silently dropped by the bare except below.  Guard it
                    # like the other optional tags.
                    heating = name.find("span", {"class": "heating-ex"})
                    if heating is None:
                        info_dict.update({u'heating': ""})
                    else:
                        info_dict.update(
                            {u'heating': heating.span.get_text().strip()})

                    price = name.find("div", {"class": "price"})
                    info_dict.update(
                        {u'price': int(price.span.get_text().strip())})

                    pricepre = name.find("div", {"class": "price-pre"})
                    info_dict.update(
                        {u'pricepre': pricepre.get_text().strip()})

                except:
                    # Malformed card: skip this listing, keep the page going.
                    continue
                # Rentinfo insert into mysql
                data_source.append(info_dict)
                # model.Rentinfo.insert(**info_dict).upsert().execute()

        # Flush the whole page in one transaction, then throttle.
        with model.database.atomic():
            if data_source:
                model.Rentinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
Exemple #25
0
def get_house_perregion(city, district):
    """Crawl second-hand ("ershoufang") listings for one district on
    http://<city>.lianjia.com, insert them into Houseinfo and append each
    observed price to the Hisprice history table.

    city     -- lianjia city subdomain, e.g. u'bj'.
    district -- district slug used in the listing URL.

    Listings are de-duplicated by houseID across all pages of this call.
    Raises RuntimeError when the pager cannot be parsed (total_pages is None).
    """
    baseUrl = u"http://%s.lianjia.com/" % (city)
    url = baseUrl + u"ershoufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    # Stop quietly if the site served an anti-crawler block page.
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    #total_pages = 10
    # houseIDs already collected in this run, for cross-page de-duplication.
    house_ids = set()
    for page in range(total_pages):
        # Page 0 reuses the soup fetched above; later pages hit the pg<N> URL.
        if page > 0:
            url_page = baseUrl + u"ershoufang/%s/pg%d/" % (district, page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetHouseByRegionlist", district, page + 1, total_pages)
        data_source = []
        hisprice_data_source = []
        ultags = soup.findAll("ul", {"class": "sellListContent"})
        for ultag in ultags:
            for name in ultag.find_all('li'):  # one <li> per listing card
                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update(
                        {u'title': housetitle.a.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    houseID = housetitle.a.get('data-housecode')
                    # De-duplicate by houseID.
                    if houseID in house_ids:
                        continue
                    info_dict.update({u'houseID': houseID})

                    # houseInfo text is '|'-separated:
                    # type | square | direction | decoration | floor | years
                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('|')
                    #info_dict.update({u'community': info[0]})
                    info_dict.update({u'housetype': info[0]})
                    info_dict.update({u'square': info[1]})
                    info_dict.update({u'direction': info[2]})
                    info_dict.update({u'decoration': info[3]})
                    info_dict.update({u'floor': info[4]})
                    info_dict.update({u'years': info[5]})

                    housefloor = name.find("div", {"class": "positionInfo"})
                    communityInfo = housefloor.get_text().split('-')
                    info_dict.update({u'community': communityInfo[0]})
                    #info_dict.update({u'years': housefloor.get_text().strip()})
                    #info_dict.update({u'floor': housefloor.get_text().strip()})

                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict.update(
                        {u'followInfo': followInfo.get_text().strip()})

                    # Tax badge is optional; record "" when absent.
                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree is None:
                        info_dict.update({u"taxtype": ""})
                    else:
                        info_dict.update(
                            {u"taxtype": taxfree.get_text().strip()})

                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict.update(
                        {u'totalPrice': totalPrice.span.get_text()})

                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict.update(
                        {u'unitPrice': unitPrice.get("data-price")})
                    # Timestamp marking when this row was scraped.
                    info_dict.update({"validdate": datetime.datetime.now()})
                except:
                    # Malformed card: skip this listing, keep the page going.
                    continue

                # Houseinfo insert into mysql
                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID":
                    info_dict["houseID"],
                    "totalPrice":
                    info_dict["totalPrice"]
                })
                house_ids.add(info_dict["houseID"])

        # Upsert per row on houseID; price history is append-only.
        with model.database.atomic():
            try:
                for data in data_source:
                    model.Houseinfo.insert(data).on_conflict(
                        conflict_target=(model.Houseinfo.houseID, ),
                        update=data,
                        #preserve=(model.Houseinfo.houseID, ),
                    ).execute()
                model.Hisprice.insert_many(hisprice_data_source).execute()
            except Exception as e:
                print("error: %s" % e)
        log_progress("GetHouseByRegionlist inserted", district, page + 1,
                     total_pages)
        time.sleep(0.5)
Exemple #26
0
def get_house_perregion(district):
    """Crawl second-hand ("ershoufang") listings for one district (Shanghai
    page layout), insert them into Houseinfo and append each observed price
    to the Hisprice history table.

    district -- district slug appended to BASE_URL.

    Raises RuntimeError when the pager cannot be parsed (total_pages is None).
    """
    url = BASE_URL + u"ershoufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    # Stop quietly if the site served an anti-crawler block page.
    if check_block(soup):
        return
    total_pages = misc.get_sh_total_pages(url)
    if total_pages is None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        # Page 0 reuses the soup fetched above; later pages hit the d<N> URL.
        if page > 0:
            url_page = BASE_URL + u"ershoufang/%s/d%d/" % (district, page)
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetHouseByRegionlist", district, page + 1, total_pages)
        data_source = []
        hisprice_data_source = []
        nameList = soup.findAll("div", {"class": "info"})
        for name in nameList:  # per house loop
            i = i + 1
            info_dict = {}
            try:
                housetitle = name.find("div", {"class": "prop-title"})
                info_dict.update({u'title': housetitle.a.get('title')})
                info_dict.update({u'link': housetitle.a.get('href')})
                info_dict.update({u'houseID': housetitle.a.get('key')})

                # Row 1 text is '|'-separated: type | square | floor [| direction]
                houseaddr = name.find("span", {"class": "info-col row1-text"})
                info = houseaddr.get_text().split('|')
                info_dict.update({u'housetype': info[0].strip()})
                info_dict.update({u'square': info[1].strip()})
                info_dict.update({u'floor': info[2].strip()})
                try:
                    info_dict.update({u'direction': info[3].strip()})
                except:
                    # Direction segment is optional on this layout.
                    info_dict.update({u'direction': ''})
                info_dict.update({u'decoration': ''})

                housefloor = name.find("span", {"class": "info-col row2-text"})
                detail = housefloor.get_text().split('|')
                info_dict.update({u'years': detail[-1].strip()})

                community = name.find("a", {"class": "laisuzhou"})
                info_dict.update(
                    {u'community': community.span.get_text().strip()})
                info_dict.update({u'followInfo': ''})

                tax = name.find("div", {"class": "property-tag-container"})
                info_dict.update({u'taxtype': "".join(tax.get_text().split())})

                totalPrice = name.find("span",
                                       {"class": "total-price strong-num"})
                info_dict.update(
                    {u'totalPrice': totalPrice.get_text().strip()})

                unitPrice = name.find("span",
                                      {"class": "info-col price-item minor"})
                info_dict.update({u'unitPrice': unitPrice.get_text().strip()})
            except:
                # Malformed card: skip this listing, keep the page going.
                continue
            # houseinfo insert into mysql
            data_source.append(info_dict)
            hisprice_data_source.append({
                "houseID": info_dict["houseID"],
                "totalPrice": info_dict["totalPrice"]
            })
            # model.Houseinfo.insert(**info_dict).upsert().execute()
            #model.Hisprice.insert(houseID=info_dict['houseID'], totalPrice=info_dict['totalPrice']).upsert().execute()

        # BUGFIX: this flush + sleep was indented inside the per-house loop,
        # so every house re-inserted the page-so-far and slept one second.
        # Flush once per page, like every sibling crawler in this file.
        with model.database.atomic():
            model.Houseinfo.insert_many(data_source).upsert().execute()
            model.Hisprice.insert_many(
                hisprice_data_source).upsert().execute()
        time.sleep(1)
Exemple #27
0
def get_community_perregion(regionname=u'pudong'):
    """Crawl the community ("xiaoqu") index for one region (Shanghai page
    layout) and bulk-insert the results into the Community table.

    regionname -- region slug appended to BASE_URL; defaults to u'pudong'.

    Each community's detail page is fetched too (get_communityinfo_by_url)
    and its fields are merged into the row.  Raises RuntimeError when the
    pager cannot be parsed (total_pages is None).
    """
    url = BASE_URL + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    # Stop quietly if the site served an anti-crawler block page.
    if check_block(soup):
        return
    total_pages = misc.get_sh_total_pages(url)

    if total_pages == None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        # Page 0 reuses the soup fetched above; later pages hit the d<N> URL.
        if page > 0:
            url_page = BASE_URL + u"xiaoqu/" + regionname + "/d%d/" % page
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        nameList = soup.findAll("div", {"class": "info-panel"})
        i = 0
        log_progress("GetCommunityByRegionlist", regionname, page + 1,
                     total_pages)
        data_source = []
        for name in nameList:  # Per house loop
            i = i + 1
            info_dict = {}
            try:
                communitytitle = name.find("a", {"name": "selectDetail"})
                title = communitytitle.get_text().strip('\n')
                link = communitytitle.get('href')
                id = communitytitle.get('key')
                info_dict.update({u'title': title})
                info_dict.update({u'link': link})
                info_dict.update({u'id': id})

                district = name.find("a", {"class": "ad"})
                info_dict.update({u'district': district.get_text()})

                # The business circle is the second <a> inside the "con" div.
                cons = name.find("div", {"class": "con"})
                bizcircle = cons.findAll("a")
                info_dict.update(
                    {u'bizcircle': bizcircle[1].get_text().strip()})

                # Subway tag is optional; default to '' when absent.
                try:
                    tagList = name.find("span", {"class": "fang-subway-ex"})
                    info_dict.update({u'tagList': tagList.get_text().strip()})
                except:
                    info_dict.update({u'tagList': ''})

                onsale = name.find("span", {"class": "num"})
                info_dict.update({u'onsale': onsale.get_text().strip()})

                price = name.find("div", {"class": "price"})
                info_dict.update({u'price': price.span.get_text().strip()})

                # Merge extra fields from the community detail page.
                # iteritems() is Python-2-only, matching this variant's style.
                communityinfo = get_communityinfo_by_url(link)
                for key, value in communityinfo.iteritems():
                    info_dict.update({key: value})

            except:
                # Malformed card: skip this community, keep the page going.
                continue
            # communityinfo insert into mysql
            data_source.append(info_dict)
            # model.Community.insert(**info_dict).upsert().execute()

        # Flush the whole page in one transaction, then throttle.
        with model.database.atomic():
            model.Community.insert_many(data_source).upsert().execute()
        time.sleep(1)
Exemple #28
0
def get_sell_percommunity(communityname):
    """Crawl completed-sale ("chengjiao") records for one community
    (Shanghai page layout) and bulk-insert them into the Sellinfo table.

    communityname -- community name (unicode); URL-quoted into the search URL.

    Raises RuntimeError when the pager cannot be parsed (total_pages is None).
    Returns None; the side effect is database insertion via model.Sellinfo.
    """
    url = BASE_URL + u"chengjiao/rs" + \
        urllib2.quote(communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    # Stop quietly if the site served an anti-crawler block page.
    if check_block(soup):
        return
    total_pages = misc.get_sh_total_pages(url)

    if total_pages == None:
        row = model.Sellinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        # Page 0 reuses the soup fetched above; later pages hit the d<N> URL.
        if page > 0:
            url_page = BASE_URL + \
                u"chengjiao/d%drs%s/" % (page,
                                         urllib2.quote(communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')
        i = 0
        log_progress("GetSellByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        for name in soup.findAll("div", {"class": "info"}):  # one per deal
            i = i + 1
            info_dict = {}
            try:
                housetitle = name.findAll("div", {"class": "info-row"})[0]
                info_dict.update({u'title': housetitle.a.get('title')})
                info_dict.update({u'link': housetitle.a.get('href')})
                info_dict.update({u'houseID': housetitle.a.get('key')})
                houseinfo = housetitle.get_text().strip().split(' ')
                info_dict.update({u'housetype': houseinfo[1].strip()})
                info_dict.update(
                    {u'square': houseinfo[2].strip('').split('\n')[0]})

                # Row 1 text is '|'-separated: floor [| direction] | status
                houseaddr = name.find("div", {"class": "row1-text"})
                info = houseaddr.get_text().split('|')
                info_dict.update({u'floor': info[0].strip()})
                try:
                    info_dict.update({u'direction': info[1].strip()})
                except:
                    # Direction segment is optional on this layout.
                    info_dict.update({u'direction': ''})
                info_dict.update({u'status': info[2].strip()})

                years = name.find("span", {"class": "c-prop-tag2"})
                info_dict.update({u'years': years.get_text().strip()})

                community = name.find("span", {"class": "cj-text"})
                info_dict.update({u'community': community.get_text().strip()})

                totalPrice = name.find("span", {"class": "strong-num"})
                info_dict.update(
                    {u'totalPrice': totalPrice.get_text().strip()})

                unitPrice = name.find("div",
                                      {"class": "info-col price-item minor"})
                info_dict.update({u'unitPrice': unitPrice.get_text().strip()})

                source = name.find("div",
                                   {"class": "info-col deal-item minor"})
                info_dict.update({u'source': source.get_text().strip()})

                # Normalize "2017.01.01" style dates to "2017-01-01".
                dealdate = name.find(
                    "div", {"class": "info-col deal-item main strong-num"})
                info_dict.update({
                    u'dealdate':
                    dealdate.get_text().strip().replace('.', '-')
                })

            except:
                # Malformed card: skip this deal, keep the page going.
                continue
            # Sellinfo insert into mysql
            data_source.append(info_dict)
            # model.Sellinfo.insert(**info_dict).upsert().execute()

        # Flush the whole page in one transaction, then throttle.
        with model.database.atomic():
            model.Sellinfo.insert_many(data_source).upsert().execute()
        time.sleep(1)
Exemple #29
0
def get_house_percommunity(communityname):
    """Crawl second-hand ("ershoufang") listings for one community, insert
    them into Houseinfo and append each observed price to Hisprice.

    communityname -- community name (unicode); URL-quoted into the search URL.

    The address field's separator depends on the city (module-level CITY):
    '/' for Beijing, '|' elsewhere.  Raises RuntimeError when the pager
    cannot be parsed (total_pages is None).
    """
    url = BASE_URL + u"ershoufang/rs" + urllib2.quote(
        communityname.encode('utf8')) + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    # Stop quietly if the site served an anti-crawler block page.
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)

    if total_pages == None:
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        # Page 0 reuses the soup fetched above; later pages hit the pg<N> URL.
        if page > 0:
            url_page = BASE_URL + u"ershoufang/pg%drs%s/" % (
                page, urllib2.quote(communityname.encode('utf8')))
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        nameList = soup.findAll("li", {"class": "clear"})
        i = 0
        log_progress("GetHouseByCommunitylist", communityname, page + 1,
                     total_pages)
        data_source = []
        hisprice_data_source = []
        for name in nameList:  # per house loop
            i = i + 1
            info_dict = {}
            try:
                housetitle = name.find("div", {"class": "title"})
                info_dict.update({u'title': housetitle.a.get_text().strip()})
                info_dict.update({u'link': housetitle.a.get('href')})

                # Address segments: community / type / square / direction / decoration.
                houseaddr = name.find("div", {"class": "address"})
                if CITY == 'bj':
                    info = houseaddr.div.get_text().split('/')
                else:
                    info = houseaddr.div.get_text().split('|')
                info_dict.update({u'community': info[0].strip()})
                info_dict.update({u'housetype': info[1].strip()})
                info_dict.update({u'square': info[2].strip()})
                info_dict.update({u'direction': info[3].strip()})
                info_dict.update({u'decoration': info[4].strip()})

                housefloor = name.find("div", {"class": "flood"})
                floor_all = housefloor.div.get_text().split(
                    '-')[0].strip().split(' ')
                info_dict.update({u'floor': floor_all[0].strip()})
                info_dict.update({u'years': floor_all[-1].strip()})

                followInfo = name.find("div", {"class": "followInfo"})
                info_dict.update({u'followInfo': followInfo.get_text()})

                tax = name.find("div", {"class": "tag"})
                info_dict.update({u'taxtype': tax.get_text().strip()})

                totalPrice = name.find("div", {"class": "totalPrice"})
                info_dict.update({u'totalPrice': totalPrice.span.get_text()})

                # NOTE(review): the sibling variant reads 'data-price' /
                # 'data-housecode'; confirm these 'data_analysis-*' attribute
                # names really exist on this page, otherwise both fields (and
                # houseID) come back as None.
                unitPrice = name.find("div", {"class": "unitPrice"})
                info_dict.update(
                    {u'unitPrice': unitPrice.get('data_analysis-price')})
                info_dict.update(
                    {u'houseID': unitPrice.get('data_analysis-hid')})
            except:
                # Malformed card: skip this listing, keep the page going.
                continue
            # houseinfo insert into mysql
            data_source.append(info_dict)
            hisprice_data_source.append({
                "houseID": info_dict["houseID"],
                "totalPrice": info_dict["totalPrice"]
            })
            # model.Houseinfo.insert(**info_dict).upsert().execute()
            # model.Hisprice.insert(houseID=info_dict['houseID'], totalPrice=info_dict['totalPrice']).upsert().execute()

        # Flush the whole page in one transaction, then throttle.
        with model.database.atomic():
            model.Houseinfo.insert_many(data_source).upsert().execute()
            model.Hisprice.insert_many(hisprice_data_source).upsert().execute()
        time.sleep(1)
Exemple #30
0
def get_house_perregion(district):
    """Scrape all second-hand house ("ershoufang") listings for one district.

    Walks every result page of the district, parses each listing card into a
    dict and bulk-upserts one batch per page into ``model.Houseinfo``, plus a
    (houseID, totalPrice) history row per listing into ``model.Hisprice``.

    :param district: district slug appended to ``BASE_URL``, e.g. ``u"pudong"``.
    :raises RuntimeError: when the total page count cannot be determined, so a
        partial run is reported loudly instead of silently scraping nothing.
    """
    url = BASE_URL + u"ershoufang/%s/" % district
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'html5lib')
    if check_block(soup):
        return
    total_pages = misc.get_total_pages(url)
    if total_pages is None:  # PEP 8: compare against None with `is`
        row = model.Houseinfo.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        url_page = BASE_URL + u"ershoufang/%s/pg%d/" % (district, page + 1)
        source_code = misc.get_source_code(url_page)
        soup = BeautifulSoup(source_code, 'html5lib')

        i = 0
        log_progress("GetHouseByRegionlist", district, page + 1, total_pages)
        data_source = []
        hisprice_data_source = []
        for ultag in soup.findAll("ul", {"class": "sellListContent"}):
            namearr = ultag.find_all('li', {"class": "clear"})

            for name in namearr:

                i = i + 1
                info_dict = {}
                try:
                    housetitle = name.find("div", {"class": "title"})
                    info_dict.update({u'title': housetitle.get_text().strip()})
                    info_dict.update({u'link': housetitle.a.get('href')})
                    houseID = housetitle.a.get('data-housecode')
                    info_dict.update({u'houseID': houseID})

                    houseinfo = name.find("div", {"class": "houseInfo"})
                    info = houseinfo.get_text().split('/')
                    info_communityid = houseinfo.a.get('href').split('xiaoqu/')
                    communityid = info_communityid[1].strip().rstrip('/')
                    # BUG FIX: the old code called .encode("utf-8") and then
                    # split/stripped the resulting bytes with str separators,
                    # which raises TypeError on Python 3 — and the bare
                    # `except:` below silently dropped every listing. Text
                    # methods work directly on the (unicode) string; the u''
                    # separators keep this correct on Python 2 as well.
                    square_info = info[2].split(u'平米')

                    info_dict.update({u'community': info[0]})
                    info_dict.update({u'communityid': communityid})
                    info_dict.update({u'housetype': info[1]})
                    info_dict.update({u'square': square_info[0]})
                    info_dict.update({u'direction': info[3]})
                    info_dict.update({u'decoration': info[4]})

                    housefloor = name.find("div", {"class": "positionInfo"})
                    info_housefloor = housefloor.get_text().split('/')

                    # e.g. u"2017年建板楼" -> [u"2017", u"板楼"]
                    info_years = info_housefloor[1].strip().split(u'年建')
                    info_floor = info_housefloor[0].split('(')
                    # e.g. u"(共6层)" -> u"6"
                    info_buildheight = info_floor[1].rstrip(u'层)').lstrip(u'共')

                    info_dict.update({u'years': info_years[0].strip()})
                    info_dict.update({u'buildingtype': info_years[1].strip()})
                    info_dict.update({u'floor': info_floor[0].strip()})
                    info_dict.update(
                        {u'buildheight': info_buildheight.strip()})

                    followInfo = name.find("div", {"class": "followInfo"})
                    info_dict.update(
                        {u'followInfo': followInfo.get_text().strip()})

                    taxfree = name.find("span", {"class": "taxfree"})
                    if taxfree is None:
                        five = name.find("span", {"class": "five"})
                        if five is None:
                            info_dict.update({u"taxtype": ""})
                        else:
                            info_dict.update(
                                {u"taxtype": five.get_text().strip()})
                    else:
                        info_dict.update(
                            {u"taxtype": taxfree.get_text().strip()})

                    totalPrice = name.find("div", {"class": "totalPrice"})
                    info_dict.update(
                        {u'totalPrice': totalPrice.span.get_text()})

                    unitPrice = name.find("div", {"class": "unitPrice"})
                    info_dict.update(
                        {u'unitPrice': unitPrice.get("data-price")})

                except (AttributeError, IndexError, TypeError, ValueError):
                    # A listing with unexpected markup is skipped. The former
                    # bare `except:` also swallowed KeyboardInterrupt/SystemExit,
                    # making the scraper impossible to interrupt cleanly.
                    continue

                # Houseinfo insert into mysql
                data_source.append(info_dict)
                hisprice_data_source.append({
                    "houseID": info_dict["houseID"],
                    "totalPrice": info_dict["totalPrice"]
                })

        # One bulk upsert per page keeps writes transactional and fast.
        with model.database.atomic():
            model.Houseinfo.insert_many(data_source).upsert().execute()
            model.Hisprice.insert_many(hisprice_data_source).upsert().execute()
        time.sleep(1)  # throttle requests to avoid the site's anti-scraping block