コード例 #1
0
def get_totalpage(url):
    """Return the number of result pages advertised at *url*.

    The page is fetched with ``misc.get_source_code`` and the pagination
    container (``<div class="pageSty rf">``) is looked up. The text of the
    second ``<a>`` inside that container is parsed as the page count.

    Args:
        url: Address of the listing page to inspect.

    Returns:
        The page count as an int. Falls back to 1 when the pagination
        container is missing, when it holds fewer than two links, or when
        the link text is not an integer.
    """
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')
    info = soup.find("div", {"class": "pageSty rf"})
    if info is None:
        return 1
    alist = info.find_all("a")
    try:
        return int(alist[1].get_text().strip())
    except (IndexError, ValueError):
        # Fewer than two links, or non-numeric link text: treat the result
        # as a single page instead of hiding unrelated errors.
        return 1
コード例 #2
0
def get_community_perregion(regionname=u'pudong'):
    """Scrape the community ("xiaoqu") listing pages of one region.

    For every listing page the ``info-panel`` blocks are parsed into dicts
    (title, link, id, district, bizcircle, tagList, onsale, price, plus
    whatever ``get_communityinfo_by_url`` returns for the detail link) and
    written to ``model.Community`` in one atomic ``insert_many`` per page.

    Args:
        regionname: Region slug appended to ``BASE_URL + "xiaoqu/"``.

    Returns:
        None. Returns early, without scraping, when ``check_block(soup)`` is
        truthy (presumably the site has blocked the crawler -- confirm
        against ``check_block``).

    Raises:
        RuntimeError: If ``misc.get_sh_total_pages`` returns None.
    """
    url = BASE_URL + u"xiaoqu/" + regionname + "/"
    source_code = misc.get_source_code(url)
    soup = BeautifulSoup(source_code, 'lxml')

    if check_block(soup):
        return
    total_pages = misc.get_sh_total_pages(url)

    if total_pages is None:
        row = model.Community.select().count()
        raise RuntimeError("Finish at %s because total_pages is None" % row)

    for page in range(total_pages):
        # Page 0 reuses the soup fetched above; later pages are fetched here.
        if page > 0:
            # NOTE(review): the URL uses the 0-based index, so the pages
            # requested are <base>, /d1/ ... /d(total_pages - 1)/. If /d1/
            # is the same as the base page, the first page is fetched twice
            # and the last page is never fetched -- confirm against the site.
            url_page = BASE_URL + u"xiaoqu/" + regionname + "/d%d/" % page
            source_code = misc.get_source_code(url_page)
            soup = BeautifulSoup(source_code, 'lxml')

        panels = soup.findAll("div", {"class": "info-panel"})
        log_progress("GetCommunityByRegionlist",
                     regionname, page + 1, total_pages)
        data_source = []
        for panel in panels:  # one panel per community
            try:
                anchor = panel.find("a", {"name": "selectDetail"})
                link = anchor.get('href')
                info_dict = {
                    u'title': anchor.get_text().strip('\n'),
                    u'link': link,
                    u'id': anchor.get('key'),
                }

                district = panel.find("a", {"class": "ad"})
                info_dict[u'district'] = district.get_text()

                cons = panel.find("div", {"class": "con"})
                bizcircle = cons.findAll("a")
                info_dict[u'bizcircle'] = bizcircle[1].get_text().strip()

                # The subway tag is optional; store an empty string if absent.
                tag = panel.find("span", {"class": "fang-subway-ex"})
                if tag is None:
                    info_dict[u'tagList'] = ''
                else:
                    info_dict[u'tagList'] = tag.get_text().strip()

                onsale = panel.find("span", {"class": "num"})
                info_dict[u'onsale'] = onsale.get_text().strip()

                price = panel.find("div", {"class": "price"})
                info_dict[u'price'] = price.span.get_text().strip()

                # dict.update works on Python 2 and 3, unlike .iteritems(),
                # whose AttributeError on Python 3 was silently swallowed and
                # caused every row to be dropped.
                info_dict.update(get_communityinfo_by_url(link))
            except Exception:
                # Best effort: a panel that does not match the expected
                # markup is skipped; KeyboardInterrupt still propagates.
                continue
            data_source.append(info_dict)

        # Nothing parsed on this page: skip the database write entirely.
        if data_source:
            with model.database.atomic():
                model.Community.insert_many(data_source).upsert().execute()
        time.sleep(1)
コード例 #3
0
def _parse_sold_listing(each):
    """Build the Sellinfo row dict for one ``<li>`` of a sold-listing page."""
    sTit = each.find("p", {"class": "sTit"})
    title = sTit.strong.get_text().strip()
    community = title.split(' ')[0]
    listCon = each.find("div", {"class": "listCon"})
    plist = listCon.find_all("p")
    # Second paragraph holds "housetype·square·direction"; split it once.
    details = plist[1].get_text().strip().split(u'·')
    housetype = details[0]
    square = details[1]
    direction = details[2]
    dealdate = plist[2].get_text().strip().split(u':')[1]
    jiage = each.find("div", {"class": "jiage"})
    totalPrice = jiage.strong.get_text().strip()
    unitPrice = find_between_r(jiage.p.get_text().strip(), u'价', u'元')
    href = each.a.get("href")
    return {
        u'title': title,
        u'houseID': "5i5j%s" % find_between_r(href, '/', '.'),
        u'link': "https://bj.5i5j.com%s" % href,
        u'community': community,
        # The listing page does not provide these fields; placeholders are
        # stored instead.
        u'years': u"暂无信息",
        u'housetype': housetype,
        u'square': square,
        u'direction': direction,
        u'floor': u"暂无信息",
        u'status': u"暂无信息",
        u'source': u"我爱我家",
        u'totalPrice': totalPrice,
        u'unitPrice': unitPrice,
        u'dealdate': dealdate,
    }


def GetSellByCommunitylist():
    """Scrape sold listings for every community in ``community_id.txt``.

    Each line of the file is split on single spaces; the first field is the
    community code used in the URL and the second is a label used only in the
    progress log. For each community all result pages
    (``https://bj.5i5j.com/sold/<code>/n<page>/``) are parsed and the rows
    are written to ``model.Sellinfo`` in one atomic ``insert_many``.

    Lines with fewer than two fields are skipped. Database errors are logged
    and do not stop the crawl. A listing whose markup does not match the
    expected structure still raises, as before.

    Returns:
        None.
    """
    with open('community_id.txt') as f:
        for line in f:
            fields = line.split(' ')
            if len(fields) < 2:
                # Blank or malformed line: it used to abort the whole run
                # with an IndexError.
                if line.strip():
                    logging.warning("Skipping malformed line: %r", line)
                continue
            code, communityinfo = fields[0], fields[1]

            data_source = []
            pages = get_totalpage("https://bj.5i5j.com/sold/%s" % code)
            for page in range(1, pages + 1):
                source_code = misc.get_source_code(
                    "https://bj.5i5j.com/sold/%s/n%d/" % (code, page))
                soup = BeautifulSoup(source_code, 'lxml')
                content = soup.find('ul', class_="pList zu")
                if content is None:
                    # No listing container on this page; move on.
                    continue
                for each in content.find_all('li'):
                    data_source.append(_parse_sold_listing(each))

            if data_source:
                with model.database.atomic():
                    try:
                        model.Sellinfo.insert_many(
                            data_source).upsert().execute()
                    except Exception:
                        # Stay best-effort, but record the failure instead of
                        # discarding it silently.
                        logging.exception(
                            "Failed to store sold listings for %s",
                            communityinfo)
            logging.info("%s finish" % communityinfo)
            time.sleep(1)