Example #1
def crawGanji(url):
    myLogger.logger.info("\n\n===========来自赶集网的房源==============")
    r = requests.get(url)
    contents = BeautifulSoup(r.content, "html.parser")  # parse the page content
    listings = contents.find_all("div", class_="ershoufang-list")
    global second
    second = listings[0].find("dd", class_="title").find("a").text
    infoList = []

    for e in listings:
        ele = e.find("dd", class_="title").find("a")
        # listing description from Ganji
        description = deleteallspecialcharacters(ele.text)
        # after Ganji's redesign the href carries extra strings; keep only the valid part
        url = getvalidhref(ele["href"])
        price = e.find("dd", class_="info").find("div", class_="price").find(
            "span", class_="js-price").text
        location = deleteallspecialcharacters(
            e.find("dd", class_="address").find("span", class_="area").text
        ).replace("徐东二手房出售", "")
        myLogger.logger.info(description + " ,url: " + url + "价格: " + price +
                             " 位置:" + location)
        info = Info(description=description,
                    url=url,
                    price=price,
                    website_id=2,
                    time=currentTime(),
                    location=location)
        infoList.append(info)
    batchInsertInfo(infoList)
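
The loop above relies on a project helper, getvalidhref, that is not shown in these examples; the comment only says it trims the extra strings Ganji appends to each href after the redesign. A minimal sketch of what such a helper might look like, assuming "valid" simply means the URL without its query string and fragment:

from urllib.parse import urlsplit, urlunsplit

def getvalidhref(href):
    """Hypothetical helper: keep scheme, host and path, drop query string and fragment."""
    parts = urlsplit(href)
    return urlunsplit((parts.scheme, parts.netloc, parts.path, "", ""))

Under this assumption, getvalidhref("http://wh.ganji.com/fang5/123.htm?adtype=1") would return "http://wh.ganji.com/fang5/123.htm".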
Example #2
def crawl58(url):
    """
    :param url: 58同城爬取的url
    :return: NULL
    该文件需要进一步解决的问题:
            一、优化数据是否刷新算法(针对搜房网与赶集网刷新问题已经解决)
            二、重新整理MySQL数据库存储格式(已解决)
            三、对数据库中缺少的字段重新添加(已解决)
            四、及时处理网站更新问题,在这方面如何做一个动态配置
            五、优化匹配算法,现在这种全局匹配方式太落后,准确度几乎为0(已优化)
    """
    myLogger.logger.info("\n\n===========来自58的房源==============")
    r = requests.get(url)
    contents = BeautifulSoup(r.content, "html.parser")  # parse the page content

    tds = contents.find_all("td", class_="t")  # every td element with class="t"
    global first
    first = tds[0].find("a").text
    infoList = []

    for e in tds:
        a = e.find("a")
        desc = deleteallspecialcharacters(a.text)
        left = e.find("div", class_="qj-listleft")
        position = left.a.text
        myLogger.logger.info("URL:" + a["href"] + "描述: " + desc)
        right = e.find("div", class_="qj-listright")
        price = right.b.text
        dealingtime = currentTime()
        # listing publish time: strip every non-digit character
        sourcetime = re.sub(
            "[^0-9]", "",
            deleteallspecialcharacters(
                left.find("span", class_="qj-listjjr").text))
        # some listings give a price range; those are not handled yet and default to 0
        try:
            price = int(price)
        except ValueError:
            price = 0
        myLogger.logger.info("价格:" + right.b.text)
        info = Info(description=desc,
                    url=deletedomain(a["href"]),
                    price=price,
                    website_id=1,
                    time=dealingtime,
                    sourcetime=sourcetime,
                    location=position)
        # why insert rows one at a time? collect them in a list for now and batch-insert below
        infoList.append(info)
    batchInsertInfo(infoList)
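
The comment above defers the insert so all rows go to the database in one batch. batchInsertInfo itself is not part of these examples; a minimal sketch of what it could look like, assuming the project maps Info to a SQLAlchemy model and noting that the connection string below is only a placeholder:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine("sqlite:///:memory:")  # placeholder; the project presumably points at MySQL
Session = sessionmaker(bind=engine)

def batchInsertInfo(infoList):
    """Hypothetical batch insert: stage every Info row and commit once."""
    session = Session()
    try:
        session.add_all(infoList)  # stage all rows
        session.commit()           # one commit for the whole batch
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()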
Example #3
def get_all_proxies():
    """使用selenium+beautifulsoup解析爬取的网页"""
    driver = webdriver.Chrome()
    driver.get(url="http://www.xicidaili.com/nn/")
    _source = driver.page_source
    pretty_page = BeautifulSoup(_source, "html.parser")
    main_body = pretty_page.find("tbody")
    all_server = main_body.find_all(name="tr")[1:]
    beans = list()
    for ele in all_server:
        all_td = ele.find_all("td")
        ip = all_td[1].text
        port = all_td[2].text
        loc = deleteallspecialcharacters(all_td[3].get_text())
        proxy_type = all_td[5].text
        live_long = all_td[-2].text
        verify_time = all_td[-1].text
        bean = ProxyServerBean(ip=ip,
                               port=port,
                               loc=loc,
                               type=proxy_type,
                               live_long=live_long,
                               verify_time=verify_time)
        beans.append(bean)
    driver.quit()
    return beans
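
A short usage sketch for get_all_proxies, assuming only the ip and port attributes set above; it turns the first scraped proxy into the proxies mapping that requests accepts (the target URL is just a placeholder):

import requests

beans = get_all_proxies()
if beans:
    first_proxy = beans[0]
    address = "http://{}:{}".format(first_proxy.ip, first_proxy.port)
    proxies = {"http": address, "https": address}
    # route a request through the scraped proxy
    r = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=10)
    print(r.text)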
Example #4
def getFourth(url):
    r = requests.get(url)
    contents = BeautifulSoup(r.content, "html.parser")
    allhouses = contents.find_all("div", class_="ohcon-list")
    currentfirst = deleteallspecialcharacters(allhouses[0].find("div", class_="ohclist-ctx").find("a").text)
    if currentfirst[:3] == first[:3]:
        myLogger.logger.info("亿房网没有刷新")
        return True
    else:
        return False
Example #5
def getThird(url):
    contents = commonCrawling(url)
    # The promoted block matches the xpath //div[contains(@class, "backColor")]; the dl siblings after it are the non-promoted listings.
    parent = contents.find("div", class_="build_list")
    # promoted listings
    commercial = parent.find("div", class_="backColor")

    currentThirdOne = deleteallspecialcharacters(
        commercial.find_next("dd", class_="margin_l").find(
            "p", class_="build_name").find("a").text)
    # find the div's next sibling dl element (bs4 find_next_sibling)
    currentThirdTwo = deleteallspecialcharacters(
        commercial.find_next_sibling('dl').find("dd", class_="margin_l").find(
            "p", class_="build_name").find("a").text)
    if (currentThirdOne.strip()[:3] != third[0].strip()[:3]
            or currentThirdTwo.strip()[:3] != third[1].strip()[:3]):
        myLogger.logger.warning("搜房网刷新了")
        return False
    else:
        myLogger.logger.info("搜房网没有刷新")
        return True
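
getThird leans on the difference between two bs4 lookups: find_next walks forward in document order (so from the promoted div it reaches the dd inside that div), while find_next_sibling stays on the same level (so it reaches the first non-promoted dl after the div). A small self-contained sketch of that distinction, using made-up markup:

from bs4 import BeautifulSoup

html = """
<div class="build_list">
  <div class="backColor"><dl><dd class="margin_l">promoted listing</dd></dl></div>
  <dl><dd class="margin_l">organic listing #1</dd></dl>
  <dl><dd class="margin_l">organic listing #2</dd></dl>
</div>
"""
soup = BeautifulSoup(html, "html.parser")
block = soup.find("div", class_="backColor")

print(block.find_next("dd", class_="margin_l").text)   # promoted listing
print(block.find_next_sibling("dl").find("dd").text)   # organic listing #1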
Example #6
def crawyifang(url):
    r = requests.get(url)
    contents = BeautifulSoup(r.content, "html.parser")
    allhouses = contents.find_all("div", class_="ohcon-list")
    global first
    first = deleteallspecialcharacters(allhouses[0].find("div", class_="ohclist-ctx").find("a").text)
    yifanginfolist = list()
    for h in allhouses:
        basic = h.find("div", class_="ohclist-ctx").find("a")
        description = deleteallspecialcharacters(basic.text)
        url = basic["href"]
        location = h.find("div", class_="ohclist-ctx").find("p", class_="addr").text
        sourcetime = deleteallspecialcharacters(h.find("div", class_="ohclist-ctx").find("span", class_="time").text)
        myLogger.logger.info("description: "+ description + "\turl: "+ url + "\tlocation: " + location +"\ttime: " + sourcetime)
        price = h.find("div", class_="ohclist-price").find("span", class_="pnum").text.replace("万", "")
        # current timestamp
        currenttime = datetime.now()
        info = Info(description=description, url=deletedomain(url), price=price, website_id=4, time=currenttime, sourcetime=sourcetime, location=location)
        yifanginfolist.append(info)
    # batch-insert the Info entities
    batchInsertInfo(yifanginfolist)
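
Both crawyifang and crawl58 store deletedomain(url) rather than the full href, apparently keeping only the site-relative part of the link. That helper is not shown anywhere in these examples, so the following is only a guess at its intent, assuming it strips the scheme and host and keeps the path plus any query string:

from urllib.parse import urlsplit

def deletedomain(url):
    """Hypothetical helper: drop scheme and host, keep path (and query if present)."""
    parts = urlsplit(url)
    return parts.path + ("?" + parts.query if parts.query else "")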
Example #7
def crawSouFang(url):
    myLogger.logger.info("\n\n===========来自搜房网的房源==============")
    r = requests.get(url)
    contents = BeautifulSoup(r.content, "html.parser")  # parse the page content
    parent = contents.find("div", class_="build_list")
    # promoted listings
    commercial = parent.find("div", class_="backColor")
    # every dl inside the promoted block
    allCommercialHouses = commercial.find_all("dl")
    # dl elements that are direct children of the div, i.e. non-promoted listings
    allNonCommercialHouses = parent.find_all("dl", recursive=False)
    global third
    infoList = []

    # third records the first promoted and first non-promoted listing; getThird later uses it to detect refreshes
    third_one = deleteallspecialcharacters(allCommercialHouses[0].find(
        "dd", class_="margin_l").find("p", class_="build_name").find("a").text)
    third_two = deleteallspecialcharacters(allNonCommercialHouses[0].find(
        "dd", class_="margin_l").find("p", class_="build_name").find("a").text)
    third = [third_one, third_two]
    myLogger.logger.info("推广房源数量:" + str(len(allCommercialHouses)) +
                         "个人非推广房源数目:" + str(len(allNonCommercialHouses)))
    myLogger.logger.info("\t\t=========下面是所有搜房网推广个人房源=============")
    # all promoted listings
    for a in allCommercialHouses:
        basic = a.find("dd",
                       class_="margin_l").find("p",
                                               class_="build_name").find("a")
        description = deleteallspecialcharacters(basic.text)
        # getDomain is still incomplete, so store the raw href for now
        url = basic["href"]
        price = a.find("dd", class_="right price_r").find(
            "p", class_="build_price").find("span").text
        position = deleteallspecialcharacters(
            a.find("dd", class_="margin_l").find("p",
                                                 class_="finish_data").text)
        myLogger.logger.info(description + " , url :" + url + " ,price :" +
                             price + " ,位置 :" + position)
        info = Info(description=description,
                    url=url,
                    price=price,
                    website_id=3,
                    time=currentTime(),
                    location=position)
        infoList.append(info)

    myLogger.logger.info("\t\t=========下面是所有非推广个人房源=============")
    # all non-promoted private listings
    for a in allNonCommercialHouses:
        basic = a.find("dd",
                       class_="margin_l").find("p",
                                               class_="build_name").find("a")
        description = deleteallspecialcharacters(basic.text)
        position = deleteallspecialcharacters(
            a.find("dd", class_="margin_l").find("p",
                                                 class_="finish_data").text)
        # getDomain is still incomplete, so store the raw href for now
        url = basic["href"]
        price = a.find("dd", class_="right price_r").find(
            "p", class_="build_price").find("span").text
        myLogger.logger.info(description + " , url :" + url + " ,price :" +
                             price + " ,location :" + position)
        info = Info(description=description,
                    url=url,
                    price=price,
                    website_id=3,
                    time=currentTime(),
                    location=position)
        infoList.append(info)
    batchInsertInfo(infoList)
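
crawSouFang seeds the global third with the first promoted and first non-promoted titles, and getThird (Example #5) compares the live page against them, returning False when the listing page has refreshed. A sketch of how the two could be wired together in a polling loop; the URL and interval below are placeholders:

import time

soufang_url = "http://example.com/soufang-listings"  # placeholder listing URL

crawSouFang(soufang_url)               # full crawl; also records `third`
while True:
    time.sleep(600)                    # poll every 10 minutes (arbitrary)
    if not getThird(soufang_url):      # False means the page refreshed
        crawSouFang(soufang_url)       # re-crawl and reset `third`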