Python get_xpath_content Exemples, lib.utils.get_xpath_content Python Exemples

Exemple #1

0

Afficher le fichier

def crawl(middleman_type):
    origin_url = "http://www.anjuke.com/sy-city.html"
    city_xpath = ur"//div[@class='city_list']/a/@href"
    # 获取城市url列表
    page_obj = get(origin_url, use_proxy=False)
    if not page_obj:
        logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url))
        return
    city_url_list = get_xpath_content(origin_url, page_obj.text, city_xpath)
    if not city_url_list:
        logging.warning('%s: No city url!' % (middleman_type))
        return None
    # city_url_list = ["http://beijing.anjuke.com/tycoon/"]

    for city_url in city_url_list:
        logging.warning("%s: City page url, url: %s" % (middleman_type, city_url))
        city_url = city_url.rstrip("/")
        # 经纪人的url
        page_url = city_url + "/tycoon/"
        while page_url:
            logging.warning("%s: Get list page url, url: %s" % (middleman_type, page_url))
            page_obj = get(page_url, use_proxy=False)
            if not page_obj:
                logging.warning('%s: Cannot get page. url: %s' % (middleman_type, page_url))
                page_url = None
                continue
            page_res_list, next_page_url = parse_page(city_url, page_obj)
            if next_page_url:
                page_url = next_page_url[0]
            else:
                page_url = None
            res = record_res(page_res_list, middleman_type)
            if not res:
                logging.error("%s: Cannot record res, url: %s" % (middleman_type, page_url))

Exemple #2

0

Afficher le fichier

def crawl(middleman_type):
    origin_url = "http://house.focus.cn/"
    city_xpath = ur"//div[@id='cityArea']/div[@class='bot']//div[@class='cityAreaBoxCen']//a/@href"
    # 获取城市url列表
    page_obj = get(origin_url, use_proxy=False)
    if not page_obj:
        logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url))
        return
    city_url_list = get_xpath_content(origin_url, page_obj.text, city_xpath)
    if not city_url_list:
        logging.warning('%s: No city url!' % (middleman_type))
        return None
    # city_url_list = ["http://beijing.anjuke.com/tycoon/"]

    for city_url in city_url_list:
        logging.warning("%s: City page url, url: %s" % (middleman_type, city_url))
        # 经纪人的url
        url_list = city_url.split('.')
        start_page_url = url_list[0] + ".esf.focus.cn/agent"
        page_url = url_list[0] + ".esf.focus.cn/agent"

        while page_url:
            logging.warning("%s: Get list page url, url: %s" % (middleman_type, page_url))
            page_obj = get(page_url, use_proxy=False)
            if not page_obj:
                logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url))
                page_url = None
                continue
            page_res_list, next_page_url = parse_page(start_page_url, page_obj)
            # print 'next', next_page_url
            if next_page_url:
                page_url = next_page_url[0]
            else:
                page_url = None
            res = record_res(page_res_list, middleman_type)
            if not res:
                logging.error("%s: Cannot record res, url: %s" % (middleman_type, page_url))

Exemple #3

0

Afficher le fichier

def crawl(middleman_type):

    origin_url = "http://bj.5i5j.com/"
    city_xpath = "//div[@class='new_city_more']//a/@href"
    # 获取城市url列表
    time.sleep(2)
    origin_page_obj = get(origin_url, use_proxy=False)
    if not origin_page_obj:
        logging.warning('%s: Cannot get page. url: %s' %
                        (middleman_type, origin_url))
        return
    city_url_list = get_xpath_content(origin_url, origin_page_obj.text,
                                      city_xpath)
    if not city_url_list:
        logging.warning('%s: No city url.' % (middleman_type))
        return None

    # city_url_list = ["http://bj.5i5j.com/"]

    for city_url in city_url_list:

        logging.warning("%s: City page url, url: %s" %
                        (middleman_type, city_url))
        city_url = city_url.rstrip("/")
        city_broker_url = city_url + "/broker"

        logging.warning("%s: Get city page url, url: %s" %
                        (middleman_type, city_broker_url))
        time.sleep(2)
        city_broker_page_obj = get(city_broker_url, use_proxy=False)
        if not city_broker_page_obj:
            logging.warning('%s: Cannot get page. url: %s' %
                            (middleman_type, city_broker_url))
            continue

        if "tj.5i5j" in city_url:
            area_xpath = ur"//ul[@class='search-quyu']/li[1]/a[position()>1]/@href"
            detail_xpath = ur"//li[@class='addressli']/div[@class='shquan quanm']/span/a/@href"
        else:
            area_xpath = ur"//li[@class='quyu_gao']//a[position()>1]/@href"
            detail_xpath = ur"//div[@class='keywords01']/a/@href"

        area_url_list = get_xpath_content(city_url, city_broker_page_obj.text,
                                          area_xpath)
        if not area_url_list:
            logging.warning('%s: No area broker url, info: %s' %
                            (middleman_type, city_broker_url))
            continue

        # 获取具体地点的url列表
        # area_url_list = ["http://bj.5i5j.com/broker/haidian/"]
        for area_url in area_url_list:
            logging.warning("%s: Get area page url, url: %s" %
                            (middleman_type, area_url))
            time.sleep(2)
            area_page_obj = get(area_url, use_proxy=False)
            if not area_page_obj:
                logging.warning('%s: Cannot get page. url: %s' %
                                (middleman_type, area_url))
                continue
            detail_address_broker_list = get_xpath_content(
                city_url, area_page_obj.text, detail_xpath)
            if not detail_address_broker_list:
                logging.warning('%s: No detail address broker url, info: %s' %
                                (middleman_type, area_url))
                continue

            # # 记录
            for detail_address_url in detail_address_broker_list:
                #print 'detail_url', detail_address_url
                while detail_address_url:
                    logging.warning("%s: Get list page url, url: %s" %
                                    (middleman_type, detail_address_url))
                    time.sleep(2)
                    detail_page_obj = get(detail_address_url, use_proxy=False)
                    if not detail_page_obj:
                        logging.warning('%s: Cannot get page. url: %s' %
                                        (middleman_type, detail_address_url))
                        detail_address_url = None
                        continue
                    page_res_list, next_page_url = parse_page(
                        city_url, detail_page_obj)
                    if next_page_url:
                        detail_address_url = next_page_url[0]
                    else:
                        detail_address_url = None
                    #print 'next', detail_address_url
                    res = record_res(page_res_list, middleman_type)
                    if not res:
                        logging.error("%s: Cannot record res, url: %s" %
                                      (middleman_type, detail_address_url))

Exemple #4

0

Afficher le fichier

def crawl(middleman_type):
    origin_url = "http://shijiazhuang.tuitui99.com/"
    city_xpath = ur"//div[@class='city_more']//a/@href"
    # 获取城市url列表
    page_obj = get(origin_url, use_proxy=False)
    if not page_obj:
        logging.warning('%s: Cannot get page. url: %s' %
                        (middleman_type, origin_url))
        return
    city_url_list = get_xpath_content(origin_url, page_obj.text, city_xpath)
    if not city_url_list:
        logging.warning('%s: No city url!' % (middleman_type))
        return None
    # city_url_list = ["http://beijing.anjuke.com/tycoon/"]

    for city_url in city_url_list:
        logging.warning("%s: City page url, url: %s" %
                        (middleman_type, city_url))
        city_url = city_url.rstrip("/")
        city_broker_url = city_url + "/broker"

        logging.warning("%s: Get city page url, url: %s" %
                        (middleman_type, city_broker_url))
        time.sleep(2)
        city_broker_page_obj = get(city_broker_url, use_proxy=False)
        if not city_broker_page_obj:
            logging.warning('%s: Cannot get page. url: %s' %
                            (middleman_type, city_broker_url))
            continue
        area_xpath = ur"//dl[@class='clearfix']/dd/a[position()>1]/@href"
        detail_xpath = ur"//dd[@class='sub_area']/a[position()>1]/@href"

        area_url_list = get_xpath_content(city_url, city_broker_page_obj.text,
                                          area_xpath)
        if not area_url_list:
            logging.warning('%s: No area broker url, info: %s' %
                            (middleman_type, city_broker_url))
            continue

        # 获取具体地点的url列表
        # area_url_list = ["http://bj.5i5j.com/broker/haidian/"]
        for area_url in area_url_list:
            logging.warning("%s: Get area page url, url: %s" %
                            (middleman_type, area_url))
            time.sleep(2)
            area_page_obj = get(area_url, use_proxy=False)
            if not area_page_obj:
                logging.warning('%s: Cannot get page. url: %s' %
                                (middleman_type, area_url))
                continue
            detail_address_broker_list = get_xpath_content(
                city_url, area_page_obj.text, detail_xpath)
            if not detail_address_broker_list:
                logging.warning('%s: No detail address broker url, info: %s' %
                                (middleman_type, area_url))
                continue

            # # 记录
            for detail_address_url in detail_address_broker_list:
                # print 'detail_url', detail_address_url
                first_detail_address_url = detail_address_url
                while detail_address_url:
                    logging.warning("%s: Get list page url, url: %s" %
                                    (middleman_type, detail_address_url))
                    time.sleep(2)
                    detail_page_obj = get(detail_address_url, use_proxy=False)
                    if not detail_page_obj:
                        logging.warning('%s: Cannot get page. url: %s' %
                                        (middleman_type, detail_address_url))
                        detail_address_url = None
                        continue
                    page_res_list, next_page_url = parse_page(
                        city_url, detail_page_obj, first_detail_address_url)
                    if next_page_url:
                        detail_address_url = next_page_url[0]
                    else:
                        detail_address_url = None
                    # print 'next', detail_address_url
                    res = record_res(page_res_list, middleman_type)
                    if not res:
                        logging.error("%s: Cannot record res, url: %s" %
                                      (middleman_type, detail_address_url))

Exemple #5

0

Afficher le fichier

def crawl(middleman_type):
    origin_url = "http://fang.com/SoufunFamily.htm"
    city_xpath = "//div[@class='letterSelt']/div[@id='c01']//a/@href"
    # 获取城市url列表
    time.sleep(2)
    origin_page_obj = get(origin_url, use_proxy=False)
    if not origin_page_obj:
        logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url))
        return
    city_url_list = get_xpath_content(origin_url, origin_page_obj.text, city_xpath)
    if not city_url_list:
        logging.warning('%s: No city url.' % (middleman_type))
        return None

    #city_url_list = ["http://bj.fang.com/"]
    area_xpath = ur"//div[@class='qxName']/a[position()>1]/@href"
    detail_xpath = ur"//p[@id='shangQuancontain']/a[position()>1]/@href"

    for city_url in city_url_list:
        # print 'city',city_url
        logging.warning("%s: City page url, url: %s" % (middleman_type, city_url))
        if city_url == "http://bj.fang.com/":
            city_broker_url = "http://esf.fang.com"
        else:
            re_pattern = ur"^http://(\w+)\.fang\.com/$"
            m = re.search(re_pattern, city_url)
            if m:
                city_abbr = m.group(1)
                city_broker_url = "http://esf." + city_abbr + ".fang.com"

            else:
                continue
        city_broker_url_first = city_broker_url + '/agenthome/'
        logging.warning("%s: Get city page url, url: %s" % (middleman_type, city_broker_url_first))
        time.sleep(2)
        city_broker_page_obj = get(city_broker_url_first, use_proxy=False)
        if not city_broker_page_obj:
            logging.warning('%s: Cannot get page. url: %s' % (middleman_type, city_broker_url_first))
            continue
        area_url_list = get_xpath_content(city_broker_url, city_broker_page_obj.text, area_xpath)
        if not area_url_list:
            logging.warning('%s: No area broker url, info: %s' % (middleman_type, city_broker_url_first))
            continue

        # 获取具体地点的url列表
        # area_url_list = ["http://esf.fang.com/agenthome-a03/-i31-j310/"]
        for area_url in area_url_list:
            # print 'area_url', area_url
            logging.warning("%s: Get area page url, url: %s" % (middleman_type, area_url))
            time.sleep(2)
            area_page_obj = get(area_url, use_proxy=False)
            if not area_page_obj:
                logging.warning('%s: Cannot get page. url: %s' % (middleman_type, area_url))
                continue
            detail_address_broker_list = get_xpath_content(city_broker_url, area_page_obj.text, detail_xpath)
            if not detail_address_broker_list:
                logging.warning('%s: No detail address broker url, info: %s' % (middleman_type, area_url))
                continue

            # # 记录
            # detail_address_broker_list = ['http://esf.fang.com/agenthome-a03-b012384/-i31-j310/']
            for detail_address_url in detail_address_broker_list:
                # print 'detail_url', detail_address_url
                while detail_address_url:
                    logging.warning("%s: Get list page url, url: %s" % (middleman_type, detail_address_url))
                    time.sleep(2)
                    detail_page_obj = get(detail_address_url, use_proxy=False)
                    if not detail_page_obj:
                        logging.warning('%s: Cannot get page. url: %s' % (middleman_type, detail_address_url))
                        detail_address_url = None
                        continue
                    page_res_list, next_page_url = parse_page(city_broker_url, detail_page_obj)
                    if next_page_url:
                        detail_address_url = next_page_url[0]
                    else:
                        detail_address_url = None
                    # print 'next', detail_address_url
                    res = record_res(page_res_list, middleman_type)
                    if not res:
                        logging.error("%s: Cannot record res, url: %s" % (middleman_type, detail_address_url))