def crawl(middleman_type): origin_url = "http://www.anjuke.com/sy-city.html" city_xpath = ur"//div[@class='city_list']/a/@href" # 获取城市url列表 page_obj = get(origin_url, use_proxy=False) if not page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url)) return city_url_list = get_xpath_content(origin_url, page_obj.text, city_xpath) if not city_url_list: logging.warning('%s: No city url!' % (middleman_type)) return None # city_url_list = ["http://beijing.anjuke.com/tycoon/"] for city_url in city_url_list: logging.warning("%s: City page url, url: %s" % (middleman_type, city_url)) city_url = city_url.rstrip("/") # 经纪人的url page_url = city_url + "/tycoon/" while page_url: logging.warning("%s: Get list page url, url: %s" % (middleman_type, page_url)) page_obj = get(page_url, use_proxy=False) if not page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, page_url)) page_url = None continue page_res_list, next_page_url = parse_page(city_url, page_obj) if next_page_url: page_url = next_page_url[0] else: page_url = None res = record_res(page_res_list, middleman_type) if not res: logging.error("%s: Cannot record res, url: %s" % (middleman_type, page_url))
def crawl(middleman_type): origin_url = "http://house.focus.cn/" city_xpath = ur"//div[@id='cityArea']/div[@class='bot']//div[@class='cityAreaBoxCen']//a/@href" # 获取城市url列表 page_obj = get(origin_url, use_proxy=False) if not page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url)) return city_url_list = get_xpath_content(origin_url, page_obj.text, city_xpath) if not city_url_list: logging.warning('%s: No city url!' % (middleman_type)) return None # city_url_list = ["http://beijing.anjuke.com/tycoon/"] for city_url in city_url_list: logging.warning("%s: City page url, url: %s" % (middleman_type, city_url)) # 经纪人的url url_list = city_url.split('.') start_page_url = url_list[0] + ".esf.focus.cn/agent" page_url = url_list[0] + ".esf.focus.cn/agent" while page_url: logging.warning("%s: Get list page url, url: %s" % (middleman_type, page_url)) page_obj = get(page_url, use_proxy=False) if not page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url)) page_url = None continue page_res_list, next_page_url = parse_page(start_page_url, page_obj) # print 'next', next_page_url if next_page_url: page_url = next_page_url[0] else: page_url = None res = record_res(page_res_list, middleman_type) if not res: logging.error("%s: Cannot record res, url: %s" % (middleman_type, page_url))
def crawl(middleman_type): origin_url = "http://bj.5i5j.com/" city_xpath = "//div[@class='new_city_more']//a/@href" # 获取城市url列表 time.sleep(2) origin_page_obj = get(origin_url, use_proxy=False) if not origin_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url)) return city_url_list = get_xpath_content(origin_url, origin_page_obj.text, city_xpath) if not city_url_list: logging.warning('%s: No city url.' % (middleman_type)) return None # city_url_list = ["http://bj.5i5j.com/"] for city_url in city_url_list: logging.warning("%s: City page url, url: %s" % (middleman_type, city_url)) city_url = city_url.rstrip("/") city_broker_url = city_url + "/broker" logging.warning("%s: Get city page url, url: %s" % (middleman_type, city_broker_url)) time.sleep(2) city_broker_page_obj = get(city_broker_url, use_proxy=False) if not city_broker_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, city_broker_url)) continue if "tj.5i5j" in city_url: area_xpath = ur"//ul[@class='search-quyu']/li[1]/a[position()>1]/@href" detail_xpath = ur"//li[@class='addressli']/div[@class='shquan quanm']/span/a/@href" else: area_xpath = ur"//li[@class='quyu_gao']//a[position()>1]/@href" detail_xpath = ur"//div[@class='keywords01']/a/@href" area_url_list = get_xpath_content(city_url, city_broker_page_obj.text, area_xpath) if not area_url_list: logging.warning('%s: No area broker url, info: %s' % (middleman_type, city_broker_url)) continue # 获取具体地点的url列表 # area_url_list = ["http://bj.5i5j.com/broker/haidian/"] for area_url in area_url_list: logging.warning("%s: Get area page url, url: %s" % (middleman_type, area_url)) time.sleep(2) area_page_obj = get(area_url, use_proxy=False) if not area_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, area_url)) continue detail_address_broker_list = get_xpath_content( city_url, area_page_obj.text, detail_xpath) if not detail_address_broker_list: logging.warning('%s: No detail address broker url, info: %s' % (middleman_type, area_url)) continue # # 记录 for detail_address_url in detail_address_broker_list: #print 'detail_url', detail_address_url while detail_address_url: logging.warning("%s: Get list page url, url: %s" % (middleman_type, detail_address_url)) time.sleep(2) detail_page_obj = get(detail_address_url, use_proxy=False) if not detail_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, detail_address_url)) detail_address_url = None continue page_res_list, next_page_url = parse_page( city_url, detail_page_obj) if next_page_url: detail_address_url = next_page_url[0] else: detail_address_url = None #print 'next', detail_address_url res = record_res(page_res_list, middleman_type) if not res: logging.error("%s: Cannot record res, url: %s" % (middleman_type, detail_address_url))
def crawl(middleman_type): origin_url = "http://shijiazhuang.tuitui99.com/" city_xpath = ur"//div[@class='city_more']//a/@href" # 获取城市url列表 page_obj = get(origin_url, use_proxy=False) if not page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url)) return city_url_list = get_xpath_content(origin_url, page_obj.text, city_xpath) if not city_url_list: logging.warning('%s: No city url!' % (middleman_type)) return None # city_url_list = ["http://beijing.anjuke.com/tycoon/"] for city_url in city_url_list: logging.warning("%s: City page url, url: %s" % (middleman_type, city_url)) city_url = city_url.rstrip("/") city_broker_url = city_url + "/broker" logging.warning("%s: Get city page url, url: %s" % (middleman_type, city_broker_url)) time.sleep(2) city_broker_page_obj = get(city_broker_url, use_proxy=False) if not city_broker_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, city_broker_url)) continue area_xpath = ur"//dl[@class='clearfix']/dd/a[position()>1]/@href" detail_xpath = ur"//dd[@class='sub_area']/a[position()>1]/@href" area_url_list = get_xpath_content(city_url, city_broker_page_obj.text, area_xpath) if not area_url_list: logging.warning('%s: No area broker url, info: %s' % (middleman_type, city_broker_url)) continue # 获取具体地点的url列表 # area_url_list = ["http://bj.5i5j.com/broker/haidian/"] for area_url in area_url_list: logging.warning("%s: Get area page url, url: %s" % (middleman_type, area_url)) time.sleep(2) area_page_obj = get(area_url, use_proxy=False) if not area_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, area_url)) continue detail_address_broker_list = get_xpath_content( city_url, area_page_obj.text, detail_xpath) if not detail_address_broker_list: logging.warning('%s: No detail address broker url, info: %s' % (middleman_type, area_url)) continue # # 记录 for detail_address_url in detail_address_broker_list: # print 'detail_url', detail_address_url first_detail_address_url = detail_address_url while detail_address_url: logging.warning("%s: Get list page url, url: %s" % (middleman_type, detail_address_url)) time.sleep(2) detail_page_obj = get(detail_address_url, use_proxy=False) if not detail_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, detail_address_url)) detail_address_url = None continue page_res_list, next_page_url = parse_page( city_url, detail_page_obj, first_detail_address_url) if next_page_url: detail_address_url = next_page_url[0] else: detail_address_url = None # print 'next', detail_address_url res = record_res(page_res_list, middleman_type) if not res: logging.error("%s: Cannot record res, url: %s" % (middleman_type, detail_address_url))
def crawl(middleman_type): origin_url = "http://fang.com/SoufunFamily.htm" city_xpath = "//div[@class='letterSelt']/div[@id='c01']//a/@href" # 获取城市url列表 time.sleep(2) origin_page_obj = get(origin_url, use_proxy=False) if not origin_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url)) return city_url_list = get_xpath_content(origin_url, origin_page_obj.text, city_xpath) if not city_url_list: logging.warning('%s: No city url.' % (middleman_type)) return None #city_url_list = ["http://bj.fang.com/"] area_xpath = ur"//div[@class='qxName']/a[position()>1]/@href" detail_xpath = ur"//p[@id='shangQuancontain']/a[position()>1]/@href" for city_url in city_url_list: # print 'city',city_url logging.warning("%s: City page url, url: %s" % (middleman_type, city_url)) if city_url == "http://bj.fang.com/": city_broker_url = "http://esf.fang.com" else: re_pattern = ur"^http://(\w+)\.fang\.com/$" m = re.search(re_pattern, city_url) if m: city_abbr = m.group(1) city_broker_url = "http://esf." + city_abbr + ".fang.com" else: continue city_broker_url_first = city_broker_url + '/agenthome/' logging.warning("%s: Get city page url, url: %s" % (middleman_type, city_broker_url_first)) time.sleep(2) city_broker_page_obj = get(city_broker_url_first, use_proxy=False) if not city_broker_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, city_broker_url_first)) continue area_url_list = get_xpath_content(city_broker_url, city_broker_page_obj.text, area_xpath) if not area_url_list: logging.warning('%s: No area broker url, info: %s' % (middleman_type, city_broker_url_first)) continue # 获取具体地点的url列表 # area_url_list = ["http://esf.fang.com/agenthome-a03/-i31-j310/"] for area_url in area_url_list: # print 'area_url', area_url logging.warning("%s: Get area page url, url: %s" % (middleman_type, area_url)) time.sleep(2) area_page_obj = get(area_url, use_proxy=False) if not area_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, area_url)) continue detail_address_broker_list = get_xpath_content(city_broker_url, area_page_obj.text, detail_xpath) if not detail_address_broker_list: logging.warning('%s: No detail address broker url, info: %s' % (middleman_type, area_url)) continue # # 记录 # detail_address_broker_list = ['http://esf.fang.com/agenthome-a03-b012384/-i31-j310/'] for detail_address_url in detail_address_broker_list: # print 'detail_url', detail_address_url while detail_address_url: logging.warning("%s: Get list page url, url: %s" % (middleman_type, detail_address_url)) time.sleep(2) detail_page_obj = get(detail_address_url, use_proxy=False) if not detail_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, detail_address_url)) detail_address_url = None continue page_res_list, next_page_url = parse_page(city_broker_url, detail_page_obj) if next_page_url: detail_address_url = next_page_url[0] else: detail_address_url = None # print 'next', detail_address_url res = record_res(page_res_list, middleman_type) if not res: logging.error("%s: Cannot record res, url: %s" % (middleman_type, detail_address_url))