def crawl(middleman_type): origin_url = "http://www.anjuke.com/sy-city.html" city_xpath = ur"//div[@class='city_list']/a/@href" # 获取城市url列表 page_obj = get(origin_url, use_proxy=False) if not page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url)) return city_url_list = get_xpath_content(origin_url, page_obj.text, city_xpath) if not city_url_list: logging.warning('%s: No city url!' % (middleman_type)) return None # city_url_list = ["http://beijing.anjuke.com/tycoon/"] for city_url in city_url_list: logging.warning("%s: City page url, url: %s" % (middleman_type, city_url)) city_url = city_url.rstrip("/") # 经纪人的url page_url = city_url + "/tycoon/" while page_url: logging.warning("%s: Get list page url, url: %s" % (middleman_type, page_url)) page_obj = get(page_url, use_proxy=False) if not page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, page_url)) page_url = None continue page_res_list, next_page_url = parse_page(city_url, page_obj) if next_page_url: page_url = next_page_url[0] else: page_url = None res = record_res(page_res_list, middleman_type) if not res: logging.error("%s: Cannot record res, url: %s" % (middleman_type, page_url))
def init_site(site_info):
    city_code = json.loads(open('init_module/baidu_city_code.txt').read())
    init_urls = []
    for city, query_word in parse_query_file():
        req_url = ("http://waimai.baidu.com/waimai?qt=poisug&wd=%s&"
                   "cb=suggestion_1442286608299&cid=%s&b=&type=0&"
                   "newmap=1&ie=utf-8&callback=jsonp11"
                   % (query_word, city_code[city]))
        resp = getpage.get(req_url, use_proxy=0)
        if not resp or not resp.text:
            continue
        # The response is JSON-P; keep only the JSON object between the first
        # '{' and the last '}'. rfind() returns -1 on failure, so check both
        # positions before slicing and skip malformed responses.
        start_pos = resp.text.find('{')
        end_pos = resp.text.rfind('}')
        if start_pos < 0 or end_pos < 0:
            continue
        text = resp.text[start_pos:end_pos + 1]
        resp_obj = json.loads(text)
        for data in resp_obj.get('s', []):
            data_obj = data.split('$')
            if not data_obj:
                continue
            try:
                address = urllib.quote(data_obj[3].encode('utf-8'))
                lat = data_obj[5].split(',')[0]
                lng = data_obj[5].split(',')[1]
                init_urls.append(
                    "http://waimai.baidu.com/mobile/waimai?qt=shoplist&"
                    "address=%s&lat=%s&lng=%s&page=1&count=20&display=json"
                    % (address, lat, lng))
            except Exception as e:
                print e
                continue
    return init_urls
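# init_site() above unwraps a JSON-P response ("jsonp11({...})") by slicing
# between the first '{' and the last '}'. A reusable version of that unwrap
# (the helper name is ours, not from the source):
def strip_jsonp(text):
    """Return the JSON payload inside a JSON-P wrapper, or None if absent."""
    start = text.find('{')
    end = text.rfind('}')
    if start < 0 or end < start:
        return None
    return text[start:end + 1]

# Usage: resp_obj = json.loads(strip_jsonp(resp.text))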
def get_pagenum(site_info):
    try:
        page = getpage.get(site_info['url'], use_proxy=0)
        page_doc = lxml.html.document_fromstring(page.text)
        # The total page count is carried in a hidden form field.
        pagenum = page_doc.xpath("//input[@name='pageInfo.pageTotal']/@value")[0]
        return pagenum
    except Exception:
        return None
def crawl(middleman_type):
    city_url_list = [
        "http://bj.maitian.cn/bkesf",
        "http://fz.maitian.cn/bkesf",
        "http://xm.maitian.cn/bkesf"
    ]
    for city_url in city_url_list:
        logging.warning("%s: City page url, url: %s" % (middleman_type, city_url))
        page_url = city_url
        while page_url:
            logging.warning("%s: Get list page url, url: %s" % (middleman_type, page_url))
            page_obj = get(page_url, use_proxy=False)
            if not page_obj:
                logging.warning('%s: Cannot get page. url: %s' % (middleman_type, page_url))
                page_url = None
                continue
            page_res_list, next_page_url = parse_page(city_url, page_obj)
            if next_page_url:
                page_url = next_page_url[0]
            else:
                page_url = None
            res = record_res(page_res_list, middleman_type)
            if not res:
                logging.error("%s: Cannot record res, url: %s" % (middleman_type, page_url))
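# From the call sites above, parse_page(base_url, page_obj) returns a tuple
# (result_list, next_page_urls) and record_res(result_list, middleman_type)
# returns a truthy value on success. A sketch of the parse_page() contract,
# assuming per-broker rows and a pager link; both xpaths are placeholders,
# not the site's real ones:
import urlparse

import lxml.html

def parse_page(base_url, page_obj):
    doc = lxml.html.document_fromstring(page_obj.text)
    # Placeholder xpath: one entry per broker row on the list page.
    res_list = doc.xpath("//div[@class='broker-item']")
    # Placeholder xpath: the pager's "next" link, resolved to absolute form.
    next_page = [urlparse.urljoin(base_url, v)
                 for v in doc.xpath("//a[@class='next-page']/@href")]
    return res_list, next_page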
def get_provID(): province_url = "http://www.zto.cn/Scripts/proselect/Places.js" try: pageobj = getpage.get(url=province_url, use_proxy=False) page = json.loads(pageobj.text[:-1].split('=')[1]) except Exception,e: logging.error("download province info failed, msg: %s" %e) return False
def get_provID(): province_url = "http://www.zjs.com.cn/WS_Business/GetPCAData.ashx?province=0&city=0&county=0&companyname=0" try: pageobj = getpage.get(url=province_url, use_proxy=False) page = json.loads(pageobj.text) except Exception, e: logging.error("download province info failed, msg: %s" % e) return False
def test_get_batch(self):
    print 'Batch testing get() w/ proxy...'
    for url in self.urls:
        print '===================================='
        print 'url:\t', url
        p = getpage.get(url)
        self.printPage(p)
        self.assertTrue(type(p) is Page or p is None)
def get_provID(): province_url = "http://www.yto.net.cn/cn/service/map.htm" try: pageobj = getpage.get(url=province_url, use_proxy=False) page = pageobj.text except Exception, e: logging.error("download province info failed, msg: %s" % e) return False
def get_provID(): province_url = "http://www.ztky.com/data/getProvinceData.aspx" try: pageobj = getpage.get(url=province_url, use_proxy=False) page = json.loads(pageobj.text) except Exception,e: logging.error("download province info failed, msg: %s" %e) return False
def get_provid(): prov_url = "http://www.oppo.com/index.php?q=service/oppostore/p/%E5%8C%97%E4%BA%AC/c//g/1" try: page_obj = getpage.get(url=prov_url, use_proxy=False, render_js=True) page_doc = lxml.html.document_fromstring(page_obj.text) prov_ids = [] for val in page_doc.xpath("//ul[@id='province']//a/text()"): prov_ids.append(val) except Exception, e: logging.error("get prov id failed, msg: %s" % e) return []
def get_serviceinfo():
    prov_url = "http://support-cn.samsung.com/support/ServiceLocations.asp"
    try:
        page_obj = getpage.get(url=prov_url, use_proxy=False)
        page_doc = lxml.html.document_fromstring(page_obj.text)
        prov_ids = page_doc.xpath("//ul[@id='ulpro']//a/text()")
        categorys = page_doc.xpath(
            "//ul[@class='product-categories']//label/@value")
    except Exception as e:
        logging.error("get prov id failed, msg: %s" % e)
        return ([], [])
    return (prov_ids, categorys)
def get_phone(phone_id):
    if not phone_id:
        return []
    url = 'http://www.yellowpages.com.eg/en/get-phones/%s/' % phone_id
    page = getpage.get(url, use_proxy=0)
    if not page:
        return []
    page_obj = lxml.html.document_fromstring(page.text)
    phone_doc = page_obj.xpath("//span[@class='detail']/text()")
    phones = []
    for each in phone_doc:
        # Each node may hold several comma-separated numbers.
        phones += [v.strip() for v in each.split(',')]
    return phones
def get_provid(): url = "http://www.byf.com/b2b/list.aspx?p=017000" try: page_obj = getpage.get(url=url, use_proxy=False, render_js=True) page_doc = lxml.html.document_fromstring(page_obj.text) prov_ids = [] for val in page_doc.xpath("//select[@id='ddlProvince']/option/@value"): if val: prov_ids.append(val) except Exception, e: logging.error("get prov id failed, msg: %s" % e) return []
def init_site(site_info):
    init_urls = []
    try:
        page = getpage.get(
            'http://service.sony.com.cn/Maintenance_Station/2518.htm',
            use_proxy=0)
        page_doc = lxml.html.document_fromstring(page.text)
        # Skip the first <option> (likely a "please select" placeholder).
        for v in page_doc.xpath(
                "//select[@class='default']/option[position()>1]/@value"):
            init_urls.append('http://service.sony.com.cn' + v)
    except Exception:
        return []
    return init_urls
def get_cityid():
    city_url = "http://www.wo116114.com/"
    try:
        page_obj = getpage.get(url=city_url, use_proxy=False, render_js=True)
        page_doc = lxml.html.document_fromstring(page_obj.text)
        city_ids = []
        for val in page_doc.xpath(
                "//div[@class='pro_nr02']//ul/li/a/@onclick"):
            # Pull the first quoted argument out of the onclick handler.
            city_id = val[val.find('\'') + 1:val.find(',') - 1]
            city_ids.append(city_id)
    except Exception as e:
        logging.error("get city id failed, msg: %s" % e)
        return []
    return city_ids
def get_cityid(prov_id):
    city_url = "http://support-cn.samsung.com/support/ServiceLocations-ajax.asp?v=%s&act=3" % (
        urllib2.quote(prov_id.encode('utf-8')))
    try:
        page = getpage.get(url=city_url, use_proxy=False).text
        city_ids = []
        # The ajax endpoint returns comma-separated, double-quoted values.
        for val in page.split(","):
            start_pos = val.find('"')
            end_pos = val.find('"', start_pos + 1)
            city_ids.append(val[start_pos + 1:end_pos])
    except Exception as e:
        logging.error("get city id failed, msg: %s" % e)
        return []
    return city_ids
def crawl(middleman_type): origin_url = "http://house.focus.cn/" city_xpath = ur"//div[@id='cityArea']/div[@class='bot']//div[@class='cityAreaBoxCen']//a/@href" # 获取城市url列表 page_obj = get(origin_url, use_proxy=False) if not page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url)) return city_url_list = get_xpath_content(origin_url, page_obj.text, city_xpath) if not city_url_list: logging.warning('%s: No city url!' % (middleman_type)) return None # city_url_list = ["http://beijing.anjuke.com/tycoon/"] for city_url in city_url_list: logging.warning("%s: City page url, url: %s" % (middleman_type, city_url)) # 经纪人的url url_list = city_url.split('.') start_page_url = url_list[0] + ".esf.focus.cn/agent" page_url = url_list[0] + ".esf.focus.cn/agent" while page_url: logging.warning("%s: Get list page url, url: %s" % (middleman_type, page_url)) page_obj = get(page_url, use_proxy=False) if not page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url)) page_url = None continue page_res_list, next_page_url = parse_page(start_page_url, page_obj) # print 'next', next_page_url if next_page_url: page_url = next_page_url[0] else: page_url = None res = record_res(page_res_list, middleman_type) if not res: logging.error("%s: Cannot record res, url: %s" % (middleman_type, page_url))
def test_quick(self):
    url = 'http://www.cnbeta.com/articles/372623.htm'
    print 'Quick test'
    print '===================================='
    print 'PX: N, FP: N, JS: N'
    p = getpage.get(url, use_proxy=False, fpfirst=False, render_js=False)
    self.printPage(p)
    print '===================================='
    print 'PX: N, FP: Y, JS: N'
    p = getpage.get(url, use_proxy=False, fpfirst=True, render_js=False)
    self.printPage(p)
    print '===================================='
    print 'PX: N, FP: Y, JS: Y'
    p = getpage.get(url, use_proxy=False, fpfirst=True, render_js=True)
    self.printPage(p)
    print '===================================='
    print 'PX: Y, FP: N, JS: N'
    # fpfirst must be False to match the 'FP: N' label.
    p = getpage.get(url, use_proxy=True, fpfirst=False, render_js=False)
    self.printPage(p)
    print '===================================='
    print 'PX: Y, FP: Y, JS: Y'
    p = getpage.get(url, use_proxy=True, fpfirst=True, render_js=True)
    self.printPage(p)
def test_get(self):
    print '===================================='
    print '===================================='
    print '===================================='
    print 'Testing get()...'
    print '===================================='
    url = 'http://www.cnbeta.com/articles/372623.htm'
    print '===================================='
    p = getpage.get(url, retry=1, use_proxy=False, fpfirst=False, render_js=False)
    self.printPage(p)
    print '===================================='
    p = getpage.get(url, retry=1, use_proxy=False, fpfirst=True, render_js=False)
    self.printPage(p)
    print '===================================='
    p = getpage.get(url, retry=1, use_proxy=False, fpfirst=True, render_js=True)
    self.printPage(p)
    print '===================================='
    p = getpage.get(url, retry=2, use_proxy=True, fpfirst=False, render_js=False)
    self.printPage(p)
    print '===================================='
    p = getpage.get(url, retry=2, use_proxy=True, fpfirst=True, render_js=False)
    self.printPage(p)
    print '===================================='
    p = getpage.get(url, retry=2, use_proxy=True, fpfirst=True, render_js=True)
    self.printPage(p)
    print '===================================='
def crawl(middleman_type): origin_url = "http://bj.5i5j.com/" city_xpath = "//div[@class='new_city_more']//a/@href" # 获取城市url列表 time.sleep(2) origin_page_obj = get(origin_url, use_proxy=False) if not origin_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url)) return city_url_list = get_xpath_content(origin_url, origin_page_obj.text, city_xpath) if not city_url_list: logging.warning('%s: No city url.' % (middleman_type)) return None # city_url_list = ["http://bj.5i5j.com/"] for city_url in city_url_list: logging.warning("%s: City page url, url: %s" % (middleman_type, city_url)) city_url = city_url.rstrip("/") city_broker_url = city_url + "/broker" logging.warning("%s: Get city page url, url: %s" % (middleman_type, city_broker_url)) time.sleep(2) city_broker_page_obj = get(city_broker_url, use_proxy=False) if not city_broker_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, city_broker_url)) continue if "tj.5i5j" in city_url: area_xpath = ur"//ul[@class='search-quyu']/li[1]/a[position()>1]/@href" detail_xpath = ur"//li[@class='addressli']/div[@class='shquan quanm']/span/a/@href" else: area_xpath = ur"//li[@class='quyu_gao']//a[position()>1]/@href" detail_xpath = ur"//div[@class='keywords01']/a/@href" area_url_list = get_xpath_content(city_url, city_broker_page_obj.text, area_xpath) if not area_url_list: logging.warning('%s: No area broker url, info: %s' % (middleman_type, city_broker_url)) continue # 获取具体地点的url列表 # area_url_list = ["http://bj.5i5j.com/broker/haidian/"] for area_url in area_url_list: logging.warning("%s: Get area page url, url: %s" % (middleman_type, area_url)) time.sleep(2) area_page_obj = get(area_url, use_proxy=False) if not area_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, area_url)) continue detail_address_broker_list = get_xpath_content( city_url, area_page_obj.text, detail_xpath) if not detail_address_broker_list: logging.warning('%s: No detail address broker url, info: %s' % (middleman_type, area_url)) continue # # 记录 for detail_address_url in detail_address_broker_list: #print 'detail_url', detail_address_url while detail_address_url: logging.warning("%s: Get list page url, url: %s" % (middleman_type, detail_address_url)) time.sleep(2) detail_page_obj = get(detail_address_url, use_proxy=False) if not detail_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, detail_address_url)) detail_address_url = None continue page_res_list, next_page_url = parse_page( city_url, detail_page_obj) if next_page_url: detail_address_url = next_page_url[0] else: detail_address_url = None #print 'next', detail_address_url res = record_res(page_res_list, middleman_type) if not res: logging.error("%s: Cannot record res, url: %s" % (middleman_type, detail_address_url))
def crawl(middleman_type): origin_url = "http://shijiazhuang.tuitui99.com/" city_xpath = ur"//div[@class='city_more']//a/@href" # 获取城市url列表 page_obj = get(origin_url, use_proxy=False) if not page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url)) return city_url_list = get_xpath_content(origin_url, page_obj.text, city_xpath) if not city_url_list: logging.warning('%s: No city url!' % (middleman_type)) return None # city_url_list = ["http://beijing.anjuke.com/tycoon/"] for city_url in city_url_list: logging.warning("%s: City page url, url: %s" % (middleman_type, city_url)) city_url = city_url.rstrip("/") city_broker_url = city_url + "/broker" logging.warning("%s: Get city page url, url: %s" % (middleman_type, city_broker_url)) time.sleep(2) city_broker_page_obj = get(city_broker_url, use_proxy=False) if not city_broker_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, city_broker_url)) continue area_xpath = ur"//dl[@class='clearfix']/dd/a[position()>1]/@href" detail_xpath = ur"//dd[@class='sub_area']/a[position()>1]/@href" area_url_list = get_xpath_content(city_url, city_broker_page_obj.text, area_xpath) if not area_url_list: logging.warning('%s: No area broker url, info: %s' % (middleman_type, city_broker_url)) continue # 获取具体地点的url列表 # area_url_list = ["http://bj.5i5j.com/broker/haidian/"] for area_url in area_url_list: logging.warning("%s: Get area page url, url: %s" % (middleman_type, area_url)) time.sleep(2) area_page_obj = get(area_url, use_proxy=False) if not area_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, area_url)) continue detail_address_broker_list = get_xpath_content( city_url, area_page_obj.text, detail_xpath) if not detail_address_broker_list: logging.warning('%s: No detail address broker url, info: %s' % (middleman_type, area_url)) continue # # 记录 for detail_address_url in detail_address_broker_list: # print 'detail_url', detail_address_url first_detail_address_url = detail_address_url while detail_address_url: logging.warning("%s: Get list page url, url: %s" % (middleman_type, detail_address_url)) time.sleep(2) detail_page_obj = get(detail_address_url, use_proxy=False) if not detail_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, detail_address_url)) detail_address_url = None continue page_res_list, next_page_url = parse_page( city_url, detail_page_obj, first_detail_address_url) if next_page_url: detail_address_url = next_page_url[0] else: detail_address_url = None # print 'next', detail_address_url res = record_res(page_res_list, middleman_type) if not res: logging.error("%s: Cannot record res, url: %s" % (middleman_type, detail_address_url))
if len(sys.argv) < 3:
    print "Usage: python %s url task_type" % sys.argv[0]
    exit()
url = sys.argv[1]
task_type = sys.argv[2]

task_obj = task_mongo.Task()
task_conf = task_obj.get_task_conf(task_type)['task_conf']
if not task_conf:
    exit()

print url
page_obj = getpage.get(url, use_proxy=False)
if not page_obj:
    print "get page failed"
    exit()

tasks, items = conf_parser.parse(page_obj.text, url, task_conf, page_obj.encoding)
print "task num: %d" % len(tasks)
for task_type, url in tasks:
    print task_type, url
print "------------------------------------------------------------------------"
print "item num: %d" % len(items)
for item in items:
    item_str = json.dumps(item, ensure_ascii=False).encode("utf-8")
    print item_str
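# Example invocation of the debug driver above (script name, URL and task
# type are placeholders, not from the source):
#
#   python parse_debug.py "http://example.com/list.html" news_list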
def crawl(middleman_type): origin_url = "http://fang.com/SoufunFamily.htm" city_xpath = "//div[@class='letterSelt']/div[@id='c01']//a/@href" # 获取城市url列表 time.sleep(2) origin_page_obj = get(origin_url, use_proxy=False) if not origin_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, origin_url)) return city_url_list = get_xpath_content(origin_url, origin_page_obj.text, city_xpath) if not city_url_list: logging.warning('%s: No city url.' % (middleman_type)) return None #city_url_list = ["http://bj.fang.com/"] area_xpath = ur"//div[@class='qxName']/a[position()>1]/@href" detail_xpath = ur"//p[@id='shangQuancontain']/a[position()>1]/@href" for city_url in city_url_list: # print 'city',city_url logging.warning("%s: City page url, url: %s" % (middleman_type, city_url)) if city_url == "http://bj.fang.com/": city_broker_url = "http://esf.fang.com" else: re_pattern = ur"^http://(\w+)\.fang\.com/$" m = re.search(re_pattern, city_url) if m: city_abbr = m.group(1) city_broker_url = "http://esf." + city_abbr + ".fang.com" else: continue city_broker_url_first = city_broker_url + '/agenthome/' logging.warning("%s: Get city page url, url: %s" % (middleman_type, city_broker_url_first)) time.sleep(2) city_broker_page_obj = get(city_broker_url_first, use_proxy=False) if not city_broker_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, city_broker_url_first)) continue area_url_list = get_xpath_content(city_broker_url, city_broker_page_obj.text, area_xpath) if not area_url_list: logging.warning('%s: No area broker url, info: %s' % (middleman_type, city_broker_url_first)) continue # 获取具体地点的url列表 # area_url_list = ["http://esf.fang.com/agenthome-a03/-i31-j310/"] for area_url in area_url_list: # print 'area_url', area_url logging.warning("%s: Get area page url, url: %s" % (middleman_type, area_url)) time.sleep(2) area_page_obj = get(area_url, use_proxy=False) if not area_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, area_url)) continue detail_address_broker_list = get_xpath_content(city_broker_url, area_page_obj.text, detail_xpath) if not detail_address_broker_list: logging.warning('%s: No detail address broker url, info: %s' % (middleman_type, area_url)) continue # # 记录 # detail_address_broker_list = ['http://esf.fang.com/agenthome-a03-b012384/-i31-j310/'] for detail_address_url in detail_address_broker_list: # print 'detail_url', detail_address_url while detail_address_url: logging.warning("%s: Get list page url, url: %s" % (middleman_type, detail_address_url)) time.sleep(2) detail_page_obj = get(detail_address_url, use_proxy=False) if not detail_page_obj: logging.warning('%s: Cannot get page. url: %s' % (middleman_type, detail_address_url)) detail_address_url = None continue page_res_list, next_page_url = parse_page(city_broker_url, detail_page_obj) if next_page_url: detail_address_url = next_page_url[0] else: detail_address_url = None # print 'next', detail_address_url res = record_res(page_res_list, middleman_type) if not res: logging.error("%s: Cannot record res, url: %s" % (middleman_type, detail_address_url))