def start_download(site_url):
    """Start one download thread per gallery link found on ``site_url``.

    The page is loaded with ``pqr``.  The text of its ``.post-title`` element
    names the target folder, which is created under the hard-coded root when
    it does not exist yet.  Every ``.gallery-icon a`` link is paired by
    position with a ``.gallery-caption`` entry; links that have no caption at
    the same position are given an empty caption.

    The module-level counters ``img_idx`` and ``img_cnt`` are reset to zero,
    and ``img_cnt`` is then set to the number of links found.  The worker
    threads run ``download_img(img_url, caption, save_path)``; they are
    started here but not joined.
    """
    global img_idx, img_cnt

    print("Start download: " + site_url.encode('utf-8'))
    img_idx = img_cnt = 0

    # Alternative root that was used for local testing:
    #   "/Users/nangua/Desktop/test/"
    root_path = u"/var/storage/图片收集/maldiveschina/"

    page = pqr(url=site_url)
    post_title = page(".post-title").text()
    save_path = os.path.join(root_path, post_title)
    print("Create folder: " + save_path.encode('utf-8'))
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    links = page(".gallery-icon a")
    captions = page(".gallery-caption")
    workers = []
    img_cnt = len(links)

    for position, link in enumerate(links):
        img_url = pqr(link).attr("href")
        # Captions are matched by index; there may be fewer captions than links.
        has_caption = position < len(captions)
        caption = pqr(captions[position]).text() if has_caption else ""
        thread = Thread(
            target=download_img,
            args=(img_url, caption, save_path),
        )
        workers.append(thread)
        thread.start()
def start_download(site_url):
    """Start one download thread per thumbnail image found on ``site_url``.

    The page is loaded with ``pqr``.  The folder name is the text of the
    hotel-name label element with its first and last characters dropped; the
    folder is created under the hard-coded root when it does not exist yet.

    Each thumbnail ``src`` has ``s=116x88`` replaced by ``s=800x600`` before
    it is passed to ``download_img(img_url, save_path)``.  The module-level
    counters ``img_idx`` and ``img_cnt`` are reset to zero, and ``img_cnt`` is
    then set to the number of thumbnails found.  Threads are started but not
    joined here.
    """
    global img_idx, img_cnt

    print("Start download: " + site_url.encode('utf-8'))
    img_idx = img_cnt = 0

    # Alternative root that was used for local testing:
    #   "/Users/nangua/Desktop/test"
    root_path = u"/var/storage/图片收集/agoda/"

    page = pqr(url=site_url)
    name_label = page("#ctl00_ctl00_MainContent_ContentMain_HotelHeaderHD_lblEHotelName")
    # The slice drops the first and last character of the label text.
    save_path = os.path.join(root_path, name_label.text()[1:-1])
    print("Create folder: " + save_path.encode('utf-8'))
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # A previous variant of the thumbnail selector carried an "AB2659" suffix
    # on each id segment; the selector below is the one currently in use.
    thumbnails = page("#ctl00_ctl00_MainContent_ContentMain_MainHotelPhotoHD_ThumbPhotosHD_dtlPhoto img")
    workers = []
    img_cnt = len(thumbnails)

    for thumb in thumbnails:
        img_url = pqr(thumb).attr("src")
        # Swap the thumbnail size token in the URL for the 800x600 one.
        large_url = img_url.replace("s=116x88", "s=800x600")
        thread = Thread(target=download_img, args=(large_url, save_path))
        workers.append(thread)
        thread.start()
def start_download(site_url):
    """Start one download thread per ``#pikame img`` image on ``site_url``.

    The page is loaded with ``pqr``.  The text of its ``#header h1`` element
    names the target folder, which is created under the hard-coded root when
    it does not exist yet.

    The module-level counters ``img_idx`` and ``img_cnt`` are reset to zero,
    and ``img_cnt`` is then set to the number of images found.  The worker
    threads run ``download_img(img_url, save_path)``; they are started here
    but not joined.
    """
    global img_idx, img_cnt

    print("Start download: " + site_url.encode('utf-8'))
    img_idx = img_cnt = 0

    # Alternative root, currently disabled:
    #   u"/var/storage/图片收集/jetwing/"
    root_path = "/Users/nangua/Desktop/test/"

    page = pqr(url=site_url)
    heading = page("#header h1").text()
    save_path = os.path.join(root_path, heading)
    print("Create folder: " + save_path.encode('utf-8'))
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    images = page("#pikame img")
    workers = []
    img_cnt = len(images)

    for image in images:
        img_url = pqr(image).attr("src")
        thread = Thread(target=download_img, args=(img_url, save_path))
        workers.append(thread)
        thread.start()
def scan_page(site_url, folder_name):
    """Download every picture listed on one result page and wait for them.

    The current page number is read from the module-level ``page_no``; it is
    neither a parameter nor assigned here.  Pictures go into the sub-folder
    ``<folder_name>/<page_no>``, created with ``os.mkdir`` when missing.

    For each ``#px div.container div.d4`` entry, the picture id is the last
    ``/``-separated segment of the photo link, and the info, title and rating
    texts are collected as UTF-8 byte strings.  One ``download_pic`` thread is
    started per entry with a 1-based index, and all threads are joined before
    returning.

    :param site_url: URL of the listing page to load with ``pqr``.
    :param folder_name: parent folder for the per-page sub-folder.
    """
    print("Searching pictures at Page " + str(page_no) + " ...")

    folder_name = folder_name + "/" + str(page_no)
    if not os.path.exists(folder_name):
        os.mkdir(folder_name)

    listing = pqr(url=site_url)
    entries = listing("#px div.container div.d4")
    threads = []

    for number, entry in enumerate(entries, 1):

        def text_of(selector):
            # UTF-8 encoded text of the sub-element matched inside this entry.
            return pqr(pqr(entry).find(selector)).text().encode('utf-8')

        link = pqr(pqr(entry).find("div.photo a")).attr("href")
        pic_id = link.rsplit("/", 1)[-1]
        pic_info = {
            "info": text_of("div.info a"),
            "title": text_of("div.title a"),
            "rating": text_of("div.rating"),
        }
        thread = Thread(
            target=download_pic,
            args=(pic_id, pic_info, number, page_no, folder_name),
        )
        threads.append(thread)
        thread.start()

    # Block until every download started for this page has finished.
    for thread in threads:
        thread.join()
def download_pic(pic_id, pic_info, pic_idx, page_idx, folder_name):
    """Download one photo and write its metadata next to it.

    The photo page ``http://500px.com/photo/<pic_id>`` is loaded with ``pqr``
    and the ``src`` of its ``div.photo.segment img`` element is fetched with
    ``urllib.urlretrieve`` into ``<folder_name>/<pic_id>.<ext>``, where
    ``<ext>`` is whatever follows the last ``.`` in the image URL.  A
    companion ``<folder_name>/<pic_id>.txt`` file receives the info, title
    and rating lines.

    The module-level counter ``pic_no`` is incremented and a progress line is
    printed once both files have been written.

    :param pic_id: photo id, used in the page URL and in both file names.
    :param pic_info: mapping with the keys ``"info"``, ``"title"`` and
        ``"rating"``; the values are concatenated with plain strings.
    :param pic_idx: index of the picture, used only in the progress message.
    :param page_idx: page number, used only in the progress message.
    :param folder_name: existing folder that receives both files.
    """
    global pic_no

    pic_doc = pqr(url="http://500px.com/photo/" + pic_id)
    pic_url = pqr(pic_doc("div.photo.segment img")).attr("src")

    base_path = folder_name + "/" + pic_id
    urllib.urlretrieve(pic_url, base_path + "." + pic_url.split(".")[-1])

    # The context manager closes the file even when a write raises (for
    # example on a missing key in pic_info), so the handle cannot leak.
    with open(base_path + ".txt", "w") as info_file:
        info_file.writelines([
            "info: " + pic_info["info"] + "\n",
            "title: " + pic_info["title"] + "\n",
            "rating: " + pic_info["rating"] + "\n",
        ])

    # NOTE(review): this increment is not protected by a lock, and scan_page
    # runs this function in several threads at once, so the printed total
    # may be inexact -- confirm whether an exact count matters.
    pic_no += 1
    print("Download complete: " + str(pic_idx) + " at Page " + str(page_idx) + ", total: " + str(pic_no))
def get_path(site_url):
    """Return ``"booking/"`` followed by the page's ``#hp_hotel_name`` text.

    The page at ``site_url`` is loaded with ``pqr`` on every call.
    """
    page = pqr(url=site_url)
    name_text = page("#hp_hotel_name").text()
    return "booking/" + name_text
def get_path(site_url):
    """Return ``"maldiveschina/"`` followed by the ``#hp_hotel_name`` text.

    The page at ``site_url`` is loaded with ``pqr`` on every call.
    """
    page = pqr(url=site_url)
    name_text = page("#hp_hotel_name").text()
    return "maldiveschina/" + name_text
def get_path(site_url):
    """Return ``"booking/"`` followed by the page's ``#header h1`` text.

    The page at ``site_url`` is loaded with ``pqr`` on every call.
    """
    page = pqr(url=site_url)
    heading_text = page("#header h1").text()
    return "booking/" + heading_text