def downloadImgList(pageID, imgList, homePath):
    """
    Download every image URL in ``imgList`` into ``<homePath>/<pageID>/``.

    :param pageID: integer page identifier; formatted with ``%d`` to name the
        sub-directory
    :param imgList: iterable of image URLs
    :param homePath: base directory under which the page directory is created
    :raises OSError: if the target directory does not exist and cannot be
        created
    """
    mkpath = homePath + "/%d/" % pageID
    try:
        os.makedirs(mkpath)
    except OSError:
        # An already-existing directory is the expected, harmless case.
        # Anything else (permissions, a regular file in the way, ...) would
        # make every later write fail, so surface it here instead of hiding it.
        if not os.path.isdir(mkpath):
            raise
    for imgURL in imgList:
        # File name = the last two "/"- or "."-separated pieces of the URL,
        # joined with a dot (e.g. ".../pics/cover.jpg" -> "cover.jpg").
        fn = ".".join(re.findall(r'[^/.]+', imgURL)[-2:])
        write_raw("%s%s" % (mkpath, fn), urlread2(imgURL))
def get_novel_list_count(block_url_head):
    """
    Get the number of pages of a novel list from its parent (section) URL.

    The count is read from the href of the last ``<a>`` inside the
    ``div.pageNav.px19`` pagination block: the digits directly in front of
    ".html" are returned as an integer.

    :param block_url_head: URL of the section page to fetch
    :return: the number embedded in the last pagination link (int)
    :raises IndexError: if the pagination block has no links, or the href
        contains no "<digits>.html" part
    """
    data = urlread2(block_url_head)
    soup = BeautifulSoup(data, XML_decoder)
    end_page_url = soup.find("div", attrs={"class": "pageNav px19"}).find_all("a")[-1].get("href")
    # The dot is escaped and at least one digit is required, so a path
    # segment such as "/html/" can no longer be mistaken for the page number.
    # int() replaces string.atoi, which no longer exists in Python 3.
    end_index = int(re.findall(r"(\d+)\.html", end_page_url)[0])
    return end_index
def get_novel(novel_url):
    """
    Fetch a novel article page and extract its title and body text.

    :param novel_url: URL of the novel article
    :return: ``(novel_title, novel_text)`` tuple
    """
    page = urlread2(novel_url)
    tree = BeautifulSoup(page, XML_decoder)

    # Body text: the first <td> of the first <tr> under the first <tbody>.
    body_cell = tree.find("tbody").tr.td
    text = body_cell.getText()

    # Title: the <font color="#000"> inside the third "layout mt10" div.
    layout_divs = tree.find_all("div", attrs={"class": "layout mt10"})
    title_tag = layout_divs[2].find("font", attrs={"color": "#000"})
    title = title_tag.getText()

    return title, text
def get_novel_list(block_url_head, index):
    """
    Get the novel URLs listed on one page of a section.

    The original note says a page usually holds 25 entries.

    :param block_url_head: section URL ending in "/"; both ``http://`` and
        ``https://`` URLs are accepted
    :param index: zero-based page index; 0 fetches ``block_url_head`` itself,
        any other value fetches ``index_<index + 1>.html`` below it
    :return: list of novel URLs (scheme and host prepended to each link's href)
    :raises IndexError: if ``block_url_head`` has no "scheme://host/" prefix
    """
    # Scheme + host without the trailing slash; page hrefs are appended to it.
    host = re.findall(r"https?://.*?/", block_url_head)[0][:-1]
    if index != 0:
        block_url_head += "index_%d.html" % (index + 1)
    data = urlread2(block_url_head)
    soup = BeautifulSoup(data, XML_decoder)
    url_soup_list = soup.find("ul", attrs={"class": "textList"}).find_all("a")
    return [host + url_soup.get("href") for url_soup in url_soup_list]
def getSoup(url):
    """
    Fetch ``url`` and return both the raw payload and its parsed soup.

    :param url: address passed to ``urlread2``
    :return: ``(page_data, soup)`` tuple
    """
    raw = urlread2(url)
    parsed = BeautifulSoup(raw, XML_DECODER)
    return raw, parsed
def get_soup(url):
    """
    Fetch ``url`` and return it parsed as a BeautifulSoup object.

    :param url: address passed to ``urlread2``
    :return: soup built from the fetched data with ``XML_DECODER``
    """
    raw = urlread2(url)
    return BeautifulSoup(raw, XML_DECODER)
def get_data(url):
    """
    Fetch ``url`` and return the result of ``urlread2`` unchanged.

    :param url: address passed to ``urlread2``
    :return: whatever ``urlread2`` returns for ``url``
    """
    payload = urlread2(url)
    return payload
def get_mail_data(id):
    """
    Fetch one mail from the WikiLeaks DNC email archive.

    :param id: Integer index of the mail (original note: up to 22456)
    :return: result of ``urlread2`` for the mail URL (email text, per the
        original note)
    """
    mail_url = "https://wikileaks.org/dnc-emails//get/%d" % id
    return urlread2(mail_url)
def get_book_info(book_id):
    """
    Fetch a book record from the Douban v2 book API.

    :param book_id: Index of the book, formatted into the URL with ``%s``
    :return: result of ``urlread2`` for the API URL (a JSON string, per the
        original note)
    """
    api_url = "https://api.douban.com/v2/book/%s" % book_id
    return urlread2(api_url)
def get_movie_info(movie_id):
    """
    Fetch a movie record from the Douban v2 movie-subject API.

    :param movie_id: Index of the movie, formatted into the URL with ``%s``
    :return: result of ``urlread2`` for the API URL (a JSON string, per the
        original note)
    """
    api_url = "https://api.douban.com/v2/movie/subject/%s" % movie_id
    return urlread2(api_url)