def write_to_text(params):
    """Collect article links from paginated listing pages and save them.

    params keys:
        start_page / end_page -- page range fed into the URL template
        select_str            -- CSS selector for the link anchors
        down_url              -- URL template containing a ``%i`` page slot

    Links already recorded in the first "text" file of the cwd are skipped;
    new links are appended there and also written to date-stamped batch
    files that roll over every 500 links.

    NOTE(review): relies on module-level globals ``cur_dir`` and
    ``pre_url`` — confirm they are defined before calling.
    """
    temp = 0
    start_page = params['start_page']
    end_page = params['end_page']
    select_str = params['select_str']
    down_url = params['down_url']
    # First "text" file in the cwd records links already saved.
    done_down_text = common.get_file_name_list(os.getcwd(), "text")[0]
    print(done_down_text)
    # BUG FIX: mode 'a+' positions the stream at EOF, so the original
    # read() always returned '' and duplicate detection never worked;
    # seek(0) before reading.  A set also makes membership tests O(1)
    # and lets us dedup links found within the same run.
    with open(done_down_text, 'a+') as fileObj:
        fileObj.seek(0)
        seen = set(fileObj.read().splitlines())
    for i in range(start_page, end_page):
        soup = common.get_beauty_soup(down_url % i)
        for item in soup.select(select_str):
            fileUrl = item.get('href')
            # BUG FIX: the original concatenated 'fileUrl:' + fileUrl
            # before the None check, raising TypeError on href-less tags.
            if fileUrl is None:
                continue
            print('fileUrl:' + fileUrl)
            if fileUrl.find('.html') >= 0 and fileUrl not in seen:
                seen.add(fileUrl)
                os.chdir(cur_dir)
                # Record the link as downloaded (保存已下载连接).
                with open(done_down_text, 'a+') as ff:
                    ff.write(fileUrl + '\n')
                fileUrl = pre_url + fileUrl
                temp += 1
                print("fileUrl:" + fileUrl)
                # Append the absolute link to a date-stamped batch file,
                # rolling over every 500 links (保存下载连接).
                with open(
                        common.get_datetime('%Y-%m-%d') + '_' +
                        str(temp // 500) + '.txt', 'a+') as f:
                    f.write(fileUrl + '\n')
def get_child_img_url(url):
    """Fetch the forum post at *url* and return a de-duplicated list of
    image / attachment elements found in the known spots of its markup.
    """
    soup = common.get_beauty_soup(url)
    # The post body can carry images/attachments at several depths;
    # probe each known location in turn (order preserved from the
    # original implementation).
    selectors = (
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] table tr td img[file]",
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] div div table tr td img[file]",
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] div div div[class='postattachlist'] dl dd p img[file]",
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] div div table tbody tr td a[href]",
    )
    collected = []
    for selector in selectors:
        collected.extend(soup.select(selector))
    return common.list_distinct(collected)
def down_noval(params):
    """Download novel text for every link listed in cur_dir's .txt files.

    params keys:
        down_file_path -- base output directory (cur_month is appended)
        select_str     -- CSS selector for the text-content nodes
        require_pre_url-- when True, prefix each line with params['pre_url']

    Lines whose selector matches nothing are logged to a dated
    '_未下载.txt' file for later retry.

    NOTE(review): relies on module-level globals ``cur_dir`` and
    ``cur_month`` — confirm they are defined before calling.
    """
    # 文件下载路径
    down_path = params['down_file_path'] + cur_month
    select_str = params['select_str']
    require_pre_url = params['require_pre_url']
    if require_pre_url:
        pre_url_ = params['pre_url']
    if not os.path.exists(down_path):
        os.makedirs(down_path)
    # BUG FIX: the original mixed os.chdir(os.getcwd()) (a no-op) with
    # os.chdir(down_path), so the '_未下载.txt' log landed in whichever
    # directory the previous iteration left the process in.  Build all
    # output paths explicitly instead of mutating the process cwd.
    miss_file = os.path.join(
        cur_dir, common.get_datetime('%Y-%m-%d') + '_未下载.txt')
    file_name_list = common.get_file_name_list(cur_dir, 'txt')
    for index, file_name in enumerate(file_name_list, 1):
        print('下载第 %i 个文件: %s' % (index, file_name))
        # 打开文件
        with open(file_name) as file_obj:
            for num, value in enumerate(file_obj, 1):
                line = value.strip('\n')
                if require_pre_url:
                    line = pre_url_ + line
                print('第 %i 行: %s' % (num, line))
                soup = common.get_beauty_soup(line)
                title = common.replace_special_char(soup.title.string)
                newTitle = title.split('www')[-1]
                print(str(newTitle.strip()))
                textContent = soup.select(select_str)
                print('text数量:' + str(len(textContent)))
                if len(textContent) == 0:
                    # Selector matched nothing: record the line for retry.
                    with open(miss_file, 'a+') as f:
                        f.write('第' + str(num) + '行:' + line + ',' +
                                newTitle + '\n')
                else:
                    out_file = os.path.join(down_path,
                                            newTitle.strip() + '.txt')
                    with open(out_file, 'a+', encoding='utf8') as f:
                        for content in textContent:
                            f.write(content.text + '\n')
    print("-----down over----------------")
def get_jh_img_url_list(line):
    """Fetch the forum post at *line* and return ``[elements, title]``:
    a de-duplicated list of image / attachment elements plus the page
    title (after ``common.replace_sub`` cleanup).
    """
    soup = common.get_beauty_soup(line)
    new_title = common.replace_sub(soup.title.string)
    # Probe the known attachment locations in the post markup, in the
    # same order the original implementation used.
    selectors = (
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] table tr td img[file]",
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] div div table tr td img[file]",
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] div div div[class='postattachlist'] dl dd p img[file]",
        "body div[id='wrap'] div[id='postlist'] div[id] table tr td[class='postcontent'] div[class='defaultpost'] div div table tbody tr td a[href]",
    )
    collected = []
    for selector in selectors:
        collected.extend(soup.select(selector))
    new_list = common.list_distinct(collected)
    print('去重后图片数量:' + str(len(new_list)))
    return [new_list, new_title]
import os

import common

# Crawl the paginated "情感小说" listing and save every article link into
# date-stamped batch files that roll over every 500 links.
temp = 0
for i in range(1, 47):
    print('第 %i 页' % i)
    # BUG FIX: the original never substituted the page number — the '%i'
    # placeholder was left literal, so all 46 iterations fetched the same
    # (invalid) URL.
    url = "https://www.4455sk.com/xiaoshuo/list-情感小说-%i.html" % i
    print(url)
    soup = common.get_beauty_soup(url)
    itemUrl = soup.select(
        "body div[class='maomi-content'] main[id='main-container'] div[class='text-list-html'] div ul li a"
    )
    for item in itemUrl:
        fileUrl = item.get('href')
        # print('fileUrl:'+fileUrl)
        if fileUrl is not None and fileUrl.find('.html') >= 0:
            fileUrl = "https://www.4455sk.com/" + fileUrl
            temp += 1
            print("fileUrl:" + fileUrl)
            # (os.chdir(os.getcwd()) removed — it was a no-op.)
            with open(
                    'qinggan_%s_%i.txt' %
                    (common.get_datetime('%Y-%m-%d'), temp // 500),
                    'a+') as f:
                f.write(fileUrl + '\n')
print("打印完成")
import common # 获取总行数 path = 'D:/down/qinggan/%s/' % common.get_datetime('%Y-%m-%d') if not (os.path.exists(path)): os.makedirs(path) file_list = common.get_file_name_list(os.getcwd(), 'txt') for index, file_name in enumerate(file_list, 1): print('下载第 %i 个文件: %s' % (index, file_name)) # 打开文件 with open(file_name) as file_obj: for num, value in enumerate(file_obj, 1): line = value.strip('\n') print('第 %i 行: %s' % (num, line)) itemSoup = common.get_beauty_soup(line) title = common.replace_special_char(itemSoup.title.string) newTitle = title.split('www')[-1] print(str(newTitle.strip())) textContent = itemSoup.select( "body div[class='maomi-content'] main[id='main-container'] div[class='content']" ) if len(textContent) == 0: with open( 'qinggan_%s_未下载.txt' % common.get_datetime('%Y-%m-%d'), 'a+') as f: f.write('第 %i 行:%s ; %s \n' % (num, line, newTitle) + str(num)) else: with open(newTitle.strip() + '.txt', 'a+',