def BulidLinkB():
    import time, MyDef, os
    start_time = time.time()  # starting timestamp
    refer_dict = ReadMD5SamplingCSV()
    MD5_refer_dict, MD5_list = ReadMD5CSV()
    yun_link_dict = ReadMobile()
    output_list = []
    # ======================== top-level folders ========================
    for key in refer_dict:
        if key in yun_link_dict:
            folder_name = MyDef.HexShift(key)
            yun_link = yun_link_dict[key]
            sample_list = refer_dict[key].split("|")
            print(sample_list)
            # ======================== second-level folders ========================
            for sample in sample_list:  # file MD5
                file_path = MD5_refer_dict[sample]
                file_name = os.path.split(file_path)[1]
                file_folder_name = MyDef.HexShift(sample)
                file_link = yun_link + '#path=%252F' + folder_name + '%252F' + folder_name + file_folder_name
                print(file_name)
                print(file_link)
                path_list = [file_path, file_link]
                output_list.append(path_list)
    path = '/Users/alicewish/Dropbox/漫画图源度盘地址表.csv'
    MyDef.StoreCSV(output_list, path)
    print(len(output_list))
    print(MyDef.RunTime(start_time))
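# Note on file_link above: '%252F' is '/' URL-encoded twice ('%2F' with its '%'
# encoded again as '%25'), which the cloud-drive share links appear to expect.
# A minimal usage sketch, assuming ReadMD5SamplingCSV, ReadMD5CSV and ReadMobile
# are defined elsewhere in this module and MyDef provides HexShift, StoreCSV and
# RunTime:
#
#     BulidLinkB()  # writes /Users/alicewish/Dropbox/漫画图源度盘地址表.csv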
def ChangeBack():
    import os, MyDef
    dropbox_path = '/Users/alicewish/Dropbox'
    refer_file_name = '漫画图源MD5表.csv'
    refer_file_path = os.path.join(dropbox_path, refer_file_name)
    text_readline = MyDef.ReadCSV(refer_file_path, type=False)
    for i in range(len(text_readline)):
        if '@k_k@' in text_readline[i][1]:
            print(text_readline[i][1])
            text_readline[i][1] = text_readline[i][1].replace('@k_k@', '')
    MyDef.StoreCSV(text_readline, refer_file_path)
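# ChangeBack and the functions below lean on MyDef.ReadCSV / MyDef.StoreCSV,
# which are defined elsewhere. A minimal sketch of the behaviour assumed here
# (lists of row lists in and out, with an optional header row); the real MyDef
# helpers may differ:
def _read_csv_rows_sketch(path):
    import csv
    with open(path, newline='') as f:
        return [row for row in csv.reader(f)]


def _store_csv_rows_sketch(rows, path, head_info=None):
    import csv
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        if head_info:
            writer.writerow(head_info)
        writer.writerows(rows)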
def MD5Table(file_dir):
    import time, os, MyDef
    start_time = time.time()  # starting timestamp
    dropbox_path = '/Users/alicewish/Dropbox'
    refer_file_name = '漫画图源MD5表.csv'
    refer_file_path = os.path.join(dropbox_path, refer_file_name)  # path of the dictionary file
    MD5_refer_dict = MyDef.ReadDictB(refer_file_path, True)
    # print(MD5_refer_dict)
    file_path_check_set = set()
    major_key_list = []
    output_list = []
    # ================ read the folder contents ================
    file_list = os.listdir(file_dir)  # entries in the directory
    # print(file_list)
    for file_MD5 in MD5_refer_dict:
        file_path = MD5_refer_dict[file_MD5]
        file_path_check_set.add(file_path)
    for file_name in file_list:
        file_path = os.path.join(file_dir, file_name)
        if file_path not in file_path_check_set:
            # file_MD5 = MyDef.HashMD5File(file_path)
            file_MD5 = MyDef.md5sum(file_path)
            print(file_MD5, file_name)
            MD5_refer_dict[file_MD5] = file_path
    for file_MD5 in MD5_refer_dict:
        file_path = MD5_refer_dict[file_MD5]
        file_name = os.path.split(file_path)[1]
        major_key = file_name + file_MD5  # sort key: file name followed by MD5
        major_key_list.append(major_key)
    major_key_list.sort()
    for major_key in major_key_list:
        file_MD5 = major_key[-32:]  # the MD5 hex digest is always the last 32 characters
        info_list = [file_MD5, MD5_refer_dict[file_MD5]]
        output_list.append(info_list)
    head_info = ['MD5', '路径']
    MyDef.StoreCSV(output_list, refer_file_path, head_info)
    print(MyDef.RunTime(start_time))
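# MD5Table calls MyDef.md5sum, which is defined elsewhere. A minimal sketch of
# what such a helper typically looks like (chunked reads so large image files do
# not have to fit in memory); the real MyDef.md5sum may differ:
def _md5sum_sketch(file_path, chunk_size=1024 * 1024):
    import hashlib
    md5 = hashlib.md5()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()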
def ComixIssue(search_comic_name="Transformers"): from lxml import html import requests, time, re # ========================输入区开始======================== save_comic_name = search_comic_name.replace(":", "").replace( "/", "").replace("&", "").replace(" ", " ") key_title = search_comic_name.replace(" ", "-") print(key_title) url_prefix = 'https://www.comixology.com/search?search=' comic_url = url_prefix + search_comic_name # 完整的查询网址 # ========================执行区开始======================== page = requests.get(comic_url) # 获取网页信息 tree = html.fromstring(page.text) # 构筑查询用树 # ====================找到系列==================== all_url = tree.xpath('//a[@class="content-details"]/@href') print(len(all_url)) issues_url = [] # 每期网址 check_set = set() # 重复检查 info_dict = {} alter_info_dict = {} major_key_list = [] for i in range(len(all_url)): entry_start_time = time.time() print(i) print(all_url[i]) if re.match(r'.*/digital-comic/[^?]*', all_url[i]) and key_title in all_url[i]: matches = re.match(r'.*/digital-comic/[^?]*', all_url[i]) short_link = matches.group(0) if short_link not in check_set: # 尚未读取过 check_set.add(short_link) print("获取中……") issues_url.append(short_link) # ========================执行区开始======================== page = requests.get(short_link) # 获取网页信息 tree = html.fromstring(page.text) # 构筑查询用树 # ====================关键词列表==================== key_word_list = [ "Written by", "Art by", "Pencils", "Inks", "Colored by", "Cover by", "Genres", "Digital Release Date", "Print Release Date", "Page Count", "Age Rating", "Sold by", "About Book" ] # ====================标题==================== title = tree.xpath('//h1[@class="title"]/text()')[0] # ====================简介==================== raw_description = tree.xpath( '//section[@class="item-description"]/text()') # 列表 description = "".join(raw_description) formatted_description = description.strip("\n\t").replace( "\r\n", "|") formatted_description = formatted_description.replace( "\r", "|").replace("\n", "|") # ====================创作信息==================== credit_list = [] raw_credits = tree.xpath( '//div[@class="credits"]//*/text()') # 列表 for i in range(len(raw_credits)): credit_line = raw_credits[i].strip("\t\n") if credit_line != "" and credit_line != "HIDE...": credit_list.append(credit_line) credit = "\n".join(credit_list) # ====================评价数==================== rating_count = "" try: review_count = tree.xpath( '//div[@itemprop="reviewCount"]/text()')[0] rating_count = review_count.replace( "Average Rating (", "").replace("):", "") except: pass # ====================价格==================== price = "" try: price = tree.xpath('//h5[@class="item-price"]/text()')[0] except: pass # ====================封面==================== cover_image_url = "" try: cover_image_url = tree.xpath( '//img[@class="cover"]/@src')[0] except: pass # ====================编剧==================== writer = "" item = "Written by" if item in credit_list: item_index = credit_list.index(item) temp_store = credit_list[item_index + 1] while credit_list[item_index + 2] not in key_word_list: item_index += 1 temp_store = temp_store + "|" + credit_list[item_index + 1] writer = temp_store # ====================画师==================== artist = "" item = "Art by" if item in credit_list: item_index = credit_list.index(item) temp_store = credit_list[item_index + 1] while credit_list[item_index + 2] not in key_word_list: item_index += 1 temp_store = temp_store + "|" + credit_list[item_index + 1] artist = temp_store # ====================铅稿==================== penciller = "" item = "Pencils" if 
item in credit_list: item_index = credit_list.index(item) temp_store = credit_list[item_index + 1] while credit_list[item_index + 2] not in key_word_list: item_index += 1 temp_store = temp_store + "|" + credit_list[item_index + 1] penciller = temp_store # ====================墨线==================== inker = "" item = "Inks" if item in credit_list: item_index = credit_list.index(item) temp_store = credit_list[item_index + 1] while credit_list[item_index + 2] not in key_word_list: item_index += 1 temp_store = temp_store + "|" + credit_list[item_index + 1] inker = temp_store # ====================上色==================== colorist = "" item = "Colored by" if item in credit_list: item_index = credit_list.index(item) temp_store = credit_list[item_index + 1] while credit_list[item_index + 2] not in key_word_list: item_index += 1 temp_store = temp_store + "|" + credit_list[item_index + 1] colorist = temp_store # ====================填字==================== letterer = "" item = "Lettered by" if item in credit_list: item_index = credit_list.index(item) temp_store = credit_list[item_index + 1] while credit_list[item_index + 2] not in key_word_list: item_index += 1 temp_store = temp_store + "|" + credit_list[item_index + 1] letterer = temp_store # ====================封面==================== cover_artist = "" item = "Cover by" if item in credit_list: item_index = credit_list.index(item) temp_store = credit_list[item_index + 1] while credit_list[item_index + 2] not in key_word_list: item_index += 1 temp_store = temp_store + "|" + credit_list[item_index + 1] cover_artist = temp_store # ====================类型==================== genres = "" item = "Genres" if item in credit_list: item_index = credit_list.index(item) temp_store = credit_list[item_index + 1] while credit_list[item_index + 2] not in key_word_list: item_index += 1 temp_store = temp_store + "|" + credit_list[item_index + 1] genres = temp_store # ====================故事线==================== story_arc = "" item = "Story Arc" if item in credit_list: item_index = credit_list.index(item) temp_store = credit_list[item_index + 1] while credit_list[item_index + 2] not in key_word_list: item_index += 1 temp_store = temp_store + "|" + credit_list[item_index + 1] story_arc = temp_store # ====================数字出版日期==================== digital_release_date = "" item = "Digital Release Date" if item in credit_list: time_string = credit_list[credit_list.index(item) + 1] time_convert = time.strptime(time_string, "%B %d %Y") digital_release_date = time.strftime( "%Y-%m-%d", time_convert) # ====================实体出版日期==================== print_release_date = "" item = "Print Release Date" if item in credit_list: time_string = credit_list[credit_list.index(item) + 1] time_convert = time.strptime(time_string, "%B %d %Y") print_release_date = time.strftime("%Y-%m-%d", time_convert) # ====================页数==================== page_count = "" item = "Page Count" if item in credit_list: page_count = (credit_list[credit_list.index(item) + 1]).replace(" Pages", "") # ====================年龄评级==================== age_rating = "" item = "Age Rating" if item in credit_list: age_rating = (credit_list[credit_list.index(item) + 1]).replace(" Only", "") # ====================出版公司==================== publisher = "" item = "Sold by" if item in credit_list: publisher = credit_list[credit_list.index(item) + 1] # ====================输出区开始==================== line_info = [ title, digital_release_date, print_release_date, price, page_count, age_rating, rating_count, publisher, genres, 
story_arc, writer, artist, penciller, inker, colorist, letterer, cover_artist, short_link, cover_image_url, formatted_description ] this_line = "\t".join(line_info) # 行信息合并 print(this_line) major_key = digital_release_date + title # "日期+标题"作为主键 major_key_list.append(major_key) info_dict[major_key] = line_info major_key_list.sort() # 主键表排序 text_list = [] for key in major_key_list: text_list.append(info_dict[key]) # ================写入TXT================ txt_file_path = '/Users/alicewish/Dropbox/Comixology刊物' + save_comic_name + '.csv' # TXT文件名 head_info = [ "标题", "数字出版日期", "实体出版日期", "价格", "页数", "分级", "评价数", "出版商", "类型", "故事线", "编剧", "画师", "铅笔稿", "墨线", "上色师", "填字员", "封面画师", "短链", "封面图地址", "简介" ] MyDef.StoreCSV(text_list, txt_file_path, head_info) # ====================次级输出区开始==================== alter_line_info = ["### " + title, formatted_description] alter_line = "\r\n".join(alter_line_info) # 行信息合并 print(alter_line) alter_info_dict[major_key] = alter_line major_key_list.sort() # 主键表排序 alter_text_list = [] for key in major_key_list: alter_text_list.append(alter_info_dict[key]) alter_text = "\r\n".join(alter_text_list) # ================写入TXT================ alter_txt_file_path = '/Users/alicewish/我的坚果云/Comixology简介' + save_comic_name + '.txt' # TXT文件名 f = open(alter_txt_file_path, 'w') try: f.write(alter_text) finally: f.close() entry_run_time = time.time() - entry_start_time print("耗时:{:.2f}秒".format(entry_run_time)) # ========================输出区开始======================== print("总共" + str(len(issues_url)) + "期")
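# A minimal usage sketch for ComixIssue, assuming the module is run directly;
# the default argument searches Comixology for "Transformers", and any other
# search term should work the same way. Output paths are the hard-coded ones above.
if __name__ == '__main__':
    ComixIssue("Transformers")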