def BulidLinkB():
    """Build and store the table of local file paths and their share links.

    Combines three inputs: the sampling table from ``ReadMD5SamplingCSV``
    (scheme key -> ``|``-joined file MD5s), the MD5 table from ``ReadMD5CSV``
    (file MD5 -> local path) and the share links returned by ``ReadMobile``
    (scheme key -> link). Schemes without a share link are skipped. For every
    remaining file a ``[file_path, file_link]`` row is produced and all rows
    are written with ``MyDef.StoreCSV`` to a fixed CSV path.
    """
    import time, MyDef, os

    began = time.time()  # reference timestamp for the run-time report
    scheme_table = ReadMD5SamplingCSV()
    MD5_refer_dict, MD5_list = ReadMD5CSV()
    share_links = ReadMobile()
    rows = []

    # ======================== top-level folders ========================
    for scheme_key in scheme_table:
        if scheme_key not in share_links:
            continue
        scheme_folder = MyDef.HexShift(scheme_key)
        base_link = share_links[scheme_key]
        members = scheme_table[scheme_key].split("|")
        print(members)

        # ======================== per-file folders ========================
        for file_md5 in members:
            local_path = MD5_refer_dict[file_md5]
            base_name = os.path.basename(local_path)
            md5_folder = MyDef.HexShift(file_md5)
            # The sub-folder name is the scheme folder name immediately
            # followed by the shifted file MD5.
            link = f"{base_link}#path=%252F{scheme_folder}%252F{scheme_folder}{md5_folder}"
            print(base_name)
            print(link)
            rows.append([local_path, link])

    MyDef.StoreCSV(rows, '/Users/alicewish/Dropbox/漫画图源度盘地址表.csv')
    print(len(rows))
    print(MyDef.RunTime(began))
def UnDistribute():
    """Move distributed files back to the paths recorded in the MD5 table.

    For every scheme key from ``ReadMD5SamplingCSV`` and every file MD5 listed
    under it, the file is searched for below ``/Volumes/Mack/Distribute`` and
    moved back to the path that ``ReadMD5CSV`` records for that MD5.

    Two on-disk layouts are tried, in this order:

    1. ``<HexShift(key)>/<HexShift(key) + HexShift(md5)>/<file name>`` - the
       layout ``Distribute`` creates.
    2. ``<key>/<md5>/<file name>`` - the un-shifted layout this function
       originally looked for.

    A file found in neither place is skipped with a message. A move that
    fails is reported and does not stop the run.
    """
    import time, MyDef, os, shutil

    start_time = time.time()  # initial timestamp
    refer_dict = ReadMD5SamplingCSV()
    MD5_refer_dict, MD5_list = ReadMD5CSV()
    new_file_dir = '/Volumes/Mack/Distribute'

    for key in refer_dict:
        shifted_key = MyDef.HexShift(key)
        sample_list = refer_dict[key].split("|")
        print(sample_list)
        for sample in sample_list:
            file_path = MD5_refer_dict[sample]
            file_name = os.path.basename(file_path)
            print("旧", file_path)

            candidates = (
                # Layout written by Distribute.
                os.path.join(new_file_dir, shifted_key,
                             shifted_key + MyDef.HexShift(sample), file_name),
                # Legacy, un-shifted layout.
                os.path.join(new_file_dir, key, sample, file_name),
            )
            new_file_path = next(
                (path for path in candidates if os.path.exists(path)), None)
            if new_file_path is None:
                print("skip (not found in distribute folder):", file_name)
                continue
            print("新", new_file_path)

            try:
                shutil.move(new_file_path, file_path)  # works for files and directories
            except OSError as err:  # shutil.Error is a subclass of OSError
                print("move failed:", new_file_path, "->", file_path, err)

    print(MyDef.RunTime(start_time))
def Distribute():
    """Move every sampled file into a per-scheme folder tree.

    Target layout below ``/Volumes/Mack/Distribute``::

        <HexShift(key)>/<HexShift(key) + HexShift(md5)>/<file name>

    where ``key`` is a scheme key from ``ReadMD5SamplingCSV`` and ``md5`` one
    of the ``|``-separated file MD5s stored under it. The source path of each
    file comes from ``ReadMD5CSV``.

    Directory creation and moves are best-effort: an already existing
    directory is fine, and any other OS error is printed and the run
    continues with the next item.
    """
    import time, MyDef, os, shutil

    def _ensure_dir(path):
        """Create *path* (single level); report instead of raising on failure."""
        try:
            os.mkdir(path)
        except FileExistsError:
            pass  # already there - nothing to do
        except OSError as err:
            print("mkdir failed:", path, err)

    start_time = time.time()  # initial timestamp
    refer_dict = ReadMD5SamplingCSV()
    MD5_refer_dict, MD5_list = ReadMD5CSV()

    # ======================== outermost folder ========================
    new_file_dir = '/Volumes/Mack/Distribute'
    _ensure_dir(new_file_dir)

    # ======================== scheme folders ========================
    for key in refer_dict:
        folder_name = MyDef.HexShift(key)
        new_folder_path = os.path.join(new_file_dir, folder_name)
        _ensure_dir(new_folder_path)

        sample_list = refer_dict[key].split("|")
        print(sample_list)

        # ======================== per-file folders ========================
        for sample in sample_list:  # sample is a file MD5
            file_path = MD5_refer_dict[sample]
            file_name = os.path.basename(file_path)
            print("旧", file_path)

            file_folder_name = MyDef.HexShift(sample)
            new_file_folder_path = os.path.join(
                new_folder_path, folder_name + file_folder_name)
            _ensure_dir(new_file_folder_path)

            new_file_path = os.path.join(new_file_folder_path, file_name)
            print("新", new_file_path)
            try:
                shutil.move(file_path, new_file_path)  # works for files and directories
            except OSError as err:  # shutil.Error is a subclass of OSError
                print("move failed:", file_path, "->", new_file_path, err)

    print(MyDef.RunTime(start_time))
def MD5Table(file_dir):
    """Add the files of *file_dir* to the MD5 table CSV and rewrite it sorted.

    The existing table is loaded with ``MyDef.ReadDictB`` (MD5 -> path). Every
    directory entry whose full path is not yet a value of that table is hashed
    with ``MyDef.md5sum`` and added. The table is then written back with
    ``MyDef.StoreCSV``, ordered by ``file name + MD5``.

    Args:
        file_dir: directory whose entries are to be registered.
    """
    import time, os, MyDef

    began = time.time()  # reference timestamp for the run-time report
    table_path = os.path.join('/Users/alicewish/Dropbox', '漫画图源MD5表.csv')
    MD5_refer_dict = MyDef.ReadDictB(table_path, True)

    # ================ read the folder content ================
    entries = os.listdir(file_dir)
    # Snapshot of the paths already registered; it is not extended while new
    # entries are being added below.
    known_paths = {MD5_refer_dict[md5] for md5 in MD5_refer_dict}

    for entry in entries:
        candidate = os.path.join(file_dir, entry)
        if candidate not in known_paths:
            digest = MyDef.md5sum(candidate)
            print(digest, entry)
            MD5_refer_dict[digest] = candidate

    # Sort on "<file name><MD5>"; the MD5 is recovered afterwards as the last
    # 32 characters of each sort key.
    sort_keys = sorted(
        os.path.basename(MD5_refer_dict[md5]) + md5 for md5 in MD5_refer_dict)
    rows = [[sort_key[-32:], MD5_refer_dict[sort_key[-32:]]]
            for sort_key in sort_keys]

    MyDef.StoreCSV(rows, table_path, ['MD5', '路径'])
    print(MyDef.RunTime(began))
def ReadMobile():
    """Fetch the share-home page and map decoded share titles to share links.

    Downloads the hard-coded ``pan.baidu.com`` share-home URL, extracts all
    ``"shareid":"<digits>`` and ``"title":"<64 lowercase letters>`` matches
    with ``MyDef.ReFind`` and pairs them by position. For each pair the key is
    ``MyDef.HexShiftBack(title[:32])`` and the value is the share link built
    from the share id. The mapping is passed through ``AddDict`` before being
    returned.

    Returns:
        Whatever ``AddDict`` returns for the collected mapping.

    Raises:
        requests.exceptions.RequestException: on network failure, including a
            timeout after 30 seconds.
    """
    import time, requests, MyDef

    start_time = time.time()  # initial timestamp
    refer_dict = {}
    header = {
        'User-Agent':
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 (FoxPlus) Firefox/2.0.0.14'
    }
    url = 'https://pan.baidu.com/wap/share/home?uk=2007334207&third=0'
    # Without a timeout a stalled connection would block the script forever.
    page = requests.get(url=url, headers=header, timeout=30)
    html = page.content.decode("utf", "ignore")

    shareid_list = MyDef.ReFind(html, r'"shareid":"[0-9]{1,20}')
    print(shareid_list)
    print(len(shareid_list))
    title_list = MyDef.ReFind(html, r'"title":"[a-z]{64}')
    print(title_list)
    print(len(title_list))

    # Ids and titles are matched purely by position. If the counts differ,
    # some share had a title that did not match the pattern and the pairing
    # may be shifted - warn instead of failing with an IndexError.
    if len(shareid_list) != len(title_list):
        print("warning: shareid/title count mismatch:",
              len(shareid_list), len(title_list))

    for raw_shareid, raw_title in zip(shareid_list, title_list):
        shareid = raw_shareid.replace('"shareid":"', '')
        title = raw_title.replace('"title":"', '')
        real_name = MyDef.HexShiftBack(title[:32])
        refer_dict[real_name] = (
            'https://pan.baidu.com/share/link?uk=2007334207&shareid=' + shareid)

    refer_dict = AddDict(refer_dict)
    # ================ run-time report ================
    print(MyDef.RunTime(start_time))
    print(refer_dict)
    print(len(refer_dict))
    return refer_dict
def FileWithLink():
    """Put a Markdown table of file names, share links and sizes on the clipboard.

    Paths come from the MD5 table (``MyDef.ReadDictC``) and links from the
    link table (``MyDef.ReadDictB``, keyed by file path). Each file becomes
    one row ``[name](link) | size``. Rows are sorted, a two-line table header
    is put in front, and rows without a link are finally reduced to plain
    ``name | size``. The text is joined with CRLF and handed to
    ``MyDef.WriteClip``.
    """
    import time, os, MyDef

    began = time.time()  # reference timestamp for the run-time report

    # ======================== inputs ========================
    md5_table_path = '/Users/alicewish/Dropbox/漫画图源MD5表.csv'
    MD5_dict = MyDef.ReadDictC(md5_table_path, True)
    link_table_path = '/Users/alicewish/Dropbox/漫画图源度盘地址表.csv'
    link_by_path = MyDef.ReadDictB(link_table_path)

    table_rows = []
    for md5_key in MD5_dict:
        local_path = MD5_dict[md5_key]
        size_text = sizeof_fmt(os.path.getsize(local_path))
        base_name = os.path.basename(local_path)
        # A missing link yields "[name]()", which is stripped after sorting.
        link = link_by_path[local_path] if local_path in link_by_path else ""
        print(base_name)
        table_rows.append(f"[{base_name}]({link}) | {size_text}")

    # Sort while every row still starts with "[", then add the header lines.
    table_rows.sort()
    table_rows[:0] = ['文件名 | 大小', '--- | ---']

    # Rows containing an empty link lose the leading "[" and every "]()".
    table_rows = [
        row[1:].replace("]()", "") if "]()" in row else row
        for row in table_rows
    ]

    # ================ write to the clipboard ================
    MyDef.WriteClip('\r\n'.join(table_rows))
    print(MyDef.RunTime(began))
def Sampling(number=30):
    """Split the MD5 list into random samples and store them as schemes.

    Starting from the MD5 list returned by ``ReadMD5CSV``, ``RandomSample`` is
    called repeatedly with the remaining items and *number* until nothing
    remains. Each sample is joined with ``|`` and stored under the value
    ``MyDef.HashMD5String`` returns for that joined string. The resulting
    mapping is written with ``MyDef.WriteDictB``.

    Args:
        number: value forwarded to ``RandomSample`` on every call
            (presumably the sample size - confirm against ``RandomSample``).
    """
    import time, MyDef, os

    began_at = time.time()  # reference timestamp for the run-time report
    MD5_refer_dict, MD5_list = ReadMD5CSV()

    schemes = {}
    pending = MD5_list
    while len(pending) > 0:
        picked, pending = RandomSample(pending, number)
        joined = '|'.join(picked)
        print(joined)
        scheme_md5 = MyDef.HashMD5String(joined)
        print('方案MD5', scheme_md5)
        schemes[scheme_md5] = joined

    target_csv = os.path.join('/Users/alicewish/Dropbox', '漫画图源MD5随机分样表.csv')
    MyDef.WriteDictB(schemes, target_csv)
    print(MyDef.RunTime(began_at))
magnet_list = tree.xpath('//a[@title="Download this torrent using magnet"]/@href') count = len(title_list) for i in range(count): output_line_in_list = [title_list[i], magnet_list[i]] output_line = ",".join(output_line_in_list) output_readline.append(output_line) this_week_readline.append(magnet_list[0]) # ================写入文本================ text = '\r\n'.join(output_readline) print(text) f = open(output_file_path, 'w') try: f.write(text) finally: f.close() info = "\r\n".join(this_week_readline) print(info) print(len(this_week_readline)) # ================写入剪贴板================ import pyperclip pyperclip.copy(info) spam = pyperclip.paste() print("耗时:",MyDef.RunTime(start_time))
def ReadChrome(input_file_path):
    """Parse a saved "my shares" HTML page and map decoded names to share links.

    The page is parsed with ``lxml.html``. Share names, share links, share
    times and the view/save/download counters are extracted via XPath and
    printed as tab-separated lines. For every share whose name is exactly 64
    characters long, ``MyDef.HexShiftBack(name[:32])`` is mapped to its link.

    Args:
        input_file_path: path of the saved HTML file. The original body
            overwrote this argument with a hard-coded path; it is now
            honoured, and the hard-coded path is used only when the argument
            is empty or ``None``.

    Returns:
        dict mapping decoded names to share links. Empty when the number of
        names and the number of share links differ.
    """
    import time, re, MyDef
    from lxml import html

    start_time = time.time()  # initial timestamp

    # ============== read the text ==============
    if not input_file_path:
        input_file_path = '/Users/alicewish/Dropbox/百度云 网盘-我的分享.htm'
    # "with" closes the handle, which open(...).read() never did.
    with open(input_file_path, 'r') as source_file:
        read_text = source_file.read()
    tree = html.fromstring(read_text)

    # ============== read the file names ==============
    names = tree.xpath('//span[@node-type="name-text"]/@title')
    all_name = '\r\n'.join(names)
    print(len(names))
    print(all_name)

    # ============== read the download addresses ==============
    links = tree.xpath('//a[@target="_blank"]/@href')
    # Keep only links that look like Baidu Pan share links.
    share_links = [
        link for link in links
        if re.match(r'https://pan.baidu.com/s/[^<]*', link)
    ]
    all_link = '\n'.join(share_links)
    print(len(share_links))
    print(all_link)

    # ============== read share time and view/save/download counts ==============
    raw_share_time = tree.xpath('//div[@style="width: 20%"]/text()')  # share times
    all_number = tree.xpath('//div[@style="width: 9%"]/text()')  # all counters
    share_time = []
    view_number = []
    save_number = []
    download_number = []
    # The first share-time cell and the first three counter cells are skipped
    # (presumably the table header row - confirm against the saved page).
    for i in range(len(names)):
        share_time.append(raw_share_time[i + 1].strip(" \n\t\r"))
        view_number.append(
            all_number[3 * i + 3].strip(" \n\t\r").strip("次"))  # views
        save_number.append(
            all_number[3 * i + 4].strip(" \n\t\r").strip("次"))  # saves
        download_number.append(
            all_number[3 * i + 5].strip(" \n\t\r").strip("次"))  # downloads

    # ============== merge the information ==============
    info_list = []
    refer_dict = {}
    if len(names) == len(share_links):
        for i, name in enumerate(names):
            info_list.append("\t".join([
                name, share_links[i], share_time[i], view_number[i],
                save_number[i], download_number[i]
            ]))
            if len(name) == 64:
                # Only the first 32 characters are decoded into the key.
                real_name = MyDef.HexShiftBack(name[:32])
                refer_dict[real_name] = share_links[i]
    else:
        print("错误", len(names), len(share_links))

    all_info = '\n'.join(info_list)
    print(all_info)
    print(MyDef.RunTime(start_time))
    return refer_dict
def Processing():
    """Split the lines of a Markdown file into shorter lines and copy the result.

    Reads ``/Users/alicewish/Downloads/my.md``. For every line it strips the
    ``*`` emphasis markers while recording a per-character format level,
    segments the plain text with jieba, and then tries the split schemes
    returned by ``Training()`` whose first two characters equal the
    zero-padded line length. A scheme is accepted when each of its piece
    boundaries falls on an allowed cut position. Accepted lines are emitted
    piece by piece with the ``*`` markers re-inserted; lines that cannot be
    split are emitted unchanged. The joined output is passed to
    ``MyDef.WriteClip``.
    """
    import time, jieba, re, MyDef
    start_time = time.time()  # initial timestamp
    scenario_list_full = Training()
    # ====================================== processing section ======================================
    dict_file_path = '/Users/alicewish/我的坚果云/userdict.txt'  # path of the custom jieba dictionary
    cut_right_count = 0
    cut_wrong_count = 0
    # ======================== input section ========================
    input_file_path = "/Users/alicewish/Downloads/my.md"
    # ================ read the input text line by line ================
    # NOTE(review): the file handle is never closed explicitly.
    read_text = open(input_file_path, 'r').read()  # read the text
    text_readline = read_text.replace("\nclass", "class").replace("...", "…").splitlines()
    # print(text_readline)
    # ================ read text by line: with open (better) ================
    status_readline = []  # per-line status
    output_readline = []  # output lines
    jieba.load_userdict(dict_file_path)
    line_formmat_list_all = []
    for a in range(len(text_readline)):
        text_readline[a] = re.sub(r'<span.*</span>', '', text_readline[a])  # strip <span> elements
        text_readline[a] = text_readline[a].replace('……', '…')
        # Escaped asterisks are parked as "の" so that splitting on "*" below
        # only sees real emphasis markers.
        markdown_line = text_readline[a].replace("\*", "の").replace(
            "\[", "[").replace("\]", "]")
        print(markdown_line)
        line_cut_list = markdown_line.split("*")
        print(line_cut_list)
        plain_line = markdown_line.replace("*", "").replace("の", "*")  # restore literal *
        print(plain_line)
        # Per-character format level, initialised to 0 (presumably 0 = plain,
        # 1/2/3 = number of surrounding asterisks - confirm).
        line_formmat_list = []
        for j in range(len(plain_line)):
            line_formmat_list.append(0)
        # print(line_formmat_list)
        # line_mark_count_list[p] = number of "*" found directly before plain
        # character p (index len(plain_line) holds trailing markers).
        line_mark_count_list = []
        for k in range(len(plain_line) + 1):
            line_mark_count_list.append(0)
        point = 0
        for char in markdown_line:
            if char == '*':
                line_mark_count_list[point] = line_mark_count_list[point] + 1
            else:
                point = point + 1
        print(line_mark_count_list)
        pin = 0
        before = 0
        for seg in line_cut_list:
            if seg == '':
                pass
            else:
                last_pin = pin
                pin += len(seg)
                # print(line_mark_count_list[last_pin])
                # print(line_mark_count_list[pin])
                if last_pin > 0:
                    before = line_formmat_list[last_pin - 1]
                # The level of this segment is the previous level plus or
                # minus the marker count in front of it, depending on whether
                # the markers open or close an emphasis span.
                for l in range(last_pin, pin):
                    if before == 0:
                        line_formmat_list[
                            l] = before + line_mark_count_list[last_pin]
                    elif before == 1 and line_mark_count_list[last_pin] == 1:
                        line_formmat_list[
                            l] = before - line_mark_count_list[last_pin]
                    elif before == 1 and line_mark_count_list[last_pin] == 2:
                        line_formmat_list[
                            l] = before + line_mark_count_list[last_pin]
                    elif before == 2 and line_mark_count_list[last_pin] == 1:
                        line_formmat_list[
                            l] = before + line_mark_count_list[last_pin]
                    elif before == 2 and line_mark_count_list[last_pin] == 2:
                        line_formmat_list[
                            l] = before - line_mark_count_list[last_pin]
                    elif before == 3:
                        line_formmat_list[
                            l] = before - line_mark_count_list[last_pin]
                # print(line_formmat_list[last_pin - 1])
        print(line_formmat_list)
        line_formmat_list_all.append(line_formmat_list)
        print(plain_line)
        print(len(plain_line))
        need_cut = True  # does this line need splitting?
        if len(plain_line) == 0:
            pass
        elif len(plain_line) == 2 and re.match(r'[0-9][0-9]', plain_line):  # page number
            need_cut = False
        elif a < 8:  # header part (first 8 lines)
            need_cut = False
        elif len(plain_line) > 2:
            if plain_line[0] == '*' or plain_line[0] == '[':  # annotation
                need_cut = False
        if plain_line == "":
            status = 0  # blank line
        elif not need_cut:
            status = -1  # no split needed
        else:
            status = 1  # to be segmented
        status_readline.append(status)
        if status == 1:
            # ================ jieba segmentation ================
            string_list = []
            seg_list = jieba.cut(plain_line)  # default is accurate mode
            for word in seg_list:
                string_list.append(word)
            print(string_list)
            # Find the first and last scheme index whose two-character prefix
            # equals the zero-padded line length.
            # NOTE(review): start_i / end_i stay unbound (or keep values from
            # an earlier line) when no scheme matches - confirm Training()
            # covers every line length that can occur.
            start_status = False
            for i in range(len(scenario_list_full)):
                scenario_line_full = scenario_list_full[i]
                if scenario_line_full[0:2] == str(len(plain_line)).zfill(2):
                    if start_status:
                        end_i = i
                    else:
                        start_i = i
                        start_status = True
                        end_i = i
            # ================ try the split schemes ================
            current_i = start_i
            cut_right = False
            while current_i <= end_i and not cut_right:
                current_cut = scenario_list_full[current_i]
                current_cut_list = current_cut[7:].split("-")  # piece lengths of this scheme
                # ================ mark allowed cut positions ================
                line_can_cut_list = []
                for i in range(len(plain_line)):
                    line_can_cut_list.append(0)
                j = 0
                for string in string_list:
                    j = j + len(string)
                    # print(j)
                    line_can_cut_list[j - 1] = 1  # a cut is allowed after each jieba word
                # ================ correct for punctuation and particles ================
                # NOTE(review): for i == 0 the index i - 1 is -1 and addresses
                # the last element.
                for i in range(len(plain_line)):
                    if plain_line[i] in ',.?!,。…?!”·-》>:】【]、':  # no cut before these
                        line_can_cut_list[i - 1] = 0
                    elif plain_line[i] in '“《<【[':  # no cut after these
                        line_can_cut_list[i] = 0
                    elif plain_line[
                            i] in '上中下内出完的地得了吗吧着个就前世里嘛图们来呗' and line_can_cut_list[
                                i - 1] == 1 and line_can_cut_list[i] == 1:  # no cut before these
                        line_can_cut_list[i - 1] = 0
                    elif plain_line[i] in '太每帮跟另' and line_can_cut_list[
                            i - 1] == 1 and line_can_cut_list[i] == 1:  # no cut after these
                        line_can_cut_list[i] = 0
                print(line_can_cut_list)
                print(current_cut_list)
                # ================ check whether the scheme is valid ================
                # NOTE(review): the local name "sum" shadows the builtin.
                sum = 0
                cut_right = True
                for i in range(len(current_cut_list)):  # walk the pieces
                    last_sum = sum
                    sum = sum + int(current_cut_list[i])
                    print(line_can_cut_list[sum - 1])
                    print(plain_line[last_sum:sum])
                    if line_can_cut_list[sum - 1] == 0:
                        cut_right = False
                print(cut_right)
                if not cut_right:
                    current_i += 1
            if cut_right:  # a valid scheme was found
                cut_right_count += 1
                sum = 0
                for i in range(len(current_cut_list)):
                    last_sum = sum
                    sum = sum + int(current_cut_list[i])
                    output_line = plain_line[last_sum:sum]
                    output_line_format_list = line_formmat_list_all[a][
                        last_sum:sum]
                    output_line_mark_count_list = []
                    # Pad with level 0 on both sides so that every piece opens
                    # and closes its own emphasis markers.
                    format_list_for_use = [0] + output_line_format_list + [
                        0
                    ]  # 11
                    for b in range(len(format_list_for_use) - 1):
                        output_line_mark_count = abs(format_list_for_use[b + 1] -
                                                     format_list_for_use[b])
                        output_line_mark_count_list.append(
                            output_line_mark_count)
                    print(output_line_mark_count_list)  # 10
                    # Rebuild the piece: the markers due before each
                    # character, then the character, then trailing markers.
                    output_markdown_line = ''
                    for c in range(len(output_line_mark_count_list) - 1):
                        for d in range(output_line_mark_count_list[c]):
                            output_markdown_line += '*'
                        output_markdown_line += output_line[c]
                    for d in range(output_line_mark_count_list[-1]):
                        output_markdown_line += '*'
                    output_readline.append(output_markdown_line)
                    print("格式", output_line_format_list)
            else:  # no scheme fitted: keep the line as it is
                output_readline.append(text_readline[a])
                cut_wrong_count += 1
        elif status == 0:  # blank line: emit a separator
            output_readline.append('\n|\n')
        else:  # no split needed: only un-escape brackets
            output_readline.append(text_readline[a].replace("\[", "[").replace(
                "\]", "]"))
    print('切对', cut_right_count)
    print('待切', cut_wrong_count)
    # ================ write to the clipboard ================
    text = '\r\n'.join(output_readline)
    MyDef.WriteClip(text)
    print(MyDef.RunTime(start_time))