def __train_transition(cls, data_lines):
    """Train the transition parameter from adjacent-character bigram counts.

    Args:
        data_lines(list): content list of training data
    """
    print('Begin training transition parameter...')
    total = len(data_lines)
    progress = LineProgress(title='Training transition')
    bigram_counts = dict()
    for processed, raw_line in enumerate(data_lines, start=1):
        progress.update(processed * 100 / total)
        text = raw_line.strip()
        # need at least two characters for a transition; skip non-Chinese lines
        if len(text) <= 1 or not is_chinese(text):
            continue
        for first, second in zip(text, text[1:]):
            counts = bigram_counts.setdefault(first, dict())
            counts[second] = counts.get(second, 0) + 1
    print('\nInserting into database...')
    for first, counts in bigram_counts.items():
        # normalizing constant for this previous-character's distribution
        denom = sum(counts.values())
        for second, freq in counts.items():
            # noinspection PyTypeChecker
            cls.insert(Transition, prev_char=first, next_char=second,
                       prob=float(np.log(freq / denom)))
    print('Done training transition parameter')
def __train_init(cls, data_lines):
    """Train the initial-state parameter from line-leading character counts.

    Args:
        data_lines(list): content list of training data
    """
    print('Begin training init parameter...')
    total = len(data_lines)
    progress = LineProgress(title='Training init')
    head_counts = dict()
    for processed, raw_line in enumerate(data_lines, start=1):
        progress.update(processed * 100 / total)
        text = raw_line.strip()
        # only count lines that start with a Chinese character
        if not text or not is_chinese(text[0]):
            continue
        head = text[0]
        head_counts[head] = head_counts.get(head, 0) + 1
    print('\nInserting into database...')
    for character, freq in head_counts.items():
        # noinspection PyTypeChecker
        cls.insert(Init, character=character,
                   prob=float(np.log(freq / total)))
    print('Done training init parameter')
def test_accuracy(path='./Data/dict.txt', size=1000, loop=5):
    """Test the model's prediction accuracy on randomly sampled words.

    Args:
        path(str): The path of test file
        size(int): The size of test data
        loop(int): The times of test

    Raises:
        ValueError: if ``size`` exceeds the number of lines in the test file.
    """
    with open(path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    if size > len(lines):
        print("The size is too large")
        # Raise a specific, catchable exception instead of bare BaseException,
        # which even `except Exception` would not catch.
        raise ValueError("The size is too large")
    accurate_nums = list()
    for i in range(loop):
        progress = LineProgress(title='Loop ' + str(i + 1))
        # sample `size` line positions with replacement
        test_pos_list = np.random.randint(0, len(lines), size)
        test_words = [lines[pos].split()[0] for pos in test_pos_list]
        accurate_num = 0
        for index, test_word in enumerate(test_words):
            progress.update(index / size * 100)
            pin_yin = ' '.join([
                py_list[0] for py_list in pinyin(test_word, style=NORMAL)
            ])
            prediction = Model.predict(pin_yin)
            # per-character accuracy, averaged over the word length
            count = 0
            for j in range(len(prediction)):
                if prediction[j] == test_word[j]:
                    count += 1
            accurate_num += count / len(test_word)
        accurate_nums.append(accurate_num)
    print("The test accuracy is: {:.2f}%".format(
        sum(accurate_nums) / (size * loop) * 100))
def accuracy(cls, path='./Data/test.txt', size=1000, rounds=5):
    """Test the model accuracy and report the average time per round.

    Args:
        path(str): path of test data
        size(int): size of test data
        rounds(int): rounds of test

    Raises:
        ValueError: if ``size`` exceeds the number of lines in the test file.
    """
    with open(path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    if size > len(lines):
        print("The size is too large")
        # Raise a specific, catchable exception instead of bare BaseException,
        # which even `except Exception` would not catch.
        raise ValueError("The size is too large")
    accuracy_count = list()
    start = time.time()
    for i in range(rounds):
        progress = LineProgress(title='Round ' + str(i + 1))
        # sample `size` line positions with replacement
        positives = np.random.randint(0, len(lines), size)
        words = [lines[pos].split()[0] for pos in positives]
        accurate_num = 0
        for index, word in enumerate(words):
            progress.update(index / size * 100)
            pin_yin = ' '.join([py_list[0]
                                for py_list in pinyin(word, style=NORMAL)])
            prediction = HMModel.translate(pin_yin)
            # per-character accuracy, averaged over the word length
            count = 0
            for j in range(len(prediction)):
                if prediction[j] == word[j]:
                    count += 1
            accurate_num += count / len(word)
        accuracy_count.append(accurate_num)
    print("Avg. time consumption per round: {} s".format(
        (time.time() - start) / rounds))
    print("The test accuracy is: {:.2f}%".format(
        sum(accuracy_count) / (size * rounds) * 100))
def downloadTxtNovel(url, curChaptersNum=0):
    """Download a txt novel chapter by chapter from its index page.

    Args:
        url(str): url of the novel's index page
        curChaptersNum(int): number of chapters already downloaded; crawling
            resumes from this chapter index

    Returns:
        tuple: (cover image url, total chapter count, downloaded text)
    """
    response = requests.get(url)
    response.encoding = "utf-8"
    html = etree.HTML(response.text)
    coverUrl = html.xpath("//*[@id='fmimg']/img/@src")[0]
    # keep the novel title in its own name so the per-chapter title below
    # doesn't shadow it
    novel_title = html.xpath("//*[@id='info']/h1/text()")[0]
    chaptersHtml = html.xpath("//*[@id='list']/dl/dd")
    data = """\n"""
    progress = LineProgress(total=100, title=novel_title)
    total = len(chaptersHtml) - curChaptersNum
    if total <= 0:
        # Nothing new to fetch: returning early avoids the ZeroDivisionError
        # the progress computation below would raise with total == 0.
        return coverUrl, len(chaptersHtml), data
    for index in range(curChaptersNum, len(chaptersHtml)):
        chapter = chaptersHtml[index]
        # strip trailing "【...】" / "(...)" decorations from the chapter title
        chapter_title = chapter.xpath("./a/text()")[0].split("【")[0].split("(")[0]
        data += ("\n\n" + chapter_title + "\n\n")
        response = requests.get("http://www.xbiquge.la/" +
                                chapter.xpath("./a/@href")[0])
        response.encoding = "utf-8"
        chapterHtml = etree.HTML(response.text)
        # normalize non-breaking spaces and CR line endings in the chapter body
        data += "".join(
            chapterHtml.xpath("//*[@id='content']/text()")).replace(
                "\xa0", " ").replace("\r", "\n")
        progress.update(int((index - curChaptersNum + 1) / total * 100))
    return coverUrl, len(chaptersHtml), data
def __train_emission(cls, data_lines):
    """Train the emission parameter (character -> pinyin reading counts).

    Args:
        data_lines(list): content list of training data
    """
    print('Begin training emission parameter...')
    total = len(data_lines)
    progress = LineProgress(title='Training emission')
    emission_counts = dict()
    for processed, raw_line in enumerate(data_lines, start=1):
        progress.update(processed * 100 / total)
        text = raw_line.strip()
        if not text or not is_chinese(text):
            continue
        # heteronym=True yields every candidate reading per character
        readings = pinyin(text, style=NORMAL, heteronym=True)
        for character, candidates in zip(text, readings):
            counts = emission_counts.setdefault(character, dict())
            for reading in candidates:
                counts[reading] = counts.get(reading, 0) + 1
    print('\nInserting into database...')
    for character, counts in emission_counts.items():
        # normalizing constant for this character's reading distribution
        denom = sum(counts.values())
        for reading, freq in counts.items():
            # noinspection PyTypeChecker
            cls.insert(Emission, character=character, pin_yin=reading,
                       prob=float(np.log(freq / denom)))
    print('Done training emission parameter')
class Progress():
    """Counting wrapper that forwards percentage updates to a LineProgress bar."""

    def __init__(self, title, total):
        # number of completed steps so far
        self._counter = 0
        self.total = total
        self.progress = LineProgress(title=title)

    def show_progress(self):
        """Record one more completed step and redraw the bar."""
        self._counter += 1
        percent = int(self._counter / self.total * 100)
        self.progress.update(percent)
def __train_emission(cls, char_lines):
    """Train the emission parameter from mixed word/sentence training lines.

    Args:
        char_lines(list): training lines of the form "<text> <type>", where
            type 'S' marks a sentence (segmented with jieba before counting)
            and 'W' marks a word.
    """
    print('Begin training emission parameter...')
    length = len(char_lines)
    progress = LineProgress(title='Training emission')
    count = 0
    # character -> {pinyin key -> count}
    char_pinyin_prob = dict()
    for line in char_lines:
        count += 1
        progress.update(count * 100 / length)
        line = line.strip()
        # NOTE(review): assumes every line has at least two whitespace-separated
        # fields; a malformed line raises IndexError here — confirm input format.
        line, data_type = line.split()[0], line.split()[1]
        if data_type == 'S':
            # sentences are segmented into a list of words first
            line = list(jieba.cut(line))
        if len(line) == 0:
            continue
        if not is_chinese(line):
            continue
        if data_type == 'S':
            # per-word pinyin lists, one entry per segmented word
            pinyin_list = [pinyin(word, style=NORMAL) for word in line]
        else:
            # per-character pinyin for the word's characters
            pinyin_list = pinyin(line, style=NORMAL)
        for character, pinyin_s in zip(line, pinyin_list):
            pinyin_prob = char_pinyin_prob.get(character, dict())
            # Join the unit's readings into one space-separated key.
            # NOTE(review): for 'W' lines pinyin_s is a flat list of reading
            # strings, so py[0] takes only the first *letter* of each reading
            # here — looks suspicious; confirm against the 'S' path where py
            # is a one-element list and py[0] is the full reading.
            pin_yin = ' '.join([py[0] for py in pinyin_s])
            pinyin_prob[pin_yin] = pinyin_prob.get(pin_yin, 0) + 1
            if data_type == 'W':
                # For 'W' lines the full reading strings are also counted
                # individually, in addition to the joined key above —
                # NOTE(review): confirm this double-counting is intended.
                for pin_yin in pinyin_s:
                    pinyin_prob[pin_yin] = pinyin_prob.get(pin_yin, 0) + 1
            char_pinyin_prob[character] = pinyin_prob
    print('\nInserting into database...')
    for character, pinyin_prob in char_pinyin_prob.items():
        for pin_yin, prob in pinyin_prob.items():
            # noinspection PyTypeChecker
            cls.insert(Emission, character=character, pin_yin=pin_yin,
                       prob=float(np.log(prob / sum(pinyin_prob.values()))))
    print('Done training emission parameter')
def CreateProgress(title, max_l):
    """Animate a demo line progress bar.

    Args:
        title(str): caption shown beside the bar (previously ignored in
            favour of a hard-coded placeholder string)
        max_l(int): currently unused; kept for interface compatibility
            # TODO: wire max_l into the animation length
    """
    # Fix: use the caller-supplied title instead of the 'qweqweqew' placeholder.
    line_progress = LineProgress(title=title, total=100, symbol='#')
    for i in range(101):
        # Clamp at 100: the original pushed values up to 200 past the
        # bar's declared total.
        line_progress.update(min(i * 2, 100))
        time.sleep(0.05)
def data_analysis(fund_with_achievement, choice_return_this, choice_time_this): """ 按传入的训责策略,筛选出符合要求的基金 :param fund_with_achievement: 全部的基金信息文件名 :param choice_return_this: 要求的基金收益率 :param choice_time_this: 要求的任职时间 """ # 文件以a方式写入,先进行可能的文件清理 try: os.remove(fund_choice_filename) except FileNotFoundError: pass try: with open(fund_choice_filename, 'w') as f: if fund_with_achievement == all_index_fund_with_msg_filename: f.write(header_index_fund) else: f.write(header_guaranteed_fund) print('筛选基金。。。') with open(fund_with_achievement, 'r') as f: count = 0 all_lines = f.readlines()[1:] len_of_lines = len(all_lines) line_progress = LineProgress(title='爬取进度') for i in all_lines: # 逐条检查 count += 1 sign = 1 # 取基金信息,并按收益率和任职时间分类 _, _, one_month, three_month, six_month, one_year, three_year, from_st, _, this_tenure_time, \ this_return, all_tenure_time, _ = i.split(',') return_all = [ one_month, three_month, six_month, one_year, three_year, from_st, this_return ] time_all = [this_tenure_time, all_tenure_time] # 信息未知或一月数据不存在(成立时间过短)的淘汰 if one_month == '??' or one_month == '--': continue # 收益率部分的筛选 for j, k in zip(choice_return_this.values(), return_all): if k == '--': continue if float(k[:-1]) < j: sign = 0 break # 任职时间部分的筛选 if sign == 1: for j, k in zip(choice_time_this.values(), time_all): for l, m in zip(j, get_time_from_str(k)): if m > l: break elif m == l: continue else: sign = 0 break # 符合要求的保存进文件 if sign == 1: with open(fund_choice_filename, 'a') as f2: f2.write(i) line_progress.update(count * 100 // len_of_lines) except Exception as e: print(e)
def get_past_performance(source_file):
    """Crawl detailed information for every fund in the simple fund list.

    :param source_file: fund list to crawl — either a file name (str) or an
        iterable of "code,name" strings
    :return: list of (fund code, fund name) entries that failed to crawl
    """
    # Check the output files can be opened, and write the column headers.
    global thread_pool
    try:
        if source_file == all_fund_filename:
            with open(all_index_fund_with_msg_filename, 'w') as f:
                f.write(header_index_fund)
                f.write('\n')
            with open(all_guaranteed_fund_with_msg_filename, 'w') as f:
                f.write(header_guaranteed_fund)
                f.write('\n')
    except IOError:
        print('文件' + all_fund_filename + '无法打开')
        return
    if type(source_file) == str:
        # read the fund list from file, then delete the file
        with open(source_file, 'r') as f:
            fund_list = f.readlines()
        os.remove(source_file)
    else:
        fund_list = source_file
    # progress bar
    line_progress = LineProgress(title='爬取进度')
    # set of live worker threads
    thread = list()
    # queues receiving the per-category results from worker threads
    queue_index_fund = Queue()
    queue_guaranteed_fund = Queue()
    queue_other_fund = Queue()
    queue_give_up = Queue()
    lock_thread_pool = threading.Lock()
    last_queue_num = 0
    fund_list_length = len(fund_list)
    # funds already flushed to disk in previous save_file() calls
    ture_done_num = 0

    def save_file():
        # Drain the result queues into their files.
        with open(all_index_fund_with_msg_filename, 'a') as f:
            while not queue_index_fund.empty():
                i = queue_index_fund.get()
                for j in i:
                    f.write(j + ',')
                f.write('\n')
        with open(all_guaranteed_fund_with_msg_filename, 'a') as f:
            while not queue_guaranteed_fund.empty():
                i = queue_guaranteed_fund.get()
                for j in i:
                    f.write(j + ',')
                f.write('\n')

    for i in fund_list:
        # number of funds completed so far (across all result queues)
        done_num = (queue_index_fund.qsize() + queue_guaranteed_fund.qsize() +
                    queue_other_fund.qsize() + queue_give_up.qsize())
        try:
            code, name = i.split(',')
            name = name[:-1]
        except ValueError:
            continue
        # spawn one crawler thread per fund
        t = threading.Thread(target=thread_get_past_performance,
                             args=(code, name, queue_index_fund,
                                   queue_guaranteed_fund, queue_other_fund,
                                   queue_give_up, lock_thread_pool))
        thread.append(t)
        t.setName(code + ',' + name)
        t.start()
        # reap finished threads
        for t in thread:
            if not t.is_alive():
                thread.remove(t)
        # Throttle if the thread set grew too large; thread_pool adapts to the
        # completion rate since the last check.
        if len(thread) > thread_pool:
            thread_pool += done_num - last_queue_num
            last_queue_num = done_num
            while len(thread) > thread_pool:
                time.sleep(random.random())
                for t in thread:
                    if not t.is_alive():
                        thread.remove(t)
        line_progress.update(
            (ture_done_num + done_num) * 100 // fund_list_length)
        # Save the file after a batch of funds; crawling was observed to slow
        # down over time, so num_save_file and the sleep below can be tuned to
        # crawl intermittently and keep the speed up.
        if done_num >= num_save_file:
            time.sleep(5)
            thread_pool += done_num - last_queue_num
            last_queue_num = 0
            ture_done_num += queue_index_fund.qsize(
            ) + queue_guaranteed_fund.qsize()
            save_file()
    # wait for all remaining threads to finish
    while len(thread) > 0:
        for t in thread:
            if not t.is_alive():
                thread.remove(t)
        process = (queue_index_fund.qsize() + queue_guaranteed_fund.qsize() +
                   queue_other_fund.qsize() + queue_give_up.qsize() +
                   ture_done_num) * 100 // fund_list_length
        line_progress.update(process)
        time.sleep(random.random())
    save_file()
    print('\n基金信息爬取完成,其中处于封闭期或已终止的基金有' + str(queue_other_fund.qsize()) +
          '个,爬取失败的有' + str(queue_give_up.qsize()) + '个')
    return list(queue_give_up.get() for i in range(queue_give_up.qsize()))
def get_past_performance(all_fund_generator_or_list, first_crawling=True):
    """Crawl detailed information for every fund in the simple fund list.

    :param all_fund_generator_or_list: fund list to crawl — a generator, or
        directly a list of '基金代码,基金名称' strings
    :param first_crawling: whether this is the first crawl; decides whether
        the save files are rewritten (truncated and given column headers)
    :return: list of failed entries ('基金代码,基金名称')
    """
    maximum_of_thread = 1
    # Check the output files can be opened, and write the column headers.
    try:
        if first_crawling:
            with open(all_index_fund_with_msg_filename, 'w') as f:
                f.write(header_index_fund)
            with open(all_guaranteed_fund_with_msg_filename, 'w') as f:
                f.write(header_guaranteed_fund)
    except IOError:
        print('文件' + all_fund_filename + '无法打开')
        return
    # A list input is wrapped into a generator for uniform handling.
    if type(all_fund_generator_or_list) == list:
        all_fund_generator_or_list = (i for i in all_fund_generator_or_list)
    elif str(type(all_fund_generator_or_list)) != "<class 'generator'>":
        raise AttributeError
    # progress bar
    line_progress = LineProgress(title='爬取进度')
    # set of live worker threads
    thread = list()
    # queues receiving the per-category results from worker threads
    queue_index_fund = Queue()
    queue_guaranteed_fund = Queue()
    queue_other_fund = Queue()
    queue_give_up = Queue()
    num_of_previous_completed = 0
    num_of_last_addition_of_completed_fund_this_time = 0
    num_of_last_addition_give_up_fund = 0
    num_of_last_addition_other_fund = 0
    # set by workers to request a file flush; cleared once saving is done
    need_to_save_file_event = threading.Event()

    def save_file():
        nonlocal maximum_of_thread, num_of_last_addition_of_completed_fund_this_time, num_of_previous_completed, \
            num_of_last_addition_give_up_fund, num_of_last_addition_other_fund
        # Flush the queues to file and halve the maximum thread count.
        while True:
            need_to_save_file_event.wait()
            maximum_of_thread = (maximum_of_thread // 2) + 1
            num_of_last_addition_of_completed_fund_this_time = 0
            num_of_previous_completed += (
                queue_index_fund.qsize() + queue_guaranteed_fund.qsize() +
                queue_other_fund.qsize() + queue_give_up.qsize() -
                num_of_last_addition_give_up_fund -
                num_of_last_addition_other_fund)
            num_of_last_addition_give_up_fund = queue_give_up.qsize()
            with open(all_index_fund_with_msg_filename, 'a') as f:
                while not queue_index_fund.empty():
                    i = queue_index_fund.get()
                    for j in i:
                        f.write(j + ',')
                    f.write('\n')
            with open(all_guaranteed_fund_with_msg_filename, 'a') as f:
                while not queue_guaranteed_fund.empty():
                    i = queue_guaranteed_fund.get()
                    for j in i:
                        f.write(j + ',')
                    f.write('\n')
            need_to_save_file_event.clear()

    # background daemon that performs the flushes on demand
    t = threading.Thread(target=save_file)
    t.setDaemon(True)
    t.start()
    try:
        while True:
            i = next(all_fund_generator_or_list)
            try:
                code, name = i.split(',')
                name = name[:-1]
            except ValueError:
                continue
            # funds completed since the last file flush
            num_of_completed_this_time = (
                queue_index_fund.qsize() + queue_guaranteed_fund.qsize() +
                queue_other_fund.qsize() + queue_give_up.qsize() -
                num_of_last_addition_give_up_fund -
                num_of_last_addition_other_fund)
            # spawn one crawler thread per fund
            t = threading.Thread(target=thread_get_past_performance,
                                 args=(code, name, queue_index_fund,
                                       queue_guaranteed_fund, queue_other_fund,
                                       queue_give_up,
                                       need_to_save_file_event))
            thread.append(t)
            t.setName(code + ',' + name)
            t.start()
            # reap finished threads
            for t in thread:
                if not t.is_alive():
                    thread.remove(t)
            if len(thread) > maximum_of_thread:
                time.sleep(random.random())
                if need_to_save_file_event.is_set():
                    # a flush is in progress: busy-wait until it completes
                    while need_to_save_file_event.is_set():
                        pass
                else:
                    # grow the thread budget by the completion rate, then
                    # drain until the set is half the budget
                    maximum_of_thread += num_of_completed_this_time - num_of_last_addition_of_completed_fund_this_time
                    num_of_last_addition_of_completed_fund_this_time = num_of_completed_this_time
                    while len(thread) > maximum_of_thread // 2:
                        for t in thread:
                            if not t.is_alive():
                                thread.remove(t)
            line_progress.update(
                (num_of_previous_completed + num_of_completed_this_time) *
                100 // sum_of_fund)
    except StopIteration:
        pass
    # wait for all remaining threads to finish
    while len(thread) > 0:
        line_progress.update((sum_of_fund - len(thread)) * 100 // sum_of_fund)
        time.sleep(random.random())
        for t in thread:
            if not t.is_alive():
                thread.remove(t)
    line_progress.update(99)
    # trigger the final flush
    need_to_save_file_event.set()
    line_progress.update(100)
    print('\n基金信息爬取完成,其中处于封闭期或已终止的基金有' + str(queue_other_fund.qsize()) +
          '个,爬取失败的有' + str(queue_give_up.qsize()) + '个')
    return list(queue_give_up.get() for i in range(queue_give_up.qsize()))
thread_pool = [] if which == 1: # circle circle_progress = CircleProgress(title='circle loading') # circle_thread = threading.Thread(target=mock_single_progress, args=(circle_progress, 0.1)) # thread_pool.append(circle_thread) # circle_thread.start() for i in range(1, 101): circle_progress.update(i) time.sleep(0.1) elif which == 2: # line line_progress = LineProgress(title='line progress') for i in range(1, 101): line_progress.update(i) time.sleep(0.05) # line_thread = threading.Thread(target=mock_single_progress, args=(line_progress, 0.05)) # thread_pool.append(line_thread) # line_thread.start() elif which == 3: # multi line progress_manager = MultiProgressManager() thread1 = threading.Thread(target=mock_multi_progress, args=(progress_manager, 0.05), name=str(1001)) thread2 = threading.Thread(target=mock_multi_progress, args=(progress_manager, 0.2), name=str(1002)) thread3 = threading.Thread(target=mock_multi_progress,
def main(LIMT=20, ):
    """Run the mock scan dashboard.

    Plays loading animations, draws the static curses layout, then loops
    updating randomly generated scan statistics until Ctrl-C.

    Args:
        LIMT(int): timing limit (minutes) shown in the PROCESS TIMING panel
    """
    # --- intro animations -------------------------------------------------
    circle_progress = CircleProgress(title='Initialize loading')
    for i in range(1, 10):
        circle_progress.update(i)
        time.sleep(0.1)
    line_progress = LineProgress(title='Scan System Information')
    for i in range(1, 101):
        line_progress.update(i)
        time.sleep(0.02)
    line_progress = LineProgress(title='Scan Patch')
    for i in range(1, 101):
        line_progress.update(i)
        time.sleep(0.03)
    line_progress = LineProgress(title='Scan Model')
    for i in range(1, 101):
        line_progress.update(i)
        time.sleep(0.03)
    # --- counters backing the fake statistics -----------------------------
    cpuuser = 0
    MODLENUM = 0
    FORMATS = 0
    STACKS = 0
    REGSS = 0
    RWS = 0
    TOTALNUM = 0
    FORMATN = 0
    STACKN = 0
    REGN = 0
    RWN = 0
    start = timeit.default_timer()
    set_win()
    # --- static layout: labels at fixed (x, y, color) positions -----------
    display_info(" BJTU ZXY MAKARA ", 0, 0, 1)
    display_info(" Version: Beta 0.1 ", 17, 0, 2)
    display_info(" Gitee: https://gitee.com/zeroaone/makara \n", 36, 0, 3)
    str1 = (" __ __ _ _ __ _ ____ _\n"
            "| \/ | / \ | |/ / / \ | _ \ / \\\n"
            "| |\/| | / _ \ | ' / / _ \ | |_) | / _ \\\n"
            "| | | |/ ___ \| . \ / ___ \| _ < / ___ \\\n"
            "|_| |_/_/ \_\_|\_\/_/ \_\_| \_\/_/ \_\\\n"
            "\n")
    display_info_flash(str1, 0, 1)
    display_info("[CPU: %]", 0, 7, 4)
    display_info("[CPU-USER-TIME: ]", 0, 8, 4)
    display_info("[MEM: %]", 0, 9, 4)
    display_info("[ PROCESS TIMING ] \n", 0, 10, 1)
    display_info("TIMING LIMIT :" + str(LIMT) + " MIN", 0, 11, 1)
    display_info("TIMING RUNING:", 0, 12, 1)
    display_info("[ SCAN BUG STEP ] \n", 0, 13, 2)
    display_info("MODEL NUM:", 0, 14, 4)
    display_info("FORMAT STEPS:", 0, 15, 4)
    display_info("STACK STEPS:", 0, 16, 4)
    display_info("REGS STEPS:", 0, 17, 4)
    display_info("RW STEPS:", 0, 18, 4)
    display_info("[ SCAN BUG NUM ] \n", 0, 19, 2)
    display_info("TOTAL NUMS:", 0, 20, 4)
    display_info("FORMAT NUMS:", 0, 21, 4)
    display_info("STACK NUMS:", 0, 22, 4)
    display_info("REGS NUMS:", 0, 23, 4)
    display_info("RW NUMS:", 0, 24, 4)
    display_info("[ SYSTEM INFO ] \n", 0, 25, 2)
    display_info("CPU NAME:", 0, 26, 4)
    display_info("CPU ARCH:", 0, 27, 4)
    display_info("IP ADDRES:", 0, 28, 4)
    display_info("OS INFO:", 0, 29, 4)
    display_info("[LOG INFO LATEST 10] \n", 0, 30, 2)
    try:
        # --- live update loop; exits only via Ctrl-C ----------------------
        while 1:
            # advance the fake scan-step counters
            FORMATS = FORMATS + random.randint(0, 10)
            STACKS = STACKS + random.randint(0, 20)
            REGSS = REGSS + random.randint(0, 20)
            RWS = RWS + random.randint(0, 20)
            MODLENUM = FORMATS + STACKS + REGSS + RWS
            # PROCESS TIMING: advance the fake bug counters
            FORMATN = FORMATN + random.randint(0, 2)
            STACKN = STACKN + random.randint(0, 2)
            REGN = REGN + random.randint(0, 2)
            RWN = RWN + random.randint(0, 2)
            TOTALNUM = FORMATN + STACKN + REGN + RWN
            end = timeit.default_timer()
            time.sleep(1.5)
            # mocked system load figures
            mem = random.randint(0, 100)
            cpu = random.randint(0, 100)
            cpuuser = cpuuser + 1
            # SCAN BUG STEP panel values
            display_info(str(cpu), 6, 7, 5)
            display_info(str(cpuuser), 16, 8, 5)
            display_info(str(mem), 6, 9, 5)
            display_info(str(end - start), 14, 12, 5)
            display_info(str(MODLENUM), 15, 14, 5)
            display_info(str(FORMATS), 15, 15, 5)
            display_info(str(STACKS), 15, 16, 5)
            display_info(str(REGSS), 15, 17, 5)
            display_info(str(RWS), 15, 18, 5)
            display_info(str(TOTALNUM), 15, 20, 5)
            display_info(str(FORMATN), 15, 21, 5)
            display_info(str(STACKN), 15, 22, 5)
            display_info(str(REGN), 15, 23, 5)
            display_info(str(RWN), 15, 24, 5)
            # SYSTEM INFO panel (hard-coded demo values)
            display_info('Qualcomm Technologies, Inc SDM660', 15, 26, 5)
            display_info('ARM aarch64', 15, 27, 5)
            display_info('192.168.31.106', 15, 28, 5)
            display_info('Android 9', 15, 29, 5)
            # LOG INFO: random placeholder lines.
            # NOTE(review): str1 is rebuilt every iteration and shadows the
            # banner string above; randint(0, 12) never picks index 13.
            str1 = []
            str1.append("adasdasdasdasdasdasdasd")
            str1.append("sfasgea")
            str1.append("asasdasdasd")
            str1.append("ftrythfgh")
            str1.append("sadf34q")
            str1.append("vbvnvcbnbvnvb")
            str1.append("wrwersdf")
            str1.append("3454356ydh")
            str1.append("hdfhdfg")
            str1.append("eqwrsdfasdf")
            str1.append("cvbnvcb")
            str1.append("adhxcncvnvbxdfgsdgrsdrg")
            str1.append("fdsadfzxdvzxvd")
            str1.append("vzxcvzxcvzxcvzxvzxcvzxc")
            index = random.randint(0, 12)
            display_info(str1[index], 0, 31, 5)
            index = random.randint(0, 12)
            display_info(str1[index], 0, 32, 5)
            index = random.randint(0, 12)
            display_info(str1[index], 0, 33, 5)
            index = random.randint(0, 12)
            display_info(str1[index], 0, 34, 5)
            index = random.randint(0, 12)
            display_info(str1[index], 0, 35, 5)
            index = random.randint(0, 12)
            display_info(str1[index], 0, 36, 5)
            index = random.randint(0, 12)
            display_info(str1[index], 0, 37, 5)
            index = random.randint(0, 12)
            display_info(str1[index], 0, 38, 5)
            index = random.randint(0, 12)
            display_info(str1[index], 0, 39, 5)
            index = random.randint(0, 12)
            display_info(str1[index], 0, 40, 5)
            index = random.randint(0, 12)
            display_info(str1[index], 0, 41, 5)
    except KeyboardInterrupt:
        # restore the terminal before exiting
        unset_win()
def main(LIMT=20, ):
    """Run the scan dashboard backed by real system metrics.

    Plays loading animations, draws the static curses layout, then loops
    updating bug counts read from JSON result files and live CPU/memory
    figures from psutil, until Ctrl-C.

    Args:
        LIMT(int): timing limit (minutes) shown in the PROCESS TIMING panel
    """
    # --- intro animations -------------------------------------------------
    circle_progress = CircleProgress(title='Initialize loading')
    for i in range(1, 10):
        circle_progress.update(i)
        time.sleep(0.1)
    line_progress = LineProgress(title='Scan System Information')
    for i in range(1, 101):
        line_progress.update(i)
        time.sleep(0.02)
    line_progress = LineProgress(title='Scan Patch')
    for i in range(1, 101):
        line_progress.update(i)
        time.sleep(0.03)
    line_progress = LineProgress(title='Scan Model')
    for i in range(1, 101):
        line_progress.update(i)
        time.sleep(0.03)
    # --- counters backing the statistics ----------------------------------
    MODLENUM = 0
    FORMATS = 0
    STACKS = 0
    REGSS = 0
    RWS = 0
    TOTALNUM = 0
    FORMATN = 0
    STACKN = 0
    REGN = 0
    RWN = 0
    start = timeit.default_timer()
    set_win()
    # --- static layout: labels at fixed (x, y, color) positions -----------
    display_info(" BJTU ZXY MAKARA ", 0, 0, 1)
    display_info(" Version: Beta 0.1 ", 17, 0, 2)
    display_info(" Gitee: https://gitee.com/zeroaone/makara \n", 36, 0, 3)
    str1 = (
        " __ __ _ _ __ _ ____ _\n"
        "| \/ | / \ | |/ / / \ | _ \ / \\\n"
        "| |\/| | / _ \ | ' / / _ \ | |_) | / _ \\\n"
        "| | | |/ ___ \| . \ / ___ \| _ < / ___ \\\n"
        "|_| |_/_/ \_\_|\_\/_/ \_\_| \_\/_/ \_\\\n"
        "\n"
    )
    display_info_flash(str1, 0, 1)
    display_info("[CPU: %]", 0, 7, 4)
    display_info("[CPU-USER-TIME: ]", 0, 8, 4)
    display_info("[MEM: %]", 0, 9, 4)
    display_info("[ PROCESS TIMING ] \n", 0, 10, 1)
    display_info("TIMING LIMIT :" + str(LIMT) + " MIN", 0, 11, 1)
    display_info("TIMING RUNING:", 0, 12, 1)
    display_info("[ SCAN BUG STEP ] \n", 0, 13, 2)
    display_info("MODEL NUM:", 0, 14, 4)
    display_info("FORMAT STEPS:", 0, 15, 4)
    display_info("STACK STEPS:", 0, 16, 4)
    display_info("REGS STEPS:", 0, 17, 4)
    display_info("RW STEPS:", 0, 18, 4)
    display_info("[ SCAN BUG NUM ] \n", 0, 19, 2)
    display_info("TOTAL NUMS:", 0, 20, 4)
    display_info("FORMAT NUMS:", 0, 21, 4)
    display_info("STACK NUMS:", 0, 22, 4)
    display_info("REGS NUMS:", 0, 23, 4)
    display_info("RW NUMS:", 0, 24, 4)
    try:
        # --- live update loop; exits only via Ctrl-C ----------------------
        while 1:
            # advance the mocked scan-step counters
            MODLENUM = MODLENUM + random.randint(30, 50)
            FORMATS = FORMATS + random.randint(0, 20)
            STACKS = STACKS + random.randint(0, 20)
            REGSS = REGSS + random.randint(0, 20)
            RWS = RWS + random.randint(0, 20)
            # bug counts come from the scanner's JSON result files
            FORMATN = ReadNum("fmt.json")
            STACKN = ReadNum("stack.json")
            REGN = ReadNum("regs.json")
            RWN = ReadNum("ArbRW.json")
            TOTALNUM = FORMATN + STACKN + REGN + RWN
            end = timeit.default_timer()
            time.sleep(0.5)
            # real system figures via psutil
            mem = psutil.virtual_memory().percent
            cpu = psutil.cpu_percent()
            cpuuser = psutil.cpu_times().user
            display_info(str(cpu), 6, 7, 5)
            display_info(str(cpuuser), 16, 8, 5)
            display_info(str(mem), 6, 9, 5)
            display_info(str(end - start), 14, 12, 5)
            display_info(str(MODLENUM), 15, 14, 5)
            display_info(str(FORMATS), 15, 15, 5)
            display_info(str(STACKS), 15, 16, 5)
            display_info(str(REGSS), 15, 17, 5)
            display_info(str(RWS), 15, 18, 5)
            display_info(str(TOTALNUM), 15, 20, 5)
            display_info(str(FORMATN), 15, 21, 5)
            display_info(str(STACKN), 15, 22, 5)
            display_info(str(REGN), 15, 23, 5)
            display_info(str(RWN), 15, 24, 5)
    except KeyboardInterrupt:
        # restore the terminal before exiting
        unset_win()
def crawling_fund(fund_list_class: GetFundList, first_crawling=True):
    """Crawl detailed information for every fund in the simple fund list.

    :param fund_list_class: class providing the fund list to crawl
    :param first_crawling: whether this is the first crawl; decides whether
        the save files are rewritten (truncated and given column headers)
    :return: list of failed entries ('基金代码,基金名称')
    """
    # progress bar / total fund count / crawl progress
    line_progress = LineProgress(title='爬取进度')
    cur_process = 0
    # crawl input/output queues, input-finished event, network-health event,
    # and the crawl core process
    input_queue = Queue()
    result_queue = Queue()
    finish_sign = Event()
    network_health = Event()
    crawling_core = GetPageByWebWithAnotherProcessAndMultiThreading(
        input_queue, result_queue, finish_sign, network_health, TIMEOUT)
    crawling_core.start()
    fund_list = fund_list_class.get_fund_list()
    num_of_fund = fund_list_class.sum_of_fund
    having_fund_need_to_crawl = True
    # parsing is planned to be made independent in the future
    fund_web_page_parse = parse_fund_info()
    manager_web_page_parse = parse_manager_info()
    write_file = write_to_file(first_crawling)
    # prime the coroutines so they are ready to receive via send()
    next(fund_web_page_parse)
    next(manager_web_page_parse)
    next(write_file)
    if_first_show_network_problem = True
    while True:
        # warn once per network-problem episode
        if network_health.is_set():
            if if_first_show_network_problem:
                print('如果此条提示持续出现,请检查当前的网络状态')
                if_first_show_network_problem = False
        elif not if_first_show_network_problem:
            if_first_show_network_problem = True
        # Evaluation order (short-circuit): first whether funds remain to
        # crawl, then the amount of data waiting to be parsed (memory
        # control), finally the input queue's backlog.
        while having_fund_need_to_crawl and result_queue.qsize(
        ) < 100 and input_queue.qsize() < 10:
            try:
                code, name = next(fund_list).split(',')
            except StopIteration:
                having_fund_need_to_crawl = False
                break
            tem_fund_info = FundInfo()
            tem_fund_info.set_fund_info('基金名称', name)
            tem_fund_info.set_fund_info('基金代码', code)
            input_queue.put(
                ('http://fund.eastmoney.com/' + code + '.html', tem_fund_info))
        # Refill the input queue first to keep the crawl fast, then process
        # results that need parsing.
        while (input_queue.qsize() > 5
               or not having_fund_need_to_crawl) and result_queue.qsize():
            a_result = result_queue.get()
            # Retry failed crawls; items that fail forever are not excluded.
            if a_result[0] == 'error':
                input_queue.put(a_result[1:])
            else:
                # dispatch on the FundInfo's next_step state
                if a_result[2].next_step == 'parsing_fund':
                    new_fund_info: FundInfo = fund_web_page_parse.send(
                        a_result[1:])
                    if new_fund_info.next_step == 'parsing_manager':
                        # queue the manager page for crawling
                        input_queue.put(
                            (new_fund_info.manager_need_process_list[-1][0],
                             new_fund_info))
                    else:
                        result_queue.put((None, None, new_fund_info))
                elif a_result[2].next_step == 'parsing_manager':
                    new_fund_info: FundInfo = manager_web_page_parse.send(
                        a_result[1:])
                    if new_fund_info.next_step == 'parsing_manager':
                        # more managers left to process
                        input_queue.put(
                            (new_fund_info.manager_need_process_list[-1][0],
                             new_fund_info))
                    else:
                        result_queue.put((None, None, new_fund_info))
                elif a_result[2].next_step == 'writing_file':
                    write_file.send(a_result[2])
                    cur_process += 1
                    line_progress.update(100 * cur_process / num_of_fund)
                else:
                    print(
                        f'请检查FundInfo的next_step(此处为{a_result[2].next_step})设置,出现了未知的参数'
                    )
        # All-tasks-finished check: the condition must still hold after a
        # TIMEOUT grace period before we exit.
        if not having_fund_need_to_crawl and input_queue.qsize(
        ) == 0 and result_queue.qsize() == 0:
            time.sleep(TIMEOUT)
            if not having_fund_need_to_crawl and input_queue.qsize(
            ) == 0 and result_queue.qsize() == 0:
                break
    finish_sign.set()
import urllib.request as rq
import os
import json
from eprogress import LineProgress

# Download every "fix" icon listed in bilibili's index-icon JSON feed into
# save_dir, showing a textual progress bar.
url = "https://www.bilibili.com/index/index-icon.json"
save_dir = "bilibili_images/"

# Use the response as a context manager so the connection is closed even on
# error (the original never closed the urlopen response).
with rq.urlopen(url) as res:
    index_json = json.loads(res.read())

# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir
os.makedirs(save_dir, exist_ok=True)

images = index_json["fix"]
total = len(images)
progress = LineProgress(title='total ' + str(total) +
                        ' images, downloading progress')
for index, im in enumerate(images):
    title, icon_url = im["title"], im["icon"]
    # icon urls are protocol-relative; keep the original file extension
    rq.urlretrieve("http:" + icon_url,
                   filename=os.path.join(
                       save_dir, title + "." + icon_url.split('.')[-1]))
    progress.update(int((index + 1) / total * 100))
print("\ndone.")