def _wrapper(self, logpath, *args, **kwargs):
    """Timer wrapper

    Mainly wraps download_alltarget() to add a timing extension
    :param logpath: log save path
    :param args: pythonic variable arguments
    :param kwargs: pythonic keyword arguments
    :return: none
    """
    log_content = "launch timer decorator, start download threads timer"
    self.wca_logprowork(logpath, log_content)
    starttime = time.time()
    origin_func(self, logpath, *args, **kwargs)  # call the wrapped original function
    endtime = time.time()
    elapsed_time = endtime - starttime
    average_download_speed = float(WkvCwApi._datastream_pool / elapsed_time)
    log_content = (dl.BY_CB(
        "all threads reclaimed, total downloaded data-stream size: %0.2fMB, "
        "average download speed: %0.2fkB/s"
        % (float(WkvCwApi._datastream_pool / 1024), average_download_speed)))
    self.wca_logprowork(logpath, log_content)
    WkvCwApi._datastream_pool = 0  # reset the global data-stream counter
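# A minimal sketch of the enclosing decorator _wrapper belongs to, assuming
# the usual closure pattern (the name _timer is hypothetical; only origin_func
# and _wrapper appear in the source):
import time

def _timer(origin_func):
    """Decorator: log the wall-clock time origin_func takes to run"""
    def _wrapper(self, logpath, *args, **kwargs):
        starttime = time.time()
        origin_func(self, logpath, *args, **kwargs)
        print('elapsed: %0.2fs' % (time.time() - starttime))
    return _wrapper

# usage sketch: decorate the download entry point
# @_timer
# def wca_download_alltarget(self, logpath, urls, basepages, workdir): ...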
def rtn_gather_rankingdata(self):
    """Crawl the daily ranking list

    :return: status code
    """
    response = self.wkv_cw_api.wca_url_request_handler(target_url=self.rtn_req_url,
                                                       post_data=self.wkv_cw_api.getway_data,
                                                       timeout=30,
                                                       target_page_word='rankpage',
                                                       logpath=self.logpath)
    # size info in the webpage source
    web_src = response.read().decode("UTF-8", "ignore")
    imgitem_pattern = re.compile(dl.RANKING_SECTION_REGEX, re.S)
    info_pattern = re.compile(dl.RANKING_INFO_REGEX, re.S)
    sizer_result = self.wkv_cw_api.wca_commit_spansizer(imgitem_pattern, info_pattern, web_src)
    if sizer_result == dl.PUB_E_FAIL:
        return dl.PUB_E_FAIL
    url_lst = sizer_result['url lst']
    img_info_lst = sizer_result['info lst']

    # trim the url list down to the requested image count
    valid_url_cnt = len(url_lst)
    if self.ir_mode == dl.MODE_INTERACTIVE:
        img_nbr = self.rtn_gather_essential_info(self.page_opt, valid_url_cnt)
        if img_nbr == dl.PUB_E_PARAM_FAIL:
            return dl.PUB_E_FAIL
    elif self.ir_mode == dl.MODE_SERVER:
        img_nbr = valid_url_cnt  # server mode directly takes all alive targets
        dl.LT_PRINT(dl.BY_CB('server mode auto crawl all of alive targets'))
    self.rtn_target_urls = url_lst[:img_nbr]

    log_content = dl.BY_CB('crawl ranking top ' + str(img_nbr) + ', target table:')
    self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
    image_info_table = PrettyTable(
        ["ImageNumber", "ImageID", "ImageTitle", "ImageID+PageNumber", "AuthorID", "AuthorName"])
    for k, i in enumerate(img_info_lst[:img_nbr]):
        self.rtn_basepages.append(dl.BASEPAGE_URL(i[3]))  # used by the url request header
        image_info_table.add_row([(k + 1), i[3], i[1],
                                  dl.FROM_URL_GET_IMG_NAME(self.rtn_target_urls[k]),
                                  i[4], i[2]])
    # damn emoji, the dump may fail
    try:
        self.wkv_cw_api.wca_logprowork(self.logpath, str(image_info_table), False)
    except Exception as e:
        dl.LT_PRINT(dl.BR_CB('error: %s, dump prettytable interrupt' % str(e)))
    return dl.PUB_E_OK
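# A minimal sketch of the contract wca_commit_spansizer is assumed to follow
# (the body is illustrative, not the real implementation): apply the section
# regex to the page source, apply the info regex inside each section, and
# return the two parallel lists under the 'url lst' / 'info lst' keys the
# caller above reads.
def wca_commit_spansizer_sketch(imgitem_pattern, info_pattern, web_src):
    sections = re.findall(imgitem_pattern, web_src)
    if not sections:
        return dl.PUB_E_FAIL
    url_lst, info_lst = [], []
    for section in sections:
        info = re.findall(info_pattern, section)
        if info:
            info_lst.append(info[0])    # tuple of captured fields
            url_lst.append(info[0][0])  # assumption: first capture group is the url
    return {'url lst': url_lst, 'info lst': info_lst}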
def wca_save_test_html(self, filename, workdir, content):
    """Save a requested web source page into an html file, for test use

    @@API that allows external calls
    :param filename: html file name to save as
    :param workdir: work directory
    :param content: content to save (web source code)
    :return: none
    """
    with open(workdir + '/' + filename + '.html', "w", encoding='utf-8') as htmlfile:
        htmlfile.write(content)
    dl.LT_PRINT(dl.BY_CB('save test request html page ok'))
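# Usage sketch (hypothetical call site): dump the ranking page source fetched
# in rtn_gather_rankingdata for offline inspection; 'rankpage' and
# dl.HTML_PATH mirror names used elsewhere in this section.
# self.wkv_cw_api.wca_save_test_html('rankpage', dl.HTML_PATH, web_src)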
def wca_camouflage_login(self):
    """Camouflage the browser to log in

    If login fails, the program exits here
    @@API that allows external calls
    :return: status code
    """
    if WkvCwApi._login_once_flag:
        return dl.PUB_E_OK
    else:
        WkvCwApi._login_once_flag = True
    if self._gatherpostkey() != dl.PUB_E_OK:
        exit(dl.PUB_E_RESPONSE_FAIL)
    cookie_jar = self._get_chrome_cookie(dl.local_cache_cookie_path, dl.HTTPS_HOST_URL)
    self.cookieHandler = urllib.request.HTTPCookieProcessor(cookie_jar)
    self.opener = urllib.request.build_opener(self.cookieHandler)
    urllib.request.install_opener(self.opener)
    response = self.wca_url_request_handler(target_url=dl.LOGIN_REQUEST_API_URL,
                                            post_data=self.postway_data,
                                            timeout=30,
                                            target_page_word='login',
                                            logpath=None)
    if response == dl.PUB_E_RESPONSE_FAIL:
        dl.LT_PRINT(dl.BR_CB('login response returned a boolean FALSE, exit'))
        exit(dl.PUB_E_RESPONSE_FAIL)
    web_src = response.read().decode("UTF-8", "ignore")
    dl.LT_PRINT(dl.BY_CB('response source: %s'
                         % web_src.encode("UTF-8").decode("unicode_escape")))
    login_info_pattern = re.compile(dl.LOGIN_INFO_REGEX, re.S)
    response_info = re.findall(login_info_pattern, web_src)
    if response_info:
        # the captured error field is the string 'false' when there is no error
        if response_info[0] != 'false':
            dl.LT_PRINT(dl.BR_CB('login confirm raised an error, exit'))
            exit(dl.PUB_E_RESPONSE_FAIL)
        else:
            dl.LT_PRINT('login check response right')
    else:
        dl.LT_PRINT('login confirm response carries no error status field, exit')
        exit(dl.PUB_E_RESPONSE_FAIL)
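# A minimal sketch of what _get_chrome_cookie is assumed to do: read the local
# Chrome cookie store (an SQLite database) and rebuild a CookieJar that
# HTTPCookieProcessor can consume. The table schema and the plaintext 'value'
# column are assumptions; real Chrome builds usually store only
# 'encrypted_value', which needs platform-specific decryption first.
import http.cookiejar
import sqlite3

def get_chrome_cookie_sketch(cookie_db_path, host_url):
    jar = http.cookiejar.CookieJar()
    conn = sqlite3.connect(cookie_db_path)
    try:
        rows = conn.execute(
            "SELECT host_key, name, value, path, expires_utc FROM cookies "
            "WHERE host_key LIKE ?",
            ('%' + host_url.split('//')[-1].strip('/') + '%',))
        for host, name, value, path, expires in rows:
            jar.set_cookie(http.cookiejar.Cookie(
                version=0, name=name, value=value, port=None, port_specified=False,
                domain=host, domain_specified=True, domain_initial_dot=host.startswith('.'),
                path=path, path_specified=True, secure=False, expires=expires,
                discard=False, comment=None, comment_url=None, rest={}))
    finally:
        conn.close()
    return jar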
def wca_download_alltarget(self, logpath, urls, basepages, workdir):
    """Multi-thread download of all images

    @@API that allows external calls
    :param logpath: log save path
    :param urls: all original image urls
    :param basepages: all referer basic pages
    :param workdir: work directory
    :return: none
    """
    thread_block_flag = False  # thread-creation blocking flag
    alive_thread_cnt = queue_length = len(urls)
    log_content = dl.BY_CB('hit %d target(s), start download task(s)' % queue_length)
    self.wca_logprowork(logpath, log_content)

    # capture timeout and user-interrupt faults and exit the failed thread
    try:
        for i, one_url in enumerate(urls):
            self._MultiThreading.lock_t.acquire()
            if len(self._MultiThreading.queue_t) > dl.SYSTEM_MAX_THREADS:
                thread_block_flag = True
                self._MultiThreading.lock_t.release()
                # when the number of created threads reaches the max limit the
                # program stops here and waits; each time one thread finishes,
                # the next one is created
                self._MultiThreading.event_t.wait()
            else:
                self._MultiThreading.lock_t.release()
            # build the overridden threading.Thread object
            sub_thread = self._MultiThreading(i, one_url, basepages, workdir, logpath)
            # daemon property of every download sub-thread:
            # False: exiting one thread does not end the others
            # True: quitting one quits all
            sub_thread.daemon = True
            # creating this sub-thread may fail
            if sub_thread.create() == dl.PUB_E_FAIL:
                log_content = dl.BR_CB('create a new sub-thread failed')
                print(log_content)
                return dl.PUB_E_FAIL
            if not thread_block_flag:
                log_content = dl.BY_CB('created {:d} download target object(s)')
            else:
                log_content = dl.BY_CB(
                    'created {:d} download target object(s), thread creation is blocked, please wait')
            dl.LT_FLUSH(log_content, i + 1)
        print(dl.BY_CB(', all threads have been loaded OK'))
        thread_block_flag = False

        # the parent thread waits for all sub-threads to end; the total thread
        # count is 1 parent thread plus n sub-thread(s), so once all pictures
        # have been downloaded the count drops back to 1
        while alive_thread_cnt > 1:
            self.alivethread_counter = threading.active_count()
            # print the alive thread count whenever it changes; its value
            # never exceeds the thread max limit
            if alive_thread_cnt != self.alivethread_counter:
                alive_thread_cnt = self.alivethread_counter  # update alive thread count
                log_content = dl.BY_CB(
                    'currently remaining sub-thread(s):({:4d}/{:4d}), completed:({:4.1%})|({:5.2f}MB)')
                dl.LT_FLUSH(log_content,
                            alive_thread_cnt - 1, queue_length,
                            ((queue_length - (alive_thread_cnt - 1)) / queue_length),
                            (float(WkvCwApi._datastream_pool / 1024)))
        print(dl.BY_CB(', sub-threads execute finished'))
    except KeyboardInterrupt:
        print(dl.BY_CB(', user interrupt a thread, exit all threads'))
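# A minimal sketch of the synchronization contract assumed above for
# self._MultiThreading (the class body is not in this section; the
# queue_t/lock_t/event_t semantics are inferred from the call sites): a shared
# list tracks live workers, a lock guards it, and an event wakes the producer
# loop when a slot frees up.
import threading

class MultiThreadingSketch(threading.Thread):
    queue_t = []                 # shared: ids of live worker threads
    lock_t = threading.Lock()    # guards queue_t
    event_t = threading.Event()  # signaled when a worker finishes

    def __init__(self, tid, url, basepages, workdir, logpath):
        super().__init__()
        self.tid = tid
        self.url = url

    def create(self):
        """Register this worker in the shared queue, then start it"""
        with self.lock_t:
            self.queue_t.append(self.tid)
        self.start()
        return 0  # stands in for dl.PUB_E_OK

    def run(self):
        try:
            pass  # the real worker downloads self.url here
        finally:
            with self.lock_t:
                self.queue_t.remove(self.tid)
            # wake the producer blocked in event_t.wait(); the producer is
            # expected to clear the event before its next wait
            self.event_t.set()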
def _login_preload(self, aes_file_path):
    """Get user-input login info and store it in an AES file

    If the project directory has no such file, you need to hand-input the
    login info, and the program will create a new file to store the
    AES-encrypted info in. This method uses pycrypto, an external dependency
    :param aes_file_path: .aes_crypto_login.ini file path
    :return: none
    """
    if os.path.exists(aes_file_path):
        # read the rows to get username and password
        read_aes_file = open(aes_file_path, 'rb+')
        readline_cache = read_aes_file.readlines()  # list of all lines
        read_aes_file.close()
        # get the AES file storage info and strip the tail '\n'
        aes_info = {
            'iv_param': readline_cache[0][:-1],
            'user_mail': readline_cache[1][:-1],
            'passwd': readline_cache[2][:-1]
        }
        # decrypt the stored hash values back to strings
        username_aes_decrypt_cipher = AES.new(dl.AES_SECRET_KEY, AES.MODE_CFB,
                                              aes_info['iv_param'])
        username = str(username_aes_decrypt_cipher.decrypt(
            aes_info['user_mail'][AES.block_size:]), 'UTF-8')
        password_aes_decrypt_cipher = AES.new(dl.AES_SECRET_KEY, AES.MODE_CFB,
                                              aes_info['iv_param'])
        passwd = str(password_aes_decrypt_cipher.decrypt(
            aes_info['passwd'][AES.block_size:]), 'UTF-8')
        if self.ir_mode == dl.MODE_INTERACTIVE:
            check = dl.LT_INPUT(dl.HL_CY("get user account info ok, check: \n"
                                         "[*username] %s\n[*password] %s\n"
                                         "Is that correct? (Y/N): " % (username, passwd)))
            # if the user judges the info to be wrong, delete the old AES file
            # and record the new info
            if check in ('N', 'n'):
                os.remove(aes_file_path)  # delete old AES file
                # temporarily enter the login information
                dl.LT_PRINT(dl.BY_CB("Well, you need hand-input your login data: "))
                username = dl.LT_INPUT(dl.HL_CY('enter your pixiv id(mailbox), must be a R18: '))
                passwd = getpass.getpass(dl.realtime_logword(dl.base_time) +
                                         dl.HL_CY('enter your account password: '))
    else:
        # no AES file yet: hand-input the login info
        # (the original branch that encrypts the input and writes the new AES
        # file is elided in this excerpt)
        username = dl.LT_INPUT(dl.HL_CY('enter your pixiv id(mailbox), must be a R18: '))
        passwd = getpass.getpass(dl.realtime_logword(dl.base_time) +
                                 dl.HL_CY('enter your account password: '))
    getway_register = [('user', username), ('pass', passwd)]
    getway_data = urllib.parse.urlencode(getway_register).encode(encoding='UTF8')
    self.username = username
    self.passwd = passwd
    self.getway_data = getway_data
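# A minimal sketch of the encrypt side that would produce the three-line
# .aes_crypto_login.ini layout the reader above expects (an iv line, then the
# AES-CFB ciphertexts, each prefixed with the iv so the reader can skip
# AES.block_size bytes). Written against the pycrypto API the source already
# uses; the helper name is hypothetical.
from Crypto import Random
from Crypto.Cipher import AES

def write_aes_login_file_sketch(aes_file_path, secret_key, username, passwd):
    iv = Random.new().read(AES.block_size)
    user_cipher = AES.new(secret_key, AES.MODE_CFB, iv)
    pass_cipher = AES.new(secret_key, AES.MODE_CFB, iv)
    with open(aes_file_path, 'wb') as f:
        f.write(iv + b'\n')
        f.write(iv + user_cipher.encrypt(username.encode('UTF-8')) + b'\n')
        f.write(iv + pass_cipher.encrypt(passwd.encode('UTF-8')) + b'\n')
    # note: this readline-based layout only round-trips if neither the iv nor
    # the ciphertexts happen to contain a b'\n' byte, which the reader above
    # implicitly assumes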
def rtn_target_confirm(self):
    """Input options and confirm the target

    :return: status code
    """
    req_url = None    # request target ranking url
    rank_word = None  # ranking word
    dwm_opt = None    # daily/weekly/monthly
    if self.ir_mode == dl.MODE_INTERACTIVE:
        page_opt = dl.LT_INPUT(dl.HL_CY('select ranking type, ordinary(1) | r18(2) | r18g(3): '))
        sex_opt = dl.LT_INPUT(dl.HL_CY('select sex favor, normal(0) | male(1) | female(2): '))
    elif self.ir_mode == dl.MODE_SERVER:
        page_opt = self.rtn_r18_arg
        sex_opt = self.rtn_sex_opt
    else:
        dl.nolog_raise_arguerr()
        return dl.PUB_E_PARAM_FAIL

    if page_opt == dl.PAGE_ORDINARY:
        if self.ir_mode == dl.MODE_INTERACTIVE:
            dwm_opt = dl.LT_INPUT(dl.HL_CY('select daily(1) | weekly(2) | monthly(3) ordinary ranking type: '))
        elif self.ir_mode == dl.MODE_SERVER:
            dwm_opt = self.rtn_rank_type
        else:
            dl.nolog_raise_arguerr()
            return dl.PUB_E_PARAM_FAIL
        if dwm_opt == dl.RANK_DAILY:
            if sex_opt == dl.SEX_NORMAL:
                req_url = dl.RANK_DAILY_URL
                rank_word = dl.DAILY_WORD
            elif sex_opt == dl.SEX_MALE:
                req_url = dl.RANK_DAILY_MALE_URL
                rank_word = dl.MALE_WORD
            elif sex_opt == dl.SEX_FEMALE:
                req_url = dl.RANK_DAILY_FEMALE_URL
                rank_word = dl.FEMALE_WORD
            else:
                dl.nolog_raise_arguerr()
                return dl.PUB_E_PARAM_FAIL
        elif dwm_opt == dl.RANK_WEEKLY:
            req_url = dl.RANK_WEEKLY_URL
            rank_word = dl.WEEKLY_WORD
        elif dwm_opt == dl.RANK_MONTHLY:
            req_url = dl.RANK_MONTHLY_URL
            rank_word = dl.MONTHLY_WORD
        else:
            dl.nolog_raise_arguerr()
            return dl.PUB_E_PARAM_FAIL
    elif page_opt == dl.PAGE_R18:
        if self.ir_mode == dl.MODE_INTERACTIVE:
            dwm_opt = dl.LT_INPUT(dl.HL_CY('select daily(1)/weekly(2) R18 ranking type: '))
        elif self.ir_mode == dl.MODE_SERVER:
            dwm_opt = self.rtn_rank_type
        else:
            dl.nolog_raise_arguerr()
            return dl.PUB_E_PARAM_FAIL
        if dwm_opt == dl.RANK_DAILY:
            if sex_opt == dl.SEX_NORMAL:
                req_url = dl.RANK_DAILY_R18_URL
                rank_word = dl.DAILY_WORD
            elif sex_opt == dl.SEX_MALE:
                req_url = dl.RANK_DAILY_MALE_R18_URL
                rank_word = dl.MALE_WORD
            elif sex_opt == dl.SEX_FEMALE:
                req_url = dl.RANK_DAILY_FEMALE_R18_URL
                rank_word = dl.FEMALE_WORD
            else:
                dl.nolog_raise_arguerr()
                return dl.PUB_E_PARAM_FAIL
        elif dwm_opt == dl.RANK_WEEKLY:
            req_url = dl.RANK_WEEKLY_R18_URL
            rank_word = dl.WEEKLY_WORD
        else:
            dl.nolog_raise_arguerr()
            return dl.PUB_E_PARAM_FAIL
    elif page_opt == dl.PAGE_R18G:
        req_url = dl.RANK_R18G_URL
        rank_word = dl.R18G_WORD
        dl.LT_PRINT(dl.BR_CB('warning: you chose the r18g rank, hope you know what it means'))
    else:
        dl.nolog_raise_arguerr()
        return dl.PUB_E_PARAM_FAIL

    log_content = dl.BY_CB('base select option, set rank target url: [%s]' % req_url)
    self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
    self.rtn_req_url = req_url
    self.page_opt = page_opt
    return dl.PUB_E_OK
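# The chain above is a pure (page, rank-type, sex) -> (url, word) mapping; a
# table-driven alternative is sketched here for comparison. The table name and
# lookup helper are hypothetical; keys mirror the dl constants, and only the
# daily rows actually vary by sex favor (weekly/monthly rows would ignore it).
RANK_URL_TABLE_SKETCH = {
    (dl.PAGE_ORDINARY, dl.RANK_DAILY, dl.SEX_NORMAL): (dl.RANK_DAILY_URL, dl.DAILY_WORD),
    (dl.PAGE_ORDINARY, dl.RANK_DAILY, dl.SEX_MALE): (dl.RANK_DAILY_MALE_URL, dl.MALE_WORD),
    (dl.PAGE_ORDINARY, dl.RANK_DAILY, dl.SEX_FEMALE): (dl.RANK_DAILY_FEMALE_URL, dl.FEMALE_WORD),
    (dl.PAGE_R18, dl.RANK_DAILY, dl.SEX_NORMAL): (dl.RANK_DAILY_R18_URL, dl.DAILY_WORD),
    # ... weekly/monthly and the remaining R18 rows follow the same shape
}

def rank_url_lookup_sketch(page_opt, dwm_opt, sex_opt):
    try:
        return RANK_URL_TABLE_SKETCH[(page_opt, dwm_opt, sex_opt)]
    except KeyError:
        return dl.PUB_E_PARAM_FAIL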
def ira_crawl_allpage_target(self):
    """Package all gathered urls

    :return: status code
    """
    # number of subpages needed, i.e. ceil(ira_max_cnt / ONE_PAGE_COMMIT)
    if self.ira_max_cnt <= dl.ONE_PAGE_COMMIT:
        require_page_cnt = 1
    else:
        require_page_cnt = int(self.ira_max_cnt / dl.ONE_PAGE_COMMIT)
        # remainder decision
        if self.ira_max_cnt % dl.ONE_PAGE_COMMIT != 0:
            require_page_cnt += 1

    # build the json data urls
    iid_string_tail = ''
    subpage_url_list = []
    for ix in range(require_page_cnt):
        # one subpage only includes 6*8 valid images, the others are invalid
        tmp_tail_nbr = dl.ONE_PAGE_COMMIT * (ix + 1)
        tmp_tail_nbr = self.ira_max_cnt if tmp_tail_nbr > self.ira_max_cnt else tmp_tail_nbr
        for index in self.ira_pure_idlist[(dl.ONE_PAGE_COMMIT * ix):tmp_tail_nbr]:
            iid_string_tail += dl.IDS_UNIT(index)
        subpage_url_list.append(dl.ALLREPOINFO_URL(self.user_input_id, iid_string_tail,
                                                   1 if ix == 0 else 0))
        iid_string_tail = ''  # clear the last cache

    # collect all data from the response xhr pages into a temporary list
    tmp_receive_list = []
    for i in range(require_page_cnt):
        tmp_ret = self.ira_crawl_subpage_data(i + 1, subpage_url_list[i])
        if not isinstance(tmp_ret, list):
            return dl.PUB_E_FAIL
        tmp_receive_list += tmp_ret

    repo_target_all_list = []
    for i in range(len(tmp_receive_list)):
        tmp_receive_list[i][1] = dl.UNICODE_ESCAPE(tmp_receive_list[i][1])
        tmp_receive_list[i][1] = dl.EMOJI_REPLACE(tmp_receive_list[i][1])
        # build the original url without the image format suffix
        tmp = tmp_receive_list[i][2]
        tmp = tmp.replace('\\', '')
        tmp_receive_list[i][2] = dl.ORIGINAL_IMAGE_HEAD + tmp[-39:-7] + '.png'
        # first original url
        repo_target_all_list.append(tmp_receive_list[i])
        # add the other original image urls by pageCount
        tmp_page_count_str = tmp_receive_list[i][3]
        if tmp_page_count_str.isdigit():
            index_page_count = int(tmp_page_count_str)
            if index_page_count != 1:
                # page 0 has already been appended above, so start from page 1
                for px in range(1, index_page_count):
                    insert_item = [tmp_receive_list[i][0],
                                   tmp_receive_list[i][1],
                                   tmp_receive_list[i][2][:-5] + str(px) + '.png',
                                   tmp_receive_list[i][3]]
                    repo_target_all_list.append(insert_item)
        else:
            log_content = dl.BR_CB('page count process error')
            self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
            return dl.PUB_E_FAIL
    del tmp_receive_list

    alive_target_cnt = len(repo_target_all_list)
    if self.ir_mode == dl.MODE_INTERACTIVE:
        require_img_str = dl.LT_INPUT(dl.HL_CY(
            'crawl all repo %d, whole target(s): %d, enter the count you want: '
            % (self.ira_max_cnt, alive_target_cnt)))
        # loop until the user input is a number
        while not require_img_str.isdigit():
            dl.LT_PRINT(dl.BR_CB('input error, your input content was not a decimal number'))
            require_img_str = dl.LT_INPUT(dl.HL_CY('enter again(max is %d): ' % alive_target_cnt))
        require_img_nbr = int(require_img_str)
        if require_img_nbr <= 0:
            dl.LT_PRINT(dl.BR_CB('what the f**k is wrong with you?'))
            return dl.PUB_E_PARAM_FAIL
        require_img_nbr = alive_target_cnt if require_img_nbr > alive_target_cnt else require_img_nbr
    elif self.ir_mode == dl.MODE_SERVER:
        require_img_nbr = alive_target_cnt
        dl.LT_PRINT(dl.BY_CB('server mode auto crawl all of alive targets'))
    else:
        dl.nolog_raise_arguerr()
        return dl.PUB_E_PARAM_FAIL

    for i in repo_target_all_list[:require_img_nbr]:
        self.ira_target_capture.append(i[2])
        self.ira_basepages.append(dl.BASEPAGE_URL(i[0]))
    log_content = ('illustrator [%s] id [%s], require image(s): %d, target table:'
                   % (self.ira_author_name, self.user_input_id, require_img_nbr))
    self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
    image_info_table = PrettyTable(["ImageNumber", "ImageID", "ImageTitle", "ImagePageName"])
    for k, i in enumerate(repo_target_all_list[:require_img_nbr]):
        image_info_table.add_row([(k + 1), i[0], i[1], dl.FROM_URL_GET_IMG_NAME(i[2])])
    # damn emoji, the dump may fail
    try:
        self.wkv_cw_api.wca_logprowork(self.logpath, str(image_info_table), False)
    except Exception as e:
        dl.LT_PRINT(dl.BR_CB('error: %s, dump prettytable interrupt' % str(e)))
    del repo_target_all_list
    return dl.PUB_E_OK
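# The page-count computation at the top of ira_crawl_allpage_target is
# ceiling division; a compact arithmetic equivalent, shown only for clarity:
def ceil_div_sketch(total, per_page):
    """Pages needed to cover `total` items at `per_page` items each"""
    return -(-total // per_page)

# e.g. with ONE_PAGE_COMMIT = 48 (the 6*8 valid images per subpage noted
# above): ceil_div_sketch(48, 48) == 1, ceil_div_sketch(49, 48) == 2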
def main():
    """main logic

    Get user input arguments and launch the mode function
    :return: none
    """
    select_option = dl.SELECT_RTN
    rtn_page_opt = dl.PAGE_ORDINARY
    rtn_rank_opt = dl.RANK_DAILY
    rtn_sex_opt = dl.SEX_NORMAL
    ira_illust_id_list = []
    print(dl.HL_CR(WkvCwApi.__doc__))
    mode_interactive_server = dl.MODE_INTERACTIVE if len(sys.argv) == 1 else dl.MODE_SERVER
    api_instance = WkvCwApi(mode_interactive_server)
    api_instance.wca_camouflage_login()
    while True:
        if mode_interactive_server == dl.MODE_INTERACTIVE:
            select_option = dl.LT_INPUT(dl.HL_CY('login completed, select mode: '))
        else:
            # long options that take a value need a trailing '=' for getopt
            opts, args = getopt.getopt(sys.argv[1:], "hm:r:l:s:i:",
                                       ["help", "mode=", "R18=", "list=", "sex=", "id="])
            for opt, value in opts:
                if opt in ("-m", "--mode"):
                    select_option = value
                elif opt in ("-r", "--R18"):
                    rtn_page_opt = value
                elif opt in ("-l", "--list"):
                    rtn_rank_opt = value
                elif opt in ("-s", "--sex"):
                    rtn_sex_opt = value
                elif opt in ("-i", "--id"):
                    # server mode supports multiple ids split with ','
                    ira_illust_id_list = value.split(',')
                elif opt in ("-h", "--help"):
                    print(dl.HL_CR(WkvCwApi.__doc__))
                    exit(dl.PUB_E_OK)
        if select_option == dl.SELECT_RTN:
            dl.LT_PRINT(dl.BY_CB('mode: [Ranking Top N]'))
            rtn_instance = rtn(dl.RANK_DIR, dl.LOG_PATH, dl.HTML_PATH, api_instance,
                               mode_interactive_server, rtn_page_opt, rtn_rank_opt, rtn_sex_opt)
            rtn_instance.start()
        elif select_option == dl.SELECT_IRA:
            dl.LT_PRINT(dl.BY_CB('mode: [Illustrator Repository All]'))
            if mode_interactive_server == dl.MODE_SERVER:
                for ira_illust_id in ira_illust_id_list:
                    ira_instance = ira(dl.g_dl_work_dir, dl.LOG_NAME, dl.HTML_NAME, api_instance,
                                       mode_interactive_server, ira_illust_id)
                    ira_instance.start()
            else:
                ira_instance = ira(dl.g_dl_work_dir, dl.LOG_NAME, dl.HTML_NAME, api_instance,
                                   mode_interactive_server, '')
                ira_instance.start()
        elif select_option == dl.SELECT_HELP:
            print(dl.HL_CR(WkvCwApi.__doc__))
        elif select_option == dl.SELECT_EXIT:
            dl.LT_PRINT(dl.BY_CB("user exit program"))
            dl.crawler_logo()  # print the logo on exit
            exit(dl.PUB_E_OK)
        else:
            dl.nolog_raise_arguerr()
        if mode_interactive_server == dl.MODE_SERVER:
            exit(dl.PUB_E_OK)
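# Hypothetical server-mode invocations matching the option table above (the
# concrete values of the dl.SELECT_*/PAGE_*/RANK_*/SEX_* constants are not
# shown in this section, so the angle-bracket tokens are placeholders; the
# ids are illustrative):
#
#   python wkv_cw.py -m <SELECT_RTN> -r <PAGE_R18> -l <RANK_DAILY> -s <SEX_MALE>
#   python wkv_cw.py --mode <SELECT_IRA> --id 123456,789012
#
# with no arguments at all, the program enters interactive mode instead.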