def _get_chrome_cookie(cache_path, url):
    """Get chrome cookies with selenium
    @@API that allows external calls

    Due to the recaptcha mechanism set by the website, it is impossible to
    obtain the token using the normal method, and it is forced to adopt the
    webdriver of selenium.

    :param cache_path: local cookie cache text path
    :param url: selenium webdriver request url
    :return: cookie jar
    """
    cookie_jar = RequestsCookieJar()

    # first judge local cookie text file exist
    # if exists, just read it and return (avoids a slow headless-browser run)
    if os.path.exists(cache_path):
        dl.LT_PRINT('check local cookie file')
        with open(cache_path, "r") as fp:
            cookies = json.load(fp)
        # package to jar type
        for cookie in cookies:
            cookie_jar.set(cookie['name'], cookie['value'])
        return cookie_jar

    dl.LT_PRINT('start selenium webdriver')
    dl.LT_PRINT('target page: %s' % url)
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')  # hide window
    chrome_options.add_argument('--disable-extensions')  # disable chrome extensions
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--incognito')  # seamless mode
    chrome_options.add_argument('--blink-settings=imagesEnabled=false')  # do not load image
    ## chrome_options.add_argument('--start-maximized')
    chrome_options.add_argument('user-data-dir=' +
                                os.path.abspath(dl.chrome_user_data_dir))

    driver = webdriver.Chrome(chrome_options=chrome_options)
    try:
        # request website and get cookie
        driver.get(url)
        cookies = driver.get_cookies()
    finally:
        # bugfix: always release the browser even if the request raises,
        # otherwise a headless chrome process is leaked
        driver.close()
    dl.LT_PRINT('stop selenium webdriver')

    # save cookies to file for the next run
    with open(cache_path, "w") as fp:
        json.dump(cookies, fp, sort_keys=True, indent=4)

    # package to jar type
    for cookie in cookies:
        cookie_jar.set(cookie['name'], cookie['value'])
    return cookie_jar
def wca_camouflage_login(self):
    """Camouflage browser to login

    If login failed, program will exit here
    @@API that allows external calls

    :return: status code
    """
    # the login sequence must only ever run once per process
    if WkvCwApi._login_once_flag:
        return dl.PUB_E_OK
    WkvCwApi._login_once_flag = True

    if self._gatherpostkey() != dl.PUB_E_OK:
        exit(dl.PUB_E_RESPONSE_FAIL)

    # install a urllib opener carrying the selenium-harvested cookies so
    # all subsequent requests share the authenticated session
    cookie_jar = self._get_chrome_cookie(dl.local_cache_cookie_path,
                                         dl.HTTPS_HOST_URL)
    self.cookieHandler = urllib.request.HTTPCookieProcessor(cookie_jar)
    self.opener = urllib.request.build_opener(self.cookieHandler)
    urllib.request.install_opener(self.opener)

    response = self.wca_url_request_handler(
        target_url=dl.LOGIN_REQUEST_API_URL,
        post_data=self.postway_data,
        timeout=30,
        target_page_word='login',
        logpath=None)
    if response == dl.PUB_E_RESPONSE_FAIL:
        dl.LT_PRINT(dl.BR_CB('login response return a boolean FALSE, exit'))
        exit(dl.PUB_E_RESPONSE_FAIL)

    web_src = response.read().decode("UTF-8", "ignore")
    dl.LT_PRINT(dl.BY_CB('response source: %s'
                         % web_src.encode("UTF-8").decode("unicode_escape")))

    # the login API answers with an 'error' field; 'false' means success
    login_info_pattern = re.compile(dl.LOGIN_INFO_REGEX, re.S)
    response_info = re.findall(login_info_pattern, web_src)
    if not response_info:
        dl.LT_PRINT('login confirm response no error status')
        exit(dl.PUB_E_RESPONSE_FAIL)
    if response_info[0] != 'false':  # error false means no error
        dl.LT_PRINT(dl.BR_CB('login confirm raise a error, exit'))
        exit(dl.PUB_E_RESPONSE_FAIL)
    dl.LT_PRINT('login check response right')
    # bugfix: original implicitly returned None on the success path even
    # though the docstring promises a status code
    return dl.PUB_E_OK
def rtn_gather_essential_info(page_opt, whole_nbr):
    """Get input image count

    If user input number more than whole number, set target count is
    whole number. Only intercative mode call this function

    :param page_opt: select ranktop ordinary or r18 mode
    :param whole_nbr: whole ranking crawl count
    :return: crawl images count
    """
    # map the page option to its human-readable ranking label
    label_by_opt = {
        dl.PAGE_ORDINARY: 'ordinary',
        dl.PAGE_R18: 'r18',
        dl.PAGE_R18G: 'r18g',
    }
    if page_opt not in label_by_opt:
        dl.nolog_raise_arguerr()
        return dl.PUB_E_PARAM_FAIL
    label = label_by_opt[page_opt]

    img_str = dl.LT_INPUT(dl.HL_CY('crawl %s valid target %d, enter you want: '
                                   % (label, whole_nbr)))
    # re-prompt until the user types a decimal number
    while not img_str.isdigit():
        img_str = dl.LT_INPUT(dl.HL_CY('input error, enter again(max is %d): '
                                       % whole_nbr))

    img_cnt = int(img_str)
    if img_cnt <= 0:
        dl.LT_PRINT(dl.BR_CB('what the f**k is wrong with you?'))
        return dl.PUB_E_PARAM_FAIL
    # clamp the request to the number of actually available targets
    return min(img_cnt, whole_nbr)
def rtn_gather_rankingdata(self):
    """Crawl dailyRank list

    :return: status code
    """
    response = self.wkv_cw_api.wca_url_request_handler(
        target_url=self.rtn_req_url,
        post_data=self.wkv_cw_api.getway_data,
        timeout=30,
        target_page_word='rankpage',
        logpath=self.logpath)

    # size info in webpage source
    web_src = response.read().decode("UTF-8", "ignore")
    imgitem_pattern = re.compile(dl.RANKING_SECTION_REGEX, re.S)
    info_pattern = re.compile(dl.RANKING_INFO_REGEX, re.S)
    sizer_result = self.wkv_cw_api.wca_commit_spansizer(imgitem_pattern,
                                                        info_pattern, web_src)
    if sizer_result == dl.PUB_E_FAIL:
        return dl.PUB_E_FAIL
    url_lst = sizer_result['url lst']
    img_info_lst = sizer_result['info lst']

    # cut need image count to be target list
    valid_url_cnt = len(url_lst)
    if self.ir_mode == dl.MODE_INTERACTIVE:
        img_nbr = self.rtn_gather_essential_info(self.page_opt, valid_url_cnt)
        if img_nbr == dl.PUB_E_PARAM_FAIL:
            return dl.PUB_E_FAIL
    elif self.ir_mode == dl.MODE_SERVER:
        img_nbr = valid_url_cnt  # server mode directly get all of alive targets
        dl.LT_PRINT(dl.BY_CB('server mode auto crawl all of alive targets'))
    else:
        # bugfix: original left img_nbr undefined for an unknown mode,
        # which raised NameError below instead of failing cleanly
        return dl.PUB_E_FAIL

    self.rtn_target_urls = url_lst[:img_nbr]
    log_content = dl.BY_CB('crawl ranking top ' + str(img_nbr) + ', target table:')
    self.wkv_cw_api.wca_logprowork(self.logpath, log_content)

    image_info_table = PrettyTable(
        ["ImageNumber", "ImageID", "ImageTitle", "ImageID+PageNumber",
         "AuthorID", "AuthorName"])
    for k, i in enumerate(img_info_lst[:img_nbr]):
        self.rtn_basepages.append(dl.BASEPAGE_URL(i[3]))  # url request header use
        image_info_table.add_row([(k + 1), i[3], i[1],
                                  dl.FROM_URL_GET_IMG_NAME(self.rtn_target_urls[k]),
                                  i[4], i[2]])
    # damn emoji, maybe dump failed
    try:
        self.wkv_cw_api.wca_logprowork(self.logpath, str(image_info_table), False)
    except Exception as e:
        dl.LT_PRINT(dl.BR_CB('error: %s, dump prettytable interrupt' % str(e)))
    return dl.PUB_E_OK
def wca_commit_spansizer(whole_pattern, info_pattern, web_src):
    """A sizer for all of images in once commit item
    @@API that allows external calls

    After Pixiv 20181002 update, this method only support mode rtn

    :param whole_pattern: whole info data regex compile pattern
    :param info_pattern: image info regex compile pattern
    :param web_src: webpage source
    :return: original target url list & image info list dict,
             or dl.PUB_E_FAIL on regex failure
    """
    img_info_lst = []
    tgt_url_lst = []
    datasrc_pattern = re.compile(dl.DATASRC_REGEX, re.S)
    span_pattern = re.compile(dl.SPAN_REGEX, re.S)
    img_whole_info = re.findall(whole_pattern, web_src)

    # image have 3 format: jpg/png/gif
    # this crawler will give gif format up and crawl png or jpg
    # pixiv one repertory maybe have multi-images
    for item in img_whole_info:
        tmp_thumbnail = re.findall(datasrc_pattern, item)
        if not tmp_thumbnail:
            dl.LT_PRINT(dl.BR_CB('span sizer regex cannot get valid info'))
            return dl.PUB_E_FAIL
        thumbnail = tmp_thumbnail[0]
        judge_word = thumbnail[-18:]

        # check jpg/png or gif; give up gif format (or empty list)
        if judge_word != dl.JUDGE_NOGIF_WORD:
            continue

        span_word = re.findall(span_pattern, item)
        vaild_word = thumbnail[44:-18]
        # hoisted out of the page loop: the info tuple is identical for
        # every page of one commit (original re-ran findall per page)
        info = re.findall(info_pattern, item)[0]
        # multi-span commit crawls pages _p0.._p(span-1); a single-image
        # commit has no span word and only page _p0
        page_cnt = int(span_word[0]) if len(span_word) != 0 else 1
        for _px in range(page_cnt):
            img_info_lst.append(info)
            tgt_url_lst.append(dl.ORIGINAL_IMAGE_HEAD + vaild_word +
                               dl.ORIGINAL_IMAGE_TAIL(_px))

    return {'url lst': tgt_url_lst, 'info lst': img_info_lst}
def wca_save_test_html(self, filename, workdir, content):
    """Save request web source page in a html file, test use
    @@API that allows external calls

    :param filename: save html file name (without extension)
    :param workdir: work directory
    :param content: save content(web source code)
    :return: none
    """
    # context manager guarantees the handle is closed even if write fails
    with open(workdir + '/' + filename + '.html', "w",
              encoding='utf-8') as htmlfile:
        htmlfile.write(content)
    dl.LT_PRINT(dl.BY_CB('save test request html page ok'))
def wca_logprowork(self, logpath, log_content, withtime=True):
    """Universal work log save
    @@API that allows external calls

    Notice: If here print series fucntion raise UnicodeEncodeError, it
    must web page include emoji symbol encode title when use prettytable
    to package title info

    :param logpath: log save path; None means print only, no file write
    :param log_content: log save content
    :param withtime: default parameter, print and save with real time or not
    :return: none
    """
    # if log path is none, just print message and return, no log action
    if logpath is None:
        dl.LT_PRINT(log_content)
        return

    # append to the file ('a+'); content may include non-utf8-default text
    # (e.g. Japanese titles), so force utf-8 and close via context manager
    with open(logpath, 'a+', encoding='utf-8') as log_fd:
        if withtime:
            dl.LT_PRINT(log_content)
            # strip ANSI color escape chars before writing to the file
            log_content = self.wca_remove_color_chars(log_content)
            timestamp = dl.realtime_logword(dl.base_time)
            timestamp = self.wca_remove_color_chars(timestamp)
            timestamp = timestamp[:-1] + ' '  # timestamp has a space in tail
            log_fd.write(timestamp + log_content + '\n')
        else:
            print(log_content)
            log_content = self.wca_remove_color_chars(log_content)
            log_fd.write(log_content + '\n')
def _gatherpostkey(self):
    """POST way login need post-key

    Pixiv website POST login address: (see dl.LOGIN_POSTKEY_URL)
    This operation will get cookie and post-key

    :return: status code
    """
    self._login_preload(dl.LOGIN_AES_INI_PATH)
    response = self.wca_url_request_handler(
        target_url=dl.LOGIN_POSTKEY_URL,
        post_data=None,  # cannot set data when get post key
        timeout=30,
        target_page_word='post-key',
        logpath=None)
    page_source = response.read().decode("UTF-8", "ignore")
    # debug recaptcha v3 token use
    ## self.wca_save_test_html('post-key', 'E:\\OperationCache', page_source)

    key_matches = re.findall(re.compile(dl.POSTKEY_REGEX, re.S), page_source)
    if not key_matches:
        dl.LT_PRINT('regex parse post key failed')
        return dl.PUB_E_REGEX_FAIL

    # the login form fields must keep this exact order, hence OrderedDict
    form_fields = OrderedDict([
        ('captcha', ""),
        ('g_recaptcha_response', ""),
        ('password', self.passwd),
        ('pixiv_id', self.username),
        ('post_key', key_matches[0]),
        ('source', "accounts"),
        ('ref', ""),
        ('return_to', dl.HTTPS_HOST_URL),
        ('recaptcha_v3_token', ""),  # google recaptcha v3 token
    ])
    self.postway_data = urllib.parse.urlencode(form_fields).encode("UTF-8")
    return dl.PUB_E_OK
def wca_mkworkdir(self, logpath, folder):
    """Create a crawler work directory
    @@API that allows external calls

    :param logpath: log save path
    :param folder: folder create path
    :return: folder create path
    """
    # create a folder to save picture
    dl.LT_PRINT('crawler work directory setting: ' + folder)
    if not os.path.exists(folder):
        os.makedirs(folder)
        log_content = 'create a new work folder'
    else:
        log_content = 'target folder has already existed'
    # log file first line here: remove any stale log from a previous run
    if os.path.exists(logpath):
        os.remove(logpath)
    self.wca_logprowork(logpath, log_content)
    # bugfix: docstring promises the folder path but original returned None
    return folder
def _login_preload(self, aes_file_path):
    """Get user input login info and storage into aes file

    If project directory has no file, you need hand-input login info,
    then program will create new file to storage AES encrypt info to it
    This method use pycrypto, need import external call

    :param aes_file_path: .aes_crypto_login.ini file path
    :return: none
    """
    # NOTE(review): this method is corrupted in the source — several code
    # spans were replaced with '******' (looks like an automated
    # secret-masking artifact), so parts below are not valid Python.
    # Code left byte-identical; reconstruct from version-control history.
    if os.path.exists(aes_file_path):
        # stable read rows get username and password
        read_aes_file = open(aes_file_path, 'rb+')
        readline_cache = read_aes_file.readlines()  # all line list
        read_aes_file.close()
        # get aes file storage info and split tail '\n'
        # file layout: line 0 = IV, line 1 = encrypted mail, line 2 = encrypted password
        aes_info = {
            'iv_param': readline_cache[0][:-1],
            'user_mail': readline_cache[1][:-1],
            'passwd': readline_cache[2][:-1]
        }
        # analysis hash value to string: decrypt with the project AES key
        # in CFB mode; the stored value prefixes AES.block_size bytes
        username_aes_decrypt_cipher = AES.new(dl.AES_SECRET_KEY, AES.MODE_CFB,
                                              aes_info['iv_param'])
        username = str(
            username_aes_decrypt_cipher.decrypt(
                aes_info['user_mail'][AES.block_size:]), 'UTF-8')
        password_aes_decrypt_cipher = AES.new(dl.AES_SECRET_KEY, AES.MODE_CFB,
                                              aes_info['iv_param'])
        passwd = str(
            password_aes_decrypt_cipher.decrypt(
                aes_info['passwd'][AES.block_size:]), 'UTF-8')
        if self.ir_mode == dl.MODE_INTERACTIVE:
            check = dl.LT_INPUT(
                dl.HL_CY("get user account info ok, check: \n"
                         "[*username] %s\n[*password] %s\n"
                         "Is that correct? (Y/N): " % (username, passwd)))
            # if user judge info are error, delete old AES file and record new info
            if check == 'N' or check == 'n':
                os.remove(aes_file_path)  # delete old AES file
                # temporarily enter login information
                dl.LT_PRINT(
                    dl.BY_CB(
                        "Well, you need hand-input your login data: "))
                username = dl.LT_INPUT(
                    dl.HL_CY(
                        'enter your pixiv id(mailbox), must be a R18: '))
                # CORRUPTED below: masked span fused two prompt sequences;
                # presumably the missing-file else-branch asked the same
                # id/password questions — TODO confirm against history
                passwd = getpass.getpass(
                    dl.realtime_logword(dl.base_time) +
                    dl.HL_CY('enter your account password: '******'enter your pixiv id(mailbox), must be a R18: '))
                passwd = getpass.getpass(
                    dl.realtime_logword(dl.base_time) +
                    dl.HL_CY('enter your account password: '******'user', username), ('pass', passwd)]
    # GET-way request payload built from the gathered credentials
    getway_data = urllib.parse.urlencode(getway_register).encode(
        encoding='UTF8')
    self.username = username
    self.passwd = passwd
    self.getway_data = getway_data
def rtn_target_confirm(self):
    """Input option and confirm target

    Resolves page type (ordinary/r18/r18g), ranking period
    (daily/weekly/monthly) and sex-favor filter into one request url,
    reading choices interactively or from server-mode attributes.

    :return: status code
    """
    req_url = None    # request target ranking url
    rank_word = None  # ranking word (NOTE(review): assigned but never read in this method)
    dwm_opt = None    # daily/weekly/monthly
    if self.ir_mode == dl.MODE_INTERACTIVE:
        page_opt = dl.LT_INPUT(dl.HL_CY('select ranking type, ordinary(1) | r18(2) | r18g(3): '))
        sex_opt = dl.LT_INPUT(dl.HL_CY('select sex favor, normal(0) | male(1) | female(2): '))
    elif self.ir_mode == dl.MODE_SERVER:
        # server mode: options come from the constructor arguments
        page_opt = self.rtn_r18_arg
        sex_opt = self.rtn_sex_opt
    else:
        dl.nolog_raise_arguerr()
        return dl.PUB_E_PARAM_FAIL
    if page_opt == dl.PAGE_ORDINARY:
        if self.ir_mode == dl.MODE_INTERACTIVE:
            dwm_opt = dl.LT_INPUT(dl.HL_CY('select daily(1) | weekly(2) | monthly(3) ordinary ranking type: '))
        elif self.ir_mode == dl.MODE_SERVER:
            dwm_opt = self.rtn_rank_type
        else:
            dl.nolog_raise_arguerr()
            return dl.PUB_E_PARAM_FAIL
        if dwm_opt == dl.RANK_DAILY:
            # only the daily ranking additionally splits by sex favor
            if sex_opt == dl.SEX_NORMAL:
                req_url = dl.RANK_DAILY_URL
                rank_word = dl.DAILY_WORD
            elif sex_opt == dl.SEX_MALE:
                req_url = dl.RANK_DAILY_MALE_URL
                rank_word = dl.MALE_WORD
            elif sex_opt == dl.SEX_FEMALE:
                req_url = dl.RANK_DAILY_FEMALE_URL
                rank_word = dl.FEMALE_WORD
            else:
                dl.nolog_raise_arguerr()
                return dl.PUB_E_PARAM_FAIL
        elif dwm_opt == dl.RANK_WEEKLY:
            req_url = dl.RANK_WEEKLY_URL
            rank_word = dl.WEEKLY_WORD
        elif dwm_opt == dl.RANK_MONTHLY:
            req_url = dl.RANK_MONTHLY_URL
            rank_word = dl.MONTHLY_WORD
        else:
            dl.nolog_raise_arguerr()
            return dl.PUB_E_PARAM_FAIL
    elif page_opt == dl.PAGE_R18:
        # r18 ranking has only daily/weekly periods
        if self.ir_mode == dl.MODE_INTERACTIVE:
            dwm_opt = dl.LT_INPUT(dl.HL_CY('select daily(1)/weekly(2) R18 ranking type: '))
        elif self.ir_mode == dl.MODE_SERVER:
            dwm_opt = self.rtn_rank_type
        else:
            dl.nolog_raise_arguerr()
            return dl.PUB_E_PARAM_FAIL
        if dwm_opt == dl.RANK_DAILY:
            if sex_opt == dl.SEX_NORMAL:
                req_url = dl.RANK_DAILY_R18_URL
                rank_word = dl.DAILY_WORD
            elif sex_opt == dl.SEX_MALE:
                req_url = dl.RANK_DAILY_MALE_R18_URL
                rank_word = dl.MALE_WORD
            elif sex_opt == dl.SEX_FEMALE:
                req_url = dl.RANK_DAILY_FEMALE_R18_URL
                rank_word = dl.FEMALE_WORD
            else:
                dl.nolog_raise_arguerr()
                return dl.PUB_E_PARAM_FAIL
        elif dwm_opt == dl.RANK_WEEKLY:
            req_url = dl.RANK_WEEKLY_R18_URL
            rank_word = dl.WEEKLY_WORD
        else:
            dl.nolog_raise_arguerr()
            return dl.PUB_E_PARAM_FAIL
    elif page_opt == dl.PAGE_R18G:
        # r18g has a single ranking page, no period/sex split
        req_url = dl.RANK_R18G_URL
        rank_word = dl.R18G_WORD
        dl.LT_PRINT(dl.BR_CB('warning: you choose the r18g rank, hope you know what it means'))
    else:
        dl.nolog_raise_arguerr()
        return dl.PUB_E_PARAM_FAIL
    log_content = dl.BY_CB('base select option, set rank target url: [%s]' % req_url)
    self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
    # publish the resolved url/page type for the subsequent crawl step
    self.rtn_req_url = req_url
    self.page_opt = page_opt
    return dl.PUB_E_OK
def ira_crawl_allpage_target(self):
    """Package all gather urls

    Splits the artwork id list into xhr subpages, fetches each page's
    metadata, expands multi-page commits into individual image urls, then
    asks (interactive) or takes all (server) as the final target set.

    :return: status code
    """
    # number of xhr subpages needed to cover ira_max_cnt ids,
    # at dl.ONE_PAGE_COMMIT ids per page (rounding the remainder up)
    require_page_cnt = 0
    if self.ira_max_cnt <= dl.ONE_PAGE_COMMIT:
        require_page_cnt = 1
    else:
        require_page_cnt = int(self.ira_max_cnt / dl.ONE_PAGE_COMMIT)
        # remainder decision
        if self.ira_max_cnt % dl.ONE_PAGE_COMMIT != 0:
            require_page_cnt += 1
    # build the json data url
    iid_string_tail = ''
    subpage_url_list = []
    for ix in range(require_page_cnt):
        # one subpage only include 6*8 valid image, others are invalid
        tmp_tail_nbr = dl.ONE_PAGE_COMMIT * (ix + 1)
        tmp_tail_nbr = self.ira_max_cnt if tmp_tail_nbr > self.ira_max_cnt else tmp_tail_nbr
        # concatenate the id query fragment for this slice of the id list
        for index in self.ira_pure_idlist[(dl.ONE_PAGE_COMMIT * ix):tmp_tail_nbr]:
            iid_string_tail += dl.IDS_UNIT(index)
        # third argument flags the first page (1) vs the rest (0)
        subpage_url_list.append(dl.ALLREPOINFO_URL(self.user_input_id,
                                                   iid_string_tail,
                                                   1 if ix == 0 else 0))
        iid_string_tail = ''  # clear last cache
    # get all data from response xhr page into a temp list
    tmp_receive_list = []
    tmp_ret = []
    for i in range(require_page_cnt):
        tmp_ret = self.ira_crawl_subpage_data(i + 1, subpage_url_list[i])
        # a non-list return signals a subpage fetch/parse failure
        if not isinstance(tmp_ret, list):
            return dl.PUB_E_FAIL
        tmp_receive_list += tmp_ret
    repo_target_all_list = []
    for i in range(len(tmp_receive_list)):
        # item layout appears to be [id, title, url, pageCount] — TODO confirm
        # against ira_crawl_subpage_data
        tmp_receive_list[i][1] = dl.UNICODE_ESCAPE(tmp_receive_list[i][1])
        tmp_receive_list[i][1] = dl.EMOJI_REPLACE(tmp_receive_list[i][1])
        # build original url without image format
        tmp = tmp_receive_list[i][2]
        tmp = tmp.replace('\\', '')
        tmp_receive_list[i][2] = dl.ORIGINAL_IMAGE_HEAD + tmp[-39:-7] + '.png'
        # first original url
        repo_target_all_list.append(tmp_receive_list[i])
        # add other original image url by pageCount
        tmp_page_count_str = tmp_receive_list[i][3]
        if tmp_page_count_str.isdigit():
            index_page_count = int(tmp_page_count_str)
            if index_page_count != 1:
                # expand a multi-page commit into one entry per page
                # (rewrites the trailing page digit before '.png')
                for px in range(index_page_count):
                    insert_item = [tmp_receive_list[i][0],
                                   tmp_receive_list[i][1],
                                   tmp_receive_list[i][2][:-5] + str(px) + '.png',
                                   tmp_receive_list[i][3]]
                    repo_target_all_list.append(insert_item)
        else:
            log_content = dl.BR_CB('page count process error')
            self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
            return dl.PUB_E_FAIL
    del tmp_receive_list
    alive_target_cnt = len(repo_target_all_list)
    require_img_nbr = 0
    if self.ir_mode == dl.MODE_INTERACTIVE:
        require_img_str = dl.LT_INPUT(dl.HL_CY('crawl all repo %d, whole target(s): %d, enter you want count: '
                                               % (self.ira_max_cnt, alive_target_cnt)))
        # if user input isn't number
        while not require_img_str.isdigit():
            dl.LT_PRINT(dl.BR_CB('input error, your input content was not a decimal number'))
            require_img_str = dl.LT_INPUT(dl.HL_CY('enter again(max is %d): ' % alive_target_cnt))
        require_img_nbr = int(require_img_str)
        if require_img_nbr <= 0:
            dl.LT_PRINT(dl.BR_CB('what the f**k is wrong with you?'))
            return dl.PUB_E_PARAM_FAIL
        # clamp to the number of available targets
        require_img_nbr = alive_target_cnt if require_img_nbr > alive_target_cnt else require_img_nbr
    elif self.ir_mode == dl.MODE_SERVER:
        require_img_nbr = alive_target_cnt
        dl.LT_PRINT(dl.BY_CB('server mode auto crawl all of alive targets'))
    else:
        # NOTE(review): unknown mode leaves require_img_nbr at 0 (crawls nothing)
        pass
    # record target urls and their base (referer) pages
    for k, i in enumerate(repo_target_all_list[:require_img_nbr]):
        self.ira_target_capture.append(i[2])
        self.ira_basepages.append(dl.BASEPAGE_URL(i[0]))
    log_content = 'illustrator [%s] id [%s], require image(s): %d, target table:' \
                  % (self.ira_author_name, self.user_input_id, require_img_nbr)
    self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
    image_info_table = PrettyTable(["ImageNumber", "ImageID", "ImageTitle", "ImagePageName"])
    for k, i in enumerate(repo_target_all_list[:require_img_nbr]):
        image_info_table.add_row([(k + 1), i[0], i[1], dl.FROM_URL_GET_IMG_NAME(i[2])])
    # damn emoji, maybe dump failed
    try:
        self.wkv_cw_api.wca_logprowork(self.logpath, str(image_info_table), False)
    except Exception as e:
        dl.LT_PRINT(dl.BR_CB('error: %s, dump prettytable interrupt' % str(e)))
    del repo_target_all_list
    return dl.PUB_E_OK
def ira_gather_preloadinfo(self):
    """Crawler need to know how many images do you want

    This function will get author name base on author id

    :return: status code
    """
    # request all of one illustrator's artworks (ajax id-list endpoint)
    response = self.wkv_cw_api.wca_url_request_handler(target_url=dl.AJAX_ALL_URL(self.user_input_id),
                                                       post_data=self.wkv_cw_api.getway_data,
                                                       timeout=30,
                                                       target_page_word='ajaxpage',
                                                       logpath=self.logpath)
    # get artworks id list
    web_src = response.read().decode("UTF-8", "ignore")
    ajax_idlist_pattern = re.compile(dl.AJAX_ALL_IDLIST_REGEX, re.S)
    ajax_idlist = re.findall(ajax_idlist_pattern, web_src)
    if not ajax_idlist:
        log_content = dl.BR_CB('regex get ajax id list fail')
        self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
        return dl.PUB_E_REGEX_FAIL
    number_pattern = re.compile(dl.NUMBER_REGEX, re.S)
    for index in ajax_idlist:
        if index.isdigit():
            self.ira_pure_idlist.append(index)
        else:
            # id list result may include some garbages, use number regex get pure result
            one_pure_id = re.findall(number_pattern, index)
            if one_pure_id:
                self.ira_pure_idlist.append(one_pure_id[0])
            else:
                # no digits at all in this entry: silently drop it
                pass
    # website server require the descending list of sort artwork id:
    # convert to int, quick-sort ascending, then rebuild reversed as strings
    pure_idlist_nbr = []
    for index in self.ira_pure_idlist:
        pure_idlist_nbr.append(int(index))
    self.wkv_cw_api.wca_quick_sort(pure_idlist_nbr, 0, len(pure_idlist_nbr) - 1)
    self.ira_pure_idlist.clear()
    for index in reversed(pure_idlist_nbr):
        self.ira_pure_idlist.append(str(index))
    del pure_idlist_nbr
    self.ira_max_cnt = len(self.ira_pure_idlist)
    # get author name from member-main-page
    illust_mainpage_url = dl.USERS_ARTWORKS_URL(self.user_input_id)
    log_content = dl.HL_CY('crawl illustrator url: [%s]' % illust_mainpage_url)
    self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
    response = self.wkv_cw_api.wca_url_request_handler(target_url=illust_mainpage_url,
                                                       post_data=self.wkv_cw_api.getway_data,
                                                       timeout=30,
                                                       target_page_word='mainpage',
                                                       logpath=self.logpath)
    # match illustrator name
    web_src = response.read().decode("UTF-8", "ignore")
    illust_name_pattern = re.compile(dl.ILLUST_NAME_REGEX(self.user_input_id), re.S)
    author_info = re.findall(illust_name_pattern, web_src)
    if not author_info:
        # cannot catch illust name in mainpage if login failed
        dl.LT_PRINT(dl.BR_CB("Regex parsing result error, no author info"))
        return dl.PUB_E_REGEX_FAIL
    self.ira_author_name = author_info[0]
    log_content = dl.HL_CY('check illustrator: [%s]' % self.ira_author_name)
    self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
    return dl.PUB_E_OK
def main():
    """main logic

    Get user input arguments and launch mode function

    :return: none
    """
    # defaults used when the corresponding CLI option is absent
    select_option = dl.SELECT_RTN
    rtn_page_opt = dl.PAGE_ORDINARY
    rtn_rank_opt = dl.RANK_DAILY
    rtn_sex_opt = dl.SEX_NORMAL
    ira_illust_id_list = []
    print(dl.HL_CR(WkvCwApi.__doc__))
    # no CLI arguments -> interactive prompts; any argument -> server mode
    mode_interactive_server = dl.MODE_INTERACTIVE if len(
        sys.argv) == 1 else dl.MODE_SERVER
    api_instance = WkvCwApi(mode_interactive_server)
    api_instance.wca_camouflage_login()
    while True:
        if mode_interactive_server == dl.MODE_INTERACTIVE:
            select_option = dl.LT_INPUT(
                dl.HL_CY('login completed, select mode: '))
        else:
            # server mode: read options from argv each pass
            # (loop exits at the bottom after one iteration)
            opts, args = getopt.getopt(
                sys.argv[1:], "hm:r:l:s:i:",
                ["help", "mode", "R18", "list", "sex", "id"])
            for opt, value in opts:
                if opt in ("-m", "--mode"):
                    select_option = value
                elif opt in ("-r", "--R18"):
                    rtn_page_opt = value
                elif opt in ("-l", "--list"):
                    rtn_rank_opt = value
                elif opt in ("-s", "--sex"):
                    rtn_sex_opt = value
                elif opt in ("-i", "--id"):
                    ira_illust_id_list = value.split(
                        ','
                    )  # server mode support multi-input id and split with ','
                elif opt in ("-h", "--help"):
                    print(dl.HL_CR(WkvCwApi.__doc__))
                    exit(dl.PUB_E_OK)
        if select_option == dl.SELECT_RTN:
            # Ranking-Top-N crawl mode
            dl.LT_PRINT(dl.BY_CB('mode: [Ranking Top N]'))
            rtn_instance = rtn(dl.RANK_DIR, dl.LOG_PATH, dl.HTML_PATH,
                               api_instance, mode_interactive_server,
                               rtn_page_opt, rtn_rank_opt, rtn_sex_opt)
            rtn_instance.start()
        elif select_option == dl.SELECT_IRA:
            # Illustrator-Repository-All crawl mode
            dl.LT_PRINT(dl.BY_CB('mode: [Illustrator Repository All]'))
            if mode_interactive_server == dl.MODE_SERVER:
                # one crawl run per comma-separated illustrator id
                for ira_illust_id in ira_illust_id_list:
                    ira_instance = ira(dl.g_dl_work_dir, dl.LOG_NAME,
                                       dl.HTML_NAME, api_instance,
                                       mode_interactive_server, ira_illust_id)
                    ira_instance.start()
            else:
                # interactive mode: the instance prompts for the id itself
                ira_instance = ira(dl.g_dl_work_dir, dl.LOG_NAME,
                                   dl.HTML_NAME, api_instance,
                                   mode_interactive_server, '')
                ira_instance.start()
        elif select_option == dl.SELECT_HELP:
            print(dl.HL_CR(WkvCwApi.__doc__))
        elif select_option == dl.SELECT_EXIT:
            dl.LT_PRINT(dl.BY_CB("user exit program"))
            dl.crawler_logo()  # exit print logo
            exit(dl.PUB_E_OK)
        else:
            dl.nolog_raise_arguerr()
        # server mode runs exactly one pass, then terminates
        if mode_interactive_server == dl.MODE_SERVER:
            exit(dl.PUB_E_OK)