Example 1
    def _get_chrome_cookie(cache_path, url):
        '''Get chrome cookies with selenium

        @@API that allows external calls
        Due to the reCAPTCHA mechanism on the website, the token cannot be
        obtained the normal way, so we are forced to use the selenium webdriver.
        :param cache_path:  local cookie cache text path
        :param url:         selenium webdriver request url
        :return:            cookie jar
        '''
        cookie_jar = RequestsCookieJar()
        # first check whether the local cookie cache file exists
        # if it does, just read it and return the packaged jar
        if os.path.exists(cache_path):
            dl.LT_PRINT('check local cookie file')
            with open(cache_path, "r") as fp:
                cookies = json.load(fp)
            # package to jar type
            for cookie in cookies:
                cookie_jar.set(cookie['name'], cookie['value'])

            return cookie_jar

        dl.LT_PRINT('start selenium webdriver')
        dl.LT_PRINT('target page: %s' % url)

        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')  # hide window
        chrome_options.add_argument(
            '--disable-extensions')  # disable chrome extensions
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--incognito')  # incognito (private browsing) mode
        chrome_options.add_argument(
            '--blink-settings=imagesEnabled=false')  # do not load image
        ## chrome_options.add_argument('--start-maximized')
        chrome_options.add_argument('user-data-dir=' +
                                    os.path.abspath(dl.chrome_user_data_dir))
        driver = webdriver.Chrome(options=chrome_options)  # 'chrome_options=' kwarg is deprecated in newer selenium

        # request website and get cookie
        driver.get(url)
        cookies = driver.get_cookies()
        driver.quit()  # quit() rather than close() to fully stop the webdriver process
        dl.LT_PRINT('stop selenium webdriver')

        # save cookies to file
        with open(cache_path, "w") as fp:
            json.dump(cookies, fp, sort_keys=True, indent=4)
        # package to jar type
        for cookie in cookies:
            cookie_jar.set(cookie['name'], cookie['value'])

        return cookie_jar
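
A usage sketch for the function above: since it returns a RequestsCookieJar, the jar plugs directly into a requests session. The cache path and URL are placeholders, and the method is called as a plain function here for illustration.

    # usage sketch: attach the cached jar to a requests session
    import requests

    # '.cookie_cache.json' and the URL are placeholder values
    jar = _get_chrome_cookie('.cookie_cache.json', 'https://www.pixiv.net')
    session = requests.Session()
    session.cookies = jar                         # RequestsCookieJar drops in directly
    resp = session.get('https://www.pixiv.net')   # request rides on the cached cookies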
Example 2
    def wca_camouflage_login(self):
        """Camouflage browser to login

        If login fails, the program exits here
        @@API that allows external calls
        :return:        status code
        """
        if WkvCwApi._login_once_flag:
            return dl.PUB_E_OK
        else:
            WkvCwApi._login_once_flag = True

        if self._gatherpostkey() != dl.PUB_E_OK:
            exit(dl.PUB_E_RESPONSE_FAIL)

        cookie_jar = self._get_chrome_cookie(dl.local_cache_cookie_path,
                                             dl.HTTPS_HOST_URL)
        self.cookieHandler = urllib.request.HTTPCookieProcessor(cookie_jar)
        self.opener = urllib.request.build_opener(self.cookieHandler)
        urllib.request.install_opener(self.opener)

        response = self.wca_url_request_handler(
            target_url=dl.LOGIN_REQUEST_API_URL,
            post_data=self.postway_data,
            timeout=30,
            target_page_word='login',
            logpath=None)
        if response == dl.PUB_E_RESPONSE_FAIL:
            dl.LT_PRINT(
                dl.BR_CB('login response returned a boolean FALSE, exit'))
            exit(dl.PUB_E_RESPONSE_FAIL)

        web_src = response.read().decode("UTF-8", "ignore")
        dl.LT_PRINT(
            dl.BY_CB('response source: %s' %
                     web_src.encode("UTF-8").decode("unicode_escape")))

        login_info_pattern = re.compile(dl.LOGIN_INFO_REGEX, re.S)
        response_info = re.findall(login_info_pattern, web_src)
        if response_info:
            # the error field being 'false' means no error occurred
            if response_info[0] != 'false':
                dl.LT_PRINT(dl.BR_CB('login confirm raised an error, exit'))
                exit(dl.PUB_E_RESPONSE_FAIL)
            else:
                dl.LT_PRINT('login check response ok')
        else:
            dl.LT_PRINT('login confirm response has no error status field')
            exit(dl.PUB_E_RESPONSE_FAIL)
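
The cookie wiring above follows the standard urllib pattern; a minimal standalone sketch of it (RequestsCookieJar subclasses http.cookiejar.CookieJar, which is why HTTPCookieProcessor accepts it):

    # minimal sketch of a cookie-aware global urllib opener
    import urllib.request
    from http.cookiejar import CookieJar

    jar = CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
    urllib.request.install_opener(opener)
    # every later urllib.request.urlopen(...) call now sends/stores cookies via jar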
Example 3
    def rtn_gather_essential_info(page_opt, whole_nbr):
        """Get input image count

        If the user inputs a number larger than the total, the target count is clamped to the total
        Only interactive mode calls this function
        :param page_opt:    ranking page type: ordinary, r18 or r18g
        :param whole_nbr:   total crawlable count of the ranking
        :return:            crawl images count
        """
        img_cnt = 0

        if page_opt == dl.PAGE_ORDINARY:
            label = 'ordinary'
        elif page_opt == dl.PAGE_R18:
            label = 'r18'
        elif page_opt == dl.PAGE_R18G:
            label = 'r18g'
        else:
            dl.nolog_raise_arguerr()
            return dl.PUB_E_PARAM_FAIL

        img_str = dl.LT_INPUT(dl.HL_CY('crawl %s: %d valid targets, enter the count you want: ' % (label, whole_nbr)))

        while not img_str.isdigit():
            img_str = dl.LT_INPUT(dl.HL_CY('input error, enter again(max is %d): ' % whole_nbr))
        img_cnt = int(img_str)
        if img_cnt <= 0:
            dl.LT_PRINT(dl.BR_CB('what the f**k is wrong with you?'))
            return dl.PUB_E_PARAM_FAIL

        if img_cnt > whole_nbr:
            img_cnt = whole_nbr

        return img_cnt
Example 4
    def rtn_gather_rankingdata(self):
        """Crawl dailyRank list

        :return:        status code
        """
        response = self.wkv_cw_api.wca_url_request_handler(target_url=self.rtn_req_url,
                                                        post_data=self.wkv_cw_api.getway_data, 
                                                        timeout=30, 
                                                        target_page_word='rankpage',
                                                        logpath=self.logpath)

        # size info in webpage source
        web_src = response.read().decode("UTF-8", "ignore")
        imgitem_pattern = re.compile(dl.RANKING_SECTION_REGEX, re.S)
        info_pattern    = re.compile(dl.RANKING_INFO_REGEX, re.S)
        sizer_result    = self.wkv_cw_api.wca_commit_spansizer(imgitem_pattern, info_pattern, web_src)
        if sizer_result == dl.PUB_E_FAIL:
            return dl.PUB_E_FAIL
        url_lst         = sizer_result['url lst']
        img_info_lst    = sizer_result['info lst']

        # cut need image count to be target list
        valid_url_cnt = len(url_lst)
        if self.ir_mode == dl.MODE_INTERACTIVE:
            img_nbr = self.rtn_gather_essential_info(self.page_opt, valid_url_cnt)
            if img_nbr == dl.PUB_E_PARAM_FAIL:
                return dl.PUB_E_FAIL
        elif self.ir_mode == dl.MODE_SERVER:
            img_nbr = valid_url_cnt             # server mode directly takes all alive targets
            dl.LT_PRINT(dl.BY_CB('server mode auto crawl all of alive targets'))
        else:
            dl.nolog_raise_arguerr()            # guard: img_nbr must be defined past this point
            return dl.PUB_E_FAIL
        self.rtn_target_urls = url_lst[:img_nbr]

        log_content = dl.BY_CB('crawl ranking top ' + str(img_nbr) + ', target table:')
        self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
        image_info_table = PrettyTable(
            ["ImageNumber", "ImageID", "ImageTitle", "ImageID+PageNumber", "AuthorID", "AuthorName"])
        for k, i in enumerate(img_info_lst[:img_nbr]):
            self.rtn_basepages.append(dl.BASEPAGE_URL(i[3]))        # url request header use
            image_info_table.add_row([(k + 1), i[3], i[1], dl.FROM_URL_GET_IMG_NAME(self.rtn_target_urls[k]), i[4], i[2]])

        # emoji in titles may make the prettytable dump fail
        try:
            self.wkv_cw_api.wca_logprowork(self.logpath, str(image_info_table), False)
        except Exception as e:
            dl.LT_PRINT(dl.BR_CB('error: %s, dump prettytable interrupt' % str(e)))

        return dl.PUB_E_OK
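
For reference, the table dump above uses the prettytable package in its standard form; a minimal sketch with placeholder row values:

    # minimal prettytable sketch; the row values are placeholders
    from prettytable import PrettyTable

    table = PrettyTable(["ImageNumber", "ImageID", "ImageTitle"])
    table.add_row([1, "70337017", "example title"])
    print(table)            # str(table) is what wca_logprowork writes to the log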
Example 5
    def wca_commit_spansizer(whole_pattern, info_pattern, web_src):
        """A sizer for all of images in once commit item

        @@API that allows external calls
        After the Pixiv 20181002 update, this method only supports rtn mode
        :param whole_pattern:   whole info data regex compile pattern
        :param info_pattern:    image info regex compile pattern
        :param web_src:         webpage source
        :return:                original target url list & image info list dict
        """
        img_info_lst = []
        tgt_url_lst = []

        datasrc_pattern = re.compile(dl.DATASRC_REGEX, re.S)
        span_pattern = re.compile(dl.SPAN_REGEX, re.S)
        img_whole_info = re.findall(whole_pattern, web_src)

        # images come in 3 formats: jpg / png / gif
        # this crawler gives gif up and crawls the png or jpg version
        # one pixiv repository (commit) may hold multiple images
        for item in img_whole_info:
            tmp_thumbnail = re.findall(datasrc_pattern, item)
            if not tmp_thumbnail:
                dl.LT_PRINT(dl.BR_CB('span sizer regex cannot get valid info'))
                return dl.PUB_E_FAIL

            thumbnail = tmp_thumbnail[0]
            judge_word = thumbnail[-18:]
            # check jpg/png or gif
            if judge_word == dl.JUDGE_NOGIF_WORD:
                span_word = re.findall(span_pattern, item)
                valid_word = thumbnail[44:-18]

                # try to check multi-span (multi-page) images
                if span_word:
                    for _px in range(int(span_word[0])):
                        info = re.findall(info_pattern, item)[0]
                        img_info_lst.append(info)
                        # more pages point, range 0~span-1
                        target_url = dl.ORIGINAL_IMAGE_HEAD + valid_word + dl.ORIGINAL_IMAGE_TAIL(_px)
                        tgt_url_lst.append(target_url)
                # just only one picture in a commit
                else:
                    info = re.findall(info_pattern, item)[0]
                    img_info_lst.append(info)
                    # only _p0 page
                    target_url = dl.ORIGINAL_IMAGE_HEAD + valid_word + dl.ORIGINAL_IMAGE_TAIL(0)
                    tgt_url_lst.append(target_url)
            # give up gif format (or the thumbnail list was empty)
            else:
                pass

        return {'url lst': tgt_url_lst, 'info lst': img_info_lst}
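
For orientation, the slice offsets above ([-18:] and [44:-18]) fit Pixiv master-thumbnail URLs of the shape shown below. The concrete URL is a hypothetical illustration only; the real judge word and offsets come from the dl constants.

    # hypothetical thumbnail URL, illustrating the slices used by the sizer
    thumb = 'https://i.pximg.net/c/240x480/img-master/img/2018/10/02/00/00/00/12345678_p0_master1200.jpg'
    print(thumb[-18:])    # '_p0_master1200.jpg' -> the jpg/png (no-gif) judge word
    print(thumb[44:-18])  # '/2018/10/02/00/00/00/12345678' -> path spliced between
                          # dl.ORIGINAL_IMAGE_HEAD and dl.ORIGINAL_IMAGE_TAIL(_px)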
Example 6
    def wca_save_test_html(self, filename, workdir, content):
        """Save request web source page in a html file, test use

        @@API that allows external calls
        :param filename:    save html file name
        :param workdir:     work directory
        :param content:     save content(web source code)
        :return:            none
        """
        html_path = os.path.join(workdir, filename + '.html')
        with open(html_path, 'w', encoding='utf-8') as htmlfile:
            htmlfile.write(content)
        dl.LT_PRINT(dl.BY_CB('save test request html page ok'))
Example 7
    def wca_logprowork(self, logpath, log_content, withtime=True):
        """Universal work log save

        @@API that allows external calls
        Notice: if the print calls here raise UnicodeEncodeError, the web page
        title almost certainly contains emoji that prettytable packed into the info
        :param logpath:     log save path
        :param log_content: log save content
        :param withtime:    default parameter, print and save with real time or not
        :return:            none
        """
        # if log path is None, just print the message and return; no log action
        if logpath is None:
            dl.LT_PRINT(log_content)
            return

        # append to the file ('a+'); content may contain non-ASCII text
        # (e.g. Japanese), so open the log with utf-8 encoding
        with open(logpath, 'a+', encoding='utf-8') as log_fd:
            if withtime:
                dl.LT_PRINT(log_content)

                log_content = self.wca_remove_color_chars(
                    log_content)  # remove log color chars
                # remove timestamp log color chars
                timestamp = dl.realtime_logword(dl.base_time)
                timestamp = self.wca_remove_color_chars(timestamp)
                timestamp = timestamp[:-1] + ' '  # timestamp keeps a space at its tail

                log_fd.write(timestamp + log_content + '\n')
            else:
                print(log_content)
                log_content = self.wca_remove_color_chars(log_content)
                log_fd.write(log_content + '\n')
Example 8
    def _gatherpostkey(self):
        """POST way login need post-key

        Pixiv website POST login address: (see dl.LOGIN_POSTKEY_URL)
        This operation will get cookie and post-key
        :return:            status code
        """

        self._login_preload(dl.LOGIN_AES_INI_PATH)

        response = self.wca_url_request_handler(
            target_url=dl.LOGIN_POSTKEY_URL,
            post_data=None,  # cannot set data when get post key
            timeout=30,
            target_page_word='post-key',
            logpath=None)

        web_src = response.read().decode("UTF-8", "ignore")
        # debug recaptcha v3 token use
        ## self.wca_save_test_html('post-key', 'E:\\OperationCache', web_src)

        post_pattern = re.compile(dl.POSTKEY_REGEX, re.S)
        postkey = re.findall(post_pattern, web_src)
        if not postkey:
            dl.LT_PRINT('regex parse post key failed')
            return dl.PUB_E_REGEX_FAIL

        # build the POST data as an ordered dictionary
        post_orderdict = OrderedDict()
        post_orderdict['captcha'] = ""
        post_orderdict['g_recaptcha_response'] = ""
        post_orderdict['password'] = self.passwd
        post_orderdict['pixiv_id'] = self.username
        post_orderdict['post_key'] = postkey[0]
        post_orderdict['source'] = "accounts"
        post_orderdict['ref'] = ""
        post_orderdict['return_to'] = dl.HTTPS_HOST_URL
        post_orderdict['recaptcha_v3_token'] = ""  # google recaptcha v3 token
        self.postway_data = urllib.parse.urlencode(post_orderdict).encode(
            "UTF-8")

        return dl.PUB_E_OK
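
A minimal demonstration of the OrderedDict-to-bytes packaging used above; the field values are placeholders:

    # urlencode preserves the OrderedDict insertion order
    import urllib.parse
    from collections import OrderedDict

    form = OrderedDict([('pixiv_id', 'user@example.com'), ('post_key', 'abc123')])
    print(urllib.parse.urlencode(form).encode('UTF-8'))
    # b'pixiv_id=user%40example.com&post_key=abc123'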
Example 9
    def wca_mkworkdir(self, logpath, folder):
        """Create a crawler work directory

        @@API that allows external calls
        :param logpath:     log save path
        :param folder:      folder create path
        :return:            none
        """
        # create a folder to save picture
        dl.LT_PRINT('crawler work directory setting: ' + folder)
        is_folder_existed = os.path.exists(folder)
        if not is_folder_existed:
            os.makedirs(folder)
            log_content = 'create a new work folder'
        else:
            log_content = 'target folder already exists'

        # log file first line here
        if os.path.exists(logpath):
            os.remove(logpath)
        self.wca_logprowork(logpath, log_content)
Example 10
    def _login_preload(self, aes_file_path):
        """Get user input login info and storage into aes file

        If the project directory has no such file, you must hand-input the login info,
        then the program creates a new file storing the AES-encrypted info
        This method uses pycrypto, an external dependency
        :param aes_file_path:       .aes_crypto_login.ini file path
        :return:                    none
        """
        if os.path.exists(aes_file_path):
            # read all lines to get the stored username and password
            read_aes_file = open(aes_file_path, 'rb+')
            readline_cache = read_aes_file.readlines()  # all line list
            read_aes_file.close()

            # get aes file storage info and strip the trailing '\n'
            aes_info = {
                'iv_param': readline_cache[0][:-1],
                'user_mail': readline_cache[1][:-1],
                'passwd': readline_cache[2][:-1]
            }

            # decrypt the stored values back to strings
            username_aes_decrypt_cipher = AES.new(dl.AES_SECRET_KEY,
                                                  AES.MODE_CFB,
                                                  aes_info['iv_param'])
            username = str(
                username_aes_decrypt_cipher.decrypt(
                    aes_info['user_mail'][AES.block_size:]), 'UTF-8')
            password_aes_decrypt_cipher = AES.new(dl.AES_SECRET_KEY,
                                                  AES.MODE_CFB,
                                                  aes_info['iv_param'])
            passwd = str(
                password_aes_decrypt_cipher.decrypt(
                    aes_info['passwd'][AES.block_size:]), 'UTF-8')

            if self.ir_mode == dl.MODE_INTERACTIVE:
                check = dl.LT_INPUT(
                    dl.HL_CY("get user account info ok, check: \n"
                             "[*username] %s\n[*password] %s\n"
                             "Is that correct? (Y/N): " % (username, passwd)))

                # if the user marks the info as wrong, delete the old AES file and record new info
                if check == 'N' or check == 'n':
                    os.remove(aes_file_path)  # delete old AES file
                    # temporarily enter login information
                    dl.LT_PRINT(
                        dl.BY_CB(
                            "Well, you need hand-input your login data: "))
                    username = dl.LT_INPUT(
                        dl.HL_CY(
                            'enter your pixiv id(mailbox), must be an R18: '))
                    passwd = getpass.getpass(
                        dl.realtime_logword(dl.base_time) +
                        dl.HL_CY('enter your account password: '))
        else:
            # no AES file yet: hand-input the login info
            # (the step that encrypts and writes the new AES file is not
            # shown in this excerpt)
            username = dl.LT_INPUT(
                dl.HL_CY('enter your pixiv id(mailbox), must be an R18: '))
            passwd = getpass.getpass(
                dl.realtime_logword(dl.base_time) +
                dl.HL_CY('enter your account password: '))

        getway_register = [('user', username), ('pass', passwd)]
        getway_data = urllib.parse.urlencode(getway_register).encode(
            encoding='UTF8')

        self.username = username
        self.passwd = passwd
        self.getway_data = getway_data
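
The step that writes the AES file is not shown in this excerpt. Below is a minimal sketch of what the write side could look like, assuming the layout the reader implies (IV on the first line, then IV-prefixed AES-CFB blobs for username and password); save_login_info and its parameters are hypothetical names, and the key corresponds to dl.AES_SECRET_KEY.

    # hypothetical write-side counterpart (pycryptodome / pycrypto API)
    from Crypto.Cipher import AES
    from Crypto.Random import get_random_bytes

    def save_login_info(aes_file_path, secret_key, username, passwd):
        iv = get_random_bytes(AES.block_size)   # 16-byte CFB initialization vector
        user_blob = iv + AES.new(secret_key, AES.MODE_CFB, iv).encrypt(username.encode('UTF-8'))
        pass_blob = iv + AES.new(secret_key, AES.MODE_CFB, iv).encrypt(passwd.encode('UTF-8'))
        with open(aes_file_path, 'wb') as fp:
            # caveat: a raw IV or blob may itself contain b'\n', which would
            # break the reader's line-based parsing; the original format
            # appears to accept that risk
            fp.write(iv + b'\n' + user_blob + b'\n' + pass_blob + b'\n')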
Example 11
    def rtn_target_confirm(self):
        """Input option and confirm target

        :return:        status code
        """
        req_url     = None      # request target ranking url
        rank_word   = None      # ranking word
        dwm_opt     = None      # daily/weekly/monthly

        if self.ir_mode == dl.MODE_INTERACTIVE:
            page_opt    = dl.LT_INPUT(dl.HL_CY('select ranking type, ordinary(1) | r18(2) | r18g(3): '))
            sex_opt     = dl.LT_INPUT(dl.HL_CY('select sex favor, normal(0) | male(1) | female(2): '))
        elif self.ir_mode == dl.MODE_SERVER:
            page_opt    = self.rtn_r18_arg
            sex_opt     = self.rtn_sex_opt
        else:
            dl.nolog_raise_arguerr()
            return dl.PUB_E_PARAM_FAIL

        if page_opt == dl.PAGE_ORDINARY:
            if self.ir_mode == dl.MODE_INTERACTIVE:
                dwm_opt = dl.LT_INPUT(dl.HL_CY('select daily(1) | weekly(2) | monthly(3) ordinary ranking type: '))
            elif self.ir_mode == dl.MODE_SERVER:
                dwm_opt = self.rtn_rank_type
            else:
                dl.nolog_raise_arguerr()
                return dl.PUB_E_PARAM_FAIL

            if dwm_opt == dl.RANK_DAILY:
                if sex_opt == dl.SEX_NORMAL:
                    req_url     = dl.RANK_DAILY_URL
                    rank_word   = dl.DAILY_WORD
                elif sex_opt == dl.SEX_MALE:
                    req_url     = dl.RANK_DAILY_MALE_URL
                    rank_word   = dl.MALE_WORD
                elif sex_opt == dl.SEX_FEMALE:
                    req_url     = dl.RANK_DAILY_FEMALE_URL
                    rank_word   = dl.FEMALE_WORD
                else:
                    dl.nolog_raise_arguerr()
                    return dl.PUB_E_PARAM_FAIL
            elif dwm_opt == dl.RANK_WEEKLY:
                req_url     = dl.RANK_WEEKLY_URL
                rank_word   = dl.WEEKLY_WORD
            elif dwm_opt == dl.RANK_MONTHLY:
                req_url     = dl.RANK_MONTHLY_URL
                rank_word   = dl.MONTHLY_WORD
            else:
                dl.nolog_raise_arguerr()
                return dl.PUB_E_PARAM_FAIL

        elif page_opt == dl.PAGE_R18:
            if self.ir_mode == dl.MODE_INTERACTIVE:
                dwm_opt = dl.LT_INPUT(dl.HL_CY('select daily(1)/weekly(2) R18 ranking type: '))
            elif self.ir_mode == dl.MODE_SERVER:
                dwm_opt = self.rtn_rank_type
            else:
                dl.nolog_raise_arguerr()
                return dl.PUB_E_PARAM_FAIL
            if dwm_opt == dl.RANK_DAILY:
                if sex_opt == dl.SEX_NORMAL:
                    req_url     = dl.RANK_DAILY_R18_URL
                    rank_word   = dl.DAILY_WORD
                elif sex_opt == dl.SEX_MALE:
                    req_url     = dl.RANK_DAILY_MALE_R18_URL
                    rank_word   = dl.MALE_WORD
                elif sex_opt == dl.SEX_FEMALE:
                    req_url     = dl.RANK_DAILY_FEMALE_R18_URL
                    rank_word   = dl.FEMALE_WORD
                else:
                    dl.nolog_raise_arguerr()
                    return dl.PUB_E_PARAM_FAIL
            elif dwm_opt == dl.RANK_WEEKLY:
                req_url     = dl.RANK_WEEKLY_R18_URL
                rank_word   = dl.WEEKLY_WORD
            else:
                dl.nolog_raise_arguerr()
                return dl.PUB_E_PARAM_FAIL

        elif page_opt == dl.PAGE_R18G:
            req_url     = dl.RANK_R18G_URL
            rank_word   = dl.R18G_WORD
            dl.LT_PRINT(dl.BR_CB('warning: you choose the r18g rank, hope you know what it means'))

        else:
            dl.nolog_raise_arguerr()
            return dl.PUB_E_PARAM_FAIL

        log_content = dl.BY_CB('based on the selected options, set rank target url: [%s]' % req_url)
        self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
        self.rtn_req_url    = req_url
        self.page_opt       = page_opt

        return dl.PUB_E_OK
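
Design note: the branch tree above encodes a fixed mapping from (page, period, sex) options to a URL and a rank word. A table-driven sketch of the same idea follows; it reuses the dl constants from this module and elides most combinations, so it is an illustration rather than a drop-in replacement.

    # sketch: dispatch table covering part of the branch tree; assumes the
    # same dl constants as the surrounding code, remaining keys elided
    RANK_TABLE = {
        (dl.PAGE_ORDINARY, dl.RANK_DAILY, dl.SEX_NORMAL): (dl.RANK_DAILY_URL,             dl.DAILY_WORD),
        (dl.PAGE_ORDINARY, dl.RANK_DAILY, dl.SEX_MALE):   (dl.RANK_DAILY_MALE_URL,        dl.MALE_WORD),
        (dl.PAGE_R18,      dl.RANK_DAILY, dl.SEX_FEMALE): (dl.RANK_DAILY_FEMALE_R18_URL,  dl.FEMALE_WORD),
        # ... remaining valid combinations elided
    }
    hit = RANK_TABLE.get((page_opt, dwm_opt, sex_opt))   # None marks an invalid combination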
Example 12
    def ira_crawl_allpage_target(self):
        """Package all gather urls

        :return:            status code
        """
        require_page_cnt = 0

        if self.ira_max_cnt <= dl.ONE_PAGE_COMMIT:
            require_page_cnt = 1
        else:
            require_page_cnt = int(self.ira_max_cnt / dl.ONE_PAGE_COMMIT)
            # remainder decision
            if self.ira_max_cnt % dl.ONE_PAGE_COMMIT != 0:
                require_page_cnt += 1

        # build the json data url
        iid_string_tail     = ''
        subpage_url_list    = []
        for ix in range(require_page_cnt):
            # one subpage only includes 6*8 valid images, the rest are invalid
            tmp_tail_nbr = dl.ONE_PAGE_COMMIT * (ix + 1)
            tmp_tail_nbr = self.ira_max_cnt if tmp_tail_nbr > self.ira_max_cnt else tmp_tail_nbr

            for index in self.ira_pure_idlist[(dl.ONE_PAGE_COMMIT * ix):tmp_tail_nbr]:
                iid_string_tail += dl.IDS_UNIT(index)
            subpage_url_list.append(dl.ALLREPOINFO_URL(self.user_input_id, iid_string_tail, 1 if ix == 0 else 0))
            iid_string_tail = ''                            # clear last cache

        # get all data from response xhr page into a temp list
        tmp_receive_list    = []
        tmp_ret             = []
        for i in range(require_page_cnt):
            tmp_ret = self.ira_crawl_subpage_data(i + 1, subpage_url_list[i])
            if not isinstance(tmp_ret, list):
                return dl.PUB_E_FAIL
            tmp_receive_list += tmp_ret

        repo_target_all_list = []
        for i in range(len(tmp_receive_list)):
            tmp_receive_list[i][1] = dl.UNICODE_ESCAPE(tmp_receive_list[i][1])
            tmp_receive_list[i][1] = dl.EMOJI_REPLACE(tmp_receive_list[i][1])
            # build original url without image format
            tmp = tmp_receive_list[i][2]
            tmp = tmp.replace('\\', '')
            tmp_receive_list[i][2] = dl.ORIGINAL_IMAGE_HEAD + tmp[-39:-7] + '.png'  # first original url
            repo_target_all_list.append(tmp_receive_list[i])

            # add other original image url by pageCount
            tmp_page_count_str = tmp_receive_list[i][3]
            if tmp_page_count_str.isdigit():
                index_page_count = int(tmp_page_count_str)
                if index_page_count != 1:
                    for px in range(1, index_page_count):  # _p0 was already appended above
                        insert_item = [tmp_receive_list[i][0], 
                                        tmp_receive_list[i][1], 
                                        tmp_receive_list[i][2][:-5] + str(px) + '.png', 
                                        tmp_receive_list[i][3]]
                        repo_target_all_list.append(insert_item)
            else:
                log_content = dl.BR_CB('page count process error')
                self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
                return dl.PUB_E_FAIL
        del tmp_receive_list

        alive_target_cnt    = len(repo_target_all_list)
        require_img_nbr     = 0

        if self.ir_mode == dl.MODE_INTERACTIVE:
            require_img_str = dl.LT_INPUT(dl.HL_CY('crawl whole repo %d, alive target(s): %d, enter the count you want: '
                % (self.ira_max_cnt, alive_target_cnt)))
            # if user input isn't number
            while not require_img_str.isdigit():
                dl.LT_PRINT(dl.BR_CB('input error, your input content was not a decimal number'))
                require_img_str = dl.LT_INPUT(dl.HL_CY('enter again(max is %d): ' % alive_target_cnt))
            require_img_nbr = int(require_img_str)
            if require_img_nbr <= 0:
                dl.LT_PRINT(dl.BR_CB('what the f**k is wrong with you?'))
                return dl.PUB_E_PARAM_FAIL
            require_img_nbr = alive_target_cnt if require_img_nbr > alive_target_cnt else require_img_nbr

        elif self.ir_mode == dl.MODE_SERVER:
            require_img_nbr = alive_target_cnt
            dl.LT_PRINT(dl.BY_CB('server mode auto crawl all of alive targets'))
        else:
            pass

        for k, i in enumerate(repo_target_all_list[:require_img_nbr]):
            self.ira_target_capture.append(i[2])
            self.ira_basepages.append(dl.BASEPAGE_URL(i[0]))

        log_content = 'illustrator [%s] id [%s], require image(s): %d, target table:' \
            % (self.ira_author_name, self.user_input_id, require_img_nbr)
        self.wkv_cw_api.wca_logprowork(self.logpath, log_content)

        image_info_table = PrettyTable(["ImageNumber", "ImageID", "ImageTitle", "ImagePageName"])
        for k, i in enumerate(repo_target_all_list[:require_img_nbr]):
            image_info_table.add_row([(k + 1), i[0], i[1], dl.FROM_URL_GET_IMG_NAME(i[2])])

        # emoji in titles may make the prettytable dump fail
        try:
            self.wkv_cw_api.wca_logprowork(self.logpath, str(image_info_table), False)
        except Exception as e:
            dl.LT_PRINT(dl.BR_CB('error: %s, dump prettytable interrupt' % str(e)))
        del repo_target_all_list

        return dl.PUB_E_OK
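
The page-count computation at the top of this function is plain ceiling division; a self-contained equivalent for reference:

    # ceiling division: pages needed to cover `total` items, minimum 1,
    # matching the remainder logic above (dl.ONE_PAGE_COMMIT is 48 = 6*8)
    import math

    def page_count(total, per_page=48):
        return max(1, math.ceil(total / per_page))

    assert page_count(48) == 1
    assert page_count(49) == 2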
Example 13
    def ira_gather_preloadinfo(self):
        """Crawler need to know how many images do you want

        This function will get author name base on author id
        :return:            status code
        """
        # request all of one illustrator's artworks
        response = self.wkv_cw_api.wca_url_request_handler(target_url=dl.AJAX_ALL_URL(self.user_input_id),
                                                        post_data=self.wkv_cw_api.getway_data, 
                                                        timeout=30, 
                                                        target_page_word='ajaxpage',
                                                        logpath=self.logpath)

        # get artworks id list
        web_src = response.read().decode("UTF-8", "ignore")
        ajax_idlist_pattern = re.compile(dl.AJAX_ALL_IDLIST_REGEX, re.S)
        ajax_idlist         = re.findall(ajax_idlist_pattern, web_src)
        if not ajax_idlist:
            log_content = dl.BR_CB('regex get ajax id list fail')
            self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
            return dl.PUB_E_REGEX_FAIL

        number_pattern = re.compile(dl.NUMBER_REGEX, re.S)
        for index in ajax_idlist:
            if index.isdigit():
                self.ira_pure_idlist.append(index)
            else:
                # the id list may include garbage, use the number regex to extract a pure id
                one_pure_id = re.findall(number_pattern, index)
                if one_pure_id:
                    self.ira_pure_idlist.append(one_pure_id[0])
                else:
                    pass

        # the website server requires the artwork ids sorted in descending order
        pure_idlist_nbr = []
        for index in self.ira_pure_idlist:
            pure_idlist_nbr.append(int(index))
        self.wkv_cw_api.wca_quick_sort(pure_idlist_nbr, 0, len(pure_idlist_nbr) - 1)

        self.ira_pure_idlist.clear()
        for index in reversed(pure_idlist_nbr):
            self.ira_pure_idlist.append(str(index))
        del pure_idlist_nbr
        self.ira_max_cnt = len(self.ira_pure_idlist)

        # get author name from member-main-page
        illust_mainpage_url = dl.USERS_ARTWORKS_URL(self.user_input_id)
        log_content = dl.HL_CY('crawl illustrator url: [%s]' % illust_mainpage_url)
        self.wkv_cw_api.wca_logprowork(self.logpath, log_content)

        response = self.wkv_cw_api.wca_url_request_handler(target_url=illust_mainpage_url,
                                                        post_data=self.wkv_cw_api.getway_data, 
                                                        timeout=30, 
                                                        target_page_word='mainpage',
                                                        logpath=self.logpath)

        # match illustrator name
        web_src = response.read().decode("UTF-8", "ignore")
        illust_name_pattern = re.compile(dl.ILLUST_NAME_REGEX(self.user_input_id), re.S)
        author_info         = re.findall(illust_name_pattern, web_src)
        if not author_info:
            # cannot catch illust name in mainpage if login failed
            dl.LT_PRINT(dl.BR_CB("Regex parsing result error, no author info"))
            return dl.PUB_E_REGEX_FAIL

        self.ira_author_name = author_info[0]
        log_content = dl.HL_CY('check illustrator: [%s]' % self.ira_author_name)
        self.wkv_cw_api.wca_logprowork(self.logpath, log_content)

        return dl.PUB_E_OK
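
The quicksort-plus-reversed() pass above simply produces a descending numeric order; a minimal equivalent using builtins:

    # descending sort of numeric id strings, matching the quicksort+reversed flow
    pure_idlist = ['99', '3', '101']
    pure_idlist = [str(n) for n in sorted(map(int, pure_idlist), reverse=True)]
    assert pure_idlist == ['101', '99', '3']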
Example 14
def main():
    """main logic

    Get user input arguments and launch mode function
    :return:    none
    """
    select_option = dl.SELECT_RTN
    rtn_page_opt = dl.PAGE_ORDINARY
    rtn_rank_opt = dl.RANK_DAILY
    rtn_sex_opt = dl.SEX_NORMAL
    ira_illust_id_list = []

    print(dl.HL_CR(WkvCwApi.__doc__))
    mode_interactive_server = dl.MODE_INTERACTIVE if len(
        sys.argv) == 1 else dl.MODE_SERVER
    api_instance = WkvCwApi(mode_interactive_server)
    api_instance.wca_camouflage_login()

    while True:
        if mode_interactive_server == dl.MODE_INTERACTIVE:
            select_option = dl.LT_INPUT(
                dl.HL_CY('login completed, select mode: '))
        else:
            opts, args = getopt.getopt(
                sys.argv[1:], "hm:r:l:s:i:",
                ["help", "mode=", "R18=", "list=", "sex=", "id="])  # value-taking long options need a trailing '='
            for opt, value in opts:
                if opt in ("-m", "--mode"):
                    select_option = value
                elif opt in ("-r", "--R18"):
                    rtn_page_opt = value
                elif opt in ("-l", "--list"):
                    rtn_rank_opt = value
                elif opt in ("-s", "--sex"):
                    rtn_sex_opt = value
                elif opt in ("-i", "--id"):
                    # server mode supports multiple ids separated by ','
                    ira_illust_id_list = value.split(',')
                elif opt in ("-h", "--help"):
                    print(dl.HL_CR(WkvCwApi.__doc__))
                    exit(dl.PUB_E_OK)

        if select_option == dl.SELECT_RTN:
            dl.LT_PRINT(dl.BY_CB('mode: [Ranking Top N]'))
            rtn_instance = rtn(dl.RANK_DIR, dl.LOG_PATH, dl.HTML_PATH,
                               api_instance, mode_interactive_server,
                               rtn_page_opt, rtn_rank_opt, rtn_sex_opt)
            rtn_instance.start()

        elif select_option == dl.SELECT_IRA:
            dl.LT_PRINT(dl.BY_CB('mode: [Illustrator Repository All]'))
            if mode_interactive_server == dl.MODE_SERVER:
                for ira_illust_id in ira_illust_id_list:
                    ira_instance = ira(dl.g_dl_work_dir, dl.LOG_NAME,
                                       dl.HTML_NAME, api_instance,
                                       mode_interactive_server, ira_illust_id)
                    ira_instance.start()
            else:
                ira_instance = ira(dl.g_dl_work_dir, dl.LOG_NAME, dl.HTML_NAME,
                                   api_instance, mode_interactive_server, '')
                ira_instance.start()

        elif select_option == dl.SELECT_HELP:
            print(dl.HL_CR(WkvCwApi.__doc__))

        elif select_option == dl.SELECT_EXIT:
            dl.LT_PRINT(dl.BY_CB("user exit program"))
            dl.crawler_logo()  # exit print logo
            exit(dl.PUB_E_OK)

        else:
            dl.nolog_raise_arguerr()

        if mode_interactive_server == dl.MODE_SERVER:
            exit(dl.PUB_E_OK)
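
For reference, a self-contained sketch of the server-mode option parsing above, run against a hypothetical argument vector; the option values that dl.SELECT_RTN and the other constants expect are project-specific and not shown in this excerpt.

    # getopt sketch; the argv values are hypothetical
    import getopt

    argv = ["-m", "1", "-r", "2", "-i", "123,456"]
    opts, args = getopt.getopt(argv, "hm:r:l:s:i:",
                               ["help", "mode=", "R18=", "list=", "sex=", "id="])
    for opt, value in opts:
        if opt in ("-i", "--id"):
            print(value.split(','))   # ['123', '456']: multiple ids split on ','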