Esempio n. 1
0
        def _wrapper(self, logpath, *args, **kwargs):
            """Timer wrapper

            Runs the wrapped function (origin_func), measures how long it took,
            then logs the accumulated data-stream size and the average
            download speed and resets the shared data-stream counter.
            Mainly for the function download_alltarget() to achieve timing expansion
            :param logpath:     log save path
            :param *args:       positional arguments forwarded to the wrapped function
            :param **kwargs:    keyword arguments forwarded to the wrapped function
            :return:            none
            """
            log_content = "launch timer decorator, start download threads timer"
            self.wca_logprowork(logpath, log_content)
            # monotonic clock: an interval measured with it can never be
            # negative, unlike time.time() which follows wall-clock adjustments
            starttime = time.monotonic()

            origin_func(self, logpath, *args,
                        **kwargs)  # packaged original function

            elapsed_time = time.monotonic() - starttime
            datastream_size = WkvCwApi._datastream_pool
            # a call that returns immediately (e.g. nothing to download) can
            # yield a zero interval on coarse clocks; avoid ZeroDivisionError
            if elapsed_time > 0:
                average_download_speed = float(datastream_size) / elapsed_time
            else:
                average_download_speed = 0.0
            # NOTE(review): the message labels the raw counter as kB and
            # counter/1024 as MB -- presumably the pool is kept in kB, confirm
            log_content = dl.BY_CB(
                "all of threads reclaim, total download data-stream size: %0.2fMB, "
                "average download speed: %0.2fkB/s" %
                (float(datastream_size) / 1024, average_download_speed))
            self.wca_logprowork(logpath, log_content)
            WkvCwApi._datastream_pool = 0  # clear global data stream counter
Esempio n. 2
0
    def rtn_gather_rankingdata(self):
        """Crawl dailyRank list

        Requests the ranking page, extracts image urls and image info from
        its source, cuts both lists down to the required image count, stores
        the target urls and base pages on the instance and logs a table of
        the selected targets.
        :return:        status code, dl.PUB_E_OK or dl.PUB_E_FAIL
        """
        response = self.wkv_cw_api.wca_url_request_handler(
            target_url=self.rtn_req_url,
            post_data=self.wkv_cw_api.getway_data,
            timeout=30,
            target_page_word='rankpage',
            logpath=self.logpath)
        # the request handler reports failure with a status code instead of
        # a response object, which has no read()
        if response == dl.PUB_E_RESPONSE_FAIL:
            log_content = dl.BR_CB('request ranking page failed')
            self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
            return dl.PUB_E_FAIL

        # size info in webpage source
        web_src = response.read().decode("UTF-8", "ignore")
        imgitem_pattern = re.compile(dl.RANKING_SECTION_REGEX, re.S)
        info_pattern = re.compile(dl.RANKING_INFO_REGEX, re.S)
        sizer_result = self.wkv_cw_api.wca_commit_spansizer(
            imgitem_pattern, info_pattern, web_src)
        if sizer_result == dl.PUB_E_FAIL:
            return dl.PUB_E_FAIL
        url_lst = sizer_result['url lst']
        img_info_lst = sizer_result['info lst']

        # cut need image count to be target list
        valid_url_cnt = len(url_lst)
        if self.ir_mode == dl.MODE_INTERACTIVE:
            img_nbr = self.rtn_gather_essential_info(self.page_opt, valid_url_cnt)
            if img_nbr == dl.PUB_E_PARAM_FAIL:
                return dl.PUB_E_FAIL
        elif self.ir_mode == dl.MODE_SERVER:
            img_nbr = valid_url_cnt             # server mode directly get all of alive targets
            dl.LT_PRINT(dl.BY_CB('server mode auto crawl all of alive targets'))
        else:
            # unknown mode: img_nbr would be unbound below
            dl.nolog_raise_arguerr()
            return dl.PUB_E_FAIL
        self.rtn_target_urls = url_lst[:img_nbr]

        log_content = dl.BY_CB('crawl ranking top ' + str(img_nbr) + ', target table:')
        self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
        image_info_table = PrettyTable(
            ["ImageNumber", "ImageID", "ImageTitle", "ImageID+PageNumber", "AuthorID", "AuthorName"])
        # pair every url with its info record; zip stops at the shorter list
        # so a length mismatch cannot index past the end of the url list
        paired = zip(self.rtn_target_urls, img_info_lst[:img_nbr])
        for number, (target_url, info) in enumerate(paired, start=1):
            self.rtn_basepages.append(dl.BASEPAGE_URL(info[3]))     # url request header use
            image_info_table.add_row(
                [number, info[3], info[1], dl.FROM_URL_GET_IMG_NAME(target_url), info[4], info[2]])

        # damn emoji, maybe dump failed
        try:
            self.wkv_cw_api.wca_logprowork(self.logpath, str(image_info_table), False)
        except Exception as e:
            dl.LT_PRINT(dl.BR_CB('error: %s, dump prettytable interrupt' % str(e)))

        return dl.PUB_E_OK
Esempio n. 3
0
    def wca_save_test_html(self, filename, workdir, content):
        """Save request web source page in a html file, test use

        Writes content as UTF-8 text to <workdir>/<filename>.html, replacing
        any existing file, then prints a confirmation message.
        @@API that allows external calls
        :param filename:    save html file name (without the .html suffix)
        :param workdir:     work directory
        :param content:     save content(web source code)
        :return:            none
        """
        html_path = workdir + '/' + filename + '.html'
        # context manager closes the handle even when write() raises
        with open(html_path, "w", encoding='utf-8') as htmlfile:
            htmlfile.write(content)
        dl.LT_PRINT(dl.BY_CB('save test request html page ok'))
Esempio n. 4
0
    def wca_camouflage_login(self):
        """Camouflage browser to login

        Runs at most once per process (guarded by a class-level flag).
        Gathers the post key, installs a global urllib opener that carries
        the cookies read from the local Chrome cookie cache, sends the login
        request and checks the error field extracted from the response.
        If login failed, program will exit here
        @@API that allows external calls
        :return:        status code, dl.PUB_E_OK when logged in
        """
        if WkvCwApi._login_once_flag:
            return dl.PUB_E_OK
        WkvCwApi._login_once_flag = True

        if self._gatherpostkey() != dl.PUB_E_OK:
            exit(dl.PUB_E_RESPONSE_FAIL)

        cookie_jar = self._get_chrome_cookie(dl.local_cache_cookie_path,
                                             dl.HTTPS_HOST_URL)
        self.cookieHandler = urllib.request.HTTPCookieProcessor(cookie_jar)
        self.opener = urllib.request.build_opener(self.cookieHandler)
        # install globally so later urllib requests reuse the cookies
        urllib.request.install_opener(self.opener)

        response = self.wca_url_request_handler(
            target_url=dl.LOGIN_REQUEST_API_URL,
            post_data=self.postway_data,
            timeout=30,
            target_page_word='login',
            logpath=None)
        if response == dl.PUB_E_RESPONSE_FAIL:
            dl.LT_PRINT(
                dl.BR_CB('login response return a boolean FALSE, exit'))
            exit(dl.PUB_E_RESPONSE_FAIL)

        web_src = response.read().decode("UTF-8", "ignore")
        dl.LT_PRINT(
            dl.BY_CB('response source: %s' %
                     web_src.encode("UTF-8").decode("unicode_escape")))

        login_info_pattern = re.compile(dl.LOGIN_INFO_REGEX, re.S)
        response_info = re.findall(login_info_pattern, web_src)
        if not response_info:
            dl.LT_PRINT('login confirm response no error status')
            exit(dl.PUB_E_RESPONSE_FAIL)
        if response_info[0] != 'false':
            # error false means no error
            dl.LT_PRINT(dl.BR_CB('login confirm raise a error, exit'))
            exit(dl.PUB_E_RESPONSE_FAIL)

        dl.LT_PRINT('login check response right')
        # the success path used to fall off the end and return None although
        # the early-exit path above already returns a status code
        return dl.PUB_E_OK
Esempio n. 5
0
    def wca_download_alltarget(self, logpath, urls, basepages, workdir):
        """Multi-thread download all image

        Creates one self._MultiThreading object per url, throttling creation
        while the shared thread queue is longer than dl.SYSTEM_MAX_THREADS,
        then polls the interpreter's active thread count and prints progress
        until only one thread is left. A KeyboardInterrupt raised anywhere
        in that process is caught and reported.
        @@API that allows external calls
        :param urls:        all original images urls
        :param basepages:   all referer basic pages (the whole list is handed to every sub-thread)
        :param workdir:     work directory
        :param logpath:     log save path
        :return:            dl.PUB_E_FAIL when a sub-thread cannot be created, otherwise none
        """
        thread_block_flag = False  # thread blocking flag
        # alive_thread_cnt starts as the number of targets and is only later
        # replaced by the real active thread count inside the wait loop
        alive_thread_cnt = queueLength = len(urls)
        log_content = dl.BY_CB('hit %d target(s), start download task(s)' %
                               queueLength)
        self.wca_logprowork(logpath, log_content)

        # capture timeout and the user interrupt fault and exit the failed thread
        try:
            for i, one_url in enumerate(urls):
                # the queue length is read under the class-level lock; the
                # lock is released again on both branches before continuing
                self._MultiThreading.lock_t.acquire()
                if len(self._MultiThreading.queue_t) > dl.SYSTEM_MAX_THREADS:
                    # NOTE(review): the flag is never reset inside this loop,
                    # so every later progress message reports "blocked"
                    thread_block_flag = True
                    self._MultiThreading.lock_t.release()
                    # if the number of created threads reach max limit
                    # program will stop here, wait all of threads have been created over
                    # when one thread executed over, create next one
                    # NOTE(review): the event is not cleared in this method;
                    # presumably the worker sets/clears it -- confirm
                    self._MultiThreading.event_t.wait()
                else:
                    self._MultiThreading.lock_t.release()

                # build overwrite threading.Thread object
                sub_thread = self._MultiThreading(i, one_url, basepages,
                                                  workdir, logpath)
                # set every download sub-thread daemon property
                # set false, then if you exit one thread, others threads will not end
                # set true, quit one is quit all
                # NOTE(review): Thread.setDaemon() is deprecated since
                # Python 3.10 in favour of the daemon attribute
                sub_thread.setDaemon(True)
                # if create this sub-thread failed from function
                if sub_thread.create() == dl.PUB_E_FAIL:
                    log_content = dl.BR_CB('create a new sub-thread failed')
                    print(log_content)
                    return dl.PUB_E_FAIL

                if thread_block_flag == False:
                    log_content = dl.BY_CB(
                        'created {:d} download target object(s)')
                else:
                    log_content = dl.BY_CB(
                        'created {:d} download target object(s), thread creation is blocked, please wait'
                    )
                dl.LT_FLUSH(log_content, i + 1)
            print(dl.BY_CB(', all threads have been loaded OK'))
            thread_block_flag = False

            # parent thread wait all sub-thread end
            # the count of all threads is 1 parent thread and n sub-thread(s)
            # when all pictures have been downloaded over, thread count is 1
            # NOTE(review): the loop is entered only when len(urls) > 1, so
            # with a single target this method returns without waiting for
            # its sub-thread; the loop also polls without sleeping
            while alive_thread_cnt > 1:
                # instance attribute update
                self.alivethread_counter = threading.active_count()
                # when alive thread count change, print its value
                if alive_thread_cnt != self.alivethread_counter:
                    alive_thread_cnt = self.alivethread_counter  # update alive thread count
                    # display alive sub-thread count
                    # its number wouldn't more than thread max count
                    log_content = dl.BY_CB(
                        'currently remaining sub-thread(s):({:4d}/{:4d}), completed:({:4.1%})|({:5.2f}MB)'
                    )
                    # arguments: remaining sub-threads (active count minus
                    # the parent), total targets, completed ratio, and the
                    # shared data-stream counter divided by 1024
                    dl.LT_FLUSH(log_content, alive_thread_cnt - 1, queueLength,
                                ((queueLength -
                                  (alive_thread_cnt - 1)) / queueLength),
                                (float(WkvCwApi._datastream_pool / 1024)))
            print(dl.BY_CB(', sub-threads execute finished'))
        except KeyboardInterrupt:
            print(dl.BY_CB(', user interrupt a thread, exit all threads'))
Esempio n. 6
0
    def _login_preload(self, aes_file_path):
        """Get user input login info and storage into aes file

        If project directory has no file, you need hand-input login info,
        then program will create new file to storage AES encrypt info to it
        This method use pycrypto, need import external call
        :param aes_file_path:       .aes_crypto_login.ini file path
        :return:                    none
        """
        if os.path.exists(aes_file_path):
            # stable read rows get username and password
            read_aes_file = open(aes_file_path, 'rb+')
            readline_cache = read_aes_file.readlines()  # all line list
            read_aes_file.close()

            # get aes file storage info and split tail '\n'
            aes_info = {
                'iv_param': readline_cache[0][:-1],
                'user_mail': readline_cache[1][:-1],
                'passwd': readline_cache[2][:-1]
            }

            # analysis hash value to string
            username_aes_decrypt_cipher = AES.new(dl.AES_SECRET_KEY,
                                                  AES.MODE_CFB,
                                                  aes_info['iv_param'])
            username = str(
                username_aes_decrypt_cipher.decrypt(
                    aes_info['user_mail'][AES.block_size:]), 'UTF-8')
            password_aes_decrypt_cipher = AES.new(dl.AES_SECRET_KEY,
                                                  AES.MODE_CFB,
                                                  aes_info['iv_param'])
            passwd = str(
                password_aes_decrypt_cipher.decrypt(
                    aes_info['passwd'][AES.block_size:]), 'UTF-8')

            if self.ir_mode == dl.MODE_INTERACTIVE:
                check = dl.LT_INPUT(
                    dl.HL_CY("get user account info ok, check: \n"
                             "[*username] %s\n[*password] %s\n"
                             "Is that correct? (Y/N): " % (username, passwd)))

                # if user judge info are error, delete old AES file and record new info
                if check == 'N' or check == 'n':
                    os.remove(aes_file_path)  # delete old AES file
                    # temporarily enter login information
                    dl.LT_PRINT(
                        dl.BY_CB(
                            "Well, you need hand-input your login data: "))
                    username = dl.LT_INPUT(
                        dl.HL_CY(
                            'enter your pixiv id(mailbox), must be a R18: '))
                    passwd = getpass.getpass(
                        dl.realtime_logword(dl.base_time) +
                        dl.HL_CY('enter your account password: '******'enter your pixiv id(mailbox), must be a R18: '))
            passwd = getpass.getpass(
                dl.realtime_logword(dl.base_time) +
                dl.HL_CY('enter your account password: '******'user', username), ('pass', passwd)]
        getway_data = urllib.parse.urlencode(getway_register).encode(
            encoding='UTF8')

        self.username = username
        self.passwd = passwd
        self.getway_data = getway_data
Esempio n. 7
0
    def rtn_target_confirm(self):
        """Input option and confirm target

        Resolves the ranking request url from three options: page type
        (ordinary / r18 / r18g), rank period (daily / weekly / monthly,
        monthly for ordinary only) and sex favor (daily ranking only).
        Interactive mode asks the user for them, server mode takes them from
        the instance attributes. On success the url is logged and stored in
        self.rtn_req_url and the page type in self.page_opt.
        :return:        status code, dl.PUB_E_OK or dl.PUB_E_PARAM_FAIL
        """
        interactive = self.ir_mode == dl.MODE_INTERACTIVE
        if interactive:
            page_opt = dl.LT_INPUT(dl.HL_CY('select ranking type, ordinary(1) | r18(2) | r18g(3): '))
            sex_opt = dl.LT_INPUT(dl.HL_CY('select sex favor, normal(0) | male(1) | female(2): '))
        elif self.ir_mode == dl.MODE_SERVER:
            page_opt = self.rtn_r18_arg
            sex_opt = self.rtn_sex_opt
        else:
            dl.nolog_raise_arguerr()
            return dl.PUB_E_PARAM_FAIL

        req_url = None          # request target ranking url
        rank_tables = None      # (prompt, daily urls by sex, urls by other period)
        if page_opt == dl.PAGE_ORDINARY:
            rank_tables = (
                'select daily(1) | weekly(2) | monthly(3) ordinary ranking type: ',
                {
                    dl.SEX_NORMAL: dl.RANK_DAILY_URL,
                    dl.SEX_MALE: dl.RANK_DAILY_MALE_URL,
                    dl.SEX_FEMALE: dl.RANK_DAILY_FEMALE_URL,
                },
                {
                    dl.RANK_WEEKLY: dl.RANK_WEEKLY_URL,
                    dl.RANK_MONTHLY: dl.RANK_MONTHLY_URL,
                },
            )
        elif page_opt == dl.PAGE_R18:
            rank_tables = (
                'select daily(1)/weekly(2) R18 ranking type: ',
                {
                    dl.SEX_NORMAL: dl.RANK_DAILY_R18_URL,
                    dl.SEX_MALE: dl.RANK_DAILY_MALE_R18_URL,
                    dl.SEX_FEMALE: dl.RANK_DAILY_FEMALE_R18_URL,
                },
                {
                    dl.RANK_WEEKLY: dl.RANK_WEEKLY_R18_URL,
                },
            )
        elif page_opt == dl.PAGE_R18G:
            # r18g has a single ranking page, no period or sex selection
            req_url = dl.RANK_R18G_URL
            dl.LT_PRINT(dl.BR_CB('warning: you choose the r18g rank, hope you know what it means'))
        else:
            dl.nolog_raise_arguerr()
            return dl.PUB_E_PARAM_FAIL

        if rank_tables is not None:
            rank_prompt, daily_urls, period_urls = rank_tables
            # the mode was validated above, so only two cases remain here
            if interactive:
                dwm_opt = dl.LT_INPUT(dl.HL_CY(rank_prompt))
            else:
                dwm_opt = self.rtn_rank_type

            # sex favor only applies to the daily ranking
            if dwm_opt == dl.RANK_DAILY:
                req_url = daily_urls.get(sex_opt)
            else:
                req_url = period_urls.get(dwm_opt)
            if req_url is None:
                # unknown period or unknown sex favor
                dl.nolog_raise_arguerr()
                return dl.PUB_E_PARAM_FAIL

        log_content = dl.BY_CB('base select option, set rank target url: [%s]' % req_url)
        self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
        self.rtn_req_url = req_url
        self.page_opt = page_opt

        return dl.PUB_E_OK
Esempio n. 8
0
    def ira_crawl_allpage_target(self):
        """Package all gather urls

        Splits the illustrator's id list into sub-pages, fetches the data of
        every sub-page, rewrites each record's title and url, expands
        multi-page works into one entry per page, lets the user (interactive
        mode) or the program (server mode) decide how many targets to keep,
        then stores the target urls and base pages on the instance and logs
        a table of them.
        :return:            status code
        """
        per_page = dl.ONE_PAGE_COMMIT
        total_cnt = self.ira_max_cnt

        # number of sub-pages, rounded up, never less than one
        if total_cnt <= per_page:
            page_total = 1
        else:
            page_total, leftover = divmod(total_cnt, per_page)
            if leftover != 0:
                page_total += 1

        # build the json data url of every sub-page
        subpage_urls = []
        for page_ix in range(page_total):
            # one subpage only include 6*8 valid image, others are invalid
            head = per_page * page_ix
            tail = min(per_page * (page_ix + 1), total_cnt)
            id_tail = ''.join(
                dl.IDS_UNIT(iid) for iid in self.ira_pure_idlist[head:tail])
            # only the first sub-page is flagged with 1
            first_flag = 1 if page_ix == 0 else 0
            subpage_urls.append(
                dl.ALLREPOINFO_URL(self.user_input_id, id_tail, first_flag))

        # collect the records of all sub-pages, give up on the first bad one
        received = []
        for page_nbr, page_url in enumerate(subpage_urls, start=1):
            page_items = self.ira_crawl_subpage_data(page_nbr, page_url)
            if not isinstance(page_items, list):
                return dl.PUB_E_FAIL
            received += page_items

        # record layout as used below: [0] image id, [1] title,
        # [2] url, [3] page count string
        all_targets = []
        for item in received:
            item[1] = dl.UNICODE_ESCAPE(item[1])
            item[1] = dl.EMOJI_REPLACE(item[1])
            # build original url without image format
            stripped_url = item[2].replace('\\', '')
            item[2] = dl.ORIGINAL_IMAGE_HEAD + stripped_url[-39:-7] + '.png'  # first original url
            all_targets.append(item)

            # add other original image url by pageCount
            page_count_str = item[3]
            if not page_count_str.isdigit():
                log_content = dl.BR_CB('page count process error')
                self.wkv_cw_api.wca_logprowork(self.logpath, log_content)
                return dl.PUB_E_FAIL
            page_count = int(page_count_str)
            if page_count != 1:
                # NOTE(review): px starts at 0, so when the first url already
                # ends with page index 0 that page is listed twice -- confirm
                # whether the duplicate is intended
                url_stem = item[2][:-5]
                for px in range(page_count):
                    all_targets.append(
                        [item[0], item[1], url_stem + str(px) + '.png', item[3]])

        alive_target_cnt = len(all_targets)
        wanted_cnt = 0      # stays 0 when the mode is neither interactive nor server

        if self.ir_mode == dl.MODE_INTERACTIVE:
            answer = dl.LT_INPUT(dl.HL_CY('crawl all repo %d, whole target(s): %d, enter you want count: '
                % (total_cnt, alive_target_cnt)))
            # keep asking until the answer is a decimal number
            while not answer.isdigit():
                dl.LT_PRINT(dl.BR_CB('input error, your input content was not a decimal number'))
                answer = dl.LT_INPUT(dl.HL_CY('enter again(max is %d): ' % alive_target_cnt))
            wanted_cnt = int(answer)
            if wanted_cnt <= 0:
                dl.LT_PRINT(dl.BR_CB('what the f**k is wrong with you?'))
                return dl.PUB_E_PARAM_FAIL
            wanted_cnt = min(wanted_cnt, alive_target_cnt)
        elif self.ir_mode == dl.MODE_SERVER:
            wanted_cnt = alive_target_cnt
            dl.LT_PRINT(dl.BY_CB('server mode auto crawl all of alive targets'))

        selected = all_targets[:wanted_cnt]
        for target in selected:
            self.ira_target_capture.append(target[2])
            self.ira_basepages.append(dl.BASEPAGE_URL(target[0]))

        log_content = 'illustrator [%s] id [%s], require image(s): %d, target table:' \
            % (self.ira_author_name, self.user_input_id, wanted_cnt)
        self.wkv_cw_api.wca_logprowork(self.logpath, log_content)

        image_info_table = PrettyTable(["ImageNumber", "ImageID", "ImageTitle", "ImagePageName"])
        for number, target in enumerate(selected, start=1):
            image_info_table.add_row(
                [number, target[0], target[1], dl.FROM_URL_GET_IMG_NAME(target[2])])

        # damn emoji, maybe dump failed
        try:
            self.wkv_cw_api.wca_logprowork(self.logpath, str(image_info_table), False)
        except Exception as e:
            dl.LT_PRINT(dl.BR_CB('error: %s, dump prettytable interrupt' % str(e)))

        return dl.PUB_E_OK
Esempio n. 9
0
def main():
    """main logic

    Get user input arguments and launch mode function.
    Without command line arguments the program runs interactively and keeps
    asking for a mode until the user selects exit; with arguments it runs
    one pass in server mode and exits.
    :return:    none
    """
    select_option = dl.SELECT_RTN
    rtn_page_opt = dl.PAGE_ORDINARY
    rtn_rank_opt = dl.RANK_DAILY
    rtn_sex_opt = dl.SEX_NORMAL
    ira_illust_id_list = []

    print(dl.HL_CR(WkvCwApi.__doc__))
    mode_interactive_server = (dl.MODE_INTERACTIVE
                               if len(sys.argv) == 1 else dl.MODE_SERVER)
    api_instance = WkvCwApi(mode_interactive_server)
    api_instance.wca_camouflage_login()

    while True:
        if mode_interactive_server == dl.MODE_INTERACTIVE:
            select_option = dl.LT_INPUT(
                dl.HL_CY('login completed, select mode: '))
        else:
            try:
                # long options that carry a value need a trailing '=', the
                # same way their short forms carry a trailing ':'
                opts, _ = getopt.getopt(
                    sys.argv[1:], "hm:r:l:s:i:",
                    ["help", "mode=", "R18=", "list=", "sex=", "id="])
            except getopt.GetoptError as err:
                # unknown option or missing value: report it with the usage
                # text instead of dying with a traceback
                dl.LT_PRINT(dl.BR_CB('argument error: %s' % str(err)))
                print(dl.HL_CR(WkvCwApi.__doc__))
                sys.exit(dl.PUB_E_PARAM_FAIL)
            for opt, value in opts:
                if opt in ("-m", "--mode"):
                    select_option = value
                elif opt in ("-r", "--R18"):
                    rtn_page_opt = value
                elif opt in ("-l", "--list"):
                    rtn_rank_opt = value
                elif opt in ("-s", "--sex"):
                    rtn_sex_opt = value
                elif opt in ("-i", "--id"):
                    # server mode support multi-input id and split with ','
                    ira_illust_id_list = value.split(',')
                elif opt in ("-h", "--help"):
                    print(dl.HL_CR(WkvCwApi.__doc__))
                    sys.exit(dl.PUB_E_OK)

        if select_option == dl.SELECT_RTN:
            dl.LT_PRINT(dl.BY_CB('mode: [Ranking Top N]'))
            rtn_instance = rtn(dl.RANK_DIR, dl.LOG_PATH, dl.HTML_PATH,
                               api_instance, mode_interactive_server,
                               rtn_page_opt, rtn_rank_opt, rtn_sex_opt)
            rtn_instance.start()

        elif select_option == dl.SELECT_IRA:
            dl.LT_PRINT(dl.BY_CB('mode: [Illustrator Repository All]'))
            if mode_interactive_server == dl.MODE_SERVER:
                # one crawl per id given on the command line
                for ira_illust_id in ira_illust_id_list:
                    ira_instance = ira(dl.g_dl_work_dir, dl.LOG_NAME,
                                       dl.HTML_NAME, api_instance,
                                       mode_interactive_server, ira_illust_id)
                    ira_instance.start()
            else:
                ira_instance = ira(dl.g_dl_work_dir, dl.LOG_NAME, dl.HTML_NAME,
                                   api_instance, mode_interactive_server, '')
                ira_instance.start()

        elif select_option == dl.SELECT_HELP:
            print(dl.HL_CR(WkvCwApi.__doc__))

        elif select_option == dl.SELECT_EXIT:
            dl.LT_PRINT(dl.BY_CB("user exit program"))
            dl.crawler_logo()  # exit print logo
            sys.exit(dl.PUB_E_OK)

        else:
            dl.nolog_raise_arguerr()

        # server mode is a single pass, interactive mode loops until exit
        if mode_interactive_server == dl.MODE_SERVER:
            sys.exit(dl.PUB_E_OK)