def logprowork(log_path, log_content, withtime='y'):
        """Universal work log save

        @@API that allows external calls
        Notice: If here print series fucntion raise UnicodeEncodeError, it must web page 
        include emoji symbol encode title when use prettytable to package title info
        :param log_path:    log save path
        :param log_content: log save content
        :param withtime:    default parameter, print and save with real time or not
        :return:            none
        """
        # add context to the file use option 'a+'
        # write content may have some not utf8 code, example Japanese
        log_file_ptr = open(log_path, 'a+', encoding='utf-8')

        # prepend the timestamp word or not
        if withtime == 'y':
            dataload.logtime_print(log_content)
            # write the timestamped line to the log file
            log_file_ptr.write(
                dataload.realtime_logword(dataload.base_time) + log_content +
                '\n')
        else:
            print(log_content)
            log_file_ptr.write(log_content + '\n')
        log_file_ptr.close()
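
A minimal usage sketch (the log path and messages are illustrative, not from the project): with the default withtime='y' the line gets a timestamp prefix; any other value writes the content verbatim.

# Hypothetical calls to logprowork; './crawl_work.log' is an illustrative path
logprowork('./crawl_work.log', 'crawler started')         # timestamped line
logprowork('./crawl_work.log', '+--- table ---+', 'N')    # verbatim, no timestamp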
Example 2
    def gather_rankingdata(self, option):
        """Crawl dailyRank list

        :param option:      user choose option
        :return:            none
        """
        response = self.pvmx.url_request_handler(
            target_url=option[0],
            post_data=self.pvmx.login_bias[2],
            timeout=30,
            target_page_word='Rankpage',
            need_log=True,
            log_path=self.logpath)
        # extract image info from the web page source with the span sizer
        web_src = response.read().decode("UTF-8", "ignore")
        imgitem_pattern = re.compile(dataload.RANKING_SECTION_REGEX, re.S)
        info_pattern = re.compile(dataload.RANKING_INFO_REGEX, re.S)
        sizer_result = self.pvmx.commit_spansizer(imgitem_pattern,
                                                  info_pattern, web_src)
        # whole data cache pool
        whole_urls, img_infos = sizer_result[0], sizer_result[1]

        # trim the target list to the requested image count
        alive_targets = len(whole_urls)
        if self.ir_mode == 1:
            img_nbr = self.gather_essential_info(option[1], alive_targets)
        # server mode directly takes all alive targets
        elif self.ir_mode == 2:
            img_nbr = alive_targets
            dataload.logtime_print(
                'Server mode auto crawl all of alive targets')
        self.target_urls = whole_urls[:img_nbr]
        log_context = 'Gather ranking top ' + str(img_nbr) + ', target table:'
        self.pvmx.logprowork(self.logpath, log_context)
        # use prettytable to package the info list
        image_info_table = PrettyTable([
            "ImageNumber", "ImageID", "ImageTitle", "ImageID+PageNumber",
            "AuthorID", "AuthorName"
        ])
        for k, i in enumerate(img_infos[:img_nbr]):
            # the base page is used later as the Referer header
            self.basepages.append(dataload.BASEPAGE_URL + i[3])
            image_info_table.add_row([(k + 1), i[3], i[1],
                                      self.target_urls[k][57:-4], i[4], i[2]])
        # save table without time header word
        self.pvmx.logprowork(self.logpath, str(image_info_table), 'N')
    def _gatherpostkey(self):
        """POST way login need post-key

        :return:            post way request data
        """

        # call gather login data function
        self.login_bias = self._login_preload(dataload.LOGIN_AES_INI_PATH)
        response = self.url_request_handler(
            target_url=dataload.LOGIN_POSTKEY_URL,
            post_data=None,  # cannot set data when get post key
            timeout=30,
            target_page_word='POST-key',
            need_log=False,
            log_path='')
        # cookie check
        for item in self.cookie:
            log_context = 'Cookie: [name:' + item.name + '-value:' + item.value + ']'
            dataload.logtime_print(log_context)

        # match the post-key in the page source
        web_src = response.read().decode("UTF-8", "ignore")
        post_pattern = re.compile(dataload.POSTKEY_REGEX, re.S)
        postkey = re.findall(post_pattern, web_src)[0]
        log_context = 'Get post-key: ' + postkey
        dataload.logtime_print(log_context)

        # build the POST data in an ordered dictionary
        post_orderdict = OrderedDict()
        post_orderdict['pixiv_id'] = self.login_bias[0]
        post_orderdict['password'] = self.login_bias[1]
        post_orderdict['captcha'] = ""
        post_orderdict['g_recaptcha_response'] = ""
        post_orderdict['post_key'] = postkey
        post_orderdict['source'] = "pc"
        post_orderdict['ref'] = dataload.LOGIN_POSTDATA_REF
        post_orderdict['return_to'] = dataload.HTTPS_HOST_URL
        # urlencode to a bytes body, the same format as the GET data
        postway_data = urllib.parse.urlencode(post_orderdict).encode("UTF-8")

        return postway_data
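
For reference, urllib.parse.urlencode is what turns the ordered dict above into the POST body; a standalone sketch with dummy values (not real credentials):

from collections import OrderedDict
import urllib.parse

demo = OrderedDict([('pixiv_id', 'user@example.com'),
                    ('password', 'secret'),
                    ('post_key', 'abc123')])
body = urllib.parse.urlencode(demo).encode('UTF-8')
# body == b'pixiv_id=user%40example.com&password=secret&post_key=abc123'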
    def mkworkdir(self, log_path, folder):
        """Create a crawler work directory

        @@API that allows external calls
        :param log_path:    log save path
        :param folder:      folder create path
        :return:            folder create path
        """
        # create a folder to save picture
        dataload.logtime_print('Crawler work directory setting: ' + folder)
        is_folder_existed = os.path.exists(folder)
        if not is_folder_existed:
            os.makedirs(folder)
            log_context = 'Create a new work folder'
        else:
            log_context = 'Target folder already exists'
        # remove old log file
        if os.path.exists(log_path):
            os.remove(log_path)
        # this step will create a new log file and write the first line
        self.logprowork(log_path, log_context)
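
On Python 3.2+ the exists/makedirs pair can be collapsed into one call; a sketch of an equivalent directory setup (the folder path is illustrative):

import os

folder = './pixiv_rank_top'          # illustrative work directory
os.makedirs(folder, exist_ok=True)   # create if missing, no error if it exists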
Example 5
    def target_confirm(self):
        """Input option and confirm target

        :return:            request mainpage url, mode
        """

        rank_word, req_url = None, None

        if self.ir_mode == 1:
            log_context = 'Gather ranking list======>'
            self.pvmx.logprowork(self.logpath, log_context)

            ormode = dataload.logtime_input(
                'Select ranking type, ordinary(o|1) or r18(r|2): ')
        elif self.ir_mode == 2:
            ormode = self.rtn_r18_arg

        if ormode == 'o' or ormode == '1':
            if self.ir_mode == 1:
                dwm = dataload.logtime_input(
                    'Select daily(1) | weekly(2) | monthly(3) ordinary ranking type: ')
            elif self.ir_mode == 2:
                dwm = self.rtn_rank_type

            if dwm == '1':
                req_url = dataload.DAILY_RANKING_URL
                rank_word = dataload.DAILY_WORD
            elif dwm == '2':
                req_url = dataload.WEEKLY_RANKING_URL
                rank_word = dataload.WEEKLY_WORD
            elif dwm == '3':
                req_url = dataload.MONTHLY_RANKING_URL
                rank_word = dataload.MONTHLY_WORD
            else:
                dataload.logtime_print("Argument(s) error\n")
            log_context = 'Crawler set target to %s rank top' % rank_word
        elif ormode == 'r' or ormode == '2':
            if self.ir_mode == 1:
                dwm = dataload.logtime_input(
                    'Select daily(1)/weekly(2) R18 ranking type: ')
            elif self.ir_mode == 2:
                dwm = self.rtn_rank_type

            if dwm == '1':
                req_url = dataload.DAILY_RANKING_R18_URL
                rank_word = dataload.DAILY_WORD
            elif dwm == '2':
                req_url = dataload.WEEKLY_RANKING_R18_URL
                rank_word = dataload.WEEKLY_WORD
            else:
                dataload.logtime_print(
                    "Argument(s) error\n")
            log_context = 'Crawler set target to %s r18 rank top' % rank_word
        else:
            dataload.logtime_print("Argument(s) error\n")
            log_context = None
        self.pvmx.logprowork(self.logpath, log_context)

        return req_url, ormode
    def url_request_handler(self, target_url, post_data, timeout,
                            target_page_word, need_log, log_path):
        """Universal URL request format handler

        @@API that allows external calls
        :param target_url:          target request url
        :param post_data:           post way data
        :param timeout:             request timeout, suggest 30s
        :param target_page_word:    target page symbol word
        :param need_log:            whether to log to file; if True, log_path is required
        :param log_path:            log save path
        :return:                    request result response(raw)
        """
        response = None
        try:
            response = self.opener.open(fullurl=target_url,
                                        data=post_data,
                                        timeout=timeout)
        except KeyboardInterrupt:
            log_context = 'User interrupt request, exit program'
            if need_log:
                self.logprowork(log_path, log_context)
            exit()
        except Exception as e:
            log_context = "Error Type: " + str(e)
            if need_log:
                self.logprowork(log_path, log_context)
            else:
                dataload.logtime_print(log_context)

        # if the response failed, the crawler exits with error code -1
        if response is not None:
            if response.getcode() == dataload.HTTP_OK_CODE_200:
                log_context = target_page_word + ' response succeeded'
            else:
                log_context = (
                    target_page_word +
                    ' response not ok, return code %d' % response.getcode())
            if need_log:
                self.logprowork(log_path, log_context)
            else:
                dataload.logtime_print(log_context)
        else:
            log_context = target_page_word + ' response failed'
            if need_log:
                self.logprowork(log_path, log_context)
            else:
                dataload.logtime_print(log_context)
            exit(-1)

        return response
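
A hypothetical call to the handler, mirroring how the ranking page is requested elsewhere in this project (the URL and the `api` instance name are illustrative; post_data=None makes it a plain GET):

response = api.url_request_handler(          # `api`: an assumed PixivAPILib instance
    target_url='https://www.pixiv.net/ranking.php?mode=daily',
    post_data=None,                          # None -> GET request
    timeout=30,
    target_page_word='Rankpage',
    need_log=False,
    log_path='')
web_src = response.read().decode('UTF-8', 'ignore')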
Example 7
    def gather_essential_info(self, ormode, whole_nbr):
        """Get the requested image count from user input

        If the input exceeds the whole number, the target count is clamped
        to the whole number
        Only interactive mode calls this function
        :param ormode:      selected ranking type, ordinary or r18 mode
        :param whole_nbr:   whole ranking crawl count
        :return:            crawl images count
        """
        # transfer ascii string to number
        img_cnt = 0
        # choose ordinary artwork images
        if ormode == 'o' or ormode == '1':
            # input a string for request image number
            img_str = dataload.logtime_input(
                'Gather whole ordinary valid target %d, enter the count you want: '
                % whole_nbr)
        # choose R18 artwork images
        elif ormode == 'r' or ormode == '2':
            # input a string for request image number
            img_str = dataload.logtime_input(
                'Gather whole R18 valid target %d, enter the count you want: '
                % whole_nbr)
        # error input
        else:
            dataload.logtime_print("Argument(s) error\n")
            exit(-1)

        # if user input isn't number
        while not img_str.isdigit():
            dataload.logtime_print(
                'Input error, your input content was not a decimal number')
            img_str = dataload.logtime_input(
                'Enter again(max is %d): ' % whole_nbr)
        # check input content is a number
        # if user input number more than limit max, set it to max
        img_cnt = int(img_str)
        if img_cnt > whole_nbr:
            img_cnt = whole_nbr
        elif img_cnt <= 0:
            dataload.logtime_print('What the f**k is wrong with you?')
            exit(-1)

        return img_cnt
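
The prompt-validate-clamp pattern above generalizes to a small helper; a sketch under my own names (unlike the method above, it clamps low values to 1 instead of exiting):

def ask_count(prompt, max_nbr):
    """Re-prompt until a digit string is entered, then clamp to [1, max_nbr]."""
    raw = input(prompt)
    while not raw.isdigit():
        raw = input('Enter again (max is %d): ' % max_nbr)
    return max(1, min(int(raw), max_nbr))

# ask_count('How many images? ', 50) can never return more than 50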
Example 8
    def crawl_allpage_target(self):
        """Package all gather urls

        :return:            none
        """
        # calculate how many page requests are needed
        # each page at most ONE_AUTHOR_MAINPAGE_IMGCOUNT(20181003:48) images
        require_page_cnt = 0
        if self.max_cnt <= dataload.ONE_PAGE_COMMIT:
            require_page_cnt = 1
        else:
            require_page_cnt = int(self.max_cnt / dataload.ONE_PAGE_COMMIT)
            # remainder decision
            if self.max_cnt % dataload.ONE_PAGE_COMMIT != 0:
                require_page_cnt += 1

        # build the request url for each page
        iid_string_tail = ''
        page_url_array = []
        for ix in range(require_page_cnt):
            # tail number limit
            tmp_tail_nbr = dataload.ONE_PAGE_COMMIT * (ix + 1)
            if tmp_tail_nbr > self.max_cnt:
                tmp_tail_nbr = self.max_cnt
            for index in self.pure_idlist[(dataload.ONE_PAGE_COMMIT * ix):tmp_tail_nbr]:
                iid_string_tail += dataload.IDS_UNIT(index)
            one_page_request_url = dataload.ALLREPOINFO_URL(self.user_input_id, iid_string_tail)
            iid_string_tail = ''                                # clear last cache
            page_url_array.append(one_page_request_url)
        
        # gather all data from response xhr page into a temp list
        tmp_receive_list = []
        for i in range(require_page_cnt):
            tmp_receive_list += self.crawl_onepage_data(i + 1, page_url_array[i])
        # handle url string
        repo_target_all_list = []
        for i in range(len(tmp_receive_list)):
            # transform title '\\uxxx' escapes to unicode
            tmp_receive_list[i][1] = self.pvmx.unicode_escape(tmp_receive_list[i][1])
            # replace emoji string
            tmp_receive_list[i][1] = self.pvmx.replace_emoji(tmp_receive_list[i][1])
            # build original url without image format
            tmp = tmp_receive_list[i][2]
            tmp = tmp.replace('\\', '')                         # delete character '\' 
            tmp_receive_list[i][2] = dataload.ORIGINAL_IMAGE_HEAD + tmp[50:] + '.png'
            repo_target_all_list.append(tmp_receive_list[i])    # move original item to target list
            # use page count number build total url
            tmp_page_count_str = tmp_receive_list[i][3]
            if tmp_page_count_str.isdigit():
                index_page_count = int(tmp_page_count_str)
                if index_page_count != 1:
                    # add others items into list
                    for px in range(index_page_count - 1):
                        insert_item = [tmp_receive_list[i][0], 
                            tmp_receive_list[i][1], 
                            tmp_receive_list[i][2][:-5] + str(px + 1) + '.png', 
                            tmp_receive_list[i][3]]
                        repo_target_all_list.append(insert_item)
            else:
                log_context = 'Page count process error!'
                self.pvmx.logprowork(self.logpath, log_context)
                exit(-1)
        del tmp_receive_list                                    # clear cache

        # collection target count
        alive_targetcnt = len(repo_target_all_list)
        require_img_nbr = 0
        if self.ir_mode == 1:
            require_img_str = dataload.logtime_input(
                'Gather all repo %d, whole target(s): %d, enter the count you want: '
                        % (self.max_cnt, alive_targetcnt))
            # if user input isn't number
            while not require_img_str.isdigit():
                dataload.logtime_print(
                    'Input error, your input content was not a decimal number')
                require_img_str = dataload.logtime_input(
                    'Enter again(max is %d): ' % alive_targetcnt)
            require_img_nbr = int(require_img_str)
            # if user input number more than limit max, set it to max
            if require_img_nbr > alive_targetcnt:
                require_img_nbr = alive_targetcnt
            elif require_img_nbr <= 0:
                dataload.logtime_print('What the f**k is wrong with you?')
                exit(-1)
        # server mode directly takes all alive targets
        elif self.ir_mode == 2:
            require_img_nbr = alive_targetcnt
            dataload.logtime_print('Server mode auto crawl all of alive targets')
        
        # download image number limit
        for k, i in enumerate(repo_target_all_list[:require_img_nbr]):
            self.target_capture.append(i[2])    # put url into target capture list
            self.basepages.append(dataload.BASEPAGE_URL + i[0]) # build basepage url
            
        # display author info
        log_context = ('Illustrator: ' + self.author_name + ' id: '
            + self.user_input_id + ' require image(s): ' 
            + str(require_img_nbr) + ', target table:')
        self.pvmx.logprowork(self.logpath, log_context)
        # use prettytable build a table save and print info list
        image_info_table = PrettyTable(
            ["ImageNumber", "ImageID", "ImageTitle", "ImagePageName"])
        for k, i in enumerate(repo_target_all_list[:require_img_nbr]):
            image_info_table.add_row([(k + 1), i[0], i[1], i[2][57:-4]]) 
        # save with str format and no time word
        self.pvmx.logprowork(self.logpath, str(image_info_table), 'N')
        del repo_target_all_list            # clear cache 
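
The page-count arithmetic at the top of this method is ceiling division; a compact equivalent sketch (assuming max_cnt >= 1 and the 48-per-page limit noted in the comment above):

ONE_PAGE_COMMIT = 48    # images per XHR page, per the 20181003 note above

def required_pages(max_cnt):
    # equivalent to math.ceil(max_cnt / ONE_PAGE_COMMIT) for max_cnt >= 1
    return (max_cnt + ONE_PAGE_COMMIT - 1) // ONE_PAGE_COMMIT

assert required_pages(48) == 1 and required_pages(49) == 2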
Example 9
    def gather_preloadinfo(self):
        """Crawler need to know how many images do you want

        This function will get author name base on author id
        :return:            none
        """
        # request all of one illustrator's artworks
        response = self.pvmx.url_request_handler(
            target_url=dataload.AJAX_ALL_URL(self.user_input_id),
            post_data=self.pvmx.login_bias[2], 
            timeout=30, 
            target_page_word='Ajaxpage',
            need_log=True,
            log_path=self.logpath)
        # match the artwork id list
        web_src = response.read().decode("UTF-8", "ignore")
        ajax_idlist_pattern = re.compile(dataload.AJAX_ALL_IDLIST_REGEX, re.S)
        ajax_idlist = re.findall(ajax_idlist_pattern, web_src)
        # the id list may include garbage entries; use a number regex to get pure results
        number_pattern = re.compile(dataload.NUMBER_REGEX, re.S)
        for index in ajax_idlist:
            one_pure_id = re.findall(number_pattern, index)
            if one_pure_id:
                self.pure_idlist.append(one_pure_id[0])
            else:
                # very rare error, only happening in this address:
                # https://www.pixiv.net/member_illust.php?id=15115322
                log_context = 'Get ajax page valid info failed, exit'
                self.pvmx.logprowork(self.logpath, log_context)
                exit(-1)

        # use quick-sort algorithm to handle id number
        # descending order sort
        pure_idlist_nbr = []
        for index in self.pure_idlist:
            pure_idlist_nbr.append(int(index))      # string to integer number
        self.pvmx.quick_sort(pure_idlist_nbr, 0, len(pure_idlist_nbr) - 1)
        pure_idlist_nbr.reverse()                   # reverse order
        self.pure_idlist.clear()                         # clear origin list
        for index in pure_idlist_nbr:
            self.pure_idlist.append(str(index))
        del pure_idlist_nbr                         # clear number cache
        self.max_cnt = len(self.pure_idlist)
        
        # get author name from member-main-page
        response = self.pvmx.url_request_handler(
            target_url=dataload.MEMBER_ILLUST_URL + self.user_input_id,
            post_data=self.pvmx.login_bias[2], 
            timeout=30, 
            target_page_word='Mainpage',
            need_log=True,
            log_path=self.logpath)
        # mate illustrator name
        web_src = response.read().decode("UTF-8", "ignore")
        illust_name_pattern = re.compile(dataload.ILLUST_NAME_REGEX, re.S)
        author_info = re.findall(illust_name_pattern, web_src)
        # if login failed, the regex parsing result will be an empty list
        if len(author_info) == 0:
            dataload.logtime_print("Regex parsing result error, no author info, exit")
            exit()
        else:
            self.author_name = author_info[0]
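
The descending-ID ordering done above with the project's quick_sort helper can also be expressed with builtins; a sketch on made-up ID strings:

ids = ['70336466', '71002331', '70337017']                 # illustrative ID strings
ids = [str(n) for n in sorted(map(int, ids), reverse=True)]
# ids == ['71002331', '70337017', '70336466']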
    def _login_preload(self, aes_file_path):
        """Get user login info from input and store it in an AES file

        If the project directory has no such file, you must hand-input the
        login info, and the program creates a new file to store the
        AES-encrypted info
        This method uses pycrypto, an external dependency
        :param aes_file_path:       .aes_crypto_login.ini file path
        :return:                    username, password, GET data
        """
        is_aes_file_existed = os.path.exists(aes_file_path)
        if is_aes_file_existed:
            # read rows in fixed order to get username and password
            # read bin file content to a list
            read_aes_file = open(aes_file_path, 'rb+')
            readline_cache = read_aes_file.readlines()  # all line list
            read_aes_file.close()

            read_aes_iv_param_raw = readline_cache[0]  # row 1 is AES IV PARAM
            read_user_mailbox_raw = readline_cache[1]  # row 2 is username
            read_user_passwd_raw = readline_cache[2]  # row 3 is password
            # cut last char (b'\n')
            read_aes_iv_param_raw = read_aes_iv_param_raw[:-1]
            read_user_mailbox_raw = read_user_mailbox_raw[:-1]
            read_user_passwd_raw = read_user_passwd_raw[:-1]

            # decrypt the stored values back to strings
            username_aes_decrypt_cipher = AES.new(dataload.AES_SECRET_KEY,
                                                  AES.MODE_CFB,
                                                  read_aes_iv_param_raw)
            username = str(
                username_aes_decrypt_cipher.decrypt(
                    read_user_mailbox_raw[AES.block_size:]), 'UTF-8')
            password_aes_decrypt_cipher = AES.new(dataload.AES_SECRET_KEY,
                                                  AES.MODE_CFB,
                                                  read_aes_iv_param_raw)
            passwd = str(
                password_aes_decrypt_cipher.decrypt(
                    read_user_passwd_raw[AES.block_size:]), 'UTF-8')

            # check username and password
            check = dataload.logtime_input(
                "Read user login information configuration ok, check this: \n"
                "[-> Username] %s\n[-> Password] %s\n"
                "Is that correct? (Y/N): " % (username, passwd))

            # if the user marks the info as wrong, delete the old AES file and record new info
            if check == 'N' or check == 'n':
                os.remove(aes_file_path)  # delete old AES file
                # temporarily enter login information
                dataload.logtime_print(
                    "Well, you need hand-input your login data: ")
                username = dataload.logtime_input(
                    'Enter your pixiv id(mailbox), must be an R18 account: '
                    ).encode('utf-8')
                passwd = getpass.getpass(
                    dataload.realtime_logword(dataload.base_time) +
                    'Enter your account password: ').encode('utf-8')

                generate_aes_iv_param = Random.new().read(
                    AES.block_size)  # generate random aes iv param
                username_cipher = AES.new(dataload.AES_SECRET_KEY,
                                          AES.MODE_CFB, generate_aes_iv_param)
                username_encrypto = generate_aes_iv_param + username_cipher.encrypt(
                    username)
                passwd_cipher = AES.new(dataload.AES_SECRET_KEY, AES.MODE_CFB,
                                        generate_aes_iv_param)
                passwd_encrypto = generate_aes_iv_param + passwd_cipher.encrypt(
                    passwd)

                # create new aes file rewrite it
                write_aes_file = open(aes_file_path, 'wb')
                # write bin value to file with b'\n' to wrap
                write_aes_file.write(generate_aes_iv_param +
                                     b'\n')  # row 1 is iv param
                write_aes_file.write(username_encrypto +
                                     b'\n')  # row 2 is username
                write_aes_file.write(passwd_encrypto +
                                     b'\n')  # row 3 is password
                write_aes_file.close()
            # stored info is correct, nothing more to do
            else:
                pass

        # if there is no AES file, create one and write the encrypted values into it
        else:
            dataload.logtime_print(
                "Create a new AES-encrypted file to store your username and password: "
            )
            username = dataload.logtime_input(
                'Enter your pixiv id(mailbox), must be an R18 account: '
                ).encode('utf-8')
            passwd = getpass.getpass(
                dataload.realtime_logword(dataload.base_time) +
                'Enter your account password: ').encode('utf-8')

            generate_aes_iv_param = Random.new().read(
                AES.block_size)  # generate random aes iv param
            username_cipher = AES.new(dataload.AES_SECRET_KEY, AES.MODE_CFB,
                                      generate_aes_iv_param)
            username_encrypto = generate_aes_iv_param + username_cipher.encrypt(
                username)
            passwd_cipher = AES.new(dataload.AES_SECRET_KEY, AES.MODE_CFB,
                                    generate_aes_iv_param)
            passwd_encrypto = generate_aes_iv_param + passwd_cipher.encrypt(
                passwd)

            # create new AES file, set write bin bytes mode
            write_aes_file = open(aes_file_path, 'wb')
            # write bin value to file with b'\n' to wrap
            write_aes_file.write(generate_aes_iv_param +
                                 b'\n')  # row 1 is iv param
            write_aes_file.write(username_encrypto +
                                 b'\n')  # row 2 is username
            write_aes_file.write(passwd_encrypto + b'\n')  # row 3 is password
            write_aes_file.close()

        # build data string
        getway_register = [('user', username), ('pass', passwd)]
        getway_data = urllib.parse.urlencode(getway_register).encode(
            encoding='UTF8')

        return username, passwd, getway_data  # return the 3 login elements
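
A minimal round-trip sketch of the AES-CFB scheme used above (pycrypto API, as in the method; the 16-byte key here is a placeholder for dataload.AES_SECRET_KEY):

from Crypto.Cipher import AES
from Crypto import Random

key = b'0123456789abcdef'                  # placeholder 16-byte secret key
iv = Random.new().read(AES.block_size)     # random IV, stored as file row 1
token = iv + AES.new(key, AES.MODE_CFB, iv).encrypt(b'user@example.com')
plain = AES.new(key, AES.MODE_CFB, iv).decrypt(token[AES.block_size:])
assert plain == b'user@example.com'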
Example 11
def main():
    """main() function

    Get user input arguments and launch mode function
    :return:    none
    """
    print(PixivAPILib.__doc__)
    # program work continue ask
    ask_res = dataload.logtime_input('%s launch, continue? (Y/N): ' %
                                     dataload.PROJECT_NAME)
    if ask_res == 'N' or ask_res == 'No' or ask_res == 'n':
        dataload.logtime_print("User exit program\n")
        exit(0)
    # website id and password require
    ask_res = dataload.logtime_input(
        'Crawler will use your Pixiv-ID and password to login to the website, agree? (Y/N): '
    )
    if ask_res == 'N' or ask_res == 'No' or ask_res == 'n':
        dataload.logtime_print("No ID and password crawler cannot work, exit")
        exit(0)

    api_instance = PixivAPILib()  # instantiate the API class
    api_instance.camouflage_login()  # crawler simulated login
    # multiple task cycles
    while True:
        mode = dataload.logtime_input('Login finished, select mode: ')
        # ranking top N mode
        if mode == 'rtn' or mode == '1':
            dataload.logtime_print('Mode: [Ranking Top N]')
            rtn_instance = rtn(dataload.RANK_DIR, dataload.LOG_PATH,
                               dataload.HTML_PATH, api_instance)
            rtn_instance.start()
        # illustrator repositories all mode
        elif mode == 'ira' or mode == '2':
            dataload.logtime_print('Mode: [Illustrator Repository All]')
            ira_instance = ira(dataload.REPO_DIR, dataload.LOG_NAME,
                               dataload.HTML_NAME, api_instance)
            ira_instance.start()
        # help page
        elif mode == 'help' or mode == '3':
            print(PixivAPILib.__doc__)
        # user normal exit program
        elif mode == 'exit' or mode == '4':
            dataload.logtime_print("User exit program")
            dataload.crawler_logo()  # exit print logo
            exit(0)
        # input parameter error, into next circle
        else:
            dataload.logtime_print("Argument(s) error")
Example 12
    def target_confirm(self):
        """Input option and confirm target

        :return:            request mainpage url, mode
        """

        rank_word, req_url = None, None

        if self.ir_mode == 1:
            log_context = 'Gather ranking list======>'
            self.pvmx.logprowork(self.logpath, log_context)

            # select rank R18 or not
            ormode = dataload.logtime_input(
                'Select ranking type, ordinary(o|1) or r18(r|2): ')
            mf_word = dataload.logtime_input(
                'Select sex favor, normal(n|0) or male(m|1) or female(f|2): ')
        elif self.ir_mode == 2:
            ormode = self.rtn_r18_arg
            mf_word = self.rtn_mf_word

        if ormode == 'o' or ormode == '1':
            if self.ir_mode == 1:
                dwm = dataload.logtime_input(
                    'Select daily(1) | weekly(2) | monthly(3) ordinary ranking type: '
                )
            elif self.ir_mode == 2:
                dwm = self.rtn_rank_type

            if dwm == '1':
                if mf_word == '0' or mf_word == 'n':
                    req_url = dataload.DAILY_RANKING_URL
                    rank_word = dataload.DAILY_WORD
                # when male or female is chosen, the rank type can only be daily
                elif mf_word == '1' or mf_word == 'm':
                    req_url = dataload.DAILY_MALE_RANKING_URL
                    rank_word = dataload.MALE_WORD
                elif mf_word == '2' or mf_word == 'f':
                    req_url = dataload.DAILY_FEMALE_RANKING_URL
                    rank_word = dataload.FEMALE_WORD
                else:
                    dataload.logtime_print("Argument(s) error\n")
            elif dwm == '2':
                req_url = dataload.WEEKLY_RANKING_URL
                rank_word = dataload.WEEKLY_WORD
            elif dwm == '3':
                req_url = dataload.MONTHLY_RANKING_URL
                rank_word = dataload.MONTHLY_WORD
            else:
                dataload.logtime_print("Argument(s) error\n")
            log_context = 'Crawler set target to %s rank top' % rank_word
        elif ormode == 'r' or ormode == '2':
            if self.ir_mode == 1:
                dwm = dataload.logtime_input(
                    'Select daily(1)/weekly(2) R18 ranking type: ')
            elif self.ir_mode == 2:
                dwm = self.rtn_rank_type

            if dwm == '1':
                if mf_word == '0' or mf_word == 'n':
                    req_url = dataload.DAILY_RANKING_R18_URL
                    rank_word = dataload.DAILY_WORD
                # when male or female is chosen, the rank type can only be daily
                elif mf_word == '1' or mf_word == 'm':
                    req_url = dataload.DAILY_MALE_RANKING_R18_URL
                    rank_word = dataload.MALE_WORD
                elif mf_word == '2' or mf_word == 'f':
                    req_url = dataload.DAILY_FEMALE_RANKING_R18_URL
                    rank_word = dataload.FEMALE_WORD
                else:
                    dataload.logtime_print("Argument(s) error\n")
            elif dwm == '2':
                req_url = dataload.WEEKLY_RANKING_R18_URL
                rank_word = dataload.WEEKLY_WORD
            else:
                dataload.logtime_print("Argument(s) error\n")
            log_context = 'Crawler set target to %s r18 rank top' % rank_word
        else:
            dataload.logtime_print("Argument(s) error\n")
            log_context = None
        self.pvmx.logprowork(self.logpath, log_context)

        return req_url, ormode
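
The nested branches above reduce to a lookup from the sex-favor answer to a URL and label; a table-driven sketch of the daily ordinary branch (the dict, key normalization, and example answer are mine; the URL constants are the project's):

DAILY_ORDINARY = {
    'n': (dataload.DAILY_RANKING_URL, dataload.DAILY_WORD),
    'm': (dataload.DAILY_MALE_RANKING_URL, dataload.MALE_WORD),
    'f': (dataload.DAILY_FEMALE_RANKING_URL, dataload.FEMALE_WORD),
}
mf_word = 'f'                                                # example answer
key = {'0': 'n', '1': 'm', '2': 'f'}.get(mf_word, mf_word)   # accept digit aliases
req_url, rank_word = DAILY_ORDINARY.get(key, (None, None))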
Example 13
def main():
    """main() function

    Get user input arguments and launch mode function
    :return:    none
    """

    print(PixivAPILib.__doc__)
    mode_interactive_server = 1                     # interactive mode (1, default) or server mode (2)
    # judge the count of command line argument
    # if no external arguments, into interactive mode
    if len(sys.argv) == 1:
        mode_interactive_server = 1
        # program work continue ask
        ask_res = dataload.logtime_input('%s launch, continue? (Y/N): ' % dataload.PROJECT_NAME)
        if ask_res == 'N' or ask_res == 'No' or ask_res == 'n':
            dataload.logtime_print("User exit program\n")
            exit(0)
        # website id and password require
        ask_res = dataload.logtime_input(
            'Crawler will use your Pixiv-ID and password to login to the website, agree? (Y/N): ')
        if ask_res == 'N' or ask_res == 'No' or ask_res == 'n':
            dataload.logtime_print("No ID and password crawler cannot work, exit")
            exit(0)
        
        api_instance = PixivAPILib(mode_interactive_server) # instantiate the API class
        api_instance.camouflage_login()                     # crawler simulated login
        # multiple task cycles
        while True:
            mode = dataload.logtime_input('Login finished, select mode: ')
            # ranking top N mode
            if mode == 'rtn' or mode == '1':
                dataload.logtime_print('Mode: [Ranking Top N]')
                rtn_instance = rtn(dataload.RANK_DIR, dataload.LOG_PATH, 
                    dataload.HTML_PATH, api_instance, mode_interactive_server)
                rtn_instance.start()
            # illustrator repositories all mode
            elif mode == 'ira' or mode == '2':
                dataload.logtime_print('Mode: [Illustrator Repository All]')
                ira_instance = ira(dataload.REPO_DIR, dataload.LOG_NAME, 
                    dataload.HTML_NAME, api_instance, mode_interactive_server)
                ira_instance.start()
            # help page
            elif mode == 'help' or mode == '3':
                print(PixivAPILib.__doc__)
            # user normal exit program
            elif mode == 'exit' or mode == '4':
                dataload.logtime_print("User exit program")
                dataload.crawler_logo()         # exit print logo
                exit(0)
            # input parameter error, into next circle
            else:
                dataload.logtime_print("Argument(s) error")
    else:
        mode_interactive_server = 2
        # parse command line arguments ('=' marks long options that take a value)
        opts, args = getopt.getopt(sys.argv[1:], "hm:r:l:s:i:",
                                   ["help", "mode=", "R18=", "list=", "sex=", "id="])
        catch_mode = '1'
        rtn_r18_opt = '1'
        rtn_list_type = '1'
        rtn_mf_word = ''
        ira_illust_id = ''
        for opt, value in opts:
            if opt in ("-m", "--mode"):
                catch_mode = value
            elif opt in ("-r", "--R18"):
                rtn_r18_opt = value
            elif opt in ("-l", "--list"):
                rtn_list_type = value
            elif opt in ("-s", "--sex"):
                rtn_mf_word = value
            elif opt in ("-i", "--id"):
                ira_illust_id = value
            elif opt in ("-h", "--help"):
                print(PixivAPILib.__doc__)
                exit(0)
    
        api_instance = PixivAPILib(mode_interactive_server) # instantiate the API class
        api_instance.camouflage_login()                     # crawler simulated login

        if catch_mode == '1':
            dataload.logtime_print('Mode: [Ranking Top N]')
            rtn_instance = rtn(dataload.RANK_DIR, dataload.LOG_PATH, 
                dataload.HTML_PATH, api_instance, mode_interactive_server, 
                rtn_r18_opt, rtn_list_type, rtn_mf_word)
            rtn_instance.start()
        # illustrator repositories all mode
        elif catch_mode == '2':
            dataload.logtime_print('Mode: [Illustrator Repository All]')
            ira_instance = ira(dataload.REPO_DIR, dataload.LOG_NAME, 
                dataload.HTML_NAME, api_instance, mode_interactive_server, ira_illust_id)
            ira_instance.start()
        # help page
        elif catch_mode == 'help' or catch_mode == '3':
            print(PixivAPILib.__doc__)
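
For reference, the option string 'hm:r:l:s:i:' (with the '=' suffixes on the long names) means every option except -h/--help takes a value; a standalone sketch of the parsing with a hypothetical argv:

import getopt

argv = ['-m', '2', '-i', '15115322']    # hypothetical: IRA mode for one illustrator
opts, args = getopt.getopt(argv, 'hm:r:l:s:i:',
                           ['help', 'mode=', 'R18=', 'list=', 'sex=', 'id='])
# opts == [('-m', '2'), ('-i', '15115322')]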