def gather_essential_info(ormode, whole_nbr):
    """Ask the user how many ranking images to crawl.

    The original implementation duplicated the prompt/validate loop for the
    ordinary and R18 branches; both branches are folded into one loop with
    parameterized message strings. (Also fixes the "vaild" typo in the
    user-facing prompt.)

    :param ormode:      ranking type: 'o'/'1' ordinary, 'r'/'2' R18
    :param whole_nbr:   total crawlable image count in the ranking
    :return:            requested image count (0 for an unrecognized mode)
    """
    if ormode == 'o' or ormode == '1':
        kind_word = 'ordinary'
        err_fmt = 'input error, rank top at most %d'
    elif ormode == 'r' or ormode == '2':
        kind_word = 'R18'
        err_fmt = 'input error, rank R18 top at most %d'
    else:
        # unknown mode: keep the original fall-through behavior (return 0)
        return 0

    # prompt for a count, then re-prompt until it does not exceed the total
    img_cnt = int(
        dataload.SBH_INPUT(
            'gather whole %s valid target %d, enter you want: ' %
            (kind_word, whole_nbr)))
    while img_cnt > whole_nbr:
        dataload.SBH_PRINT(err_fmt % whole_nbr)
        img_cnt = int(
            dataload.SBH_INPUT('enter again(max is %d): ' % whole_nbr))

    return img_cnt
    def target_confirm(log_path):
        """Ask the user which ranking board to crawl.

        :param log_path:    log save path
        :return:            (request mainpage url, selected mode string)
        """
        _pvmx.logprowork(log_path, 'gather ranking list======>')
        req_url = None
        rank_word = None
        ormode = dataload.SBH_INPUT(
            'select ranking type, ordinary(o|1) or r18(r|2): ')
        if ormode in ('o', '1'):
            dwm = dataload.SBH_INPUT(
                'select daily(1)|weekly(2)|monthly(3) ordinary ranking type: ')
            # map sub-choice -> (ranking url, ranking word)
            choices = {
                '1': (dataload.DAILY_RANKING_URL, dataload.DAILY_WORD),
                '2': (dataload.WEEKLY_RANKING_URL, dataload.WEEKLY_WORD),
                '3': (dataload.MONTHLY_RANKING_URL, dataload.MONTHLY_WORD),
            }
            if dwm in choices:
                req_url, rank_word = choices[dwm]
            else:
                dataload.SBH_PRINT("argv(s) error\n")
            log_context = 'crawler set target to %s rank top' % rank_word
        elif ormode in ('r', '2'):
            dwm = dataload.SBH_INPUT(
                'select daily(1)/weekly(2) R18 ranking type: ')
            choices = {
                '1': (dataload.DAILY_RANKING_R18_URL, dataload.DAILY_WORD),
                '2': (dataload.WEEKLY_RANKING_R18_URL, dataload.WEEKLY_WORD),
            }
            if dwm in choices:
                req_url, rank_word = choices[dwm]
            else:
                dataload.SBH_PRINT("argv(s) error\n")
            log_context = 'crawler set target to %s r18 rank top' % rank_word
        else:
            dataload.SBH_PRINT("argv(s) error\n")
            log_context = None
        _pvmx.logprowork(log_path, log_context)

        return req_url, ormode
 def __init__(self, workdir, log_name, html_name):
     """Collect the target illustrator pixiv id and derive the work paths.

     :param workdir:     base work directory
     :param log_name:    log file name
     :param html_name:   html file name
     """
     # repository directory is keyed by the illustrator id the user enters
     self.user_input_id = dataload.SBH_INPUT(
         'target crawl illustrator pixiv-id: ')
     self.workdir = '%sillustrepo_%s' % (workdir, self.user_input_id)
     self.logpath = self.workdir + log_name
     self.htmlpath = self.workdir + html_name
def main():
    """Program entry point: show usage, then dispatch the selected mode.

    :return:    none
    """
    print(Matrix.__doc__)

    mode = dataload.SBH_INPUT('select mode: ')
    if mode in ('rtn', '1'):
        # rank-top crawl task
        rtn(dataload.RANK_DIR, dataload.LOG_PATH,
            dataload.HTML_PATH).start()
    elif mode in ('ira', '2'):
        # illustrator-repository crawl task
        ira(dataload.REPO_DIR, dataload.LOG_NAME,
            dataload.HTML_NAME).start()
    elif mode in ('help', '3'):
        print(Matrix.__doc__)
    else:
        dataload.SBH_PRINT("argv(s) error\n")
    def crawl_allpage_target(self, illust_id, nbr, arthor_name, log_path):
        """Gather original image urls across all of an illustrator's pages.

        :param self:        self class
        :param illust_id:   illustrator id
        :param nbr:         total artwork count to cover
        :param arthor_name: artist name (caller spelling preserved)
        :param log_path:    log save path
        :return:            (original image url list, artwork base page list)
        """
        # each result page holds at most 20 images, so ceil(nbr / 20)
        # requests suffice; the previous `int(nbr / 20) + 1` fetched one
        # extra page whenever nbr was an exact multiple of 20
        need_pagecnt = max(1, (nbr + 19) // 20)

        # gather url/name data from every page (pages are 1-based)
        all_targeturls = []
        all_artworknames = []
        for page in range(1, need_pagecnt + 1):
            data_capture = self.crawl_onepage_data(illust_id, page, log_path)
            all_targeturls += data_capture[0]
            all_artworknames += data_capture[1]
        # number of targets actually discovered
        alive_targetcnt = len(all_targeturls)

        log_context = ("gather all repo %d, whole target(s): %d" %
                       (nbr, alive_targetcnt))
        _pvmx.logprowork(log_path, log_context)

        # ask how many to download; must be in (0, alive_targetcnt]
        nbr_capture = int(dataload.SBH_INPUT('enter you want count: '))
        while (nbr_capture > alive_targetcnt) or (nbr_capture <= 0):
            nbr_capture = int(
                dataload.SBH_INPUT(
                    'error, input count must <= %d and not 0: ' %
                    alive_targetcnt))
        log_context = ("check crawl illustrator id:" + self.user_input_id +
                       " image(s):%d" % nbr_capture)
        _pvmx.logprowork(log_path, log_context)

        # slice the requested amount; derive artwork ids and base pages
        target_capture = all_targeturls[:nbr_capture]
        artwork_ids = []
        basepages = []
        for url in target_capture:
            # NOTE(review): id extraction assumes a fixed url layout
            # (characters 57..-7) — verify against the current url format
            img_id = url[57:-7]
            artwork_ids.append(img_id)
            basepages.append(dataload.BASEPAGE_URL + img_id)

        # log per-image info for the captured set
        log_context = ('illustrator: ' + arthor_name + ' id: ' +
                       self.user_input_id + ' artworks info====>')
        _pvmx.logprowork(log_path, log_context)
        for k, artname in enumerate(all_artworknames[:nbr_capture]):
            log_context = ('no.%d image: [%s | id: %s | url: %s]' %
                           ((k + 1), artname, artwork_ids[k],
                            target_capture[k]))
            _pvmx.logprowork(log_path, log_context)

        return target_capture, basepages
Exemple #6
0
    def _login_preload(logincr_path):
        """Get user input username and password

        login.cr file example:
        =================================
        [login]
        <mail>
        <passwd>
        =================================
        :param logincr_path:    login.cr file path
        :return:                username, password, get data
        """
        # NOTE(review): this function contains redaction artifacts
        # ('******' spans below) where the original source was scrubbed;
        # the statements around them are syntactically incomplete and must
        # be reconstructed from the upstream project before this code runs.
        is_login_file_existed = os.path.exists(logincr_path)
        if is_login_file_existed:
            # read two row content
            # (line 1 holds the '[login]' header; mail is on line 2,
            # password on line 3 — linecache returns '' past end-of-file)
            user_mailbox = linecache.getline(logincr_path, 2)
            user_password = linecache.getline(logincr_path, 3)
            # empty file
            if user_mailbox == '' or user_password == '':
                dataload.SBH_PRINT(
                    "login.cr file invaild, please input your login info")
                user_mailbox = dataload.SBH_INPUT(
                    'enter your pixiv id(mailbox), must be a R18: ')
                # pycharm python console not support getpass input
                user_password = getpass.getpass(
                    dataload.SHELL_BASHHEAD + 'enter your account password: '******'yes' and check != 'Yes' and check != 'YES'
                        and check != 'y' and check != 'Y'):
                    dataload.SBH_PRINT("you can write new info")
                    user_mailbox = dataload.SBH_INPUT(
                        'enter your pixiv id(mailbox), must be a R18: ')
                    user_password = getpass.getpass(
                        dataload.SHELL_BASHHEAD +
                        'enter your account password: '******'enter your pixiv id(mailbox), must be a R18: ')
            user_password = getpass.getpass(dataload.SHELL_BASHHEAD +
                                            'enter your account password: '******'\n'
        # strip trailing newlines left by linecache / interactive input
        username = user_mailbox.strip()
        passwd = user_password.strip()

        # pack credentials as an urlencoded POST body for the login gateway
        getway_reg_info = [('user', username), ('pass', passwd)]
        getway_data \
            = urllib.parse.urlencode(getway_reg_info).encode(encoding='UTF8')

        # return login need 3 elements
        return username, passwd, getway_data