def gather_essential_info(ormode, whole_nbr):
    """Ask the user how many ranking images to crawl.

    The ordinary and R18 branches were identical except for two message
    strings, so the prompt/validation loop is now shared.

    :param ormode: ranking type selector: 'o'/'1' ordinary, 'r'/'2' R18
    :param whole_nbr: total number of images available in the ranking
    :return: requested image count (0 for an unrecognized ``ormode``)
    """
    # pick the branch-specific message strings; bail out with 0 on an
    # unknown mode, matching the original else-pass behavior
    if ormode == 'o' or ormode == '1':
        prompt = 'gather whole ordinary vaild target %d, enter you want: '
        err_msg = 'input error, rank top at most %d'
    elif ormode == 'r' or ormode == '2':
        prompt = 'gather whole R18 vaild target %d, enter you want: '
        err_msg = 'input error, rank R18 top at most %d'
    else:
        return 0

    # input a string for request image number
    img_cnt = int(dataload.SBH_INPUT(prompt % whole_nbr))
    # re-prompt until the request fits inside the available ranking
    while img_cnt > whole_nbr:
        dataload.SBH_PRINT(err_msg % whole_nbr)
        img_cnt = int(
            dataload.SBH_INPUT('enter again(max is %d): ' % whole_nbr))
    return img_cnt
def target_confirm(log_path):
    """Prompt for a ranking category and resolve the request URL.

    :param log_path: log save path
    :return: tuple of (request mainpage url or None, selected mode string)
    """
    _pvmx.logprowork(log_path, 'gather ranking list======>')
    req_url = None
    rank_word = None
    ormode = dataload.SBH_INPUT(
        'select ranking type, ordinary(o|1) or r18(r|2): ')
    if ormode in ('o', '1'):
        period = dataload.SBH_INPUT(
            'select daily(1)|weekly(2)|monthly(3) ordinary ranking type: ')
        if period == '1':
            req_url, rank_word = (dataload.DAILY_RANKING_URL,
                                  dataload.DAILY_WORD)
        elif period == '2':
            req_url, rank_word = (dataload.WEEKLY_RANKING_URL,
                                  dataload.WEEKLY_WORD)
        elif period == '3':
            req_url, rank_word = (dataload.MONTHLY_RANKING_URL,
                                  dataload.MONTHLY_WORD)
        else:
            dataload.SBH_PRINT("argv(s) error\n")
        log_context = 'crawler set target to %s rank top' % rank_word
    elif ormode in ('r', '2'):
        period = dataload.SBH_INPUT(
            'select daily(1)/weekly(2) R18 ranking type: ')
        if period == '1':
            req_url, rank_word = (dataload.DAILY_RANKING_R18_URL,
                                  dataload.DAILY_WORD)
        elif period == '2':
            req_url, rank_word = (dataload.WEEKLY_RANKING_R18_URL,
                                  dataload.WEEKLY_WORD)
        else:
            dataload.SBH_PRINT("argv(s) error\n")
        log_context = 'crawler set target to %s r18 rank top' % rank_word
    else:
        # unrecognized mode: log a None context, same as before
        dataload.SBH_PRINT("argv(s) error\n")
        log_context = None
    _pvmx.logprowork(log_path, log_context)
    return req_url, ormode
def __init__(self, workdir, log_name, html_name):
    """Ask for the target illustrator id and derive work paths from it.

    :param workdir: work directory
    :param log_name: log name
    :param html_name: html name
    """
    pixiv_id = dataload.SBH_INPUT('target crawl illustrator pixiv-id: ')
    self.user_input_id = pixiv_id
    # per-illustrator repository directory, then log/html files inside it
    self.workdir = workdir + 'illustrepo_' + pixiv_id
    self.logpath = self.workdir + log_name
    self.htmlpath = self.workdir + html_name
def main():
    """Program entry point: show usage, then dispatch the selected mode.

    :return: none
    """
    print(Matrix.__doc__)
    mode = dataload.SBH_INPUT('select mode: ')
    if mode in ('rtn', '1'):
        # ranking crawl task
        rtn(dataload.RANK_DIR, dataload.LOG_PATH, dataload.HTML_PATH).start()
    elif mode in ('ira', '2'):
        # illustrator repository crawl task
        ira(dataload.REPO_DIR, dataload.LOG_NAME, dataload.HTML_NAME).start()
    elif mode in ('help', '3'):
        print(Matrix.__doc__)
    else:
        dataload.SBH_PRINT("argv(s) error\n")
def crawl_allpage_target(self, illust_id, nbr, arthor_name, log_path):
    """Gather every artwork page of an illustrator and build URL lists.

    :param self: self class
    :param illust_id: illustrator id
    :param nbr: total number of images the illustrator has
    :param arthor_name: arthor name
    :param log_path: log save path
    :return: tuple of (original image urls, artwork base-page urls)
    """
    # each page serves at most 20 images, so take the ceiling of
    # nbr / 20 (minimum one page).  The previous int(nbr / 20) + 1
    # over-requested one extra page whenever nbr was an exact
    # multiple of 20 (e.g. 40 -> 3 pages instead of 2).
    need_pagecnt = max(1, (nbr + 19) // 20)

    # gather all data across the required pages
    all_targeturls = []
    all_artworknames = []
    for i in range(need_pagecnt):
        data_capture = self.crawl_onepage_data(illust_id, i + 1, log_path)
        all_targeturls += data_capture[0]
        all_artworknames += data_capture[1]

    # collection target count
    alive_targetcnt = len(all_targeturls)
    log_context = ("gather all repo %d, whole target(s): %d"
                   % (nbr, alive_targetcnt))
    _pvmx.logprowork(log_path, log_context)

    # ask how many of the collected targets to actually download
    nbr_capture = int(dataload.SBH_INPUT('enter you want count: '))
    while (nbr_capture > alive_targetcnt) or (nbr_capture <= 0):
        nbr_capture = int(
            dataload.SBH_INPUT('error, input count must <= %d and not 0: '
                               % alive_targetcnt))
    log_context = ("check crawl illustrator id:" + self.user_input_id
                   + " image(s):%d" % nbr_capture)
    _pvmx.logprowork(log_path, log_context)

    # cut need data: extract artwork ids and build base-page urls
    artwork_ids = []
    target_capture = []
    basepages = []
    for url in all_targeturls[:nbr_capture]:
        target_capture.append(url)
        # NOTE(review): fixed-position slice assumes a rigid url layout
        # (id lives at chars 57..-7) — brittle; confirm against the
        # urls produced by crawl_onepage_data before changing
        img_id = url[57:-7]
        artwork_ids.append(img_id)
        basepages.append(dataload.BASEPAGE_URL + img_id)

    # log images info
    log_context = ('illustrator: ' + arthor_name + ' id: '
                   + self.user_input_id + ' artworks info====>')
    _pvmx.logprowork(log_path, log_context)
    for k, name in enumerate(all_artworknames[:nbr_capture]):
        log_context = ('no.%d image: [%s | id: %s | url: %s]'
                       % ((k + 1), name, artwork_ids[k], target_capture[k]))
        _pvmx.logprowork(log_path, log_context)
    return target_capture, basepages
def _login_preload(logincr_path):
    """Get user input username and password

    login.cr file example:
    =================================
    [login]
    <mail>
    <passwd>
    =================================

    :param logincr_path: login.cr file path
    :return: username, password, get data
    """
    # NOTE(review): this function body is partially corrupted — the
    # literal '******' runs below look like credential-scrubbing /
    # redaction damage, and the statements around them are not valid
    # Python.  Restore the original from version-control history before
    # modifying; the tokens are preserved verbatim here.
    is_login_file_existed = os.path.exists(logincr_path)
    if is_login_file_existed:
        # read two row content
        user_mailbox = linecache.getline(logincr_path, 2)
        user_password = linecache.getline(logincr_path, 3)
        # empty file
        if user_mailbox == '' or user_password == '':
            dataload.SBH_PRINT(
                "login.cr file invaild, please input your login info")
            user_mailbox = dataload.SBH_INPUT(
                'enter your pixiv id(mailbox), must be a R18: ')
            # pycharm python console not support getpass input
            # NOTE(review): redacted segment — the confirm prompt and the
            # assignment to `check` are missing between the two literals
            user_password = getpass.getpass(
                dataload.SHELL_BASHHEAD +
                'enter your account password: '******'yes' and check != 'Yes'
                    and check != 'YES' and check != 'y' and check != 'Y'):
                dataload.SBH_PRINT("you can write new info")
                user_mailbox = dataload.SBH_INPUT(
                    'enter your pixiv id(mailbox), must be a R18: ')
                # NOTE(review): redacted segment — code after the password
                # prompt (presumably writing login.cr) is missing
                user_password = getpass.getpass(
                    dataload.SHELL_BASHHEAD +
                    'enter your account password: '******'enter your pixiv id(mailbox), must be a R18: ')
        # NOTE(review): redacted segment — trailing '\n' literal below is
        # an orphan of the scrubbed file-write logic
        user_password = getpass.getpass(
            dataload.SHELL_BASHHEAD +
            'enter your account password: '******'\n'
    # normalize credentials read from file/stdin (strip the newline that
    # linecache keeps)
    username = user_mailbox.strip()
    passwd = user_password.strip()
    getway_reg_info = [('user', username), ('pass', passwd)]
    getway_data \
        = urllib.parse.urlencode(getway_reg_info).encode(encoding='UTF8')
    # return login need 3 elements
    return username, passwd, getway_data