def handleDebugTagSearchPage(self, response, url):
    """Dump and/or echo a tag-search response when debugging is configured.

    Nothing happens unless ``self._config.enableDump`` is set.  With it set:

    * ``dumpTagSearchPage`` hands ``response`` to ``PixivHelper.dumpHtml``
      under a file name built from ``url`` and logs that file name;
    * ``debugHttp`` prints the response via ``PixivHelper.safePrint``.

    NOTE(review): the ``debugHttp`` echo is treated as gated by
    ``enableDump``; confirm against the original indentation.
    """
    settings = self._config
    if not settings.enableDump:
        return

    if settings.dumpTagSearchPage:
        target_name = "TagSearch Page for {0}.html".format(url)
        PixivHelper.dumpHtml(target_name, response)
        PixivHelper.print_and_log('info', 'Dumping html to: {0}'.format(target_name))

    if settings.debugHttp:
        reply_text = PixivHelper.toUnicode(response)
        PixivHelper.safePrint(u"reply: {0}".format(reply_text))
def handleDebugMediumPage(self, response, imageId):
    """Dump and/or echo a medium-page response when debugging is configured.

    Nothing happens unless ``self._config.enableDump`` is set.  With it set:

    * ``dumpMediumPage`` hands ``response`` to ``PixivHelper.dumpHtml``
      under a file name built from ``imageId`` and logs that file name;
    * ``debugHttp`` prints the response via ``PixivHelper.safePrint``.

    NOTE(review): the ``debugHttp`` echo is treated as gated by
    ``enableDump``; confirm against the original indentation.
    """
    settings = self._config
    if not settings.enableDump:
        return

    if settings.dumpMediumPage:
        target_name = "Medium Page for Image Id {0}.html".format(imageId)
        PixivHelper.dumpHtml(target_name, response)
        PixivHelper.print_and_log('info', 'Dumping html to: {0}'.format(target_name))

    if settings.debugHttp:
        reply_text = PixivHelper.toUnicode(response)
        PixivHelper.safePrint(u"reply: {0}".format(reply_text))
def getSearchTagPage(self, tags, current_page, wild_card=True, title_caption=False,
                     start_date=None, end_date=None, member_id=None,
                     oldest_first=False, start_page=1):
    """Fetch and parse one page of tag-search results.

    Two modes are supported:

    * ``member_id`` given: the page is obtained through ``getMemberPage`` and
      converted with ``PixivModelWhiteCube.PixivTags.parseMemberTags``.
    * ``member_id`` is None: a search URL is built with
      ``PixivHelper.generateSearchTagUrl``, fetched with ``getPixivPage`` and
      parsed with ``PixivModel.PixivTags.parseTags``.

    ``start_page`` is accepted for signature compatibility but is not used
    by this implementation.

    Returns:
        A ``(result, response)`` tuple: the parsed tags object and the raw
        response it was built from.

    Raises:
        Whatever ``parseTags`` raises; the raw response is dumped through
        ``PixivHelper.dumpHtml`` before the exception is re-raised.
    """
    if member_id is not None:
        # from member id search by tags
        (artist, response) = self.getMemberPage(member_id, current_page, False, tags)

        # convert to PixivTags
        result = PixivModelWhiteCube.PixivTags()
        result.parseMemberTags(artist, member_id, tags)
        return (result, response)

    # search by tags (member_id is always None from here on, so the former
    # parseMemberTags call on this path could never run and was removed)
    url = PixivHelper.generateSearchTagUrl(tags, current_page,
                                           title_caption,
                                           wild_card,
                                           oldest_first,
                                           start_date,
                                           end_date,
                                           member_id,
                                           self._config.r18mode)

    PixivHelper.print_and_log('info', 'Looping... for ' + url)
    response = self.getPixivPage(url, returnParsed=False).read()
    self.handleDebugTagSearchPage(response, url)

    parse_search_page = BeautifulSoup(response)
    try:
        result = PixivModel.PixivTags()
        try:
            result.parseTags(parse_search_page, tags)
        except BaseException:
            # keep the offending page around for troubleshooting
            PixivHelper.dumpHtml("Dump for SearchTags " + tags + ".html", response)
            raise
    finally:
        # release the parse tree even when parsing fails
        parse_search_page.decompose()
        del parse_search_page

    return (result, response)
def ParseInfo(self, page, fromImage=False, bookmark=False):
    """Fill in artist token, id, name and avatar from a parsed page.

    The page is probed for three known layouts, in order:

    1. an element with class ``_unit profile-unit``;
    2. an element with class ``_user-profile-card`` (Issue #236);
    3. the user's own manage page, recognised by a ``ul.related`` list whose
       markup contains ``upload.php``.

    If none matches, placeholder values are stored and the page is dumped
    through ``PixivHelper.dumpHtml`` for later inspection.

    Args:
        page: parsed page object offering ``find`` / ``findAll``.
        fromImage: forwarded unchanged to ``self.ParseToken``.
        bookmark: accepted for signature compatibility; not used here.
    """
    avatarBox = page.find(attrs={'class': '_unit profile-unit'})
    self.artistToken = self.ParseToken(page, fromImage)

    if avatarBox is not None:
        temp = str(avatarBox.find('a')['href'])
        self.artistId = int(re.search(r'id=(\d+)', temp).group(1))

        self.artistAvatar = str(page.find('img', attrs={'class': 'user-image'})['src'])
        try:
            h1 = page.find('h1', attrs={'class': 'user'})
            if h1 is not None:
                self.artistName = unicode(h1.string.extract())
            else:
                avatar_m = page.findAll(attrs={"class": "avatar_m"})
                if avatar_m:
                    self.artistName = unicode(avatar_m[0]["title"])
                # when neither element exists the name is left untouched
        except Exception:
            # Name extraction is best-effort: fall back to the token.
            # Exception (not a bare except) so Ctrl-C / SystemExit still
            # propagate.
            self.artistName = self.artistToken
        return

    # Issue #236
    avatarBox = page.find(attrs={'class': '_user-profile-card'})
    if avatarBox is not None:
        temp = avatarBox.find('a')
        self.artistId = int(re.search(r'id=(\d+)', temp['href']).group(1))
        self.artistName = unicode(temp['title'])
        # the avatar URL is embedded in the anchor's inline style
        self.artistAvatar = temp['style'].replace("background-image: url('", "").replace("');", "")
        return

    # check if self manage page
    submit_related = page.findAll("ul", attrs={'class': 'related'})
    if submit_related and str(submit_related[0]).find("upload.php") > 0:
        PixivHelper.printAndLog("info", "Manage Page.")
        self.artistAvatar = "your profile"
        self.artistName = "yourself"
        temp = page.find("h1", attrs={'class': 'column-title'}).find("a")
        self.artistId = int(re.search(r'id=(\d+)', temp['href']).group(1))
        return

    # Issue #236
    # cannot parse information
    self.artistAvatar = "no_profile"
    self.artistName = "self"
    title = page.find("title").text
    filename = u"Dump for {0} UnknownProfile for {1}.html".format(title, self.artistToken)
    PixivHelper.printAndLog("error", u"Cannot parse artist info, dumping to {0}".format(filename))
    PixivHelper.dumpHtml(filename, page)
def getSearchTagPage(self, tags, current_page, wild_card=True, title_caption=False,
                     start_date=None, end_date=None, member_id=None,
                     oldest_first=False, start_page=1):
    """Fetch and parse one page of tag-search results.

    With ``member_id`` set, the page comes from ``getMemberPage`` and is
    converted by ``PixivModelWhiteCube.PixivTags.parseMemberTags``.
    Otherwise a search URL is generated by
    ``PixivHelper.generateSearchTagUrl``, fetched through ``getPixivPage``
    and parsed by ``PixivModel.PixivTags.parseTags``.

    ``start_page`` is accepted for signature compatibility but is not used
    by this implementation.

    Returns:
        ``(result, response)``: the parsed tags object and the raw response.

    Raises:
        Whatever ``parseTags`` raises; the raw response is dumped through
        ``PixivHelper.dumpHtml`` before the exception is re-raised.
    """
    if member_id is not None:
        # from member id search by tags
        (artist, response) = self.getMemberPage(member_id, current_page, False, tags)

        # convert to PixivTags
        result = PixivModelWhiteCube.PixivTags()
        result.parseMemberTags(artist, member_id, tags)
        return (result, response)

    # search by tags; member_id is None on this path, so the old
    # parseMemberTags call that lived here was unreachable and is gone
    url = PixivHelper.generateSearchTagUrl(tags, current_page,
                                           title_caption,
                                           wild_card,
                                           oldest_first,
                                           start_date,
                                           end_date,
                                           member_id,
                                           self._config.r18mode)

    PixivHelper.print_and_log('info', 'Looping... for ' + url)
    response = self.getPixivPage(url, returnParsed=False).read()
    self.handleDebugTagSearchPage(response, url)

    parse_search_page = BeautifulSoup(response)
    try:
        result = PixivModel.PixivTags()
        try:
            result.parseTags(parse_search_page, tags)
        except BaseException:
            # keep the offending page around for troubleshooting
            PixivHelper.dumpHtml("Dump for SearchTags " + tags + ".html", response)
            raise
    finally:
        # free the parse tree on success and on failure alike
        parse_search_page.decompose()
        del parse_search_page

    return (result, response)
def getSearchTagPage(self, tags, current_page, wild_card=True, title_caption=False,
                     start_date=None, end_date=None, member_id=None,
                     oldest_first=False, start_page=1, include_bookmark_data=False):
    """Fetch and parse one page of tag-search results.

    With ``member_id`` set, the page comes from ``getMemberPage`` and is
    converted by ``PixivTags.parseMemberTags``.  Otherwise a search URL is
    generated by ``PixivHelper.generateSearchTagUrl``, fetched through
    ``getPixivPage`` and parsed by ``PixivTags.parseTags``.

    When ``include_bookmark_data`` is true (tag search only), every item in
    ``result.itemList`` is enriched with ``bookmarkCount`` and
    ``imageResponse`` read from ``https://www.pixiv.net/ajax/illust/<id>``;
    those lookups go through ``_get_from_cache`` / ``_put_to_cache``.

    ``start_page`` is accepted for signature compatibility but is not used
    by this implementation.

    Returns:
        ``(result, response)``: the parsed tags object and the search
        response it was built from.

    Raises:
        Any error raised while parsing or while retrieving bookmark data
        (including non-404 ``urllib2.HTTPError``); the search response is
        dumped through ``PixivHelper.dumpHtml`` before re-raising.
    """
    if member_id is not None:
        # from member id search by tags
        (artist, response) = self.getMemberPage(member_id, current_page, False, tags)

        # convert to PixivTags
        result = PixivTags()
        result.parseMemberTags(artist, member_id, tags)
        return (result, response)

    # search by tags; member_id is None on this path, so the old
    # parseMemberTags call that lived here was unreachable and is gone
    url = PixivHelper.generateSearchTagUrl(tags, current_page,
                                           title_caption,
                                           wild_card,
                                           oldest_first,
                                           start_date,
                                           end_date,
                                           member_id,
                                           self._config.r18mode)

    PixivHelper.print_and_log('info', 'Looping... for ' + url)
    response = self.getPixivPage(url, returnParsed=False)
    self.handleDebugTagSearchPage(response, url)

    result = None
    try:
        result = PixivTags()
        result.parseTags(response, tags, current_page)

        # parse additional information
        if include_bookmark_data:
            total = len(result.itemList)
            print("Retrieving bookmark information...", end=' ')
            for idx, image in enumerate(result.itemList, 1):
                print("\r", end=' ')
                print("Retrieving bookmark information... [{0}] of [{1}]".format(idx, total), end=' ')

                img_url = "https://www.pixiv.net/ajax/illust/{0}".format(image.imageId)
                # Use a dedicated variable: reusing `response` here made the
                # function return (and dump) the last image's JSON instead
                # of the search page.
                image_response = self._get_from_cache(img_url)
                if image_response is None:
                    try:
                        image_response = self.open_with_retry(img_url).read()
                    except urllib2.HTTPError as ex:
                        if ex.code != 404:
                            # previously fell through with None, which was
                            # cached and then broke json.loads
                            raise
                        # a 404 still carries a JSON body
                        image_response = ex.read()
                    self._put_to_cache(img_url, image_response)

                image_info_js = json.loads(image_response)
                image.bookmarkCount = int(image_info_js["body"]["bookmarkCount"])
                image.imageResponse = int(image_info_js["body"]["responseCount"])
            print("")
    except BaseException:
        PixivHelper.dumpHtml("Dump for SearchTags " + tags + ".html", response)
        raise

    return (result, response)
def getSearchTagPage(self, tags, i, wild_card=True, title_caption=False,
                     start_date=None, end_date=None, member_id=None,
                     oldest_first=False, start_page=1):
    """Fetch and parse page ``i`` of tag-search results.

    When ``self._isWhitecube`` is set, the whitecube RPC endpoint is
    queried and the response is parsed by
    ``PixivModelWhiteCube.PixivTags.parseTags``; searching within a member
    (``member_id`` given) is not supported in that mode and yields
    ``(None, None)``.

    Otherwise the URL comes from ``PixivHelper.generateSearchTagUrl`` and the
    page is parsed with BeautifulSoup by ``PixivModel.PixivTags``
    (``parseMemberTags`` when ``member_id`` is given, else ``parseTags``).

    Returns:
        ``(result, response)``: the parsed tags object and the raw response.

    Raises:
        Whatever ``parseTags`` raises on the non-whitecube path; the raw
        response is dumped through ``PixivHelper.dumpHtml`` first.
    """
    response = None
    result = None

    if self._isWhitecube:
        if member_id is None:
            # from search page:
            # https://www.pixiv.net/rpc/whitecube/index.php?order=date&adult_mode=include&q=vocaloid&p=0&type=&mode=whitecube_search&s_mode=s_tag&scd=&size=&ratio=&like=&tools=&tt=4e2cdee233f1156231ee99da1e51a83c
            url = "https://www.pixiv.net/rpc/whitecube/index.php?q={0}".format(tags)
            url = url + "&adult_mode={0}".format("include")
            url = url + "&mode={0}".format("whitecube_search")

            # date ordering
            order = "date" if oldest_first else "date_d"
            url = url + "&order={0}".format(order)

            # search mode: wild_card takes precedence over title_caption
            if wild_card:
                s_mode = "s_tag"
            elif title_caption:
                s_mode = "s_tc"
            else:
                s_mode = "s_tag_full"
            url = url + "&s_mode={0}".format(s_mode)

            # start/end date
            if start_date is not None:
                url = url + "&scd={0}".format(start_date)
            if end_date is not None:
                url = url + "&ecd={0}".format(end_date)

            url = url + "&p={0}".format(i)
            url = url + "&start_page={0}".format(start_page)
            url = url + "&tt={0}".format(self._whitecubeToken)

            PixivHelper.printAndLog('info', 'Looping for {0} ...'.format(url))
            response = self.open(url).read()

            PixivHelper.GetLogger().debug(response)
            result = PixivModelWhiteCube.PixivTags()
            result.parseTags(response, tags)
        else:
            # from member id search by tags
            # parenthesised form prints identically on Python 2 and 3
            print("Not supported yet")
    else:
        url = PixivHelper.generateSearchTagUrl(tags, i,
                                               title_caption,
                                               wild_card,
                                               oldest_first,
                                               start_date,
                                               end_date,
                                               member_id,
                                               self._config.r18mode)

        PixivHelper.printAndLog('info', 'Looping... for ' + url)
        response = self.open(url).read()
        parse_search_page = BeautifulSoup(response)
        try:
            if self._config.dumpTagSearchPage and self._config.enableDump:
                dump_filename = PixivHelper.dumpHtml(url + ".html", parse_search_page)
                PixivHelper.printAndLog('info', "Dump tag search page to: " + dump_filename)

            result = PixivModel.PixivTags()
            if member_id is not None:
                result.parseMemberTags(parse_search_page, member_id, tags)
            else:
                try:
                    result.parseTags(parse_search_page, tags)
                except BaseException:
                    # Dump the raw response.  The previous code referenced an
                    # undefined `search_page`, so a NameError replaced the
                    # real parsing error.
                    PixivHelper.dumpHtml("Dump for SearchTags " + tags + ".html", response)
                    raise
        finally:
            # release the parse tree even when parsing fails
            parse_search_page.decompose()
            del parse_search_page

    return (result, response)
def getSearchTagPage(self, tags, current_page, wild_card=True, title_caption=False,
                     start_date=None, end_date=None, member_id=None,
                     oldest_first=False, start_page=1):
    """Fetch and parse one page of tag-search results.

    * ``member_id`` given: the page is retrieved via ``getMemberPage`` and
      converted with ``PixivModelWhiteCube.PixivTags.parseMemberTags``.
    * ``member_id`` is None: the URL from
      ``PixivHelper.generateSearchTagUrl`` is fetched via ``getPixivPage``
      and parsed with ``PixivModel.PixivTags.parseTags``.

    ``start_page`` is accepted for signature compatibility but is not used
    by this implementation.

    Returns:
        ``(result, response)``: the parsed tags object and the raw response.

    Raises:
        Whatever ``parseTags`` raises; the raw response is dumped through
        ``PixivHelper.dumpHtml`` before the exception is re-raised.
    """
    if member_id is not None:
        # from member id search by tags
        (artist, response) = self.getMemberPage(member_id, current_page, False, tags)

        # convert to PixivTags
        result = PixivModelWhiteCube.PixivTags()
        result.parseMemberTags(artist, member_id, tags)
        return (result, response)

    # search by tags; member_id is None on this path, so the old
    # parseMemberTags call that lived here was unreachable and is gone
    url = PixivHelper.generateSearchTagUrl(tags, current_page,
                                           title_caption,
                                           wild_card,
                                           oldest_first,
                                           start_date,
                                           end_date,
                                           member_id,
                                           self._config.r18mode)

    PixivHelper.print_and_log('info', 'Looping... for ' + url)
    response = self.getPixivPage(url, returnParsed=False).read()
    self.handleDebugTagSearchPage(response, url)

    parse_search_page = BeautifulSoup(response)
    try:
        result = PixivModel.PixivTags()
        try:
            result.parseTags(parse_search_page, tags)
        except BaseException:
            # keep the offending page around for troubleshooting
            PixivHelper.dumpHtml("Dump for SearchTags " + tags + ".html", response)
            raise
    finally:
        # free the parse tree on success and on failure alike
        parse_search_page.decompose()
        del parse_search_page

    return (result, response)