def test_collector_download_destination_path_does_not_exist():
    """Download into a local destination directory that does not exist."""
    hour_stamp = datetime.datetime.now().strftime("%Y%m%d.%H")
    uta_label = f"#M.U1.{hour_stamp}#"
    archive_name = f"ia.icama.{uta_label}.zip"
    collector = Collector()
    collector.connect(host="192.168.179.130", username="******", password="******")
    collector.download(
        path="/home/fastfile/ewsd",
        filename=archive_name,
        destination_path="c:\\Zdenek\\_tmp\\nonexistant\\",
    )
def test_collector_download_invalid_request():
    """Calling download with all-None arguments (invalid request).

    The original version built a timestamped filename that was never
    passed to download() — that dead code is removed; the exercised
    behavior (download with path/filename/destination_path all None)
    is unchanged.
    """
    collector = Collector()
    collector.connect(host="192.168.179.130", username="******", password="******")
    collector.download(path=None, filename=None, destination_path=None)
class RankingCrawler():
    """Crawl Pixiv ranking JSON pages and feed artwork ids to a Collector.

    cookie: login cookie passed through to each request / the Collector.
    capacity: flow capacity (MB) for the Collector.
    """

    def __init__(self, cookie, capacity=1024):
        self.date = START_DATE
        self.domain = DOMAIN
        self.mode = PIXIV_MODES[PIXIV_MODE]
        # url sample: https://www.pixiv.net/ranking.php?mode=daily&date=20200801&p=1&format=json
        # ref url sample: https://www.pixiv.net/ranking.php?mode=daily&date=20200801
        self.url = 'https://www.pixiv.net/ranking.php?mode=' + self.mode
        self.cookie = cookie
        self.headers = {'x-requested-with': 'XMLHttpRequest'}
        self.collector = Collector(cookie, capacity)

    def __nextday(self):
        # advance the ranking date by one day
        self.date += datetime.timedelta(days=1)

    def collect(self):
        """Collect illust ids from the daily-ranking JSON pages."""
        # note that 50 artworks per p=x page
        page_num = (ARTWORKS_PER - 1) // 50 + 1  # ceil
        print("---start collecting " + self.mode + " ranking---")
        print("start with " + self.date.strftime("%Y-%m-%d"))
        print("end with " + (self.date + datetime.timedelta(
            days=self.domain - 1)).strftime("%Y-%m-%d" + '\n'))

        # store all jsons' url in self.group
        self.group = set()
        for _i in range(DOMAIN):
            for j in range(page_num):
                self.group.add(self.url + '&date=' +
                               self.date.strftime("%Y%m%d") + '&p=' +
                               str(j + 1) + '&format=json')
            self.__nextday()

        pool = ParallelPool(len(self.group))
        while len(self.group) or not pool.empty():
            time.sleep(THREAD_DELAY)
            # send ranking_json to parallel pool
            while not pool.full() and len(self.group):
                url = self.group.pop()
                ref = re.search('(.*)&p', url).group(1)
                # BUG FIX: dict.update() returns None, so the original
                # `headers = self.headers.update({'Referer': ref})` passed
                # headers=None to every CollectorUnit. Build a per-request
                # merged copy instead — this also avoids mutating one shared
                # dict from concurrent worker units.
                headers = {**self.headers, 'Referer': ref}
                pool.add(
                    CollectorUnit(url, self.cookie, ranking_selector, headers))
            # remove complete threads
            finished = pool.finished_item()
            while True:
                try:
                    ranking_json = next(finished)
                    self.collector.add(ranking_json.group)
                    if MOST_OUTPUT:
                        print("--send page " + ranking_json.url +
                              " to collector--")
                except StopIteration:
                    break

        print("\n---collect " + self.mode + " ranking complete---")

    def run(self):
        """Collect ranking ids, resolve them, then download."""
        self.collect()
        self.collector.collect()
        return self.collector.download()
class UserCrawler():
    """Crawl every artwork of a single artist via the profile/all endpoint."""

    def __init__(self, artist_id, cookie, capacity=1024):
        self.url = f'https://www.pixiv.net/ajax/user/{artist_id}/profile/all?lang=zh'
        self.ref = f'https://www.pixiv.net/users/{artist_id}/illustrations'
        self.headers = {'x-user-id': USER_ID, 'Referer': self.ref}
        self.cookie = cookie
        self.collector = Collector(cookie, capacity)

    def collect(self):
        """Fetch the artist's artwork list and hand it to the collector."""
        unit = CollectorUnit(self.url, self.cookie, user_selector, self.headers)
        unit.start()
        unit.join()
        self.collector.add(unit.group)
        print("--send user " + unit.url + " to collector--")

    def run(self):
        """Collect the artist's ids, resolve them, then download."""
        self.collect()
        self.collector.collect()
        return self.collector.download()
def test_collector_download_file_does_not_exist():
    """Download a remote file that does not exist on the server."""
    collector = Collector()
    collector.connect(host="192.168.179.130", username="******", password="******")
    collector.download(
        path="/home/fastfile/ewsd",
        filename="nonexistant",
        destination_path="c:\\Zdenek\\_tmp\\",
    )
class BookmarkCrawler():
    """Crawl the logged-in user's public bookmarks and download them.

    count-badge sample:
        </h1><span class="count-badge">3069.</span>
    artworks sample:
        href="/artworks/83348083
    (out-dated) see get_count & collect for more
    url sample:
        https://www.pixiv.net/bookmark.php?rest=show&p=1
        rest=show for public / rest=hide for private
        note that 20 artworks per p
    """

    def __init__(self, cookie, maxnum=200, capacity=1024):
        # maxnum: max number of downloads; capacity: flow capacity (MB)
        self.num = maxnum
        self.cookie = cookie
        self.url = "https://www.pixiv.net/ajax/user/" + USER_ID + "/illusts"
        self.headers = BROWSER_HEADER
        self.collect_cnt = 0
        self.collector = Collector(cookie, capacity)

    def get_count(self):
        """Fetch the public-bookmark count and clamp self.num to it.

        Endpoint: https://www.pixiv.net/ajax/user/xxxx/illusts/bookmark/tags?lang=zh
        Exits the process after FAIL_TIMES failed attempts.
        """
        count_url = self.url + "/bookmark/tags?lang=zh"
        print("---start collecting bookmark count---")
        for i in range(FAIL_TIMES):
            try:
                response = requests.get(count_url,
                                        headers=self.headers,
                                        proxies=PROXIES,
                                        cookies=self.cookie,
                                        timeout=4)
                if response.status_code == 200:
                    cnt = response.json()['body']['public'][0]['cnt']
                    self.num = min(self.num, int(cnt))
                    # BUG FIX: cnt may be an int in the JSON payload; the
                    # original `"total count: " + cnt` raised TypeError in
                    # that case, which the broad except silently swallowed,
                    # wasting every retry attempt.
                    print("total count: " + str(cnt))
                    print("download count: " + str(self.num))
                    print("---collect bookmark count complete---")
                    return
            except Exception as e:
                print(e)
                print("check your proxy setting")
                print("This is " + str(i + 1) +
                      " attempt to collect bookmark count")
                print("next attempt will start in " + str(FAIL_DELAY) +
                      " sec\n")
                time.sleep(FAIL_DELAY)
        print("---fail to collect bookmark count---")
        # NOTE(review): exiting with 0 on failure looks wrong, but callers'
        # expectations are unknown — kept as-is.
        sys.exit(0)

    def collect(self):
        """Collect bookmarked artwork ids page by page.

        Endpoint:
        https://www.pixiv.net/ajax/user/xxx/illusts/bookmarks?tag=&offset=0&limit=48&rest=show&lang=zh
        covers bookmarks [offset+1, offset+limit]; note that a disabled
        artwork's id is a number, not a str.
        """
        limit = 48  # API default page size, kept as-is
        page_num = (self.num - 1) // limit + 1  # ceil
        pool = ParallelPool(page_num)
        print("---start collecting " + PIXIV_ID + "\'s bookmarks---")

        # store all pages' url in self.group
        self.group = set()
        for i in range(page_num):
            url = self.url + "/bookmarks?tag="
            url = url + "&offset=" + str(i * limit) + "&limit=" + str(limit)
            url = url + "&rest=show&lang=zh"
            self.group.add(url)

        while len(self.group) or not pool.empty():
            time.sleep(THREAD_DELAY)
            # send page to parallel pool
            while not pool.full() and len(self.group):
                pool.add(
                    CollectorUnit(self.group.pop(), self.cookie,
                                  page_selector))
            # remove complete threads
            finished = pool.finished_item()
            while True:
                try:
                    page = next(finished)
                    self.collector.add(page.group)
                    if MOST_OUTPUT:
                        print("--send page " + page.url + " to collector--")
                except StopIteration:
                    break

        print("\n---collecting bookmark complete---")
        print("downloadable artworks: " + str(len(self.collector.group)))

    def run(self):
        """Count bookmarks, collect their ids, then download."""
        self.get_count()
        self.collect()
        self.collector.collect()
        return self.collector.download()