class RankingCrawler():
    """Crawl the Pixiv ranking pages (daily/weekly/...) and download artworks.

    Walks DOMAIN consecutive days starting at START_DATE, fetching the
    ranking JSON for each day/page, and hands the collected illust ids to
    a Collector for download.
    """

    # flow capacity (MB)
    def __init__(self, cookie, capacity=1024):
        self.date = START_DATE
        self.domain = DOMAIN
        self.mode = PIXIV_MODES[PIXIV_MODE]
        # url sample: https://www.pixiv.net/ranking.php?mode=daily&date=20200801&p=1&format=json
        # ref url sample: https://www.pixiv.net/ranking.php?mode=daily&date=20200801
        self.url = 'https://www.pixiv.net/ranking.php?mode=' + self.mode
        self.cookie = cookie
        self.headers = {'x-requested-with': 'XMLHttpRequest'}
        self.collector = Collector(cookie, capacity)

    def __nextday(self):
        # Advance the crawl window by one day.
        self.date += datetime.timedelta(days=1)

    # collect illust_id from daily json
    def collect(self):
        # note that 50 artworks per p=x
        page_num = (ARTWORKS_PER - 1) // 50 + 1  # ceil
        print("---start collecting " + self.mode + " ranking---")
        print("start with " + self.date.strftime("%Y-%m-%d"))
        print("end with " + (self.date + datetime.timedelta(
            days=self.domain - 1)).strftime("%Y-%m-%d" + '\n'))

        # store all jsons' url in self.group
        self.group = set()
        for _i in range(DOMAIN):
            for j in range(page_num):
                self.group.add(self.url + '&date=' +
                               self.date.strftime("%Y%m%d") + '&p=' +
                               str(j + 1) + '&format=json')
            self.__nextday()

        pool = ParallelPool(len(self.group))
        while len(self.group) or not pool.empty():
            time.sleep(THREAD_DELAY)
            # send ranking_json to parallel pool
            while not pool.full() and len(self.group):
                url = self.group.pop()
                ref = re.search('(.*)&p', url).group(1)
                # BUG FIX: dict.update() returns None, so the original code
                # passed headers=None to CollectorUnit and also mutated the
                # shared self.headers. Build a fresh per-request copy instead.
                headers = dict(self.headers)
                headers['Referer'] = ref
                pool.add(
                    CollectorUnit(url, self.cookie, ranking_selector, headers))
            # remove complete thread
            finished = pool.finished_item()
            while True:
                try:
                    ranking_json = next(finished)
                    self.collector.add(ranking_json.group)
                    if MOST_OUTPUT:
                        print("--send page " + ranking_json.url +
                              " to collector--")
                except StopIteration:
                    break
        print("\n---collect " + self.mode + " ranking complete---")

    def run(self):
        """Collect ranking ids, resolve them, and download; returns the
        Collector's download result."""
        self.collect()
        self.collector.collect()
        return self.collector.download()
def test1(self):
    """Feed two Collectors random samples and verify every statistic
    against running sums kept by the test itself."""
    coll1 = Collector()
    coll2 = Collector()
    n = 40
    s1 = s2 = 0  # running sums
    q1 = q2 = 0  # running sums of squares
    for _ in range(n):
        x1 = uniform(5, 15)
        x2 = uniform(20, 30)
        coll1.add(x1)
        coll2.add(x2)
        s1, q1 = s1 + x1, q1 + x1 * x1
        s2, q2 = s2 + x2, q2 + x2 * x2
    # Sums of squares, sums, and counts.
    self.assertAlmostEqual(q1, coll1.sum_squares(), 5)
    self.assertAlmostEqual(q2, coll2.sum_squares(), 5)
    self.assertAlmostEqual(s1, coll1.sum(), 5)
    self.assertEqual(n, coll1.count())
    self.assertAlmostEqual(s2, coll2.sum(), 5)
    self.assertEqual(n, coll2.count())
    # Population standard deviation / variance: E[x^2] - E[x]^2.
    self.assertAlmostEqual(math.sqrt(q1 / n - s1 * s1 / n / n),
                           coll1.standard_deviation(), 5)
    self.assertAlmostEqual(math.sqrt(q2 / n - s2 * s2 / n / n),
                           coll2.standard_deviation(), 5)
    self.assertAlmostEqual(q1 / n - s1 * s1 / n / n, coll1.variance(), 5)
    self.assertAlmostEqual(q2 / n - s2 * s2 / n / n, coll2.variance(), 5)
    # Averages.
    self.assertAlmostEqual(s1 / n, coll1.average(), 5)
    self.assertAlmostEqual(s2 / n, coll2.average(), 5)
def test1(self):
    """Load the fixture data into a Collector and compare every statistic
    with the precomputed expected values stored on the test case."""
    coll = Collector()
    for value in self.data:
        coll.add(value)
    self.assertAlmostEqual(self.stdev, coll.standard_deviation())
    self.assertAlmostEqual(self.var, coll.variance(), 5)
    self.assertAlmostEqual(self.avg, coll.average(), 5)
    self.assertAlmostEqual(self.sumsq, coll.sum_squares(), 5)
    self.assertAlmostEqual(self.sum, coll.sum(), 5)
    self.assertEqual(self.count, coll.count())
class UserCrawler():
    """Download all artworks of a single artist, identified by artist_id.

    capacity: flow capacity (MB) handed to the Collector — TODO confirm
    against Collector's constructor.
    """

    def __init__(self, artist_id, cookie, capacity=1024):
        self.url = 'https://www.pixiv.net/ajax/user/' + artist_id + '/profile/all?lang=zh'
        self.ref = 'https://www.pixiv.net/users/' + artist_id + '/illustrations'
        # Per-request headers: user id plus the referer of the artist page.
        self.headers = {'x-user-id': USER_ID, 'Referer': self.ref}
        self.cookie = cookie
        self.collector = Collector(cookie, capacity)

    def collect(self):
        """Fetch the artist's profile JSON synchronously and pass the
        resulting artwork group to the collector."""
        unit = CollectorUnit(self.url, self.cookie, user_selector, self.headers)
        unit.start()
        unit.join()
        self.collector.add(unit.group)
        print("--send user " + unit.url + " to collector--")

    def run(self):
        self.collect()
        self.collector.collect()
        return self.collector.download()
from collector import Collector

# Demo script: feed a fixed sample into a Collector and print its statistics.
coll = Collector()

data = [
    5, 7, 13, 24, 37, 56, 1001, 251, 300, 55, 4, 8, 14, 76, 15, 1051, 751,
]

# Iterate the values directly instead of the range(len(...)) anti-idiom.
for x in data:
    coll.add(x)

print(f"The list contains {coll.count()} numbers")
print(f"Sum = {coll.sum():15}")
print(f"Sum of Squares = {coll.sum_squares():15}")
print(f"Average = {coll.average():15f}")
print(f"Variance = {coll.variance():15f}")
print(f"Standard Deviation = {coll.standard_deviation():15f}")
# Receive one command from the client and build a reply string.
# BUG FIX: the original bound the received bytes to 'receivedB' but then
# decoded 'data_bytes', which would raise NameError on every request.
data_bytes = conn.recv(2048)
data_string = data_bytes.decode("UTF-8")
print("data_string", data_string)
if len(data_string) == 0:
    reply = "Error: empty command"
else:
    cmd = data_string[0]  # first character
    parameter = data_string[1:]  # the rest
    # Adds the value to the collector
    if cmd == 'A':
        try:
            v = float(parameter)
            collector.add(v)
            reply = "OK " + parameter + " added"
        except ValueError:
            reply = "Error"
    # Asks for the average
    elif cmd == 'a':
        reply = str(collector.average())
    # Asks for the count
    elif cmd == 'c':
        reply = str(collector.count())
    # Asks for the standard deviation
    elif cmd == 'd':
        reply = str(collector.standard_deviation())
    # Asks for the sum of squares
    elif cmd == 'q':
        reply = str(collector.sum_squares())
class BookmarkCrawler():
    """Crawl the logged-in user's public bookmarks and download them.

    count-badge sample:
        </h1><span class="count-badge">3069.</span>
    artworks sample:
        href="/artworks/83348083
    (out-dated) see def get_count & def collect for more
    url sample:
        https://www.pixiv.net/bookmark.php?rest=show&p=1
    rest=show for public / rest=hide for private
    note that 20 artworks per p
    """

    # maxnum: max number of downloads; capacity: flow capacity (MB)
    def __init__(self, cookie, maxnum=200, capacity=1024):
        self.num = maxnum
        self.cookie = cookie
        self.url = "https://www.pixiv.net/ajax/user/" + USER_ID + "/illusts"
        self.headers = BROWSER_HEADER
        self.collect_cnt = 0
        self.collector = Collector(cookie, capacity)

    # get count-badge
    # https://www.pixiv.net/ajax/user/xxxx/illusts/bookmark/tags?lang=zh
    def get_count(self):
        """Fetch the total public bookmark count and clamp self.num to it.

        Retries up to FAIL_TIMES; exits the process on total failure.
        """
        count_url = self.url + "/bookmark/tags?lang=zh"
        print("---start collecting bookmark count---")
        for i in range(FAIL_TIMES):
            try:
                response = requests.get(count_url,
                                        headers=self.headers,
                                        proxies=PROXIES,
                                        cookies=self.cookie,
                                        timeout=4)
                if response.status_code == 200:
                    # cnt = re.search('count-badge.*?(\d+)',
                    #                 response.text).group(1)
                    cnt = response.json()['body']['public'][0]['cnt']
                    self.num = min(self.num, int(cnt))
                    # BUG FIX: cnt comes from JSON and may be an int;
                    # concatenating it to a str would raise TypeError.
                    print("total count: " + str(cnt))
                    print("download count: " + str(self.num))
                    print("---collect bookmark count complete---")
                    return
            except Exception as e:
                print(e)
                print("check your proxy setting")
                # print("maybe it was banned.")
            print("This is " + str(i + 1) +
                  " attempt to collect bookmark count")
            print("next attempt will start in " + str(FAIL_DELAY) + " sec\n")
            time.sleep(FAIL_DELAY)
        print("---fail to collect bookmark count---")
        # BUG FIX: exit with a non-zero status on failure so shell callers
        # can detect it (the original exited with 0).
        sys.exit(1)

    # collect bookmark
    # https://www.pixiv.net/ajax/user/xxx/illusts/bookmarks?tag=&offset=0&limit=48&rest=show&lang=zh
    # [offset+1, offset+limit]
    # note that disable artwork'id is num not str...
    def collect(self):
        """Enumerate all bookmark pages and feed them to the collector."""
        # default page size is 48, I just keep it.
        limit = 48
        page_num = (self.num - 1) // limit + 1  # ceil
        pool = ParallelPool(page_num)
        print("---start collecting " + PIXIV_ID + "\'s bookmarks---")

        # store all pages' url in self.group
        self.group = set()
        for i in range(page_num):
            url = self.url + "/bookmarks?tag="
            url = url + "&offset=" + str(i * limit) + "&limit=" + str(limit)
            url = url + "&rest=show&lang=zh"
            self.group.add(url)

        while len(self.group) or not pool.empty():
            time.sleep(THREAD_DELAY)
            # send page to parallel pool
            while not pool.full() and len(self.group):
                pool.add(
                    CollectorUnit(self.group.pop(), self.cookie,
                                  page_selector))
            # remove complete thread
            finished = pool.finished_item()
            while True:
                try:
                    page = next(finished)
                    self.collector.add(page.group)
                    if MOST_OUTPUT:
                        print("--send page " + page.url + " to collector--")
                except StopIteration:
                    break
        print("\n---collecting bookmark complete---")
        print("downloadable artworks: " + str(len(self.collector.group)))

    def run(self):
        self.get_count()
        self.collect()
        self.collector.collect()
        return self.collector.download()