class RankingCrawler():
    # flow capacity (MB)
    def __init__(self, cookie, capacity=1024):
        """Crawl the pixiv ranking JSON pages, starting at START_DATE and
        spanning DOMAIN consecutive days.

        cookie: authenticated pixiv cookie mapping, forwarded to each request.
        capacity: download flow capacity in MB, forwarded to Collector.
        """
        self.date = START_DATE
        self.domain = DOMAIN
        self.mode = PIXIV_MODES[PIXIV_MODE]
        # url sample: https://www.pixiv.net/ranking.php?mode=daily&date=20200801&p=1&format=json
        # ref url sample: https://www.pixiv.net/ranking.php?mode=daily&date=20200801
        self.url = 'https://www.pixiv.net/ranking.php?mode=' + self.mode
        self.cookie = cookie
        self.headers = {'x-requested-with': 'XMLHttpRequest'}
        self.collector = Collector(cookie, capacity)

    def __nextday(self):
        # advance the ranking date by one calendar day
        self.date += datetime.timedelta(days=1)

    # collect illust_id from daily json
    def collect(self):
        """Build one JSON url per (day, page), fetch them through the
        parallel pool, and feed each finished page to the collector."""
        # note that 50 artworks per p=x
        page_num = (ARTWORKS_PER - 1) // 50 + 1  # ceil(ARTWORKS_PER / 50)
        print("---start collecting " + self.mode + " ranking---")
        print("start with " + self.date.strftime("%Y-%m-%d"))
        print("end with " + (self.date + datetime.timedelta(
            days=self.domain - 1)).strftime("%Y-%m-%d") + '\n')
        # store all jsons' url in self.group
        self.group = set()
        # use self.domain (set from DOMAIN in __init__) for consistency
        for _day in range(self.domain):
            for page in range(page_num):
                self.group.add(self.url + '&date=' +
                               self.date.strftime("%Y%m%d") + '&p=' +
                               str(page + 1) + '&format=json')
            self.__nextday()
        pool = ParallelPool(len(self.group))
        while len(self.group) or not pool.empty():
            time.sleep(THREAD_DELAY)
            # send ranking_json to parallel pool
            while not pool.full() and len(self.group):
                url = self.group.pop()
                ref = re.search('(.*)&p', url).group(1)
                # BUG FIX: dict.update() returns None, so the original passed
                # headers=None to CollectorUnit and mutated the shared
                # self.headers instead. Build a fresh per-request dict.
                headers = dict(self.headers)
                headers['Referer'] = ref
                pool.add(
                    CollectorUnit(url, self.cookie, ranking_selector, headers))
            # remove complete thread
            finished = pool.finished_item()
            while True:
                try:
                    ranking_json = next(finished)
                    self.collector.add(ranking_json.group)
                    if MOST_OUTPUT:
                        print("--send page " + ranking_json.url +
                              " to collector--")
                except StopIteration:
                    break

        print("\n---collect " + self.mode + " ranking complete---")

    def run(self):
        """Full pipeline: gather page urls, resolve artwork ids, download."""
        self.collect()
        self.collector.collect()
        return self.collector.download()
# --- Exemple #2 (pasted-snippet marker from the original scrape) ---
    def test1(self):
        """Stream uniform random samples into two collectors and cross-check
        every running statistic against hand-maintained accumulators."""
        low_coll = Collector()
        high_coll = Collector()
        n = 40
        sum_lo = 0
        sum_hi = 0
        sq_lo = 0
        sq_hi = 0
        for i in range(n):
            lo = uniform(5, 15)
            hi = uniform(20, 30)
            low_coll.add(lo)
            high_coll.add(hi)
            sum_lo += lo
            sum_hi += hi
            sq_lo += lo * lo
            sq_hi += hi * hi
            # running checks after every insertion
            self.assertAlmostEqual(sq_lo, low_coll.sum_squares(), 5)
            self.assertAlmostEqual(sq_hi, high_coll.sum_squares(), 5)
            self.assertAlmostEqual(sum_lo, low_coll.sum(), 5)
            self.assertEqual(i + 1, low_coll.count())
            self.assertAlmostEqual(sum_hi, high_coll.sum(), 5)
            self.assertEqual(i + 1, high_coll.count())

        # final checks: population variance = E[x^2] - E[x]^2
        self.assertAlmostEqual(math.sqrt(sq_lo / n - sum_lo * sum_lo / n / n),
                               low_coll.standard_deviation(), 5)
        self.assertAlmostEqual(math.sqrt(sq_hi / n - sum_hi * sum_hi / n / n),
                               high_coll.standard_deviation(), 5)
        self.assertAlmostEqual(sq_lo / n - sum_lo * sum_lo / n / n,
                               low_coll.variance(), 5)
        self.assertAlmostEqual(sq_hi / n - sum_hi * sum_hi / n / n,
                               high_coll.variance(), 5)
        self.assertAlmostEqual(sum_lo / n, low_coll.average(), 5)
        self.assertAlmostEqual(sum_hi / n, high_coll.average(), 5)
# --- Exemple #3 (pasted-snippet marker from the original scrape) ---
 def test1(self):
     """Replay the fixture data through a Collector and compare each
     statistic against the precomputed expected values on the fixture."""
     stats = Collector()
     for sample in self.data:
         stats.add(sample)
     self.assertAlmostEqual(self.stdev, stats.standard_deviation())
     self.assertAlmostEqual(self.var, stats.variance(), 5)
     self.assertAlmostEqual(self.avg, stats.average(), 5)
     self.assertAlmostEqual(self.sumsq, stats.sum_squares(), 5)
     self.assertAlmostEqual(self.sum, stats.sum(), 5)
     self.assertEqual(self.count, stats.count())
class UserCrawler():
    """Crawl all artworks of one pixiv artist via the profile ajax API."""

    def __init__(self, artist_id, cookie, capacity=1024):
        """artist_id: numeric pixiv user id as a string.
        cookie: authenticated cookie mapping forwarded to requests.
        capacity: flow capacity in MB, forwarded to Collector."""
        self.url = ('https://www.pixiv.net/ajax/user/' + artist_id +
                    '/profile/all?lang=zh')
        self.ref = ('https://www.pixiv.net/users/' + artist_id +
                    '/illustrations')
        # referer must point at the artist's illustration page
        self.headers = {'x-user-id': USER_ID, 'Referer': self.ref}
        self.cookie = cookie
        self.collector = Collector(cookie, capacity)

    def collect(self):
        """Fetch the profile JSON synchronously and hand the resulting
        artwork group to the collector."""
        unit = CollectorUnit(self.url, self.cookie, user_selector,
                             self.headers)
        unit.start()
        unit.join()
        self.collector.add(unit.group)
        print("--send user " + unit.url + " to collector--")

    def run(self):
        """Full pipeline: collect ids, resolve metadata, download files."""
        self.collect()
        self.collector.collect()
        return self.collector.download()
from collector import Collector

# Demo script: feed a fixed list of numbers into a Collector and print
# every statistic it maintains.
coll = Collector()
data = [
    5,
    7,
    13,
    24,
    37,
    56,
    1001,
    251,
    300,
    55,
    4,
    8,
    14,
    76,
    15,
    1051,
    751,
]
# iterate the values directly instead of indexing via range(len(...))
for value in data:
    coll.add(value)
print(f"The list contains {coll.count()} numbers")
print(f"Sum                = {coll.sum():15}")
print(f"Sum of Squares     = {coll.sum_squares():15}")
print(f"Average            = {coll.average():15f}")
print(f"Variance           = {coll.variance():15f}")
print(f"Standard Deviation = {coll.standard_deviation():15f}")
# --- Exemple #6 (pasted-snippet marker from the original scrape) ---
        # NOTE(review): fragment — the enclosing function and receive loop
        # start before this snippet; presumably receivedB is accumulated into
        # data_bytes by code outside this view — confirm against the full file.
        receivedB = conn.recv(2048)

    data_string = data_bytes.decode("UTF-8")

    print("data_string", data_string)

    # empty payload -> error reply; otherwise the first character selects
    # the command and the remainder is its parameter
    if len(data_string) == 0:
        reply = "Error: empty command"
    else:
        cmd = data_string[0]            # first character
        parameter = data_string[1:]     # the rest
        # Adds the value to the collector
        if cmd == 'A':
            try:
                v = float(parameter)
                collector.add(v)
                reply = "OK " + parameter + " added"
            except ValueError as ve:
                # non-numeric parameter: generic error reply
                reply = "Error"
        # Asks for the average
        elif cmd == 'a':
            reply = str(collector.average())
        # Asks for the count
        elif cmd == 'c':
            reply = str(collector.count())
        # Asks for the standard deviation
        elif cmd == 'd':
            reply = str(collector.standard_deviation())
        # Asks for the sum of squares
        elif cmd == 'q':
            reply = str(collector.sum_squares())
class BookmarkCrawler():
    """Crawl the logged-in user's public bookmarks via the pixiv ajax API."""

    # count-badge sample:
    #   </h1><span class="count-badge">3069.</span>
    # artworks sample:
    #   href="/artworks/83348083

    # (out-dated) see def get_count & def collect for more
    # url sample:
    #   https://www.pixiv.net/bookmark.php?rest=show&p=1
    #   rest=show for public/ rest=hide for private
    #   note that 20 artworks per p

    # maxnum: max number of downloads
    # capacity: flow capacity (MB)
    def __init__(self, cookie, maxnum=200, capacity=1024):
        self.num = maxnum
        self.cookie = cookie
        self.url = "https://www.pixiv.net/ajax/user/" + USER_ID + "/illusts"
        self.headers = BROWSER_HEADER
        self.collect_cnt = 0
        self.collector = Collector(cookie, capacity)

    # get count-badge
    # https://www.pixiv.net/ajax/user/xxxx/illusts/bookmark/tags?lang=zh
    def get_count(self):
        """Query the bookmark-tags endpoint and clamp self.num to the actual
        number of public bookmarks. Retries FAIL_TIMES times, then exits the
        process with a failure status."""
        count_url = self.url + "/bookmark/tags?lang=zh"
        print("---start collecting bookmark count---")

        for i in range(FAIL_TIMES):
            try:
                response = requests.get(count_url,
                                        headers=self.headers,
                                        proxies=PROXIES,
                                        cookies=self.cookie,
                                        timeout=4)
                if response.status_code == 200:
                    # cnt = re.search('count-badge.*?(\d+)',
                    #                 response.text).group(1)
                    cnt = response.json()['body']['public'][0]['cnt']
                    self.num = min(self.num, int(cnt))
                    # BUG FIX: cnt comes from parsed JSON and may be an int;
                    # the original "total count: " + cnt raised TypeError
                    # in that case. Coerce explicitly.
                    print("total count: " + str(cnt))
                    print("download count: " + str(self.num))
                    print("---collect bookmark count complete---")
                    return

            except Exception as e:
                print(e)
                print("check your proxy setting")
                # print("maybe it was banned.")
                print("This is " + str(i + 1) +
                      " attempt to collect bookmark count")
                print("next attempt will start in " + str(FAIL_DELAY) +
                      " sec\n")
                time.sleep(FAIL_DELAY)

        print("---fail to collect bookmark count---")
        # BUG FIX: exit non-zero so shells/callers see this as a failure
        # (the original exited with status 0 on failure).
        sys.exit(1)

    # collect bookmark
    # https://www.pixiv.net/ajax/user/xxx/illusts/bookmarks?tag=&offset=0&limit=48&rest=show&lang=zh
    # [offset+1,offset+limit]
    # note that disable artwork'id is num not str...
    def collect(self):
        """Build one bookmarks-page url per offset window, fetch them through
        the parallel pool and feed finished pages to the collector."""
        # default is 48, I just keep it.
        limit = 48
        page_num = (self.num - 1) // limit + 1  # ceil(self.num / limit)
        pool = ParallelPool(page_num)
        print("---start collecting " + PIXIV_ID + "\'s bookmarks---")
        # store all pages' url in self.group
        self.group = set()
        for i in range(page_num):
            url = self.url + "/bookmarks?tag="
            url = url + "&offset=" + str(i * limit) + "&limit=" + str(limit)
            url = url + "&rest=show&lang=zh"
            self.group.add(url)

        while len(self.group) or not pool.empty():
            time.sleep(THREAD_DELAY)
            # send page to parallel pool
            while not pool.full() and len(self.group):
                pool.add(
                    CollectorUnit(self.group.pop(), self.cookie,
                                  page_selector))
            # remove complete thread
            finished = pool.finished_item()
            while True:
                try:
                    page = next(finished)
                    self.collector.add(page.group)
                    if MOST_OUTPUT:
                        print("--send page " + page.url + " to collector--")
                except StopIteration:
                    break

        print("\n---collecting bookmark complete---")
        print("downloadable artworks: " + str(len(self.collector.group)))

    def run(self):
        """Full pipeline: probe the bookmark count, collect page urls,
        resolve artwork ids, then download."""
        self.get_count()
        self.collect()
        self.collector.collect()
        return self.collector.download()