Example #1
0
def test_collector_download_destination_path_does_not_exist():
    """Download into a local directory that does not exist; the collector must cope."""
    stamp = datetime.datetime.now().strftime("%Y%m%d.%H")
    label = "#" + "M.U1" + "." + stamp + "#"
    archive = "ia.icama." + label + ".zip"
    collector = Collector()
    collector.connect(host="192.168.179.130", username="******", password="******")
    collector.download(path="/home/fastfile/ewsd",
                       filename=archive,
                       destination_path="c:\\Zdenek\\_tmp\\nonexistant\\")
Example #2
0
def test_collector_download_invalid_request():
    """Calling download() with all-None arguments must be handled gracefully.

    Fix: the original computed a timestamped filename that was never used
    (every argument passed below is None) — the dead locals are removed.
    """
    collector = Collector()
    collector.connect(host="192.168.179.130", username="******", password="******")
    collector.download(path=None, filename=None, destination_path=None)
class RankingCrawler():
    """Crawl the pixiv ranking listing and feed artwork ids to a Collector.

    Walks DOMAIN consecutive days starting at START_DATE, fetching the
    ranking JSON pages for the configured mode through a thread pool.
    """

    def __init__(self, cookie, capacity=1024):
        """cookie: session cookie dict; capacity: flow capacity (MB) for Collector."""
        self.date = START_DATE
        self.domain = DOMAIN
        self.mode = PIXIV_MODES[PIXIV_MODE]
        # url sample: https://www.pixiv.net/ranking.php?mode=daily&date=20200801&p=1&format=json
        # ref url sample: https://www.pixiv.net/ranking.php?mode=daily&date=20200801
        self.url = 'https://www.pixiv.net/ranking.php?mode=' + self.mode
        self.cookie = cookie
        self.headers = {'x-requested-with': 'XMLHttpRequest'}
        self.collector = Collector(cookie, capacity)

    def __nextday(self):
        """Advance the crawl cursor by one day."""
        self.date += datetime.timedelta(days=1)

    # collect illust_id from daily json
    def collect(self):
        """Enumerate every ranking-JSON url and fetch them via a ParallelPool."""
        # note that 50 artworks per p=x
        page_num = (ARTWORKS_PER - 1) // 50 + 1  # ceil
        print("---start collecting " + self.mode + " ranking---")
        print("start with " + self.date.strftime("%Y-%m-%d"))
        print("end with " + (self.date + datetime.timedelta(
            days=self.domain - 1)).strftime("%Y-%m-%d") + '\n')
        # store all jsons' url in self.group
        self.group = set()
        for _i in range(DOMAIN):
            for j in range(page_num):
                self.group.add(self.url + '&date=' +
                               self.date.strftime("%Y%m%d") + '&p=' +
                               str(j + 1) + '&format=json')
            self.__nextday()
        pool = ParallelPool(len(self.group))
        while len(self.group) or not pool.empty():
            time.sleep(THREAD_DELAY)
            # send ranking_json to parallel pool
            while not pool.full() and len(self.group):
                url = self.group.pop()
                ref = re.search(r'(.*)&p', url).group(1)
                # BUG FIX: dict.update() returns None, so the original passed
                # headers=None to CollectorUnit (and mutated the shared
                # self.headers as a side effect). Build a per-request copy
                # that actually carries the Referer.
                headers = dict(self.headers, Referer=ref)
                pool.add(
                    CollectorUnit(url, self.cookie, ranking_selector, headers))
            # remove complete thread
            finished = pool.finished_item()
            while True:
                try:
                    ranking_json = next(finished)
                    self.collector.add(ranking_json.group)
                    if MOST_OUTPUT:
                        print("--send page " + ranking_json.url +
                              " to collector--")
                except StopIteration:
                    break

        print("\n---collect " + self.mode + " ranking complete---")

    def run(self):
        """Collect ranking ids, resolve them, and download; returns download result."""
        self.collect()
        self.collector.collect()
        return self.collector.download()
class UserCrawler():
    """Crawl one artist's full profile and hand the artwork ids to a Collector."""

    def __init__(self, artist_id, cookie, capacity=1024):
        """artist_id: pixiv user id string; capacity: flow capacity (MB)."""
        self.url = 'https://www.pixiv.net/ajax/user/' + artist_id + '/profile/all?lang=zh'
        self.ref = 'https://www.pixiv.net/users/' + artist_id + '/illustrations'
        # Referer plus the requesting user's id, as the ajax endpoint expects.
        self.headers = {'x-user-id': USER_ID, 'Referer': self.ref}
        self.cookie = cookie
        self.collector = Collector(cookie, capacity)

    def collect(self):
        """Fetch the profile JSON synchronously and forward its artwork group."""
        unit = CollectorUnit(self.url, self.cookie, user_selector, self.headers)
        unit.start()
        unit.join()
        self.collector.add(unit.group)
        print("--send user " + unit.url + " to collector--")

    def run(self):
        """Collect, resolve, and download; returns the Collector's result."""
        self.collect()
        self.collector.collect()
        return self.collector.download()
Example #5
0
def test_collector_download_file_does_not_exist():
    """Request a remote file that is absent; download() must handle it."""
    collector = Collector()
    collector.connect(host="192.168.179.130", username="******", password="******")
    collector.download(path="/home/fastfile/ewsd",
                       filename="nonexistant",
                       destination_path="c:\\Zdenek\\_tmp\\")
class BookmarkCrawler():
    """Crawl the logged-in user's public bookmarks and download them.

    First asks the bookmark-tags endpoint for the total bookmark count,
    then pages through the bookmarks ajax API feeding artwork ids to a
    Collector.
    """
    # count-badge sample:
    #   </h1><span class="count-badge">3069.</span>
    # artworks sample:
    #   href="/artworks/83348083

    # (out-dated) see def get_count & def collect for more
    # url sample:
    #   https://www.pixiv.net/bookmark.php?rest=show&p=1
    #   rest=show for public/ rest=hide for private
    #   note that 20 artworks per p

    # max number of downloads
    # flow capacity
    def __init__(self, cookie, maxnum=200, capacity=1024):
        """cookie: session cookie; maxnum: download cap; capacity: flow capacity (MB)."""
        self.num = maxnum
        self.cookie = cookie
        self.url = "https://www.pixiv.net/ajax/user/" + USER_ID + "/illusts"
        self.headers = BROWSER_HEADER
        self.collect_cnt = 0
        self.collector = Collector(cookie, capacity)

    # get count-badge
    # https://www.pixiv.net/ajax/user/xxxx/illusts/bookmark/tags?lang=zh
    def get_count(self):
        """Fetch the public bookmark count and clamp self.num to it.

        Retries up to FAIL_TIMES with FAIL_DELAY between attempts and
        exits the process on total failure.
        """
        count_url = self.url + "/bookmark/tags?lang=zh"
        print("---start collecting bookmark count---")

        for i in range(FAIL_TIMES):
            try:
                response = requests.get(count_url,
                                        headers=self.headers,
                                        proxies=PROXIES,
                                        cookies=self.cookie,
                                        timeout=4)
                if response.status_code == 200:
                    # cnt = re.search('count-badge.*?(\d+)',
                    #                 response.text).group(1)
                    cnt = response.json()['body']['public'][0]['cnt']
                    self.num = min(self.num, int(cnt))
                    # BUG FIX: the JSON 'cnt' may be a number, not a string;
                    # "total count: " + cnt would then raise TypeError, be
                    # swallowed by the except below, and waste every retry.
                    print("total count: " + str(cnt))
                    print("download count: " + str(self.num))
                    print("---collect bookmark count complete---")
                    return

            except Exception as e:
                print(e)
                print("check your proxy setting")
                # print("maybe it was banned.")
                print("This is " + str(i + 1) +
                      " attempt to collect bookmark count")
                print("next attempt will start in " + str(FAIL_DELAY) +
                      " sec\n")
                time.sleep(FAIL_DELAY)

        print("---fail to collect bookmark count---")
        sys.exit(0)

    # collect bookmark
    # https://www.pixiv.net/ajax/user/xxx/illusts/bookmarks?tag=&offset=0&limit=48&rest=show&lang=zh
    # [offset+1,offset+limit]
    # note that disable artwork'id is num not str...
    def collect(self):
        """Page through the bookmarks API and send each page to the collector."""
        # default is 48, I just keep it.
        limit = 48
        page_num = (self.num - 1) // limit + 1  # ceil(self.num / limit)
        pool = ParallelPool(page_num)
        print("---start collecting " + PIXIV_ID + "\'s bookmarks---")
        # store all pages' url in self.group
        self.group = set()
        for i in range(page_num):
            url = self.url + "/bookmarks?tag="
            url = url + "&offset=" + str(i * limit) + "&limit=" + str(limit)
            url = url + "&rest=show&lang=zh"
            self.group.add(url)

        while len(self.group) or not pool.empty():
            time.sleep(THREAD_DELAY)
            # send page to parallel pool
            while not pool.full() and len(self.group):
                pool.add(
                    CollectorUnit(self.group.pop(), self.cookie,
                                  page_selector))
            # remove complete thread
            finished = pool.finished_item()
            while True:
                try:
                    page = next(finished)
                    self.collector.add(page.group)
                    if MOST_OUTPUT:
                        print("--send page " + page.url + " to collector--")
                except StopIteration:
                    break

        print("\n---collecting bookmark complete---")
        print("downloadable artworks: " + str(len(self.collector.group)))

    def run(self):
        """Count, collect, resolve, and download; returns the download result."""
        self.get_count()
        self.collect()
        self.collector.collect()
        return self.collector.download()