コード例 #1
0
    def scrape(self, username):
        """Collect download tasks for the photos on a user's media page.

        Connects to ``<base_url>/<username>/media``, calls
        ``self.scrollToBottom()``, parses ``self.source()`` as HTML and reads
        the ``data-image-url`` attribute of every
        ``div.AdaptiveMedia-photoContainer`` element.

        Args:
            username: Account name inserted into the media-page URL and copied
                into every returned task.

        Returns:
            A list of ``(image_url + ':large', username, filename)`` tuples,
            where ``filename`` is ``get_filename(image_url)``. Containers
            without a ``data-image-url`` attribute are skipped.
        """
        self._connect('{}/{}/media'.format(self.base_url, username))

        if self._mode != 'silent':
            print('Crawling...')

        # The return value was never used; the call is kept for its effect.
        self.scrollToBottom()

        soup = bs(self.source(), 'html.parser')

        tasks = []
        for div in soup.find_all('div', {"class": "AdaptiveMedia-photoContainer"}):
            url = div.get('data-image-url')
            if not url:
                # `.get` returns None when the attribute is absent, and
                # `None + ':large'` would raise TypeError and abort the crawl.
                continue
            tasks.append((url + ':large', username, get_filename(url)))

        if self._mode != 'silent':
            print('{} media are found.'.format(len(tasks)))

        return tasks
コード例 #2
0
    def scrape(self, username):
        """Collect download tasks for photos and videos on a user's media page.

        Connects to ``<base_url>/<username>/media``, calls
        ``self.scrollToBottom()`` and parses ``self.source()`` as HTML. Each
        matched stream ``<li>`` contributes either its photo containers or,
        when it has none, the image/video pair returned by
        ``get_twitter_video_url``. A final page-wide pass picks up any photo
        container that was not reached through a stream item.

        Args:
            username: Account name inserted into the media-page URL and copied
                into every returned task.

        Returns:
            A list of unique ``(download_url, username, basename)`` tuples in
            first-seen order. Photo and video-thumbnail URLs carry a
            ``':large'`` suffix; ``basename`` is
            ``get_basename(get_filename(url))`` of the un-suffixed URL.
        """
        self._connect('{}/{}/media'.format(self.base_url, username))

        if self._mode != 'silent':
            print('Crawling...')

        # The return value was never used; the call is kept for its effect.
        self.scrollToBottom()

        soup = bs(self.source(), 'html.parser')

        photo_attrs = {"class": "AdaptiveMedia-photoContainer"}
        tasks = []
        seen = set()

        def add_task(download_url, source_url):
            # A photo inside a stream item is also found by the page-wide pass
            # below; `seen` keeps it from being queued twice.
            task = (download_url, username, get_basename(get_filename(source_url)))
            if task not in seen:
                seen.add(task)
                tasks.append(task)

        def add_photo(container):
            url = container.get('data-image-url')
            if url:  # skip containers that lack the attribute
                add_task(url + ':large', url)

        # NOTE(review): the trailing space in this class string is kept
        # verbatim from the original; confirm it matches the page markup.
        for li in soup.find_all('li', {'class': 'js-stream-item stream-item stream-item '}):
            photos = li.find_all('div', photo_attrs)
            if photos:
                for photo in photos:
                    add_photo(photo)
                continue

            # No photo containers: treat the item as a video tweet.
            try:
                img_url, vid_url = get_twitter_video_url(li['data-item-id'])
                add_task(img_url + ':large', img_url)
                add_task(vid_url, vid_url)
            except Exception as e:
                # Best effort: one bad item must not stop the crawl. Append so
                # earlier failures are not overwritten by later ones.
                with open('error.txt', 'a', encoding='utf-8') as f:
                    f.write(str(e) + '\n')
                    f.write(str(li) + '\n')

        for div in soup.find_all('div', photo_attrs):
            add_photo(div)

        if self._mode != 'silent':
            print('{} media are found.'.format(len(tasks)))

        return tasks
コード例 #3
0
    def scrape(self, id, content_type='all'):
        """Visit every listing page of a pixiv member's works (work in progress).

        Connects to ``self.post_url + id``, reads the page count from the last
        entry of the ``page-list`` pager, then opens each
        ``member_illust.php`` listing page in turn, pausing
        ``self._next_page_pause_time`` seconds after each one.

        Args:
            id: Member id. It is concatenated to ``self.post_url``, so it must
                be a string. Also stored on ``self.id``.
            content_type: Value for the ``type`` query parameter; stored on
                ``self.type``.

        Returns:
            None. Scraping the individual posts is not implemented yet, so no
            task list is produced.
        """
        self._connect(self.post_url + id)
        self.id = id
        self.type = content_type

        verbose = self._mode != 'silent'
        if verbose:
            print('Crawling...')

        # Read the number of listing pages from the last pager entry.
        # NOTE(review): Selenium's WebDriver exposes `find_element(s)_by_*`,
        # not `get_element_by_*`; confirm `self._driver` really provides
        # these methods and that the tag-name lookup returns a sequence.
        pager_container = self._driver.get_element_by_class_name('page-list')
        last_pager = pager_container.get_element_by_tag_name('li')[-1]
        num_page = int(last_pager.get_element_by_tag_name('a').text)
        if verbose:
            print('# of page: {}'.format(num_page))

        # Crawl each listing page.
        for p in range(1, num_page + 1):
            url = 'https://www.pixiv.net/member_illust.php?id={}&type={}&p={}'.format(self.id, self.type, p)
            # NOTE(review): the first navigation uses `self._connect`; confirm
            # `self._driver._connect` is the intended call here.
            self._driver._connect(url)
            time.sleep(self._next_page_pause_time)
            if verbose:
                print(url)

        # TODO: scrape each post. The code that used to follow the `return`
        # was unreachable and has been removed.
        return