def _ParseGalleryPage(self, html, url_base):
    
    definitely_no_more_pages = False
    
    urls = []
    
    soup = ClientParsing.GetSoup(html)
    
    thumbnail_links = soup.find_all(class_='work')
    
    thumbnail_urls = [thumbnail_link['href'] for thumbnail_link in thumbnail_links]
    
    for thumbnail_url in thumbnail_urls:
        
        url = urlparse.urljoin(url_base, thumbnail_url) # http://www.pixiv.net/member_illust.php?mode=medium&illust_id=33500690
        
        urls.append(url)
        
    urls_and_tags = [(url, set()) for url in urls]
    
    return (urls_and_tags, definitely_no_more_pages)
def _ParseImagePage( self, html, url_base ):
    
    soup = ClientParsing.GetSoup( html )
    
    tags = set()
    
    author_links = soup.find( 'ul', class_ = 'authorlinks' )
    
    if author_links is not None:
        
        authors = set()
        
        links = author_links.find_all( 'a' )
        
        for link in links:
            
            try:
                
                href = link[ 'href' ] # http://warlord-of-noodles.newgrounds.com
                
                creator = href.replace( 'http://', '' ).replace( '.newgrounds.com', '' )
                
                tags.add( u'creator:' + creator )
                
            except: pass
            
    try:
        
        title = soup.find( 'title' )
        
        tags.add( u'title:' + title.string )
        
    except: pass
    
    all_links = soup.find_all( 'a' )
    
    for link in all_links:
        
        try:
            
            href = link[ 'href' ]
            
            if '/browse/tag/' in href:
                
                tags.add( link.string )
                
        except: pass
        
    #
    
    flash_url = html.split( '"http:\/\/uploads.ungrounded.net\/', 1 )[1]
    
    flash_url = flash_url.split( '"', 1 )[0]
    
    flash_url = flash_url.replace( "\/", '/' )
    
    flash_url = 'http://uploads.ungrounded.net/' + flash_url
    
    return ( flash_url, tags )
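# A minimal sketch of the flash_url extraction above, against a hypothetical scrap of
# newgrounds page html (the page embeds the url with js-escaped slashes):
#
#   html = '... "http:\/\/uploads.ungrounded.net\/123000\/123456_flash.swf" ...'
#
#   html.split( '"http:\/\/uploads.ungrounded.net\/', 1 )[1]  ->  '123000\/123456_flash.swf" ...'
#   .split( '"', 1 )[0]                                       ->  '123000\/123456_flash.swf'
#   .replace( "\/", '/' )                                     ->  '123000/123456_flash.swf'
#
# prepending the host then gives http://uploads.ungrounded.net/123000/123456_flash.swf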
def _ParseGalleryPage( self, html, url_base ):
    
    definitely_no_more_pages = False
    
    urls_set = set()
    
    soup = ClientParsing.GetSoup( html )
    
    def correct_url( href ):
        
        if href is None:
            
            return False
            
        # a good url is in the form "/pictures/user/artist_name/file_id/title"
        
        if href.count( '/' ) == 5 and href.startswith( '/pictures/user/' ):
            
            ( nothing, pictures, user, artist_name, file_id, title ) = href.split( '/' )
            
            # /pictures/user/artist_name/page/3
            
            if file_id != 'page':
                
                return True
                
        return False
        
    urls = []
    
    links = soup.find_all( 'a', href = correct_url )
    
    for link in links:
        
        url = 'http://www.hentai-foundry.com' + link[ 'href' ]
        
        if url not in urls_set:
            
            urls_set.add( url )
            
            urls.append( url )
            
    # this is copied from old code. surely we can improve it?
    
    if 'class="next"' not in html:
        
        definitely_no_more_pages = True
        
    urls_and_tags = [ ( url, set() ) for url in urls ]
    
    return ( urls_and_tags, definitely_no_more_pages )
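# A quick sketch of what correct_url accepts, using hypothetical hrefs:
#
#   correct_url( '/pictures/user/SomeArtist/440257/Some-Title' )  ->  True   (five slashes, file id is not 'page')
#   correct_url( '/pictures/user/SomeArtist/page/3' )             ->  False  (pagination link)
#   correct_url( '/user/SomeArtist/profile' )                     ->  False  (wrong prefix and shape)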
def _ParseImagePage(self, html, page_url):
    
    if 'member_illust.php?mode=manga' in html:
        
        manga_url = page_url.replace('medium', 'manga')
        
        raise HydrusExceptions.MimeException(page_url + ' was manga, not a single image, so could not be downloaded.')
        
    if 'member_illust.php?mode=ugoira_view' in html:
        
        raise HydrusExceptions.MimeException(page_url + ' was ugoira, not a single image, so could not be downloaded.')
        
    soup = ClientParsing.GetSoup(html)
    
    #
    
    original_image = soup.find(class_='original-image')
    
    image_url = original_image['data-src'] # http://i3.pixiv.net/img-original/img/2014/01/25/19/21/56/41171994_p0.jpg
    
    #
    
    tags_parent = soup.find('section', class_='work-tags')
    
    # <a href="/search.php?s_mode=s_tag_full&word=%E3%83%8F%E3%83%B3%E3%83%89%E3%83%A1%E3%82%A4%E3%83%89" class="text">[unicode tag here]</a>
    
    tags = [link.string for link in tags_parent.find_all('a', class_='text')]
    
    user = soup.find('h1', class_='user')
    
    if user is not None:
        
        tags.append('creator:' + user.string)
        
    title_parent = soup.find('section', class_=re.compile('work-info'))
    
    if title_parent is not None:
        
        title = title_parent.find('h1', class_='title')
        
        if title is not None:
            
            tags.append('title:' + title.string)
            
    return (image_url, tags)
def LoginPixiv(self, network_context, pixiv_id, password):
    
    session = self.engine.session_manager.GetSession(network_context)
    
    response = session.get('https://accounts.pixiv.net/login')
    
    soup = ClientParsing.GetSoup(response.content)
    
    # some whopping 20kb bit of json tucked inside a hidden form input wew lad
    
    i = soup.find('input', id='init-config')
    
    raw_json = i['value']
    
    j = json.loads(raw_json)
    
    if 'pixivAccount.postKey' not in j:
        
        raise HydrusExceptions.ForbiddenException('When trying to log into Pixiv, I could not find the POST key! This is a problem with hydrus\'s pixiv parsing, not your login! Please contact hydrus dev!')
        
    post_key = j['pixivAccount.postKey']
    
    form_fields = {}
    
    form_fields['pixiv_id'] = pixiv_id
    form_fields['password'] = password
    form_fields['captcha'] = ''
    form_fields['g_recaptcha_response'] = ''
    form_fields['return_to'] = 'https://www.pixiv.net'
    form_fields['lang'] = 'en'
    form_fields['post_key'] = post_key
    form_fields['source'] = 'pc'
    
    headers = {}
    
    headers['referer'] = 'https://accounts.pixiv.net/login?lang=en&source=pc&view_type=page&ref=wwwtop_accounts_index'
    headers['origin'] = 'https://accounts.pixiv.net'
    
    session.post('https://accounts.pixiv.net/api/login?lang=en', data=form_fields, headers=headers)
    
    time.sleep(1)
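# A minimal sketch of the post key extraction above, assuming a hypothetical login page
# where the hidden input looks something like:
#
#   <input type="hidden" id="init-config" value='{"pixivAccount.postKey":"abc123", ...}'>
#
# then:
#
#   i = soup.find('input', id='init-config')
#   json.loads(i['value'])['pixivAccount.postKey']  ->  'abc123'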
def Parse4chanPostScreen(html):
    
    soup = ClientParsing.GetSoup(html)
    
    title_tag = soup.find('title')
    
    if title_tag.string == 'Post successful!':
        
        return ('success', None)
        
    elif title_tag.string == '4chan - Banned':
        
        HydrusData.Print(soup)
        
        text = 'You are banned from this board! html written to log.'
        
        HydrusData.ShowText(text)
        
        return ('big error', text)
        
    else:
        
        try:
            
            problem_tag = soup.find(id='errmsg')
            
            if problem_tag is None:
                
                HydrusData.Print(soup)
                
                text = 'Unknown problem; html written to log.'
                
                HydrusData.ShowText(text)
                
                return ('error', text)
                
            problem = HydrusData.ToUnicode(problem_tag)
            
            if 'CAPTCHA' in problem:
                
                return ('captcha', None)
                
            elif 'seconds' in problem:
                
                return ('too quick', None)
                
            elif 'Duplicate' in problem:
                
                return ('error', 'duplicate file detected')
                
            else:
                
                return ('error', problem)
                
        except:
            
            return ('error', 'unknown error')
def _ParseGalleryPage( self, html, url_base ):
    
    soup = ClientParsing.GetSoup( html )
    
    fatcol = soup.find( 'div', class_ = 'fatcol' )
    
    if fatcol is not None:
        
        links = fatcol.find_all( 'a' )
        
    else:
        
        links = []
        
    urls_set = set()
    
    urls = []
    
    for link in links:
        
        try:
            
            url = link[ 'href' ]
            
            if url not in urls_set:
                
                if url.startswith( 'http://www.newgrounds.com/portal/view/' ):
                    
                    urls_set.add( url )
                    
                    urls.append( url )
                    
        except: pass
        
    definitely_no_more_pages = True
    
    urls_and_tags = [ ( url, set() ) for url in urls ]
    
    return ( urls_and_tags, definitely_no_more_pages )
def _ParseGalleryPage(self, html, url_base):
    
    definitely_no_more_pages = False
    
    urls_and_tags = []
    
    soup = ClientParsing.GetSoup(html)
    
    thumbs_container = soup.find('div', class_='torpedo-container')
    
    thumbs = thumbs_container.find_all('span', class_='thumb')
    
    for thumb in thumbs:
        
        url = thumb['href'] # something in the form of blah.da.com/art/blah-123456
        
        tags = []
        
        urls_and_tags.append((url, tags))
        
    return (urls_and_tags, definitely_no_more_pages)
def _ParseImagePage(self, html, url_base):
    
    # can't parse this easily normally because HF is a pain with the preview->click to see full size business.
    # find http://pictures.hentai-foundry.com//
    # then extend it to http://pictures.hentai-foundry.com//k/KABOS/172144/image.jpg
    # the .jpg bit is what we really need, but whatever
    # an example of this:
    # http://www.hentai-foundry.com/pictures/user/Sparrow/440257/Meroulix-LeBeau
    
    # addendum:
    # some users put pictures.hentai-foundry.com links in their profile images, which then gets repeated up above in some <meta> tag
    # so, lets limit this search to a smaller bit of html
    # example of this:
    # http://www.hentai-foundry.com/pictures/user/teku/572881/Special-Gang-Bang
    
    try:
        
        image_soup = ClientParsing.GetSoup(html)
        
        image_html = unicode(image_soup.find('section', id='picBox'))
        
        index = image_html.index('pictures.hentai-foundry.com')
        
        image_url = image_html[index:index + 256]
        
        if '"' in image_url:
            
            (image_url, gumpf) = image_url.split('"', 1)
            
        if "'" in image_url: # a decoded &#039; entity can also terminate the url
            
            (image_url, gumpf) = image_url.split("'", 1)
            
        image_url = 'http://' + image_url
        
    except Exception as e:
        
        raise Exception('Could not parse image url!' + os.linesep + HydrusData.ToUnicode(e))
        
    soup = ClientParsing.GetSoup(html)
    
    tags = []
    
    try:
        
        title = soup.find('title')
        
        (data, nothing) = title.string.split(' - Hentai Foundry')
        
        data_reversed = data[::-1] # want to do it right-side first, because title might have ' by ' in it
        
        (artist_reversed, title_reversed) = data_reversed.split(' yb ', 1)
        
        artist = artist_reversed[::-1]
        
        title = title_reversed[::-1]
        
        tags.append('creator:' + artist)
        tags.append('title:' + title)
        
    except:
        
        pass
        
    return (image_url, tags)
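# A worked example of the string-reversal trick above, on a hypothetical page title.
# Reversing lets us split on the *rightmost* ' by ', which matters when the title
# itself contains ' by ':
#
#   data              = 'Stand by Me by SomeArtist'
#   data[::-1]        = 'tsitrAemoS yb eM yb dnatS'
#   .split(' yb ', 1) -> ['tsitrAemoS', 'eM yb dnatS']
#
# re-reversing each half gives artist = 'SomeArtist', title = 'Stand by Me'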
def _ParseImagePage(self, html, referral_url):
    
    img_url = None
    
    soup = ClientParsing.GetSoup(html)
    
    download_button = soup.find('a', class_='dev-page-download')
    
    if download_button is None:
        
        # this method maxes out at 1024 width
        
        img = soup.find(class_='dev-content-full')
        
        if img is None:
            
            # nsfw
            
            # used to fetch this from a tumblr share url, now we grab from some hidden gubbins behind an age gate
            
            a_ismatures = soup.find_all('a', class_='ismature')
            
            imgs = []
            
            for a_ismature in a_ismatures:
                
                imgs.extend(a_ismature.find_all('img'))
                
            for img in imgs:
                
                # <img width="150" height="75" alt="Jelly gals by ArtInCase" src="http://t13.deviantart.net/l1NkrOhjTzsGDu9nsgMQHgsuZNY=/fit-in/150x150/filters:no_upscale():origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg" data-src="http://t13.deviantart.net/l1NkrOhjTzsGDu9nsgMQHgsuZNY=/fit-in/150x150/filters:no_upscale():origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg" srcset="http://t13.deviantart.net/l1NkrOhjTzsGDu9nsgMQHgsuZNY=/fit-in/150x150/filters:no_upscale():origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 150w,http://t00.deviantart.net/ELwFngzSW07znskrO2jToktP2Og=/fit-in/700x350/filters:fixed_height(100,100):origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 698w,http://t04.deviantart.net/53Saq2w0esrTTjZIfHap4ItNNkQ=/fit-in/800x400/filters:fixed_height(100,100):origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 798w,http://pre07.deviantart.net/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 1262w" sizes="150px">
                
                if img.has_attr('srcset'):
                    
                    # http://t13.deviantart.net/l1NkrOhjTzsGDu9nsgMQHgsuZNY=/fit-in/150x150/filters:no_upscale():origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 150w,http://t00.deviantart.net/ELwFngzSW07znskrO2jToktP2Og=/fit-in/700x350/filters:fixed_height(100,100):origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 698w,http://t04.deviantart.net/53Saq2w0esrTTjZIfHap4ItNNkQ=/fit-in/800x400/filters:fixed_height(100,100):origin()/pre07/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 798w,http://pre07.deviantart.net/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg 1262w
                    # the last url here is what we want
                    
                    srcset = img['srcset']
                    
                    # 798w,http://pre07.deviantart.net/26b2/th/pre/i/2013/187/b/1/jelly_gals_by_artincase-d6caxba.jpg
                    gubbins_and_url = srcset.split(' ')[-2]
                    
                    img_url = gubbins_and_url.split(',')[1]
                    
                    break
                    
        else:
            
            img_url = img['src']
            
    else:
        
        # something like http://www.deviantart.com/download/518046750/varda_and_the_sacred_trees_of_valinor_by_implosinoatic-d8kfjfi.jpg?token=476cb73aa2ab22bb8554542bc9f14982e09bd534&ts=1445717843
        # given the right cookies, it redirects to the truly fullsize image_url
        # otherwise, it seems to redirect to a small interstitial redirect page that heads back to the original image page
        
        img_url = download_button['href']
        
    if img_url is None:
        
        raise HydrusExceptions.ParseException('Could not find a download link--maybe this work was text?')
        
    return img_url
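# A minimal sketch of the srcset parsing above, on a shortened hypothetical value:
#
#   srcset = 'http://t.net/a.jpg 150w,http://t.net/b.jpg 698w,http://pre.net/full.jpg 1262w'
#
#   srcset.split(' ')[-2]          ->  '698w,http://pre.net/full.jpg'
#   gubbins_and_url.split(',')[1]  ->  'http://pre.net/full.jpg'
#
# the width descriptor of one candidate and the url of the next travel together in a
# single space-delimited token, so the second-to-last token carries the biggest url.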
def _ParseImagePage(self, html, url_base):
    
    (search_url, search_separator, advance_by_page_num, thumb_classname, image_id, image_data, tag_classnames_to_namespaces) = self._booru.GetData()
    
    soup = ClientParsing.GetSoup(html)
    
    image_url = None
    
    try:
        
        if image_id is not None:
            
            image = soup.find(id=image_id)
            
            if image is None:
                
                image_string = soup.find(text=re.compile('Save this file'))
                
                if image_string is None:
                    
                    image_string = soup.find(text=re.compile('Save this video'))
                    
                if image_string is None:
                    
                    # catchall for rule34hentai.net's webms
                    
                    if image_url is None:
                        
                        a_tags = soup.find_all('a')
                        
                        for a_tag in a_tags:
                            
                            href = a_tag['href']
                            
                            if href is not None:
                                
                                if href.endswith('.webm'):
                                    
                                    image_url = href
                                    
                                    break
                                    
                    # catchall for rule34hentai.net's mp4s, which are loaded in a mickey-mouse flv player
                    
                    if image_url is None:
                        
                        magic_phrase = 'document.write("<source src=\''
                        
                        if magic_phrase in html:
                            
                            # /image/252605' type='video/mp4...
                            
                            image_url_and_gumpf = html.split(magic_phrase, 1)[1]
                            
                            image_url = image_url_and_gumpf.split('\'', 1)[0]
                            
                else:
                    
                    image = image_string.parent
                    
                    image_url = image['href']
                    
            else:
                
                if image.name in ('img', 'video'):
                    
                    image_url = image['src']
                    
                    if 'Running Danbooru' in html:
                        
                        # possible danbooru resized image
                        
                        possible_better_image = soup.find(id='image-resize-link')
                        
                        if possible_better_image is not None:
                            
                            image_url = possible_better_image['href']
                            
                elif image.name == 'a':
                    
                    image_url = image['href']
                    
        if image_data is not None:
            
            links = soup.find_all('a')
            
            ok_link = None
            better_link = None
            
            for link in links:
                
                if link.string is not None:
                    
                    if link.string.startswith(image_data) or link.string.endswith(image_data):
                        
                        ok_link = link['href']
                        
                    if link.string.startswith('Download PNG'):
                        
                        better_link = link['href']
                        
                        break
                        
            if better_link is not None:
                
                image_url = better_link
                
            else:
                
                image_url = ok_link
                
    except Exception as e:
        
        raise HydrusExceptions.DataMissing('Could not parse a download link for ' + url_base + '!' + os.linesep + HydrusData.ToUnicode(e))
        
    if image_url is None:
        
        raise HydrusExceptions.DataMissing('Could not parse a download link for ' + url_base + '!')
        
    image_url = urlparse.urljoin(url_base, image_url)
    
    if 'gelbooru.com' in url_base:
        
        # giving 404 on some content servers for http, no redirect for some reason
        
        image_url = ClientNetworkingDomain.ConvertHTTPToHTTPS(image_url)
        
    tags = []
    
    for (tag_classname, namespace) in tag_classnames_to_namespaces.items():
        
        tag_list_entries = soup.find_all(class_=tag_classname)
        
        for tag_list_entry in tag_list_entries:
            
            links = tag_list_entry.find_all('a')
            
            if tag_list_entry.name == 'a':
                
                links.append(tag_list_entry)
                
            for link in links:
                
                if link.string is None:
                    
                    continue
                    
                try:
                    
                    tag_string = HydrusData.ToUnicode(link.string)
                    
                    tag_string = HydrusTags.CleanTag(tag_string)
                    
                    if tag_string in ('?', '-', '+', u'\xe2\x80\x93', u'\u2013'):
                        
                        # last two are a couple of amusing encodings of en-dash '-' from danbooru
                        
                        continue
                        
                    tag = HydrusTags.CombineTag(namespace, tag_string)
                    
                    tags.append(tag)
                    
                except Exception as e:
                    
                    HydrusData.Print('Could not parse tag "' + repr(link.string) + '":')
                    
                    HydrusData.PrintException(e)
                    
    return (image_url, tags)
def _ParseGalleryPage(self, html, url_base):
    
    definitely_no_more_pages = False
    
    urls_set = set()
    urls = []
    
    soup = ClientParsing.GetSoup(html)
    
    # this catches 'post-preview' along with 'post-preview not-approved' sort of bullshit
    
    def starts_with_classname(classname):
        
        return classname is not None and classname.startswith(self._thumb_classname)
        
    thumbnails = soup.find_all(class_=starts_with_classname)
    
    # this is a sankaku thing
    
    popular_thumbnail_parent = soup.find(id='popular-preview')
    
    if popular_thumbnail_parent is not None:
        
        popular_thumbnails = popular_thumbnail_parent.find_all(class_=starts_with_classname)
        
        thumbnails = thumbnails[len(popular_thumbnails):]
        
    for thumbnail in thumbnails:
        
        links = thumbnail.find_all('a')
        
        if thumbnail.name == 'a':
            
            links.append(thumbnail)
            
        for link in links:
            
            if link.string is not None and link.string == 'Image Only':
                
                continue # rule 34 @ paheal fix
                
            url = link['href']
            
            url = urlparse.urljoin(url_base, url)
            
            if url not in urls_set:
                
                urls_set.add(url)
                
                urls.append(url)
                
    if len(urls) == 0:
        
        definitely_no_more_pages = True
        
    # gallery_advance_nums is a module-level cache, remembering how many results this booru serves per gallery page
    
    if self._booru_name not in gallery_advance_nums:
        
        if len(urls) > 0:
            
            gallery_advance_nums[self._booru_name] = len(urls)
            
    if 'gelbooru.com' in url_base:
        
        # they now use redirect urls for thumbs, wew lad
        
        bad_urls = urls
        
        urls = []
        
        session = requests.Session()
        
        for bad_url in bad_urls:
            
            # the garbage after the redirect.php is the redirect in base64
            # https://gelbooru.com/redirect.php?s=Ly9nZWxib29ydS5jb20vaW5kZXgucGhwP3BhZ2U9cG9zdCZzPXZpZXcmaWQ9MzY5NDEyMg==
            
            if 'redirect.php' in bad_url:
                
                try:
                    
                    encoded_location = bad_url.split('?s=')[1]
                    
                    location = encoded_location.decode('base64')
                    
                    url = urlparse.urljoin(bad_url, location)
                    
                    urls.append(url)
                    
                except Exception as e:
                    
                    HydrusData.ShowText('gelbooru parsing problem!')
                    
                    HydrusData.ShowException(e)
                    
                    time.sleep(2)
                    
                    break
                    
            else:
                
                urls.append(bad_url)
                
        # giving 404 on some content servers for http, no redirect for some reason
        
        urls = [ClientNetworkingDomain.ConvertHTTPToHTTPS(url) for url in urls]
        
    urls_and_tags = [(url, set()) for url in urls]
    
    return (urls_and_tags, definitely_no_more_pages)
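# A worked example of the gelbooru redirect decode above, using the url from the comment:
#
#   bad_url = 'https://gelbooru.com/redirect.php?s=Ly9nZWxib29ydS5jb20vaW5kZXgucGhwP3BhZ2U9cG9zdCZzPXZpZXcmaWQ9MzY5NDEyMg=='
#
#   bad_url.split('?s=')[1].decode('base64')  ->  '//gelbooru.com/index.php?page=post&s=view&id=3694122'
#
# urlparse.urljoin then resolves that scheme-relative location against bad_url, giving
# 'https://gelbooru.com/index.php?page=post&s=view&id=3694122'.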
def _ParseGalleryPage(self, data, url_base):
    
    def ConvertRegularToRawURL(regular_url):
        
        # convert this:
        # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_500.jpg
        # to this:
        # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
        # the 500 part can be a bunch of stuff, including letters
        
        url_components = regular_url.split('_')
        
        last_component = url_components[-1]
        
        (number_gubbins, file_ext) = last_component.split('.')
        
        raw_last_component = 'raw.' + file_ext
        
        url_components[-1] = raw_last_component
        
        raw_url = '_'.join(url_components)
        
        return raw_url
        
    def Remove68Subdomain(long_url):
        
        # sometimes the 68 subdomain gives a 404 on the raw url, so:
        # convert this:
        # http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
        # to this:
        # http://media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
        
        # I am not sure if it is always 68, but let's not assume
        # Indeed, this is apparently now 78, wew!
        
        (scheme, rest) = long_url.split('://', 1)
        
        if rest.startswith('media.tumblr.com'):
            
            return long_url
            
        (gumpf, shorter_rest) = rest.split('.', 1)
        
        shorter_url = scheme + '://' + shorter_rest
        
        return shorter_url
        
    def MediaToDataSubdomain(url):
        
        return url.replace('media', 'data', 1)
        
    definitely_no_more_pages = False
    
    if data.startswith('<!DOCTYPE html>'):
        
        message = 'The tumblr downloader received an unexpected HTML page when it tried to download JSON post information. It is likely that you are an EU/EEA user and have been hit by a GDPR click-through issue.'
        message += os.linesep * 2
        message += 'In order to get the hydrus client to \'click ok\' on that page, please hit _network->logins->DEBUG: misc->do tumblr GDPR click-through_ and try this gallery search again.'
        message += os.linesep * 2
        message += 'If you still have problems, please let hydrus dev know.'
        
        HydrusData.ShowText(message)
        
        raise Exception('Tumblr GDPR issue.')
        
    processed_raw_json = data.split('var tumblr_api_read = ')[1][:-2] # -2 takes the trailing js ';' and newline off the end
    
    json_object = json.loads(processed_raw_json)
    
    urls_and_tags = []
    
    if 'posts' in json_object:
        
        for post in json_object['posts']:
            
            # 2012-06-20 15:59:00 GMT
            date = post['date-gmt']
            
            date_struct = time.strptime(date, '%Y-%m-%d %H:%M:%S %Z')
            
            raw_url_available = date_struct.tm_year > 2012
            
            if 'tags' in post:
                
                tags = post['tags']
                
            else:
                
                tags = []
                
            post_type = post['type']
            
            if post_type == 'photo':
                
                if len(post['photos']) == 0:
                    
                    photos = [post]
                    
                else:
                    
                    photos = post['photos']
                    
                for photo in photos:
                    
                    try:
                        
                        url = photo['photo-url-1280']
                        
                        # some urls are given in the form:
                        # https://68.media.tumblr.com/tumblr_m5yb5m2O6A1rso2eyo1_540.jpg
                        # which is missing the hex key in the middle
                        # these urls are unavailable as raws from the main media server
                        # these seem to all be the pre-2013 files, but we'll double-check just in case anyway
                        
                        unusual_hexless_url = url.count('/') == 3
                        
                        if not unusual_hexless_url:
                            
                            if raw_url_available:
                                
                                url = ConvertRegularToRawURL(url)
                                
                                url = Remove68Subdomain(url)
                                
                                url = MediaToDataSubdomain(url)
                                
                        url = ClientNetworkingDomain.ConvertHTTPToHTTPS(url)
                        
                        urls_and_tags.append((url, tags))
                        
                    except:
                        
                        pass
                        
            elif post_type == 'video':
                
                if 'video-player' in post:
                    
                    video_player_html = post['video-player']
                    
                    try:
                        
                        vp_soup = ClientParsing.GetSoup(video_player_html)
                        
                        vp_source = vp_soup.find('source')
                        
                        url = vp_source['src']
                        
                        urls_and_tags.append((url, tags))
                        
                    except:
                        
                        pass
                        
    return (urls_and_tags, definitely_no_more_pages)
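# A worked example of the three url transforms above, chained on the url from the comments:
#
#   http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_500.jpg
#
#   ConvertRegularToRawURL  ->  http://68.media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
#   Remove68Subdomain       ->  http://media.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg
#   MediaToDataSubdomain    ->  http://data.tumblr.com/5af0d991f26ef9fdad5a0c743fb1eca2/tumblr_opl012ZBOu1tiyj7vo1_raw.jpg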
def TestPixiv(self, pixiv_id, password):
    
    # this is just an ugly copy, but f**k it for the minute
    # we'll figure out a proper testing engine later with the login engine and tie the manage gui into it as well
    
    session = requests.Session()
    
    response = session.get('https://accounts.pixiv.net/login')
    
    soup = ClientParsing.GetSoup(response.content)
    
    # some whopping 20kb bit of json tucked inside a hidden form input wew lad
    
    i = soup.find('input', id='init-config')
    
    raw_json = i['value']
    
    j = json.loads(raw_json)
    
    if 'pixivAccount.postKey' not in j:
        
        return (False, 'When trying to log into Pixiv, I could not find the POST key! This is a problem with hydrus\'s pixiv parsing, not your login! Please contact hydrus dev!')
        
    post_key = j['pixivAccount.postKey']
    
    form_fields = {}
    
    form_fields['pixiv_id'] = pixiv_id
    form_fields['password'] = password
    form_fields['captcha'] = ''
    form_fields['g_recaptcha_response'] = ''
    form_fields['return_to'] = 'https://www.pixiv.net'
    form_fields['lang'] = 'en'
    form_fields['post_key'] = post_key
    form_fields['source'] = 'pc'
    
    headers = {}
    
    headers['referer'] = 'https://accounts.pixiv.net/login?lang=en&source=pc&view_type=page&ref=wwwtop_accounts_index'
    headers['origin'] = 'https://accounts.pixiv.net'
    
    r = session.post('https://accounts.pixiv.net/api/login?lang=en', data=form_fields, headers=headers)
    
    if not r.ok:
        
        HydrusData.ShowText(r.content)
        
        return (False, 'Login request failed! Info printed to log.')
        
    cookies = session.cookies
    
    cookies.clear_expired_cookies()
    
    domains = cookies.list_domains()
    
    for domain in domains:
        
        if domain.endswith('pixiv.net'):
            
            d = cookies.get_dict(domain)
            
            if 'PHPSESSID' not in d:
                
                HydrusData.ShowText(r.content)
                
                return (False, 'Pixiv login failed to establish session! Info printed to log.')
                
            return (True, '')
            
    HydrusData.ShowText(r.content)
    
    return (False, 'Pixiv login failed to establish session! Info printed to log.')