Python ScraperParseException Examples, sandcrawler.scraper.ScraperParseException Python Examples

Example #1

0

Show file

File: serienjunkies_org.py Project: realchief/Scraping_BeautifulSoup_phantomjs

    def _resolve_url(self, url):
        soup = self.get_soup(url)
        download_id_input = soup.find('input',
                                      attrs={
                                          'name': 's',
                                          'type': 'HIDDEN'
                                      })
        if not download_id_input:
            raise ScraperParseException('Could not find download id.')

        download_id = download_id_input['value']
        token = self.get_recaptcha_token()

        data = {
            'g-recaptcha-response': token,
            'action': 'Download',
            's': download_id,
            'newcap': 'true',
        }

        redirect_soup = self.post_soup(
            url,
            data=data,
        )
        # Just grab the first form.
        download_form = redirect_soup.select_one('form')
        if not download_form:
            raise ScraperParseException('Could not find download form.')
        target = download_form['action']
        return self.get_redirect_location(target)

Example #2

0

Show file

    def _parse_parse_page(self, soup):
        # grab the main frame
        mainframe = soup.select_one('frame#mainFrame')
        if not mainframe:
            raise ScraperParseException('Failed to find frame')
        frame_soup = self.get_soup(mainframe['src'])
        if unicode(frame_soup).find('seems to be out of date or broken.') >= 0:
            self.log.warning('404 from search result.')
            return

        body = frame_soup.select('td.post2 div.postcolor')
        if not body:
            raise ScraperParseException('Could not find body')
        body = body[0]
        image = self.util.find_image_src_or_none(body, 'img')
        for link in self.util.find_urls_in_text(
                unicode(body),
                skip_images=True,
                skip_imdb=True,
                skip_youtube=True,
        ):
            self.submit_parse_result(
                index_page_title=soup.title.text.strip(),
                link_url=link,
                image=image,
            )

Example #3

0

Show file

    def _follow_iframes(self, url, depth=1):
        if depth > 5:
            raise ScraperParseException(
                "Reached recursion depth of 5 following"
                "iframe %s" % url)
        # Follow that and look for
        # <iframe src="..."
        iframe_source = self.get(url)

        matches = []
        mtch = re.search('iframe (id="ifr_view" )?src="(.*?)"',
                         iframe_source.text)
        if mtch:
            matches.append(mtch.group(2))

        mtch = re.search('iframe src=\&quot;(.*?)\&quot;', iframe_source.text)
        if mtch:
            matches.append(mtch.group(1))

        for url in matches:
            if url.startswith('http'):
                return url
            else:
                return self._follow_iframes('http://api.ekranka.tv/' + url,
                                            depth=depth + 1)
        else:
            raise ScraperParseException("No iframe found on api")

Example #4

0

Show file

File: somosmovies_com.py Project: realchief/Scraping_BeautifulSoup_phantomjs

    def get_captcha_links(self, parse_url):
        self.load_session_cookies()
        soup = self.get_soup(parse_url)
        if self._recaptcha_on_page(soup):
            key = self.get_recaptcha_token()
            verify_response = self.get(
                'http://somosmovies.com/recaptcha/verify/{}/'.format(key),
                headers={
                    'Referer': parse_url,
                    'Accept': 'application/json, text/plain, */*',
                })
            if not verify_response.ok:
                raise ScraperParseException(
                    'Got invalid response for recap verify.')
            if not verify_response.json()['success']:
                raise ScraperParseException(
                    'Got not OK response for recap verify.')
            soup = self.get_soup(parse_url)
            if self._recaptcha_on_page(soup):
                raise ScraperParseException(
                    'Recapcha back on page after refresh.')
            self.save_session_cookies()

        index_page_title = self.util.get_page_title(soup)
        submitted_links = set()
        results = []
        for episodeblock in soup.find_all('div',
                                          {'id': re.compile('^S\d+E\d+$')}):
            season, episode = \
                self.util.extract_season_episode(episodeblock['id'])
            for result in episodeblock.select('a.btn'):
                if result['href'] in submitted_links:
                    continue
                submitted_links.add(result['href'])
                results.append(
                    dict(
                        link_url=result['href'],
                        link_title=result.text.strip(),
                        index_page_title=index_page_title,
                        series_season=season,
                        series_episode=episode,
                    ))

        for result in soup.select('div.tab-links tr div.collapse a.btn'):
            if result['href'] in submitted_links:
                continue
            submitted_links.add(result['href'])
            results.append(
                dict(
                    link_url=result['href'],
                    link_title=result.text.strip(),
                    index_page_title=index_page_title,
                ))
        return results

Example #5

0

Show file

    def _parse_parse_page(self, soup, serie_mode=False):
        title = soup.select_one('.icon-1').text.strip()

        found_items = []
        body = unicode(soup)
        ss_index = body.find('var ss = "')
        end_ss_section_index = body.find(u'// Показываем какой-то сезон')
        if ss_index > 0 and end_ss_section_index > 0:
            ss_script = body[ss_index:end_ss_section_index]
            for ss_section in ss_script.split('var ss = "'):
                if not ss_section:
                    # Skip first one.
                    continue
                season = ss_section[:ss_section.find('"')]
                for episode in ss_section.split(');'):
                    if not episode.strip():
                        # Skip last one.
                        continue
                    id = re.search("'id': (\d+)", episode).group(1)
                    tt = re.search("'tt': '(.*?)'", episode).group(1)
                    if id and tt:
                        found_items.append(
                            (season, id, tt)
                        )
        else:
            raise ScraperParseException('Could not find ss section.')

        index_page_title = self.util.get_page_title(soup)

        for season, id, tt in found_items:
            result = self._extract_result(season, id, tt)
            result['index_page_title'] = index_page_title
            result['link_title'] = title
            self.submit_parse_result(**result)

Example #6

0

Show file

 def _parse_parse_page(self, soup):
     for hidden_tag in soup.select('div.tag_hide script'):
         # Simple obfuscation - does a document.write(HTML)
         # Extract that out, soup it, find our linkss
         match = re.match(
             '.*document\.write\((.*)\);.*',
             hidden_tag.text
         )
         if match:
             html = match.group(1)
             hidden_soup = self.make_soup(html)
             for link in hidden_soup.select('a'):
                 if link['href'].startswith('/lnk/'):
                     try:
                         response = self.get(self.BASE_URL + link['href'])
                     except ScraperFetchException:
                         self.log.error('Could not follow link: %s', link)
                     else:
                         self.submit_parse_result(index_page_title=soup.title.text.strip(),
                                                  link_url=response.url,
                                                  link_title=link.text,
                                                  )
                 else:
                     self.submit_parse_result(index_page_title=soup.title.text.strip(),
                                              link_url=link['href'],
                                              link_title=link.text,
                                              )
         else:
             raise ScraperParseException('Failed to extract hidden tags.')

Example #7

0

Show file

 def _parse_parse_page(self, soup):
     # First suck out the header so we can get season/ep
     title = soup.find('h1', 'post-title').text
     season, episode = self._extract_season_episode(title)
     # Javascript callouts to
     # http://www.cucirca.eu/getvideo.php?id=193507&nr=20&time=1429763089910
     # Where id and nr come from each links onclick
     # Doesn't look like time matters.
     for link in soup.findAll('a',
                              attrs={'onclick': re.compile('^video\(')}):
         match = re.search('video\((\d+),(\d+),\d+\)', link['onclick'])
         if match:
             vid_num, vid_id = match.groups()
             video_soup = self.get_soup(
                 'http://www.cucirca.eu/getvideo.php?id=%s&nr=%s' %
                 (vid_id, vid_num))
             for iframe in video_soup.select('iframe'):
                 self.submit_parse_result(
                     index_page_title=soup.title.text.strip(),
                     link_url=iframe['src'],
                     link_title=title,
                     series_season=season,
                     series_episode=episode,
                 )
         else:
             raise ScraperParseException(
                 'Failed to parse video onclick: %s' % link['onclick'])

Example #8

0

Show file

 def search(self, search_term, media_type, **extra):
     category = {
         ScraperBase.MEDIA_TYPE_FILM: 3,
         ScraperBase.MEDIA_TYPE_GAME: 2,
         ScraperBase.MEDIA_TYPE_TV: 4,
         ScraperBase.MEDIA_TYPE_BOOK: 7,
     }
     category_id = category.get(media_type, '')
     self._load_cookies()
     tries = 0
     step = '{};{};0'.format(self.util.quote(search_term), category_id)
     while self.can_fetch_next():
         soup = self.post_soup(self.BASE_URL + '/res/suche',
                               data={
                                   'Step': step,
                               },
                               headers={})
         if not soup:
             if tries > 2:
                 raise ScraperParseException(
                     'Could not get search response after 2 tries.')
             tries += 1
             cache.invalidate(self.cache_key)
             self._load_cookies()
         else:
             for link in soup.select('a'):
                 text_block = link.select_one('div.tle')
                 link_title = None
                 if text_block:
                     link_title = text_block.text
                 self.submit_parse_result(
                     link_url=self.BASE_URL + '/' + link.href,
                     link_title=link_title,
                     image=self.util.find_image_src_or_none(
                         link, 'img.lazy'))

Example #9

0

Show file

    def _parse_parse_page(self, soup):

        for serie_link in soup.select(
                '.panel-collapse.collapse.in .media .media-body .media-heading'
        ):
            video_url = serie_link.parent.href
            serie_soup = self.get_soup(video_url)

            # <img id="gmodal1" data-token="MzMxMjc1MTA3NDMxOA==" onclick="ccc('MzMxMjc1MTA3NDMxOA==');" class="openw_old" style="cursor:pointer;margin-top:-165px;" title="Oglądaj odcinki online!" src="http://www.serialeonline.pl/assets/img/serial-odcinek_06.png">
            # calls
            # function ccc(tk){
            # var url = tk;
            # $("#video").modal('show');
            # $.get(JS_MAIN_DOMAIN + "ajax/em.php?did=" + url + "&trurl=1453233789569e967d25e87&w=0", function(data) {
            #   $('#modalem').html(data);
            # }).success(function() {
            #   $(".embedlista li a").first().click();
            # });

            # http://www.serialeonline.pl/ajax/em.php?did=MzMxMjc1MTA3NDMxOA==&trurl=1453233877569e96d5d21af&w=0

            # Looks like we only need the did - the rest is just junk :)

            # find the watch me link.
            links = serie_soup.select('img#gmodal1')
            for link in links:
                token = link.attrs.get('data-token')
                if not token:
                    raise ScraperParseException('Could not extract token.')
                url = self.BASE_URL + '/ajax/em.php?w=0&did=' + token
                links_soup = self.get_soup(url)

                for funnylnk in links_soup.select('dd.linkplayer'):
                    onclick = funnylnk.attrs.get('onclick')
                    if not onclick:
                        raise ScraperParseException(
                            'Could not extract onclick from dd.')
                    season, episode = self.util.extract_season_episode(
                        video_url)
                    for lnk in self.util.find_urls_in_text(onclick):
                        self.submit_parse_result(
                            index_page_title=soup.title.text.strip(),
                            link_url=lnk,
                            link_title=funnylnk.text,
                            series_season=season,
                            series_episode=episode,
                        )

Example #10

0

Show file

File: emuparadise_me.py Project: realchief/Scraping_BeautifulSoup_phantomjs

    def _parse_parse_page(self, soup):
        index_page_title = self.util.get_page_title(soup)
        for link in soup.select('div.download-link a'):
            if link.href.startswith('//') or link.href.startswith('http'):
                # It's a standalone link - submit it
                href = link.href
                if href.startswith('//'):
                    href = 'http:{}'.format(href)
                self.submit_parse_result(
                    index_page_title=index_page_title,
                    link_url=href,
                    link_title=link.text,
                )
            else:
                url = self.BASE_URL + link.href
                soup = self.get_soup(url)
                if soup.select('#captchadiv'):
                    # LOL - there's a skip captcha link/function.
                    # all it does is set the cookie, which is done in the 'parse'
                    # function.
                    raise ScraperParseException('Found captcha in page.')

                    # self.log.debug('Captcha Found')
                    # # Pull out the noscript version of the iframe.
                    # iframe = None
                    # for noscript in soup.select('noscript'):
                    #     srch = re.search('<iframe src=\"(.*?)\"')
                    #     if srch:
                    #         iframe = srch.group(1)
                    #         break
                    #
                    # solve_soup = self.get(iframe)
                    # image = solve_soup.select_one('img#adcopy-puzzle-image')
                    # response = self.solve_captcha(image['src'])
                    #
                    #
                    #
                    # self.get()
                    #
                    # iframe = soup.find('script', src=re.compile('api-secure.solvemedia.com'))
                    #
                    # key = self.solve_captcha(iframe['src'])
                    #
                    # soup = self.post_soup(
                    #     url, data={'adcopy_response': key,
                    #                                  'adcopy_challenge' : '',
                    #                                  'submit': 'Verify & Download'})
                    # self.save_session_cookies()

                links = soup.select('a[href*="get-download.php"]')
                for link in links:
                    download = self._follow_link(link.href)
                    self.submit_parse_result(
                        index_page_title=index_page_title,
                        link_url=download,
                        link_title=link.text,
                    )

Example #11

0

Show file

 def _extract_link(self, url):
     self.load_session_cookies()
     soup = self.get_soup(url)
     if soup.select('div.g-recaptcha'):
         # we need to solve a recap.
         soup = self.post_soup(
             url, data={'g-recaptcha-response': self.get_recaptcha_token()})
         if 'Captcha incorrecto' in unicode(soup):
             raise ScraperParseException('Invalid captcha returned')
         self.save_session_cookies()
     # otherwise, dig out the actual url.
     return list(
         self.util.find_urls_in_text(
             soup.select_one('div.tab_content').text))

Example #12

0

Show file

File: dreamfilmhd_org.py Project: realchief/Scraping_BeautifulSoup_phantomjs

 def __fetch_movie(self, soup):
     # We have a base64 encoded string.
     # javascript: function a {unescape(atob(data)) }
     # document.write(a('....'))
     movie_box = soup.select('div#m0 script')
     if not movie_box:
         raise ScraperParseException('Failed to find movie box')
     movie_box = movie_box[0]
     search = re.search("document.write\(a\('([^']*)'\)\)", movie_box.text)
     if search:
         data = search.group(1)
         # Make soup out of that and extract iframes :D
         content_soup = self.make_soup(self.__decode(data))
         self._parse_iframes(content_soup,
                             asset_type=ScraperBase.MEDIA_TYPE_FILM)

Example #13

0

Show file

File: kkiste_to.py Project: realchief/Scraping_BeautifulSoup_phantomjs

 def parse(self, parse_url, **extra):
     soup = self.get_soup(parse_url)
     # Two ways this is done.
     # Moves just have a link.
     for link in soup.select('li.free a'):
         self.submit_parse_result(
             index_page_title=soup.title.text.strip(),
             link_url=link['href'],
             link_title=link.text,
         )
     for link in soup.select('li.premium a'):
         self.submit_parse_result(
             index_page_title=soup.title.text.strip(),
             link_url=link['href'],
             link_title=link.text,
         )
     # Or a dropdown of series, POST that to
     # /xhr/movies/episodes/*PAGENAME*/
     # and we get a JSON response with all our episodes.
     # Each episode value shoudl be appended to
     # http://www.ecostream.tv/stream/
     season_select = soup.find('select', 'seasonselect')
     if season_select:
         pagematch = re.search('^%s/(.*)\.html' % self.BASE_URL, parse_url)
         if not pagematch:
             raise ScraperParseException('Could not find name from %s' %
                                         parse_url)
         season_url = self.BASE_URL + \
             '/xhr/movies/episodes/' + \
             pagematch.group(1) + \
             '/'
         for season in season_select.select('option'):
             # skip the first one :)
             if not season['value']:
                 continue
             # This will 404 without the X-Requested-With header.
             resp = self.post(
                 season_url,
                 data={'season': season['value']},
                 headers={'X-Requested-With': 'XMLHttpRequest'})
             episodes = resp.json()
             for episode in episodes['episodes']:
                 link_url = 'http://www.ecostream.tv/stream/' + episode[
                     'link']
                 self.submit_parse_result(link_url=link_url,
                                          link_title=episode['part'],
                                          series_season=season['value'],
                                          series_episode=episode['episode'])

Example #14

0

Show file

File: tehparadox_com.py Project: realchief/Scraping_BeautifulSoup_phantomjs

    def search(self, search_term, media_type, **extra):
        self.load_session_cookies()

        home_page = self.get(self.BASE_URL + '/').content
        if '/forum/login.php?do=login' in unicode(home_page):
            self._login()
            self.save_session_cookies()

            home_page = self.get(self.BASE_URL + '/').content

        home_soup = self.make_soup(home_page)

        security_token = home_soup.find('input',
                                        {'name': 'securitytoken'})['value']

        tries = 0
        while tries < 5:
            tries += 1
            page_results = self.post_soup(self.BASE_URL +
                                          '/search.php?do=process',
                                          data={
                                              'securitytoken': security_token,
                                              'do': 'process',
                                              'q': search_term,
                                          })
            if 'This forum requires that you wait' in str(page_results):
                time_to_wait = int(
                    str(page_results).split('again in ')[1].split(' seconds')
                    [0]) + random.randint(2, 15)
                self.log.warning('Got wait message - waiting %s', time_to_wait)
                time.sleep(time_to_wait)
            elif 'The server is too busy at the moment. Please try again later.' in str(
                    page_results):
                raise ScraperParseException(
                    'Got "The server is too busy at the moment. Please try again later."'
                )
            else:
                self._parse_results_page(page_results)
                while self.can_fetch_next():
                    next_page = page_results.select_one('a[rel="next"]')
                    if next_page:
                        page_results = self.get_soup(
                            self.BASE_URL + '/' + next_page.href, )
                        self._parse_results_page(page_results)
                    else:
                        break

Example #15

0

Show file

File: vk_com.py Project: realchief/Scraping_BeautifulSoup_phantomjs

    def _fetch_new_access_token(self):

        # Get the auth url
        url = 'https://oauth.vk.com/authorize?client_id=' \
              '4583728&scope=video&redirect_uri=http://api.vk.com/blank.html' \
              '&response_type=token&v=5.25'
        response = self.get(url)
        # Bs4 is being a hog here - it's not findign the form on a full
        # parse of the page... suck out the form manually and soup it.
        start_index = response.content.find('<form')
        end_index = response.content.find('</form>')
        if start_index < 0 or end_index < 0:
            raise ScraperParseException('Could not find form.')
        login_page_soup = self.make_soup(
            response.content[start_index:end_index + 7])
        form = login_page_soup.select_one('form')
        data = {'email': self.USERNAME, 'pass': self.PASSWORD}
        for input in form.select('input'):
            if 'name' in input.attrs and input['name'] not in ('email',
                                                               'pass'):
                data[input['name']] = input.get('value', None)
        response = self.post(form['action'], data=data)
        if not response.ok:
            raise ScraperAuthException('Failed to login.')
        if response.content.find('In order to confirm ownership of the page') \
                >= 0 or \
                        response.content.find('account from an unusual place') >= 0:
            self.log.info('Found confirmation check...')
            phone_soup = self.make_soup(response.content)
            form = phone_soup.select_one('form')
            response = self.post('https://m.vk.com' + form['action'],
                                 data={
                                     'code': '4095292',
                                     'submit': 'Confirm',
                                 })
        # Should end up with a URL similar to:
        # http://api.vk.com/blank.html#access_token=9ac82b801aa3ac62b1b33e2475f7e09d5217e5252d0540a92fa01d7e230139e294c9a17c844ca2ffab2c6&expires_in=86400&user_id=258191591
        parsed_url = urlparse.urlparse(response.url)
        # Fragment is the bit after the hash; it's formatted like a qs.
        parsed_qs = urlparse.parse_qs(parsed_url.fragment)
        if 'access_token' not in parsed_qs:
            self.log.error('No token in %s -> %s', response.url, parsed_qs)
            raise ScraperAuthException('Could not find access token in url')

        return parsed_qs['access_token'][0]

Example #16

0

Show file

    def _parse_parse_page(self, soup, depth=0):

        content = str(soup)
        season, episode = None, None
        # Get the first video
        # onload we do a
        # var idseries="74"; var season="07"; var episode="10"; loadplayer(idseries,season,episode)
        # loadplayer(b,e,a) {$.get(badress+ p_v+ b+"-"+ e+"_"+ a,function(a){$("#cont_player").html(a)})}
        # baddress, p_v are static.


        srch = re.search('var idseries="(.*?)"; var season="(.*)"; var episode="(.*?)"',
                         content)
        if srch:
            season = srch.group(2)
            episode = srch.group(3)
            get_url = self.BASE_URL + '/play/plvids' + \
                      srch.group(1) + '-' + \
                      srch.group(2) + '_' + \
                      srch.group(3)  # a
            vid_soup = self.get_soup(get_url)
            for iframe in vid_soup.select('iframe'):
                self.submit_parse_result(index_page_title=soup.title.text.strip(),
                                         link_url=iframe['src'],
                                         series_season=season,
                                         series_episode=episode
                                         )
        else:
            raise ScraperParseException('Could not find id/series/season in javascript.')

        # Find each of the other videos.
        # ("#morevideo1").click(function(){ morurlvid('DwicBN9mlQbTxr8rLHAIowT7PyUc2Rx8b7ponXdyPy7r44LuQDDFvERQXKQVaZVMl5mTyjtuP2FJMVboBbHd4w,,',this);
        for activation_link in re.findall(
                "morurlvid\('(.*?)',this\)",
                str(soup),
        ):
            new_url = self.BASE_URL + '/play/mvideo_' + activation_link
            link = self.get_redirect_location(new_url)
            if link:
                self.submit_parse_result(index_page_title=soup.title.text.strip(),
                                         link_url=link,
                                         series_season=season,
                                         series_episode=episode
                                         )

Example #17

0

Show file

File: dizigold_net.py Project: realchief/Scraping_BeautifulSoup_phantomjs

 def _extract_link(self, iframe_link):
     self.load_session_cookies()
     soup = self.get_soup(iframe_link)
     if soup.select('div.g-recaptcha'):
         soup = self.post_soup(
             'http://player.dizigold1.com/control',
             data={'g-recaptcha-response': self.get_recaptcha_token()})
         if 'Captcha incorrecto' in unicode(soup):
             raise ScraperParseException('Invalid captcha returned')
         self.save_session_cookies()
     link = soup.select_one('iframe')
     if link:
         link = link['src']
         return link
     else:
         player_script_text = soup.select_one('div#player').find_next(
             'script').text
         return filter(None,
                       list(self.util.find_file_in_js(player_script_text)))

Example #18

0

Show file

 def _parse_search_result_page(self, soup):
     # Ugly html...
     # Find dle-content's parent, then find all tables with an ntitle
     dle_content = soup.select_one('div#dle-content')
     if not dle_content:
         raise ScraperParseException('Could not find dle-content')
     for table in dle_content.parent.children:
         if table.name != 'table':
             continue
         ntitle = table.select_one('span.ntitle')
         if not ntitle:
             continue
         readmore = table.find('strong', text='Read More')
         if not readmore:
             continue
         link = readmore.parent
         self.submit_search_result(
             link_url=link['href'],
             link_title=ntitle.text,
         )

Example #19

0

Show file

File: vk_com.py Project: realchief/Scraping_BeautifulSoup_phantomjs

 def handle_video(self, video):
     image = video.get('image_medium', video.get('thumb', None))
     if 'link' in video:
         # in some cases (local videos?) we get links directly.
         url = 'http://vk.com/' + video['link'],
         self.submit_parse_result(
             parse_url=video['player'],
             link_url=url,
             link_title=video['title'],
             image=image,
         )
     elif 'player' in video:
         # Others we need to go parse.
         self.submit_search_result(
             link_url=video['player'],
             link_title=video['title'],
             image=image,
         )
     else:
         raise ScraperParseException('Could not find info from %s' % video)

Example #20

0

Show file

File: pastebin_com.py Project: realchief/Scraping_BeautifulSoup_phantomjs

    def parse(self, parse_url, **extra):
        for soup in self.soup_each([
                parse_url,
        ]):
            # Use the text area for rawness
            textarea = soup.find('textarea', 'paste_code')
            # Appears to get different content based on user agent; fall back
            #  to div.text
            if not textarea:
                self.log.debug('Could not find textarea; using div.text')
                textarea = soup.find('div', 'text')

            if not textarea:
                return
                raise ScraperParseException('Could not find textarea or div.')
            for link in self.util.find_urls_in_text(textarea.text,
                                                    skip_images=True):
                if '.jpg' in link or '.png' in link or 'api.' in link:
                    continue
                self.submit_parse_result(
                    index_page_title=self.util.get_page_title(soup),
                    link_url=link)

Example #21

0

Show file

    def _extract_film(self, soup):
        for iframe in soup.select('.player iframe'):
            if 'src' in iframe.attrs:
                link = iframe['src']
            if 'data-src' in iframe.attrs:
                link = iframe['data-src']
            else:
                raise ScraperParseException('Invalid iframe.')
            # skip the first, ad iframe.
            if link.startswith(self.BASE_URL + '/ads') or not link:
                continue

            self._extract_video_page(
                link,
                index_page_title=self.util.get_page_title(soup),
                asset_type=ScraperBase.MEDIA_TYPE_FILM)
        for url_link in soup.select('a.p-episodio'):
            ttl = soup.find('h3','tt-filme').text
            # print url_link.find_previous('td', 'pt-titulo').text
            self._extract_video_page(
                url_link['href'],
                index_page_title=self.util.get_page_title(soup),
                asset_type=ScraperBase.MEDIA_TYPE_FILM,
                link_title = ttl+' '+url_link.find_previous('td', 'pt-titulo').text)

Example #22

0

Show file

    def _extract_downloads(self, url):
        links = []
        if not url:
            return []

        # Pull the id out of the url.
        srch = re.search('(\d+)/download.html', url)
        if not srch:
            return []
        video_id = srch.group(1)

        # This site has a session based captcha, which appears to be reusable.
        # Grab our php sess id and veify out of the cache and try to submit
        from sandcrawler.scraper.caching import cache
        import redis_cache

        try:
            phpsessid, verify = cache.get_pickle(self.VERIFY_CACHE_KEY)
        except (redis_cache.ExpiredKeyException,
                redis_cache.CacheMissException, TypeError):
            self.log.debug('Failed loading session and verify from cache.')
            phpsessid, verify = self._find_captcha()

        self._http_session.cookies.set('PHPSESSID',
                                       phpsessid,
                                       domain='yeuhd.net')

        response = self.post(u'{}/ajax/download'.format(self.BASE_URL),
                             data={
                                 'download[verify]': verify,
                                 'download[filmId]': video_id,
                             }).json()
        if not response['_fxStatus']:
            # Solve the captcha.
            phpsessid, verify = self._find_captcha()
            self._http_session.cookies.set('PHPSESSID',
                                           phpsessid,
                                           domain='yeuhd.net')
            response = self.post(u'{}/ajax/download'.format(self.BASE_URL),
                                 data={
                                     'download[verify]': verify,
                                     'download[filmId]': video_id,
                                 }).json()
            if not response['_fxStatus']:
                raise ScraperParseException('Failed to find captcha')

        soup = self.make_soup(response['_fxHtml'])
        for link in soup.select('a'):
            # Pull out (and later cache?) the id from this url.
            srch = re.search(
                'download-(\d+)\.html',
                link.href,
            )
            if srch:
                downloadid = srch.group(1)
                download_soup = self.post_soup(u'{}/ajax/download'.format(
                    self.BASE_URL),
                                               data={
                                                   'loadUrlDown': 1,
                                                   'episodeId': downloadid,
                                               })
                for link in download_soup.select('a'):
                    links.append(link.href)

        return links

Example #23

0

Show file

    def parse(self, parse_url, **extra):
        resp = self.get(parse_url)

        use_webdriver = False
        dupes = set()
        soup = self.make_soup(resp.content)

        # Parse url page will contain following type of script tag in head part of HTML.
        # <script src="../../exes/rlvo.php?i=2089"></script>
        srch = re.findall('/([0-9]{0,4})/', parse_url)
        if not srch:
            raise ScraperParseException('Could not find pattern for rlvo.php')
        p_id = srch[-1]

        url = 'http://www.latinomovies.net/temps/get_links.php'
        prev_url = parse_url
        render_page = self.post(url=url,
                                data={'p': p_id},
                                headers={
                                    'Referer': prev_url,
                                    'X-Requested-With': 'XMLHttpRequest'
                                })

        #gkpluginsphp("linkl",{link:"xlmx*!c77!BeF0c!c77!c976Ly8x4b4D0onml!c96xlmx*Gllci5!c96b20vP2VyeG!BeFoxlmx*nFtY2c{707.v2x}"});
        #gkpluginsphp("linkc",{link:"xlmx*!c77!BeF0c!c77!c976Ly8x4b4D0onml!c96xlmx*Gllci5!c96b20vP2044b4D0onTNtY!c96Y1!c0324{707.v2x}"});

        srch = re.finditer(r'\{link\:\"(.*?)\"\}\)\;', render_page.content)
        if not srch:
            raise ScraperParseException(
                'Could not find pattern for urls after fetching get_links.php')

        for match in srch:
            link_string = match.group(1)
            url = "http://www.latinomovies.net/views/ch_vd/plugins/gkpluginsphp.php"
            render_page = self.post(url=url,
                                    data={'link': link_string},
                                    headers={
                                        'Referer': prev_url,
                                        'X-Requested-With': 'XMLHttpRequest'
                                    })
            json_string = render_page.content
            srch = re.search(r'"link":"([^"]*)"', json_string)
            if srch:
                url = srch.group(1)
                url = url.replace('\\', '')
                if url not in dupes:
                    dupes.add(url)

        if dupes:
            for url in dupes:
                self.submit_parse_result(
                    index_page_title=self.util.get_page_title(soup),
                    link_url=url)
        else:
            use_webdriver = True
        if use_webdriver:
            self._parse_with_webdriver(parse_url, soup)

        # ------------------------------------------------------------------
        # ------- This part is for grabbing embedded player links ----------
        # ------------------------------------------------------------------

        # GET a URL based on that
        # http://www.latinomovies.net/exes/e.php?p=2437
        # http://www.latinomovies.net/exes/e.php?p=2089
        # http://www.latinomovies.net/exes/e.php?p=2436
        url = self.BASE_URL + '/exes/e.php?p=%s' % p_id
        resp = self.get(url)
        # var str = "
        srch = re.finditer(r'var\s*str\s*\=\s*\"(.*?)\"', resp.content)
        if not srch:
            raise ScraperParseException('Could not find pattern for rlvo.php')

        for match in srch:
            encoded_string = match.group(1)
            decoded_string = urllib.unquote(encoded_string)
            srch = re.search(r'src="([^"]*)"', decoded_string)
            if not srch:
                raise ScraperParseException(
                    'Could not find pattern for rlvo.php')
            url = srch.group(1)
            if "latinomovies" not in url:
                #self.submit_parse_result(index_page_title=soup.title.text.strip(), link_url=url)
                if url not in dupes:
                    dupes.add(url)

Example #24

0

Show file

File: urgrove_com.py Project: realchief/Scraping_BeautifulSoup_phantomjs

 def _parse_parse_page(self, soup):
     title = soup.select_one('#pagecontent h1').text
     raise ScraperParseException('TODO - captcha')