def _resolve_url(self, url):
    soup = self.get_soup(url)
    download_id_input = soup.find(
        'input', attrs={'name': 's', 'type': 'HIDDEN'})
    if not download_id_input:
        raise ScraperParseException('Could not find download id.')
    download_id = download_id_input['value']

    token = self.get_recaptcha_token()
    data = {
        'g-recaptcha-response': token,
        'action': 'Download',
        's': download_id,
        'newcap': 'true',
    }
    redirect_soup = self.post_soup(url, data=data)

    # Just grab the first form.
    download_form = redirect_soup.select_one('form')
    if not download_form:
        raise ScraperParseException('Could not find download form.')
    target = download_form['action']
    return self.get_redirect_location(target)
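# `get_redirect_location` is a framework helper referenced above but not
# defined in this section. A minimal sketch, assuming a requests-backed
# session stored on `self._http_session` (as used elsewhere in this repo):
# request without following redirects and return the Location header.
def get_redirect_location(self, url):
    response = self._http_session.get(url, allow_redirects=False)
    # 3xx responses carry the target in the Location header; anything
    # else means there was no redirect to resolve.
    if response.is_redirect:
        return response.headers.get('location')
    return None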
def _parse_parse_page(self, soup):
    # Grab the main frame.
    mainframe = soup.select_one('frame#mainFrame')
    if not mainframe:
        raise ScraperParseException('Failed to find frame')
    frame_soup = self.get_soup(mainframe['src'])
    if unicode(frame_soup).find('seems to be out of date or broken.') >= 0:
        self.log.warning('404 from search result.')
        return
    body = frame_soup.select('td.post2 div.postcolor')
    if not body:
        raise ScraperParseException('Could not find body')
    body = body[0]
    image = self.util.find_image_src_or_none(body, 'img')
    for link in self.util.find_urls_in_text(
            unicode(body),
            skip_images=True,
            skip_imdb=True,
            skip_youtube=True,
    ):
        self.submit_parse_result(
            index_page_title=soup.title.text.strip(),
            link_url=link,
            image=image,
        )
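# `self.util.find_urls_in_text` is a shared utility not shown in this
# section. A minimal sketch of what it appears to do, assuming it yields
# de-duplicated http(s) URLs from a text/HTML blob with optional filters
# (the exact filter behaviour is an assumption):
import re

def find_urls_in_text(text, skip_images=False, skip_imdb=False,
                      skip_youtube=False):
    seen = set()
    for match in re.finditer(r'https?://[^\s"\'<>]+', text):
        url = match.group(0)
        if url in seen:
            continue
        seen.add(url)
        if skip_images and re.search(r'\.(?:jpe?g|png|gif)(?:\?|$)', url):
            continue
        if skip_imdb and 'imdb.com' in url:
            continue
        if skip_youtube and ('youtube.com' in url or 'youtu.be' in url):
            continue
        yield url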
def _follow_iframes(self, url, depth=1):
    if depth > 5:
        raise ScraperParseException(
            'Reached recursion depth of 5 following '
            'iframe %s' % url)
    # Follow that and look for <iframe src="...">.
    iframe_source = self.get(url)
    matches = []
    mtch = re.search('iframe (id="ifr_view" )?src="(.*?)"',
                     iframe_source.text)
    if mtch:
        matches.append(mtch.group(2))
    mtch = re.search('iframe src="(.*?)"', iframe_source.text)
    if mtch:
        matches.append(mtch.group(1))
    if not matches:
        raise ScraperParseException('No iframe found on api')
    # Only the first match matters: absolute URLs are final, relative
    # ones get followed recursively.
    iframe_url = matches[0]
    if iframe_url.startswith('http'):
        return iframe_url
    return self._follow_iframes(
        'http://api.ekranka.tv/' + iframe_url, depth=depth + 1)
def get_captcha_links(self, parse_url):
    self.load_session_cookies()
    soup = self.get_soup(parse_url)
    if self._recaptcha_on_page(soup):
        # Solve the recaptcha and verify it before re-fetching the page.
        key = self.get_recaptcha_token()
        verify_response = self.get(
            'http://somosmovies.com/recaptcha/verify/{}/'.format(key),
            headers={
                'Referer': parse_url,
                'Accept': 'application/json, text/plain, */*',
            })
        if not verify_response.ok:
            raise ScraperParseException(
                'Got invalid response for recaptcha verify.')
        if not verify_response.json()['success']:
            raise ScraperParseException(
                'Got non-success response for recaptcha verify.')
        soup = self.get_soup(parse_url)
        if self._recaptcha_on_page(soup):
            raise ScraperParseException(
                'Recaptcha back on page after refresh.')
        self.save_session_cookies()

    index_page_title = self.util.get_page_title(soup)
    submitted_links = set()
    results = []
    for episodeblock in soup.find_all(
            'div', {'id': re.compile(r'^S\d+E\d+$')}):
        season, episode = \
            self.util.extract_season_episode(episodeblock['id'])
        for result in episodeblock.select('a.btn'):
            if result['href'] in submitted_links:
                continue
            submitted_links.add(result['href'])
            results.append(dict(
                link_url=result['href'],
                link_title=result.text.strip(),
                index_page_title=index_page_title,
                series_season=season,
                series_episode=episode,
            ))
    for result in soup.select('div.tab-links tr div.collapse a.btn'):
        if result['href'] in submitted_links:
            continue
        submitted_links.add(result['href'])
        results.append(dict(
            link_url=result['href'],
            link_title=result.text.strip(),
            index_page_title=index_page_title,
        ))
    return results
def _parse_parse_page(self, soup, serie_mode=False):
    title = soup.select_one('.icon-1').text.strip()
    found_items = []
    body = unicode(soup)
    # The episode list lives in an inline script as repeated
    # 'var ss = "<season>"; ...(...);' blocks, terminated by a Russian
    # comment ("Show some season" - kept verbatim as a search marker).
    ss_index = body.find('var ss = "')
    end_ss_section_index = body.find(u'// Показываем какой-то сезон')
    if ss_index > 0 and end_ss_section_index > 0:
        ss_script = body[ss_index:end_ss_section_index]
        for ss_section in ss_script.split('var ss = "'):
            if not ss_section:
                # Skip the first (empty) split.
                continue
            season = ss_section[:ss_section.find('"')]
            for episode in ss_section.split(');'):
                if not episode.strip():
                    # Skip the last (empty) split.
                    continue
                # Guard the regexes - .group(1) on a failed search
                # would raise AttributeError.
                id_match = re.search(r"'id': (\d+)", episode)
                tt_match = re.search(r"'tt': '(.*?)'", episode)
                if id_match and tt_match:
                    found_items.append(
                        (season, id_match.group(1), tt_match.group(1)))
    else:
        raise ScraperParseException('Could not find ss section.')

    index_page_title = self.util.get_page_title(soup)
    for season, id, tt in found_items:
        result = self._extract_result(season, id, tt)
        result['index_page_title'] = index_page_title
        result['link_title'] = title
        self.submit_parse_result(**result)
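# A small, self-contained demo of the season/episode extraction above, run
# against a reconstructed sample of the script blob (the exact shape of the
# pl(...) calls is an assumption based on the regexes):
import re

_sample = (
    'var ss = "1"; '
    "pl({'id': 101, 'tt': 'Episode 1'});"
    "pl({'id': 102, 'tt': 'Episode 2'});"
    'var ss = "2"; '
    "pl({'id': 201, 'tt': 'Episode 1'});"
)
_items = []
for _section in _sample.split('var ss = "'):
    if not _section:
        continue
    _season = _section[:_section.find('"')]
    for _chunk in _section.split(');'):
        _id = re.search(r"'id': (\d+)", _chunk)
        _tt = re.search(r"'tt': '(.*?)'", _chunk)
        if _id and _tt:
            _items.append((_season, _id.group(1), _tt.group(1)))
assert _items == [('1', '101', 'Episode 1'), ('1', '102', 'Episode 2'),
                  ('2', '201', 'Episode 1')]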
def _parse_parse_page(self, soup):
    for hidden_tag in soup.select('div.tag_hide script'):
        # Simple obfuscation - it does a document.write(HTML).
        # Extract that out, soup it, and find our links.
        match = re.match(
            r'.*document\.write\((.*)\);.*',
            hidden_tag.text
        )
        if match:
            html = match.group(1)
            hidden_soup = self.make_soup(html)
            for link in hidden_soup.select('a'):
                if link['href'].startswith('/lnk/'):
                    try:
                        response = self.get(self.BASE_URL + link['href'])
                    except ScraperFetchException:
                        self.log.error('Could not follow link: %s', link)
                    else:
                        self.submit_parse_result(
                            index_page_title=soup.title.text.strip(),
                            link_url=response.url,
                            link_title=link.text,
                        )
                else:
                    self.submit_parse_result(
                        index_page_title=soup.title.text.strip(),
                        link_url=link['href'],
                        link_title=link.text,
                    )
        else:
            raise ScraperParseException('Failed to extract hidden tags.')
def _parse_parse_page(self, soup):
    # First suck out the header so we can get season/ep.
    title = soup.find('h1', 'post-title').text
    season, episode = self._extract_season_episode(title)

    # Javascript callouts to
    # http://www.cucirca.eu/getvideo.php?id=193507&nr=20&time=1429763089910
    # where id and nr come from each link's onclick.
    # The time parameter doesn't appear to matter.
    for link in soup.findAll('a', attrs={'onclick': re.compile(r'^video\(')}):
        match = re.search(r'video\((\d+),(\d+),\d+\)', link['onclick'])
        if match:
            vid_num, vid_id = match.groups()
            video_soup = self.get_soup(
                'http://www.cucirca.eu/getvideo.php?id=%s&nr=%s'
                % (vid_id, vid_num))
            for iframe in video_soup.select('iframe'):
                self.submit_parse_result(
                    index_page_title=soup.title.text.strip(),
                    link_url=iframe['src'],
                    link_title=title,
                    series_season=season,
                    series_episode=episode,
                )
        else:
            raise ScraperParseException(
                'Failed to parse video onclick: %s' % link['onclick'])
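# `_extract_season_episode` is referenced above but not defined in this
# section. A minimal sketch, assuming post titles like
# "Show Name Season 3 Episode 7" (the title format is an assumption):
import re

def _extract_season_episode(self, title):
    match = re.search(r'Season\s+(\d+)\s+Episode\s+(\d+)', title, re.I)
    if not match:
        return None, None
    return int(match.group(1)), int(match.group(2))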
def search(self, search_term, media_type, **extra):
    category = {
        ScraperBase.MEDIA_TYPE_FILM: 3,
        ScraperBase.MEDIA_TYPE_GAME: 2,
        ScraperBase.MEDIA_TYPE_TV: 4,
        ScraperBase.MEDIA_TYPE_BOOK: 7,
    }
    category_id = category.get(media_type, '')
    self._load_cookies()
    tries = 0
    step = '{};{};0'.format(self.util.quote(search_term), category_id)
    while self.can_fetch_next():
        soup = self.post_soup(
            self.BASE_URL + '/res/suche',
            data={'Step': step},
        )
        if not soup:
            if tries > 2:
                raise ScraperParseException(
                    'Could not get search response after retrying.')
            # Stale cookies can produce an empty response; invalidate
            # and reload them before trying again.
            tries += 1
            cache.invalidate(self.cache_key)
            self._load_cookies()
        else:
            for link in soup.select('a'):
                text_block = link.select_one('div.tle')
                link_title = None
                if text_block:
                    link_title = text_block.text
                self.submit_parse_result(
                    link_url=self.BASE_URL + '/' + link.href,
                    link_title=link_title,
                    image=self.util.find_image_src_or_none(
                        link, 'img.lazy'))
def _parse_parse_page(self, soup):
    for serie_link in soup.select(
            '.panel-collapse.collapse.in .media .media-body .media-heading'
    ):
        video_url = serie_link.parent.href
        serie_soup = self.get_soup(video_url)
        # The page embeds a clickable image like:
        # <img id="gmodal1" data-token="MzMxMjc1MTA3NDMxOA=="
        #      onclick="ccc('MzMxMjc1MTA3NDMxOA==');" class="openw_old"
        #      style="cursor:pointer;margin-top:-165px;"
        #      title="Oglądaj odcinki online!"
        #      src="http://www.serialeonline.pl/assets/img/serial-odcinek_06.png">
        # which calls:
        #   function ccc(tk){
        #       var url = tk;
        #       $("#video").modal('show');
        #       $.get(JS_MAIN_DOMAIN + "ajax/em.php?did=" + url +
        #             "&trurl=1453233789569e967d25e87&w=0", function(data) {
        #           $('#modalem').html(data);
        #       }).success(function() {
        #           $(".embedlista li a").first().click();
        #       });
        # i.e. it GETs
        # http://www.serialeonline.pl/ajax/em.php?did=MzMxMjc1MTA3NDMxOA==&trurl=1453233877569e96d5d21af&w=0
        # It looks like we only need the did - the rest is just junk :)

        # Find the watch-me link.
        links = serie_soup.select('img#gmodal1')
        for link in links:
            token = link.attrs.get('data-token')
            if not token:
                raise ScraperParseException('Could not extract token.')
            url = self.BASE_URL + '/ajax/em.php?w=0&did=' + token
            links_soup = self.get_soup(url)
            for funnylnk in links_soup.select('dd.linkplayer'):
                onclick = funnylnk.attrs.get('onclick')
                if not onclick:
                    raise ScraperParseException(
                        'Could not extract onclick from dd.')
                season, episode = self.util.extract_season_episode(
                    video_url)
                for lnk in self.util.find_urls_in_text(onclick):
                    self.submit_parse_result(
                        index_page_title=soup.title.text.strip(),
                        link_url=lnk,
                        link_title=funnylnk.text,
                        series_season=season,
                        series_episode=episode,
                    )
def _parse_parse_page(self, soup):
    index_page_title = self.util.get_page_title(soup)
    for link in soup.select('div.download-link a'):
        if link.href.startswith('//') or link.href.startswith('http'):
            # It's a standalone link - submit it.
            href = link.href
            if href.startswith('//'):
                href = 'http:{}'.format(href)
            self.submit_parse_result(
                index_page_title=index_page_title,
                link_url=href,
                link_title=link.text,
            )
        else:
            url = self.BASE_URL + link.href
            soup = self.get_soup(url)
            if soup.select('#captchadiv'):
                # LOL - there's a skip-captcha link/function.
                # All it does is set the cookie, which is done in the
                # 'parse' function.
                raise ScraperParseException('Found captcha in page.')
                # An unfinished solvemedia-based solve path, kept for
                # reference:
                # self.log.debug('Captcha Found')
                # # Pull out the noscript version of the iframe.
                # iframe = None
                # for noscript in soup.select('noscript'):
                #     srch = re.search('<iframe src="(.*?)"')
                #     if srch:
                #         iframe = srch.group(1)
                #         break
                #
                # solve_soup = self.get(iframe)
                # image = solve_soup.select_one('img#adcopy-puzzle-image')
                # response = self.solve_captcha(image['src'])
                #
                # # iframe = soup.find('script',
                # #     src=re.compile('api-secure.solvemedia.com'))
                # # key = self.solve_captcha(iframe['src'])
                # soup = self.post_soup(
                #     url, data={'adcopy_response': key,
                #                'adcopy_challenge': '',
                #                'submit': 'Verify & Download'})
                # self.save_session_cookies()
            links = soup.select('a[href*="get-download.php"]')
            for link in links:
                download = self._follow_link(link.href)
                self.submit_parse_result(
                    index_page_title=index_page_title,
                    link_url=download,
                    link_title=link.text,
                )
def _extract_link(self, url):
    self.load_session_cookies()
    soup = self.get_soup(url)
    if soup.select('div.g-recaptcha'):
        # We need to solve a recaptcha.
        soup = self.post_soup(
            url,
            data={'g-recaptcha-response': self.get_recaptcha_token()})
        if 'Captcha incorrecto' in unicode(soup):
            raise ScraperParseException('Invalid captcha returned')
        self.save_session_cookies()
    # Otherwise, dig out the actual url.
    return list(
        self.util.find_urls_in_text(
            soup.select_one('div.tab_content').text))
def __fetch_movie(self, soup):
    # We have a base64 encoded string; the page's javascript is roughly:
    #   function a(data) { return unescape(atob(data)); }
    #   document.write(a('....'))
    movie_box = soup.select('div#m0 script')
    if not movie_box:
        raise ScraperParseException('Failed to find movie box')
    movie_box = movie_box[0]
    search = re.search(r"document.write\(a\('([^']*)'\)\)", movie_box.text)
    if search:
        data = search.group(1)
        # Make soup out of that and extract iframes :D
        content_soup = self.make_soup(self.__decode(data))
        self._parse_iframes(
            content_soup, asset_type=ScraperBase.MEDIA_TYPE_FILM)
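# `__decode` is not shown in this section. A minimal sketch mirroring the
# page's `unescape(atob(data))`, in Python 2 to match the rest of this file.
# Note this is an approximation: JS unescape() also handles %uXXXX escapes,
# which urllib.unquote does not.
import base64
import urllib

def __decode(self, data):
    # atob() is base64 decoding; unescape() undoes %XX escapes.
    return urllib.unquote(base64.b64decode(data))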
def parse(self, parse_url, **extra):
    soup = self.get_soup(parse_url)
    # Two ways this is done.
    # Movies just have a link.
    for link in soup.select('li.free a'):
        self.submit_parse_result(
            index_page_title=soup.title.text.strip(),
            link_url=link['href'],
            link_title=link.text,
        )
    for link in soup.select('li.premium a'):
        self.submit_parse_result(
            index_page_title=soup.title.text.strip(),
            link_url=link['href'],
            link_title=link.text,
        )
    # Or a dropdown of series; POST that to
    # /xhr/movies/episodes/*PAGENAME*/
    # and we get a JSON response with all our episodes.
    # Each episode value should be appended to
    # http://www.ecostream.tv/stream/
    season_select = soup.find('select', 'seasonselect')
    if season_select:
        pagematch = re.search(r'^%s/(.*)\.html' % self.BASE_URL, parse_url)
        if not pagematch:
            raise ScraperParseException(
                'Could not find name from %s' % parse_url)
        season_url = (self.BASE_URL + '/xhr/movies/episodes/' +
                      pagematch.group(1) + '/')
        for season in season_select.select('option'):
            # Skip the first (empty) option.
            if not season['value']:
                continue
            # This will 404 without the X-Requested-With header.
            resp = self.post(
                season_url,
                data={'season': season['value']},
                headers={'X-Requested-With': 'XMLHttpRequest'})
            episodes = resp.json()
            for episode in episodes['episodes']:
                link_url = ('http://www.ecostream.tv/stream/' +
                            episode['link'])
                self.submit_parse_result(
                    link_url=link_url,
                    link_title=episode['part'],
                    series_season=season['value'],
                    series_episode=episode['episode'])
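# For reference, the XHR response consumed above is assumed to be shaped
# like the following (field names taken from the code; values illustrative):
_example_episodes = {
    'episodes': [
        {'link': 'some-show-s01e01.html', 'part': 'Episode 1',
         'episode': '1'},
    ]
}
for _episode in _example_episodes['episodes']:
    print('http://www.ecostream.tv/stream/' + _episode['link'])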
def search(self, search_term, media_type, **extra):
    self.load_session_cookies()
    home_page = self.get(self.BASE_URL + '/').content
    if '/forum/login.php?do=login' in unicode(home_page):
        self._login()
        self.save_session_cookies()
        home_page = self.get(self.BASE_URL + '/').content
    home_soup = self.make_soup(home_page)
    security_token = home_soup.find(
        'input', {'name': 'securitytoken'})['value']
    tries = 0
    while tries < 5:
        tries += 1
        page_results = self.post_soup(
            self.BASE_URL + '/search.php?do=process',
            data={
                'securitytoken': security_token,
                'do': 'process',
                'q': search_term,
            })
        if 'This forum requires that you wait' in str(page_results):
            time_to_wait = int(
                str(page_results).split('again in ')[1]
                .split(' seconds')[0]) + random.randint(2, 15)
            self.log.warning('Got wait message - waiting %s', time_to_wait)
            time.sleep(time_to_wait)
        elif ('The server is too busy at the moment. '
              'Please try again later.') in str(page_results):
            raise ScraperParseException(
                'Got "The server is too busy at the moment. '
                'Please try again later."')
        else:
            self._parse_results_page(page_results)
            while self.can_fetch_next():
                next_page = page_results.select_one('a[rel="next"]')
                if next_page:
                    page_results = self.get_soup(
                        self.BASE_URL + '/' + next_page.href)
                    self._parse_results_page(page_results)
                else:
                    break
            # Search succeeded - don't re-submit it on the next retry
            # iteration.
            break
def _fetch_new_access_token(self):
    # Get the auth url.
    url = 'https://oauth.vk.com/authorize?client_id=' \
          '4583728&scope=video&redirect_uri=http://api.vk.com/blank.html' \
          '&response_type=token&v=5.25'
    response = self.get(url)

    # Bs4 is being a hog here - it's not finding the form on a full
    # parse of the page... suck out the form manually and soup it.
    start_index = response.content.find('<form')
    end_index = response.content.find('</form>')
    if start_index < 0 or end_index < 0:
        raise ScraperParseException('Could not find form.')
    login_page_soup = self.make_soup(
        response.content[start_index:end_index + 7])
    form = login_page_soup.select_one('form')
    data = {'email': self.USERNAME, 'pass': self.PASSWORD}
    for input in form.select('input'):
        if 'name' in input.attrs and input['name'] not in ('email', 'pass'):
            data[input['name']] = input.get('value', None)
    response = self.post(form['action'], data=data)
    if not response.ok:
        raise ScraperAuthException('Failed to login.')
    if response.content.find(
            'In order to confirm ownership of the page') >= 0 or \
            response.content.find('account from an unusual place') >= 0:
        self.log.info('Found confirmation check...')
        phone_soup = self.make_soup(response.content)
        form = phone_soup.select_one('form')
        response = self.post(
            'https://m.vk.com' + form['action'],
            data={
                'code': '4095292',
                'submit': 'Confirm',
            })

    # Should end up with a URL similar to:
    # http://api.vk.com/blank.html#access_token=9ac82b801aa3ac62b1b33e2475f7e09d5217e5252d0540a92fa01d7e230139e294c9a17c844ca2ffab2c6&expires_in=86400&user_id=258191591
    parsed_url = urlparse.urlparse(response.url)
    # Fragment is the bit after the hash; it's formatted like a qs.
    parsed_qs = urlparse.parse_qs(parsed_url.fragment)
    if 'access_token' not in parsed_qs:
        self.log.error('No token in %s -> %s', response.url, parsed_qs)
        raise ScraperAuthException('Could not find access token in url')
    return parsed_qs['access_token'][0]
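# A quick, self-contained illustration of the fragment parsing above
# (Python 2 urlparse, matching this file; the token value is made up):
import urlparse

_example = ('http://api.vk.com/blank.html'
            '#access_token=abc123&expires_in=86400&user_id=1')
_fragment_qs = urlparse.parse_qs(urlparse.urlparse(_example).fragment)
assert _fragment_qs['access_token'][0] == 'abc123'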
def _parse_parse_page(self, soup, depth=0):
    content = str(soup)
    season, episode = None, None
    # Get the first video. Onload the page does:
    #   var idseries="74"; var season="07"; var episode="10";
    #   loadplayer(idseries, season, episode)
    # where
    #   loadplayer(b,e,a) {$.get(badress+ p_v+ b+"-"+ e+"_"+ a,
    #       function(a){$("#cont_player").html(a)})}
    # badress and p_v are static.
    srch = re.search(
        'var idseries="(.*?)"; var season="(.*)"; var episode="(.*?)"',
        content)
    if srch:
        season = srch.group(2)
        episode = srch.group(3)
        get_url = (self.BASE_URL + '/play/plvids' +
                   srch.group(1) + '-' +
                   srch.group(2) + '_' +
                   srch.group(3))
        vid_soup = self.get_soup(get_url)
        for iframe in vid_soup.select('iframe'):
            self.submit_parse_result(
                index_page_title=soup.title.text.strip(),
                link_url=iframe['src'],
                series_season=season,
                series_episode=episode,
            )
    else:
        raise ScraperParseException(
            'Could not find id/series/season in javascript.')

    # Find each of the other videos, activated via e.g.
    #   ("#morevideo1").click(function(){
    #       morurlvid('DwicBN9mlQbTxr8rLHAIowT7PyUc2Rx8b7ponXdyPy7r44LuQDDFvERQXKQVaZVMl5mTyjtuP2FJMVboBbHd4w,,',this);
    for activation_link in re.findall(
            r"morurlvid\('(.*?)',this\)",
            str(soup),
    ):
        new_url = self.BASE_URL + '/play/mvideo_' + activation_link
        link = self.get_redirect_location(new_url)
        if link:
            self.submit_parse_result(
                index_page_title=soup.title.text.strip(),
                link_url=link,
                series_season=season,
                series_episode=episode,
            )
def _extract_link(self, iframe_link):
    self.load_session_cookies()
    soup = self.get_soup(iframe_link)
    if soup.select('div.g-recaptcha'):
        soup = self.post_soup(
            'http://player.dizigold1.com/control',
            data={'g-recaptcha-response': self.get_recaptcha_token()})
        if 'Captcha incorrecto' in unicode(soup):
            raise ScraperParseException('Invalid captcha returned')
        self.save_session_cookies()
    link = soup.select_one('iframe')
    if link:
        link = link['src']
        return link
    else:
        player_script_text = soup.select_one('div#player').find_next(
            'script').text
        return filter(
            None, list(self.util.find_file_in_js(player_script_text)))
def _parse_search_result_page(self, soup):
    # Ugly html...
    # Find dle-content's parent, then find all tables with an ntitle.
    dle_content = soup.select_one('div#dle-content')
    if not dle_content:
        raise ScraperParseException('Could not find dle-content')
    for table in dle_content.parent.children:
        if table.name != 'table':
            continue
        ntitle = table.select_one('span.ntitle')
        if not ntitle:
            continue
        readmore = table.find('strong', text='Read More')
        if not readmore:
            continue
        link = readmore.parent
        self.submit_search_result(
            link_url=link['href'],
            link_title=ntitle.text,
        )
def handle_video(self, video):
    image = video.get('image_medium', video.get('thumb', None))
    if 'link' in video:
        # In some cases (local videos?) we get links directly.
        # Note: no trailing comma here - that would turn url into a
        # one-element tuple.
        url = 'http://vk.com/' + video['link']
        self.submit_parse_result(
            parse_url=video['player'],
            link_url=url,
            link_title=video['title'],
            image=image,
        )
    elif 'player' in video:
        # Others we need to go parse.
        self.submit_search_result(
            link_url=video['player'],
            link_title=video['title'],
            image=image,
        )
    else:
        raise ScraperParseException('Could not find info from %s' % video)
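# The trailing comma removed above is a classic Python pitfall - it builds
# a one-element tuple instead of a string. A tiny illustration:
_url = 'http://vk.com/' + 'video123',
assert _url == ('http://vk.com/video123',)  # tuple, not str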
def parse(self, parse_url, **extra):
    for soup in self.soup_each([parse_url]):
        # Use the text area for rawness.
        textarea = soup.find('textarea', 'paste_code')
        # Appears to get different content based on user agent; fall back
        # to div.text.
        if not textarea:
            self.log.debug('Could not find textarea; using div.text')
            textarea = soup.find('div', 'text')
            if not textarea:
                raise ScraperParseException(
                    'Could not find textarea or div.')
        for link in self.util.find_urls_in_text(
                textarea.text, skip_images=True):
            if '.jpg' in link or '.png' in link or 'api.' in link:
                continue
            self.submit_parse_result(
                index_page_title=self.util.get_page_title(soup),
                link_url=link)
def _extract_film(self, soup):
    for iframe in soup.select('.player iframe'):
        # Prefer data-src (lazy-loaded), fall back to src, and only
        # raise when neither attribute is present.
        if 'data-src' in iframe.attrs:
            link = iframe['data-src']
        elif 'src' in iframe.attrs:
            link = iframe['src']
        else:
            raise ScraperParseException('Invalid iframe.')
        # Skip the first, ad iframe.
        if not link or link.startswith(self.BASE_URL + '/ads'):
            continue
        self._extract_video_page(
            link,
            index_page_title=self.util.get_page_title(soup),
            asset_type=ScraperBase.MEDIA_TYPE_FILM)
    for url_link in soup.select('a.p-episodio'):
        ttl = soup.find('h3', 'tt-filme').text
        self._extract_video_page(
            url_link['href'],
            index_page_title=self.util.get_page_title(soup),
            asset_type=ScraperBase.MEDIA_TYPE_FILM,
            link_title=ttl + ' ' +
            url_link.find_previous('td', 'pt-titulo').text)
def _extract_downloads(self, url):
    links = []
    if not url:
        return []
    # Pull the id out of the url.
    srch = re.search(r'(\d+)/download\.html', url)
    if not srch:
        return []
    video_id = srch.group(1)

    # This site has a session based captcha, which appears to be reusable.
    # Grab our php sess id and verify value out of the cache and try to
    # submit with those first.
    from sandcrawler.scraper.caching import cache
    import redis_cache
    try:
        phpsessid, verify = cache.get_pickle(self.VERIFY_CACHE_KEY)
    except (redis_cache.ExpiredKeyException,
            redis_cache.CacheMissException,
            TypeError):
        self.log.debug('Failed loading session and verify from cache.')
        phpsessid, verify = self._find_captcha()

    self._http_session.cookies.set(
        'PHPSESSID', phpsessid, domain='yeuhd.net')
    response = self.post(
        u'{}/ajax/download'.format(self.BASE_URL),
        data={
            'download[verify]': verify,
            'download[filmId]': video_id,
        }).json()
    if not response['_fxStatus']:
        # The cached session was rejected - solve the captcha afresh.
        phpsessid, verify = self._find_captcha()
        self._http_session.cookies.set(
            'PHPSESSID', phpsessid, domain='yeuhd.net')
        response = self.post(
            u'{}/ajax/download'.format(self.BASE_URL),
            data={
                'download[verify]': verify,
                'download[filmId]': video_id,
            }).json()
        if not response['_fxStatus']:
            raise ScraperParseException('Failed to find captcha')

    soup = self.make_soup(response['_fxHtml'])
    for link in soup.select('a'):
        # Pull out (and later cache?) the id from this url.
        srch = re.search(r'download-(\d+)\.html', link.href)
        if srch:
            downloadid = srch.group(1)
            download_soup = self.post_soup(
                u'{}/ajax/download'.format(self.BASE_URL),
                data={
                    'loadUrlDown': 1,
                    'episodeId': downloadid,
                })
            for link in download_soup.select('a'):
                links.append(link.href)
    return links
def parse(self, parse_url, **extra):
    resp = self.get(parse_url)
    use_webdriver = False
    dupes = set()
    soup = self.make_soup(resp.content)
    # The parse url page contains this type of script tag in the head of
    # the HTML:
    # <script src="../../exes/rlvo.php?i=2089"></script>
    srch = re.findall('/([0-9]{0,4})/', parse_url)
    if not srch:
        raise ScraperParseException('Could not find page id in parse url')
    p_id = srch[-1]
    url = 'http://www.latinomovies.net/temps/get_links.php'
    prev_url = parse_url
    render_page = self.post(
        url=url,
        data={'p': p_id},
        headers={
            'Referer': prev_url,
            'X-Requested-With': 'XMLHttpRequest'
        })
    # The response contains calls like:
    # gkpluginsphp("linkl",{link:"xlmx*!c77!BeF0c!c77!c976Ly8x4b4D0onml!c96xlmx*Gllci5!c96b20vP2VyeG!BeFoxlmx*nFtY2c{707.v2x}"});
    # gkpluginsphp("linkc",{link:"xlmx*!c77!BeF0c!c77!c976Ly8x4b4D0onml!c96xlmx*Gllci5!c96b20vP2044b4D0onTNtY!c96Y1!c0324{707.v2x}"});
    # Note: re.finditer always returns a (truthy) iterator, so the original
    # emptiness check never fired; materialise to a list before checking.
    matches = list(re.finditer(r'\{link\:\"(.*?)\"\}\)\;',
                               render_page.content))
    if not matches:
        raise ScraperParseException(
            'Could not find pattern for urls after fetching get_links.php')
    for match in matches:
        link_string = match.group(1)
        url = ('http://www.latinomovies.net/views/ch_vd/plugins/'
               'gkpluginsphp.php')
        render_page = self.post(
            url=url,
            data={'link': link_string},
            headers={
                'Referer': prev_url,
                'X-Requested-With': 'XMLHttpRequest'
            })
        json_string = render_page.content
        srch = re.search(r'"link":"([^"]*)"', json_string)
        if srch:
            url = srch.group(1).replace('\\', '')
            if url not in dupes:
                dupes.add(url)
    if dupes:
        for url in dupes:
            self.submit_parse_result(
                index_page_title=self.util.get_page_title(soup),
                link_url=url)
    else:
        use_webdriver = True
    if use_webdriver:
        self._parse_with_webdriver(parse_url, soup)

    # ------------------------------------------------------------------
    # ------- This part is for grabbing embedded player links ----------
    # ------------------------------------------------------------------
    # GET a URL based on the page id, e.g.
    # http://www.latinomovies.net/exes/e.php?p=2437
    # http://www.latinomovies.net/exes/e.php?p=2089
    # http://www.latinomovies.net/exes/e.php?p=2436
    url = self.BASE_URL + '/exes/e.php?p=%s' % p_id
    resp = self.get(url)
    # Each player is written out from an encoded 'var str = "..."' blob.
    matches = list(re.finditer(r'var\s*str\s*\=\s*\"(.*?)\"',
                               resp.content))
    if not matches:
        raise ScraperParseException('Could not find var str in e.php')
    for match in matches:
        encoded_string = match.group(1)
        decoded_string = urllib.unquote(encoded_string)
        srch = re.search(r'src="([^"]*)"', decoded_string)
        if not srch:
            raise ScraperParseException(
                'Could not find src in decoded e.php player')
        url = srch.group(1)
        if 'latinomovies' not in url and url not in dupes:
            dupes.add(url)
            self.submit_parse_result(
                index_page_title=self.util.get_page_title(soup),
                link_url=url)
def _parse_parse_page(self, soup):
    # Unfinished: the page is captcha-gated and the solve flow hasn't been
    # written yet.
    title = soup.select_one('#pagecontent h1').text
    raise ScraperParseException('TODO - captcha')