if outside == 1:
    skipCheck2 = 0
else:
    if external == 0:
        skipCheck2 = 0
    else:
        skipCheck2 = 1
if (skipCheck == -1) and (skipCheck2 == 0):
    conn.request("GET", page)
    code = conn.getresponse()  # read response code
    src = code.read()
    src = str(src)
    flist = l.split('.')
    ftype = flist[-1]
    imageCheck = imageTypes.find(ftype)
    links = SoupStrainer('a')   # grab all anchors
    imgs = SoupStrainer('img')  # grab all img elements
    if imageCheck == -1:
        bs = BeautifulSoup(src, parseOnlyThese=links)  # parse for anchors
    if imageCheck == -1:
        print "Crawling\t", l, "\t", code.status
    # loop through all of the anchors found on the page
    # crawler only records the FIRST time it finds a link. If a link is on 20 pages
    # it will still only show up once in the log.
    for j in bs.findAll('a', {'href': True}):
        testresult = 0
        absUrl = urlparse.urljoin(l, j['href'])
        # check for javascript/mailto
        checkAbs = absUrl.split(':')
        checkAbs = checkAbs[0]
        checkAbs = checkAbs.strip()
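# Hedged sketch of the SoupStrainer anchor-scraping pattern the crawler
# fragment above relies on, reduced to a self-contained helper. Assumes
# BeautifulSoup 3 (the parseOnlyThese keyword) and Python 2 like the rest
# of these snippets; the function name is ours, not the crawler's.
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup, SoupStrainer

def absolute_anchor_urls(page_url):
    '''Yield the absolute URL of every href-bearing anchor on page_url.'''
    src = urllib2.urlopen(page_url).read()
    links = SoupStrainer('a')  # parse only <a> tags, not the whole tree
    for a in BeautifulSoup(src, parseOnlyThese=links).findAll('a', {'href': True}):
        yield urlparse.urljoin(page_url, a['href'])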
def __init__(self, params):
    import re
    from addon import Addon
    from addondict import AddonDict as XBMCDict
    from BeautifulSoup import BeautifulSoup, SoupStrainer, Comment
    a = Addon()
    site = self.__module__
    mode = params['mode']
    home_url = 'http://xtheatre.net/'
    search_url = home_url + '?s='
    false_positives = ['http://watchxxxhd.net/watch-full-movies-hd/',
                       'http://watchxxxhd.net',
                       'http://watchxxxhd.net/category/movies/',
                       'http://watchxxxhd.net/category/ategorized222/',
                       'http://watchxxxhd.net/watch-full-movies-hd/']
    if mode == 'main':
        item_list = [{'site': site, 'mode': 'list', 'title': a.language(30006),
                      'content': '', 'url': home_url + '?filtre=date&cat=0',
                      'cover_url': a.image('all.png', image),
                      'backdrop_url': a.art(), 'type': 3},
                     {'site': site, 'mode': 'categories', 'title': a.language(30005),
                      'content': '', 'url': home_url + 'categories/',
                      'cover_url': a.image('categories.png', image),
                      'backdrop_url': a.art(), 'type': 3},
                     {'site': site, 'mode': 'list', 'title': a.language(30004),
                      'content': 'search', 'url': search_url,
                      'cover_url': a.image('search.png', image),
                      'backdrop_url': a.art(), 'type': 3}]
        item_list.extend(a.favs_hist_menu(site))
        item_list.extend(a.extended_menu())
        a.add_items(item_list)
        a.end_of_directory()
    elif mode == 'categories':
        html = a.get_page(params['url'])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('ul', {'class': 'listing-cat'}))
        item_list = []
        if soup:
            for item in soup.findAll('li'):
                if item:
                    if item.a.get('href') not in false_positives:
                        try:
                            vidcount = item.findAll('span', {'class': 'nb_cat border-radius-5'})[0].string.encode('UTF-8')
                            vidcount = re.sub('\svideo[s]*', '', vidcount)
                        except:
                            vidcount = '0'
                        if vidcount and vidcount != '0':
                            img = item.find('img')
                            if img:
                                try:
                                    img = img.get('data-lazy-src')
                                except:
                                    try:
                                        img = img.get('src')
                                    except:
                                        img = ''
                            if not img:
                                img = ''
                            title = item.a.get('title').encode('UTF-8') + ' (%s)' % vidcount
                            item_list.extend([{'site': site, 'mode': 'list',
                                               'url': item.a.get('href'),
                                               'content': '', 'title': title,
                                               'cover_url': a.image(img, image),
                                               'backdrop_url': a.art(), 'type': 3}])
        a.add_items(item_list)
        a.end_of_directory()
    elif mode == 'list':
        if params.get('content', '') == 'search':
            item = a.search_input()
            if item:
                params['url'] = search_url + item
            else:
                exit(1)
        elif params.get('content', '') == 'goto':
            last_item = re.search('/page/([0-9]+)/', params['url'])
            if last_item:
                last_item = int(last_item.group(1))
            else:
                last_item = 10000
            item = a.page_input(last_item)
            if item:
                params['url'] = re.sub('/page/[0-9]+/', '/page/' + str(item) + '/', params['url'])
            else:
                exit(1)
        html = a.get_page(params['url'])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('ul', {'class': 'listing-videos listing-extract'}))
        item_list = []
        params['mode'] = 'play'
        params['content'] = 'movies'
        params['type'] = 0
        params['context'] = 0
        params['duration'] = '7200'
        if soup:
            xbmcdict = XBMCDict(0).update(params)
            for item in soup.findAll('li', {'class': 'border-radius-5 box-shadow'}):
                if item:
                    if item.a.get('href') not in false_positives:
                        _dict = xbmcdict.copy()
                        _dict['url'] = item.a.get('href')
                        _dict['title'] = item.a.get('title').encode('UTF-8')
                        _dict['tvshowtitle'] = _dict['title']
                        _dict['originaltitle'] = _dict['title']
                        img = item.find('img')
                        if img:
                            try:
                                img = img.get('data-lazy-src')
                            except:
                                try:
                                    img = img.get('src')
                                except:
                                    img = ''
                        if not img:
                            img = ''
                        _dict['cover_url'] = a.image(img)
                        _dict['thumb_url'] = _dict['cover_url']
                        _dict['poster'] = _dict['cover_url']
                        _dict['sub_site'] = site
                        plot = item.find('div', {'class': 'right'})
                        if plot:
                            plot = plot.p.contents[0].encode('utf-8')
                            _dict['plot'] = plot
                            _dict['plotoutline'] = plot
                        item_list.extend([_dict])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('div', {'class': 'pagination'}))
        last_item = False
        if soup:
            for item in soup.findAll('a'):
                if (item.string.encode('UTF-8') == 'Last »') or (item.get('class') == 'last'):
                    last_item = item.get('href')
                    break
            if last_item is False:
                for last_item in soup.findAll('a', {'class': 'inactive'}):
                    pass
                if last_item:
                    last_item = last_item.get('href')
            item = soup.find('span', {'class': 'current'})
            if item:
                if item.parent:
                    item = item.parent
                    if item.previousSibling:
                        if item.previousSibling.find('a'):
                            item_list.extend([{'site': site, 'mode': 'list',
                                               'url': item.previousSibling.a.get('href'),
                                               'content': params['content'],
                                               'title': a.language(30017, True),
                                               'cover_url': a.image('previous.png', image),
                                               'backdrop_url': a.art(), 'type': 3}])
                    if item.nextSibling:
                        if item.nextSibling.find('a'):
                            item_list.extend([{'site': site, 'mode': 'list',
                                               'url': item.nextSibling.a.get('href'),
                                               'content': params['content'],
                                               'title': a.language(30018, True),
                                               'cover_url': a.image('next.png', image),
                                               'backdrop_url': a.art(), 'type': 3}])
            if last_item:
                item_list.extend([{'site': site, 'mode': 'list', 'url': last_item,
                                   'content': 'goto',
                                   'title': a.language(30019, True),
                                   'cover_url': a.image('goto.png', image),
                                   'backdrop_url': a.art(), 'type': 3}])
        a.add_items(item_list)
        a.end_of_directory()
    elif mode == 'play':
        html = a.get_page(params['url'])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('div', {'class': 'video-embed'}))
        item_list = []
        if soup:
            for script in soup.findAll(re.compile('s_*c_*r_*i_*p_*t')):
                item = ''
                if script.get('src'):
                    if 'http://videomega.tv/validatehash.php' in script['src']:
                        item = script['src']
                    elif 'ref=' in script.get('src'):
                        temp = re.search('.*ref=[\'"](.+?)[\'"]', script.get('src'))
                        if temp:
                            item = 'http://videomega.tv/iframe.php?ref=' + temp.group(1)
                xbmcdict = XBMCDict(0).update(params)
                if item:
                    _dict = xbmcdict.copy()
                    _dict['url'] = item
                    item_list.extend([_dict])
            if soup.find('iframe', src=True):
                item = ''
                for iframe in soup.findAll('iframe', src=True):
                    if iframe.get('data-lazy-src'):
                        item = iframe.get('data-lazy-src')
                        r = re.search('.+old=(.+)$', item)
                        if r:
                            item = r.group(1)
                    else:
                        item = iframe.get('src').replace('\\', '')
                    xbmcdict = XBMCDict(0).update(params)
                    if item:
                        _dict = xbmcdict.copy()
                        _dict['url'] = item
                        item_list.extend([_dict])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('div', {'id': 'video-infos'}))
        if soup:
            item = ''
            for p in soup.findAll('p'):
                if p.iframe:
                    item = p.iframe.get('src')
                xbmcdict = XBMCDict(0).update(params)
                if item:
                    _dict = xbmcdict.copy()
                    _dict['url'] = item
                    item_list.extend([_dict])
        if item_list:
            from playback import Playback
            Playback().choose_sources(item_list)
        else:
            a.alert(a.language(30904, True), sound=False)
def scrapeShowsGrid(self, html, params={}):
    get = params.get
    params["folder"] = "true"
    if self.__dbg__:
        print self.__plugin__ + " scrapeShowsGrid"
    next = "false"
    pager = SoupStrainer(name="div", attrs={'class': "yt-uix-pager"})
    pagination = BeautifulSoup(html, parseOnlyThese=pager)
    if len(pagination) > 0:
        tmp = str(pagination)
        if tmp.find("Next") > 0:
            next = "true"
    # Now look for the shows in the list.
    list = SoupStrainer(name="div", attrs={"class": "popular-show-list"})
    shows = BeautifulSoup(html, parseOnlyThese=list)
    yobjects = []
    status = 200
    if len(shows) > 0:
        show = shows.div.div
        while show != None:
            if show.a:
                item = {}
                episodes = show.find(name="div", attrs={'class': "show-extrainfo"})
                title = show.div.h3.contents[0]
                if episodes and episodes.span:
                    title = title + " (" + episodes.span.contents[0].lstrip().rstrip() + ")"
                title = title.replace("&amp;", "&")
                item['Title'] = title
                show_url = show.a["href"]
                if show_url.find("?p=") > 0:
                    show_url = show_url[show_url.find("?p=") + 1:]
                else:
                    show_url = show_url.replace("/show/", "")
                show_url = urllib.quote_plus(show_url)
                item['show'] = show_url
                item['icon'] = "shows"
                item['scraper'] = "show"
                thumbnail = show.a.span.img['src']
                if thumbnail.find("_thumb.") > 0:
                    thumbnail = thumbnail.replace("_thumb.", ".")
                else:
                    thumbnail = "shows"
                item["thumbnail"] = thumbnail
                if self.__dbg__:
                    print self.__plugin__ + " adding show " + repr(item['Title']) + ", url: " + repr(item['show'])
                yobjects.append(item)
            show = show.findNextSibling(name="div", attrs={'class': re.compile("show-cell .")})
    if not yobjects:
        return (self.__language__(30601), 303)
    yobjects[len(yobjects) - 1]["next"] = next
    return (yobjects, status)
def ParsePlotPage(self, id):
    resp = urllib.urlopen(self.story_url % id)
    strain = SoupStrainer("div", {"id": "synopsis"})
    soup = BeautifulSoup(resp.read(), strain, fromEncoding="utf-8")
    plot = ''.join(soup.find('div', {"class": "txt"}).findAll(text=True))
    self.meta.m_plot = plot.strip().replace('\r', '')
if not os.path.exists(DL_DIR):
    os.makedirs(DL_DIR)

URL_BIB = 'http://www.google.com/googlebooks/uspto-patents-grants-biblio.html'
DBSERVER = '127.0.0.1'
USERNAME = '******'
PASSWORD = '******'
DATABASE = 'PatentTools'


def bib_urls():
    response = urllib2.urlopen(URL_BIB).read()
    urls = []
    for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
        if link.has_key('href') and link['href'].find('.zip') != -1:
            urls.append(link['href'])
    return urls


def download_next_bib(n=1):
    # Create the needed tables if they don't exist
    create_retrieve_tables()
    # Get a list of files we've already downloaded
    con = mdb.connect(DBSERVER, USERNAME, PASSWORD, DATABASE)
    cur = con.cursor()
    cur.execute('SELECT url from files_retrieved')
    dl_urls = [f[0] for f in cur.fetchall()]
def get_links(html):
    links = BeautifulSoup(html, parseOnlyThese=SoupStrainer('a'))
    for link in links:
        yield link
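# Possible usage of get_links() above (our example, not part of the original
# module): fetch a page with urllib2 and print each anchor's href, skipping
# anchors that have none. BeautifulSoup 3 tags support has_key().
import urllib2

html = urllib2.urlopen('http://example.com/').read()
for link in get_links(html):
    if link.has_key('href'):
        print link['href']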
def main():
    ''' <Perpetual Loop Auto-submit business> '''
    list_home = os.getcwd()
    readyListFileName = "ready_list.txt"
    lockFileName = readyListFileName + ".lck"
    ### Exit if last list still pending, wait for it to be renamed/removed.
    if os.access(readyListFileName, os.F_OK) is True:
        print('ABORT: %s exists (Not picked up yet? Should be renamed'
              ' when retrieved by auto_submit loop!)' % readyListFileName)
        if os.access(lockFileName, os.F_OK) is True:
            os.remove(lockFileName)
        exit(0)
    ### If lock file exists, another process is already generating the list
    if os.access(lockFileName, os.F_OK) is True:
        print('ABORT: %s lockfile exists (Another process generating list'
              ' already? Should be deleted when complete!)' % lockFileName)
        exit(0)
    ### Touch a lock and list file.
    touchLi = open(readyListFileName, 'wb')
    touchLi.write('')
    touchLi.close()
    touchLo = open(lockFileName, 'wb')
    touchLo.write('')
    touchLo.close()
    ''' <Perpetual Loop Auto-submit business /> '''
    mkdir('/1/incoming/tmp/wiki-dumps')
    home = os.getcwd()
    # Get links for every Wiki Directory
    # (i.e. aawiki/,aawikibooks/,aawiktionary/,etc.).
    url = 'http://wikipedia.c3sl.ufpr.br/'
    indexHTML = urllib2.urlopen(url).read()
    wikiList = BeautifulSoup(indexHTML,
                             parseOnlyThese=SoupStrainer('a', href=re.compile('wik')))
    for link in wikiList:
        flogger.info('Downloading: %s' % link['href'])
        # Get links for the most recent dump in every Wiki Directory.
        # (i.e. 20110901/,20110908/,20111010/,etc.)
        itemHTML = urllib2.urlopen(url + link['href']).read()
        dirStrainer = SoupStrainer('a', href=re.compile('20'))
        print dirStrainer
        dirLinks = ([tag for tag in BeautifulSoup(itemHTML, parseOnlyThese=dirStrainer)][-1])
        for itemDIR in dirLinks:
            identifier = ("%s-%s" % (link['href'].strip('/'), itemDIR))
            mkdir(identifier)
            makeMeta(identifier)
            # Get links for every file in dump directory
            # (i.e. pages-logging.xml.gz,pages-articles.xml.bz2,etc.)
            dirHTML = urllib2.urlopen(url + link['href'] + itemDIR).read()
            dirLinks = BeautifulSoup(dirHTML,
                                     parseOnlyThese=SoupStrainer('a', href=re.compile(link['href'])))
            for dumpFile in dirLinks:
                dirURL = url + dumpFile['href'].strip('/')
                fname = dumpFile['href'].split('/')[-1]
                flogger.info('Downloading: %s' % dirURL)
                wget = 'wget -c %s' % dirURL
                execute = call(wget, shell=True)
            os.chdir(home)
    os.chdir(list_home)
    dataList = os.listdir(home)
    f = open(readyListFileName, 'wb')
    f.write('\n'.join(dataList))
    f.close()
    ### Remove lock file...
    os.remove(lockFileName)
    flogger.info('YOU HAVE SO MUCH WIKI!')
def parseHtmlParams(currentURL, htmlContent):
    global database, database_css, database_js
    """ Parse html to get args """
    for url in database_url:
        k = url.find('?')
        if k > 0:
            keyUrl = url[0:k - 1]
            query = url[k + 1:]
            if not keyUrl in database:
                database[keyUrl] = {}
                database[keyUrl]['GET'] = {}
                database[keyUrl]['POST'] = {}
            lG = database[keyUrl]['GET']
            lG = dict_add(lG, splitQuery(query))
            database[keyUrl]['GET'] = lG
        elif len(dumb_params) > 0:
            keyUrl = url
            # no params in the URL... let's assign the dumb_params
            if not keyUrl in database:
                database[keyUrl] = {}
                database[keyUrl]['GET'] = {}
                database[keyUrl]['POST'] = {}
            lG = database[keyUrl]['GET']
            lP = database[keyUrl]['POST']
            lG = dict_add_list(lG, dumb_params)
            lP = dict_add_list(lP, dumb_params)
            database[keyUrl]['GET'] = lG
            database[keyUrl]['POST'] = lP
    # then, parse the forms
    forms = SoupStrainer('form')
    input = SoupStrainer('input')
    listForm = [tag for tag in BeautifulSoup(htmlContent, parseOnlyThese=forms)]
    for f in listForm:
        method = 'GET'
        if 'method' in f or 'METHOD' in f:
            method = f['method'].upper()
        action = currentURL
        if 'action' in f or 'ACTION' in f:
            action = f['action']
        keyUrl = giveGoodURL(action, currentURL)
        listInput = [tag for tag in BeautifulSoup(str(f), parseOnlyThese=input)]
        for i in listInput:
            if not keyUrl in database:
                database[keyUrl] = {}
                database[keyUrl]['GET'] = {}
                database[keyUrl]['POST'] = {}
            try:
                value = i['value']
            except KeyError:
                value = '42'
            try:
                name = i['name']
            except KeyError:
                name = 'foo'
                value = 'bar'
                continue
            lGP = database[keyUrl][method]
            lGP = dict_add(lGP, {name: value})
            database[keyUrl][method] = lGP
    return True
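# parseHtmlParams() above leans on a splitQuery() helper that is not shown
# in this snippet. A minimal hypothetical implementation consistent with how
# it is called (raw query string in, {name: value} dict out) could be:
def splitQuery(query):
    '''Split 'a=1&b=2' into {'a': '1', 'b': '2'}; bare names map to ''.'''
    params = {}
    for pair in query.split('&'):
        if not pair:
            continue
        if '=' in pair:
            name, value = pair.split('=', 1)
        else:
            name, value = pair, ''
        params[name] = value
    return params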
from BeautifulSoup import BeautifulSoup, SoupStrainer
import urllib, urllib2, re

# Homepage where the sergas waiting lists are hosted
home = 'http://www.sergas.es/MostrarContidos_N2_T01.aspx?IdPaxina=40026'
# Site to include in case of relative paths
site = 'http://www.sergas.es'
# Bottom link page that allows to navigate to the different data for a waitingList
docLinks = 'DocLinks.htm'
# Exceptions to the automation process developed
skipped1 = 'http://www.sergas.es/Docs/ListasEspera/CIR/HTML/201003/WEB_CI~1/DocLinks.htm'
skipped2 = 'http://www.sergas.es/Docs/ListasEspera/CIR/HTML/200506/web_cir_2005-06_archivos/tabstrip.htm'

try:
    html = urllib2.urlopen(home).read()
    findDivs = SoupStrainer('div', {'class': 'list_nivel2'})
    soup = BeautifulSoup(html, parseOnlyThese=findDivs)
    output = open('SergasAuxLinks.txt', 'w')
    for link in soup.findAll('a', href=re.compile('MostrarContidos_')):
        uri = link['href'].partition('uri=')
        uriaux = uri[2]
        if not re.match('^http', uriaux):
            uriaux = site + uriaux
        ruta = uriaux.partition('.htm')
        rutaFinal = ruta[0] + '/' + docLinks
        output.write(rutaFinal + u'\n')
    # We manually enter these two skipped links that fall out of the automation
    output.write(skipped1 + u'\n')
    output.write(skipped2 + u'\n')
    output.close()
except Exception, e:
    pass
def parse(self):
    result = [(i['href'], i.string)
              for i in BeautifulSoup(self.content,
                                     parseOnlyThese=SoupStrainer('td')
                                     ).findAll('a', {'class': 'highLight'})]
    return result
def parseHtmlLinks(currentURL, htmlContent):
    global database_url, database_js, database_css
    """ Parse the HTML/XHTML code to get JS, CSS, links etc. """
    links = SoupStrainer('a')
    # listAnchors = [tag['href'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=links)]
    listAnchors = []
    for tag in BeautifulSoup(htmlContent, parseOnlyThese=links):
        try:
            string = str(tag).lower()
            if string.count("href") > 0:
                listAnchors.append(tag['href'])
        except TypeError:
            continue
        except KeyError:
            continue
    for a in listAnchors:
        goodA = giveGoodURL(a, currentURL)
        goodA = removeSESSID(goodA)
        if (root in goodA) and (goodA not in database_url):
            database_url.append(goodA)
    # parse the CSS and the JavaScript
    script = SoupStrainer('script')
    # listScripts = [tag['src'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=script)]
    listScripts = []
    for tag in BeautifulSoup(htmlContent, parseOnlyThese=script):
        try:
            string = str(tag).lower()
            if string.count("src") > 0 and string.count(".src") < 1:
                listScripts.append(tag['src'])
        except TypeError:
            continue
        except KeyError:
            continue
    for a in listScripts:
        sc = giveGoodURL(a, currentURL)
        if sc not in database_js:
            database_js.append(sc)
        if sc == currentURL:  # remote script
            database_ext.append(sc)
            parseJavaScriptCalls()
    link = SoupStrainer('link')
    # listLinks = [tag['href'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=link)]
    listLinks = []
    for tag in BeautifulSoup(htmlContent, parseOnlyThese=link):
        try:
            string = str(tag).lower()
            if string.count("href") > 0:
                listLinks.append(tag['href'])
        except TypeError:
            continue
        except KeyError:
            continue
    for a in listLinks:
        sc = giveGoodURL(a, currentURL)
        if sc not in database_css:
            database_css.append(sc)
    return True
def html2plaintext(html, body_id=None, encoding='ascii'):
    """ from an HTML text, convert the HTML to plain text.
    If @body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    urls = []
    if body_id is not None:
        strainer = SoupStrainer(id=body_id)
    else:
        # if the html doesn't contain a <body> tag it doesn't make
        # sense to use a strainer
        if html.count('<body'):
            strainer = SoupStrainer('body')
        else:
            strainer = None

    soup = BeautifulSoup(html, parseOnlyThese=strainer, fromEncoding=encoding)
    for link in soup.findAll('a'):
        title = unicode(link.renderContents(), encoding)
        for url in [x[1] for x in link.attrs if x[0] == 'href']:
            urls.append(dict(url=url, tag=unicode(str(link), encoding), title=title))

    try:
        html = soup.renderContents(encoding=encoding)
    except AttributeError:
        # using OLDER version than BeautifulSoup 3.1
        html = soup.__str__(encoding)

    if isinstance(html, str) and encoding != 'ascii':
        html = unicode(html, encoding)

    url_index = []
    i = 0
    for d in urls:
        if d['title'] == d['url'] or u'http://' + d['title'] == d['url']:
            html = html.replace(d['tag'], d['url'])
        else:
            i += 1
            html = html.replace(d['tag'], u'%s [%s]' % (d['title'], i))
            url_index.append(d['url'])

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')

    # the only line breaks we respect are those of ending tags and
    # breaks
    html = html.replace('\n', ' ')
    html = html.replace('<br>', '\n')
    html = html.replace('&nbsp;', ' ')
    html = html.replace('</p>', '\n\n')
    html = re.sub('<br\s*/>', '\n', html)
    html = html.replace(' ' * 2, ' ')

    # for all other tags we failed to clean up, just remove them and
    # complain about them on the stderr
    def desperate_fixer(g):
        #print >>sys.stderr, "failed to clean up %s" % str(g.group())
        return ' '

    html = re.sub('<.*?>', desperate_fixer, html)

    # lstrip all lines
    html = u'\n'.join([x.lstrip() for x in html.splitlines()])

    for i, url in enumerate(url_index):
        if i == 0:
            html += u'\n\n'
        html += u'[%s] %s\n' % (i + 1, url)

    html = unescape(html)
    return html
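# Rough usage sketch for html2plaintext() above, assuming the module's
# unescape() helper is in scope; the sample markup is ours. Links whose
# text differs from their URL become numbered references at the end.
sample = ('<body><h1>Title</h1>'
          '<p>See <a href="http://example.com">the site</a>.</p></body>')
print html2plaintext(sample)
# prints roughly:
# **Title**
# See the site [1].
#
# [1] http://example.com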
def main():
    base_fmt = ("http://www.bing.com/search?q=%s&first=%d&filters="
                "ex1%%253a%%22ez%s%%22")
    new_urls = set()
    all_urls = set()
    try:
        with open(URIS, "rb") as f:
            all_urls.update(f.read().splitlines())
    except IOError:
        pass
    print "# urls: ", len(all_urls)
    try:
        skip = get_crawl_index()
        gen = compute_all_search_combinations()
        for i, (search, period, index) in enumerate(gen):
            if i < skip:
                continue
            set_crawl_index(i - 1)
            print "#" * 30
            print i, len(gen), (search, period, index)
            header = {
                'Referer': 'http://google.com/p/%d' % random.randint(1, 1000),
                'User-agent': 'Mozilla/%.1f' % random.random()
            }
            page_start = 1
            notfound = 0
            while page_start < 200:
                print "page offset: ", page_start
                string_quote = urllib.quote('"%s" %s' % (search, str(index)))
                url = base_fmt % (string_quote, page_start, period)
                request = urllib2.Request(url, None, header)
                response = urllib2.urlopen(request)
                urls = []
                links = SoupStrainer('a')
                for a in BeautifulSoup(response.read(), parseOnlyThese=links):
                    for k, v in a.attrs:
                        if k == "href":
                            try:
                                v = str(v)
                            except:
                                continue
                            if not v.startswith("http"):
                                continue
                            if ".microsofttranslator.com" in v:
                                continue
                            if ".microsoft.com" in v:
                                continue
                            urls.append(v)
                print set(urls) - all_urls
                num_new = len(set(urls) - all_urls)
                if num_new == 0:
                    notfound += 1
                    if notfound > 1:
                        print "nothing new, skip"
                        break
                else:
                    notfound = 0
                new_urls.update(urls)
                all_urls.update(urls)
                page_start += 10
    finally:
        print "writing..."
        with open(URIS, "ab") as f:
            f.write("\n".join(new_urls) + "\n")
def get_movies(iurl):
    """
    Get the list of movies.
    :return: list
    """
    movies = []
    if iurl[-3:] == '?s=':
        search_text = GetSearchQuery('WatchOnlineMovies')
        search_text = urllib.quote_plus(search_text)
        iurl += search_text
    html = requests.get(iurl, headers=mozhdr).text
    mlink = SoupStrainer('div', {'class': re.compile('postbox')})
    items = BeautifulSoup(html, parseOnlyThese=mlink)
    plink = SoupStrainer('div', {'class': 'wp-pagenavi'})
    Paginator = BeautifulSoup(html, parseOnlyThese=plink)
    for item in items:
        title = item.h2.text
        # Strip the site's boilerplate from the title. str.replace() never
        # raises, so the original chain of try/except pairs reduces to
        # applying these substring removals in the same order.
        for junk in ("Full Movie", "Watch Online Placeholdernt", ".",
                     "Watch Online", "Watch Onlin", "HD Pri", " Watch On",
                     " Watch", "Free Down", "Free D", "Free", " F", " Fr",
                     " Fre", " HD", " H", " HD P", " re", " r"):
            title = title.replace(junk, "")
        # Coloring Years
        title = title.replace("(2018)", "[COLOR yellow](2018)[/COLOR]")
        title = title.replace("(2016)", "[COLOR lightsalmon](2016)[/COLOR]")
        title = title.replace("(2015)", "[COLOR lime](2015)[/COLOR]")
        # Language
        title = title.replace("Hindi", "[COLOR green]Hindi[/COLOR]")
        title = title.replace("Dubbed", "[COLOR cyan]Dubbed[/COLOR]")
        # Continued cleanup of fragments left by the removals above
        for junk in (" nt o", " nt F", " nt", " Pr"):
            title = title.replace(junk, "")
        url = item.h2.find('a')['href']
        try:
            thumb = item.find('img')['src'].strip()
        except:
            thumb = _icon
        movies.append((title, thumb, url))
    if 'next' in str(Paginator):
        nextli = Paginator.find('a', {'class': re.compile('page larger')})
        purl = nextli.get('href')
        pages = Paginator.findAll('span', {'class': re.compile('pages')})
        lastpg = pages[len(pages) - 1].text
        title = 'Next Page.. (Currently in %s)' % (lastpg)
        movies.append((title, _icon, purl))
    return movies
import socket, urllib2
import re
import cPickle as pickle
from BeautifulSoup import BeautifulSoup, SoupStrainer

# timeout in seconds
timeout = 5
socket.setdefaulttimeout(timeout)

# utf-8 i/o plz!
import sys
import codecs
stdout = codecs.getwriter('utf-8')(sys.stdout)
stdin = codecs.getreader('utf-8')(sys.stdin)
stderr = codecs.getwriter('utf-8')(sys.stderr)

titles = SoupStrainer('title')


def get_title(h):
    '''Retrieve title of a web page.'''
    try:
        s = BeautifulSoup(urllib2.urlopen(h), parseOnlyThese=titles)
        return s.title.string.replace('\n', ' ').replace('\r', ' ').strip()
    except Exception, err:
        return ''


global blog, links, rlinks, tags, rtags, title, ua
blog = {}
links = {}
rlinks = {}
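# Quick usage sketch for get_title() above (the address is a placeholder):
# the helper swallows every error and returns '', so callers need no
# try/except, and socket.setdefaulttimeout bounds how long a fetch can hang.
t = get_title('http://example.com/')
if t:
    print >> stdout, t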
def get_link(input_url=None):
    """ To get inner link of given link """
    input_url = str(input_url)
    return_list = {}
    url_list = []  # contains all url list
    url_dict = {}  # contains url and key value pairs EX: {"http://vlabs.ac.in/index.html#aboutus": "ABOUT"}
    try:
        http = httplib2.Http()
        status, response = http.request(input_url)
    except:
        response = []
        logger.exception("Invalid URL --- {0}.".format(input_url))
        return response
    for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
        # Filter only link from web page(Given search url)
        try:
            content = ''.join(link.findAll(text=True))
            content = replace_html(content)
            content = ' '.join(content.split())
        except:
            content = ''
        if link.has_key('href'):
            url = link['href']
            if not content:
                logger.info("URL --- {0} dont have any content. CONTENT {1}".format(url, content))
                continue
            try:
                if re.match('^http', url):
                    url = url
                else:
                    # In relative url add base url
                    url = "/".join([input_url, url])
                try:
                    url = url.rstrip("/")
                    req = urllib2.Request(url)
                    urllib2.urlopen(req)
                    url_list.append(url)
                    url_dict[url] = content
                    logger.info("get_link Valid_Url URL --- {0} and CONTENT --- {1}".format(url, content))
                except:
                    logger.info("get_link Not_Valid_Url URL --- {0} and CONTENT --- {1}".format(url, content))
            except:
                logger.exception("get_link Invalid_Url URL --- {0}".format(url))
    url_list = set(url_list)
    return_list['url_list'] = list(url_list)
    return_list['url_dict'] = url_dict
    return return_list
def get_links(x):
    '''Retrieve links from blog posts in XML file.'''
    google_tags = set([u'fresh', u'read', u'reading-list'])
    anchors = SoupStrainer('a', href=re.compile('^http'))
    d = feedparser.parse(x)
    for e in d.entries:
        try:
            b = e.source.link  # blog source
            i = e.title        # blog post title
            l = e.link         # link to blog post
            # try to get permalink by following redirects
            #print >>stderr, "blog:", b, "link:", l
            b_ = b.replace('http://', '').replace('www.', '')
            if b_ not in l:
                l = get_true_url(l)
            try:
                blog[l] = b
                tags[l] = set([t.label or t.term
                               for t in e.tags if t.label or t.term]) - google_tags
                #print >>stderr, tags[l]
                for t in tags[l]:
                    rtags.setdefault(t, set())
                    rtags[t].add(l)
                title[l] = i
                # get blog post summary by trying
                # several RSS aliases
                p = None
                if 'summary' in e:
                    p = e.summary
                elif 'subtitle' in e:
                    p = e.subtitle
                elif 'content' in e and 'value' in e.content:
                    p = e.content.value
                else:
                    req = urllib2.Request(l, None, {'User-Agent': ua})
                    p = urllib2.urlopen(req).geturl()
                # parse the html
                s = BeautifulSoup(p, parseOnlyThese=anchors)
                #print >>stderr, s.prettify()
                # index links in blog post summary
                links.setdefault(l, [])
                for a in s.findAll('a'):
                    h = a['href']
                    #h = get_true_url(h)
                    blog.setdefault(h, '')
                    links[l].append(h)
                    rlinks.setdefault(h, [])
                    rlinks[h].append(l)
                    tags.setdefault(h, set())
                    #title.setdefault(h, get_title(h))
                    #print >>stderr, h
                print >> stderr, "WIN! \(^o^)/", l
            except Exception, err:
                print >> stderr, "FAIL! >_<", err, l
        except Exception, err:
            print >> stderr, "EPIC FAIL! Orz", err, e.id
def __init__(self, params):
    import re
    from addon import Addon
    from addondict import AddonDict as XBMCDict
    from BeautifulSoup import BeautifulSoup, SoupStrainer, Comment
    a = Addon()
    site = self.__module__
    mode = params['mode']
    home_url = 'http://pornhardx.com/'
    movies_url = home_url + 'category/full-movie/'
    scenes_url = home_url + 'video/'
    search_url = home_url + '?s='
    false_positives = ['http://pornhardx.com/video',
                       'http://pornhardx.com/video/?order=viewed',
                       'http://pornhardx.com/video/?order=liked',
                       'http://pornhardx.com/']
    if mode == 'main':
        item_list = []
        item_list.extend([{'site': site, 'mode': 'list', 'title': a.language(30006),
                           'content': '', 'url': scenes_url,
                           'cover_url': a.image('all.png', image),
                           'backdrop_url': a.art(), 'type': 3}])
        item_list.extend([{'site': site, 'mode': 'list', 'title': a.language(30003),
                           'content': '', 'url': home_url,
                           'cover_url': a.image('recent.png', image),
                           'backdrop_url': a.art(), 'type': 3}])
        item_list.extend([{'site': site, 'mode': 'categories', 'title': a.language(30005),
                           'content': '', 'url': scenes_url,
                           'cover_url': a.image('categories.png', image),
                           'backdrop_url': a.art(), 'type': 3}])
        item_list.extend([{'site': site, 'mode': 'list', 'title': a.language(30004),
                           'content': 'search', 'url': search_url,
                           'cover_url': a.image('search.png', image),
                           'backdrop_url': a.art(), 'type': 3}])
        item_list.extend(a.favs_hist_menu(site))
        item_list.extend(a.extended_menu())
        a.add_items(item_list)
        a.end_of_directory()
    elif mode == 'categories':
        html = a.get_page(params['url'])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('div', {'id': 'navigation-wrapper'}))
        item_list = []
        if soup:
            for item in soup.findAll('a', {'href': True}):
                if item:
                    if item.get('href') not in false_positives:
                        if 'full-movie' in params['url']:
                            if movies_url != item.get('href') and 'full-movie' in item.get('href'):
                                item_list.extend([{'site': site, 'mode': 'list',
                                                   'url': item.get('href'), 'content': '',
                                                   'title': item.contents[0].encode('UTF-8'),
                                                   'cover_url': a.image(image, image),
                                                   'backdrop_url': a.art(), 'type': 3}])
                        elif 'full-movie' not in item.get('href'):
                            item_list.extend([{'site': site, 'mode': 'list',
                                               'url': item.get('href'), 'content': '',
                                               'title': item.contents[0].encode('UTF-8'),
                                               'cover_url': a.image(image, image),
                                               'backdrop_url': a.art(), 'type': 3}])
        a.add_items(item_list)
        a.end_of_directory()
    elif mode == 'list':
        if params.get('content', '') == 'search':
            item = a.search_input()
            if item:
                params['url'] = search_url + item
            else:
                exit(1)
        elif params.get('content', '') == 'goto':
            last_item = re.search('/page/([0-9]+)/', params['url'])
            if last_item:
                last_item = int(last_item.group(1))
            else:
                last_item = 10000
            item = a.page_input(last_item)
            if item:
                params['url'] = re.sub('/page/[0-9]+/', '/page/' + str(item) + '/', params['url'])
            else:
                exit(1)
        html = a.get_page(params['url'])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer(
            'div', {'class': re.compile('col-sm-8(?:\s*main-content)*')}))
        item_list = []
        params['mode'] = 'play'
        params['content'] = 'movies'
        params['type'] = 0
        params['context'] = 0
        params['duration'] = '7200'
        if soup:
            xbmcdict = XBMCDict(0).update(params)
            for item in soup.findAll('div', {'class': re.compile(
                    '.*(?:col-xs-6 item|post type-post status-publish).*')}):
                if item:
                    if item.a.get('href') not in false_positives:
                        _dict = xbmcdict.copy()
                        if 'full-movie' not in params['url']:
                            _dict['duration'] = '1500'
                            _dict['content'] = 'episodes'
                        if item.h3:
                            _dict['url'] = item.h3.a.get('href')
                            if item.h3.a.contents:
                                _dict['title'] = item.h3.a.contents[0].encode('UTF-8')
                            else:
                                _dict['title'] = 'Untitled'
                        elif item.h2:
                            _dict['url'] = item.h2.a.get('href')
                            if item.h2.a.contents:
                                _dict['title'] = item.h2.a.contents[0].encode('UTF-8')
                            else:
                                _dict['title'] = 'Untitled'
                        _dict['tvshowtitle'] = _dict['title']
                        _dict['originaltitle'] = _dict['title']
                        _dict['cover_url'] = a.image(item.img.get('src'))
                        _dict['thumb_url'] = _dict['cover_url']
                        _dict['poster'] = _dict['cover_url']
                        _dict['sub_site'] = site
                        item_list.extend([_dict])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('ul', {'class': 'pagination'}))
        if soup.li:
            item = soup.find('a', {'class': 'prev page-numbers'})
            if item:
                item_list.extend([{'site': site, 'mode': 'list',
                                   'url': item.get('href'),
                                   'content': params['content'],
                                   'title': a.language(30017, True),
                                   'cover_url': a.image(image, image),
                                   'backdrop_url': a.art(), 'type': 3}])
            item = soup.find('a', {'class': 'next page-numbers'})
            if item:
                item_list.extend([{'site': site, 'mode': 'list',
                                   'url': item.get('href'),
                                   'content': params['content'],
                                   'title': a.language(30018, True),
                                   'cover_url': a.image(image, image),
                                   'backdrop_url': a.art(), 'type': 3}])
                if len(soup.findAll('a')) > 2:
                    last_item = soup.find('a', {'class': 'next page-numbers'}).parent.previousSibling.a.get('href')
                    item_list.extend([{'site': site, 'mode': 'list', 'url': last_item,
                                       'content': 'goto',
                                       'title': a.language(30019, True),
                                       'cover_url': a.image(image, image),
                                       'backdrop_url': a.art(), 'type': 3}])
            else:
                item = soup.find('span', {'class': 'page-numbers current'})
                if item:
                    if len(soup.findAll('a')) > 2:
                        last_item = soup.find('span', {'class': 'page-numbers current'}).parent.previousSibling.a.get('href')
                        item_list.extend([{'site': site, 'mode': 'list', 'url': last_item,
                                           'content': 'goto',
                                           'title': a.language(30019, True),
                                           'cover_url': a.image('goto.png', image),
                                           'backdrop_url': a.art(), 'type': 3}])
        else:
            soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('ul', {'class': 'pager'}))
            item = soup.find('li', {'class': 'previous'})
            if item:
                item_list.extend([{'site': site, 'mode': 'list',
                                   'url': item.previousSibling.get('href'),
                                   'content': params['content'],
                                   'title': a.language(30017, True),
                                   'cover_url': a.image('previous.png', image),
                                   'backdrop_url': a.art(), 'type': 3}])
            item = soup.find('li', {'class': 'next'})
            if item:
                item_list.extend([{'site': site, 'mode': 'list',
                                   'url': item.previousSibling.get('href'),
                                   'content': params['content'],
                                   'title': a.language(30018, True),
                                   'cover_url': a.image('next.png', image),
                                   'backdrop_url': a.art(), 'type': 3}])
        a.add_items(item_list)
        a.end_of_directory()
    elif mode == 'play':
        html = a.get_page(params['url'])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('body'))
        item = ''
        item_list = []
        if soup:
            for item in soup.findAll('param', {'name': 'FlashVars'}):
                item = item.get('value')
                item = re.search('.*?proxy\.link=(.+?)&(?:proxy|skin).*?', item)
                if item:
                    if item not in item_list:
                        item = item.group(1)
                    else:
                        item = ''
                else:
                    item = ''
                xbmcdict = XBMCDict(0).update(params)
                if item:
                    _dict = xbmcdict.copy()
                    _dict['url'] = item
                    item_list.extend([_dict])
            item = ''
            for item in soup.findAll('video'):
                for source in soup.findAll('source'):
                    src = source.get('src')
                    if src:
                        xbmcdict = XBMCDict(0).update(params)
                        if item and ('..' not in src):
                            _dict = xbmcdict.copy()
                            try:
                                _dict['src_title'] = source.get('data-res') + 'p'
                            except:
                                pass
                            _dict['url'] = src
                            item_list.extend([_dict])
                try:
                    src = item.get('src')
                    if src:
                        xbmcdict = XBMCDict(0).update(params)
                        if item and ('..' not in src):
                            _dict = xbmcdict.copy()
                            try:
                                _dict['src_title'] = source.get('data-res') + 'p'
                            except:
                                pass
                            _dict['url'] = src
                            item_list.extend([_dict])
                except:
                    pass
            for script in soup.findAll('script'):
                item = ''
                if script.get('src'):
                    if 'http://videomega.tv/validatehash.php' in script['src']:
                        item = script['src']
                    elif 'ref=' in script.get('src'):
                        temp = re.search('.*ref=[\'"](.+?)[\'"]', script.get('src'))
                        if temp:
                            item = 'http://videomega.tv/iframe.php?ref=' + temp.group(1)
                xbmcdict = XBMCDict(0).update(params)
                if item:
                    _dict = xbmcdict.copy()
                    _dict['url'] = item
                    item_list.extend([_dict])
            for iframe in soup.findAll('iframe'):
                item = ''
                if iframe.get('src'):
                    if 'http://videomega.tv/validatehash.php' in iframe['src']:
                        item = iframe['src']
                    elif 'ref=' in iframe.get('src'):
                        temp = re.search('.*ref=[\'"](.+?)[\'"]', iframe.get('src'))
                        if temp:
                            item = 'http://videomega.tv/iframe.php?ref=' + temp.group(1)
                    else:
                        item = iframe.get('src')
                xbmcdict = XBMCDict(0).update(params)
                if item:
                    _dict = xbmcdict.copy()
                    _dict['url'] = item
                    item_list.extend([_dict])
        if item_list:
            from playback import Playback
            Playback().choose_sources(item_list)
        else:
            a.alert(a.language(30904, True), sound=False)
def handle_noargs(self, **options):
    def clean_num(value):
        if value.strip() == '':
            value = None
        else:
            try:
                value = float(value.replace(',', '').replace(' ', ''))
            except:
                value = None
        return value

    for year in YEARS:
        insert_count = 0
        update_count = 0
        if year < 2009:
            url = '%sstalt%s.htm' % (URL, str(year)[-2:])
        else:
            url = '%sstalt%s%s.htm' % (URL, str(year)[-2:], 'q4')
        try:
            page = urllib2.urlopen(url)
            print '%s - scraping labor underutilization: %s' % (year, url)
        except:
            print 'No labor underutilization page for %s. full URL is %s' % (year, url)
            continue
        # underemployment table should have an id of 'alternmeas' + the year,
        # so just parse that portion of the page
        strainerTag = SoupStrainer('table', id=re.compile('alt'))
        table = BeautifulSoup(page, parseOnlyThese=strainerTag)
        if len(table) < 1:
            print 'no underemployment table found on page %s' % url
            continue
        elif len(table) > 1:
            print 'duplicate tables found on page %s' % url
            continue
        # get a list of data headers
        headers = table.find('thead').findAll(text=re.compile("U-"))
        headers = [x.lower().replace('-', '') for x in headers]
        # scrape data & store in dictionary form
        data = {}
        rows = table.find('tbody').findAll('tr')
        for row in rows:
            state = row.th.text
            if data.has_key(state):
                print 'error: duplicate row found for state %s' % state
                continue
            else:
                data[state] = {}
            cols = row.findAll('td')
            for i, col in enumerate(cols):
                data[state][headers[i]] = col.text
            # insert/update
            try:
                record = LaborUnderutilizationStateRaw.objects.get(year=year, state=state)
                update_count = update_count + 1
            except:
                record = LaborUnderutilizationStateRaw(year=year, state=state)
                insert_count = insert_count + 1
            record.u1 = clean_num(data[state]['u1'])
            record.u2 = clean_num(data[state]['u2'])
            record.u3 = clean_num(data[state]['u3'])
            record.u4 = clean_num(data[state]['u4'])
            record.u5 = clean_num(data[state]['u5'])
            record.u6 = clean_num(data[state]['u6'])
            record.save()
        db.reset_queries()
        print '%s - %s rows scraped' % (year, len(data))
        print '%s - %s records inserted and %s records updated' % (year, insert_count, update_count)
def sources(self, url):
    logger.debug('SOURCES URL %s' % url, __name__)
    try:
        srcs = []
        if url == None:
            return srcs
        if 'hd' in url.lower():
            quality = 'HD'
        else:
            quality = 'SD'
        html = client.request(url)
        try:
            linkcode = jsunpack.unpack(html).replace('\\', '')
            srcs = json.loads(re.findall('sources:(.*?)\}\)', linkcode)[0])
            for source in srcs:
                url = source['file']
                host = client.host(url)
                self.srcs.append({'source': host, 'parts': '1',
                                  'quality': quality, 'provider': 'tamilgun',
                                  'url': url, 'direct': False})
        except:
            pass
        mlink = SoupStrainer('div', {'id': 'videoframe'})
        videoclass = BeautifulSoup(html, parseOnlyThese=mlink)
        try:
            links = videoclass.findAll('iframe')
            for link in links:
                url = link.get('src')
                host = client.host(url)
                self.srcs.append({'source': host, 'parts': '1',
                                  'quality': quality, 'provider': 'tamilgun',
                                  'url': url, 'direct': False})
        except:
            pass
        mlink = SoupStrainer('div', {'class': 'entry-excerpt'})
        videoclass = BeautifulSoup(html, parseOnlyThese=mlink)
        try:
            links = videoclass.findAll('iframe')
            for link in links:
                if 'http' in str(link):
                    url = link.get('src')
                    host = client.host(url)
                    self.srcs.append({'source': host, 'parts': '1',
                                      'quality': quality, 'provider': 'tamilgun',
                                      'url': url, 'direct': False})
        except:
            pass
        try:
            sources = json.loads(re.findall('vdf-data-json">(.*?)<', html)[0])
            url = 'https://www.youtube.com/watch?v=%s' % sources['videos'][0]['youtubeID']
            host = client.host(url)
            self.srcs.append({'source': host, 'parts': '1',
                              'quality': quality, 'provider': 'tamilgun',
                              'url': url, 'direct': False})
        except:
            pass
        return self.srcs
    except:
        return self.srcs
def verify_url(self):
    logger.debug('Fetching url')
    url = self.url.get_text()
    name = self.name.get_text()
    verified = False
    proxies = get_network_proxies()
    try:
        if url.startswith('file://'):
            GObject.idle_add(self.set_loading_url, False)
            GObject.idle_add(self.create_app, url, name)
            return
        elif not url.startswith(('http://', 'https://',)):
            url = 'http://%s' % url
        try:
            logger.debug('starting')
            response = requests.get(url, proxies=proxies)
            verified = True
            logger.debug('finishing')
        except requests.RequestException:
            logger.debug('Error downloading url %s' % url)
            GObject.idle_add(self.set_loading_url, False)
            GObject.idle_add(self.set_error_message,
                             _('The URL %s could not be reached.\nPlease double check'
                               ' the URL you provided and try again.' % url))
            return
        SkipIcon = type('SkipIcon', (Exception,), {})
        if self.icon != DEFAULT_APP_ICON:
            raise SkipIcon()
        # Try to find the apple-touch-icon
        logger.debug('parsing')
        soup = BeautifulSoup(response.content,
                             parseOnlyThese=SoupStrainer('link'))
        icons = soup.findAll('link', rel=re.compile('^apple-touch-icon'))
        logger.debug('finished parsing')
        soup = BeautifulSoup(response.content)
        if not icons:
            logger.debug('No apple touch icon found')
            raise SkipIcon()
        icon = icons[0]
        href = icon.attrMap.get('href', None)
        if not href:
            logger.debug('Bad apple touch icon')
            raise SkipIcon()
        icon_url = None
        if href.startswith('/'):
            parsed = urlparse.urlparse(url)
            icon_url = urlparse.urljoin('%s://%s' % (parsed.scheme, parsed.netloc,), href)
        else:
            parsed = urlparse.urlparse(href)
            if parsed.scheme:
                icon_url = href
            else:
                icon_url = urlparse.urljoin(url, href)
        ext = op.splitext(icon_url)[-1]
        tmpf = tempfile.mktemp(ext)
        logger.debug('temp file: %s' % tmpf)
        headers = {'User-Agent': 'Mozilla/5.0 (iPad; U; CPU OS 3_2 like'
                   ' Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko)'
                   ' Version/4.0.4 Mobile/7B334b Safari/531.21.10'}
        try:
            icon_bytes = requests.get(icon_url, headers=headers, proxies=proxies).content
        except requests.RequestException:
            logger.debug('Error downloading apple touch icon')
        else:
            handle = open(tmpf, 'w')
            handle.write(icon_bytes)
            handle.close()
            self.setup_icon(tmpf)
    except Exception, e:
        logger.debug("Error: %s" % e)
def get_movies(iurl):
    """
    Get the list of movies.
    :return: list
    """
    movies = []
    logging.warning("{0} {1} {2} {0}".format('##' * 15, 'getmovies', iurl))
    if iurl[-3:] == '?s=':
        search_text = GetSearchQuery('TamilGun')
        search_text = urllib.quote_plus(search_text)
        iurl += search_text
    if 'tamildbox' in iurl:
        logging.warning("{0} {1} {2} {0}".format('##' * 15, 'dbox-iurl', iurl))
        html = requests.get(iurl, headers=mozhdr).text
        tlink = SoupStrainer('div', {'class': re.compile('listbox')})
        items = BeautifulSoup(html, parseOnlyThese=tlink)
        plink = SoupStrainer('div', {'class': 'pagination'})
        Paginator = BeautifulSoup(html, parseOnlyThese=plink)
        for item in items:
            title = item.h4.text
            url = item.find('div', attrs={'class': 'btn btn-primary watch'}).find('a', href=True)['href']
            try:
                thumb = item.find('img')['src'].strip()
            except:
                thumb = _icon
            movies.append((title, thumb, url))
        logging.warning("{0} {1} {2} {0}".format('##' * 15, 'dbox-Paginator', Paginator))
        if 'current' in str(Paginator):
            purl = Paginator.find('span', {'class': re.compile('current')}).findNext('a')['href']
            if 'http' not in purl:
                purl = url
            currpg = Paginator.find('span', {'class': re.compile('current')}).text
            lastpg = Paginator.findAll('a', text=True)[-1]
            title = 'Next Page.. (Currently in Page %s of %s)' % (currpg, lastpg)
            movies.append((title, _icon, purl))
    if 'gun' in iurl:
        if iurl == tamilgunurl:
            list_categories(iurl)
        logging.warning("{0} {1} {2} {0}".format('##' * 15, 'tgun-iurl', iurl))
        html = requests.get(iurl, headers=mozhdr).text
        mlink = SoupStrainer('article', {'class': re.compile('video')})
        items = BeautifulSoup(html, parseOnlyThese=mlink)
        plink = SoupStrainer('ul', {'class': 'page-numbers'})
        Paginator = BeautifulSoup(html, parseOnlyThese=plink)
        for item in items:
            title = item.h3.text
            url = item.h3.find('a')['href']
            try:
                thumb = item.find('img')['src'].strip()
            except:
                thumb = _icon
            movies.append((title, thumb, url))
        logging.warning("{0} {1} {2} {0}".format('##' * 15, 'tgun-Paginator', Paginator))
        if 'next' in str(Paginator):
            nextli = Paginator.find('a', {'class': re.compile('next')})
            logging.warning("{0} {1} {2} {0}".format('##' * 15, 'Paginator', nextli))
            purl = nextli.get('href')
            if 'http' not in purl:
                purl = self.bu[:-12] + purl
            currpg = Paginator.find('span', {'class': re.compile('current')}).text
            pages = Paginator.findAll('a', {'class': re.compile('^page')})
            logging.warning("{0} {1} {2} {0}".format('##' * 15, 'Pages', pages))
            lastpg = pages[len(pages) - 1].text
            title = 'Next Page.. (Currently in Page %s of %s)' % (currpg, lastpg)
            movies.append((title, _icon, purl))
    return movies
def ParsePage(self, id):
    resp = urllib.urlopen(self.details_url % id)
    strain = SoupStrainer("div", {"id": "movieinfoDetail"})
    soup = BeautifulSoup(resp.read(), strain, fromEncoding="utf-8")
    header = soup.find("p", {"class": "header"})
    self.meta.m_title = header.strong.string  # class=title_kor
    self.meta.m_year = header.a.string
    aka = header.find("em", {"class": "title_AKA"})
    temp = aka.find("span", {"class": "eng"})
    if temp:
        self.meta.m_aka = temp.string
    else:
        self.meta.m_aka = aka.contents[0]
    self.meta.m_genres = []
    sect = soup.find("dl", {"class": "cu mainInfo"})
    secName = sect.dt.strong.renderContents()
    if secName != "요약정보":
        print "ERROR: unexpected " + secName
    ptCount = 0
    for tt in sect.dd.contents:
        if hasattr(tt, 'name'):
            if tt.name == 'span':
                if tt['class'] == 'bar':
                    ptCount = ptCount + 1
                elif tt['class'] == 'rating':
                    self.meta.m_cert = tt.img['title']
            elif tt.name == 'a':
                text = tt.string.strip()
                if text:
                    if ptCount == 0:
                        self.meta.m_genres.append(text)
        else:
            text = tt.string.strip()
            if text:
                if ptCount == 2:
                    self.meta.m_runtime = text
                elif ptCount == 4:
                    self.meta.m_cert = text
    self.meta.m_rating = float(soup.find("span", {"class": "star_big pink"}).em.string)
    self.meta.m_poster = re.compile('C\d{3}x\d{3}').sub(
        'image', soup.find('p', {"class": "poster"}).a.img['src'])
    self.meta.m_id = id
    self.meta.m_directors = []
    self.meta.m_writers = []
    self.meta.m_actors = []
    self.meta.m_backdrop_list = []
    self.ParsePlotPage(id)
    self.ParseCastPage(id)
    self.ParsePhotoPageList(id)
    return self.meta
def get_videos(url):
    """
    Get the list of videos.
    :return: list
    """
    videos = []
    if 'cinebix.com' in url:
        resolve_media(url, videos)
        return videos
    if 'tamildbox' in url:
        resolve_media(url, videos)
        return videos
    html = requests.get(url, headers=mozhdr).text
    try:
        linkcode = jsunpack.unpack(html).replace('\\', '')
        sources = json.loads(re.findall('sources:(.*?)\}\)', linkcode)[0])
        for source in sources:
            url = source['file'] + '|Referer=http://%s/' % get_vidhost(source['file'])
            url = urllib.quote_plus(url)
            videos.append(('tamilgun | %s' % source['label'], url))
    except:
        pass
    mlink = SoupStrainer('div', {'id': 'videoframe'})
    videoclass = BeautifulSoup(html, parseOnlyThese=mlink)
    try:
        links = videoclass.findAll('iframe')
        for link in links:
            url = link.get('src')
            resolve_media(url, videos)
    except:
        pass
    mlink = SoupStrainer('div', {'class': 'entry-excerpt'})
    videoclass = BeautifulSoup(html, parseOnlyThese=mlink)
    try:
        links = videoclass.findAll('iframe')
        for link in links:
            if 'http' in str(link):
                url = link.get('src')
                resolve_media(url, videos)
    except:
        pass
    try:
        url = videoclass.p.a.get('href')
        resolve_media(url, videos)
    except:
        pass
    try:
        sources = json.loads(re.findall('vdf-data-json">(.*?)<', html)[0])
        url = 'https://www.youtube.com/watch?v=%s' % sources['videos'][0]['youtubeID']
        resolve_media(url, videos)
    except:
        pass
    return videos
def __call__(self, text, subscription):
    anchor_exp = re.compile('#\w+')
    root_exp = re.compile('^/')
    relative_exp = re.compile('^(?!(\w+://|mailto:|javascript:|/))')
    alias_exp = re.compile('|'.join(self.aliases), re.IGNORECASE)
    soup = BeautifulSoup(text, fromEncoding='UTF-8')  # hmm
    curl = self.context.absolute_url()
    curl_parts = curl.split('/')
    for attr in ('href', 'src'):
        for tag in soup.findAll(SoupStrainer(**{attr: root_exp})):
            if len(curl_parts) > 3 and \
               ':' in curl_parts[2] and \
               tag[attr].startswith('/%s/' % curl_parts[3]):
                tag[attr] = '/' + '/'.join(tag[attr].split('/')[2:])
            # Kupu makes absolute links without the domain, which
            # include the Plone site, so let's try and strip the
            # Plone site's id out:
            site_id = component.getUtility(IPloneSiteRoot).getId()
            if tag[attr].startswith('/%s/' % site_id):
                tag[attr] = tag[attr].replace('/%s/' % site_id, '/', 1)
            tag[attr] = '%s%s' % (self.site_url, tag[attr])
        for tag in soup.findAll(SoupStrainer(**{attr: relative_exp})):
            if tag[attr].startswith('#'):
                tag[attr] = self.context_url + tag[attr]
                continue
            parts = (self.context_url + '/' + tag[attr]).split('/')
            while '..' in parts:
                dots = parts.index('..')
                del parts[dots - 1:dots + 1]
            tag[attr] = '/'.join(parts)
        for tag in soup.findAll(SoupStrainer(**{attr: anchor_exp})):
            prot, dom, path, params, query, frag = urlparse.urlparse(tag[attr])
            if not prot or not dom:
                tag[attr] = '#%s' % frag
                continue
            url = '%s://%s%s' % (prot, dom, path)
            if url.endswith('/'):
                url = url[:-1]
            # If the url points to our context and the anchor exists in our
            # text we change it to a bare anchor.
            # XXX: Maybe this should work with links to non-default views.
            if url == self.context_url:
                for match in soup.findAll(attrs=dict(name=frag)):
                    if match.name == u'a':
                        tag[attr] = '#%s' % frag
        # Check for aliases
        if self.aliases:
            for tag in soup.findAll(SoupStrainer(**{attr: alias_exp})):
                p = re.compile('^(\w+://)(%s)(/?)(.*)' % '|'.join(self.aliases),
                               re.IGNORECASE)
                tag[attr] = p.sub(r'%s\3\4' % self._base(), tag[attr])
    return str(soup)
def resolve_media(url, videos):
    non_str_list = ['#', 'magnet:', 'desihome.co', 'thiruttuvcd', 'cineview',
                    'bollyheaven', 'videolinkz', 'imdb.', 'mgid.', 'facebook.',
                    'm2pub', 'tamilraja.org']
    embed_list = ['cineview', 'bollyheaven', 'videolinkz', 'vidzcode',
                  'embedzone', 'embedsr', 'fullmovie-hd', 'adly.biz',
                  'embedscr', 'embedrip', 'movembed', 'power4link.us',
                  'techking.me', 'onlinemoviesworld.xyz', 'cinebix.com']
    if 'tamildbox' in url:
        link = requests.get(url, headers=mozhdr).text
        try:
            mlink = SoupStrainer('div', {'id': 'player-embed'})
            dclass = BeautifulSoup(link, parseOnlyThese=mlink)
            if 'unescape' in str(dclass):
                etext = re.findall("unescape.'[^']*", str(dclass))[0]
                etext = urllib.unquote(etext)
                dclass = BeautifulSoup(etext)
            glink = dclass.iframe.get('src')
            vidhost = get_vidhost(glink)
            videos.append((vidhost, glink))
            mlink = SoupStrainer('div', {'class': 'item-content toggled'})
            dclass = BeautifulSoup(link, parseOnlyThese=mlink)
            glink = dclass.p.iframe.get('src')
            vidhost = get_vidhost(glink)
            videos.append((vidhost, glink))
        except:
            pass
        try:
            codes = re.findall('"return loadEP.([^,]*),(\d*)', link)
            for ep_id, server_id in codes:
                burl = 'http://www.tamildbox.com/actions.php?case=loadEP&ep_id=%s&server_id=%s' % (ep_id, server_id)
                bhtml = requests.get(burl, headers=mozhdr).text
                blink = re.findall('(?i)iframe\s*src="(.*?)"', bhtml)[0]
                vidhost = get_vidhost(blink)
                if 'googleapis' in blink:
                    blink = 'https://drive.google.com/open?id=' + re.findall('docid=([^&]*)', blink)[0]
                    vidhost = 'GVideo'
                videos.append((vidhost, blink))
        except:
            pass
    elif any([x in url for x in embed_list]):
        clink = requests.get(url, headers=mozhdr).text
        csoup = BeautifulSoup(clink)
        try:
            for link in csoup.findAll('iframe'):
                strurl = link.get('src')
                if not any([x in strurl for x in non_str_list]):
                    vidhost = get_vidhost(strurl)
                    videos.append((vidhost, strurl))
        except:
            pass
        try:
            plink = csoup.find(class_='main-button dlbutton')
            strurl = plink.get('href')
            if not any([x in strurl for x in non_str_list]):
                vidhost = get_vidhost(strurl)
                videos.append((vidhost, strurl))
        except:
            pass
        try:
            plink = csoup.find(class_='aio-pulse')
            strurl = plink.find('a')['href']
            if not any([x in strurl for x in non_str_list]):
                vidhost = get_vidhost(strurl)
                videos.append((vidhost, strurl))
        except:
            pass
        try:
            plink = csoup.find(class_='entry-content rich-content')
            strurl = plink.find('a')['href']
            if not any([x in strurl for x in non_str_list]):
                vidhost = get_vidhost(strurl)
                videos.append((vidhost, strurl))
        except:
            pass
        try:
            for linksSection in csoup.findAll('embed'):
                strurl = linksSection.get('src')
                if not any([x in strurl for x in non_str_list]):
                    vidhost = get_vidhost(strurl)
                    videos.append((vidhost, strurl))
        except:
            pass
    elif not any([x in url for x in non_str_list]):
        vidhost = get_vidhost(url)
        videos.append((vidhost, url))
    return
def scrapeShowEpisodes(self, html, params={}):
    get = params.get
    if self.__dbg__:
        print self.__plugin__ + " scrapeShowEpisodes"
    page = int(get("page", "0"))
    per_page = (10, 15, 20, 25, 30, 40, 50,)[int(self.__settings__.getSetting("perpage"))]
    oldVideos = self.__settings__.getSetting("show_" + get("show") + "_season_" + get("season", "0"))
    if page == 0 or not oldVideos:
        videos = re.compile('<a href="/watch\?v=(.*)&feature=sh_e_sl&list=SL"').findall(html)
        list = SoupStrainer(name="div", attrs={'class': "show-more-ctrl"})
        nexturl = BeautifulSoup(html, parseOnlyThese=list)
        if len(nexturl) > 0:
            nexturl = nexturl.find(name="div", attrs={'class': "button-container"})
            if nexturl.button:
                nexturl = nexturl.button["data-next-url"]
            else:
                nexturl = ""
        if nexturl.find("start=") > 0:
            fetch = True
            start = 20
            nexturl = nexturl.replace("start=20", "start=%s")
            while fetch:
                url = self.urls["main"] + nexturl % start
                html = self._fetchPage(url)
                if html:
                    html = html.replace("\\u0026", "&")
                    html = html.replace("\\/", "/")
                    html = html.replace('\\"', '"')
                    html = html.replace("\\u003c", "<")
                    html = html.replace("\\u003e", ">")
                    more_videos = re.compile('data-video-ids="([^"]*)"').findall(html)
                    if not more_videos:
                        fetch = False
                    else:
                        videos += more_videos
                        start += 20
        if self.__dbg__:
            print self.__plugin__ + "found " + str(len(videos)) + " videos: " + repr(videos)
        self.__settings__.setSetting("show_" + get("show") + "_season_" + get("season", "0"),
                                     self.core.arrayToPipe(videos))
    else:
        videos = oldVideos.split("|")
    if per_page * (page + 1) < len(videos):
        next = 'true'
    else:
        next = 'false'
    subitems = videos[(per_page * page):(per_page * (page + 1))]
    (ytobjects, status) = self.core._get_batch_details(subitems)
    if len(ytobjects) > 0:
        ytobjects[len(ytobjects) - 1]['next'] = next
    return (ytobjects, status)
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ from an HTML text, convert the HTML to plain text.
    If @body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    urls = []
    if body_id is not None:
        strainer = SoupStrainer(id=body_id)
    else:
        strainer = SoupStrainer()
    """
    soup = BeautifulSoup(html, parseOnlyThese=strainer, fromEncoding=encoding)
    for link in soup.findAll('a'):
        title = link.renderContents()
        for url in [x[1] for x in link.attrs if x[0]=='href']:
            urls.append(dict(url=url, tag=str(link), title=title))
    html = soup.__str__(encoding)
    url_index = []
    i = 0
    for d in urls:
        if d['title'] == d['url'] or 'http://'+d['title'] == d['url']:
            html = html.replace(d['tag'], d['url'])
        else:
            i += 1
            html = html.replace(d['tag'], '%s [%s]' % (d['title'], i))
            url_index.append(d['url'])
    """
    html = html.replace('<a', 'HREFOPEN').replace('</a>', 'HREFCLOSE')
    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<p>', 'PARAOPEN').replace('</p>', 'PARACLOSE')
    html = html.replace('<h2>', 'ITALICOPEN').replace('</h2>', 'ITALICCLOSE')
    html = html.replace('<h1>', 'BOLDOPEN').replace('</h1>', 'BOLDCLOSEBREAKBREAK')
    html = html.replace('<em>', '/').replace('</em>', '/')

    # the only line breaks we respect are those of ending tags and
    # breaks
    html = html.replace('\n', ' ')
    html = html.replace('<br>', '\n')
    #html = html.replace('</p>', '\n')
    html = re.sub('<br\s*/>', '\n', html)
    html = html.replace(' ' * 2, ' ')

    # for all other tags we failed to clean up, just remove them and
    # complain about them on the stderr
    def desperate_fixer(g):
        #print >>sys.stderr, "failed to clean up %s" % str(g.group())
        return ' '

    html = re.sub('<.*?>', desperate_fixer, html)

    html = html.replace('ITALICOPEN', '\n\n<i>')
    html = html.replace('ITALICCLOSE', '</i>\n\n')
    html = html.replace('BOLDOPEN', '<b>')
    html = html.replace('BOLDCLOSEBREAKBREAK', '</b>\n\n')
    html = html.replace('PARAOPEN', '')
    html = html.replace('PARACLOSE', '')
    html = html.replace('HREFOPEN', '<a')
    html = html.replace('HREFCLOSE', '</a>')

    # lstrip all lines
    html = '\n'.join([x.lstrip() for x in html.splitlines()])

    #for i, url in enumerate(url_index):
    #    if i == 0:
    #        html += '\n\n'
    #    html += '[%s] %s\n' % (i+1, url)

    html = unescape(html)
    return html
def scrapeCategoryList(self, html="", params={}, tag=""):
    get = params.get
    if self.__dbg__:
        print self.__plugin__ + " scrapeCategories "
    scraper = "categories"
    thumbnail = "explore"
    if tag:
        scraper = tag
        thumbnail = tag
    list = SoupStrainer(name="div", attrs={"class": "yt-uix-expander-body"})
    categories = BeautifulSoup(html, parseOnlyThese=list)
    if len(categories) == 0:
        list = SoupStrainer(name="div", id="browse-filter-menu")
        categories = BeautifulSoup(html, parseOnlyThese=list)
    yobjects = []
    status = 200
    if len(categories) > 0:
        ul = categories.ul
        while ul != None:
            category = ul.li
            while category != None:
                if category.a:
                    item = {}
                    title = category.a.contents[0]
                    title = title.replace("&amp;", "&")
                    item['Title'] = title
                    cat = category.a["href"].replace("/" + tag + "/", "")
                    if get("scraper") == "categories":
                        if title == "Music":
                            category = category.findNextSibling(name="li")
                            continue
                        if cat.find("?") != -1:
                            cat = cat[cat.find("?"):]
                        if cat.find("comedy") > 0:
                            cat = "?c=23"
                        if cat.find("gaming") > 0:
                            cat = "?c=20"
                    if get("scraper") == "movies":
                        if cat.find("pt=nr") > 0:
                            category = category.findNextSibling(name="li")
                            continue
                        elif cat == "indian-cinema":
                            item["subcategory"] = "true"
                    cat = urllib.quote_plus(cat)
                    item['category'] = cat
                    item['scraper'] = scraper
                    item["thumbnail"] = thumbnail
                    if self.__dbg__:
                        print self.__plugin__ + "adding item: " + repr(item['Title']) + ", url: " + item['category']
                    yobjects.append(item)
                category = category.findNextSibling(name="li")
            ul = ul.findNextSibling(name="ul")
    if not yobjects:
        return (self.__language__(30601), 303)
    return (yobjects, status)
def ParsePage(self, id):
    resp = urllib.urlopen(self.main_url % id)
    soup = BeautifulSoup(resp.read(), fromEncoding="euc-kr")
    self.meta.m_id = id
    self.meta.m_name = soup.find('h2').string
    strain = SoupStrainer("div", {"class": "artist_info"})
    sect = soup.find(strain)
    self.meta.m_thumb = sect.find("div", {"class": "albumartist_thumb"}).img['src']
    chk = sect.find("img", alt=u"출생")
    if chk:
        self.meta.m_born = chk.parent.nextSibling.nextSibling.next.string.strip()
    chk = sect.find("img", alt=u"사망")
    if chk:
        self.meta.m_died = chk.parent.nextSibling.nextSibling.next.string.strip()
    chk = sect.find("img", alt=u"결성")
    if chk:
        self.meta.m_formed = chk.parent.nextSibling.nextSibling.next.string.strip()
    chk = sect.find("img", alt=u"해체")
    if chk:
        self.meta.m_disbanded = chk.parent.nextSibling.nextSibling.next.string.strip()
    self.meta.m_years = []
    chk = sect.find("img", alt=u"활동연대")
    if chk:
        self.meta.m_years = chk.parent.nextSibling.nextSibling.next.string.strip().split(',')
    self.meta.m_styles = []
    chk = sect.find("img", alt=u"활동유형")
    if chk:
        self.meta.m_styles = chk.parent.nextSibling.nextSibling.next.string.strip().split(',')
    self.meta.m_genres = []
    chk = sect.find("img", title=u"장르")
    if chk:
        self.meta.m_genres = chk.parent.nextSibling.nextSibling.next.string.strip().split(',')
    self.meta.m_biography = ''.join(soup.find("div", id="artistBio").findAll(text=True)).strip()
    self.meta.m_biography = self.meta.m_biography.replace("&amp;", "&")
    self.meta.m_biography = self.meta.m_biography.replace("&#39;", "'").replace("&#8211;", "-")
    self.meta.m_biography = unicode(self.meta.m_biography, 'utf-8')
    self.ParseAlbumPage(id)
    self.ParsePhotoPage(id)
    return self.meta