if outside == 1:
    skipCheck2 = 0
else:
    if external == 0:
        skipCheck2 = 0
    else:
        skipCheck2 = 1
if (skipCheck == -1) and (skipCheck2 == 0):
    conn.request("GET", page)
    code = conn.getresponse()  # read response code
    src = code.read()
    src = str(src)
    flist = l.split('.')
    ftype = flist[-1]
    imageCheck = imageTypes.find(ftype)
    links = SoupStrainer('a')   # grab all anchors
    imgs = SoupStrainer('img')  # grab all img elements
    if imageCheck == -1:
        bs = BeautifulSoup(src, parseOnlyThese=links)  # parse for anchors
    if imageCheck == -1:
        print "Crawling\t", l, "\t", code.status
    # loop through all of the anchors found on the page
    # crawler only records the FIRST time it finds a link. If a link is on 20 pages
    # it will still only show up once in the log.
    for j in bs.findAll('a', {'href': True}):
        testresult = 0
        absUrl = urlparse.urljoin(l, j['href'])
        # check for javascript/mailto
        checkAbs = absUrl.split(':')
        checkAbs = checkAbs[0]
        checkAbs = checkAbs.strip()
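# Hedged sketch of the SoupStrainer anchor-scraping pattern the crawler
# fragment above relies on, reduced to a self-contained helper. Assumes
# BeautifulSoup 3 (the parseOnlyThese keyword) and Python 2 like the rest
# of these snippets; the function name is ours, not the crawler's.
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup, SoupStrainer

def absolute_anchor_urls(page_url):
    '''Yield the absolute URL of every href-bearing anchor on page_url.'''
    src = urllib2.urlopen(page_url).read()
    links = SoupStrainer('a')  # parse only <a> tags, not the whole tree
    for a in BeautifulSoup(src, parseOnlyThese=links).findAll('a', {'href': True}):
        yield urlparse.urljoin(page_url, a['href'])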
def __init__(self, params):
    import re
    from addon import Addon
    from addondict import AddonDict as XBMCDict
    from BeautifulSoup import BeautifulSoup, SoupStrainer, Comment
    a = Addon()
    site = self.__module__
    mode = params['mode']
    home_url = 'http://xtheatre.net/'
    search_url = home_url + '?s='
    false_positives = ['http://watchxxxhd.net/watch-full-movies-hd/',
                       'http://watchxxxhd.net',
                       'http://watchxxxhd.net/category/movies/',
                       'http://watchxxxhd.net/category/ategorized222/',
                       'http://watchxxxhd.net/watch-full-movies-hd/']
    if mode == 'main':
        item_list = [{'site': site, 'mode': 'list', 'title': a.language(30006),
                      'content': '', 'url': home_url + '?filtre=date&cat=0',
                      'cover_url': a.image('all.png', image),
                      'backdrop_url': a.art(), 'type': 3},
                     {'site': site, 'mode': 'categories', 'title': a.language(30005),
                      'content': '', 'url': home_url + 'categories/',
                      'cover_url': a.image('categories.png', image),
                      'backdrop_url': a.art(), 'type': 3},
                     {'site': site, 'mode': 'list', 'title': a.language(30004),
                      'content': 'search', 'url': search_url,
                      'cover_url': a.image('search.png', image),
                      'backdrop_url': a.art(), 'type': 3}]
        item_list.extend(a.favs_hist_menu(site))
        item_list.extend(a.extended_menu())
        a.add_items(item_list)
        a.end_of_directory()
    elif mode == 'categories':
        html = a.get_page(params['url'])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('ul', {'class': 'listing-cat'}))
        item_list = []
        if soup:
            for item in soup.findAll('li'):
                if item:
                    if item.a.get('href') not in false_positives:
                        try:
                            vidcount = item.findAll('span', {'class': 'nb_cat border-radius-5'})[0].string.encode('UTF-8')
                            vidcount = re.sub('\svideo[s]*', '', vidcount)
                        except:
                            vidcount = '0'
                        if vidcount and vidcount != '0':
                            img = item.find('img')
                            if img:
                                try:
                                    img = img.get('data-lazy-src')
                                except:
                                    try:
                                        img = img.get('src')
                                    except:
                                        img = ''
                            if not img:
                                img = ''
                            title = item.a.get('title').encode('UTF-8') + ' (%s)' % vidcount
                            item_list.extend([{'site': site, 'mode': 'list',
                                               'url': item.a.get('href'),
                                               'content': '', 'title': title,
                                               'cover_url': a.image(img, image),
                                               'backdrop_url': a.art(), 'type': 3}])
        a.add_items(item_list)
        a.end_of_directory()
    elif mode == 'list':
        if params.get('content', '') == 'search':
            item = a.search_input()
            if item:
                params['url'] = search_url + item
            else:
                exit(1)
        elif params.get('content', '') == 'goto':
            last_item = re.search('/page/([0-9]+)/', params['url'])
            if last_item:
                last_item = int(last_item.group(1))
            else:
                last_item = 10000
            item = a.page_input(last_item)
            if item:
                params['url'] = re.sub('/page/[0-9]+/', '/page/' + str(item) + '/', params['url'])
            else:
                exit(1)
        html = a.get_page(params['url'])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('ul', {'class': 'listing-videos listing-extract'}))
        item_list = []
        params['mode'] = 'play'
        params['content'] = 'movies'
        params['type'] = 0
        params['context'] = 0
        params['duration'] = '7200'
        if soup:
            xbmcdict = XBMCDict(0).update(params)
            for item in soup.findAll('li', {'class': 'border-radius-5 box-shadow'}):
                if item:
                    if item.a.get('href') not in false_positives:
                        _dict = xbmcdict.copy()
                        _dict['url'] = item.a.get('href')
                        _dict['title'] = item.a.get('title').encode('UTF-8')
                        _dict['tvshowtitle'] = _dict['title']
                        _dict['originaltitle'] = _dict['title']
                        img = item.find('img')
                        if img:
                            try:
                                img = img.get('data-lazy-src')
                            except:
                                try:
                                    img = img.get('src')
                                except:
                                    img = ''
                        if not img:
                            img = ''
                        _dict['cover_url'] = a.image(img)
                        _dict['thumb_url'] = _dict['cover_url']
                        _dict['poster'] = _dict['cover_url']
                        _dict['sub_site'] = site
                        plot = item.find('div', {'class': 'right'})
                        if plot:
                            plot = plot.p.contents[0].encode('utf-8')
                            _dict['plot'] = plot
                            _dict['plotoutline'] = plot
                        item_list.extend([_dict])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('div', {'class': 'pagination'}))
        last_item = False
        if soup:
            for item in soup.findAll('a'):
                if (item.string.encode('UTF-8') == 'Last »') or (item.get('class') == 'last'):
                    last_item = item.get('href')
                    break
            if last_item is False:
                for last_item in soup.findAll('a', {'class': 'inactive'}):
                    pass
                if last_item:
                    last_item = last_item.get('href')
            item = soup.find('span', {'class': 'current'})
            if item:
                if item.parent:
                    item = item.parent
                    if item.previousSibling:
                        if item.previousSibling.find('a'):
                            item_list.extend([{'site': site, 'mode': 'list',
                                               'url': item.previousSibling.a.get('href'),
                                               'content': params['content'],
                                               'title': a.language(30017, True),
                                               'cover_url': a.image('previous.png', image),
                                               'backdrop_url': a.art(), 'type': 3}])
                    if item.nextSibling:
                        if item.nextSibling.find('a'):
                            item_list.extend([{'site': site, 'mode': 'list',
                                               'url': item.nextSibling.a.get('href'),
                                               'content': params['content'],
                                               'title': a.language(30018, True),
                                               'cover_url': a.image('next.png', image),
                                               'backdrop_url': a.art(), 'type': 3}])
            if last_item:
                item_list.extend([{'site': site, 'mode': 'list', 'url': last_item,
                                   'content': 'goto',
                                   'title': a.language(30019, True),
                                   'cover_url': a.image('goto.png', image),
                                   'backdrop_url': a.art(), 'type': 3}])
        a.add_items(item_list)
        a.end_of_directory()
    elif mode == 'play':
        html = a.get_page(params['url'])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('div', {'class': 'video-embed'}))
        item_list = []
        if soup:
            for script in soup.findAll(re.compile('s_*c_*r_*i_*p_*t')):
                item = ''
                if script.get('src'):
                    if 'http://videomega.tv/validatehash.php' in script['src']:
                        item = script['src']
                    elif 'ref=' in script.get('src'):
                        temp = re.search('.*ref=[\'"](.+?)[\'"]', script.get('src'))
                        if temp:
                            item = 'http://videomega.tv/iframe.php?ref=' + temp.group(1)
                xbmcdict = XBMCDict(0).update(params)
                if item:
                    _dict = xbmcdict.copy()
                    _dict['url'] = item
                    item_list.extend([_dict])
            if soup.find('iframe', src=True):
                item = ''
                for iframe in soup.findAll('iframe', src=True):
                    if iframe.get('data-lazy-src'):
                        item = iframe.get('data-lazy-src')
                        r = re.search('.+old=(.+)$', item)
                        if r:
                            item = r.group(1)
                    else:
                        item = iframe.get('src').replace('\\', '')
                    xbmcdict = XBMCDict(0).update(params)
                    if item:
                        _dict = xbmcdict.copy()
                        _dict['url'] = item
                        item_list.extend([_dict])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('div', {'id': 'video-infos'}))
        if soup:
            item = ''
            for p in soup.findAll('p'):
                if p.iframe:
                    item = p.iframe.get('src')
                xbmcdict = XBMCDict(0).update(params)
                if item:
                    _dict = xbmcdict.copy()
                    _dict['url'] = item
                    item_list.extend([_dict])
        if item_list:
            from playback import Playback
            Playback().choose_sources(item_list)
        else:
            a.alert(a.language(30904, True), sound=False)
def scrapeShowsGrid(self, html, params={}):
    get = params.get
    params["folder"] = "true"
    if self.__dbg__:
        print self.__plugin__ + " scrapeShowsGrid"
    next = "false"
    pager = SoupStrainer(name="div", attrs={'class': "yt-uix-pager"})
    pagination = BeautifulSoup(html, parseOnlyThese=pager)
    if len(pagination) > 0:
        tmp = str(pagination)
        if tmp.find("Next") > 0:
            next = "true"
    # Now look for the shows in the list.
    list = SoupStrainer(name="div", attrs={"class": "popular-show-list"})
    shows = BeautifulSoup(html, parseOnlyThese=list)
    yobjects = []
    status = 200
    if len(shows) > 0:
        show = shows.div.div
        while show != None:
            if show.a:
                item = {}
                episodes = show.find(name="div", attrs={'class': "show-extrainfo"})
                title = show.div.h3.contents[0]
                if episodes and episodes.span:
                    title = title + " (" + episodes.span.contents[0].lstrip().rstrip() + ")"
                title = title.replace("&amp;", "&")
                item['Title'] = title
                show_url = show.a["href"]
                if show_url.find("?p=") > 0:
                    show_url = show_url[show_url.find("?p=") + 1:]
                else:
                    show_url = show_url.replace("/show/", "")
                show_url = urllib.quote_plus(show_url)
                item['show'] = show_url
                item['icon'] = "shows"
                item['scraper'] = "show"
                thumbnail = show.a.span.img['src']
                if thumbnail.find("_thumb.") > 0:
                    thumbnail = thumbnail.replace("_thumb.", ".")
                else:
                    thumbnail = "shows"
                item["thumbnail"] = thumbnail
                if self.__dbg__:
                    print self.__plugin__ + " adding show " + repr(item['Title']) + ", url: " + repr(item['show'])
                yobjects.append(item)
            show = show.findNextSibling(name="div", attrs={'class': re.compile("show-cell .")})
    if not yobjects:
        return (self.__language__(30601), 303)
    yobjects[len(yobjects) - 1]["next"] = next
    return (yobjects, status)
def ParsePlotPage(self, id):
    resp = urllib.urlopen(self.story_url % id)
    strain = SoupStrainer("div", {"id": "synopsis"})
    soup = BeautifulSoup(resp.read(), strain, fromEncoding="utf-8")
    plot = ''.join(soup.find('div', {"class": "txt"}).findAll(text=True))
    self.meta.m_plot = plot.strip().replace('\r', '')
if not os.path.exists(DL_DIR):
    os.makedirs(DL_DIR)

URL_BIB = 'http://www.google.com/googlebooks/uspto-patents-grants-biblio.html'
DBSERVER = '127.0.0.1'
USERNAME = '******'
PASSWORD = '******'
DATABASE = 'PatentTools'


def bib_urls():
    response = urllib2.urlopen(URL_BIB).read()
    urls = []
    for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
        if link.has_key('href') and link['href'].find('.zip') != -1:
            urls.append(link['href'])
    return urls


def download_next_bib(n=1):
    # Create the needed tables if they don't exist
    create_retrieve_tables()
    # Get a list of files we've already downloaded
    con = mdb.connect(DBSERVER, USERNAME, PASSWORD, DATABASE)
    cur = con.cursor()
    cur.execute('SELECT url from files_retrieved')
    dl_urls = [f[0] for f in cur.fetchall()]
def get_links(html):
    links = BeautifulSoup(html, parseOnlyThese=SoupStrainer('a'))
    for link in links:
        yield link
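# Possible usage of get_links() above (our example, not part of the original
# module): fetch a page with urllib2 and print each anchor's href, skipping
# anchors that have none. BeautifulSoup 3 tags support has_key().
import urllib2

html = urllib2.urlopen('http://example.com/').read()
for link in get_links(html):
    if link.has_key('href'):
        print link['href']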
def main():
    ''' <Perpetual Loop Auto-submit business> '''
    list_home = os.getcwd()
    readyListFileName = "ready_list.txt"
    lockFileName = readyListFileName + ".lck"
    ### Exit if last list still pending, wait for it to be renamed/removed.
    if os.access(readyListFileName, os.F_OK) is True:
        print('ABORT: %s exists (Not picked up yet? Should be renamed'
              ' when retrieved by auto_submit loop!)' % readyListFileName)
        if os.access(lockFileName, os.F_OK) is True:
            os.remove(lockFileName)
        exit(0)
    ### If lock file exists, another process is already generating the list
    if os.access(lockFileName, os.F_OK) is True:
        print('ABORT: %s lockfile exists (Another process generating list'
              ' already? Should be deleted when complete!)' % lockFileName)
        exit(0)
    ### Touch a lock and list file.
    touchLi = open(readyListFileName, 'wb')
    touchLi.write('')
    touchLi.close()
    touchLo = open(lockFileName, 'wb')
    touchLo.write('')
    touchLo.close()
    ''' <Perpetual Loop Auto-submit business /> '''
    mkdir('/1/incoming/tmp/wiki-dumps')
    home = os.getcwd()
    # Get links for every Wiki Directory
    # (i.e. aawiki/,aawikibooks/,aawiktionary/,etc.).
    url = 'http://wikipedia.c3sl.ufpr.br/'
    indexHTML = urllib2.urlopen(url).read()
    wikiList = BeautifulSoup(indexHTML,
                             parseOnlyThese=SoupStrainer('a', href=re.compile('wik')))
    for link in wikiList:
        flogger.info('Downloading: %s' % link['href'])
        # Get links for the most recent dump in every Wiki Directory.
        # (i.e. 20110901/,20110908/,20111010/,etc.)
        itemHTML = urllib2.urlopen(url + link['href']).read()
        dirStrainer = SoupStrainer('a', href=re.compile('20'))
        print dirStrainer
        dirLinks = ([tag for tag in BeautifulSoup(itemHTML, parseOnlyThese=dirStrainer)][-1])
        for itemDIR in dirLinks:
            identifier = ("%s-%s" % (link['href'].strip('/'), itemDIR))
            mkdir(identifier)
            makeMeta(identifier)
            # Get links for every file in dump directory
            # (i.e. pages-logging.xml.gz,pages-articles.xml.bz2,etc.)
            dirHTML = urllib2.urlopen(url + link['href'] + itemDIR).read()
            dirLinks = BeautifulSoup(dirHTML,
                                     parseOnlyThese=SoupStrainer('a', href=re.compile(link['href'])))
            for dumpFile in dirLinks:
                dirURL = url + dumpFile['href'].strip('/')
                fname = dumpFile['href'].split('/')[-1]
                flogger.info('Downloading: %s' % dirURL)
                wget = 'wget -c %s' % dirURL
                execute = call(wget, shell=True)
            os.chdir(home)
    os.chdir(list_home)
    dataList = os.listdir(home)
    f = open(readyListFileName, 'wb')
    f.write('\n'.join(dataList))
    f.close()
    ### Remove lock file...
    os.remove(lockFileName)
    flogger.info('YOU HAVE SO MUCH WIKI!')
def parseHtmlParams(currentURL, htmlContent):
    global database, database_css, database_js
    """ Parse html to get args """
    for url in database_url:
        k = url.find('?')
        if k > 0:
            keyUrl = url[0:k - 1]
            query = url[k + 1:]
            if not keyUrl in database:
                database[keyUrl] = {}
                database[keyUrl]['GET'] = {}
                database[keyUrl]['POST'] = {}
            lG = database[keyUrl]['GET']
            lG = dict_add(lG, splitQuery(query))
            database[keyUrl]['GET'] = lG
        elif len(dumb_params) > 0:
            keyUrl = url
            # no params in the URL... let's assign the dumb_params
            if not keyUrl in database:
                database[keyUrl] = {}
                database[keyUrl]['GET'] = {}
                database[keyUrl]['POST'] = {}
            lG = database[keyUrl]['GET']
            lP = database[keyUrl]['POST']
            lG = dict_add_list(lG, dumb_params)
            lP = dict_add_list(lP, dumb_params)
            database[keyUrl]['GET'] = lG
            database[keyUrl]['POST'] = lP
    # then, parse the forms
    forms = SoupStrainer('form')
    input = SoupStrainer('input')
    listForm = [tag for tag in BeautifulSoup(htmlContent, parseOnlyThese=forms)]
    for f in listForm:
        method = 'GET'
        if 'method' in f or 'METHOD' in f:
            method = f['method'].upper()
        action = currentURL
        if 'action' in f or 'ACTION' in f:
            action = f['action']
        keyUrl = giveGoodURL(action, currentURL)
        listInput = [tag for tag in BeautifulSoup(str(f), parseOnlyThese=input)]
        for i in listInput:
            if not keyUrl in database:
                database[keyUrl] = {}
                database[keyUrl]['GET'] = {}
                database[keyUrl]['POST'] = {}
            try:
                value = i['value']
            except KeyError:
                value = '42'
            try:
                name = i['name']
            except KeyError:
                name = 'foo'
                value = 'bar'
                continue
            lGP = database[keyUrl][method]
            lGP = dict_add(lGP, {name: value})
            database[keyUrl][method] = lGP
    return True
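# parseHtmlParams() above leans on a splitQuery() helper that is not shown
# in this snippet. A minimal hypothetical implementation consistent with how
# it is called (raw query string in, {name: value} dict out) could be:
def splitQuery(query):
    '''Split 'a=1&b=2' into {'a': '1', 'b': '2'}; bare names map to ''.'''
    params = {}
    for pair in query.split('&'):
        if not pair:
            continue
        if '=' in pair:
            name, value = pair.split('=', 1)
        else:
            name, value = pair, ''
        params[name] = value
    return params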
from BeautifulSoup import BeautifulSoup, SoupStrainer
import urllib, urllib2, re

# Homepage where the sergas waiting lists are hosted
home = 'http://www.sergas.es/MostrarContidos_N2_T01.aspx?IdPaxina=40026'
# Site to include in case of relative paths
site = 'http://www.sergas.es'
# Bottom link page that allows to navigate to the different data for a waitingList
docLinks = 'DocLinks.htm'
# Exceptions to the automation process developed
skipped1 = 'http://www.sergas.es/Docs/ListasEspera/CIR/HTML/201003/WEB_CI~1/DocLinks.htm'
skipped2 = 'http://www.sergas.es/Docs/ListasEspera/CIR/HTML/200506/web_cir_2005-06_archivos/tabstrip.htm'

try:
    html = urllib2.urlopen(home).read()
    findDivs = SoupStrainer('div', {'class': 'list_nivel2'})
    soup = BeautifulSoup(html, parseOnlyThese=findDivs)
    output = open('SergasAuxLinks.txt', 'w')
    for link in soup.findAll('a', href=re.compile('MostrarContidos_')):
        uri = link['href'].partition('uri=')
        uriaux = uri[2]
        if not re.match('^http', uriaux):
            uriaux = site + uriaux
        ruta = uriaux.partition('.htm')
        rutaFinal = ruta[0] + '/' + docLinks
        output.write(rutaFinal + u'\n')
    # We manually enter these two skipped links that fall out of the automation
    output.write(skipped1 + u'\n')
    output.write(skipped2 + u'\n')
    output.close()
except Exception, e:
    pass
def parse(self):
    result = [(i['href'], i.string)
              for i in BeautifulSoup(self.content,
                                     parseOnlyThese=SoupStrainer('td')
                                     ).findAll('a', {'class': 'highLight'})]
    return result
def parseHtmlLinks(currentURL, htmlContent):
    global database_url, database_js, database_css
    """ Parse the HTML/XHTML code to get JS, CSS, links etc. """
    links = SoupStrainer('a')
    # listAnchors = [tag['href'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=links)]
    listAnchors = []
    for tag in BeautifulSoup(htmlContent, parseOnlyThese=links):
        try:
            string = str(tag).lower()
            if string.count("href") > 0:
                listAnchors.append(tag['href'])
        except TypeError:
            continue
        except KeyError:
            continue
    for a in listAnchors:
        goodA = giveGoodURL(a, currentURL)
        goodA = removeSESSID(goodA)
        if (root in goodA) and (goodA not in database_url):
            database_url.append(goodA)
    # parse the CSS and the JavaScript
    script = SoupStrainer('script')
    # listScripts = [tag['src'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=script)]
    listScripts = []
    for tag in BeautifulSoup(htmlContent, parseOnlyThese=script):
        try:
            string = str(tag).lower()
            if string.count("src") > 0 and string.count(".src") < 1:
                listScripts.append(tag['src'])
        except TypeError:
            continue
        except KeyError:
            continue
    for a in listScripts:
        sc = giveGoodURL(a, currentURL)
        if sc not in database_js:
            database_js.append(sc)
        if sc == currentURL:  # remote script
            database_ext.append(sc)
            parseJavaScriptCalls()
    link = SoupStrainer('link')
    # listLinks = [tag['href'] for tag in BeautifulSoup(htmlContent, parseOnlyThese=link)]
    listLinks = []
    for tag in BeautifulSoup(htmlContent, parseOnlyThese=link):
        try:
            string = str(tag).lower()
            if string.count("href") > 0:
                listLinks.append(tag['href'])
        except TypeError:
            continue
        except KeyError:
            continue
    for a in listLinks:
        sc = giveGoodURL(a, currentURL)
        if sc not in database_css:
            database_css.append(sc)
    return True
def html2plaintext(html, body_id=None, encoding='ascii'):
    """ from an HTML text, convert the HTML to plain text.
    If @body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    urls = []
    if body_id is not None:
        strainer = SoupStrainer(id=body_id)
    else:
        # if the html doesn't contain a <body> tag it doesn't make
        # sense to use a strainer
        if html.count('<body'):
            strainer = SoupStrainer('body')
        else:
            strainer = None

    soup = BeautifulSoup(html, parseOnlyThese=strainer, fromEncoding=encoding)
    for link in soup.findAll('a'):
        title = unicode(link.renderContents(), encoding)
        for url in [x[1] for x in link.attrs if x[0] == 'href']:
            urls.append(dict(url=url, tag=unicode(str(link), encoding), title=title))

    try:
        html = soup.renderContents(encoding=encoding)
    except AttributeError:
        # using OLDER version than BeautifulSoup 3.1
        html = soup.__str__(encoding)

    if isinstance(html, str) and encoding != 'ascii':
        html = unicode(html, encoding)

    url_index = []
    i = 0
    for d in urls:
        if d['title'] == d['url'] or u'http://' + d['title'] == d['url']:
            html = html.replace(d['tag'], d['url'])
        else:
            i += 1
            html = html.replace(d['tag'], u'%s [%s]' % (d['title'], i))
            url_index.append(d['url'])

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')

    # the only line breaks we respect are those of ending tags and
    # breaks
    html = html.replace('\n', ' ')
    html = html.replace('<br>', '\n')
    html = html.replace('&nbsp;', ' ')
    html = html.replace('</p>', '\n\n')
    html = re.sub('<br\s*/>', '\n', html)
    html = html.replace(' ' * 2, ' ')

    # for all other tags we failed to clean up, just remove them and
    # complain about them on the stderr
    def desperate_fixer(g):
        #print >>sys.stderr, "failed to clean up %s" % str(g.group())
        return ' '

    html = re.sub('<.*?>', desperate_fixer, html)

    # lstrip all lines
    html = u'\n'.join([x.lstrip() for x in html.splitlines()])

    for i, url in enumerate(url_index):
        if i == 0:
            html += u'\n\n'
        html += u'[%s] %s\n' % (i + 1, url)

    html = unescape(html)
    return html
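# Rough usage sketch for html2plaintext() above, assuming the module's
# unescape() helper is in scope; the sample markup is ours. Links whose
# text differs from their URL become numbered references at the end.
sample = ('<body><h1>Title</h1>'
          '<p>See <a href="http://example.com">the site</a>.</p></body>')
print html2plaintext(sample)
# prints roughly:
# **Title**
# See the site [1].
#
# [1] http://example.com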
def main():
    base_fmt = ("http://www.bing.com/search?q=%s&first=%d&filters="
                "ex1%%253a%%22ez%s%%22")
    new_urls = set()
    all_urls = set()
    try:
        with open(URIS, "rb") as f:
            all_urls.update(f.read().splitlines())
    except IOError:
        pass
    print "# urls: ", len(all_urls)
    try:
        skip = get_crawl_index()
        gen = compute_all_search_combinations()
        for i, (search, period, index) in enumerate(gen):
            if i < skip:
                continue
            set_crawl_index(i - 1)
            print "#" * 30
            print i, len(gen), (search, period, index)
            header = {
                'Referer': 'http://google.com/p/%d' % random.randint(1, 1000),
                'User-agent': 'Mozilla/%.1f' % random.random()
            }
            page_start = 1
            notfound = 0
            while page_start < 200:
                print "page offset: ", page_start
                string_quote = urllib.quote('"%s" %s' % (search, str(index)))
                url = base_fmt % (string_quote, page_start, period)
                request = urllib2.Request(url, None, header)
                response = urllib2.urlopen(request)
                urls = []
                links = SoupStrainer('a')
                for a in BeautifulSoup(response.read(), parseOnlyThese=links):
                    for k, v in a.attrs:
                        if k == "href":
                            try:
                                v = str(v)
                            except:
                                continue
                            if not v.startswith("http"):
                                continue
                            if ".microsofttranslator.com" in v:
                                continue
                            if ".microsoft.com" in v:
                                continue
                            urls.append(v)
                print set(urls) - all_urls
                num_new = len(set(urls) - all_urls)
                if num_new == 0:
                    notfound += 1
                    if notfound > 1:
                        print "nothing new, skip"
                        break
                else:
                    notfound = 0
                new_urls.update(urls)
                all_urls.update(urls)
                page_start += 10
    finally:
        print "writing..."
        with open(URIS, "ab") as f:
            f.write("\n".join(new_urls) + "\n")
def get_movies(iurl):
    """
    Get the list of movies.
    :return: list
    """
    movies = []
    if iurl[-3:] == '?s=':
        search_text = GetSearchQuery('WatchOnlineMovies')
        search_text = urllib.quote_plus(search_text)
        iurl += search_text
    html = requests.get(iurl, headers=mozhdr).text
    mlink = SoupStrainer('div', {'class': re.compile('postbox')})
    items = BeautifulSoup(html, parseOnlyThese=mlink)
    plink = SoupStrainer('div', {'class': 'wp-pagenavi'})
    Paginator = BeautifulSoup(html, parseOnlyThese=plink)
    for item in items:
        title = item.h2.text
        # Strip the site's boilerplate from the title. str.replace() never
        # raises, so the original chain of try/except pairs reduces to
        # applying these substring removals in the same order.
        for junk in ("Full Movie", "Watch Online Placeholdernt", ".",
                     "Watch Online", "Watch Onlin", "HD Pri", " Watch On",
                     " Watch", "Free Down", "Free D", "Free", " F", " Fr",
                     " Fre", " HD", " H", " HD P", " re", " r"):
            title = title.replace(junk, "")
        # Coloring Years
        title = title.replace("(2018)", "[COLOR yellow](2018)[/COLOR]")
        title = title.replace("(2016)", "[COLOR lightsalmon](2016)[/COLOR]")
        title = title.replace("(2015)", "[COLOR lime](2015)[/COLOR]")
        # Language
        title = title.replace("Hindi", "[COLOR green]Hindi[/COLOR]")
        title = title.replace("Dubbed", "[COLOR cyan]Dubbed[/COLOR]")
        # Continued cleanup of fragments left by the removals above
        for junk in (" nt o", " nt F", " nt", " Pr"):
            title = title.replace(junk, "")
        url = item.h2.find('a')['href']
        try:
            thumb = item.find('img')['src'].strip()
        except:
            thumb = _icon
        movies.append((title, thumb, url))
    if 'next' in str(Paginator):
        nextli = Paginator.find('a', {'class': re.compile('page larger')})
        purl = nextli.get('href')
        pages = Paginator.findAll('span', {'class': re.compile('pages')})
        lastpg = pages[len(pages) - 1].text
        title = 'Next Page.. (Currently in %s)' % (lastpg)
        movies.append((title, _icon, purl))
    return movies
import socket, urllib2
import re
import cPickle as pickle
from BeautifulSoup import BeautifulSoup, SoupStrainer

# timeout in seconds
timeout = 5
socket.setdefaulttimeout(timeout)

# utf-8 i/o plz!
import sys
import codecs
stdout = codecs.getwriter('utf-8')(sys.stdout)
stdin = codecs.getreader('utf-8')(sys.stdin)
stderr = codecs.getwriter('utf-8')(sys.stderr)

titles = SoupStrainer('title')


def get_title(h):
    '''Retrieve title of a web page.'''
    try:
        s = BeautifulSoup(urllib2.urlopen(h), parseOnlyThese=titles)
        return s.title.string.replace('\n', ' ').replace('\r', ' ').strip()
    except Exception, err:
        return ''


global blog, links, rlinks, tags, rtags, title, ua
blog = {}
links = {}
rlinks = {}
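# Quick usage sketch for get_title() above (the address is a placeholder):
# the helper swallows every error and returns '', so callers need no
# try/except, and socket.setdefaulttimeout bounds how long a fetch can hang.
t = get_title('http://example.com/')
if t:
    print >> stdout, t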
def get_link(input_url=None):
    """ To get inner link of given link """
    input_url = str(input_url)
    return_list = {}
    url_list = []  # contains all url list
    url_dict = {}  # contains url and key value pairs EX: {"http://vlabs.ac.in/index.html#aboutus": "ABOUT"}
    try:
        http = httplib2.Http()
        status, response = http.request(input_url)
    except:
        response = []
        logger.exception("Invalid URL --- {0}.".format(input_url))
        return response
    for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
        # Filter only link from web page(Given search url)
        try:
            content = ''.join(link.findAll(text=True))
            content = replace_html(content)
            content = ' '.join(content.split())
        except:
            content = ''
        if link.has_key('href'):
            url = link['href']
            if not content:
                logger.info("URL --- {0} dont have any content. CONTENT {1}".format(url, content))
                continue
            try:
                if re.match('^http', url):
                    url = url
                else:
                    # In relative url add base url
                    url = "/".join([input_url, url])
                try:
                    url = url.rstrip("/")
                    req = urllib2.Request(url)
                    urllib2.urlopen(req)
                    url_list.append(url)
                    url_dict[url] = content
                    logger.info("get_link Valid_Url URL --- {0} and CONTENT --- {1}".format(url, content))
                except:
                    logger.info("get_link Not_Valid_Url URL --- {0} and CONTENT --- {1}".format(url, content))
            except:
                logger.exception("get_link Invalid_Url URL --- {0}".format(url))
    url_list = set(url_list)
    return_list['url_list'] = list(url_list)
    return_list['url_dict'] = url_dict
    return return_list
def get_links(x):
    '''Retrieve links from blog posts in XML file.'''
    google_tags = set([u'fresh', u'read', u'reading-list'])
    anchors = SoupStrainer('a', href=re.compile('^http'))
    d = feedparser.parse(x)
    for e in d.entries:
        try:
            b = e.source.link  # blog source
            i = e.title        # blog post title
            l = e.link         # link to blog post
            # try to get permalink by following redirects
            #print >>stderr, "blog:", b, "link:", l
            b_ = b.replace('http://', '').replace('www.', '')
            if b_ not in l:
                l = get_true_url(l)
            try:
                blog[l] = b
                tags[l] = set([t.label or t.term
                               for t in e.tags if t.label or t.term]) - google_tags
                #print >>stderr, tags[l]
                for t in tags[l]:
                    rtags.setdefault(t, set())
                    rtags[t].add(l)
                title[l] = i
                # get blog post summary by trying
                # several RSS aliases
                p = None
                if 'summary' in e:
                    p = e.summary
                elif 'subtitle' in e:
                    p = e.subtitle
                elif 'content' in e and 'value' in e.content:
                    p = e.content.value
                else:
                    req = urllib2.Request(l, None, {'User-Agent': ua})
                    p = urllib2.urlopen(req).geturl()
                # parse the html
                s = BeautifulSoup(p, parseOnlyThese=anchors)
                #print >>stderr, s.prettify()
                # index links in blog post summary
                links.setdefault(l, [])
                for a in s.findAll('a'):
                    h = a['href']
                    #h = get_true_url(h)
                    blog.setdefault(h, '')
                    links[l].append(h)
                    rlinks.setdefault(h, [])
                    rlinks[h].append(l)
                    tags.setdefault(h, set())
                    #title.setdefault(h, get_title(h))
                    #print >>stderr, h
                print >> stderr, "WIN! \(^o^)/", l
            except Exception, err:
                print >> stderr, "FAIL! >_<", err, l
        except Exception, err:
            print >> stderr, "EPIC FAIL! Orz", err, e.id
def __init__(self, params):
    import re
    from addon import Addon
    from addondict import AddonDict as XBMCDict
    from BeautifulSoup import BeautifulSoup, SoupStrainer, Comment
    a = Addon()
    site = self.__module__
    mode = params['mode']
    home_url = 'http://pornhardx.com/'
    movies_url = home_url + 'category/full-movie/'
    scenes_url = home_url + 'video/'
    search_url = home_url + '?s='
    false_positives = ['http://pornhardx.com/video',
                       'http://pornhardx.com/video/?order=viewed',
                       'http://pornhardx.com/video/?order=liked',
                       'http://pornhardx.com/']
    if mode == 'main':
        item_list = []
        item_list.extend([{'site': site, 'mode': 'list', 'title': a.language(30006),
                           'content': '', 'url': scenes_url,
                           'cover_url': a.image('all.png', image),
                           'backdrop_url': a.art(), 'type': 3}])
        item_list.extend([{'site': site, 'mode': 'list', 'title': a.language(30003),
                           'content': '', 'url': home_url,
                           'cover_url': a.image('recent.png', image),
                           'backdrop_url': a.art(), 'type': 3}])
        item_list.extend([{'site': site, 'mode': 'categories', 'title': a.language(30005),
                           'content': '', 'url': scenes_url,
                           'cover_url': a.image('categories.png', image),
                           'backdrop_url': a.art(), 'type': 3}])
        item_list.extend([{'site': site, 'mode': 'list', 'title': a.language(30004),
                           'content': 'search', 'url': search_url,
                           'cover_url': a.image('search.png', image),
                           'backdrop_url': a.art(), 'type': 3}])
        item_list.extend(a.favs_hist_menu(site))
        item_list.extend(a.extended_menu())
        a.add_items(item_list)
        a.end_of_directory()
    elif mode == 'categories':
        html = a.get_page(params['url'])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('div', {'id': 'navigation-wrapper'}))
        item_list = []
        if soup:
            for item in soup.findAll('a', {'href': True}):
                if item:
                    if item.get('href') not in false_positives:
                        if 'full-movie' in params['url']:
                            if movies_url != item.get('href') and 'full-movie' in item.get('href'):
                                item_list.extend([{'site': site, 'mode': 'list',
                                                   'url': item.get('href'), 'content': '',
                                                   'title': item.contents[0].encode('UTF-8'),
                                                   'cover_url': a.image(image, image),
                                                   'backdrop_url': a.art(), 'type': 3}])
                        elif 'full-movie' not in item.get('href'):
                            item_list.extend([{'site': site, 'mode': 'list',
                                               'url': item.get('href'), 'content': '',
                                               'title': item.contents[0].encode('UTF-8'),
                                               'cover_url': a.image(image, image),
                                               'backdrop_url': a.art(), 'type': 3}])
        a.add_items(item_list)
        a.end_of_directory()
    elif mode == 'list':
        if params.get('content', '') == 'search':
            item = a.search_input()
            if item:
                params['url'] = search_url + item
            else:
                exit(1)
        elif params.get('content', '') == 'goto':
            last_item = re.search('/page/([0-9]+)/', params['url'])
            if last_item:
                last_item = int(last_item.group(1))
            else:
                last_item = 10000
            item = a.page_input(last_item)
            if item:
                params['url'] = re.sub('/page/[0-9]+/', '/page/' + str(item) + '/', params['url'])
            else:
                exit(1)
        html = a.get_page(params['url'])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer(
            'div', {'class': re.compile('col-sm-8(?:\s*main-content)*')}))
        item_list = []
        params['mode'] = 'play'
        params['content'] = 'movies'
        params['type'] = 0
        params['context'] = 0
        params['duration'] = '7200'
        if soup:
            xbmcdict = XBMCDict(0).update(params)
            for item in soup.findAll('div', {'class': re.compile(
                    '.*(?:col-xs-6 item|post type-post status-publish).*')}):
                if item:
                    if item.a.get('href') not in false_positives:
                        _dict = xbmcdict.copy()
                        if 'full-movie' not in params['url']:
                            _dict['duration'] = '1500'
                            _dict['content'] = 'episodes'
                        if item.h3:
                            _dict['url'] = item.h3.a.get('href')
                            if item.h3.a.contents:
                                _dict['title'] = item.h3.a.contents[0].encode('UTF-8')
                            else:
                                _dict['title'] = 'Untitled'
                        elif item.h2:
                            _dict['url'] = item.h2.a.get('href')
                            if item.h2.a.contents:
                                _dict['title'] = item.h2.a.contents[0].encode('UTF-8')
                            else:
                                _dict['title'] = 'Untitled'
                        _dict['tvshowtitle'] = _dict['title']
                        _dict['originaltitle'] = _dict['title']
                        _dict['cover_url'] = a.image(item.img.get('src'))
                        _dict['thumb_url'] = _dict['cover_url']
                        _dict['poster'] = _dict['cover_url']
                        _dict['sub_site'] = site
                        item_list.extend([_dict])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('ul', {'class': 'pagination'}))
        if soup.li:
            item = soup.find('a', {'class': 'prev page-numbers'})
            if item:
                item_list.extend([{'site': site, 'mode': 'list',
                                   'url': item.get('href'),
                                   'content': params['content'],
                                   'title': a.language(30017, True),
                                   'cover_url': a.image(image, image),
                                   'backdrop_url': a.art(), 'type': 3}])
            item = soup.find('a', {'class': 'next page-numbers'})
            if item:
                item_list.extend([{'site': site, 'mode': 'list',
                                   'url': item.get('href'),
                                   'content': params['content'],
                                   'title': a.language(30018, True),
                                   'cover_url': a.image(image, image),
                                   'backdrop_url': a.art(), 'type': 3}])
                if len(soup.findAll('a')) > 2:
                    last_item = soup.find('a', {'class': 'next page-numbers'}).parent.previousSibling.a.get('href')
                    item_list.extend([{'site': site, 'mode': 'list', 'url': last_item,
                                       'content': 'goto',
                                       'title': a.language(30019, True),
                                       'cover_url': a.image(image, image),
                                       'backdrop_url': a.art(), 'type': 3}])
            else:
                item = soup.find('span', {'class': 'page-numbers current'})
                if item:
                    if len(soup.findAll('a')) > 2:
                        last_item = soup.find('span', {'class': 'page-numbers current'}).parent.previousSibling.a.get('href')
                        item_list.extend([{'site': site, 'mode': 'list', 'url': last_item,
                                           'content': 'goto',
                                           'title': a.language(30019, True),
                                           'cover_url': a.image('goto.png', image),
                                           'backdrop_url': a.art(), 'type': 3}])
        else:
            soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('ul', {'class': 'pager'}))
            item = soup.find('li', {'class': 'previous'})
            if item:
                item_list.extend([{'site': site, 'mode': 'list',
                                   'url': item.previousSibling.get('href'),
                                   'content': params['content'],
                                   'title': a.language(30017, True),
                                   'cover_url': a.image('previous.png', image),
                                   'backdrop_url': a.art(), 'type': 3}])
            item = soup.find('li', {'class': 'next'})
            if item:
                item_list.extend([{'site': site, 'mode': 'list',
                                   'url': item.previousSibling.get('href'),
                                   'content': params['content'],
                                   'title': a.language(30018, True),
                                   'cover_url': a.image('next.png', image),
                                   'backdrop_url': a.art(), 'type': 3}])
        a.add_items(item_list)
        a.end_of_directory()
    elif mode == 'play':
        html = a.get_page(params['url'])
        soup = BeautifulSoup(html, parseOnlyThese=SoupStrainer('body'))
        item = ''
        item_list = []
        if soup:
            for item in soup.findAll('param', {'name': 'FlashVars'}):
                item = item.get('value')
                item = re.search('.*?proxy\.link=(.+?)&(?:proxy|skin).*?', item)
                if item:
                    if item not in item_list:
                        item = item.group(1)
                    else:
                        item = ''
                else:
                    item = ''
                xbmcdict = XBMCDict(0).update(params)
                if item:
                    _dict = xbmcdict.copy()
                    _dict['url'] = item
                    item_list.extend([_dict])
            item = ''
            for item in soup.findAll('video'):
                for source in soup.findAll('source'):
                    src = source.get('src')
                    if src:
                        xbmcdict = XBMCDict(0).update(params)
                        if item and ('..' not in src):
                            _dict = xbmcdict.copy()
                            try:
                                _dict['src_title'] = source.get('data-res') + 'p'
                            except:
                                pass
                            _dict['url'] = src
                            item_list.extend([_dict])
                try:
                    src = item.get('src')
                    if src:
                        xbmcdict = XBMCDict(0).update(params)
                        if item and ('..' not in src):
                            _dict = xbmcdict.copy()
                            try:
                                _dict['src_title'] = source.get('data-res') + 'p'
                            except:
                                pass
                            _dict['url'] = src
                            item_list.extend([_dict])
                except:
                    pass
            for script in soup.findAll('script'):
                item = ''
                if script.get('src'):
                    if 'http://videomega.tv/validatehash.php' in script['src']:
                        item = script['src']
                    elif 'ref=' in script.get('src'):
                        temp = re.search('.*ref=[\'"](.+?)[\'"]', script.get('src'))
                        if temp:
                            item = 'http://videomega.tv/iframe.php?ref=' + temp.group(1)
                xbmcdict = XBMCDict(0).update(params)
                if item:
                    _dict = xbmcdict.copy()
                    _dict['url'] = item
                    item_list.extend([_dict])
            for iframe in soup.findAll('iframe'):
                item = ''
                if iframe.get('src'):
                    if 'http://videomega.tv/validatehash.php' in iframe['src']:
                        item = iframe['src']
                    elif 'ref=' in iframe.get('src'):
                        temp = re.search('.*ref=[\'"](.+?)[\'"]', iframe.get('src'))
                        if temp:
                            item = 'http://videomega.tv/iframe.php?ref=' + temp.group(1)
                    else:
                        item = iframe.get('src')
                xbmcdict = XBMCDict(0).update(params)
                if item:
                    _dict = xbmcdict.copy()
                    _dict['url'] = item
                    item_list.extend([_dict])
        if item_list:
            from playback import Playback
            Playback().choose_sources(item_list)
        else:
            a.alert(a.language(30904, True), sound=False)
def handle_noargs(self, **options):
    def clean_num(value):
        if value.strip() == '':
            value = None
        else:
            try:
                value = float(value.replace(',', '').replace(' ', ''))
            except:
                value = None
        return value

    for year in YEARS:
        insert_count = 0
        update_count = 0
        if year < 2009:
            url = '%sstalt%s.htm' % (URL, str(year)[-2:])
        else:
            url = '%sstalt%s%s.htm' % (URL, str(year)[-2:], 'q4')
        try:
            page = urllib2.urlopen(url)
            print '%s - scraping labor underutilization: %s' % (year, url)
        except:
            print 'No labor underutilization page for %s. full URL is %s' % (year, url)
            continue
        # underemployment table should have an id of 'alternmeas' + the year,
        # so just parse that portion of the page
        strainerTag = SoupStrainer('table', id=re.compile('alt'))
        table = BeautifulSoup(page, parseOnlyThese=strainerTag)
        if len(table) < 1:
            print 'no underemployment table found on page %s' % url
            continue
        elif len(table) > 1:
            print 'duplicate tables found on page %s' % url
            continue
        # get a list of data headers
        headers = table.find('thead').findAll(text=re.compile("U-"))
        headers = [x.lower().replace('-', '') for x in headers]
        # scrape data & store in dictionary form
        data = {}
        rows = table.find('tbody').findAll('tr')
        for row in rows:
            state = row.th.text
            if data.has_key(state):
                print 'error: duplicate row found for state %s' % state
                continue
            else:
                data[state] = {}
            cols = row.findAll('td')
            for i, col in enumerate(cols):
                data[state][headers[i]] = col.text
            # insert/update
            try:
                record = LaborUnderutilizationStateRaw.objects.get(year=year, state=state)
                update_count = update_count + 1
            except:
                record = LaborUnderutilizationStateRaw(year=year, state=state)
                insert_count = insert_count + 1
            record.u1 = clean_num(data[state]['u1'])
            record.u2 = clean_num(data[state]['u2'])
            record.u3 = clean_num(data[state]['u3'])
            record.u4 = clean_num(data[state]['u4'])
            record.u5 = clean_num(data[state]['u5'])
            record.u6 = clean_num(data[state]['u6'])
            record.save()
        db.reset_queries()
        print '%s - %s rows scraped' % (year, len(data))
        print '%s - %s records inserted and %s records updated' % (year, insert_count, update_count)
def sources(self, url):
    logger.debug('SOURCES URL %s' % url, __name__)
    try:
        srcs = []
        if url == None:
            return srcs
        if 'hd' in url.lower():
            quality = 'HD'
        else:
            quality = 'SD'
        html = client.request(url)
        try:
            linkcode = jsunpack.unpack(html).replace('\\', '')
            srcs = json.loads(re.findall('sources:(.*?)\}\)', linkcode)[0])
            for source in srcs:
                url = source['file']
                host = client.host(url)
                self.srcs.append({'source': host, 'parts': '1',
                                  'quality': quality, 'provider': 'tamilgun',
                                  'url': url, 'direct': False})
        except:
            pass
        mlink = SoupStrainer('div', {'id': 'videoframe'})
        videoclass = BeautifulSoup(html, parseOnlyThese=mlink)
        try:
            links = videoclass.findAll('iframe')
            for link in links:
                url = link.get('src')
                host = client.host(url)
                self.srcs.append({'source': host, 'parts': '1',
                                  'quality': quality, 'provider': 'tamilgun',
                                  'url': url, 'direct': False})
        except:
            pass
        mlink = SoupStrainer('div', {'class': 'entry-excerpt'})
        videoclass = BeautifulSoup(html, parseOnlyThese=mlink)
        try:
            links = videoclass.findAll('iframe')
            for link in links:
                if 'http' in str(link):
                    url = link.get('src')
                    host = client.host(url)
                    self.srcs.append({'source': host, 'parts': '1',
                                      'quality': quality, 'provider': 'tamilgun',
                                      'url': url, 'direct': False})
        except:
            pass
        try:
            sources = json.loads(re.findall('vdf-data-json">(.*?)<', html)[0])
            url = 'https://www.youtube.com/watch?v=%s' % sources['videos'][0]['youtubeID']
            host = client.host(url)
            self.srcs.append({'source': host, 'parts': '1',
                              'quality': quality, 'provider': 'tamilgun',
                              'url': url, 'direct': False})
        except:
            pass
        return self.srcs
    except:
        return self.srcs
def verify_url(self):
    logger.debug('Fetching url')
    url = self.url.get_text()
    name = self.name.get_text()
    verified = False
    proxies = get_network_proxies()
    try:
        if url.startswith('file://'):
            GObject.idle_add(self.set_loading_url, False)
            GObject.idle_add(self.create_app, url, name)
            return
        elif not url.startswith(('http://', 'https://',)):
            url = 'http://%s' % url
        try:
            logger.debug('starting')
            response = requests.get(url, proxies=proxies)
            verified = True
            logger.debug('finishing')
        except requests.RequestException:
            logger.debug('Error downloading url %s' % url)
            GObject.idle_add(self.set_loading_url, False)
            GObject.idle_add(self.set_error_message,
                             _('The URL %s could not be reached.\nPlease double check'
                               ' the URL you provided and try again.' % url))
            return
        SkipIcon = type('SkipIcon', (Exception,), {})
        if self.icon != DEFAULT_APP_ICON:
            raise SkipIcon()
        # Try to find the apple-touch-icon
        logger.debug('parsing')
        soup = BeautifulSoup(response.content,
                             parseOnlyThese=SoupStrainer('link'))
        icons = soup.findAll('link', rel=re.compile('^apple-touch-icon'))
        logger.debug('finished parsing')
        soup = BeautifulSoup(response.content)
        if not icons:
            logger.debug('No apple touch icon found')
            raise SkipIcon()
        icon = icons[0]
        href = icon.attrMap.get('href', None)
        if not href:
            logger.debug('Bad apple touch icon')
            raise SkipIcon()
        icon_url = None
        if href.startswith('/'):
            parsed = urlparse.urlparse(url)
            icon_url = urlparse.urljoin('%s://%s' % (parsed.scheme, parsed.netloc,), href)
        else:
            parsed = urlparse.urlparse(href)
            if parsed.scheme:
                icon_url = href
            else:
                icon_url = urlparse.urljoin(url, href)
        ext = op.splitext(icon_url)[-1]
        tmpf = tempfile.mktemp(ext)
        logger.debug('temp file: %s' % tmpf)
        headers = {'User-Agent': 'Mozilla/5.0 (iPad; U; CPU OS 3_2 like'
                   ' Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko)'
                   ' Version/4.0.4 Mobile/7B334b Safari/531.21.10'}
        try:
            icon_bytes = requests.get(icon_url, headers=headers, proxies=proxies).content
        except requests.RequestException:
            logger.debug('Error downloading apple touch icon')
        else:
            handle = open(tmpf, 'w')
            handle.write(icon_bytes)
            handle.close()
            self.setup_icon(tmpf)
    except Exception, e:
        logger.debug("Error: %s" % e)
def get_movies(iurl):
    """
    Get the list of movies.
    :return: list
    """
    movies = []
    logging.warning("{0} {1} {2} {0}".format('##' * 15, 'getmovies', iurl))
    if iurl[-3:] == '?s=':
        search_text = GetSearchQuery('TamilGun')
        search_text = urllib.quote_plus(search_text)
        iurl += search_text
    if 'tamildbox' in iurl:
        logging.warning("{0} {1} {2} {0}".format('##' * 15, 'dbox-iurl', iurl))
        html = requests.get(iurl, headers=mozhdr).text
        tlink = SoupStrainer('div', {'class': re.compile('listbox')})
        items = BeautifulSoup(html, parseOnlyThese=tlink)
        plink = SoupStrainer('div', {'class': 'pagination'})
        Paginator = BeautifulSoup(html, parseOnlyThese=plink)
        for item in items:
            title = item.h4.text
            url = item.find('div', attrs={'class': 'btn btn-primary watch'}).find('a', href=True)['href']
            try:
                thumb = item.find('img')['src'].strip()
            except:
                thumb = _icon
            movies.append((title, thumb, url))
        logging.warning("{0} {1} {2} {0}".format('##' * 15, 'dbox-Paginator', Paginator))
        if 'current' in str(Paginator):
            purl = Paginator.find('span', {'class': re.compile('current')}).findNext('a')['href']
            if 'http' not in purl:
                purl = url
            currpg = Paginator.find('span', {'class': re.compile('current')}).text
            lastpg = Paginator.findAll('a', text=True)[-1]
            title = 'Next Page.. (Currently in Page %s of %s)' % (currpg, lastpg)
            movies.append((title, _icon, purl))
    if 'gun' in iurl:
        if iurl == tamilgunurl:
            list_categories(iurl)
        logging.warning("{0} {1} {2} {0}".format('##' * 15, 'tgun-iurl', iurl))
        html = requests.get(iurl, headers=mozhdr).text
        mlink = SoupStrainer('article', {'class': re.compile('video')})
        items = BeautifulSoup(html, parseOnlyThese=mlink)
        plink = SoupStrainer('ul', {'class': 'page-numbers'})
        Paginator = BeautifulSoup(html, parseOnlyThese=plink)
        for item in items:
            title = item.h3.text
            url = item.h3.find('a')['href']
            try:
                thumb = item.find('img')['src'].strip()
            except:
                thumb = _icon
            movies.append((title, thumb, url))
        logging.warning("{0} {1} {2} {0}".format('##' * 15, 'tgun-Paginator', Paginator))
        if 'next' in str(Paginator):
            nextli = Paginator.find('a', {'class': re.compile('next')})
            logging.warning("{0} {1} {2} {0}".format('##' * 15, 'Paginator', nextli))
            purl = nextli.get('href')
            if 'http' not in purl:
                purl = self.bu[:-12] + purl
            currpg = Paginator.find('span', {'class': re.compile('current')}).text
            pages = Paginator.findAll('a', {'class': re.compile('^page')})
            logging.warning("{0} {1} {2} {0}".format('##' * 15, 'Pages', pages))
            lastpg = pages[len(pages) - 1].text
            title = 'Next Page.. (Currently in Page %s of %s)' % (currpg, lastpg)
            movies.append((title, _icon, purl))
    return movies
def ParsePage(self, id):
    resp = urllib.urlopen(self.details_url % id)
    strain = SoupStrainer("div", {"id": "movieinfoDetail"})
    soup = BeautifulSoup(resp.read(), strain, fromEncoding="utf-8")
    header = soup.find("p", {"class": "header"})
    self.meta.m_title = header.strong.string  # class=title_kor
    self.meta.m_year = header.a.string
    aka = header.find("em", {"class": "title_AKA"})
    temp = aka.find("span", {"class": "eng"})
    if temp:
        self.meta.m_aka = temp.string
    else:
        self.meta.m_aka = aka.contents[0]
    self.meta.m_genres = []
    sect = soup.find("dl", {"class": "cu mainInfo"})
    secName = sect.dt.strong.renderContents()
    if secName != "요약정보":
        print "ERROR: unexpected " + secName
    ptCount = 0
    for tt in sect.dd.contents:
        if hasattr(tt, 'name'):
            if tt.name == 'span':
                if tt['class'] == 'bar':
                    ptCount = ptCount + 1
                elif tt['class'] == 'rating':
                    self.meta.m_cert = tt.img['title']
            elif tt.name == 'a':
                text = tt.string.strip()
                if text:
                    if ptCount == 0:
                        self.meta.m_genres.append(text)
        else:
            text = tt.string.strip()
            if text:
                if ptCount == 2:
                    self.meta.m_runtime = text
                elif ptCount == 4:
                    self.meta.m_cert = text
    self.meta.m_rating = float(soup.find("span", {"class": "star_big pink"}).em.string)
    self.meta.m_poster = re.compile('C\d{3}x\d{3}').sub(
        'image', soup.find('p', {"class": "poster"}).a.img['src'])
    self.meta.m_id = id
    self.meta.m_directors = []
    self.meta.m_writers = []
    self.meta.m_actors = []
    self.meta.m_backdrop_list = []
    self.ParsePlotPage(id)
    self.ParseCastPage(id)
    self.ParsePhotoPageList(id)
    return self.meta
def get_videos(url):
    """
    Get the list of videos.
    :return: list
    """
    videos = []
    if 'cinebix.com' in url:
        resolve_media(url, videos)
        return videos
    if 'tamildbox' in url:
        resolve_media(url, videos)
        return videos
    html = requests.get(url, headers=mozhdr).text
    try:
        linkcode = jsunpack.unpack(html).replace('\\', '')
        sources = json.loads(re.findall('sources:(.*?)\}\)', linkcode)[0])
        for source in sources:
            url = source['file'] + '|Referer=http://%s/' % get_vidhost(source['file'])
            url = urllib.quote_plus(url)
            videos.append(('tamilgun | %s' % source['label'], url))
    except:
        pass
    mlink = SoupStrainer('div', {'id': 'videoframe'})
    videoclass = BeautifulSoup(html, parseOnlyThese=mlink)
    try:
        links = videoclass.findAll('iframe')
        for link in links:
            url = link.get('src')
            resolve_media(url, videos)
    except:
        pass
    mlink = SoupStrainer('div', {'class': 'entry-excerpt'})
    videoclass = BeautifulSoup(html, parseOnlyThese=mlink)
    try:
        links = videoclass.findAll('iframe')
        for link in links:
            if 'http' in str(link):
                url = link.get('src')
                resolve_media(url, videos)
    except:
        pass
    try:
        url = videoclass.p.a.get('href')
        resolve_media(url, videos)
    except:
        pass
    try:
        sources = json.loads(re.findall('vdf-data-json">(.*?)<', html)[0])
        url = 'https://www.youtube.com/watch?v=%s' % sources['videos'][0]['youtubeID']
        resolve_media(url, videos)
    except:
        pass
    return videos
def __call__(self, text, subscription):
    anchor_exp = re.compile('#\w+')
    root_exp = re.compile('^/')
    relative_exp = re.compile('^(?!(\w+://|mailto:|javascript:|/))')
    alias_exp = re.compile('|'.join(self.aliases), re.IGNORECASE)
    soup = BeautifulSoup(text, fromEncoding='UTF-8')  # hmm
    curl = self.context.absolute_url()
    curl_parts = curl.split('/')
    for attr in ('href', 'src'):
        for tag in soup.findAll(SoupStrainer(**{attr: root_exp})):
            if len(curl_parts) > 3 and \
               ':' in curl_parts[2] and \
               tag[attr].startswith('/%s/' % curl_parts[3]):
                tag[attr] = '/' + '/'.join(tag[attr].split('/')[2:])
            # Kupu makes absolute links without the domain, which
            # include the Plone site, so let's try and strip the
            # Plone site's id out:
            site_id = component.getUtility(IPloneSiteRoot).getId()
            if tag[attr].startswith('/%s/' % site_id):
                tag[attr] = tag[attr].replace('/%s/' % site_id, '/', 1)
            tag[attr] = '%s%s' % (self.site_url, tag[attr])
        for tag in soup.findAll(SoupStrainer(**{attr: relative_exp})):
            if tag[attr].startswith('#'):
                tag[attr] = self.context_url + tag[attr]
                continue
            parts = (self.context_url + '/' + tag[attr]).split('/')
            while '..' in parts:
                dots = parts.index('..')
                del parts[dots - 1:dots + 1]
            tag[attr] = '/'.join(parts)
        for tag in soup.findAll(SoupStrainer(**{attr: anchor_exp})):
            prot, dom, path, params, query, frag = urlparse.urlparse(tag[attr])
            if not prot or not dom:
                tag[attr] = '#%s' % frag
                continue
            url = '%s://%s%s' % (prot, dom, path)
            if url.endswith('/'):
                url = url[:-1]
            # If the url points to our context and the anchor exists in our
            # text we change it to a bare anchor.
            # XXX: Maybe this should work with links to non-default views.
            if url == self.context_url:
                for match in soup.findAll(attrs=dict(name=frag)):
                    if match.name == u'a':
                        tag[attr] = '#%s' % frag
        # Check for aliases
        if self.aliases:
            for tag in soup.findAll(SoupStrainer(**{attr: alias_exp})):
                p = re.compile('^(\w+://)(%s)(/?)(.*)' % '|'.join(self.aliases),
                               re.IGNORECASE)
                tag[attr] = p.sub(r'%s\3\4' % self._base(), tag[attr])
    return str(soup)
def resolve_media(url, videos):
    non_str_list = ['#', 'magnet:', 'desihome.co', 'thiruttuvcd', 'cineview',
                    'bollyheaven', 'videolinkz', 'imdb.', 'mgid.', 'facebook.',
                    'm2pub', 'tamilraja.org']
    embed_list = ['cineview', 'bollyheaven', 'videolinkz', 'vidzcode',
                  'embedzone', 'embedsr', 'fullmovie-hd', 'adly.biz',
                  'embedscr', 'embedrip', 'movembed', 'power4link.us',
                  'techking.me', 'onlinemoviesworld.xyz', 'cinebix.com']
    if 'tamildbox' in url:
        link = requests.get(url, headers=mozhdr).text
        try:
            mlink = SoupStrainer('div', {'id': 'player-embed'})
            dclass = BeautifulSoup(link, parseOnlyThese=mlink)
            if 'unescape' in str(dclass):
                etext = re.findall("unescape.'[^']*", str(dclass))[0]
                etext = urllib.unquote(etext)
                dclass = BeautifulSoup(etext)
            glink = dclass.iframe.get('src')
            vidhost = get_vidhost(glink)
            videos.append((vidhost, glink))
            mlink = SoupStrainer('div', {'class': 'item-content toggled'})
            dclass = BeautifulSoup(link, parseOnlyThese=mlink)
            glink = dclass.p.iframe.get('src')
            vidhost = get_vidhost(glink)
            videos.append((vidhost, glink))
        except:
            pass
        try:
            codes = re.findall('"return loadEP.([^,]*),(\d*)', link)
            for ep_id, server_id in codes:
                burl = 'http://www.tamildbox.com/actions.php?case=loadEP&ep_id=%s&server_id=%s' % (ep_id, server_id)
                bhtml = requests.get(burl, headers=mozhdr).text
                blink = re.findall('(?i)iframe\s*src="(.*?)"', bhtml)[0]
                vidhost = get_vidhost(blink)
                if 'googleapis' in blink:
                    blink = 'https://drive.google.com/open?id=' + re.findall('docid=([^&]*)', blink)[0]
                    vidhost = 'GVideo'
                videos.append((vidhost, blink))
        except:
            pass
    elif any([x in url for x in embed_list]):
        clink = requests.get(url, headers=mozhdr).text
        csoup = BeautifulSoup(clink)
        try:
            for link in csoup.findAll('iframe'):
                strurl = link.get('src')
                if not any([x in strurl for x in non_str_list]):
                    vidhost = get_vidhost(strurl)
                    videos.append((vidhost, strurl))
        except:
            pass
        try:
            plink = csoup.find(class_='main-button dlbutton')
            strurl = plink.get('href')
            if not any([x in strurl for x in non_str_list]):
                vidhost = get_vidhost(strurl)
                videos.append((vidhost, strurl))
        except:
            pass
        try:
            plink = csoup.find(class_='aio-pulse')
            strurl = plink.find('a')['href']
            if not any([x in strurl for x in non_str_list]):
                vidhost = get_vidhost(strurl)
                videos.append((vidhost, strurl))
        except:
            pass
        try:
            plink = csoup.find(class_='entry-content rich-content')
            strurl = plink.find('a')['href']
            if not any([x in strurl for x in non_str_list]):
                vidhost = get_vidhost(strurl)
                videos.append((vidhost, strurl))
        except:
            pass
        try:
            for linksSection in csoup.findAll('embed'):
                strurl = linksSection.get('src')
                if not any([x in strurl for x in non_str_list]):
                    vidhost = get_vidhost(strurl)
                    videos.append((vidhost, strurl))
        except:
            pass
    elif not any([x in url for x in non_str_list]):
        vidhost = get_vidhost(url)
        videos.append((vidhost, url))
    return
def scrapeShowEpisodes(self, html, params={}):
    get = params.get
    if self.__dbg__:
        print self.__plugin__ + " scrapeShowEpisodes"
    page = int(get("page", "0"))
    per_page = (10, 15, 20, 25, 30, 40, 50,)[int(self.__settings__.getSetting("perpage"))]
    oldVideos = self.__settings__.getSetting("show_" + get("show") + "_season_" + get("season", "0"))
    if page == 0 or not oldVideos:
        videos = re.compile('<a href="/watch\?v=(.*)&feature=sh_e_sl&list=SL"').findall(html)
        list = SoupStrainer(name="div", attrs={'class': "show-more-ctrl"})
        nexturl = BeautifulSoup(html, parseOnlyThese=list)
        if len(nexturl) > 0:
            nexturl = nexturl.find(name="div", attrs={'class': "button-container"})
            if nexturl.button:
                nexturl = nexturl.button["data-next-url"]
            else:
                nexturl = ""
        if nexturl.find("start=") > 0:
            fetch = True
            start = 20
            nexturl = nexturl.replace("start=20", "start=%s")
            while fetch:
                url = self.urls["main"] + nexturl % start
                html = self._fetchPage(url)
                if html:
                    html = html.replace("\\u0026", "&")
                    html = html.replace("\\/", "/")
                    html = html.replace('\\"', '"')
                    html = html.replace("\\u003c", "<")
                    html = html.replace("\\u003e", ">")
                    more_videos = re.compile('data-video-ids="([^"]*)"').findall(html)
                    if not more_videos:
                        fetch = False
                    else:
                        videos += more_videos
                        start += 20
        if self.__dbg__:
            print self.__plugin__ + "found " + str(len(videos)) + " videos: " + repr(videos)
        self.__settings__.setSetting("show_" + get("show") + "_season_" + get("season", "0"),
                                     self.core.arrayToPipe(videos))
    else:
        videos = oldVideos.split("|")
    if per_page * (page + 1) < len(videos):
        next = 'true'
    else:
        next = 'false'
    subitems = videos[(per_page * page):(per_page * (page + 1))]
    (ytobjects, status) = self.core._get_batch_details(subitems)
    if len(ytobjects) > 0:
        ytobjects[len(ytobjects) - 1]['next'] = next
    return (ytobjects, status)
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ from an HTML text, convert the HTML to plain text.
    If @body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    urls = []
    if body_id is not None:
        strainer = SoupStrainer(id=body_id)
    else:
        strainer = SoupStrainer()
    """
    soup = BeautifulSoup(html, parseOnlyThese=strainer, fromEncoding=encoding)
    for link in soup.findAll('a'):
        title = link.renderContents()
        for url in [x[1] for x in link.attrs if x[0]=='href']:
            urls.append(dict(url=url, tag=str(link), title=title))
    html = soup.__str__(encoding)
    url_index = []
    i = 0
    for d in urls:
        if d['title'] == d['url'] or 'http://'+d['title'] == d['url']:
            html = html.replace(d['tag'], d['url'])
        else:
            i += 1
            html = html.replace(d['tag'], '%s [%s]' % (d['title'], i))
            url_index.append(d['url'])
    """
    html = html.replace('<a', 'HREFOPEN').replace('</a>', 'HREFCLOSE')
    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<p>', 'PARAOPEN').replace('</p>', 'PARACLOSE')
    html = html.replace('<h2>', 'ITALICOPEN').replace('</h2>', 'ITALICCLOSE')
    html = html.replace('<h1>', 'BOLDOPEN').replace('</h1>', 'BOLDCLOSEBREAKBREAK')
    html = html.replace('<em>', '/').replace('</em>', '/')

    # the only line breaks we respect are those of ending tags and
    # breaks
    html = html.replace('\n', ' ')
    html = html.replace('<br>', '\n')
    #html = html.replace('</p>', '\n')
    html = re.sub('<br\s*/>', '\n', html)
    html = html.replace(' ' * 2, ' ')

    # for all other tags we failed to clean up, just remove them and
    # complain about them on the stderr
    def desperate_fixer(g):
        #print >>sys.stderr, "failed to clean up %s" % str(g.group())
        return ' '

    html = re.sub('<.*?>', desperate_fixer, html)

    html = html.replace('ITALICOPEN', '\n\n<i>')
    html = html.replace('ITALICCLOSE', '</i>\n\n')
    html = html.replace('BOLDOPEN', '<b>')
    html = html.replace('BOLDCLOSEBREAKBREAK', '</b>\n\n')
    html = html.replace('PARAOPEN', '')
    html = html.replace('PARACLOSE', '')
    html = html.replace('HREFOPEN', '<a')
    html = html.replace('HREFCLOSE', '</a>')

    # lstrip all lines
    html = '\n'.join([x.lstrip() for x in html.splitlines()])

    #for i, url in enumerate(url_index):
    #    if i == 0:
    #        html += '\n\n'
    #    html += '[%s] %s\n' % (i+1, url)

    html = unescape(html)
    return html
def scrapeCategoryList(self, html="", params={}, tag=""):
    get = params.get
    if self.__dbg__:
        print self.__plugin__ + " scrapeCategories "
    scraper = "categories"
    thumbnail = "explore"
    if tag:
        scraper = tag
        thumbnail = tag
    list = SoupStrainer(name="div", attrs={"class": "yt-uix-expander-body"})
    categories = BeautifulSoup(html, parseOnlyThese=list)
    if len(categories) == 0:
        list = SoupStrainer(name="div", id="browse-filter-menu")
        categories = BeautifulSoup(html, parseOnlyThese=list)
    yobjects = []
    status = 200
    if len(categories) > 0:
        ul = categories.ul
        while ul != None:
            category = ul.li
            while category != None:
                if category.a:
                    item = {}
                    title = category.a.contents[0]
                    title = title.replace("&amp;", "&")
                    item['Title'] = title
                    cat = category.a["href"].replace("/" + tag + "/", "")
                    if get("scraper") == "categories":
                        if title == "Music":
                            category = category.findNextSibling(name="li")
                            continue
                        if cat.find("?") != -1:
                            cat = cat[cat.find("?"):]
                        if cat.find("comedy") > 0:
                            cat = "?c=23"
                        if cat.find("gaming") > 0:
                            cat = "?c=20"
                    if get("scraper") == "movies":
                        if cat.find("pt=nr") > 0:
                            category = category.findNextSibling(name="li")
                            continue
                        elif cat == "indian-cinema":
                            item["subcategory"] = "true"
                    cat = urllib.quote_plus(cat)
                    item['category'] = cat
                    item['scraper'] = scraper
                    item["thumbnail"] = thumbnail
                    if self.__dbg__:
                        print self.__plugin__ + "adding item: " + repr(item['Title']) + ", url: " + item['category']
                    yobjects.append(item)
                category = category.findNextSibling(name="li")
            ul = ul.findNextSibling(name="ul")
    if not yobjects:
        return (self.__language__(30601), 303)
    return (yobjects, status)
def ParsePage(self, id):
    resp = urllib.urlopen(self.main_url % id)
    soup = BeautifulSoup(resp.read(), fromEncoding="euc-kr")
    self.meta.m_id = id
    self.meta.m_name = soup.find('h2').string
    strain = SoupStrainer("div", {"class": "artist_info"})
    sect = soup.find(strain)
    self.meta.m_thumb = sect.find("div", {"class": "albumartist_thumb"}).img['src']
    chk = sect.find("img", alt=u"출생")
    if chk:
        self.meta.m_born = chk.parent.nextSibling.nextSibling.next.string.strip()
    chk = sect.find("img", alt=u"사망")
    if chk:
        self.meta.m_died = chk.parent.nextSibling.nextSibling.next.string.strip()
    chk = sect.find("img", alt=u"결성")
    if chk:
        self.meta.m_formed = chk.parent.nextSibling.nextSibling.next.string.strip()
    chk = sect.find("img", alt=u"해체")
    if chk:
        self.meta.m_disbanded = chk.parent.nextSibling.nextSibling.next.string.strip()
    self.meta.m_years = []
    chk = sect.find("img", alt=u"활동연대")
    if chk:
        self.meta.m_years = chk.parent.nextSibling.nextSibling.next.string.strip().split(',')
    self.meta.m_styles = []
    chk = sect.find("img", alt=u"활동유형")
    if chk:
        self.meta.m_styles = chk.parent.nextSibling.nextSibling.next.string.strip().split(',')
    self.meta.m_genres = []
    chk = sect.find("img", title=u"장르")
    if chk:
        self.meta.m_genres = chk.parent.nextSibling.nextSibling.next.string.strip().split(',')
    self.meta.m_biography = ''.join(soup.find("div", id="artistBio").findAll(text=True)).strip()
    self.meta.m_biography = self.meta.m_biography.replace("&amp;", "&")
    self.meta.m_biography = self.meta.m_biography.replace("&#39;", "'").replace("&#8211;", "-")
    self.meta.m_biography = unicode(self.meta.m_biography, 'utf-8')
    self.ParseAlbumPage(id)
    self.ParsePhotoPage(id)
    return self.meta