コード例 #1
0
def whitespace(options):
    # clean events
    Event.objects.filter(source="whitespace").delete()

    soup = BeautifulSoup(urlopen("http://www.0x20.be/Main_Page").read())

    for event in soup.ul('li'):
        if event.text == 'More...':
            continue
        title = event.a.text
        url = "http://www.0x20.be" + event.a["href"]
        if "-" in event.b.text[:-1]:
            start, end = map(lambda x: parse(x.strip()), event.b.text[:-1].split("-"))
        else:
            start = parse(event.b.text[:-1])
            end = None
        location = event('a')[1].text

        Event.objects.create(
            title=title,
            source="whitespace",
            url=url,
            start=start,
            end=end,
            location=location.strip() if location else None
        )

        if not options["quiet"]:
            print "Adding %s [%s] (%s)..." % (title.encode("Utf-8"), "whitespace", location.encode("Utf-8"))
コード例 #2
0
ファイル: html.py プロジェクト: tanghaibao/jcvi
def links(args):
    """
    %prog links url

    Extract all the links "<a href=''>" from web page.
    """
    p = OptionParser(links.__doc__)
    p.add_option("--img", default=False, action="store_true",
                 help="Extract <img> tags [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    url, = args
    img = opts.img

    htmlfile = download(url)
    page = open(htmlfile).read()
    soup = BeautifulSoup(page)

    tag = 'img' if img else 'a'
    src = 'src' if img else 'href'
    aa = soup.findAll(tag)
    for a in aa:
        link = a.get(src)
        link = urljoin(url, link)
        print(link)
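
The same idea, stripped of the OptionParser and download() plumbing, fits in a few lines. A minimal sketch assuming Python 2 with BeautifulSoup 3; list_links and the URL are placeholders made up for the illustration:

from urllib import urlopen
from urlparse import urljoin
from BeautifulSoup import BeautifulSoup

def list_links(url, img=False):
    # Fetch the page and resolve every href (or img src) against the page URL.
    soup = BeautifulSoup(urlopen(url).read())
    tag, src = ('img', 'src') if img else ('a', 'href')
    for node in soup.findAll(tag):
        target = node.get(src)
        if target:
            print urljoin(url, target)

list_links("http://www.example.com/")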
コード例 #3
0
def getAvailabilityRank(table):

	try:

		#print "getting List of ATMs requires attention..."
	
		soup = BeautifulSoup(str(table))
	
		rows = soup.findAll('tr')

		numRows = getRowsNumber(table)

		numRowsHead = getRowsHeadNumber(table)

	
		arrBestBranchBri = []
		
		for a in range (2, numRows-1):

			trs = BeautifulSoup(str(rows[a]))
			tdcells = trs.findAll("td")

			percentAvailBri = float(tdcells[17].getText())
			ukerName = cleanUpNamaUker(tdcells[0].getText())

			if (percentAvailBri == 100.00):

				#arrBestBranch.append(ukerName+", "+jumlahATM)
				arrBestBranchBri.append(ukerName)

	except IndexError:

		arrBestBranchBri = getAvailabilityRank(table)

	return sorted(arrBestBranchBri)
コード例 #4
0
def wolfplex(options):
    # clean events
    Event.objects.filter(source="wolfplex").delete()

    html_parser = HTMLParser()

    soup = BeautifulSoup(urlopen("http://www.wolfplex.org/wiki/Main_Page").read())

    events = soup.find("div", id="accueil-agenda").dl

    for date_info, event in zip(events('dt'), events('dd')[1::2]):
        if event.span:
            event.span.clear()

        title = html_parser.unescape(event.text)
        base_domain = "http://www.wolfplex.org" if not event.a["href"].startswith("http") else ""
        url = (base_domain + event.a["href"]) if event.a else "http://www.wolfplex.org"
        start = parse(date_info.span["title"])

        if "@" in title:
            title, location = title.split("@", 1)
        else:
            location = None

        Event.objects.create(
            title=title,
            source="wolfplex",
            url=url,
            start=start,
            location=location
        )

        if not options["quiet"]:
            print "Adding %s [%s] (%s)..." % (title.encode("Utf-8"), "wolfplex", location.encode("Utf-8") if location else "")
コード例 #5
0
ファイル: ping.py プロジェクト: Big-Data/gnowsys-studio
 def find_external_urls(self, gbobject):
     """Find external urls in an gbobject"""
     soup = BeautifulSoup(gbobject.html_content)
     external_urls = [a['href'] for a in soup.findAll('a')
                      if self.is_external_url(
                          a['href'], self.ressources.site_url)]
     return external_urls
コード例 #6
0
def getRowsHeadNumber(table):

	# how do we determine how many rows are used as the table header?

	soup = BeautifulSoup(str(table))
	rows = soup.findAll('tr')
	numRows = len(table.findAll(lambda tag: tag.name == 'tr' and tag.findParent('table') == table))

	# initialise numRowsHead as the number of rows that contain header cells

	numRowsHead = 0	
	
	# inspect the rows one by one

	for i in range (0, numRows):
		
		# if a given row contains a <th> tag
		if rows[i].findAll('th'):
			
			# then the header block extends to this row
			numRowsHead = i + 1


	# final result: the number of rows used as the table header

	return numRowsHead
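
A rough illustration of what getRowsHeadNumber() is counting, run over a tiny hand-written table (the HTML below is made up for the example):

from BeautifulSoup import BeautifulSoup

sample = """
<table>
  <tr><th>Branch</th><th>Availability</th></tr>
  <tr><td>Unit A</td><td>100.00</td></tr>
  <tr><td>Unit B</td><td>98.50</td></tr>
</table>
"""

rows = BeautifulSoup(sample).findAll('tr')
numRowsHead = 0
for i in range(0, len(rows)):
    # a row that contains <th> cells is (part of) the header
    if rows[i].findAll('th'):
        numRowsHead = i + 1
print numRowsHead   # prints 1: only the first row holds <th> cells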
コード例 #7
0
def getLastPageNum(alamatURL):


	strHTML = fetchHTML(alamatURL)

	mysoup = BeautifulSoup(strHTML)

	arrURL = mysoup.findAll('tfoot')[0].findAll('tr')[0].findAll('a')
	
	maxPage = 0

	if arrURL:
		
		for i in range (0, len(arrURL)):

			lastPageNum = int(arrURL[i].get('href').split('/')[7].split('?')[0])

			if lastPageNum > maxPage:

				maxPage = lastPageNum

		lastPageNum = maxPage
		
	else:
		lastPageNum = 0
	print "last page number is:", lastPageNum
	return int(lastPageNum)
コード例 #8
0
ファイル: spider.py プロジェクト: aaronsamuel137/legal-scrape
    def crawl(self, url, q):
        """
        Crawls the main url looking for sub-urls.

        """
        print 'calling crawl with url', url
        s = requests.Session()

        num_urls = 0
        r = requests.get(url)
        soup = BeautifulSoup(r.text)

        trs = soup.findAll('tr')
        for tr in trs:
            tds = tr.findAll('td')
            if len(tds) == 6:
                title = tds[1].getText()
                link = tds[3].find('a')['href']
                item = {
                    'main_page': title,
                }
                item['link'] = self.get_data_link(link, s)
                num_urls += self.crawl_again(item, q, s)

        print 'total urls crawled:', num_urls
コード例 #9
0
ファイル: spider.py プロジェクト: aaronsamuel137/legal-scrape
    def crawl_again(self, item, q, s):
        """
        Crawls the content page, looking for all urls in the same domain.

        """
        r = s.get(item['link'])
        soup = BeautifulSoup(r.text)
        main = soup.title.getText()
        urls = soup.findAll('a')
        chre = re.compile("(?<=chpt=)\d+")
        for url in urls:
            href = url['href']
            isChapt = chre.search(href)
            if isChapt == None:
                mySub = "NoChap"
            else:
                mySub = isChapt.group(0)
            if href.startswith('/'):
                link = domain + href
                q.enq({
                    'main_page': main,
                    'sub-page': mySub,
                    'section': url.parent.parent.getText().lstrip(),
                    'link': link
                })
        return len(urls)
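
The chapter number is pulled out with a lookbehind regex; a quick illustration with made-up hrefs:

import re

chre = re.compile(r"(?<=chpt=)\d+")
print chre.search('/laws/statute.html?title=5&chpt=12').group(0)   # -> 12
print chre.search('/laws/statute.html?title=5')                    # -> None, handled as "NoChap"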
コード例 #10
0
def getpresentationdetails(sender, **kwargs):
    print "Pre Save!"
    #print sender
    model =  kwargs['instance']
    
    
    # fetch the presentation url
    
    try:
        import urllib
        from BeautifulSoup import BeautifulSoup as BS
        html = urllib.urlopen(kwargs['instance'].url).read()
        bs = BS(html)
        # find the media url

        presurl = bs.find('link', rel='media:presentation')
        print "* Presentation: " + presurl['href']
        # and the thumbnail
        thumburl = bs.find('link', rel='image_src')
        print "* Thumbnail: " + thumburl['href']
        # and the author name
        creator = bs.find('meta', property='dc:creator')
        print "* Creator: " + creator['content']
        
        title = bs.find('meta', property="media:title")
        print "* Content: " + title['content']

    except Exception, e:
        raise e
コード例 #11
0
ファイル: dolcetto.py プロジェクト: dayzleaper/lunchsvall
def get_daily_specials(day=None):
	page = urlopen(URL)
	soup = BeautifulSoup(page)
	page.close()

	daily_specials = {
		"name": "Dolcetto",
		"specials": [],
		"streetaddress": "Kyrkogatan 8, Sundsvall",
		"dataurl": URL,
		"mapurl": "http://www.hitta.se/ViewDetailsPink.aspx?Vkiid=4uG7%252fiYMOcHQKtp0VSkMNw%253d%253d&Vkid=3215131"
	}

	if day == None:
		day = date.today().weekday()

	# No lunch on Saturday or Sunday
	if day == 5 or day == 6:
		return daily_specials

	day = [u"måndag", u"tisdag", u"onsdag", u"torsdag", u"fredag"][day]
	anchor = soup.find(lambda t: t.name == "h2" and t.text == "Lunchmeny")
	menu = filter(lambda x: isinstance(x, NavigableString), anchor.findNextSibling("p"))
	for i, v in enumerate(menu):
		if day == v.lower():
			daily_specials["specials"].append(menu[i+1])
			break	

	return daily_specials
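
A hedged usage sketch; URL and NavigableString come from the same dolcetto.py module, and Monday is weekday 0:

if __name__ == "__main__":
	# Print Monday's specials; the list stays empty on weekends.
	for dish in get_daily_specials(day=0)["specials"]:
		print dish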
コード例 #12
0
	def get(self, regno):
		#self.response.headers['Content-Type'] = 'text/html'
		br= _mechanize.Browser()
		cj = cookielib.CookieJar()
		br.set_cookiejar(cj)
		br.set_handle_equiv(True)
		br.set_handle_redirect(True)
		br.set_handle_referer(True)
		br.set_handle_robots(False)
		n=262
		while(n<=262):
			m=str(n).zfill(4) # filling zeros for roll no like 001,002 etc.
			n=n+1
			#self.response.write('11BEC') # This is where roll no goes, for 09BCE just replace by 09BCE.
			#u=regno
			r=br.open('https://academics.vit.ac.in/parent/parent_login.asp')
			html=r.read()
			soup=BeautifulSoup(html)
			img = soup.find('img', id='imgCaptcha')
			image_response = br.open_novisit(img['src'])
			captcha = Captcha()
			#captcha.cookie = "123456788sids"
			#captcha.image = db.Blob(image_response.read())
			captcha.regno = regno
			for cook in cj:
				captcha.cookie = cook.value
				captcha.cookiename = cook.name

			captcha.put()
			self.response.headers['Content-Type'] = 'image/jpeg'
			self.response.out.write(image_response.read())
コード例 #13
0
ファイル: down.py プロジェクト: gbarou/cordis-scraper
def theme_worker():
    def get_projects(doc):
        for result in doc.findAll(title=u"Project acronym"):
            a = result.a
            link = "http://cordis.europa.eu" + dict(a.attrs)['href'][2:]
            yield link

    logging.info('START THEME WORKER')
    while True:
        count = 0
        theme = q.get()
        logging.info('THEME: %s', repr(theme))

        url = THEME_URL % {'theme': theme}
        try:
            while True:
                r = requests.get(url, config=REQUESTS_CONFIG)
                if not r.ok:
                    logging.error("Request failed for url: %s", url)
                    continue
                doc = BeautifulSoup(r.content)
                for proj in get_projects(doc):
                    project_queue.put((theme, proj))
                    count += 1
                try:
                    next_ = dict(doc.find(
                            text="Next 20 projects &raquo;").parent.attrs
                        )['href'][2:]
                except AttributeError:
                    break
                url = "http://cordis.europa.eu" + next_
        except Exception, e:
            logging.error("THEME_WORKER: Error for url: %s", url)
            logging.error(e)
        finally:
コード例 #14
0
ファイル: mybrowser.py プロジェクト: yeungocanh/MultiUpload
	def selectForm(self, r):
		html = r.content
		linkget = r.url
		forms_filter = SoupStrainer('form');
		soup = BeautifulSoup(html, parseOnlyThese=forms_filter);
		forms_post = ClientForm.ParseFile(StringIO.StringIO(soup.prettify()), linkget, backwards_compat=False);
		return forms_post
コード例 #15
0
ファイル: Crawler_PPT_Politic.py プロジェクト: choakai/thesis
def main():

    #for p in range(1,intGetMaxPage +1):
    #soup = BeautifulSoup()
    try:
        resp = urllib2.urlopen(getUrl,timeout=10)
        soup = BeautifulSoup(resp)
        soup = soup.find('div' ,{'id':'prodlist'})

    
        #for k in soup.findAll("div", {'class': 'p-name'}):  # grab <div class='p-name'>...</div>
        for k in soup.findAll('a', href=True): 
            try:
            
                url = k.get('href') 
                print k.text
                print url 
        
                page_url = homeUrl + url
                print page_url
                resp_text_page = urllib2.urlopen(homeUrl + url, timeout=10)
            
                soup_text_page = BeautifulSoup(resp_text_page)
                contextPageUrl(soup_text_page,page_url)    
            except:
                print "Unexpected error:", sys.exc_info()[0]
                print "Unexpected error:", sys.exc_info()[1]
                continue
    except:
        #continue
        print "Unexpected error:", sys.exc_info()[0]
        print "Unexpected error:", sys.exc_info()[1]
        pass
コード例 #16
0
def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    #print html
    root = lxml.html.fromstring(html)
    soup = BeautifulSoup(html)                        #using BeautifulSoup to find next page links
    scrape_table(root)                                     #before carrying on scrape the hrefs using the scrape_table function
    #print soup
    
    items = soup.findAll('a',title="Next page")           # findAll "next page" links        
    if items:                                             # if there is a next page link continue
        
        next_link = root.cssselect("div.srch-Page.srch-Page-bg a")
    #print next_link
        if next_link:
            next_link2 = next_link[2].attrib['href']
            #print next_link2
            split_link = re.split("\)+",next_link2)
            split_link2 = re.split("\=+",split_link[0])
            split_link3 = re.split("\'+",split_link2[2])
            #print split_link3[0]
        #print split_link2
        #if split_link ==11:
            next_url = nextlink_url+split_link3[0]
            if next_url:
                print next_url
                scrape_and_look_for_next_link(next_url)
コード例 #17
0
ファイル: views.py プロジェクト: sofatyping/WebSpider
def getsubhyperlink(origin_url, html_content, reslist, temp_set):
	soup = BeautifulSoup(html_content, parseOnlyThese=SoupStrainer('a'))
	hyperlink = soup.findAll('a',href=True)

	for tag in hyperlink:
		if "https" in tag['href'] or "http" in tag['href']:
			if tag['href'] not in temp_set:
				if origin_url in tag['href']:
					reslist.append(tag['href'])
					temp_set.append(tag['href'])
		else:
			if "www" in tag['href']:
				temp_url = "http://"+tag['href']
				if temp_url not in temp_set:
					if origin_url in temp_url:
						reslist.append(temp_url)
						temp_set.append(temp_url)
			else:
				if tag['href'] and tag['href'][0] == '/': 
					temp_url = origin_url + tag['href']
					if temp_url not in temp_set:
						reslist.append(temp_url)
						temp_set.append(temp_url)
				else:
					temp_url = origin_url + tag['href']
					if temp_url not in temp_set:
						reslist.append(temp_url)
						temp_set.append(temp_url)
コード例 #18
0
ファイル: syllables.py プロジェクト: phrenchphry11/AI_Project
def get_syllables(word):
	url = 'http://www.wordcalc.com/index.php'

	post_data = urllib.urlencode(
	   {'text': word})
	post_data = '%s&optionSyllableCount&optionWordCount' % post_data


	cnxn = urllib.urlopen(url, post_data)
	response = cnxn.read()
	cnxn.close()

	soup = BeautifulSoup(response)
	h3_matches = [h3 for h3 in soup.findAll('h3') if h3.text == 'Statistics']
	if len(h3_matches) != 1:
	 raise Exception('Wrong number of <h3>Statistics</h3>')
	h3_match = h3_matches[0]
	table = h3_match.findNextSibling('table')

	td_matches = [td for td in table.findAll('td')
	             if td.text == 'Syllable Count']
	if len(td_matches) != 1:
	 raise Exception('Wrong number of <td>Syllable Count</td>')
	td_match = td_matches[0]

	td_value = td_match.findNextSibling('td')
	syllable_count = int(td_value.text)
	return syllable_count
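
A hedged usage sketch: each call is one HTTP round trip to wordcalc.com, so it only works while that form keeps the same layout (the phrase is arbitrary):

if __name__ == '__main__':
	for word in 'beautiful soup parses markup'.split():
		print word, get_syllables(word)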
コード例 #19
0
 def setUp(self):
     "Setting common information"
     try:
         from BeautifulSoup import BeautifulSoup, SoupStrainer
     except ImportError:
         self.indices = None
         return
     # Load the file as a tree, but only take the SST table (border=1)
     from urllib import urlopen
     url = "http://www.cpc.noaa.gov/products/analysis_monitoring/"\
           "ensostuff/ensoyears.shtml"
     url = urlopen(url)
     table = BeautifulSoup(url.read(),
                           parseOnlyThese=SoupStrainer("table", border=1))
     # Separate it by rows, but skip the first one (the header)
     years = []
     indices = []
     color = dict(red=+1, white=0, blue=-1)
     deft = [(None,'color:white')]
     for row in table.findAll("tr")[1:]:
         cols = row.findAll('td')
         years.append(int(cols.pop(0).strong.string))
         indices.append([color[getattr(_.span, 'attrs', deft)[0][-1].split(':')[-1]]
                         for _ in cols])
     start_date = ts.Date('M', year=years[0], month=1)
     self.indices = time_series(np.array(indices).ravel(),
                                start_date=start_date)
コード例 #20
0
ファイル: Main.py プロジェクト: Backmute/seppius-xbmc-repo
  def getMovieData(self):
    list = []
    #-- get serial play list & parameters  -------------------------------------
    html = self.Auth.get_HTML(self.serial_url, None, 'http://serialu.net/media/uppod.swf')

    # -- parsing web page
    html = re.compile('<body>(.+?)<\/body>', re.MULTILINE|re.DOTALL).findall(html)[0]
    soup = BeautifulSoup(html)
    pl_url = ''

    is_multiseason = len(soup.findAll('object', {'type':'application/x-shockwave-flash'}))

    for rec in soup.findAll('object', {'type':'application/x-shockwave-flash'}):
        if is_multiseason > 1:
            season = rec.parent.previousSibling.previousSibling.text+r' '
        else:
            season = r''

        for par in rec.find('param', {'name':'flashvars'})['value'].split('&'):
            if par.split('=')[0] == 'pl':
                pl_url = par[3:]

        if pl_url.find('http:') == -1:
            pl_url = xppod.Decode(pl_url)

        #-- get playlist details ---------------------------------------------------
        html = self.Auth.get_HTML(pl_url, None, 'http://serialu.net/media/uppod.swf')
        self.pl_url = pl_url

        # -- check if playlist is encoded
        if html.find('{"playlist":[') == -1:
            html = xppod.Decode(html).encode('utf-8').split(' or ')[0] #-- TODO: make smart choice

        # -- parsing web page
        s_url = ''
        s_num = 0
        movie_list = []
        for rec in re.compile('{(.+?)}', re.MULTILINE|re.DOTALL).findall(html.replace('{"playlist":[', '')):
            for par in rec.replace('"','').split(','):
                if par.split(':')[0]== 'comment':
                    name = str(s_num+1) + ' серия' #par.split(':')[1]+' '
                if par.split(':')[0]== 'file':
                    if 'http' in par.split(':')[1]:
                        s_url = par.split(':')[1]+':'+par.split(':')[2]
                    else:
                        s_url = xppod.Decode(par.split(':')[1]).split(' or ')[0]
            s_num += 1

            # mark part for history
            name = season.encode('utf-8') + name

            movie_list.append({'movie_name': name, 'url': s_url})
            #if h_part <> '-':
            #    if name == h_part:
            #        name = '[COLOR FF00FF00]'+name+'[/COLOR]'
        #-- parse data
        list.append({'name':self.serial_name, 'img': self.serial_img, 'descr': self.serial_descr, 'season_number':s_num, 'name_orig':'', 'movie': movie_list})

    #-- return movie list
    return list
コード例 #21
0
ファイル: textract.py プロジェクト: BnMcGn/warflagger
def extract_title(url):
    page = open(page_loc(url))
    soup = BeautifulSoup(page.read())
    title = soup.find('title')
    title = title.string.encode('utf-8')
    gadgets.string_to_file(title, title_loc(url))
    page.close()
コード例 #22
0
def parseLyrics(lyricList,outlist,s,e):
	baseURL = u'http://www.darklyrics.com' 
	i = 0 ;
	for key in lyricList :
		i = i + 1 ;
		if(i >= s and i<= e):
			#key = 'In Flames'  # REMOVE FOR 100 Bands
			time.sleep(1)
			turl = lyricList[key] ;
			print 'Looking up band ' + key
			#print turl
			opener = urllib2.build_opener()
			opener.addheaders = [('User-agent', 'Mozilla/5.0')]
			page = opener.open(turl)
			soup = BeautifulSoup(page.read())
			divs = soup.findChildren('div',attrs={"class" : "album"})
			#get the sub-URL to the lyrics of the latest album and then full URL to the lyrics source
			if(len(divs)>0):
				sub_url =  divs[len(divs)-1].findChildren('a')[0]['href']
				lurl = baseURL + sub_url.split('#')[0][2:]
				#print lurl
				# hit the URL and get data
				page = opener.open(lurl)
				soup = BeautifulSoup(page.read())
				lydiv = soup.findChildren('div',attrs={"class" : "lyrics"})[0]
				[x.extract() for x in lydiv('div')]
				#lyrictext = re.sub('\'lydiv.text ;
				rly = getRawLyrics(lydiv) 
			else:
				rly = "Manual"
				print rly
			outlist[key] = rly
		#break ; # remove once started full testing
	print 'done' , s, ' to ', e	
	return outlist
コード例 #23
0
ファイル: utils.py プロジェクト: joshgoss/reader-server
def get_favicon_url(url):
    if not url.startswith('http'):
        url = "http://{0}".format(url)

    # Check if the root location has a favicon before parsing for it
    if _has_root_favicon(url):
        return urlparse.urljoin(url, 'favicon.ico')

    headers = {'User-Agent': 'Mozilla/5.0'}
    request = urllib2.Request(url, None, headers)

    website = urllib2.urlopen(request).read()

    soup = BeautifulSoup(website)
    favicon_element = soup.find("link", rel="shortcut icon")

    if favicon_element:
        hostname = urlparse.urlparse(url).hostname
        favicon_url = favicon_element['href']

        if favicon_url.startswith('//cdn'):
            return "http:" + favicon_url
        # favicon url is relative and must be converted to absolute path
        elif hostname not in favicon_url:
            return urlparse.urljoin(url, favicon_url)
        else:
            return favicon_url
    else:
        return None
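
A hedged usage sketch; the hostnames are placeholders and the result depends on what each site actually serves:

if __name__ == '__main__':
    # Scheme-less input is normalised to http:// before the lookup.
    print get_favicon_url('www.python.org')
    print get_favicon_url('http://example.com')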
コード例 #24
0
def get_epfile(url):
    """
    Return the file (mp3) URL to be read from the website to play the selected
    reloaded episode.
    Input
        the webpage URL of the episode to be played.
        E.g.: http://www.deejay.it/audio/20130526-4/269989/
    Output
        the URL of the mp3 (rarely a wma) file to be played to listen to the
        selected episode. E.g.:
        http://flv.kataweb.it/deejay/audio/dee_giallo/deegiallolosmemoratodicollegno.mp3
        Returns an empty string if the file cannot be found.
    """
    soup = BeautifulSoup(urllib2.urlopen(url))
    fileurl = soup.find('div', {'id': 'playerCont'})

    if not fileurl:
        return ''
    else:
        hit = re.findall("file=(.*.mp3)&",
            fileurl.iframe['src'])
        if not hit:
            return ''
        else:
            return hit[0]
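
A hedged usage sketch; the episode URL is the placeholder already quoted in the docstring and may no longer resolve:

if __name__ == '__main__':
    mp3 = get_epfile('http://www.deejay.it/audio/20130526-4/269989/')
    print mp3 if mp3 else 'no playable file found'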
コード例 #25
0
ファイル: fetch99tel.py プロジェクト: aviatorBeijing/ptpy
 def start(self):
     with QMutexLocker(self.mutex):
         self.stoped = False
         
     #for i in range(self.start_p,self.end_p):
     for i in range(1,3):
         while self.suspended:
             self.wait()  
             return
         if self.stoped:
             return
         url ="http://www.99fang.com/service/agency/a1/?p=%d" % i
         print url            
         
         try:
             r = urllib2.urlopen(url).read()
             soup = BeautifulSoup(r)
             box = soup.find("div",{'class':'agency-call-box'})
             lis = box("li")
             for li in lis:
                 
                 tel = li.a.string
                 print tel
                 r =urllib2.urlopen("http://suzhou.jjr360.com/app.php?c=spider&a=index&city=&tel=%s" % tel)
                 print r.read()
         except:
             pass
         else:
             #self.emit(SIGNAL("updateTime()"))
             time.sleep(1)
コード例 #26
0
ファイル: account.py プロジェクト: r-darwish/neobot
    def _on_login(self, page):
        soup = BeautifulSoup(page)
        if soup.find('a', text='Log in'):
            raise LoginError(page)

        self._browser.save_cookies()
        return soup
コード例 #27
0
ファイル: tasks.py プロジェクト: devrow/bookmark_service
def fetch_page(link_id):
    link = Link.objects.get(pk=link_id)
    url = link.url

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0'}
    req = urllib2.Request(url, None, headers)

    try:
        html = urllib2.urlopen(req).read()
        soup = BeautifulSoup(html)
        link.title = soup.find('title').text

        favicon = soup.find('link', rel='shortcut icon')
        if favicon and favicon['href']:
            link.favicon = urljoin(url, favicon['href'])

        for item in soup.findAll('meta'):
            if item.get('name', '').lower() in ('description', 'og:description') and item.get('content', ''):
                link.description = item.get('content', '')

    except Exception as e:
        link.is_error = 1
        link.error_text = e.reason.__str__()

    link.save()
コード例 #28
0
ファイル: __init__.py プロジェクト: Narsil/django-css
 def split_contents(self):
     """ Iterates over the elements in the block """
     if self.split_content:
         return self.split_content
     split = self.soup.findAll({'link' : True, 'style' : True})
     for elem in split:
         if elem.name == 'link' and elem['rel'] == 'stylesheet':
             filename = self.get_filename(elem['href'])
             path, ext = os.path.splitext(filename)
             if ext in settings.COMPILER_FORMATS.keys():
                 if self.recompile(filename):
                     self.compile(path,settings.COMPILER_FORMATS[ext])
                 basename = os.path.splitext(os.path.basename(filename))[0]
                 elem = BeautifulSoup(re.sub(basename+ext,basename+'.css',unicode(elem)))
                 filename = path + '.css'
             try:
                 self.split_content.append(('file', filename, elem))
             except UncompressableFileError:
                 if django_settings.DEBUG:
                     raise
         if elem.name == 'style':
             data = elem.string            
             elem_type = elem.get('type', '').lower()
             if elem_type and elem_type != "text/css":
                 # it has to be preprocessed
                 if '/' in elem_type:
                     # we accept 'text/ccss' and plain 'ccss' too
                     elem_type = elem_type.split('/')[1]
                 # TODO: that dot-adding compatibility stuff looks strange.
                 # do we really need a dot in COMPILER_FORMATS keys?
                 ext = '.'+elem_type
                 data = self.compile_inline(data,ext)
                 elem = ''.join(("<style type='text/css'>\n",data,"\n</style>"))
             self.split_content.append(('hunk', data, elem))
     return self.split_content
コード例 #29
0
ファイル: account.py プロジェクト: r-darwish/neobot
    def _on_page(self, page):
        if not page:
            import ipdb
            ipdb.set_trace()

        soup = BeautifulSoup(page)
        if not soup.find('a', text='Log in'):
            event = soup.find('b', text='Something has happened!')
            if event:
                cell = event.findParent('table').findAll('td')[2]
                text = ''.join([x.text if hasattr(x, 'text') else x
                        for x in cell.childGenerator()])
                self._logger.info("Something has happned: %s", text)

            try:
                self._neopoints = get_np(soup)
            except NoNpInPage:
                pass

            return soup

        self._logger.info('Need to login. Using account %s', self._username)
        data = dict(username=self._username, password=self._password,
                    destination=soup.find(
                        'input', attrs=dict(name='destination'))['value'])
        d = self._browser.post('http://www.neopets.com/login.phtml', data)
        d.addCallback(self._on_login)
        return d
コード例 #30
0
ファイル: utils.py プロジェクト: yunmanger1/code_project
def removecut(string):
    soup = BeautifulSoup(string, selfClosingTags=['img','br'])
    tag = soup.find('yvcut')
    if not tag: return string
    tag.extract()
    string = soup.renderContents()
    return string    
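
A rough illustration of removecut(): the first <yvcut> marker is extracted and the re-rendered markup is returned. The snippets are made up for the example, and the exact output depends on how BeautifulSoup re-renders the fragment:

html = '<p>teaser<yvcut></yvcut>full story</p>'
print removecut(html)                 # roughly: <p>teaserfull story</p>
print removecut('<p>no marker</p>')   # returned unchanged: no <yvcut> tag found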
コード例 #31
0
ファイル: views.py プロジェクト: sanyaade-teachings/unisubs
    def test_diffing(self):
        create_langs_and_versions(self.video, ['en'])

        eng = self.video.newsubtitlelanguage_set.get(language_code='en')
        subtitles = SubtitleSet.from_list('en', [
            (10000, 20000, "1 - :D"),
            (20000, 30000, "2 - :D"),
            (30000, 40000, "3 - :D"),
            (40000, 50000, "4 - :D"),
            (50000, 60000, "5 - :D"),
        ])
        subtitles2 = SubtitleSet.from_list(
            'en',
            [
                (10000, 20000, "1 - :D"),
                (20000, 25000, "2 - :D"),  # time change,
                (30000, 40000, "Three - :D"),  # text change,
                # multiple lines replaced by a single line
                (40000, 60000, "45 - :D"),
            ])
        first_version = eng.add_version(subtitles=subtitles)
        second_version = eng.add_version(subtitles=subtitles2)
        # Note on the argument order to diff: we always diff the more recent
        # version against the less recent
        diff_result = diff(subtitles2, subtitles)

        response = self._simple_test('videos:diffing',
                                     [first_version.id, second_version.id])
        self.assertEquals(diff_result, response.context['diff_data'])

        diff_sub_data = diff_result['subtitle_data']

        html = BeautifulSoup(response.content)
        diff_list = html.find('ol', {"class": 'subtitles-diff'})
        diff_items = diff_list.findAll('li')
        # check number of lines
        self.assertEquals(len(diff_items), len(diff_sub_data))

        def check_column_data(column, sub_data):
            """Check the data in the HTML for a column against the data in
            from diff()
            """
            # special check for empty lines
            if sub_data.text is None:
                self.assertEquals(column.string.strip(), "")
                return
            time_span, text_span = column.findAll('span', recursive=False)
            self.assertEquals(text_span.string.strip(), sub_data.text)
            time_child_spans = time_span.findAll('span',
                                                 {'class': 'stamp_text'})
            self.assertEquals(time_child_spans[0].string.strip(),
                              format_sub_time(sub_data.start_time))
            self.assertEquals(time_child_spans[1].string.strip(),
                              format_sub_time(sub_data.end_time))

        for li, diff_sub_data_item in zip(diff_items, diff_sub_data):
            # Intuitively, left_column should be compared against
            # ['subtitles'][0], but we do the opposite.  This is because of
            # the way things are ordered:
            #  - diff() was passed (older_version, newer_version)
            #  - The rendered HTML has the newer version on the left and the
            #  older version on the right
            check_column_data(li.find('div', {'class': 'left_column'}),
                              diff_sub_data_item['subtitles'][1])
            check_column_data(li.find('div', {'class': 'right_column'}),
                              diff_sub_data_item['subtitles'][0])
            # we use the time_change class for either text or time changes.
            time_changes = li.findAll('span', {'class': 'time_change'})
            if (diff_sub_data_item['time_changed']
                    or diff_sub_data_item['text_changed']):
                self.assertNotEqual(len(time_changes), 0)
            else:
                self.assertEquals(len(time_changes), 0)
コード例 #32
0
ファイル: network.py プロジェクト: ristomet/xbmc
def mobileUA(content):
    soup = BeautifulSoup(content, convertEntities=BeautifulSoup.HTML_ENTITIES)
    res = soup.find('html')
    res = res.get('class', '') if res else ''
    return True if 'a-mobile' in res or 'a-tablet' in res else False
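
A minimal check of mobileUA() against hand-written markup (made up for the example):

print mobileUA('<html class="a-mobile"><body></body></html>')    # True
print mobileUA('<html class="a-desktop"><body></body></html>')   # False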
コード例 #33
0
ファイル: network.py プロジェクト: ristomet/xbmc
def _parseHTML(br):
    response = br.response().read().decode('utf-8')
    response = re.sub(r'(?i)(<!doctype \w+).*>', r'\1>', response)
    soup = BeautifulSoup(response, convertEntities=BeautifulSoup.HTML_ENTITIES)
    return response, soup
コード例 #34
0
ファイル: timesnow.py プロジェクト: suryansh2020/crawlers
import re
import urllib
import datetime
from BeautifulSoup import BeautifulSoup
sock=urllib.urlopen("http://www.timesnow.tv/")
htmlSrc=sock.read()
soup=BeautifulSoup(htmlSrc)
print "The Times now\n"



for a in soup.findAll('a', attrs={'href': re.compile(r'[a-zA-Z,0-9;:,]*\.cms')}):
	print a.text
コード例 #35
0
## GURPREET SINGH
import scraperwiki
from BeautifulSoup import BeautifulSoup


def scrape_table(soup):
    #to define coloumns name used in table
    scraperwiki.sqlite.save('data_columns',
                            ['Word', 'Obama', 'Clinton', 'Reagan'])
    table = soup.find("table", {"class": "in-article sortable"})
    #To each row of table is selected
    rows = table.findAll("tr")
    for row in rows:
        record = {}
        table_td = row.findAll("td")              # cells of the current row (reconstructed line)
        if len(table_td) < 4:
            continue                              # skip the header row (reconstructed guard)
        record['Word'] = table_td[0].text         # (reconstructed line)
        record['Obama'] = table_td[1].text
        record['Clinton'] = table_td[2].text
        record['Reagan'] = table_td[3].text
        print record,
        print "-" * 10
        #Save data step by step
        scraperwiki.sqlite.save(["Word"], record)


#website link
Website = 'http://www.guardian.co.uk/news/datablog/2011/may/25/us-presidents-adressing-parliament-obama-clinton-reagan-speech-word-count'
html = scraperwiki.scrape(Website)
soup = BeautifulSoup(html)
scrape_table(soup)
0
ファイル: RajivPySms.py プロジェクト: rajivm1991/Rajiv-Py-SMS
def Soup_check(html):
    soup = BeautifulSoup(html)

    if html == " <script language='javascript' type='text/javascript'>window.location.href = 'http://sms.fullonsms.com/action_main.php';</script>":
        return True

    confirmation160 = soup.find('div', attrs={"class": "h-sta"})
    if confirmation160:
        print "+++++++++++++++ Service Response +++++++++++++++++"
        print "+|",
        print confirmation160.find('h2').\
            findAll(text=True)[0].strip().replace('\r', '')
        print "++++++++++++++++++++++++++++++++++++++++++++++++++"

    w2s_Confirmation = soup.find('div', attrs={"class": "confirm"})
    if w2s_Confirmation:
        print "+++++++++++++++ Service Response +++++++++++++++++"
        print "+|", w2s_Confirmation.find('h2').findAll(text=True)[0]
        print "++++++++++++++++++++++++++++++++++++++++++++++++++"

    w2sms_mobile_no = soup.find('div', attrs={"class": "mobile-in"})
    if w2sms_mobile_no:
        print "+++++++++++++ Way2Sms Login Detail +++++++++++++++"
        name = soup.find('span', attrs={"onmouseover": "dismouout();"})
        print "+| Name:", name.findAll(text=True)[0]

        Text_list = w2sms_mobile_no.findAll(text=True)
        cut = ['\t', '\n', '\r', '  ', '.']
        for text in Text_list[:]:
            i = Text_list.index(text)
            for s in cut:
                text = text.replace(s, '')
            Text_list[i] = text
            if not text:
                Text_list.remove(text)
        print "+|", ': '.join(Text_list)

        email = str(soup.find('input', attrs={"id": "logemail"}))
        print "+| Email:",
        print email[email.index('value=') + 7:email.index('>') - 3]

        ips = soup.find('div', attrs={"class": "item1 flt ip"})
        Text_list = ips.findAll(text=True)
        cut = ['&nbsp;', '\n', ' ']
        for text in Text_list[:]:
            i = Text_list.index(text)
            for s in cut:
                text = text.replace(s, '')
            Text_list[i] = text
            if not text:
                Text_list.remove(text)
        for i in range(0, len(Text_list), 2):
            print "+|", Text_list[i],
            print Text_list[i + 1] if i + 1 < len(Text_list) else ''
        return True

    acc_details = soup.find('div', attrs={"class": "mad"})
    if acc_details:
        print "++++++++++++++ 160by2 Login Detail +++++++++++++++"
        Text_list = acc_details.findAll(text=True)
        rem = [u'Change Password', u'(Change)', u'\n']
        cut = [
            '&nbsp;',
        ]
        for text in Text_list[:]:
            if [x for x in rem if x in text]:
                Text_list.remove(text)
            else:
                i = Text_list.index(text)
                for s in cut:
                    text = text.replace(s, '')
                Text_list[i] = text

        print "$|", Text_list[0]
        for i in range(1, len(Text_list), 3):
            print "+| %s%s %s" % (
                Text_list[i], Text_list[i + 1] if i + 1 < len(Text_list) else
                '', Text_list[i + 2] if i + 2 < len(Text_list) else '')

        last_login = soup.find('div', attrs={"class": "lh"})
        Text_list = last_login.findAll(text=True)
        rem = [u'\n', u'about', u'view', u'button']
        for text in Text_list[:]:
            if [x for x in rem if x in text]:
                Text_list.remove(text)
            else:
                i = Text_list.index(text)
                for s in cut:
                    text = text.replace(s, '')
                Text_list[i] = text
        print "$|", Text_list[0]
        for i in range(1, len(Text_list), 3):
            print "+| %s%s %s" % (
                Text_list[i], Text_list[i + 1] if i + 1 < len(Text_list) else
                '', Text_list[i + 2] if i + 2 < len(Text_list) else '')
        return True

    return False
コード例 #37
0
    def parse(self,
              of=None,
              req=None,
              limit=CFG_EXTERNAL_COLLECTION_MAXRESULTS):
        """Parse buffer to extract records."""

        if CFG_BEAUTIFULSOUP_INSTALLED:
            soup = BeautifulSoup(self.buffer)

            # Remove "more" links that include Indico Javascript
            more_links = soup.findAll('a', {
                "class": "searchResultLink",
                "href": "#"
            })
            [more_link.extract() for more_link in more_links]

            # Events
            event_results = soup.findAll('li', {"class": "searchResultEvent"})
            event_index = 1
            for result in event_results:
                self.add_html_result((event_index == 1 and '<b>Events:</b><br/>' or '') + \
                                     str(result)  + '<br />', limit)
                event_index += 1
            # Contributions
            contribution_results = soup.findAll(
                'li', {"class": "searchResultContribution"})
            contribution_index = 1
            for result in contribution_results:
                self.add_html_result((contribution_index == 1 and '<b>Contributions:</b><br/>' or '') + \
                                     str(result)  + '<br />', limit)
                contribution_index += 1
        else:
            # Markup is complex. Do whatever we can...
            # Events
            split_around_events = self.buffer.split(
                '<li class="searchResultEvent">')
            if len(split_around_events) > 1:
                event_index = 1
                for html_chunk in split_around_events[1:]:
                    output = '<li class="searchResultEvent">'
                    if event_index == len(split_around_events) - 1:
                        split_around_link = html_chunk.split(
                            'searchResultLink')
                        split_around_ul = 'searchResultLink'.join(
                            split_around_link[1:]).split('</ul>')
                        output += split_around_link[0] + 'searchResultLink' + \
                                  split_around_ul[0] + '</ul>' + split_around_ul[1]
                    else:
                        output += html_chunk
                    self.add_html_result((event_index == 1 and '<b>Events:</b><br/>' or '') + \
                                     output  + '<br />', limit)
                    event_index += 1
            # Contributions
            split_around_contributions = self.buffer.split(
                '<li class="searchResultContribution">')
            if len(split_around_contributions) > 1:
                contribution_index = 1
                for html_chunk in split_around_contributions[1:]:
                    output = '<li class="searchResultContribution">'
                    if contribution_index == len(
                            split_around_contributions) - 1:
                        split_around_link = html_chunk.split(
                            'searchResultLink')
                        split_around_ul = 'searchResultLink'.join(
                            split_around_link[1:]).split('</ul>')
                        output += split_around_link[0] + 'searchResultLink' + \
                                  split_around_ul[0] + '</ul>' + split_around_ul[1]
                    else:
                        output += html_chunk
                    self.add_html_result((contribution_index == 1 and '<b>Contributions:</b><br/>' or '') + \
                                     output  + '<br />', limit)
                    contribution_index += 1
コード例 #38
0
 def _get_newblogpost_dom(self):
     response = self._get_newblogpost()
     return BeautifulSoup(response.html())
コード例 #39
0
def Movie_List(params):
    #-- get filter parameters
    par = Get_Parameters(params)

    # show search dialog
    if par.search == 'Y':
        skbd = xbmc.Keyboard()
        skbd.setHeading('Поиск сериалов.')
        skbd.doModal()
        if skbd.isConfirmed():
            SearchStr = skbd.getText().split(':')
            url = 'http://seasonvar.ru/autocomplete.php?query=' + urllib.quote(
                SearchStr[0])
            par.search = SearchStr[0]
        else:
            return False
    else:
        url = 'http://seasonvar.ru/index.php?onlyjanrnew=' + par.genre + '&&sortto=name&country=' + par.country + '&nocache=' + str(
            random.random())

    #== get movie list =====================================================
    html = get_HTML(url)

    # -- parsing web page --------------------------------------------------
    count = 1
    list = []

    if par.search != '':  #-- parsing search page
        s = json.loads(html)
        count = len(s['suggestions'])
        if count < 1: return False

        for i in range(0, count):
            name = s['suggestions'][i].encode('utf-8')
            list.append({
                'title': name,
                'url': 'http://seasonvar.ru/' + s['data'][i],
                'img': icon
            })
    else:  #-- parsing serial list
        soup = BeautifulSoup(html, fromEncoding="utf-8")
        # -- get number of serials
        mtag = GetTag(soup)
        #with open('d:\\seasonvar.html', 'a') as the_file:
        #    the_file.write(html)

        if par.alphabet == '':
            count = 0
            for rec in soup.findAll('div', {'class': 'alf-letter'}):
                a_name = u'[COLOR FF00FFF0][B]' + rec.text + u'[/B][/COLOR] сериалов: ' + str(
                    len(rec.parent.findAll('div', {'class': mtag})))
                list.append({
                    'title': a_name.encode('utf-8'),
                    'alphabet': rec.text.encode('utf-8')
                })
                count = count + len(rec.parent.findAll('div', {'class': mtag}))
        else:
            for reca in soup.findAll('div', {'class': 'alf-letter'}):
                if reca.text.encode('utf-8') == par.alphabet:
                    for rec in reca.parent.findAll('div', {'class': mtag}):
                        list.append({
                            'url':
                            'http://seasonvar.ru' +
                            rec.find('a')['href'].encode('utf-8'),
                            'title':
                            rec.find('a').text.encode('utf-8'),
                            'img':
                            'http://cdn.seasonvar.ru/oblojka/' +
                            rec['id'].replace('div', '') + '.jpg'
                        })
                        count = len(list)

    #-- add header info
    Get_Header(par, count)

    #-- get movie info
    #try:
    if par.alphabet != '' or par.search != '':
        for rec in list:
            i = xbmcgui.ListItem(rec['title'],
                                 iconImage=rec['img'],
                                 thumbnailImage=rec['img'])
            u = sys.argv[0] + '?mode=SERIAL'
            u += '&name=%s' % urllib.quote_plus(rec['title'])
            u += '&title=%s' % urllib.quote_plus(rec['title'])
            u += '&url=%s' % urllib.quote_plus(rec['url'])
            u += '&genre=%s' % urllib.quote_plus(par.genre)
            u += '&genre_name=%s' % urllib.quote_plus(par.genre_name)
            u += '&country=%s' % urllib.quote_plus(par.country)
            u += '&country_name=%s' % urllib.quote_plus(par.country_name)
            xbmcplugin.addDirectoryItem(h, u, i, True)
    else:
        for rec in list:
            i = xbmcgui.ListItem(rec['title'],
                                 iconImage=icon,
                                 thumbnailImage=icon)
            u = sys.argv[0] + '?mode=MOVIE'
            #u += '&name=%s'%urllib.quote_plus(rec['title'])
            #u += '&title=%s'%urllib.quote_plus(rec['title'])
            u += '&alphabet=%s' % urllib.quote_plus(rec['alphabet'])
            u += '&genre=%s' % urllib.quote_plus(par.genre)
            u += '&genre_name=%s' % urllib.quote_plus(par.genre_name)
            u += '&country=%s' % urllib.quote_plus(par.country)
            u += '&country_name=%s' % urllib.quote_plus(par.country_name)
            xbmcplugin.addDirectoryItem(h, u, i, True)

    #except:
    #    pass

    xbmcplugin.endOfDirectory(h)
コード例 #40
0
def getTableDimension(arrTable):

	# this function obtains the table dimensions and the table contents from the HTML string stream
	# it carries on from the fetchHTML() function
	# "table dimensions" here means the number of rows and the number of columns of the table
	# initialise the variables 'largest_table' and 'max_rows'

	# how do we decide which table actually holds the data?
	# pick the largest table, i.e. the one with the most rows

	largest_table = None

	max_rows = 0

	for table in arrTable:

		# check the row count of each table in the array of tables, one by one
		# store it in a variable named numRows

		numRows = len(table.findAll(lambda tag: tag.name == 'tr' and tag.findParent('table') == table))

		# if a table has more rows than the current max_rows, make it the provisional largest table
		# repeating this over every table leaves max_rows holding the largest row count

		if numRows > max_rows:

			largest_table = table
			max_rows = numRows

	# from here on, refer to the 'largest table' simply as 'table'
	# and to the 'largest row count' simply as 'numRows'

	table = largest_table

	numRows = max_rows

	# how do we determine the number of columns?

	numCols = len(table.contents[1])


	# how do we determine how many rows are used as the table header?

	soup = BeautifulSoup(str(table))
	rows = soup.findAll('tr')

	# initialise numRowsHead as the number of rows that contain header cells

	numRowsHead = 0

	# inspect the rows one by one

	for i in range (0, numRows):

		# if a given row contains a <th> tag
		if rows[i].findAll('th'):

			# then the header block extends to this row
			numRowsHead = i + 1


	# the final result of getTableDimension: the number of rows, the number of header rows, the number of columns and the table itself

	return numRows, numRowsHead, numCols, table
コード例 #41
0
from mechanize import Browser
from BeautifulSoup import BeautifulSoup

import scraperwiki
from scraperwiki import sqlite
mech = Browser()

url = 'http://www.gpupdate.net/en/standings/190/2013-motogp-standings/'

page = mech.open(url)
html = page.read()
soup = BeautifulSoup(html)

resContainer = soup.find("div", {"id": "middle_container"})
rownumber = 0

table = soup.find("table")
for row in table.findAll('tr')[1:30]:
    col = row.findAll('td')

    pos = int(col[0].string.replace(".", ""))
    driver = col[1].a.string

    tempTD = col[1]

    team = tempTD.findAll('span')
    team = team[1].string
    points = col[2].string

    country = tempTD.findAll('img')
    country = country[0]['alt'].upper()
コード例 #42
0
def Serial_Info(params):
    #-- check if the SWF decompiler is set up properly
    if not Check_SWF():
        return False

    #-- get filter parameters
    par = Get_Parameters(params)
    #== get serial details =================================================
    tvshowtitle = par.title
    full_name = par.name
    url = par.url
    html = get_HTML(url)
    # -- parsing web page --------------------------------------------------
    soup = BeautifulSoup(html, fromEncoding="windows-1251")

    # -- check if serial has seasons and provide season list
    if par.is_season == '' and len(
            soup.findAll('div', {'class': 'full-news-2-content'})) > 0:
        #-- generate list of seasons
        for rec in soup.find('div', {
                'class': 'full-news-2-content'
        }).findAll('a'):
            s_url = ('http://seasonvar.ru' + rec['href']).encode('utf-8')
            s_name = rec.text.replace('>>>', '').replace(u'Сериал ', '')
            if s_name.find(u'сезон(') > -1:
                s_name = s_name.split(u'сезон(')[0] + u'сезон'
            s_name = s_name.encode('utf-8')
            s_id = rec['href'].split('-')[1]
            s_image = 'http://cdn.seasonvar.ru/oblojka/' + s_id + '.jpg'

            i = xbmcgui.ListItem(s_name,
                                 iconImage=s_image,
                                 thumbnailImage=s_image)
            u = sys.argv[0] + '?mode=SERIAL'
            #-- filter parameters
            u += '&name=%s' % urllib.quote_plus(s_name)
            u += '&title=%s' % urllib.quote_plus(tvshowtitle)
            u += '&url=%s' % urllib.quote_plus(s_url)
            u += '&genre=%s' % urllib.quote_plus(par.genre)
            u += '&genre_name=%s' % urllib.quote_plus(par.genre_name)
            u += '&country=%s' % urllib.quote_plus(par.country)
            u += '&country_name=%s' % urllib.quote_plus(par.country_name)
            u += '&is_season=%s' % urllib.quote_plus('*')
            xbmcplugin.addDirectoryItem(h, u, i, True)
    else:
        #-- generate list of movie parts
        # -- get movie info
        for rec in soup.find('td', {'class': 'td-for-content'}).findAll('p'):
            if len(rec.findAll('span', {'class': 'videl'})) > 0:
                for j in str(rec).split('<br />'):
                    r = re.compile(
                        '<span class="videl">(.+?)<\/span>(.+?)<\/br>',
                        re.MULTILINE | re.DOTALL).findall(str(j) + '</br>')
                    for s in r:
                        if s[0] == 'Жанр:': mi.genre = s[1].replace('</p>', '')
                        if s[0] == 'Страна:':
                            mi.country = s[1].replace('</p>', '')
                        if s[0] == 'Вышел:': mi.year = s[1].replace('</p>', '')
                        if s[0] == 'Режисёр:':
                            mi.director = s[1].replace('</p>', '')
                        if s[0] == 'Роли:':
                            mi.actors = s[1].replace('</p>', '')
            else:
                mi.text = rec.text.encode('utf-8')

        mi.actors = mi.actors.split(',')

        mi.img = soup.find('td', {
            'class': 'td-for-content'
        }).find('img')['src']

        # -- get serial parts info
        # -- name of the season
        i = xbmcgui.ListItem('[COLOR FFFFF000]' + par.name + '[/COLOR]',
                             path='',
                             thumbnailImage=icon)
        u = sys.argv[0] + '?mode=EMPTY'
        xbmcplugin.addDirectoryItem(h, u, i, False)
        pname = par.name
        # -- get list of season parts
        s_url = ''
        s_num = 0

        #---------------------------
        try:
            playlist, playlist_url, swf_player = Get_PlayList(soup, url)
        except:
            Initialize()
            playlist, playlist_url, swf_player = Get_PlayList(soup, url)
            if playlist == '':
                return False

        for rec in playlist:
            name = rec['name']
            s_url = rec['video']

            i = xbmcgui.ListItem(name,
                                 path=urllib.unquote(s_url),
                                 thumbnailImage=mi.img)  # iconImage=mi.img
            u = sys.argv[0] + '?mode=PLAY'
            u += '&url=%s' % urllib.quote_plus(s_url)
            u += '&name=%s' % urllib.quote_plus(pname)
            u += '&full_name=%s' % urllib.quote_plus(full_name)
            u += '&title=%s' % urllib.quote_plus(tvshowtitle)
            u += '&img=%s' % urllib.quote_plus(mi.img)
            u += '&playlist=%s' % urllib.quote_plus(playlist_url)
            try:
                cast = re.compile(">(.+?)</a>").findall(mi.actors)
            except:
                cast = []
            i.setInfo(type='video',
                      infoLabels={
                          'title': name,
                          'cast': cast,
                          'artist': mi.actors,
                          'year': int(mi.year),
                          'director': mi.director,
                          'plot': mi.text,
                          'genre': mi.genre
                      })
            i.setProperty('fanart_image', mi.img)
            #i.setProperty('IsPlayable', 'true')
            xbmcplugin.addDirectoryItem(h, u, i, False)

    xbmcplugin.endOfDirectory(h)
コード例 #43
0
def parse_hotellist_page(html, page_count):
    """Parses the website with the hotel list and prints the hotel name, the
    number of stars and the number of reviews it has. If there is a next page
    in the hotel list, it returns a list to that page. Otherwise, it exits the
    script. Corresponds to STEP 4 of the slides.

    Parameters
    ----------
    html : str
        The HTML of the website with the hotel list.

    Returns
    -------
    URL : str
        If there is a next page, return a relative link to this page.
        Otherwise, exit the script.
    """
    soup = BeautifulSoup(html)
    # Extract hotel name, star rating and number of reviews
    hotel_boxes = soup.findAll(
        'div',
        {'class': 'listing wrap reasoning_v5_wrap jfy_listing p13n_imperfect'})
    if not hotel_boxes:
        log.info(
            "#################################### Option 2 ######################################"
        )
        hotel_boxes = soup.findAll('div', {'class': 'listing_info jfy'})
    if not hotel_boxes:
        log.info(
            "#################################### Option 3 ######################################"
        )
        hotel_boxes = soup.findAll(
            'div', {'class': 'listing easyClear  p13n_imperfect'})

    data = []
    for hotel_box in hotel_boxes:
        hotel_name = hotel_box.find("a", {"target": "_blank"}).find(text=True)
        log.info("Hotel name: %s" % hotel_name.strip())

        stars = hotel_box.find("img", {"class": "sprite-ratings"})
        if stars:
            star = stars['alt'].split()[0]
            log.info("Stars: %s" % star)

        num_reviews = hotel_box.find("span", {
            'class': "more"
        }).findAll(text=True)
        if num_reviews:
            num_reviews1 = [x for x in num_reviews if "review" in x][0].strip()
            log.info("Number of reviews: %s " % num_reviews1)

        link = hotel_box.find('a', {'class': "property_title"})
        url = base_url + link['href']
        # Sleep 2 sec before starting a new http request
        time.sleep(2)
        # Request page
        headers = {'User-Agent': user_agent}
        response = requests.get(url, headers=headers)
        new_html = response.text.encode('utf-8')

        row = helper(new_html)
        row.insert(0, float(num_reviews1.strip("reviews").replace(",", "")))
        row.insert(0, float(star))
        data.append(row)
    with open("hotels.csv", "a+") as file:
        csv.writer(file).writerows(data)

    # Get next URL page if exists, otherwise exit
    div = soup.find("div", {"id": "pager_bottom"})
    # check if this is the last page
    pages = soup.find('span', {"class": "guiArw pageEndNext"})
    if not pages is None:
        log.info("We reached last page.")
        sys.exit()
    # If not, return the url to the next page
    hrefs = div.findAll('a', href=True)
    for href in hrefs:
        next_page = str(page_count + 1)
        if href.find(text=True) == next_page:
            log.info("Next url is %s" % href['href'])
            return href['href']
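A minimal driver sketch for the function above, assuming base_url and user_agent are module-level names (the function itself uses both); everything else below is only illustrative and not part of the original script:

import time
import requests

def crawl_hotel_list(start_path):
    # Repeatedly fetch a list page and hand it to parse_hotellist_page();
    # that function exits the script itself once the last page is reached,
    # otherwise it returns the relative link to the next page.
    page_count = 1
    path = start_path
    while True:
        headers = {'User-Agent': user_agent}
        response = requests.get(base_url + path, headers=headers)
        html = response.text.encode('utf-8')
        path = parse_hotellist_page(html, page_count)
        page_count += 1
        time.sleep(2)  # stay polite between list pages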
コード例 #44
0
ファイル: parse_links.py プロジェクト: hxssgaa/LearnPython
def fasterBS(url, f):
    """fasterBS() - use BeautifulSoup to parse only anchor tags"""
    parsed = BeautifulSoup(f, parseOnlyThese=SoupStrainer('a'))
    # skip anchors that carry no href attribute (e.g. <a name="...">)
    links = [urljoin(url, x['href']) for x in parsed if x.get('href')]
    output(links)
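The SoupStrainer restricts parsing to the <a> tags, which is what makes this variant faster than building the full tree. A small usage sketch, assuming the rest of parse_links.py defines output() and runs under Python 2 as the snippet suggests:

import urllib2

url = 'http://www.example.com/'
html = urllib2.urlopen(url).read()
fasterBS(url, html)  # output() (a helper assumed to exist elsewhere) receives the absolute links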
コード例 #45
0
ファイル: loppe.py プロジェクト: pedroamaralf/loppe
    def inicio(self, widget):
        def message(msg, model):
            if model == 1:
                diag = gtk.MessageDialog(self.janela, gtk.DIALOG_MODAL,
                                         gtk.MESSAGE_WARNING, gtk.BUTTONS_OK)
            elif model == 2:
                diag = gtk.MessageDialog(self.janela, gtk.DIALOG_MODAL,
                                         gtk.MESSAGE_INFO, gtk.BUTTONS_OK)

            diag.set_markup(msg)
            diag.run()
            diag.destroy()

        day = datetime.date.today()
        month = datetime.date.today().month
        year = datetime.date.today().year
        year_month = str(year) + "-" + str(month)
        timeString = time.strftime('%H:%M:%S')
        timeString2 = time.strftime('%H-%M')
        arq_name = str(day) + "_" + timeString2

        # key validation (off)
        # URL_KEY = 'http://www.ow7.com.br/loppe.html'
        # page_key = requests.get(URL_KEY)
        # bs_key = BeautifulSoup(page_key.content)

        # a_key = bs_key.find('span', {'id': 'MM'}).string

        # print a_key

        a_key = 'teste'
        a_url = 'http://online4.detran.pe.gov.br/'
        a_url = a_url + 'NovoSite/Detran_Veiculos/result_Consulta.aspx?placa='

        if a_key != 'teste':
            message("Contacte o administrador.\n\nKleber Soares\n"
                    "81 8172.9074\[email protected]", 1)
        else:
            try:
                if not os.path.exists(year_month):
                    mkdir(year_month)

                arq = open(self.filechooserbutton.get_filename())
                str_placas = arq.read()
                # strip() removes the line breaks
                placas = str_placas.strip().split(",")
                placas = [x for x in placas if x]
                qtd_placas = len(placas)
                arq.close()

                i = 0
                lin = 1

                wb = xlwt.Workbook()
                ws = wb.add_sheet('Detran Pernambuco')

                ws.write(0, 0, 'PLACA')
                ws.write(0, 1, 'RESTRICAO 1')
                ws.write(0, 2, 'RESTRICAO 2')
                ws.write(0, 3, 'RESTRICAO 3')
                ws.write(0, 4, 'RESTRICAO 4')
                ws.write(0, 5, 'DATA')
                ws.write(0, 6, 'HORA')

                for placa in placas:
                    placa = placa.strip()
                    i += 1
                    self.count_in_thread(qtd_placas)
                    self.progress_bar.set_text(
                        "("+placa+") "+str(i)+"/"+str(qtd_placas))

                    while gtk.events_pending():
                        gtk.main_iteration()

                    URL_ULTIMOS_RESULTADOS = a_url + placa
                    page = requests.get(URL_ULTIMOS_RESULTADOS)
                    bs = BeautifulSoup(page.content)

                    labels = (
                        bs.find('span', {'id': 'lblRestricao1'}
                                ).find('font').string,
                        bs.find('span', {'id': 'lblRestricao2'}
                                ).find('font').string,
                        bs.find('span', {'id': 'lblRestricao3'}
                                ).find('font').string,
                        bs.find('span', {'id': 'lblRestricao4'}
                                ).find('font').string,
                    )

                    # csv = placa+","
                    ws.write(lin, 0, placa)

                    col = 1

                    for label in labels:
                        # write only the columns that actually hold a restriction
                        if label:
                            ws.write(lin, col, label)
                        col += 1

                    ws.write(lin, 5, str(day))
                    ws.write(lin, 6, timeString)

                    lin += 1

                    sleep(1)

                    wb.save(year_month+"/"+arq_name+".xls")

                message("Arquivo gerado com sucesso.\n"
                        "Verifique a pasta do aplicativo.", 2)
            except TypeError, erro:
                if not self.filechooserbutton.get_filename():
                    message("Selecione um arquivo.", 1)
                else:
                    print "Um erro ocorreu: %s" % erro
                    message("Um erro ocorreu: %s" % erro, 1)
コード例 #46
0
def extract_from_html(raw_html, base_url, only_links=True):
    """
    Extract URLs from HTML.

    Implementation notes:

    - The current implementation is fault tolerant, meaning it will try
      to extract URLs even if the HTML is malformed and browsers wouldn't
      normally see those links. This may therefore result in some false
      positives.

    - HTML5 tags are supported, including tags not currently supported by
      any major browser.

    :param raw_html: Raw HTML data.
    :type raw_html: str

    :param base_url: Base URL for the current document.
    :type base_url: str

    :param only_links: If True, only extract links to other resources. If False, extract all URLs.
    :type only_links: bool

    :returns: Extracted URLs.
    :rtype: set(str)
    """

    # Set where the URLs will be collected.
    result = set()
    add_result = result.add

    # Remove the fragment from the base URL.
    base_url = urldefrag(base_url)[0]

    # Parse the raw HTML.
    bs = BeautifulSoup(raw_html, convertEntities=BeautifulSoup.ALL_ENTITIES)

    # Some sets of tags and attributes to look for.
    href_tags = {"a", "link", "area"}
    src_tags = {
        "form", "script", "img", "iframe", "frame", "embed", "source", "track"
    }
    param_names = {"movie", "href", "link", "src", "url", "uri"}

    # Iterate once through all tags...
    for tag in bs.findAll():

        # Get the tag name, case insensitive.
        name = tag.name.lower()

        # Extract the URL from each tag that has one.
        url = None
        if name in href_tags:
            url = tag.get("href", None)
        elif name in src_tags:
            url = tag.get("src", None)
        elif name == "param":
            name = tag.get("name", "").lower().strip()
            if name in param_names:
                url = tag.get("value", None)
        elif name == "object":
            url = tag.get("data", None)
        elif name == "applet":
            url = tag.get("code", None)
        elif name == "meta":
            name = tag.get("name", "").lower().strip()
            if name == "http-equiv":
                content = tag.get("content", "")
                p = content.find(";")
                if p >= 0:
                    url = content[p + 1:]
        elif name == "base":
            url = tag.get("href", None)
            if url is not None:

                # Unicode URLs are not supported.
                try:
                    url = str(url)
                except Exception:
                    continue

                # Update the base URL.
                try:
                    base_url = urljoin(base_url,
                                       url.strip(),
                                       allow_fragments=False)
                except Exception:
                    continue

        # If we found a URL in this tag...
        if url is not None:

            # Unicode URLs are not supported.
            try:
                url = str(url)
            except Exception:
                continue

            # Canonicalize the URL.
            try:
                url = urljoin(base_url, url.strip())
            except Exception:
                continue

            # Discard URLs that are not links to other pages or resources.
            if not only_links or is_link(url, base_url=base_url):

                # Add the URL to the set.
                add_result(url)

    # Return the set of collected URLs.
    return result
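A short usage sketch with inline HTML; only_links=False is passed so the example does not depend on the project's is_link() helper:

sample_html = """
<html>
  <head><base href="http://example.com/docs/"></head>
  <body>
    <a href="page.html">a relative link</a>
    <img src="/static/logo.png">
    <script src="app.js"></script>
  </body>
</html>
"""

for found_url in sorted(extract_from_html(sample_html, "http://example.com/",
                                           only_links=False)):
    print found_url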
コード例 #47
0
ファイル: config.py プロジェクト: seawindx/hyer
class Config(dict):
    """
    Comrade "1.62m" is a notoriously lazy fellow, and this config-file parser
    is the proof: the parser class simply uses BeautifulSoup to do the parsing.
    Lazy to the core...
    @brief
    Example:
    //file demo.conf
    <root>
        <db type="string">
            host=localhost;user=root;pass=;db=test;
        </db>
    </root>

    #file test.py
    import hyer.config
    config=hyer.config.Config(open("demo.conf").read())
    print config["root"]["db"].values()
    """
    def __init__(self,content,last_find=None):
        self.content=content
        self.last_find=last_find
        self.soup= BeautifulSoup(self.content)
        self.builders={
                "string":str,
                "regexp":regexp,
                "list":list,
                "python":eval,
                "regexp_list":regexp_list,
                "python_list":python_list
                }
    def __str__(self):
        return self.content

    def values(self):
        results=[]
        for item in self.last_find:
            try:
                type=item["type"]
            except:
                type="string"
            results.append(self.builders[type](R_TRIM.sub("",str(item))))
        return results

    def value(self):
        item = self.last_find[0]
        try:
            type=item["type"]
        except:
            type="string"
        return self.builders[type](R_TRIM.sub("",str(item)))

    def sections(self):
        results=[]
        for item in self.last_find:
           results.append(str(item))
        return results

    def __getitem__(self,key):
        data=self.soup.findAll(key)
        self.last_find=data
        if len(data)==0:
            return NoneConfig()
        else:
            return Config(str(data),self.last_find)
コード例 #48
0
    def Genre(self, genre, filter, page, totalpage):
        if 'Top' in genre:
            url = self.url_base + '/top50/' + self.genrelist[genre]
            if filter != "": url = url + '/' + str(filter)
            type = 'table'
        elif genre == 'Kijktips':
            url = self.url_base + '/kijktips/etalage'
            type = 'json'
        else:
            url = self.url_base + '/7dagen/' + self.genrelist[genre]
            if filter != "": url = url + ',' + str(filter)
            url = url + '?weergave=detail&page=' + str(page)
            type = 'ol'

        if type == 'json':
            data = tools.urlopen(self.app, url, {'cache': 3600, 'xhr': True})
            json_data = json.loads(data)

            genrelist = []
            if len(data) < 1:
                mc.ShowDialogNotification("No genre found for " + str(genre))
                return genrelist

            for item in json_data:
                genreitem = CreateEpisode()
                if item['name'] != item['series_name']:
                    genreitem.name = item['series_name'] + ': ' + item['name']
                else:
                    genreitem.name = item['name']
                genreitem.id = self.url_base + item['link']
                genreitem.description = item['contents']
                genreitem.thumbnails = item['thumbnail']
                genreitem.page = page
                genreitem.totalpage = totalpage
                genrelist.append(genreitem)

            return genrelist

        else:
            data = tools.urlopen(self.app, url, {'cache': 3600})
            genrelist = []
            if data == "":
                mc.ShowDialogNotification("No genre found for " + str(genre))
                return genrelist

            soup = BeautifulSoup(data,
                                 convertEntities=BeautifulSoup.HTML_ENTITIES,
                                 smartQuotesTo="xml")
            if totalpage == "":
                try:
                    pagediv = soup.findAll('div', {'class': 'pagination'})[0]
                    apage = pagediv.findAll("a")
                    totalpage = int(apage[len(apage) - 2].contents[0])
                except:
                    totalpage = 1

            if type == 'table':
                div_show = soup.find('table', {'class': 'episodes'})
                list = div_show.findAll("tr")

            elif type == 'ol':
                div_show = soup.find('ol', {'class': 'broadcasts detail'})
                list = div_show.findAll("li")

            for info in list:
                try:
                    omroep = info.findAll(
                        attrs={"class": "broadcaster-logo"})[0]['alt']
                    item = True
                except:
                    item = False
                if item:
                    if omroep == "Nederland 1": omroep = "nl1"
                    elif omroep == "Nederland 2": omroep = "nl2"
                    elif omroep == "Nederland 3": omroep = "nl3"
                    try:
                        thumb = info.findAll(
                            attrs={"class": "thumbnail"})[0]['src']
                    except:
                        thumb = info.findAll(
                            attrs={"class": "thumbnail placeholder"})[0]['src']
                    path = self.url_base + info.find(
                        attrs={"class": "thumbnail_wrapper"})['href']

                    if type == 'ol':
                        title = info.findAll(
                            attrs={"class": "series"})[0].contents[0]
                        desc = info.find('div', {
                            'class': 'description'
                        }).p.contents[0]
                        date = info.find(attrs={
                            "class": "channel"
                        }).contents[0].replace(' ', '').replace(
                            '\n',
                            '').replace('\t',
                                        '').replace('op',
                                                    '').replace('om', '')
                    if type == 'table':
                        title = info.findAll(
                            attrs={"class": "series"}
                        )[0].contents[0] + ': [COLOR FFA6A6A6]' + info.find(
                            'a', {
                                'class': 'episode'
                            }).contents[0] + '[/COLOR]'
                        desc = ''
                        date = info.find(
                            'td', {'class': 'right'})['title'].split(' ')[0]
                    genreitem = CreateEpisode()
                    genreitem.name = title
                    genreitem.id = path
                    genreitem.description = desc
                    genreitem.thumbnails = thumb
                    genreitem.date = date
                    genreitem.filter = str(omroep).upper()
                    genreitem.page = page
                    genreitem.totalpage = totalpage
                    genrelist.append(genreitem)

            return genrelist
コード例 #49
0
def getData(user, pw):

    itemlist = []

    # get HTML
    link = 'https://www.onlinetvrecorder.com/v2/?go=home'
    data = functions.getHTML(user, pw, link)

    # logged in
    result = data.replace('\'', '\"')
    soup = BeautifulSoup(result)

    # search for highlights

    tables = soup.findAll('div', {'class': 'content'})

    for table in tables:
        # check its the right table
        taList = table.find('div', {'class': 'homedoublehighlight'})
        if taList is not None:

            x = ItemClass()

            taList = table.find('td')
            sStyle = taList['style'].encode()

            m = re.search('background-image:url\((?P<thumb>.*?)\)', sStyle)
            if (m is not None):
                x.thumb = m.group('thumb')
            else:
                x.thumb = 'DefaultVideo.png'

            h1 = table.find('a')
            x.url = h1['href']

            # we just want the id
            s = x.url.index('id=')
            x.url = x.url[s + 3:]

            sp = table.find('span')
            x.title = sp.text

            text1 = table.find('div', {'class': 'homedoublehighlight'})
            x.text = text1.text

            x.vid = ''

            x.text = x.text.replace('|', '\n')

            itemlist.append(x)

    # search for actual movies

    content = soup.findAll('div', {'class': 'homethree'})

    for c in content:

        x = ItemClass()

        link = c.find('a', {'class': 'homethreehredbig'})
        if link is None:
            break

        x.url = link['href']

        # we just want the id
        s = x.url.index('id=')
        x.url = x.url[s + 3:]

        title = c.find('div', {'class': 'toolbardiv'})
        x.title = title.text

        data = c.findAll('div', {'class': 'homethreee'})

        x.thumb = 'DefaultVideo.png'
        x.vid = ''

        for e in data:
            img = e.find('img')
            if img is not None:
                x.thumb = img['src']
            else:
                sty = e['style']
                m = re.search('background-image:url\((?P<thumb>.*?)\)', sty)
                if (m is not None):
                    x.thumb = m.group('thumb')

            vid = e.find('video')
            if vid is not None:
                x.vid = vid['src']

        desc = c.find('div', {'class': 'homethreec'})
        x.text = desc.text

        x.text = x.text.replace('|', '\n')

        itemlist.append(x)

    return itemlist
コード例 #50
0
ファイル: export.py プロジェクト: blhughes/dd
#!/bin/env python

import pdfkit
import requests
import getpass
from BeautifulSoup import BeautifulSoup
import sys

user = sys.argv[2]
urlroot = sys.argv[1]
passwd = getpass.getpass()
auth = (user, passwd)

r = requests.get("%s/ringsheets" % urlroot, auth=auth, verify=False)
soup = BeautifulSoup(r.text)
for x in soup.findAll('a')[5:]:
    path = x.get('href')
    print path
    filename = path.replace('/','_').replace('?','_').replace('&','_').replace('=','_')
    options = {
        'page-size': 'Letter',
        'margin-top': '0.5in',
        'margin-right': '0.5in',
        'margin-bottom': '0.5in',
        'margin-left': '0.5in',
        'encoding': "UTF-8",
        'username': user,
        'password': passwd,
        'zoom': '.9',
    }
    pdfkit.from_url("%s/%s" % (urlroot, path), "%s.pdf" % filename, options=options)
コード例 #51
0
ファイル: laholator.py プロジェクト: ddlarosa/rmxstatistics
 def __unicode__(self):
     # decode HTML entities first, then strip the remaining markup
     html = unicode(BeautifulSoup(self.text, convertEntities=BeautifulSoup.HTML_ENTITIES))
     return nltk.clean_html(html)
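nltk.clean_html() only exists in older NLTK releases (NLTK 3 removed it and points users to BeautifulSoup instead), so a rough equivalent that stays entirely within BeautifulSoup could look like this sketch; it is an assumption, not the original model's code:

 def __unicode__(self):
     # sketch: decode entities with BeautifulSoup and join the text nodes,
     # instead of calling the removed nltk.clean_html()
     soup = BeautifulSoup(self.text, convertEntities=BeautifulSoup.HTML_ENTITIES)
     return u' '.join(soup.findAll(text=True)).strip()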
コード例 #52
0
def getMoreData(user, pw, page):

    # page 1 display page without big highlights

    itemlist = []

    # init browser
    br = mechanize.Browser()
    br.set_handle_robots(False)

    br.open("https://www.onlinetvrecorder.com/v2/?go=home")

    # login
    br.select_form('fhomelogin')

    br['email'] = user
    br['password'] = pw
    br.submit().read()

    select = 0  # -2 returns also the highlights

    if (page == 2):
        select = 14
    if (page > 2):
        select = (15 * (page - 1)) - 1

    params = {u'language': 'de', u'start': str(select)}
    data = urllib.urlencode(params)

    response = br.open(
        "https://www.onlinetvrecorder.com/v2/ajax/get_homethree.php", data)
    result = response.read()

    # logged in

    result = result.replace('\'', '\"')
    soup = BeautifulSoup(result)

    # search for actual movies

    content = soup.findAll('div', {'class': 'homethree'})

    for c in content:

        x = ItemClass()

        link = c.find('a', {'class': 'homethreehredbig'})
        if link is None:
            break

        x.url = link['href']

        # we just want the id
        s = x.url.index('id=')
        x.url = x.url[s + 3:]

        title = c.find('div', {'class': 'toolbardiv'})
        x.title = title.text

        data = c.findAll('div', {'class': 'homethreee'})

        x.thumb = 'DefaultVideo.png'

        for e in data:
            img = e.find('img')
            if img is not None:
                x.thumb = img['src']
            else:
                sty = e['style']
                m = re.search('background-image:url\((?P<thumb>.*?)\)', sty)
                if (m is not None):
                    x.thumb = m.group('thumb')

            vid = e.find('video')
            if vid is not None:
                x.vid = vid['src']

        desc = c.find('div', {'class': 'homethreec'})
        x.text = desc.text

        itemlist.append(x)

    return itemlist
コード例 #53
0
    def get_bill_info(self, chamber, session, bill_id):
        print 'Getting %s %s' % (session, bill_id)

        detail_url = 'http://www.leginfo.ca.gov/cgi-bin/postquery?bill_number=%s_%s&sess=%s' % (
            bill_id[:2].lower(), bill_id[2:], session.replace('-', ''))

        # Get the details page and parse it with BeautifulSoup. These
        # pages contain a malformed 'p' tag that (certain versions of)
        # BS choke on, so we replace it with a regex before parsing.
        details_raw = urllib2.urlopen(detail_url).read()
        details_raw = details_raw.replace('<P ALIGN=CENTER">', '')
        details = BeautifulSoup(details_raw)

        # Get the history page (following a link from the details page).
        # Once again, we remove tags that BeautifulSoup chokes on
        # (including all meta tags, because bills with quotation marks
        # in the title come to us w/ malformed meta tags)
        hist_link = details.find(href=re.compile("_history.html"))
        hist_url = 'http://www.leginfo.ca.gov%s' % hist_link['href']
        history_raw = urllib2.urlopen(hist_url).read()
        history_raw = history_raw.replace(
            '<! ****** document data starts here ******>', '')
        rem_meta = re.compile('</title>.*</head>', re.MULTILINE | re.DOTALL)
        history_raw = rem_meta.sub('</title></head>', history_raw)
        history = BeautifulSoup(history_raw)

        # Find title and add bill
        title_match = re.search('TOPIC\t:\s(\w.+\n(\t\w.*\n){0,})',
                                history_raw, re.MULTILINE)
        bill_title = title_match.group(1).replace('\n', '').replace('\t', ' ')
        self.add_bill(chamber, session, bill_id, bill_title)

        # Find author (primary sponsor)
        sponsor_match = re.search('^AUTHOR\t:\s(.*)$', history_raw,
                                  re.MULTILINE)
        bill_sponsor = sponsor_match.group(1)
        self.add_sponsorship(chamber, session, bill_id, 'primary',
                             bill_sponsor)

        # Get all versions of the bill
        text_re = '%s_%s_bill\w*\.html' % (bill_id[:2].lower(), bill_id[2:])
        links = details.find(text='Bill Text').parent.findAllNext(
            href=re.compile(text_re))
        for link in links:
            version_url = "http://www.leginfo.ca.gov%s" % link['href']

            # This name is not necessarily unique (for example, there may
            # be many versions called simply "Amended"). Perhaps we should
            # add a date or something to make it unique?
            version_name = link.parent.previousSibling.previousSibling.b.font.string
            self.add_bill_version(chamber, session, bill_id, version_name,
                                  version_url)

        # Get bill actions
        action_re = re.compile(
            '(\d{4})|([\w.]{4,6}\s+\d{1,2})\s+(.*(\n\s+.*){0,})', re.MULTILINE)
        act_year = None
        for act_match in action_re.finditer(history.find('pre').contents[0]):
            # If we didn't match group 2 then this must be a year change
            if act_match.group(2) is None:
                act_year = act_match.group(1)
                continue

            # If not year change, must be an action
            act_date = act_match.group(2)
            action = act_match.group(3).replace('\n',
                                                '').replace('  ', ' ').replace(
                                                    '\t', ' ')
            self.add_action(chamber, session, bill_id, chamber, action,
                            act_date)
コード例 #54
0
def scrape(output_file, url_id, url_end, type):
    with open(output_file, 'wb') as csvfile:
        w = unicodecsv.writer(csvfile, encoding='utf-8')
        headers = [
            'county', 'office', 'district', 'party', 'candidate', 'votes'
        ]
        w.writerow(headers)

        for i in range(len(counties)):

            url = ''
            if counties[i] == 'Santa Fe':
                url = 'http://www.sos.state.nm.us/uploads/FileLinks/' + url_id + '/conty000' + url_end + '.htm'
            else:
                url = 'http://www.sos.state.nm.us/uploads/FileLinks/' + url_id + '/conty0' + getCounty(
                    i) + '.HTM' + url_end + '.html'

            r = requests.get(url)
            soup = BeautifulSoup(r.text)
            hed = str(soup.find('h2'))
            tables = soup.findAll('table')

            if type == 'general':
                tables = tables[:len(tables) - 2]

            count = 0
            for table in tables:
                count = count + 1

                office_district = ''
                district = ''

                if count > 1:
                    office_district = table.findAll('h2')[0].getText().split(
                        '-')
                else:
                    office_district = ['PRESIDENT OF THE UNITED STATES']

                if len(office_district) > 1:
                    if office_district[1].split(' ')[1] == 'DISTRICT' \
                            or office_district[1].split(' ')[1] == 'DIVISION':
                        district = office_district[1].split(' ')[-1]
                        if district not in district_exclusions:
                            district = int(district)
                        else:
                            district = ''

                for row in table.findAll('tr'):
                    col = row.findAll('td')
                    county = counties[i]
                    office = office_district[0]
                    party = clean(col[1]).strip()
                    candidate = clean(col[0]).strip()
                    votes = clean(col[2]).strip()

                    if candidate:
                        w.writerow([
                            county,
                            office.strip(), district, party, candidate, votes
                        ])
コード例 #55
0
 def parseLoginError(self, res):
     page = BeautifulSoup(res.read())
     r = page.findAll('span', attrs={'class': 'error'})
     return r
コード例 #56
0
ファイル: food_shelters.py プロジェクト: bonnie/h4h_2017
def get_soup(url):
    """Request webpage and return a Beautiful Soup object."""

    resp = requests.get(url)
    soup = BeautifulSoup(resp.text)
    return soup
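Usage is a one-liner (the URL is only a placeholder):

soup = get_soup('http://www.example.com/')
print soup.title.string if soup.title else 'no <title> tag found'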
コード例 #57
0
    t = date.today() + timedelta(days=1)
    sevenday = t + timedelta(days=7)
    opd_date = str(t.timetuple().tm_year - 1911) + t.strftime("%m%d")
    opd_date2 = str(sevenday.timetuple().tm_year -
                    1911) + sevenday.strftime("%m%d")

    data = {
        'Opd_date': opd_date,
        'Opd_date2': opd_date2,
        'dept_code': dept_code,
        'doc_code': '',
        'Submit1': '確認送出'
    }
    page = fetchPOSTHtml("http://www.wanfang.gov.tw/W402008web_new/opdreg.asp",
                         data)
    soup = BeautifulSoup(page)
    table = soup.findAll('tr', align="middle")[0].parent
    time = str(int(time_shift[0:4]) -
               1911) + time_shift[5:7] + time_shift[8:10]
    shift = ord(time_shift[11:12]) - 64
    tr = table.findAll(lambda tag: tag.text.find(time) > -1)
    if tr == []:
        status = 1
        message = u"找不到可掛號時段."
    else:
        a = tr[0].contents[shift * 2 +
                           1].findAll(attrs={'href': re.compile(doct_code)})
        if a == []:
            status = 1
            message = u"找不到可掛號時段!"
        else:
コード例 #58
0
def loadVideos(url, name):
    #try:
    newlink = url
    xbmc.executebuiltin(
        "XBMC.Notification(Please Wait!,Loading selected video)")
    print newlink
    playtype = "direct"
    if (newlink.find("dailymotion") > -1):
        match = re.compile(
            '(dailymotion\.com\/(watch\?(.*&)?v=|(embed|v|user)\/))([^\?&"\'>]+)'
        ).findall(newlink)
        lastmatch = match[0][len(match[0]) - 1]
        link = 'http://www.dailymotion.com/' + str(lastmatch)
        req = urllib2.Request(link)
        req.add_header(
            'User-Agent',
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3'
        )
        response = urllib2.urlopen(req)
        link = response.read()
        response.close()
        sequence = re.compile('"sequence",  "(.+?)"').findall(link)
        newseqeunce = urllib.unquote(sequence[0]).decode('utf8').replace(
            '\\/', '/')
        #print 'in dailymontion:' + str(newseqeunce)
        imgSrc = re.compile('"videoPreviewURL":"(.+?)"').findall(newseqeunce)
        if (len(imgSrc[0]) == 0):
            imgSrc = re.compile('/jpeg" href="(.+?)"').findall(link)
        dm_low = re.compile('"sdURL":"(.+?)"').findall(newseqeunce)
        dm_high = re.compile('"hqURL":"(.+?)"').findall(newseqeunce)
        vidlink = urllib2.unquote(dm_low[0]).decode("utf8")
    elif (newlink.find("4shared") > -1):
        d = xbmcgui.Dialog()
        d.ok('Not Implemented', 'Sorry 4Shared links', ' not implemented yet')
    elif (newlink.find("docs.google.com") > -1
          or newlink.find("drive.google.com") > -1):
        docid = re.compile('/d/(.+?)/preview').findall(newlink)[0]
        cj = cookielib.LWPCookieJar()
        (cj, vidcontent) = GetContent2(
            "https://docs.google.com/get_video_info?docid=" + docid, "", cj)
        html = urllib2.unquote(vidcontent)
        cookiestr = ""
        try:
            html = html.encode("utf-8", "ignore")
        except:
            pass
        stream_map = re.compile('fmt_stream_map=(.+?)&fmt_list').findall(html)
        if (len(stream_map) > 0):
            formatArray = stream_map[0].replace("\/", "/").split(',')
            for formatContent in formatArray:
                formatContentInfo = formatContent.split('|')
                qual = formatContentInfo[0]
                url = (formatContentInfo[1]).decode('unicode-escape')

        else:
            cj = cookielib.LWPCookieJar()
            newlink1 = "https://docs.google.com/uc?export=download&id=" + docid
            (cj, vidcontent) = GetContent2(newlink1, newlink, cj)
            soup = BeautifulSoup(vidcontent)
            downloadlink = soup.findAll('a', {"id": "uc-download-link"})[0]
            newlink2 = "https://docs.google.com" + downloadlink["href"]
            url = GetDirVideoUrl(newlink2, cj)
        for cookie in cj:
            cookiestr += '%s=%s;' % (cookie.name, cookie.value)
        vidlink = url + ('|Cookie=%s' % cookiestr)
    elif (newlink.find("vimeo") > -1):
        idmatch = re.compile(
            "http://player.vimeo.com/video/([^\?&\"\'>]+)").findall(newlink)
        if (len(idmatch) > 0):
            playVideo('vimeo', idmatch[0])
    elif (newlink.find("youtube") > -1) and (newlink.find("playlists") > -1):
        playlistid = re.compile('playlists/(.+?)\?v').findall(newlink)
        vidlink = "plugin://plugin.video.youtube?path=/root/video&action=play_all&playlist=" + playlistid[
            0]
    elif (newlink.find("youtube") > -1) and (newlink.find("list=") > -1):
        playlistid = re.compile('videoseries\?list=(.+?)&').findall(newlink +
                                                                    "&")
        vidlink = "plugin://plugin.video.youtube?path=/root/video&action=play_all&playlist=" + playlistid[
            0]
    elif (newlink.find("youtube") > -1) and (newlink.find("/p/") > -1):
        playlistid = re.compile('/p/(.+?)\?').findall(newlink)
        vidlink = "plugin://plugin.video.youtube?path=/root/video&action=play_all&playlist=" + playlistid[
            0]
    elif (newlink.find("youtube") > -1) and (newlink.find("/embed/") > -1):
        playlistid = re.compile('/embed/(.+?)\?').findall(newlink + "?")
        vidlink = getYoutube(playlistid[0])
    elif (newlink.find("youtube") > -1):
        match = re.compile(
            '(youtu\.be\/|youtube-nocookie\.com\/|youtube\.com\/(watch\?(.*&)?v=|(embed|v|user)\/))([^\?&"\'>]+)'
        ).findall(newlink)
        if (len(match) == 0):
            match = re.compile(
                'http://www.youtube.com/watch\?v=(.+?)&dk;').findall(newlink)
        if (len(match) > 0):
            lastmatch = match[0][len(match[0]) - 1].replace('v/', '')
        print "in youtube" + lastmatch[0]
        vidlink = lastmatch
        playtype = "youtube"
    else:
        sources = []
        label = name
        hosted_media = urlresolver.HostedMediaFile(url=newlink, title=label)
        sources.append(hosted_media)
        source = urlresolver.choose_source(sources)
        print "inresolver=" + newlink
        if source:
            vidlink = source.resolve()
        else:
            vidlink = ""
    playVideo(playtype, vidlink)
コード例 #59
0
def convert(text):
    global g_allIgnoredP
    # make some magic with text - to simplify parsing
    # remove <p><! p. xxx !></p>
    text = text[text.find("!>") + 2:]
    # remove page info
    i = text.find("<!")
    while -1 != i:
        i2 = text[i + 1:].find("!>")
        assert -1 != i2
        i2 += i + 1
        text = text[:i] + text[i2 + 2:]
        i = text.find("<!")
    text = text.replace("\n\n<p></p>", "")
    # move blockquotes to one p with def
    text = text.replace("</p>\n\n<p><blockquote>", "\n\n<blockquote>")
    # move col to one p with def
    text = text.replace("</p>\n\n<p><col>", "\n\n<col>")
    # move Syn. to one p with def
    text = text.replace("</p>\n\n<p><b>Syn.", "\n\n<b>Syn.")

    print "  start parsing (feed soup - it may take a while)"
    # start parsing
    soup = BeautifulSoup()
    soup.feed(text)
    print "  soup feeded"

    pList = soup.fetch("p")
    currentPos = "ignore"
    currentWord = ""
    currentDef = ""
    currentQuotes = []

    # add word
    # addWord(currentWord, currentPos, currentDef, currentQuotes)
    counter = 0
    for p in pList:
        counter += 1
        if counter % 2000 == 0:
            print "   counter: %d\t Last word: %s" % (counter, currentWord)
        pos = p.first("pos")
        if pos:
            currentPos = getPos(getAllTextFromTag(pos))
        if currentPos != "ignore":
            hw = p.first("hw")
            if hw:
                txt = getAllTextFromTag(hw)
                currentWord = removeAccents(txt)
            defs = p.first("def")
            currentDef = ""
            if defs:
                currentDef = getAllTextFromTag(defs)

            currentQuotes = []
            for q in p.fetch("blockquote"):
                currentQuotes.append(getQuote(q))

            if currentDef != "":
                if currentDef.startswith("See "):
                    handleSeeWord(currentWord, currentPos, currentDef,
                                  currentQuotes)
                else:
                    addWord(currentWord, currentPos, currentDef, currentQuotes)

            else:
                g_allIgnoredP += str(p) + "\n\n"
コード例 #60
0
# Set the user-agent as Mozilla - if the page knows we're Mechanize, it won't return all fields
br.addheaders = [(
    'User-agent',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
)]
#open the URL previously defined as 'starting_url'
br.open(starting_url)
#find out and display (print) the names of any forms in the HTML
#i.e. <form ... name="
print "All forms:", [form.name for form in br.forms()]
#as it happens, the name of the form in this page is... "form"
br.select_form(name="form")
#submit the form and put the contents into 'response'
response = br.submit()
#create soup object by reading the contents of response and passing it through BeautifulSoup
soup = BeautifulSoup(br.response().read())
# Have a look at 'soup': note the 'onSubmit' JavaScript function that is called when
# you click on the 'next' link. We'll mimic this in the function above.
print soup
# START scraping by running scrape_table function created above
scrape_table(soup)

#If we wanted to scrape more than one page of results, we would replace the previous line
#with this function, which would in turn run the other function
#scrape_and_look_for_next_link(soup)

#if we need to print contents of form so we can see what it contains before next step
#print br.form
#if the form requires certain fields to be filled/selected, then we would do so here
#like so: br["ctl00$phMainContent$dropDownAwardDate"] = ["Between"]
#see https://scraperwiki.com/scrapers/new/python?template=tutorial-mechanize#
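For completeness, a minimal sketch of the setup this last snippet assumes; the URL and the scrape_table() body below are placeholders, not the original tutorial's code:

import mechanize
from BeautifulSoup import BeautifulSoup

br = mechanize.Browser()
br.set_handle_robots(False)
starting_url = 'http://www.example.com/search'  # placeholder

def scrape_table(soup):
    # placeholder: dump the cell text of every table row on the page
    for row in soup.findAll('tr'):
        print [td.getText() for td in row.findAll('td')]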