def wolfplex(options):
    # clean events
    Event.objects.filter(source="wolfplex").delete()

    html_parser = HTMLParser()

    soup = BeautifulSoup(urlopen("http://www.wolfplex.org/wiki/Main_Page").read())

    events = soup.find("div", id="accueil-agenda").dl

    for date_info, event in zip(events('dt'), events('dd')[1::2]):
        if event.span:
            event.span.clear()

        title = html_parser.unescape(event.text)
        base_domain = "http://www.wolfplex.org" if not event.a["href"].startswith("http") else ""
        url = (base_domain + event.a["href"]) if event.a else "http://www.wolfplex.org"
        start = parse(date_info.span["title"])

        if "@" in title:
            title, location = title.split("@", 1)
        else:
            location = None

        Event.objects.create(
            title=title,
            source="wolfplex",
            url=url,
            start=start,
            location=location
        )

        if not options["quiet"]:
            print "Adding %s [%s] (%s)..." % (title.encode("Utf-8"), "wolfplex", location.encode("Utf-8") if location else "")
def getLastPageNum(alamatURL):
    strHTML = fetchHTML(alamatURL)
    mysoup = BeautifulSoup(strHTML)

    # the pagination links live in the first row of <tfoot>
    arrURL = mysoup.findAll('tfoot')[0].findAll('tr')[0].findAll('a')

    maxPage = 0
    if arrURL:
        for link in arrURL:
            lastPageNum = int(link.get('href').split('/')[7].split('?')[0])
            if lastPageNum > maxPage:
                maxPage = lastPageNum
        lastPageNum = maxPage
    else:
        lastPageNum = 0

    print "last page number is:", lastPageNum
    return int(lastPageNum)
Example #3
 def find_external_urls(self, gbobject):
     """Find external urls in an gbobject"""
     soup = BeautifulSoup(gbobject.html_content)
     external_urls = [a['href'] for a in soup.findAll('a')
                      if self.is_external_url(
                          a['href'], self.ressources.site_url)]
     return external_urls
Example #4
def get_daily_specials(day=None):
	page = urlopen(URL)
	soup = BeautifulSoup(page)
	page.close()

	daily_specials = {
		"name": "Dolcetto",
		"specials": [],
		"streetaddress": "Kyrkogatan 8, Sundsvall",
		"dataurl": URL,
		"mapurl": "http://www.hitta.se/ViewDetailsPink.aspx?Vkiid=4uG7%252fiYMOcHQKtp0VSkMNw%253d%253d&Vkid=3215131"
	}

	if day == None:
		day = date.today().weekday()

	# No lunch on Saturday or Sunday
	if day == 5 or day == 6:
		return daily_specials

	day = [u"måndag", u"tisdag", u"onsdag", u"torsdag", u"fredag"][day]
	anchor = soup.find(lambda t: t.name == "h2" and t.text == "Lunchmeny")
	menu = filter(lambda x: isinstance(x, NavigableString), anchor.findNextSibling("p"))
	for i, v in enumerate(menu):
		if day == v.lower():
			daily_specials["specials"].append(menu[i+1])
			break	

	return daily_specials
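A minimal usage sketch for get_daily_specials, assuming the module-level URL constant and the urlopen/BeautifulSoup/NavigableString imports used above are in place (weekday index 0 is Monday; with no argument the function falls back to today):

if __name__ == "__main__":
    menu = get_daily_specials()                  # today's menu; the specials list stays empty on weekends
    print menu["name"], "-", menu["streetaddress"]
    for dish in menu["specials"]:
        print "*", dish.encode("utf-8")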
Example #5
    def crawl_again(self, item, q, s):
        """
        Crawls the content page, looking for all urls in the same domain.

        """
        r = s.get(item['link'])
        soup = BeautifulSoup(r.text)
        main = soup.title.getText()
        urls = soup.findAll('a')
        chre = re.compile("(?<=chpt=)\d+")
        for url in urls:
            href = url['href']
            isChapt = chre.search(href)
            if isChapt == None:
                mySub = "NoChap"
            else:
                mySub = isChapt.group(0)
            if href.startswith('/'):
                link = domain + href
                q.enq({
                    'main_page': main,
                    'sub-page': mySub,
                    'section': url.parent.parent.getText().lstrip(),
                    'link': link
                })
        return len(urls)
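crawl_again relies on a module-level domain string and a queue object exposing enq(); a tiny illustrative stand-in for such a queue (the class name and the commented call below are assumptions, not part of the original project):

class SimpleQueue(object):
    """Minimal stand-in offering the enq() interface crawl_again expects."""
    def __init__(self):
        self.items = []

    def enq(self, item):
        self.items.append(item)

# q = SimpleQueue()
# s = requests.Session()
# crawler.crawl_again({'link': 'http://example.com/chapter?chpt=1'}, q, s)  # 'crawler' is the instance defining this method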
Example #6
def main():

    #for p in range(1,intGetMaxPage +1):
    #soup = BeautifulSoup()
    try:
        resp = urllib2.urlopen(getUrl, timeout=10)
        soup = BeautifulSoup(resp)
        soup = soup.find('div', {'id': 'prodlist'})

        #for k in soup.findAll("div", {'class': 'p-name'}):  # grab <div class='p-name'>...</div>
        for k in soup.findAll('a', href=True):
            try:
                url = k.get('href')
                print k.text
                print url

                page_url = homeUrl + url
                print page_url
                resp_text_page = urllib2.urlopen(homeUrl + url, timeout=10)

                soup_text_page = BeautifulSoup(resp_text_page)
                contextPageUrl(soup_text_page, page_url)
            except:
                print "Unexpected error:", sys.exc_info()[0]
                print "Unexpected error:", sys.exc_info()[1]
                continue
    except:
        print "Unexpected error:", sys.exc_info()[0]
        print "Unexpected error:", sys.exc_info()[1]
        pass
Example #7
def theme_worker():
    def get_projects(doc):
        for result in doc.findAll(title=u"Project acronym"):
            a = result.a
            link = "http://cordis.europa.eu" + dict(a.attrs)['href'][2:]
            yield link

    logging.info('START THEME WORKER')
    while True:
        count = 0
        theme = q.get()
        logging.info('THEME: %s', repr(theme))

        url = THEME_URL % {'theme': theme}
        try:
            while True:
                r = requests.get(url, config=REQUESTS_CONFIG)
                if not r.ok:
                    logging.error("Request failed for url: %s", url)
                    continue
                doc = BeautifulSoup(r.content)
                for proj in get_projects(doc):
                    project_queue.put((theme, proj))
                    count += 1
                try:
                    next_ = dict(doc.find(
                            text="Next 20 projects &raquo;").parent.attrs
                        )['href'][2:]
                except AttributeError:
                    break
                url = "http://cordis.europa.eu" + next_
        except Exception, e:
            logging.error("THEME_WORKER: Error for url: %s", url)
            logging.error(e)
        finally:
            pass  # body truncated in the original snippet; a queue worker would typically call q.task_done() here
Example #8
def get_favicon_url(url):
    if not url.startswith('http'):
        url = "http://{0}".format(url)

    # Check if the root location has a favicon before parsing for it
    if _has_root_favicon(url):
        return urlparse.urljoin(url, 'favicon.ico')

    headers = {'User-Agent': 'Mozilla/5.0'}
    request = urllib2.Request(url, None, headers)

    website = urllib2.urlopen(request).read()

    soup = BeautifulSoup(website)
    favicon_element = soup.find("link", rel="shortcut icon")

    if favicon_element:
        hostname = urlparse.urlparse(url).hostname
        favicon_url = favicon_element['href']

        if favicon_url.startswith('//cdn'):
            return "http:" + favicon_url
        # favicon url is relative and must be converted to absolute path
        elif hostname not in favicon_url:
            return urlparse.urljoin(url, favicon_url)
        else:
            return favicon_url
    else:
        return None
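A hedged usage sketch; it assumes the urllib2/urlparse/BeautifulSoup imports and the _has_root_favicon helper this function relies on are available in the same module, and the site names are placeholders:

if __name__ == "__main__":
    for site in ("python.org", "http://www.djangoproject.com"):
        icon = get_favicon_url(site)
        print site, "->", icon if icon else "no favicon found"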
Example #9
 def split_contents(self):
     """ Iterates over the elements in the block """
     if self.split_content:
         return self.split_content
     split = self.soup.findAll({'link' : True, 'style' : True})
     for elem in split:
         if elem.name == 'link' and elem['rel'] == 'stylesheet':
             filename = self.get_filename(elem['href'])
             path, ext = os.path.splitext(filename)
             if ext in settings.COMPILER_FORMATS.keys():
                 if self.recompile(filename):
                     self.compile(path,settings.COMPILER_FORMATS[ext])
                 basename = os.path.splitext(os.path.basename(filename))[0]
                 elem = BeautifulSoup(re.sub(basename+ext,basename+'.css',unicode(elem)))
                 filename = path + '.css'
             try:
                 self.split_content.append(('file', filename, elem))
             except UncompressableFileError:
                 if django_settings.DEBUG:
                     raise
         if elem.name == 'style':
             data = elem.string            
             elem_type = elem.get('type', '').lower()
             if elem_type and elem_type != "text/css":
                 # it has to be preprocessed
                 if '/' in elem_type:
                     # we accept 'text/ccss' and plain 'ccss' too
                     elem_type = elem_type.split('/')[1]
                 # TODO: that dot-adding compatibility stuff looks strange.
                 # do we really need a dot in COMPILER_FORMATS keys?
                 ext = '.'+elem_type
                 data = self.compile_inline(data,ext)
                 elem = ''.join(("<style type='text/css'>\n",data,"\n</style>"))
             self.split_content.append(('hunk', data, elem))
     return self.split_content
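The selection step at the core of split_contents is the findAll call with a {'link': True, 'style': True} dictionary, which BeautifulSoup 3 treats like a list of tag names; a self-contained sketch of just that step, with made-up sample HTML:

from BeautifulSoup import BeautifulSoup

sample = """<head>
  <link rel="stylesheet" href="css/site.css">
  <style type="text/css">body { margin: 0; }</style>
</head>"""

soup = BeautifulSoup(sample)
for elem in soup.findAll({'link': True, 'style': True}):
    if elem.name == 'link' and elem.get('rel') == 'stylesheet':
        print "external stylesheet:", elem['href']
    elif elem.name == 'style':
        print "inline style block, %d bytes" % len(elem.string or "")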
Example #10
def get_epfile(url):
    """
    Return the file (mp3) URL to be read from the website to play the selected
    reloaded episode.
    Input
        the webpage URL of the episode to be played.
        E.g.: http://www.deejay.it/audio/20130526-4/269989/
    Output
        the URL of the mp3 (rarely a wma) file to be played to listen to the
        selected episode. E.g.:
        http://flv.kataweb.it/deejay/audio/dee_giallo/deegiallolosmemoratodicollegno.mp3
        Returns an empty string if the file cannot be found.
    """
    soup = BeautifulSoup(urllib2.urlopen(url))
    fileurl = soup.find('div', {'id': 'playerCont'})

    if not fileurl:
        return ''
    else:
        hit = re.findall("file=(.*.mp3)&",
            fileurl.iframe['src'])
        if not hit:
            return ''
        else:
            return hit[0]
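A short usage sketch, reusing the episode URL quoted in the docstring above:

if __name__ == "__main__":
    episode = "http://www.deejay.it/audio/20130526-4/269989/"
    mp3_url = get_epfile(episode)
    print mp3_url if mp3_url else "no playable file found"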
Example #11
 def start(self):
     with QMutexLocker(self.mutex):
         self.stoped = False
         
     #for i in range(self.start_p,self.end_p):
     for i in range(1,3):
         while self.suspended:
             self.wait()  
             return
         if self.stoped:
             return
         url ="http://www.99fang.com/service/agency/a1/?p=%d" % i
         print url            
         
         try:
             r = urllib2.urlopen(url).read()
             soup = BeautifulSoup(r)
             box = soup.find("div",{'class':'agency-call-box'})
             lis = box("li")
             for li in lis:
                 
                 tel = li.a.string
                 print tel
                 r =urllib2.urlopen("http://suzhou.jjr360.com/app.php?c=spider&a=index&city=&tel=%s" % tel)
                 print r.read()
         except:
             pass
         else:
             #self.emit(SIGNAL("updateTime()"))
             time.sleep(1)
Example #12
def fetch_page(link_id):
    link = Link.objects.get(pk=link_id)
    url = link.url

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0'}
    req = urllib2.Request(url, None, headers)

    try:
        html = urllib2.urlopen(req).read()
        soup = BeautifulSoup(html)
        link.title = soup.find('title').text

        favicon = soup.find('link', rel='shortcut icon')
        if favicon and favicon['href']:
            link.favicon = urljoin(url, favicon['href'])

        for item in soup.findAll('meta'):
            if item.get('name', '').lower() in ('description', 'og:description') and item.get('content', ''):
                link.description = item.get('content', '')

    except Exception as e:
        link.is_error = 1
        link.error_text = str(e)  # not every exception carries a .reason attribute

    link.save()
Example #13
    def _on_login(self, page):
        soup = BeautifulSoup(page)
        if soup.find('a', text='Log in'):
            raise LoginError(page)

        self._browser.save_cookies()
        return soup
Example #14
    def _on_page(self, page):
        if not page:
            import ipdb
            ipdb.set_trace()

        soup = BeautifulSoup(page)
        if not soup.find('a', text='Log in'):
            event = soup.find('b', text='Something has happened!')
            if event:
                cell = event.findParent('table').findAll('td')[2]
                text = ''.join([x.text if hasattr(x, 'text') else x
                        for x in cell.childGenerator()])
                self._logger.info("Something has happned: %s", text)

            try:
                self._neopoints = get_np(soup)
            except NoNpInPage:
                pass

            return soup

        self._logger.info('Need to login. Using account %s', self._username)
        data = dict(username=self._username, password=self._password,
                    destination=soup.find(
                        'input', attrs=dict(name='destination'))['value'])
        d = self._browser.post('http://www.neopets.com/login.phtml', data)
        d.addCallback(self._on_login)
        return d
Example #15
def getsubhyperlink(origin_url, html_content, reslist, temp_set):
	soup = BeautifulSoup(html_content, parseOnlyThese=SoupStrainer('a'))
	hyperlink = soup.findAll('a',href=True)

	for tag in hyperlink:
		if "https" in tag['href'] or "http" in tag['href']:
			if tag['href'] not in temp_set:
				if origin_url in tag['href']:
					reslist.append(tag['href'])
					temp_set.append(tag['href'])
		else:
			if "www" in tag['href']:
				temp_url = "http://"+tag['href']
				if temp_url not in temp_set:
					if origin_url in temp_url:
						reslist.append(temp_url)
						temp_set.append(temp_url)
			else:
				if tag['href'] and tag['href'][0] == '/': 
					temp_url = origin_url + tag['href']
					if temp_url not in temp_set:
						reslist.append(temp_url)
						temp_set.append(temp_url)
				else:
					temp_url = origin_url + tag['href']
					if temp_url not in temp_set:
						reslist.append(temp_url)
						temp_set.append(temp_url)
def parseLyrics(lyricList,outlist,s,e):
	baseURL = u'http://www.darklyrics.com' 
	i = 0 ;
	for key in lyricList :
		i = i + 1 ;
		if(i >= s and i<= e):
			#key = 'In Flames'  # REMOVE FOR 100 Bands
			time.sleep(1)
			turl = lyricList[key] ;
			print 'Looking up band ' + key
			#print turl
			opener = urllib2.build_opener()
			opener.addheaders = [('User-agent', 'Mozilla/5.0')]
			page = opener.open(turl)
			soup = BeautifulSoup(page.read())
			divs = soup.findChildren('div',attrs={"class" : "album"})
			#get the sub-URL to the lyrics of the latest album and then full URL to the lyrics source
			if(len(divs)>0):
				sub_url =  divs[len(divs)-1].findChildren('a')[0]['href']
				lurl = baseURL + sub_url.split('#')[0][2:]
				#print lurl
				# hit the URL and get data
				page = opener.open(lurl)
				soup = BeautifulSoup(page.read())
				lydiv = soup.findChildren('div',attrs={"class" : "lyrics"})[0]
				[x.extract() for x in lydiv('div')]
				rly = getRawLyrics(lydiv) 
			else:
				rly = "Manual"
				print rly
			outlist[key] = rly
		#break ; # remove once started full testing
	print 'done' , s, ' to ', e	
	return outlist
Example #17
    def selectForm(self, r):
        html = r.content
        linkget = r.url
        forms_filter = SoupStrainer('form')
        soup = BeautifulSoup(html, parseOnlyThese=forms_filter)
        forms_post = ClientForm.ParseFile(StringIO.StringIO(soup.prettify()), linkget, backwards_compat=False)
        return forms_post
Example #18
def removecut(string):
    soup = BeautifulSoup(string, selfClosingTags=['img','br'])
    tag = soup.find('yvcut')
    if not tag: return string
    tag.extract()
    string = soup.renderContents()
    return string    
    def get(self, regno):
        #self.response.headers['Content-Type'] = 'text/html'
        br = _mechanize.Browser()
        cj = cookielib.CookieJar()
        br.set_cookiejar(cj)
        br.set_handle_equiv(True)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_robots(False)
        n = 262
        while n <= 262:
            m = str(n).zfill(4)  # filling zeros for roll no like 001, 002 etc.
            n = n + 1
            #self.response.write('11BEC')  # This is where roll no goes, for 09BCE just replace by 09BCE.
            #u = regno
            r = br.open('https://academics.vit.ac.in/parent/parent_login.asp')
            html = r.read()
            soup = BeautifulSoup(html)
            img = soup.find('img', id='imgCaptcha')
            image_response = br.open_novisit(img['src'])
            captcha = Captcha()
            #captcha.cookie = "123456788sids"
            #captcha.image = db.Blob(image_response.read())
            captcha.regno = regno
            for cook in cj:
                captcha.cookie = cook.value
                captcha.cookiename = cook.name

            captcha.put()
            self.response.headers['Content-Type'] = 'image/jpeg'
            self.response.out.write(image_response.read())
Example #20
  def getMovieData(self):
    list = []
    #-- get serial play list & parameters  -------------------------------------
    html = self.Auth.get_HTML(self.serial_url, None, 'http://serialu.net/media/uppod.swf')

    # -- parsing web page
    html = re.compile('<body>(.+?)<\/body>', re.MULTILINE|re.DOTALL).findall(html)[0]
    soup = BeautifulSoup(html)
    pl_url = ''

    is_multiseason = len(soup.findAll('object', {'type':'application/x-shockwave-flash'}))

    for rec in soup.findAll('object', {'type':'application/x-shockwave-flash'}):
        if is_multiseason > 1:
            season = rec.parent.previousSibling.previousSibling.text+r' '
        else:
            season = r''

        for par in rec.find('param', {'name':'flashvars'})['value'].split('&'):
            if par.split('=')[0] == 'pl':
                pl_url = par[3:]

        if pl_url.find('http:') == -1:
            pl_url = xppod.Decode(pl_url)

        #-- get playlist details ---------------------------------------------------
        html = self.Auth.get_HTML(pl_url, None, 'http://serialu.net/media/uppod.swf')
        self.pl_url = pl_url

        # -- check if playlist is encoded
        if html.find('{"playlist":[') == -1:
            html = xppod.Decode(html).encode('utf-8').split(' or ')[0] #-- TODO: make smart choice

        # -- parsing web page
        s_url = ''
        s_num = 0
        movie_list = []
        for rec in re.compile('{(.+?)}', re.MULTILINE|re.DOTALL).findall(html.replace('{"playlist":[', '')):
            for par in rec.replace('"','').split(','):
                if par.split(':')[0]== 'comment':
                    name = str(s_num+1) + ' серия'  # "серия" = "episode" (Russian)  #par.split(':')[1]+' '
                if par.split(':')[0]== 'file':
                    if 'http' in par.split(':')[1]:
                        s_url = par.split(':')[1]+':'+par.split(':')[2]
                    else:
                        s_url = xppod.Decode(par.split(':')[1]).split(' or ')[0]
            s_num += 1

            # mark part for history
            name = season.encode('utf-8') + name

            movie_list.append({'movie_name': name, 'url': s_url})
            #if h_part <> '-':
            #    if name == h_part:
            #        name = '[COLOR FF00FF00]'+name+'[/COLOR]'
        #-- parse data
        list.append({'name':self.serial_name, 'img': self.serial_img, 'descr': self.serial_descr, 'season_number':s_num, 'name_orig':'', 'movie': movie_list})

    #-- return movie list
    return list
def getpresentationdetails(sender, **kwargs):
    print "Pre Save!"
    #print sender
    model =  kwargs['instance']
    
    
    # fetch the presentation url
    
    try:
        import urllib
        from BeautifulSoup import BeautifulSoup as BS
        html = urllib.urlopen(kwargs['instance'].url).read()
        bs = BS(html)
        # find the media url for the presentation

        presurl = bs.find('link', rel='media:presentation')
        print "* Presentation: " + presurl['href']
        # and the thumbnail
        thumburl = bs.find('link', rel='image_src')
        print "* Thumbnail: " + thumburl['href']
        # and the author name
        creator = bs.find('meta', property='dc:creator')
        print "* Creator: " + creator['content']
        
        title = bs.find('meta', property="media:title")
        print "* Content: " + title['content']

    except Exception, e:
        raise e
Example #22
def extract_title(url):
    page = open(page_loc(url))
    soup = BeautifulSoup(page.read())
    title = soup.find('title')
    title = title.string.encode('utf-8')
    gadgets.string_to_file(title, title_loc(url))
    page.close()
Example #23
    def crawl(self, url, q):
        """
        Crawls the main url looking for sub-urls.

        """
        print 'calling crawl with url', url
        s = requests.Session()

        num_urls = 0
        r = requests.get(url)
        soup = BeautifulSoup(r.text)

        trs = soup.findAll('tr')
        for tr in trs:
            tds = tr.findAll('td')
            if len(tds) == 6:
                title = tds[1].getText()
                link = tds[3].find('a')['href']
                item = {
                    'main_page': title,
                }
                item['link'] = self.get_data_link(link, s)
                num_urls += self.crawl_again(item, q, s)

        print 'total urls crawled:', num_urls
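A driver sketch for the crawl/crawl_again pair, reusing the SimpleQueue stand-in shown after crawl_again above; the listing URL and the crawler class name are placeholders:

# q = SimpleQueue()
# crawler = SiteCrawler()                              # whatever class defines crawl()/crawl_again()
# crawler.crawl("http://example.com/listing.html", q)  # prints 'total urls crawled: N'
# print "queued items:", len(q.items)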
Example #24
def get_syllables(word):
	url = 'http://www.wordcalc.com/index.php'

	post_data = urllib.urlencode(
	   {'text': word})
	post_data = '%s&optionSyllableCount&optionWordCount' % post_data


	cnxn = urllib.urlopen(url, post_data)
	response = cnxn.read()
	cnxn.close()

	soup = BeautifulSoup(response)
	h3_matches = [h3 for h3 in soup.findAll('h3') if h3.text == 'Statistics']
	if len(h3_matches) != 1:
	 raise Exception('Wrong number of <h3>Statistics</h3>')
	h3_match = h3_matches[0]
	table = h3_match.findNextSibling('table')

	td_matches = [td for td in table.findAll('td')
	             if td.text == 'Syllable Count']
	if len(td_matches) != 1:
	 raise Exception('Wrong number of <td>Syllable Count</td>')
	td_match = td_matches[0]

	td_value = td_match.findNextSibling('td')
	syllable_count = int(td_value.text)
	return syllable_count
def getRowsHeadNumber(table):

    # how do we determine how many rows are used as the table header?

    soup = BeautifulSoup(str(table))
    rows = soup.findAll('tr')
    numRows = len(table.findAll(lambda tag: tag.name == 'tr' and tag.findParent('table') == table))

    # initialise numRowsHead, the number of rows that contain header cells
    numRowsHead = 0

    # inspect the rows one by one
    for i in range(0, numRows):

        # if a given row contains a <th> tag
        if rows[i].findAll('th'):

            # then the header extends down to this row
            numRowsHead = i + 1

    # together with getTableDimension this yields the row count, the number of header rows,
    # the column count and the table contents themselves
    return numRowsHead
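A quick self-contained check of the header-row counting with a made-up two-row table; only the BeautifulSoup 3 import used above is assumed:

from BeautifulSoup import BeautifulSoup

sample = BeautifulSoup(
    "<table>"
    "<tr><th>Branch</th><th>ATM</th></tr>"
    "<tr><td>Jakarta</td><td>12</td></tr>"
    "</table>")

print getRowsHeadNumber(sample.table)   # prints 1: only the first row holds <th> cells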
 def setUp(self):
     "Setting common information"
     try:
         from BeautifulSoup import BeautifulSoup, SoupStrainer
     except ImportError:
         self.indices = None
         return
     # Load the file as a tree, but only take the SST table (border=1)
     from urllib import urlopen
     url = "http://www.cpc.noaa.gov/products/analysis_monitoring/"\
           "ensostuff/ensoyears.shtml"
     url = urlopen(url)
     table = BeautifulSoup(url.read(),
                           parseOnlyThese=SoupStrainer("table", border=1))
     # Separate it by rows, but skip the first one (the header)
     years = []
     indices = []
     color = dict(red=+1, white=0, blue=-1)
     deft = [(None,'color:white')]
     for row in table.findAll("tr")[1:]:
         cols = row.findAll('td')
         years.append(int(cols.pop(0).strong.string))
         indices.append([color[getattr(_.span, 'attrs', deft)[0][-1].split(':')[-1]]
                         for _ in cols])
     start_date = ts.Date('M', year=years[0], month=1)
     self.indices = time_series(np.array(indices).ravel(),
                                start_date=start_date)
def getAvailabilityRank(table):

	try:

		#print "getting List of ATMs requires attention..."
	
		soup = BeautifulSoup(str(table))
	
		rows = soup.findAll('tr')

		numRows = getRowsNumber(table)

		numRowsHead = getRowsHeadNumber(table)

	
		arrBestBranchBri = []
		
		for a in range (2, numRows-1):

			trs = BeautifulSoup(str(rows[a]))
			tdcells = trs.findAll("td")

			percentAvailBri = float(tdcells[17].getText())
			ukerName = cleanUpNamaUker(tdcells[0].getText())

			if (percentAvailBri == 100.00):

				#arrBestBranch.append(ukerName+", "+jumlahATM)
				arrBestBranchBri.append(ukerName)

	except IndexError:

		arrBestBranchBri = getAvailabilityRank(table)

	return sorted(arrBestBranchBri)
def scrape_and_look_for_next_link(url):      
    html = scraperwiki.scrape(url)
    #print html
    root = lxml.html.fromstring(html)
    soup = BeautifulSoup(html)                        #using BeautifulSoup to find next page links
    scrape_table(root)                                     #before carrying on scrape the hrefs using the scrape_table function
    #print soup
    
    items = soup.findAll('a',title="Next page")           # findAll "next page" links        
    if items:                                             # if there is a next page link continue
        
        next_link = root.cssselect("div.srch-Page.srch-Page-bg a")
    #print next_link
        if next_link:
            next_link2 = next_link[2].attrib['href']
            #print next_link2
            split_link = re.split("\)+",next_link2)
            split_link2 = re.split("\=+",split_link[0])
            split_link3 = re.split("\'+",split_link2[2])
            #print split_link3[0]
        #print split_link2
        #if split_link ==11:
            next_url = nextlink_url+split_link3[0]
            if next_url:
                print next_url
                scrape_and_look_for_next_link(next_url)
Example #29
def links(args):
    """
    %prog links url

    Extract all the links "<a href=''>" from web page.
    """
    p = OptionParser(links.__doc__)
    p.add_option("--img", default=False, action="store_true",
                 help="Extract <img> tags [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    url, = args
    img = opts.img

    htmlfile = download(url)
    page = open(htmlfile).read()
    soup = BeautifulSoup(page)

    tag = 'img' if img else 'a'
    src = 'src' if img else 'href'
    aa = soup.findAll(tag)
    for a in aa:
        link = a.get(src)
        link = urljoin(url, link)
        print(link)
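Because links() parses an argv-style list itself (via OptionParser), it can be driven from another script as well as from the command line; a hedged sketch with a placeholder URL:

if __name__ == "__main__":
    links(["http://www.example.com/"])           # print every <a href="..."> found on the page
    links(["--img", "http://www.example.com/"])  # print every <img src="..."> instead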
def whitespace(options):
    # clean events
    Event.objects.filter(source="whitespace").delete()

    soup = BeautifulSoup(urlopen("http://www.0x20.be/Main_Page").read())

    for event in soup.ul('li'):
        if event.text == 'More...':
            continue
        title = event.a.text
        url = "http://www.0x20.be" + event.a["href"]
        if "-" in event.b.text[:-1]:
            start, end = map(lambda x: parse(x.strip()), event.b.text[:-1].split("-"))
        else:
            start = parse(event.b.text[:-1])
            end = None
        location = event('a')[1].text

        Event.objects.create(
            title=title,
            source="whitespace",
            url=url,
            start=start,
            end=end,
            location=location.strip() if location else None
        )

        if not options["quiet"]:
            print "Adding %s [%s] (%s)..." % (title.encode("Utf-8"), "whitespace", location.encode("Utf-8"))