def wolfplex(options):
    # clean events
    Event.objects.filter(source="wolfplex").delete()

    html_parser = HTMLParser()
    soup = BeautifulSoup(urlopen("http://www.wolfplex.org/wiki/Main_Page").read())
    events = soup.find("div", id="accueil-agenda").dl

    for date_info, event in zip(events('dt'), events('dd')[1::2]):
        if event.span:
            event.span.clear()

        title = html_parser.unescape(event.text)
        base_domain = "http://www.wolfplex.org" if not event.a["href"].startswith("http") else ""
        url = (base_domain + event.a["href"]) if event.a else "http://www.wolfplex.org"
        start = parse(date_info.span["title"])

        if "@" in title:
            title, location = title.split("@", 1)
        else:
            location = None

        Event.objects.create(
            title=title,
            source="wolfplex",
            url=url,
            start=start,
            location=location
        )

        if not options["quiet"]:
            print "Adding %s [%s] (%s)..." % (title.encode("Utf-8"), "wolfplex",
                                              location.encode("Utf-8") if location else "")
def getLastPageNum(alamatURL):
    strHTML = fetchHTML(alamatURL)
    mysoup = BeautifulSoup(strHTML)
    arrURL = mysoup.findAll('tfoot')[0].findAll('tr')[0].findAll('a')
    maxPage = 0
    if arrURL:
        for i in range(0, len(arrURL)):
            lastPageNum = int(arrURL[i].get('href').split('/')[7].split('?')[0])
            if lastPageNum > maxPage:
                maxPage = lastPageNum
        lastPageNum = maxPage
    else:
        lastPageNum = 0
    print "last page number is:", lastPageNum
    return int(lastPageNum)
def find_external_urls(self, gbobject):
    """Find external urls in a gbobject"""
    soup = BeautifulSoup(gbobject.html_content)
    external_urls = [a['href'] for a in soup.findAll('a')
                     if self.is_external_url(
                         a['href'], self.ressources.site_url)]
    return external_urls
def get_daily_specials(day=None):
    page = urlopen(URL)
    soup = BeautifulSoup(page)
    page.close()

    daily_specials = {
        "name": "Dolcetto",
        "specials": [],
        "streetaddress": "Kyrkogatan 8, Sundsvall",
        "dataurl": URL,
        "mapurl": "http://www.hitta.se/ViewDetailsPink.aspx?Vkiid=4uG7%252fiYMOcHQKtp0VSkMNw%253d%253d&Vkid=3215131"
    }

    if day is None:
        day = date.today().weekday()

    # No lunch on Saturday or Sunday
    if day == 5 or day == 6:
        return daily_specials

    day = [u"måndag", u"tisdag", u"onsdag", u"torsdag", u"fredag"][day]
    anchor = soup.find(lambda t: t.name == "h2" and t.text == "Lunchmeny")
    menu = filter(lambda x: isinstance(x, NavigableString), anchor.findNextSibling("p"))

    for i, v in enumerate(menu):
        if day == v.lower():
            daily_specials["specials"].append(menu[i + 1])
            break

    return daily_specials
def crawl_again(self, item, q, s):
    """
    Crawls the content page, looking for all urls in the same domain.
    """
    r = s.get(item['link'])
    soup = BeautifulSoup(r.text)
    main = soup.title.getText()
    urls = soup.findAll('a')
    chre = re.compile(r"(?<=chpt=)\d+")
    for url in urls:
        href = url['href']
        isChapt = chre.search(href)
        if isChapt is None:
            mySub = "NoChap"
        else:
            mySub = isChapt.group(0)
        if href.startswith('/'):
            link = domain + href
            q.enq({
                'main_page': main,
                'sub-page': mySub,
                'section': url.parent.parent.getText().lstrip(),
                'link': link
            })
    return len(urls)
def main():
    #for p in range(1, intGetMaxPage + 1):
    #soup = BeautifulSoup()
    try:
        resp = urllib2.urlopen(getUrl, timeout=10)
        soup = BeautifulSoup(resp)
        soup = soup.find('div', {'id': 'prodlist'})
        #for k in soup.findAll("div", {'class': 'p-name'}):  # grab <div class='p-name'>...</div>
        for k in soup.findAll('a', href=True):
            try:
                url = k.get('href')
                print k.text
                print url
                page_url = homeUrl + url
                print page_url
                resp_text_page = urllib2.urlopen(homeUrl + url, timeout=10)
                soup_text_page = BeautifulSoup(resp_text_page)
                contextPageUrl(soup_text_page, page_url)
            except:
                print "Unexpected error:", sys.exc_info()[0]
                print "Unexpected error:", sys.exc_info()[1]
                continue
    except:
        #continue
        print "Unexpected error:", sys.exc_info()[0]
        print "Unexpected error:", sys.exc_info()[1]
        pass
def theme_worker():
    def get_projects(doc):
        for result in doc.findAll(title=u"Project acronym"):
            a = result.a
            link = "http://cordis.europa.eu" + dict(a.attrs)['href'][2:]
            yield link

    logging.info('START THEME WORKER')
    while True:
        count = 0
        theme = q.get()
        logging.info('THEME: %s', repr(theme))
        url = THEME_URL % {'theme': theme}
        try:
            while True:
                r = requests.get(url, config=REQUESTS_CONFIG)
                if not r.ok:
                    logging.error("Request failed for url: %s", url)
                    continue
                doc = BeautifulSoup(r.content)
                for proj in get_projects(doc):
                    project_queue.put((theme, proj))
                    count += 1
                try:
                    next_ = dict(doc.find(
                        text="Next 20 projects »").parent.attrs
                    )['href'][2:]
                except AttributeError:
                    break
                url = "http://cordis.europa.eu" + next_
        except Exception, e:
            logging.error("THEME_WORKER: Error for url: %s", url)
            logging.error(e)
        finally:
            # Assumption: the body of this finally block was cut off in the source;
            # q.task_done() is the usual way to mark the fetched theme as processed.
            q.task_done()
def get_favicon_url(url):
    if not url.startswith('http'):
        url = "http://{0}".format(url)

    # Check if the root location has a favicon before parsing for it
    if _has_root_favicon(url):
        return urlparse.urljoin(url, 'favicon.ico')

    headers = {'User-Agent': 'Mozilla/5.0'}
    request = urllib2.Request(url, None, headers)
    website = urllib2.urlopen(request).read()
    soup = BeautifulSoup(website)
    favicon_element = soup.find("link", rel="shortcut icon")
    if favicon_element:
        hostname = urlparse.urlparse(url).hostname
        favicon_url = favicon_element['href']
        if favicon_url.startswith('//cdn'):
            return "http:" + favicon_url
        # favicon url is relative and must be converted to an absolute path
        elif hostname not in favicon_url:
            return urlparse.urljoin(url, favicon_url)
        else:
            return favicon_url
    else:
        return None
def split_contents(self):
    """ Iterates over the elements in the block """
    if self.split_content:
        return self.split_content
    split = self.soup.findAll({'link': True, 'style': True})
    for elem in split:
        if elem.name == 'link' and elem['rel'] == 'stylesheet':
            filename = self.get_filename(elem['href'])
            path, ext = os.path.splitext(filename)
            if ext in settings.COMPILER_FORMATS.keys():
                if self.recompile(filename):
                    self.compile(path, settings.COMPILER_FORMATS[ext])
                basename = os.path.splitext(os.path.basename(filename))[0]
                elem = BeautifulSoup(re.sub(basename + ext, basename + '.css', unicode(elem)))
                filename = path + '.css'
            try:
                self.split_content.append(('file', filename, elem))
            except UncompressableFileError:
                if django_settings.DEBUG:
                    raise
        if elem.name == 'style':
            data = elem.string
            elem_type = elem.get('type', '').lower()
            if elem_type and elem_type != "text/css":
                # it has to be preprocessed
                if '/' in elem_type:
                    # we accept 'text/ccss' and plain 'ccss' too
                    elem_type = elem_type.split('/')[1]
                # TODO: that dot-adding compatibility stuff looks strange.
                # do we really need a dot in COMPILER_FORMATS keys?
                ext = '.' + elem_type
                data = self.compile_inline(data, ext)
                elem = ''.join(("<style type='text/css'>\n", data, "\n</style>"))
            self.split_content.append(('hunk', data, elem))
    return self.split_content
def get_epfile(url):
    """
    Return the file (mp3) URL to be read from the website to play the
    selected reloaded episode.

    Input the webpage URL of the episode to be played.
    E.g.: http://www.deejay.it/audio/20130526-4/269989/

    Output the URL of the mp3 (rarely a wma) file to be played to listen
    to the selected episode.
    E.g.: http://flv.kataweb.it/deejay/audio/dee_giallo/deegiallolosmemoratodicollegno.mp3

    Returns an empty string if the file cannot be found.
    """
    soup = BeautifulSoup(urllib2.urlopen(url))
    fileurl = soup.find('div', {'id': 'playerCont'})
    if not fileurl:
        return ''
    else:
        hit = re.findall("file=(.*.mp3)&", fileurl.iframe['src'])
        if not hit:
            return ''
        else:
            return hit[0]
def start(self):
    with QMutexLocker(self.mutex):
        self.stoped = False
    #for i in range(self.start_p, self.end_p):
    for i in range(1, 3):
        while self.suspended:
            self.wait()
            return
        if self.stoped:
            return
        url = "http://www.99fang.com/service/agency/a1/?p=%d" % i
        print url
        try:
            r = urllib2.urlopen(url).read()
            soup = BeautifulSoup(r)
            box = soup.find("div", {'class': 'agency-call-box'})
            lis = box("li")
            for li in lis:
                tel = li.a.string
                print tel
                r = urllib2.urlopen("http://suzhou.jjr360.com/app.php?c=spider&a=index&city=&tel=%s" % tel)
                print r.read()
        except:
            pass
        else:
            #self.emit(SIGNAL("updateTime()"))
            time.sleep(1)
def fetch_page(link_id):
    link = Link.objects.get(pk=link_id)
    url = link.url
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0'}
    req = urllib2.Request(url, None, headers)
    try:
        html = urllib2.urlopen(req).read()
        soup = BeautifulSoup(html)
        link.title = soup.find('title').text
        favicon = soup.find('link', rel='shortcut icon')
        if favicon and favicon['href']:
            link.favicon = urljoin(url, favicon['href'])
        for item in soup.findAll('meta'):
            if item.get('name', '').lower() in ('description', 'og:description') and item.get('content', ''):
                link.description = item.get('content', '')
    except Exception as e:
        link.is_error = 1
        link.error_text = e.reason.__str__()
    link.save()
def _on_login(self, page):
    soup = BeautifulSoup(page)
    if soup.find('a', text='Log in'):
        raise LoginError(page)
    self._browser.save_cookies()
    return soup
def _on_page(self, page):
    if not page:
        import ipdb
        ipdb.set_trace()
    soup = BeautifulSoup(page)
    if not soup.find('a', text='Log in'):
        event = soup.find('b', text='Something has happened!')
        if event:
            cell = event.findParent('table').findAll('td')[2]
            text = ''.join([x.text if hasattr(x, 'text') else x
                            for x in cell.childGenerator()])
            self._logger.info("Something has happened: %s", text)
        try:
            self._neopoints = get_np(soup)
        except NoNpInPage:
            pass
        return soup
    self._logger.info('Need to login. Using account %s', self._username)
    data = dict(username=self._username,
                password=self._password,
                destination=soup.find(
                    'input', attrs=dict(name='destination'))['value'])
    d = self._browser.post('http://www.neopets.com/login.phtml', data)
    d.addCallback(self._on_login)
    return d
def getsubhyperlink(origin_url, html_content, reslist, temp_set):
    soup = BeautifulSoup(html_content, parseOnlyThese=SoupStrainer('a'))
    hyperlink = soup.findAll('a', href=True)
    for tag in hyperlink:
        if "https" in tag['href'] or "http" in tag['href']:
            if tag['href'] not in temp_set:
                if origin_url in tag['href']:
                    reslist.append(tag['href'])
                    temp_set.append(tag['href'])
        else:
            if "www" in tag['href']:
                temp_url = "http://" + tag['href']
                if temp_url not in temp_set:
                    if origin_url in temp_url:
                        reslist.append(temp_url)
                        temp_set.append(temp_url)
            else:
                if tag['href'] and tag['href'][0] == '/':
                    temp_url = origin_url + tag['href']
                    if temp_url not in temp_set:
                        reslist.append(temp_url)
                        temp_set.append(temp_url)
                else:
                    temp_url = origin_url + tag['href']
                    if temp_url not in temp_set:
                        reslist.append(temp_url)
                        temp_set.append(temp_url)
def parseLyrics(lyricList, outlist, s, e):
    baseURL = u'http://www.darklyrics.com'
    i = 0
    for key in lyricList:
        i = i + 1
        if (i >= s and i <= e):
            #key = 'In Flames'  # REMOVE FOR 100 Bands
            time.sleep(1)
            turl = lyricList[key]
            print 'Looking up band ' + key
            #print turl
            opener = urllib2.build_opener()
            opener.addheaders = [('User-agent', 'Mozilla/5.0')]
            page = opener.open(turl)
            soup = BeautifulSoup(page.read())
            divs = soup.findChildren('div', attrs={"class": "album"})
            # get the sub-URL to the lyrics of the latest album and then the full URL to the lyrics source
            if (len(divs) > 0):
                sub_url = divs[len(divs) - 1].findChildren('a')[0]['href']
                lurl = baseURL + sub_url.split('#')[0][2:]
                #print lurl
                # hit the URL and get the data
                page = opener.open(lurl)
                soup = BeautifulSoup(page.read())
                lydiv = soup.findChildren('div', attrs={"class": "lyrics"})[0]
                [x.extract() for x in lydiv('div')]
                rly = getRawLyrics(lydiv)
            else:
                rly = "Manual"
            print rly
            outlist[key] = rly
            #break  # remove once started full testing
    print 'done', s, ' to ', e
    return outlist
def selectForm(self, r):
    html = r.content
    linkget = r.url
    forms_filter = SoupStrainer('form')
    soup = BeautifulSoup(html, parseOnlyThese=forms_filter)
    forms_post = ClientForm.ParseFile(StringIO.StringIO(soup.prettify()),
                                      linkget, backwards_compat=False)
    return forms_post
def removecut(string):
    soup = BeautifulSoup(string, selfClosingTags=['img', 'br'])
    tag = soup.find('yvcut')
    if not tag:
        return string
    tag.extract()
    string = soup.renderContents()
    return string
def get(self, regno):
    #self.response.headers['Content-Type'] = 'text/html'
    br = _mechanize.Browser()
    cj = cookielib.CookieJar()
    br.set_cookiejar(cj)
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    n = 262
    while n <= 262:
        m = str(n).zfill(4)  # filling zeros for roll no like 001, 002 etc.
        n = n + 1
        #self.response.write('11BEC')  # This is where roll no goes, for 09BCE just replace by 09BCE.
        #u = regno
        r = br.open('https://academics.vit.ac.in/parent/parent_login.asp')
        html = r.read()
        soup = BeautifulSoup(html)
        img = soup.find('img', id='imgCaptcha')
        image_response = br.open_novisit(img['src'])
        captcha = Captcha()
        #captcha.cookie = "123456788sids"
        #captcha.image = db.Blob(image_response.read())
        captcha.regno = regno
        for cook in cj:
            captcha.cookie = cook.value
            captcha.cookiename = cook.name
        captcha.put()
        self.response.headers['Content-Type'] = 'image/jpeg'
        self.response.out.write(image_response.read())
def getMovieData(self):
    list = []

    #-- get serial play list & parameters -------------------------------------
    html = self.Auth.get_HTML(self.serial_url, None, 'http://serialu.net/media/uppod.swf')

    # -- parsing web page
    html = re.compile('<body>(.+?)<\/body>', re.MULTILINE | re.DOTALL).findall(html)[0]
    soup = BeautifulSoup(html)

    pl_url = ''
    is_multiseason = len(soup.findAll('object', {'type': 'application/x-shockwave-flash'}))

    for rec in soup.findAll('object', {'type': 'application/x-shockwave-flash'}):
        if is_multiseason > 1:
            season = rec.parent.previousSibling.previousSibling.text + r' '
        else:
            season = r''

        for par in rec.find('param', {'name': 'flashvars'})['value'].split('&'):
            if par.split('=')[0] == 'pl':
                pl_url = par[3:]

        if pl_url.find('http:') == -1:
            pl_url = xppod.Decode(pl_url)

        #-- get playlist details ---------------------------------------------------
        html = self.Auth.get_HTML(pl_url, None, 'http://serialu.net/media/uppod.swf')
        self.pl_url = pl_url

        # -- check if playlist is encoded
        if html.find('{"playlist":[') == -1:
            html = xppod.Decode(html).encode('utf-8').split(' or ')[0]  #-- TODO: make smart choice

        # -- parsing web page
        s_url = ''
        s_num = 0
        movie_list = []
        for rec in re.compile('{(.+?)}', re.MULTILINE | re.DOTALL).findall(html.replace('{"playlist":[', '')):
            for par in rec.replace('"', '').split(','):
                if par.split(':')[0] == 'comment':
                    name = str(s_num + 1) + ' серия'  #par.split(':')[1]+' '
                if par.split(':')[0] == 'file':
                    if 'http' in par.split(':')[1]:
                        s_url = par.split(':')[1] + ':' + par.split(':')[2]
                    else:
                        s_url = xppod.Decode(par.split(':')[1]).split(' or ')[0]
                    s_num += 1

            # mark part for history
            name = season.encode('utf-8') + name
            movie_list.append({'movie_name': name, 'url': s_url})
            #if h_part <> '-':
            #    if name == h_part:
            #        name = '[COLOR FF00FF00]'+name+'[/COLOR]'

    #-- parse data
    list.append({'name': self.serial_name, 'img': self.serial_img, 'descr': self.serial_descr,
                 'season_number': s_num, 'name_orig': '', 'movie': movie_list})

    #-- return movie list
    return list
def getpresentationdetails(sender, **kwargs):
    print "Pre Save!"
    #print sender
    model = kwargs['instance']
    # fetch the presentation url
    try:
        import urllib
        from BeautifulSoup import BeautifulSoup as BS
        html = urllib.urlopen(kwargs['instance'].url).read()
        bs = BS(html)
        # find the media url
        presurl = bs.find('link', rel='media:presentation')
        print "* Presentation: " + presurl['href']
        # and the thumbnail
        thumburl = bs.find('link', rel='image_src')
        print "* Thumbnail: " + thumburl['href']
        # and the author name
        creator = bs.find('meta', property='dc:creator')
        print "* Creator: " + creator['content']
        title = bs.find('meta', property="media:title")
        print "* Content: " + title['content']
    except Exception, e:
        raise e
def extract_title(url):
    page = open(page_loc(url))
    soup = BeautifulSoup(page.read())
    title = soup.find('title')
    title = title.string.encode('utf-8')
    gadgets.string_to_file(title, title_loc(url))
    page.close()
def crawl(self, url, q):
    """
    Crawls the main url looking for sub-urls.
    """
    print 'calling crawl with url', url
    s = requests.Session()
    num_urls = 0
    r = requests.get(url)
    soup = BeautifulSoup(r.text)
    trs = soup.findAll('tr')
    for tr in trs:
        tds = tr.findAll('td')
        if len(tds) == 6:
            title = tds[1].getText()
            link = tds[3].find('a')['href']
            item = {
                'main_page': title,
            }
            item['link'] = self.get_data_link(link, s)
            num_urls += self.crawl_again(item, q, s)
    print 'total urls crawled:', num_urls
def get_syllables(word):
    url = 'http://www.wordcalc.com/index.php'
    post_data = urllib.urlencode({'text': word})
    post_data = '%s&optionSyllableCount&optionWordCount' % post_data
    cnxn = urllib.urlopen(url, post_data)
    response = cnxn.read()
    cnxn.close()

    soup = BeautifulSoup(response)
    h3_matches = [h3 for h3 in soup.findAll('h3') if h3.text == 'Statistics']
    if len(h3_matches) != 1:
        raise Exception('Wrong number of <h3>Statistics</h3>')
    h3_match = h3_matches[0]

    table = h3_match.findNextSibling('table')
    td_matches = [td for td in table.findAll('td')
                  if td.text == 'Syllable Count']
    if len(td_matches) != 1:
        raise Exception('Wrong number of <td>Syllable Count</td>')
    td_match = td_matches[0]

    td_value = td_match.findNextSibling('td')
    syllable_count = int(td_value.text)
    return syllable_count
def getRowsHeadNumber(table):
    # how do we determine how many rows are used as the header?
    soup = BeautifulSoup(str(table))
    rows = soup.findAll('tr')
    numRows = len(table.findAll(lambda tag: tag.name == 'tr' and tag.findParent('table') == table))
    # initialize numRowsHead as the number of rows that contain header cells
    numRowsHead = 0
    # check every row one by one
    for i in range(0, numRows):
        # if a given row contains a <th> tag
        if rows[i].findAll('th'):
            # then the header row count advances to this row (i + 1)
            numRowsHead = i + 1
    # unlike getTableDimension, which returns the row count, the header row count,
    # the column count and the table contents, this helper only returns the number of header rows
    return numRowsHead
def setUp(self):
    "Setting common information"
    try:
        from BeautifulSoup import BeautifulSoup, SoupStrainer
    except ImportError:
        self.indices = None
        return
    # Load the file as a tree, but only take the SST table (border=1)
    from urllib import urlopen
    url = "http://www.cpc.noaa.gov/products/analysis_monitoring/"\
          "ensostuff/ensoyears.shtml"
    url = urlopen(url)
    table = BeautifulSoup(url.read(),
                          parseOnlyThese=SoupStrainer("table", border=1))
    # Separate it by rows, but skip the first one (the header)
    years = []
    indices = []
    color = dict(red=+1, white=0, blue=-1)
    deft = [(None, 'color:white')]
    for row in table.findAll("tr")[1:]:
        cols = row.findAll('td')
        years.append(int(cols.pop(0).strong.string))
        indices.append([color[getattr(_.span, 'attrs', deft)[0][-1].split(':')[-1]]
                        for _ in cols])
    start_date = ts.Date('M', year=years[0], month=1)
    self.indices = time_series(np.array(indices).ravel(),
                               start_date=start_date)
def getAvailabilityRank(table):
    try:
        #print "getting List of ATMs requires attention..."
        soup = BeautifulSoup(str(table))
        rows = soup.findAll('tr')
        numRows = getRowsNumber(table)
        numRowsHead = getRowsHeadNumber(table)
        arrBestBranchBri = []
        for a in range(2, numRows - 1):
            trs = BeautifulSoup(str(rows[a]))
            tdcells = trs.findAll("td")
            percentAvailBri = float(tdcells[17].getText())
            ukerName = cleanUpNamaUker(tdcells[0].getText())
            if (percentAvailBri == 100.00):
                #arrBestBranch.append(ukerName+", "+jumlahATM)
                arrBestBranchBri.append(ukerName)
    except IndexError:
        arrBestBranchBri = getAvailabilityRank(table)
    return sorted(arrBestBranchBri)
def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    #print html
    root = lxml.html.fromstring(html)
    soup = BeautifulSoup(html)  # using BeautifulSoup to find next page links
    scrape_table(root)  # before carrying on, scrape the hrefs using the scrape_table function
    #print soup
    items = soup.findAll('a', title="Next page")  # findAll "next page" links
    if items:  # if there is a next page link, continue
        next_link = root.cssselect("div.srch-Page.srch-Page-bg a")
        #print next_link
        if next_link:
            next_link2 = next_link[2].attrib['href']
            #print next_link2
            split_link = re.split("\)+", next_link2)
            split_link2 = re.split("\=+", split_link[0])
            split_link3 = re.split("\'+", split_link2[2])
            #print split_link3[0]
            #print split_link2
            #if split_link == 11:
            next_url = nextlink_url + split_link3[0]
            if next_url:
                print next_url
                scrape_and_look_for_next_link(next_url)
def links(args):
    """
    %prog links url

    Extract all the links "<a href=''>" from a web page.
    """
    p = OptionParser(links.__doc__)
    p.add_option("--img", default=False, action="store_true",
                 help="Extract <img> tags [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    url, = args
    img = opts.img

    htmlfile = download(url)
    page = open(htmlfile).read()
    soup = BeautifulSoup(page)

    tag = 'img' if img else 'a'
    src = 'src' if img else 'href'
    aa = soup.findAll(tag)
    for a in aa:
        link = a.get(src)
        link = urljoin(url, link)
        print(link)
def whitespace(options):
    # clean events
    Event.objects.filter(source="whitespace").delete()

    soup = BeautifulSoup(urlopen("http://www.0x20.be/Main_Page").read())

    for event in soup.ul('li'):
        if event.text == 'More...':
            continue

        title = event.a.text
        url = "http://www.0x20.be" + event.a["href"]
        if "-" in event.b.text[:-1]:
            start, end = map(lambda x: parse(x.strip()), event.b.text[:-1].split("-"))
        else:
            start = parse(event.b.text[:-1])
            end = None
        location = event('a')[1].text

        Event.objects.create(
            title=title,
            source="whitespace",
            url=url,
            start=start,
            end=end,
            location=location.strip() if location else None
        )

        if not options["quiet"]:
            print "Adding %s [%s] (%s)..." % (title.encode("Utf-8"), "whitespace",
                                              location.encode("Utf-8"))