def test_title_is_only_shown_once(self):
    """The page title must occur exactly once in the rendered plain text."""
    self._grant_permission('anonymous', 'TRAC_ADMIN')
    page = create_tagged_page(self.env, self.req(), 'Foo',
                              '= SomeTitle =\ncontent', ('blog',))
    page.save(None, None, '127.0.0.1')
    soup = BeautifulSoup(self._expand_macro())
    # Count in markup-stripped text: Trac also embeds the title in the dom
    # node id and in an anchor pointing at the heading, so counting raw
    # HTML would over-count.
    stripped_text = ''.join(soup.div(text=True))
    occurrences = re.findall('SomeTitle', stripped_text)
    assert_length(1, occurrences)
def parse(self):
    """Scrape the Wired front page and feed every article to parse_article."""
    page_content = urllib2.urlopen(self.base_url).read()

    def _section(pattern):
        # Carve one region out of the front page and parse it in isolation.
        return BeautifulSoup(re.search(pattern, page_content, re.DOTALL).group(0))

    this_month = _section('<div id="this_month">.*<div class="matchbook_rain_light">')
    magazine = _section('<div id="mag_package">.*<div id="mag_archive">')

    # The cover story is linked via an image map, not a plain anchor.
    main_url = re.search('<area.*?href="(?P<url>.*?)"', page_content).group('url')
    url = 'http://www.wired.com/print%s' % main_url
    main_article_content = urllib2.urlopen(url).read()
    main_title = re.search('<h1 id="articlehed">(?P<title>.*?)</h1>',
                           main_article_content).group('title')
    self.parse_article(main_title, url)

    # Collect the anchors from each section/class pair, in page order.
    anchors = []
    for section_soup, css_class in ((this_month, 'story'),
                                    (magazine, 'headline_image'),
                                    (magazine, 'headline')):
        for story_div in section_soup.div(attrs={'class': css_class}):
            anchors.append(story_div.find('a'))

    for anchor in anchors:
        title = self.text(anchor)
        self.parse_article(title, 'http://www.wired.com/print%s' % anchor['href'])
def getMeta(self, url):
    """Fetch a gtnpdatabase.org/boreholes/view/#### page and collect its metadata.

    Populates self.cur_siteMeta with the page's "Key:Value" table rows plus
    URL/index/Name entries, then writes them to
    <out_dir>/<buildNameStringBH()>_metadata.csv.

    On any fetch/parse failure the URL is appended (once) to self.failedURL
    and the method returns without raising — best-effort scraping.
    """
    try:
        html = urllib2.urlopen(url).read()
        soup = BeautifulSoup(html)
    except Exception:
        # Fixed: the original did `print("... %s ...") % url`, applying `%`
        # outside the print — format inside the call instead. Also avoid a
        # bare `except:` that would swallow KeyboardInterrupt/SystemExit.
        print("Page not found. Error 404. %s added to list of failed sites" % url)
        if url not in self.failedURL:
            self.failedURL.append(url)
        return

    # Page name shown in the form header.
    # TODO: assert this is the same as the page name somewhere else?
    pageName = soup.div(id="formHeader")[0].h1.text

    # Each <tr> holds one "Key:Value" entry; normalize to plain ASCII.
    meta = [x.text for x in soup.findAll("tr")]
    meta = [unicodedata.normalize('NFKD', x).encode('ascii', 'ignore')
            for x in meta]

    metaDict = dict()
    for info in meta:
        if ":" in info:
            info = re.sub("°", "deg", info)  # degree sign is not CSV/ASCII safe
            info = re.sub(" ", "", info)
            # Split on the FIRST colon only so values containing ':' (e.g.
            # times or URLs) are kept whole; the original split dropped them.
            key, value = info.split(":", 1)
            metaDict[key] = value

    index = re.search(r"view/(\d+)", url).group(1)
    metaDict["URL"] = url
    metaDict["index"] = index
    metaDict["Name"] = pageName
    self.cur_siteMeta = metaDict
    outstr = self.buildNameStringBH()

    # Write one key,value row per metadata entry; `with` closes the file
    # (the original leaked the handle opened inside csv.writer(open(...))).
    metafile = self.out_dir + "/" + outstr + "_metadata.csv"
    with open(metafile, 'wb') as csvfile:
        writer = csv.writer(csvfile)
        for key, value in self.cur_siteMeta.items():
            writer.writerow([key, value])
def getMeta(self, url):
    """Fetch a gtnpdatabase.org/boreholes/view/#### page and collect its metadata.

    Populates self.cur_siteMeta with the page's "Key:Value" table rows plus
    URL/index/Name entries, then writes them to
    <out_dir>/<buildNameStringBH()>_metadata.csv.

    On any fetch/parse failure the URL is appended (once) to self.failedURL
    and the method returns without raising — best-effort scraping.
    """
    try:
        html = urllib2.urlopen(url).read()
        soup = BeautifulSoup(html)
    except Exception:
        # Fixed: the original did `print("... %s ...") % url`, applying `%`
        # outside the print — format inside the call instead. Also avoid a
        # bare `except:` that would swallow KeyboardInterrupt/SystemExit.
        print("Page not found. Error 404. %s added to list of failed sites" % url)
        if url not in self.failedURL:
            self.failedURL.append(url)
        return

    # Page name shown in the form header.
    # TODO: assert this is the same as the page name somewhere else?
    pageName = soup.div(id="formHeader")[0].h1.text

    # Each <tr> holds one "Key:Value" entry; normalize to plain ASCII.
    meta = [x.text for x in soup.findAll("tr")]
    meta = [unicodedata.normalize('NFKD', x).encode('ascii', 'ignore')
            for x in meta]

    metaDict = dict()
    for info in meta:
        if ":" in info:
            info = re.sub("°", "deg", info)  # degree sign is not CSV/ASCII safe
            info = re.sub(" ", "", info)
            # Split on the FIRST colon only so values containing ':' (e.g.
            # times or URLs) are kept whole; the original split dropped them.
            key, value = info.split(":", 1)
            metaDict[key] = value

    index = re.search(r"view/(\d+)", url).group(1)
    metaDict["URL"] = url
    metaDict["index"] = index
    metaDict["Name"] = pageName
    self.cur_siteMeta = metaDict
    outstr = self.buildNameStringBH()

    # Write one key,value row per metadata entry; `with` closes the file
    # (the original leaked the handle opened inside csv.writer(open(...))).
    metafile = self.out_dir + "/" + outstr + "_metadata.csv"
    with open(metafile, 'wb') as csvfile:
        writer = csv.writer(csvfile)
        for key, value in self.cur_siteMeta.items():
            writer.writerow([key, value])
#!/usr/bin/env python from BeautifulSoup import BeautifulSoup import urllib2 import requests import posixpath url = "http://4sq.com/18aGENW" headers = {"User-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0"} request = requests.get(url, headers=headers) final_url = request.url parsed = urllib2.urlparse.urlparse(final_url) query = parsed.query signature = urllib2.urlparse.parse_qs(query)["s"][0] checkin_id = posixpath.basename(parsed.path) user = posixpath.dirname(parsed.path).split('/')[1] soup = BeautifulSoup(request.text) venue_push = soup.div(attrs={"class": "venue push"})[0] screen_name = venue_push.h1.strong.text venue = venue_push.a["href"] print "Checkin %s is for User \"%s\" with Name \"%s\" checking in at %s"\ % (checkin_id, user, screen_name, posixpath.basename(venue))