def test_title_is_only_shown_once(self):
    self._grant_permission('anonymous', 'TRAC_ADMIN')
    page = create_tagged_page(self.env, self.req(), 'Foo', '= SomeTitle =\ncontent', ('blog',))
    page.save(None, None, '127.0.0.1')

    soup = BeautifulSoup(self._expand_macro())
    plain_text = ''.join(soup.div(text=True))
    # Search the plain text (with all markup stripped) because Trac also
    # repeats the title in the DOM node id and in the anchor that links
    # to the heading.
    matches = re.findall('SomeTitle', plain_text)
    assert_length(1, matches)
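The test above relies on a BeautifulSoup 3 convention: calling a tag with text=True is shorthand for findAll(text=True) on that tag and returns its text nodes with all markup stripped. A minimal standalone sketch of that pattern (the HTML string is made up for illustration):

from BeautifulSoup import BeautifulSoup
import re

html = '<div><h1 id="SomeTitle">SomeTitle</h1><p>content</p></div>'
soup = BeautifulSoup(html)
# soup.div is the first <div>; calling it with text=True collects its text nodes.
plain_text = ''.join(soup.div(text=True))        # u'SomeTitlecontent'
print len(re.findall('SomeTitle', plain_text))   # prints 1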
Example #2
    def parse(self):
        links = []
        page = urllib2.urlopen(self.base_url)
        page_content = page.read()

        # Cut the "this month" block out of the front page and parse it on its own.
        match = re.search('<div id="this_month">.*<div class="matchbook_rain_light">', page_content, re.DOTALL)
        contents = BeautifulSoup(match.group(0))

        # Same for the magazine package / archive block.
        match = re.search('<div id="mag_package">.*<div id="mag_archive">', page_content, re.DOTALL)
        contents2 = BeautifulSoup(match.group(0))

        # The main (cover) article is linked from an image map; fetch its print
        # view and pull the title out of the article heading.
        match = re.search('<area.*?href="(?P<url>.*?)"', page_content)
        main_url = match.group('url')
        url = 'http://www.wired.com/print%s' % main_url
        page = urllib2.urlopen(url)
        main_article_content = page.read()
        match = re.search('<h1 id="articlehed">(?P<title>.*?)</h1>', main_article_content)
        main_title = match.group('title')
        self.parse_article(main_title, url)

        # Collect the remaining article links from both blocks.
        for div in contents.div(attrs={'class': 'story'}):
            links.append(div.find('a'))

        for div in contents2.div(attrs={'class': 'headline_image'}):
            links.append(div.find('a'))

        for div in contents2.div(attrs={'class': 'headline'}):
            links.append(div.find('a'))

        # Parse each linked article via its print view.
        for link_element in links:
            title = self.text(link_element)
            href = link_element['href']
            url = 'http://www.wired.com/print%s' % href
            self.parse_article(title, url)
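In BeautifulSoup 3, calling a Tag object is shorthand for findAll on that tag, so contents.div(attrs={'class': 'story'}) above means "take the first <div> in contents and return every descendant tag whose class is story". A small sketch of that shorthand on hypothetical markup:

from BeautifulSoup import BeautifulSoup

html = ('<div id="this_month">'
        '<div class="story"><a href="/a">A</a></div>'
        '<div class="story"><a href="/b">B</a></div>'
        '</div>')
contents = BeautifulSoup(html)
# contents.div is the outer <div>; calling it filters its descendants.
for div in contents.div(attrs={'class': 'story'}):
    print div.find('a')['href']   # prints /a, then /b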
Example #3
    def getMeta(self, url):
        """Get metadata from gtnpdatabase.org/boreholes/view/#### page"""
        try:
            html = urllib2.urlopen(url).read()
            soup = BeautifulSoup(html)
        except:
            print("Page not found. Error 404. %s added to list of failed sites"
                  ) % url
            if url not in self.failedURL:
                self.failedURL.append(url)
            return
        # Get pagename ##TODO: assert this is the same as the page name somewhere else?
        pageName = soup.div(id="formHeader")[0].h1.text

        # Get other data from table structure
        meta = [x.text for x in soup.findAll("tr")]
        meta = [
            unicodedata.normalize('NFKD', x).encode('ascii', 'ignore')
            for x in meta
        ]
        metaDict = dict()
        for info in meta:
            if ":" in info:
                info = re.sub("&#176", "deg", info)
                info = re.sub("&nbsp;", "", info)
                info = info.split(":")
                metaDict.update({info[0]: info[1]})
        index = re.search("view/(\d+)", url)
        index = index.group(1)
        metaDict.update({"URL": url})
        metaDict.update({"index": index})
        metaDict.update({"Name": pageName})
        self.cur_siteMeta = metaDict

        outstr = self.buildNameStringBH()

        ## write to csv
        metafile = self.out_dir + "/" + outstr + "_metadata.csv"
        writer = csv.writer(open(metafile, 'wb'))
        for key, value in self.cur_siteMeta.items():
            writer.writerow([key, value])
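The table handling in getMeta above comes down to normalizing each row's text to plain ASCII and splitting it on its first colon into a key/value pair; a standalone sketch of that step on a made-up row:

import re
import unicodedata

row = u'Mean annual temperature:-3.2&#176C'   # hypothetical <tr> text
row = unicodedata.normalize('NFKD', row).encode('ascii', 'ignore')
row = re.sub("&#176", "deg", row)
row = re.sub("&nbsp;", "", row)
key, value = row.split(":", 1)
print key, '->', value                        # Mean annual temperature -> -3.2degC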
#!/usr/bin/env python

from BeautifulSoup import BeautifulSoup
import urlparse
import requests
import posixpath

url = "http://4sq.com/18aGENW"

headers = {"User-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0"}

# Follow the short link; requests resolves the redirect chain for us.
request = requests.get(url, headers=headers)

final_url = request.url
parsed = urlparse.urlparse(final_url)
query = parsed.query
signature = urlparse.parse_qs(query)["s"][0]

# The last path segment is treated as the checkin id, the first as the user.
checkin_id = posixpath.basename(parsed.path)
user = posixpath.dirname(parsed.path).split('/')[1]

soup = BeautifulSoup(request.text)

venue_push = soup.div(attrs={"class": "venue push"})[0]
screen_name = venue_push.h1.strong.text

venue = venue_push.a["href"]

print "Checkin %s is for User \"%s\" with Name \"%s\" checking in at %s"\
    % (checkin_id, user, screen_name, posixpath.basename(venue))
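The signature extraction above is ordinary query-string parsing; a minimal sketch with a made-up resolved URL:

import urlparse

final_url = "https://foursquare.com/someuser/checkin/abc123?s=SIGNATURE&ref=tw"   # hypothetical
parsed = urlparse.urlparse(final_url)
print urlparse.parse_qs(parsed.query)["s"][0]   # prints SIGNATURE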