def scrapeFortune100(self):
    """Scrape company names from CNN's 2012 Global 500 list and insert
    them into the `companies` table.

    Side effects: opens an HTTP connection to money.cnn.com and a MySQL
    connection to the glassdoor database; commits one row per company.
    """
    url = 'http://money.cnn.com/magazines/fortune/global500/2012/full_list/index.html'
    soup = Soup(urllib2.urlopen(url))
    # Company names are the anchors inside the second table column.
    companies = soup.findSelect('td.cnncol2 a')
    con = MySQLdb.connect(
        host='silo.cs.indiana.edu',
        user='******',
        passwd='rutabega',  # NOTE(review): credentials belong in config, not source
        db='glassdoor',
        port=14272
    )
    try:
        x = con.cursor()
        query = """INSERT INTO companies (name) VALUES (%s)"""
        for company in companies:
            try:
                # Parameters must be a one-element tuple: (value,) not (value).
                x.execute(query, (company.string,))
                con.commit()
            except MySQLdb.MySQLError:
                # Roll back this row (e.g. duplicate name) but keep going;
                # the original bare except: also hid KeyboardInterrupt etc.
                con.rollback()
    finally:
        # Always release the connection, even if scraping dies mid-loop.
        con.close()
def main():
    """Fetch project #1 from the Ohloh API and dump its <name> nodes."""
    base_url = 'http://www.ohloh.net/'
    api_key = '1xe6mtGfqDLsq7tMdV2hg'
    client = Ohloh(base_url, api_key)
    payload = client.project(1)
    if not payload:
        sys.exit("No data")
    doc = BeautifulSoup(payload)
    # 'project > name' matches both project-name and used-project-names.
    print(doc.findSelect('project > name'))
    print(doc.prettify())
def main():
    """Entry point: query the Ohloh API for project 1 and print its name tags."""
    api = Ohloh('http://www.ohloh.net/', '1xe6mtGfqDLsq7tMdV2hg')
    data = api.project(1)
    if not data:
        sys.exit("No data")
    parsed = BeautifulSoup(data)
    # The selector returns both project-name and used-project-names.
    print(parsed.findSelect('project > name'))
    print(parsed.prettify())
def parse(self, content):
    """Parse an Alexa backlinks page into a list of AlexaBacklink objects.

    Returns an empty list when the content cannot be parsed.
    """
    tree = BeautifulSoup(content)
    if not tree:
        return []
    # Each backlink URL sits in a <p> under the ordered result list.
    return [AlexaBacklink(node.text)
            for node in tree.findSelect('ol > li p')]
def test_monkeypatch_implicit(self):
    """findSelect should only be callable while the monkeypatch is installed."""
    doc = BeautifulSoup(HTML)
    # Before patching, calling findSelect raises TypeError.
    self.assertRaises(TypeError, doc.findSelect, "*")
    monkeypatch()
    # Patched: the universal selector now matches something.
    self.assert_(doc.findSelect("*"))
    expectations = (
        ("link", ["l1"]),
        ("div#main", ["main"]),
        ("div div", ["inner"]),
    )
    self.assertSelectMultipleExplicit(doc, *expectations)
    unmonkeypatch()
    # Unpatched again: back to raising TypeError.
    self.assertRaises(TypeError, doc.findSelect, "*")
def test_monkeypatch_implicit(self):
    """monkeypatch() must add findSelect; unmonkeypatch() must remove it."""
    page = BeautifulSoup(HTML)

    def assert_unpatched():
        # An unpatched soup has no usable findSelect.
        self.assertRaises(TypeError, page.findSelect, '*')

    assert_unpatched()
    monkeypatch()
    self.assert_(page.findSelect('*'))
    self.assertSelectMultipleExplicit(
        page,
        ('link', ['l1']),
        ('div#main', ['main']),
        ('div div', ['inner']),
    )
    unmonkeypatch()
    assert_unpatched()
def scrapeFortune100(self):
    """Scrape company names off CNN's 2012 Global 500 listing and store
    each one as a row in the `companies` table.

    Side effects: HTTP GET to money.cnn.com, inserts/commits into MySQL.
    """
    url = 'http://money.cnn.com/magazines/fortune/global500/2012/full_list/index.html'
    soup = Soup(urllib2.urlopen(url))
    # Anchors inside the second column hold the company names.
    companies = soup.findSelect('td.cnncol2 a')
    con = MySQLdb.connect(host='silo.cs.indiana.edu',
                          user='******',
                          passwd='rutabega',  # NOTE(review): move credentials out of source
                          db='glassdoor',
                          port=14272)
    try:
        x = con.cursor()
        query = """INSERT INTO companies (name) VALUES (%s)"""
        for company in companies:
            try:
                # execute() needs a sequence of params — (value,) with the comma.
                x.execute(query, (company.string,))
                con.commit()
            except MySQLdb.MySQLError:
                # Only database errors are best-effort skipped; a bare
                # except: here previously hid every other failure too.
                con.rollback()
    finally:
        # Guarantee the connection is released on any exit path.
        con.close()
def transform(self):
    """change the self.html and return it with CSS turned into style attributes.

    Returns the serialized document as a utf-8 str. Raises PremailerError
    when the HTML cannot be parsed, ValueError for a missing external
    stylesheet.
    """
    page = BeautifulSoup(self.html)
    if page is None:
        # Dump the offending input before failing so the caller can debug it.
        print repr(self.html)
        raise PremailerError("Could not parse the html")
    # Strip comments: inline styles must not be applied inside comment nodes.
    comments = page.findAll(text=lambda text: isinstance(text, Comment))
    map(lambda c: c.extract(), comments)
    ##
    ## style selectors
    ##
    rules = []
    # NOTE(review): find("style") returns a single Tag (or None); iterating
    # it walks that tag's children, not every <style> element in the page.
    # Looks like findAll("style") was intended — confirm before changing.
    for style in page.find("style") or []:
        css_body = str(style)
        # Split the CSS into (selector, declarations) rules we can inline,
        # plus leftover rules (e.g. pseudo-classes) that must stay in <style>.
        these_rules, these_leftover = self._parse_style_rules(css_body)
        rules.extend(these_rules)
        if these_leftover:
            # Rewrite the tag so only the un-inlinable rules remain.
            style.text = '\n'.join(
                ['%s {%s}' % (k, v) for (k, v) in these_leftover])
        elif not self.keep_style_tags:
            # Fully inlined and not asked to keep it: drop the tag.
            style.extract()
    if self.external_styles:
        for stylefile in self.external_styles:
            print stylefile
            if stylefile.startswith('http://'):
                # Remote stylesheet — fetched synchronously per transform call.
                css_body = urllib.urlopen(stylefile).read()
            elif os.path.exists(stylefile):
                try:
                    # assumes the file's encoding is codecs.open's default — TODO confirm
                    f = codecs.open(stylefile)
                    css_body = f.read()
                finally:
                    f.close()
            else:
                raise ValueError(u"Could not find external style: %s" % stylefile)
            these_rules, these_leftover = self._parse_style_rules(css_body)
            # External leftovers are discarded: there is no tag to put them back into.
            rules.extend(these_rules)
    for selector, style in rules:
        class_ = ''
        if ':' in selector:
            # Split off the pseudo-class part (e.g. 'a:hover' -> 'a', ':hover');
            # _merge_styles receives it separately.
            selector, class_ = re.split(':', selector, 1)
            class_ = ':%s' % class_
        #sel = CSSSelector(selector)
        items = page.findSelect(selector)
        for item in items:
            # Later rules merge into (and can override) earlier inline styles.
            old_style = item.get('style', '')
            new_style = _merge_styles(old_style, style, class_)
            item['style'] = new_style
            # Mirror selected CSS properties as legacy HTML attributes
            # (for mail clients that ignore style="...").
            self._style_to_basic_html_attributes(item, new_style)
    # Class attributes are useless once styles are inlined; remove them all.
    for item in page.findAll(lambda tag: tag.get('class', None) != None):
        # delete the 'class' attribute
        del item['class']
    ##
    ## URLs
    ##
    if self.base_url:
        # Make every href/src absolute against the configured base URL.
        for attr in ('href', 'src'):
            for item in page.findAll(
                    lambda tag: tag.get(attr, None) != None):
                item[attr] = urlparse.urljoin(self.base_url, item[attr])
    # The default __repr__ encoding for the used version of BeautifulSoup is utf-8
    return str(page).replace('<head/>', '<head></head>')
def transform(self):
    """change the self.html and return it with CSS turned into style attributes.

    Returns the document serialized as a utf-8 str. Raises PremailerError
    on unparseable HTML and ValueError for a missing external stylesheet.
    """
    page = BeautifulSoup(self.html)
    if page is None:
        # Show what failed to parse before raising.
        print repr(self.html)
        raise PremailerError("Could not parse the html")
    # Strip comments.
    comments = page.findAll(text=lambda text: isinstance(text, Comment))
    map(lambda c: c.extract(), comments)
    ##
    ## style selectors
    ##
    rules = []
    # NOTE(review): page.find("style") yields ONE tag (or None); iterating it
    # visits that tag's children. findAll("style") was presumably intended —
    # verify against the tests before touching this.
    for style in page.find("style") or []:
        css_body = str(style)
        # Rules we can inline vs. leftovers (pseudo-classes etc.) that must
        # remain inside a <style> tag.
        these_rules, these_leftover = self._parse_style_rules(css_body)
        rules.extend(these_rules)
        if these_leftover:
            # Keep only the un-inlinable rules in the tag.
            style.text = '\n'.join(['%s {%s}' % (k, v)
                                    for (k, v) in these_leftover])
        elif not self.keep_style_tags:
            # Everything was inlined; drop the now-empty tag.
            style.extract()
    if self.external_styles:
        for stylefile in self.external_styles:
            print stylefile
            if stylefile.startswith('http://'):
                # Fetch remote stylesheet synchronously.
                css_body = urllib.urlopen(stylefile).read()
            elif os.path.exists(stylefile):
                try:
                    # assumes default codecs.open encoding is adequate — TODO confirm
                    f = codecs.open(stylefile)
                    css_body = f.read()
                finally:
                    f.close()
            else:
                raise ValueError(u"Could not find external style: %s" % stylefile)
            these_rules, these_leftover = self._parse_style_rules(css_body)
            # Leftovers from external files are dropped — no tag to host them.
            rules.extend(these_rules)
    for selector, style in rules:
        class_ = ''
        if ':' in selector:
            # Separate the pseudo-class suffix (e.g. ':hover') for merging.
            selector, class_ = re.split(':', selector, 1)
            class_ = ':%s' % class_
        #sel = CSSSelector(selector)
        items = page.findSelect(selector)
        for item in items:
            # Merge rule declarations into any pre-existing inline style.
            old_style = item.get('style','')
            new_style = _merge_styles(old_style, style, class_)
            item['style'] = new_style
            # Also mirror key properties as legacy HTML attributes.
            self._style_to_basic_html_attributes(item, new_style)
    # class attributes are redundant after inlining; remove them everywhere.
    for item in page.findAll(lambda tag: tag.get('class', None) != None):
        # delete the 'class' attribute
        del item['class']
    ##
    ## URLs
    ##
    if self.base_url:
        # Absolutize every href/src against base_url.
        for attr in ('href', 'src'):
            for item in page.findAll(lambda tag: tag.get(attr, None)!= None):
                item[attr] = urlparse.urljoin(self.base_url, item[attr])
    # The default __repr__ encoding for the used version of BeautifulSoup is utf-8
    return str(page).replace('<head/>','<head></head>')