def scrapeFortune100(self):
        """Scrapes off 100 companies from CNN's Fortune 500 companies list."""
        url = 'http://money.cnn.com/magazines/fortune/global500/2012/full_list/index.html'
        soup = Soup(urllib2.urlopen(url))
        companies = soup.findSelect('td.cnncol2 a')
        
        con = MySQLdb.connect(
            host='silo.cs.indiana.edu',
            user='******',
            passwd='rutabega',
            db='glassdoor',
            port=14272
        )

        cursor = con.cursor()

        for company in companies:
            try:
                query = """INSERT INTO companies (name) VALUES (%s)"""
                # The parameter list must be a tuple: (value,) not (value).
                cursor.execute(query, (company.string,))
                con.commit()
            except MySQLdb.Error:
                # Only roll back on database errors; a bare except hides bugs.
                con.rollback()

        con.close()
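soupselect also exposes the same CSS matching as a plain function, so the lookup above can run without the monkeypatch. A minimal sketch, assuming BeautifulSoup 3 and Simon Willison's soupselect module:

import urllib2
from BeautifulSoup import BeautifulSoup as Soup
from soupselect import select

# select(soup, selector) is soupselect's function-style API; it returns the
# same tag list as the monkeypatched soup.findSelect(selector).
url = 'http://money.cnn.com/magazines/fortune/global500/2012/full_list/index.html'
soup = Soup(urllib2.urlopen(url))
for link in select(soup, 'td.cnncol2 a'):
    print link.string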
Example #2
def main():
    base_url = 'http://www.ohloh.net/'
    api_key = '1xe6mtGfqDLsq7tMdV2hg'
    # resource = 'projects/1.xml'

    conn = Ohloh(base_url, api_key)
    # data = conn.request(resource)
    data = conn.project(1)
    
    if not data:
        sys.exit("No data")
        
    soup = BeautifulSoup(data)
    print soup.findSelect('project > name')  # returns both project-name and used-project-names

    print soup.prettify()
Example #4
    def parse(self, content):
        """Extract Alexa backlink entries from a results page."""
        results = []
        document = BeautifulSoup(content)
        if not document:
            return []
        # Each backlink URL sits in a paragraph inside the ordered result list.
        for entry in document.findSelect('ol > li p'):
            url = entry.text
            results.append(AlexaBacklink(url))
        return results
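A hypothetical call to the parser above; the sample markup and the scraper instance are illustrative assumptions, not part of the original project:

# Hypothetical usage: feed a page whose backlinks sit in <ol><li><p> markup.
html = '<ol><li><p>http://example.com/</p></li></ol>'
for backlink in scraper.parse(html):
    print backlink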
Example #5
    def test_monkeypatch_implicit(self):
        soup = BeautifulSoup(HTML)
        self.assertRaises(TypeError, soup.findSelect, "*")

        monkeypatch()

        self.assert_(soup.findSelect("*"))
        self.assertSelectMultipleExplicit(soup,
            ("link", ["l1"]),
            ("div#main", ["main"]),
            ("div div", ["inner"]),
        )

        unmonkeypatch()

        self.assertRaises(TypeError, soup.findSelect, "*")
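The test above exercises soupselect's monkeypatch helpers, which attach findSelect to parsed documents and remove it again. A minimal sketch of that pattern, assuming the same soupselect module the test imports monkeypatch and unmonkeypatch from:

from BeautifulSoup import BeautifulSoup
from soupselect import monkeypatch, unmonkeypatch

monkeypatch()    # adds findSelect(selector) to parsed documents and tags
soup = BeautifulSoup('<div id="main"><div>inner</div></div>')
print soup.findSelect('div#main')

unmonkeypatch()  # removes it; findSelect calls raise TypeError again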
Example #9
    def transform(self):
        """change the self.html and return it with CSS turned into style
        attributes.
        """
        page = BeautifulSoup(self.html)

        if page is None:
            print repr(self.html)
            raise PremailerError("Could not parse the html")

        # Strip comments.
        comments = page.findAll(text=lambda text: isinstance(text, Comment))
        for comment in comments:
            comment.extract()

        ##
        ## style selectors
        ##

        rules = []

        for style in page.find("style") or []:
            css_body = str(style)
            these_rules, these_leftover = self._parse_style_rules(css_body)
            rules.extend(these_rules)

            if these_leftover:
                style.text = '\n'.join(
                    ['%s {%s}' % (k, v) for (k, v) in these_leftover])
            elif not self.keep_style_tags:
                style.extract()

        if self.external_styles:
            for stylefile in self.external_styles:
                print stylefile
                if stylefile.startswith('http://'):
                    css_body = urllib.urlopen(stylefile).read()
                elif os.path.exists(stylefile):
                    # Open before the try block so a failed open cannot reach
                    # f.close() on an unbound name.
                    f = codecs.open(stylefile)
                    try:
                        css_body = f.read()
                    finally:
                        f.close()
                else:
                    raise ValueError(u"Could not find external style: %s" %
                                     stylefile)
                these_rules, these_leftover = self._parse_style_rules(css_body)
                rules.extend(these_rules)

        for selector, style in rules:
            class_ = ''
            if ':' in selector:
                selector, class_ = re.split(':', selector, 1)
                class_ = ':%s' % class_

            # soupselect does the CSS matching (no lxml CSSSelector needed).
            items = page.findSelect(selector)
            for item in items:
                old_style = item.get('style', '')
                new_style = _merge_styles(old_style, style, class_)
                item['style'] = new_style
                self._style_to_basic_html_attributes(item, new_style)

        for item in page.findAll(lambda tag: tag.get('class', None) is not None):
            # delete the 'class' attribute
            del item['class']

        ##
        ## URLs
        ##
        if self.base_url:
            for attr in ('href', 'src'):
                for item in page.findAll(
                        lambda tag: tag.get(attr, None) is not None):
                    item[attr] = urlparse.urljoin(self.base_url, item[attr])

        # This version of BeautifulSoup encodes __repr__ output as utf-8 by default.
        return str(page).replace('<head/>', '<head></head>')
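The transform above reads self.html, self.base_url, self.external_styles, and self.keep_style_tags. A hypothetical driver, assuming a Premailer class whose constructor simply stores those four attributes:

# Hypothetical usage; the Premailer constructor signature is an assumption
# based on the attributes transform() reads.
html = '<html><head><style>p {color: red}</style></head><body><p>Hi</p></body></html>'
p = Premailer(html, base_url='http://example.com/',
              external_styles=None, keep_style_tags=False)
print p.transform()  # the p rule comes back inlined as a style attribute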