url= 'http://rfi.fr/afrique' page = urllib2.urlopen(url) soup = BeautifulSoup(page.read(), 'html.parser') sections = soup.findAll('section', {'id':'news'}) #anchors = [td.find('a') for td in soup.findAll('li', {'data-bo-type':'article'})] articles = soup.findAll('li', {'data-bo-type':'article'}) print 'Number of article:', len(articles) for article in articles: if article: a = article.find('a') if a.get('title'): post = Article() print a.get('title').encode('utf-8') post.title = a.get('title').encode('utf-8') print 'Link:', a['href'] post.link = 'http://rfi.fr' + a['href'] print a.get('data-height') if a.get('data-image'): print 'Image:', a.get('data-image') post.thumbnail = a.get('data-image') posts = Article.objects.filter(link = post.link) if posts.count()==0: post.source = 'RFI Afrique' post.view_count = 0
reload(sys) sys.setdefaultencoding('utf8') from news_app.models import Article url= 'http://cameroon-info.net' page = urllib2.urlopen(url) soup = BeautifulSoup(page.read(), 'html.parser') #soup = BeautifulSoup(page.read(), convertEntities=BeautifulSoup.HTML_ENTITIES) #soup = soup.prettify(formatter="html") articles = soup.find_all('td', {'width': '475'}) for a in articles[7:]: print '-'*60 if a: post = Article() link = a.find('a', {'class': 'morehltitle2012'}) if link : print 'Link ', link.get('href') post.link = url + link.get('href') desc = a.find('div', {'class': 'morehldesc'}) if desc : print 'Title:\n', desc.get_text().encode('utf-8') post.title = desc.get_text().encode('utf-8') if desc.img: print '\n\nImage', url + desc.img.get('src') post.thumbnail = url + desc.img.get('src') source = a.find('div', {'class': 'morehlsource'}) if source: print source.encode('utf-8')