# Assumed module-level imports for this excerpt (not shown in the original):
#   import datetime, re
#   import bs4, requests
#   from urlparse import urljoin                 # Python 2; urllib.parse on 3
#   from <app>.models import Article, Journalist, Medium   # app path unknown

def _get_zgqnb_article(self, url, date, page_name):
    """Scrape every article linked from one zgqnb (中国青年报) page."""
    print url
    medium = Medium.objects.get(pk=1836)
    # The page's table of contents lives in <div id="titleList">.
    links = (bs4.BeautifulSoup(requests.get(url).content)
             .find('div', {'id': 'titleList'}).find_all('a'))
    for a in links:
        article_url = urljoin(url, a.get('href'))
        soup = bs4.BeautifulSoup(requests.get(article_url).content)
        title = soup.find('h1').text
        print title
        article = Article()
        article.medium = medium
        article.title = title
        article.url = article_url
        article.publication_date = date
        article.page = page_name
        # Body paragraphs sit in <div id="ozoom">; join them with newlines.
        content = '\n'.join(p.text for p in
                            soup.find('div', {'id': 'ozoom'}).find_all('p'))
        article.content = content
        # If this article was scraped before, refresh its content instead of
        # creating a duplicate row.
        if Article.objects.filter(medium=medium, url=article_url).exists():
            article = Article.objects.get(medium=medium, url=article_url)
            article.content = content
        article.save()
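# Hypothetical driver (not part of the original): a sketch of how a caller
# might walk one issue's pages with the method above. The page count, the
# page-name format, and the 'page_%02d.htm' URL pattern are all placeholders,
# not the real site layout.
def _demo_scrape_zgqnb_issue(self, base_url, date):
    for n in range(1, 9):                                  # assumed page count
        page_name = u'%02d版' % n                          # assumed name format
        page_url = urljoin(base_url, 'page_%02d.htm' % n)  # placeholder pattern
        self._get_zgqnb_article(page_url, date, page_name)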
def _get_rmrb_article(self, content):
    """Build or fetch the Article for one rmrb (人民日报) page's parsed soup."""
    medium = Medium.objects.get(pk=1081)
    title = content.find('h1').text.strip()
    # Match a byline link against known Journalists for this medium, if any.
    author = None
    for author_name in content.find_all('div', {'class': 'summary'})[0].find_all('a'):
        try:
            author = Journalist.objects.get(medium=medium,
                                            name=author_name.text.strip())
            break
        except Journalist.DoesNotExist:
            continue
    # Pull the paper date and page name out of the last summary block.
    # u'报纸日期' labels the date and u'【版名】' the page name; articles with
    # no explicit page name default to u'头版' (the front page).
    publication_date = None
    page = u'头版'
    for li in content.find_all('div', {'class': 'summary'})[-1].find_all('li'):
        if u'报纸日期' in li.text:
            # Assumed always present; e.g. u'【报纸日期】2013-05-01'.
            publication_date = re.search(r'(\d+)-(\d+)-(\d+)', li.text).group()
        if u'版名' in li.text:
            page = (li.text.replace('\n', '')
                           .replace(u'【版名】', '')
                           .replace(' ', ''))
    issue = self._get_issue_from_date(publication_date, 'rmrb')
    # get_or_create ignores plain attribute assignment, so the non-lookup
    # fields must travel in `defaults` to survive a fresh create.
    article, created = Article.objects.get_or_create(
        medium=medium,
        title=title,
        issue=issue,
        publication_date=datetime.datetime.strptime(publication_date, '%Y-%m-%d'),
        defaults={'author': author, 'page': page},
    )
    print article.title
    return article
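# Illustrative sketch (not part of the original): the metadata loop above keys
# off the labels u'报纸日期' ("paper date") and u'【版名】' ("page name") in the
# last summary block. The same extraction, isolated into a hypothetical helper
# named _parse_summary_li for clarity:
def _parse_summary_li(self, text):
    meta = {}
    if u'报纸日期' in text:
        m = re.search(r'(\d+)-(\d+)-(\d+)', text)
        if m:
            meta['publication_date'] = m.group()   # e.g. '2013-05-01'
    if u'版名' in text:
        meta['page'] = (text.replace('\n', '')
                            .replace(u'【版名】', '')
                            .replace(' ', ''))
    return meta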
def _get_whb_article(self, url, date, issue, page):
    """Scrape every article linked from one whb (文汇报) page."""
    medium = Medium.objects.get(pk=1399)
    soup = bs4.BeautifulSoup(requests.get(url).content)
    # The page's article links are listed in <div id="BT">.
    for link in soup.find('div', {'id': 'BT'}).find_all('a'):
        article_page_url = urljoin(url, link.get('href'))
        r = requests.get(article_page_url)
        if r.status_code == 404:    # dead link; skip it
            continue
        article_page = bs4.BeautifulSoup(r.content)
        # Reuse the existing Article for this URL if one was scraped before.
        if Article.objects.filter(medium=medium, url=article_page_url).exists():
            article = Article.objects.get(medium=medium, url=article_page_url)
        else:
            article = Article()
            article.medium = medium
            article.url = article_page_url
        article.publication_date = date
        article.page = page.text.strip()
        article.issue = issue
        print article_page_url
        # The <title> tag reads u'文汇报 - <headline>'; strip the site prefix.
        article.title = article_page.title.text.strip().replace(u'文汇报 - ', '')
        # Prefix each full-width space (the paragraph-indent character in the
        # source text) with a newline so paragraph breaks survive as plain text.
        article.content = (article_page.find('div', {'id': 'articleText'})
                           .text.strip().replace(u'　', u'\n　'))
        article.save()
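# Illustrative sketch (not part of the original): the 404 check above guards
# against dead links before parsing. The same fetch-then-parse step as a
# shared helper (_fetch_soup_or_none is a hypothetical name):
def _fetch_soup_or_none(self, url):
    r = requests.get(url)
    if r.status_code == 404:
        return None
    return bs4.BeautifulSoup(r.content)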
def _get_qlwb_article(self, url, date, issue, page):
    """Scrape a single qlwb (齐鲁晚报) article page."""
    print page
    medium = Medium.objects.get(pk=1025)
    soup = bs4.BeautifulSoup(requests.get(url).content)
    # Reuse the existing Article for this URL if one was scraped before.
    if Article.objects.filter(medium=medium, url=url).exists():
        article = Article.objects.get(medium=medium, url=url)
    else:
        article = Article()
        article.medium = medium
    # As in _get_whb_article, prefix each full-width space with a newline to
    # preserve paragraph breaks.
    article.title = (soup.find('td', {'class': 'font01'})
                     .text.strip().replace(u'　', u'\n　'))
    article.url = url
    article.publication_date = date
    article.page = page
    article.issue = issue
    article.content = (soup.find('span', {'id': 'contenttext'})
                       .text.strip().replace(u'　', u'\n　'))
    article.save()
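# Illustrative sketch (not part of the original): all four scrapers repeat the
# same "load the existing Article for (medium, url) or start a fresh one"
# step. A shared helper could collapse that; _get_or_new_article is a
# hypothetical name:
def _get_or_new_article(self, medium, url):
    try:
        return Article.objects.get(medium=medium, url=url)
    except Article.DoesNotExist:
        article = Article()
        article.medium = medium
        article.url = url
        return article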