def _get_zgqnb_article(self, url, date, page_name):
    """Scrape every article linked from a zgqnb (China Youth Daily) page index.

    url -- index page URL whose <div id="titleList"> holds the article links
    date -- publication date stamped on each scraped article
    page_name -- name of the newspaper page/section stored on each article

    Saves each article; an existing row (same medium + url) is refreshed
    in place instead of losing the newly scraped fields.
    """
    print(url)
    medium = Medium.objects.get(pk=1836)
    index = bs4.BeautifulSoup(requests.get(url).content)
    for link in index.find('div', {'id': 'titleList'}).find_all('a'):
        article_url = urljoin(url, link.get('href'))
        soup = bs4.BeautifulSoup(requests.get(article_url).content)
        title = soup.find('h1').text
        print(title)
        # One query (EAFP) instead of the count()-then-get() double round trip.
        try:
            article = Article.objects.get(medium=medium, url=article_url)
        except Article.DoesNotExist:
            article = Article()
            article.medium = medium
            article.url = article_url
        # Fields are assigned on whichever object we ended up with, so an
        # existing row is updated too (the original set them on a throwaway
        # new Article and kept only the content for existing rows).
        article.title = title
        article.publication_date = date
        article.page = page_name
        article.content = '\n'.join(
            p.text for p in soup.find('div', {'id': 'ozoom'}).find_all('p'))
        article.save()
def _get_rmrb_article(self, content):
    """Parse a People's Daily (rmrb) article page and return its Article.

    content -- BeautifulSoup tree of the article page

    Returns the Article obtained via get_or_create on
    (medium, title, issue, publication_date).
    """
    medium = Medium.objects.get(pk=1081)
    article = Article()
    article.medium = medium
    article.title = content.find('h1').text.strip()
    # First linked name that matches a known journalist becomes the author.
    for author_link in content.find_all('div', {'class': 'summary'})[0].find_all('a'):
        try:
            # Narrowed from a bare `except:` — only a missing journalist
            # should be skipped; real errors must surface.
            article.author = Journalist.objects.get(
                medium=medium, name=author_link.text.strip())
        except Journalist.DoesNotExist:
            continue
        break
    # Default page; only overwritten when an explicit 版名 entry exists.
    # (The original reset it inside the loop on every non-matching <li>,
    # so the result depended on list order and a missing list raised
    # NameError.)
    page = u'头版'
    publication_date = None
    date_re = re.compile(r'(\d+)-(\d+)-(\d+)')  # hoisted out of the loop
    for li in content.find_all('div', {'class': 'summary'})[-1].find_all('li'):
        if li.text.find(u'报纸日期') != -1:
            publication_date = date_re.search(li.text).group()
        if li.text.find(u'版名') != -1:
            page = li.text.replace('\n', '').replace(u'【版名】', '').replace(' ', '')
    article.issue = self._get_issue_from_date(publication_date, 'rmrb')
    article.page = page
    article.publication_date = datetime.datetime.strptime(publication_date, '%Y-%m-%d')
    article, created = Article.objects.get_or_create(
        medium=article.medium,
        title=article.title,
        issue=article.issue,
        publication_date=article.publication_date)
    print(article.title)
    return article
def _get_nfzm_article(self, url, date, issue):
    """Scrape a Southern Weekly (nfzm) article and return an unsaved Article.

    url -- article URL (requires a valid session cookie)
    date -- publication date stored on the article
    issue -- Issue object stored on the article
    """
    medium = Medium.objects.get(pk=951)
    article = Article()
    article.medium = medium
    article.issue = issue
    article.url = url
    article.publication_date = date
    # NOTE(review): hard-coded session id will expire — move to settings/config.
    r = requests.get(url, cookies={'PHPSESSID': 'l19dgbf6ticijmo9ka9osvufk0'})
    content = bs4.BeautifulSoup(r.content)
    article.title = content.title.string.split('-')[-1].strip()
    article.content = content.find('section', {'id': 'articleContent'}).text
    byline = content.find('span', {'class': 'author'}).find_all('em')
    role = byline[1].text
    if role.find(u'南方周末记者') != -1:
        author, created = Journalist.objects.get_or_create(
            medium=medium, name=byline[2].text.strip())
        # Bug fix: get_or_create returns a usable Journalist whether or not
        # the row pre-existed; the original (`if not created`) silently
        # dropped the author exactly when the journalist was new.
        article.author = author
    elif role.find(u'南方周末特约撰稿') != -1:
        article.author_name = byline[2].text.strip()
    elif role.find(u'南方周末编辑部') != -1:
        article.author_name = u'南方周末编辑部'
    print(article.author or article.author_name)
    return article
def _get_whb_article(self, url, date, issue, page):
    """Scrape and save every article listed on a Wenhui Daily (whb) page.

    url -- page index URL whose <div id="BT"> holds the article links
    date -- publication date stamped on each article
    issue -- Issue object stored on each article
    page -- bs4 tag whose text is the page/section name
    """
    medium = Medium.objects.get(pk=1399)
    soup = bs4.BeautifulSoup(requests.get(url).content)
    for title_link in soup.find('div', {'id': 'BT'}).find_all('a'):
        article_page_url = urljoin(url, title_link.get('href'))
        r = requests.get(article_page_url)
        if r.status_code == 404:  # some listed links are dead — skip them
            continue
        article_page = bs4.BeautifulSoup(r.content)
        # One query (EAFP) instead of count() followed by get().
        try:
            article = Article.objects.get(medium=medium, url=article_page_url)
        except Article.DoesNotExist:
            article = Article()
            article.medium = medium
            article.url = article_page_url
        article.publication_date = date
        article.page = page.text.strip()
        article.issue = issue
        print(article_page_url)
        article.title = article_page.title.text.strip().replace(u'文汇报 - ', '')
        article.content = article_page.find('div', {'id': 'articleText'}).text.strip().replace(u'　', '\n　')
        article.save()
def _get_qlwb_article(self, url, date, issue, page):
    """Scrape a Qilu Evening News (qlwb) article page and save it.

    url -- article URL (also the uniqueness key together with the medium)
    date -- publication date stored on the article
    issue -- Issue object stored on the article
    page -- page/section name stored on the article
    """
    print(page)
    medium = Medium.objects.get(pk=1025)
    soup = bs4.BeautifulSoup(requests.get(url).content)
    # One query (EAFP) instead of count() followed by get().
    try:
        article = Article.objects.get(medium=medium, url=url)
    except Article.DoesNotExist:
        article = Article()
        article.medium = medium
        article.url = url
    article.title = soup.find('td', {'class': 'font01'}).text.strip().replace(u'　', '\n　')
    article.publication_date = date
    article.page = page
    article.issue = issue
    article.content = soup.find('span', {'id': 'contenttext'}).text.strip().replace(u'　', '\n　')
    article.save()
def add_view(request):
    """Show the "add story" form (GET) or create a new Article (POST).

    Anonymous visitors are redirected to the login-required page.  A POST
    missing either the title or the story text re-renders the form with an
    error message; on success the user is redirected to the success page.
    """
    # Page requires authentication to view.
    if not request.user.is_authenticated():
        return HttpResponseRedirect(reverse("articles:login_required"))
    if request.method == "POST":
        title = request.POST.get("title", None)
        text = request.POST.get("story", None)
        # Both fields are mandatory.
        if not title or not text:
            return render(
                request,
                "articles/add.html",
                {"msg": "У вашей истории обязательно должно быть название и сам текст."},
            )
        # request.user already IS the authenticated User — the original
        # re-fetched it by pk, costing an extra query for nothing.
        profile = UserProfile.objects.get(user=request.user)
        article = Article(title=title, text=text, author=profile)
        article.publication_date = datetime.now()
        article.save()
        return HttpResponseRedirect(reverse("articles:add_success"))
    return render(request, "articles/add.html")