# Assumed module context: these are methods on a scraper class. Module-level
# imports assumed: re, datetime, json, requests, bs4, dateutil.parser,
# urljoin (urlparse in Python 2 / urllib.parse in Python 3), and the Django
# models Medium, Article, Journalist.

def _get_rmrb_article(self, content):
    # 人民日报 (People's Daily); Medium pk 1081.
    medium = Medium.objects.get(pk=1081)
    title = content.find('h1').text.strip()

    # The first summary block holds the byline; keep the first name that
    # already exists as a Journalist for this medium.
    author = None
    for author_link in content.find_all('div', {'class': 'summary'})[0].find_all('a'):
        try:
            author = Journalist.objects.get(medium=medium, name=author_link.text.strip())
        except Journalist.DoesNotExist:
            continue
        else:
            break

    # The last summary block carries the issue metadata: 报纸日期 (paper
    # date) and 版名 (page name); default to 头版 (front page).
    page = u'头版'
    publication_date = None
    for li in content.find_all('div', {'class': 'summary'})[-1].find_all('li'):
        if li.text.find(u'报纸日期') != -1:
            publication_date = re.search(r'(\d+)-(\d+)-(\d+)', li.text).group()
        if li.text.find(u'版名') != -1:
            page = li.text.replace('\n', '').replace(u'【版名】', '').replace(' ', '')

    article, created = Article.objects.get_or_create(
        medium=medium,
        title=title,
        issue=self._get_issue_from_date(publication_date, 'rmrb'),
        publication_date=datetime.datetime.strptime(publication_date, '%Y-%m-%d'),
        defaults={'page': page, 'author': author})
    print(article.title)
    return article
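# A minimal sketch of the _get_issue_from_date helper used above, which is
# not shown in this file. It assumes a hypothetical Issue model with medium
# and publication_date fields and a slug-to-pk mapping; the real helper may
# look quite different.
def _get_issue_from_date(self, date_string, medium_slug):
    medium_pks = {'rmrb': 1081, 'nfzm': 951, 'whb': 1399, 'qlwb': 1025}  # assumed mapping
    medium = Medium.objects.get(pk=medium_pks[medium_slug])
    issue, created = Issue.objects.get_or_create(  # Issue model is an assumption
        medium=medium,
        publication_date=datetime.datetime.strptime(date_string, '%Y-%m-%d'))
    return issue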
def _get_nfzm_article(self, url, date, issue):
    # 南方周末 (Southern Weekly); Medium pk 951. The hard-coded PHPSESSID
    # cookie carries a logged-in session so the full article text is served.
    medium = Medium.objects.get(pk=951)
    article = Article()
    article.medium = medium
    article.issue = issue
    article.url = url
    article.publication_date = date

    r = requests.get(url, cookies={'PHPSESSID': 'l19dgbf6ticijmo9ka9osvufk0'})
    content = bs4.BeautifulSoup(r.content)
    article.title = content.title.string.split('-')[-1].strip()
    article.content = content.find('section', {'id': 'articleContent'}).text

    # Byline: the second <em> holds the role and the third the name. Staff
    # reporters (南方周末记者) are linked as Journalist records; contributing
    # writers (南方周末特约撰稿) and the editorial board (南方周末编辑部) are
    # stored as plain name strings.
    byline = content.find('span', {'class': 'author'}).find_all('em')
    if byline[1].text.find(u'南方周末记者') != -1:
        journalist, created = Journalist.objects.get_or_create(
            medium=medium, name=byline[2].text.strip())
        article.author = journalist
    elif byline[1].text.find(u'南方周末特约撰稿') != -1:
        article.author_name = byline[2].text.strip()
    elif byline[1].text.find(u'南方周末编辑部') != -1:
        article.author_name = u'南方周末编辑部'

    print(article.author or article.author_name)
    return article  # returned unsaved; the caller is expected to save it
def _get_whb_article(self, url, date, issue, page):
    # 文汇报 (Wenhui Daily); Medium pk 1399. `url` points at a page index
    # whose div#BT block links to the individual articles on that page.
    medium = Medium.objects.get(pk=1399)
    soup = bs4.BeautifulSoup(requests.get(url).content)
    for link in soup.find('div', {'id': 'BT'}).find_all('a'):
        article_page_url = urljoin(url, link.get('href'))
        r = requests.get(article_page_url)
        if r.status_code == 404:
            continue
        article_page = bs4.BeautifulSoup(r.content)

        # Update the existing article for this URL if there is one,
        # otherwise start a new one.
        if Article.objects.filter(medium=medium, url=article_page_url).count():
            article = Article.objects.get(medium=medium, url=article_page_url)
        else:
            article = Article()
            article.medium = medium
            article.url = article_page_url
        article.publication_date = date
        article.page = page.text.strip()
        article.issue = issue
        print(article_page_url)
        article.title = article_page.title.text.strip().replace(u'文汇报 - ', '')
        # Break paragraphs at the full-width indent spaces.
        article.content = article_page.find('div', {'id': 'articleText'}).text.strip().replace(u'\u3000', '\n ')
        article.save()
def _get_qlwb_article(self, url, date, issue, page):
    # 齐鲁晚报 (Qilu Evening News); Medium pk 1025.
    print(page)
    medium = Medium.objects.get(pk=1025)
    soup = bs4.BeautifulSoup(requests.get(url).content)
    if Article.objects.filter(medium=medium, url=url).count():
        article = Article.objects.get(medium=medium, url=url)
    else:
        article = Article()
        article.medium = medium
    # Break paragraphs at the full-width indent spaces.
    article.title = soup.find('td', {'class': 'font01'}).text.strip().replace(u'\u3000', '\n ')
    article.url = url
    article.publication_date = date
    article.page = page
    article.issue = issue
    article.content = soup.find('span', {'id': 'contenttext'}).text.strip().replace(u'\u3000', '\n ')
    article.save()
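# A hypothetical driver sketch (not part of the original code) showing how a
# per-paper scraper above might be invoked while walking a page index. The
# index URL and the bare <a>-tag link selector are assumptions for
# illustration only.
def _crawl_qlwb_index(self, index_url, date, issue, page):
    soup = bs4.BeautifulSoup(requests.get(index_url).content)
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:
            self._get_qlwb_article(urljoin(index_url, href), date, issue, page)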
# WordPress export import (fragment from inside a loop over the exported
# post objects `obj`): build Article objects keyed by WordPress post ID and
# attach them to `oldissue`; `usr_dict` maps WordPress author IDs to users.
if len(obj['post_content']) == 0:
    continue
if len(obj['post_title']) == 0:
    continue
post_id = obj['ID']
if post_id in post_dict:
    article = post_dict[post_id]
else:
    article = Article()
    post_dict[post_id] = article
article.issue = oldissue
article.title = obj['post_title']
article.save()  # save first so the article has a pk for the authors m2m add
article.date_published = dateutil.parser.parse(obj['post_modified'])
# Drop spaces, collapse blank lines, and wrap paragraphs in <p> tags.
article.text = '<p>%s</p>' % obj['post_content'].replace(' ', '').replace(' \n', '').replace('\n\n', '\n').replace('\n', '</p><p>')
article.authors.add(usr_dict[obj['post_author']])

for k, v in post_dict.items():
    v.save()

category_data = open('termzeroo.json')
data = json.load(category_data)