def feed_update():
    """Poll every stored Feed and create Article rows for unseen entries.

    Sends each feed's saved HTTP ETag so unchanged feeds (status 304) are
    skipped cheaply.  For every new entry (title not already in Article),
    an Article is built from the entry and saved, and the feed's ETag is
    refreshed when the server supplies one.

    Returns:
        (feed_updated_list, article_updated_list): the titles of feeds that
        had changes, and a parallel list of lists of new article titles
        (one inner list per updated feed, possibly empty).
    """
    feed_updated_list = []
    article_updated_list = []
    # Hoisted loop-invariant patterns.
    domain_re = re.compile(r'https?://(.*?)/')
    post_tail_re = re.compile(r'<p>(The post.*?)</p>')
    for feed in Feed.objects.all():
        try:
            parsed = feedparser.parse(feed.url, etag=feed.etag)
        except Exception:
            # Fetch/parse failure: skip this feed instead of falling through
            # and operating on a stale (or unbound) result from a prior feed.
            print("Could not fetch/parse feed: %s" % feed.title)
            continue
        # Feeds with no HTTP status (e.g. bozo/local results) are treated as
        # changed; only an explicit 304 means "not modified since last poll".
        if getattr(parsed, 'status', None) == 304:
            continue
        feed_updated_list.append(feed.title)
        # Set membership is O(1); newly saved titles are added below so a
        # duplicate entry later in the same feed is still skipped.
        known_titles = set(a.title for a in Article.objects.all())
        new_titles = []
        for entry in parsed.entries:
            if entry.title in known_titles:
                continue
            new_titles.append(entry.title)
            article = Article()
            article.title = entry.title
            article.url = entry.link
            domains = domain_re.findall(entry.link)
            article.domain = domains[0] if domains else entry.link
            # Prefer the feed-level author; entries do not always carry one.
            if feed.author:
                article.author = feed.author
            else:
                article.author = getattr(entry, 'author', '')
            article.authorSlug = slugify(article.author)
            # Strip WordPress's trailing "The post ... appeared first on ..."
            # boilerplate from the description when present.
            boiler = post_tail_re.findall(entry.description)
            if boiler:
                article.description = entry.description.replace(boiler[0], '')
            else:
                article.description = entry.description
            d = datetime.datetime(*entry.published_parsed[0:6])
            article.publication_date = d.strftime('%Y-%m-%d')
            article.feed = feed
            article.practiceArea = feed.practiceArea
            article.practiceAreaSlug = feed.practiceArea.replace(" ", "_").lower()
            article.save()
            known_titles.add(entry.title)
        article_updated_list.append(new_titles)
        try:
            feed.etag = parsed.etag
        except AttributeError:
            pass  # server sent no ETag; keep the previously stored one
        feed.save()
    return feed_updated_list, article_updated_list
def create_new_article(data):
    """Build and persist an Article from an unstaged task dictionary.

    Maps the expected dictionary keys onto Article fields and saves the row.
    """
    # model attribute -> dictionary key
    field_map = (
        ('title', 'title'),
        ('author', 'author'),
        ('publication_date', 'publication_date'),
        ('summary', 'summary'),
        ('article_image', 'image_url'),
        ('article_url', 'article_url'),
    )
    article = Article()
    for attr, key in field_map:
        setattr(article, attr, data[key])
    article.save()
def test():
    """Scrape articles from lz13.cn's qingchunlizhi category and store them.

    Fetches the category index page, collects every link whose href contains
    "/qingchunlizhi/", extracts each article's title and body text with Goose
    (configured for Chinese stopwords), and saves each one as an Article
    authored by 'lizhi'.
    """
    base_url = "http://www.lz13.cn/lizhi/qingchunlizhi.html"
    response = requests.get(base_url)
    parsed_body = html.fromstring(response.text)
    article_urls = parsed_body.xpath('//a[contains(@href, "/qingchunlizhi/")]/@href')
    g = Goose({'stopwords_class': StopWordsChinese})
    for url in article_urls:
        extracted = g.extract(url=url)
        art = Article(title=extracted.title, content=extracted.cleaned_text)
        art.author = 'lizhi'
        art.save()
        # Parenthesized single-argument print is valid on both Python 2 and 3;
        # the original bare print statement breaks on Python 3.
        print('get data from %s at %s' % (url, time.ctime()))
def _save_article(article):
    """Best-effort save: repeat articles (UNIQUE constraint) are ignored.

    NOTE(review): the original code also silently swallowed IntegrityErrors
    that were NOT unique-constraint violations; that behavior is preserved
    here to avoid turning a long-standing best-effort sync into a crash.
    """
    try:
        article.save()
    except IntegrityError as e:
        if 'UNIQUE constraint' in str(e.args):
            pass  # a repeat article


def _scrape_foreign_policy():
    """Scrape the Foreign Policy 'latest' listing into Article rows."""
    req = requests.get("https://foreignpolicy.com/category/latest/")
    soup = BeautifulSoup(req.content, "html.parser")
    blocks = soup.find_all('div', {'class': 'excerpt-content--list content-block'})
    # Reverse so the oldest is saved first and the newest ends up on top.
    for headline in blocks[::-1]:
        article = Article()
        article.title = headline.find_all('h3', {'class': 'hed'})[0].text
        article.url = headline.find_all('a', {'class': 'hed-heading -excerpt'})[0]['href']
        article.image_url = headline.find_all('img')[0]['data-src']
        auth = headline.find_all('a', {'class': 'author'})
        article.author = auth[0].text if auth else "FP"
        article.site = "Foreign Policy"
        article.site_url = "https://foreignpolicy.com"
        _save_article(article)


def _scrape_foreign_affairs():
    """Scrape the Foreign Affairs front page into Article rows."""
    req = requests.get("https://www.foreignaffairs.com")
    soup = BeautifulSoup(req.content, "html.parser")
    blocks = soup.find_all('div', {'class': 'magazine-list-item--image-link row'})
    for headline in blocks[::-1]:
        article = Article()
        article.title = headline.find_all('h3', {'class': 'article-card-title font-weight-bold ls-0 mb-0 f-sans'})[0].text
        article.image_url = headline.find_all('img', {'class': 'b-lazy b-lazy-ratio magazine-list-item--image d-none d-md-block'})[0]['data-src']
        # Overlong URLs (presumably inline data URIs) are replaced with the
        # FA logo — assumes the image_url column caps at 200 chars; TODO confirm.
        if len(article.image_url) > 199:
            article.image_url = 'https://subscribe.foreignaffairs.com/FAF/pub_templates/faf/images/logo.png'
        article.url = headline.find_all('a', {'class': 'd-block flex-grow-1'})[0]['href']
        article.author = headline.find_all('h4', {'class': 'magazine-author font-italic ls-0 mb-0 f-serif'})[0].text
        article.site = "Foreign Affairs"
        article.site_url = "https://www.foreignaffairs.com"
        _save_article(article)


def _scrape_china_power():
    """Scrape China Power podcast episodes.

    The site returns 403 for non-browser user agents, hence the explicit
    Mozilla User-Agent header and urlopen instead of requests.
    """
    req = Request("https://chinapower.csis.org/podcasts/", headers={'User-Agent': 'Mozilla/5.0'})
    page = urlopen(req).read()
    soup = BeautifulSoup(page, "html.parser")
    for headline in soup.find_all('article')[::-1]:
        title = headline.find_all('h2', {'class': 'entry-title'})[0].text
        # Guest names are the words following "with" in the episode title.
        words = title.split()
        guests = words[words.index("with") + 1:] if "with" in words else []
        article = Article()
        article.title = title
        article.image_url = "https://megaphone.imgix.net/podcasts/722b9c2a-e6e1-11ea-a520-3349f6671499/image/uploads_2F1598366366917-v9rdxhpawhc-bee946f884ea9a141d33af2322074d0d_2F_ART_ChinaPower.jpg?ixlib=rails-2.1.2&w=400&h=400"
        article.url = headline.find_all('a')[0]['href']
        if guests:
            article.author = " ".join(guests) + " & Bonnie Glaser"
        else:
            article.author = "Bonnie Glaser"
        article.site = "China Power Podcasts"
        article.site_url = "https://chinapower.csis.org/podcasts/"
        _save_article(article)


def _scrape_war_on_the_rocks():
    """Scrape the War on the Rocks front page (browser UA required)."""
    req = Request("https://warontherocks.com/", headers={'User-Agent': 'Mozilla/5.0'})
    page = urlopen(req).read()
    soup = BeautifulSoup(page, "html.parser")
    posts = soup.find_all('div', {'class': 'all-posts'})[0]
    headers = posts.find_all('h3')
    links = posts.find_all('a')
    images = posts.find_all('img')
    writers = posts.find_all('h4')
    # Posts 12 down to 2, oldest first.  Each post appears to contribute two
    # <a> tags, so the link index is 2*i - 1 — TODO confirm against markup.
    for i in range(12, 1, -1):
        article = Article()
        article.title = headers[i - 1].text
        article.image_url = images[i - 1]['src']
        article.url = links[2 * i - 1]['href']
        article.author = writers[i - 1].text
        article.site = "War on the Rocks"
        article.site_url = "https://warontherocks.com"
        _save_article(article)


def _scrape_lowy_institute():
    """Scrape the Lowy Institute Interpreter archive (browser UA required)."""
    req = Request("https://www.lowyinstitute.org/the-interpreter/archive", headers={'User-Agent': 'Mozilla/5.0'})
    page = urlopen(req).read()
    soup = BeautifulSoup(page, "html.parser")
    for headline in soup.find_all('article')[::-1]:
        img = headline.find_all('div', {'class': 'article-thumb'})[0]
        # An empty thumb div (no children) means the image lives on the
        # wrapper variant instead.
        if len(img) == 0:
            img = headline.find_all('div', {'class': 'article-thumb-wrap'})[0]
        # The image URL sits between the first pair of single quotes in the
        # inline style attribute, e.g. background-image: url('...').
        image_url = img['style'].partition("'")[2].partition("'")[0]
        article = Article()
        article.title = headline.find_all('h2', {'class': 'article-title txt-f4 txt-s6 mv-0 pv-xs'})[0].text
        article.url = "https://www.lowyinstitute.org" + headline.find_all('a', {'class': 'txt-dn'})[0]['href']
        article.image_url = image_url
        article.author = headline.find_all('a', {'class': 'txt-dn'})[1].text
        article.site = "Lowy Institute"
        article.site_url = "https://www.lowyinstitute.org/the-interpreter/archive"
        _save_article(article)


def refresh(request):
    """Refresh the Article table by scraping every configured source, then
    redirect back to the listing page.

    Sources are scraped in a fixed order; duplicates are skipped via the
    UNIQUE constraint on Article (see _save_article).
    """
    _scrape_foreign_policy()
    _scrape_foreign_affairs()
    _scrape_china_power()
    _scrape_war_on_the_rocks()
    # NOTE: an Associated Press scraper previously sat here but had been
    # disabled (commented out) because its image markup changed.
    _scrape_lowy_institute()
    return redirect("../")