Example 1
import datetime
import re

import feedparser
from django.utils.text import slugify

from .models import Article, Feed  # assumed app-local models


def feed_update():
    feed_updated_list = []
    article_updated_list = []
    for feed in Feed.objects.all():
        # Send the stored ETag so unchanged feeds come back as HTTP 304.
        try:
            feed_update = feedparser.parse(feed.url, etag=feed.etag)
        except Exception:
            print(feed.title, "could not be fetched")
            continue
        if getattr(feed_update, 'status', None) == 304:
            continue  # nothing new since the last poll
        feed_updated_list.append(feed.title)
        # Titles already stored, used to skip duplicate entries.
        existing_titles = set(Article.objects.values_list('title', flat=True))
        article_inner = []
        for entry in feed_update.entries:
            if entry.title in existing_titles:
                continue
            article_inner.append(entry.title)
            article = Article()
            article.title = entry.title
            article.url = entry.link
            # Keep only the bare domain of the entry link.
            try:
                article.domain = re.findall(r'https?://(.*?)/', entry.link)[0]
            except IndexError:
                article.domain = entry.link
            article.author = feed.author or entry.author
            article.authorSlug = slugify(article.author)
            # Strip the "The post ... appeared first on ..." blurb that some
            # feeds append to the description.
            try:
                remove = re.findall(r'<p>(The post.*?)</p>',
                                    entry.description)[0]
                article.description = entry.description.replace(remove, '')
            except IndexError:
                article.description = entry.description
            d = datetime.datetime(*entry.published_parsed[0:6])
            article.publication_date = d.strftime('%Y-%m-%d')
            article.feed = feed
            article.practiceArea = feed.practiceArea
            article.practiceAreaSlug = feed.practiceArea.replace(" ", "_").lower()
            article.save()
        article_updated_list.append(article_inner)
        # Remember the new ETag when the server sends one.
        feed.etag = getattr(feed_update, 'etag', feed.etag)
        feed.save()
    return feed_updated_list, article_updated_list
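A poller like feed_update() is typically run on a schedule. A minimal sketch of wiring it into a Django management command; the feeds.tasks module path and file location are assumptions, not part of the original:

# feeds/management/commands/update_feeds.py  (hypothetical path)
from django.core.management.base import BaseCommand

from feeds.tasks import feed_update  # assumed home of feed_update()


class Command(BaseCommand):
    help = "Poll every Feed and store any new Articles."

    def handle(self, *args, **options):
        updated_feeds, new_articles = feed_update()
        self.stdout.write("Updated feeds: %s" % ", ".join(updated_feeds))

It can then be invoked as python manage.py update_feeds, for example from cron.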
Example 2
from .models import Article  # assumed app-local model


def create_new_article(data):
    # Unpack the scraped-data dictionary into a new Article row.
    new_article = Article()
    new_article.title = data['title']
    new_article.author = data['author']
    new_article.publication_date = data['publication_date']
    new_article.summary = data['summary']
    new_article.article_image = data['image_url']
    new_article.article_url = data['article_url']
    new_article.save()
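create_new_article expects a plain dict whose keys mirror the Article fields used above. A usage sketch, with invented values:

create_new_article({
    'title': 'Sample headline',
    'author': 'Jane Doe',
    'publication_date': '2021-01-01',
    'summary': 'One-line summary of the piece.',
    'image_url': 'https://example.com/image.jpg',
    'article_url': 'https://example.com/story',
})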
Example 3
import time

import requests
from lxml import html
# The original used python-goose under Python 2; goose3 is its Python 3 port.
from goose3 import Goose
from goose3.text import StopWordsChinese

from .models import Article  # assumed app-local model


def test():
    base_url = "http://www.lz13.cn/lizhi/qingchunlizhi.html"
    response = requests.get(base_url)
    parsed_body = html.fromstring(response.text)
    # Collect links to the individual article pages.
    article_urls = parsed_body.xpath('//a[contains(@href, "/qingchunlizhi/")]/@href')
    # Chinese stopwords let Goose locate the main text of a Chinese page.
    g = Goose({'stopwords_class': StopWordsChinese})

    for url in article_urls:
        article = g.extract(url=url)
        art = Article(title=article.title, content=article.cleaned_text)
        art.author = 'lizhi'
        art.save()
        print('get data from %s at %s' % (url, time.ctime()))
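Note that re-running test() stores a duplicate row for every article it has already seen. If that matters, the save could go through Django's get_or_create instead; a sketch, assuming title is unique enough to key on:

        art, created = Article.objects.get_or_create(
            title=article.title,
            defaults={'content': article.cleaned_text, 'author': 'lizhi'},
        )
        if created:
            print('get data from %s at %s' % (url, time.ctime()))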
Example 4
from urllib.request import Request, urlopen

import requests
from bs4 import BeautifulSoup
from django.db import IntegrityError
from django.shortcuts import redirect

from .models import Article  # assumed app-local model


def refresh(request):
	foreign_policy_req = requests.get("https://foreignpolicy.com/category/latest/")
	foreign_policy_soup = BeautifulSoup(foreign_policy_req.content, "html.parser")
	foreign_policy = foreign_policy_soup.find_all('div', {'class': 'excerpt-content--list content-block'})
	for headline in foreign_policy[::-1]:
		new_article = Article()
		new_article.title = headline.find_all('h3', {'class':'hed'})[0].text
		new_article.url = headline.find_all('a', {'class':'hed-heading -excerpt'})[0]['href']
		new_article.image_url = headline.find_all('img')[0]['data-src']
		auth = headline.find_all('a', {'class':'author'})
		if len(auth) != 0:
			new_article.author = auth[0].text
		else:
			new_article.author = "FP"
		new_article.site = "Foreign Policy"
		new_article.site_url = "https://foreignpolicy.com"
		try:
			new_article.save()
		except IntegrityError as e:
			if 'UNIQUE constraint' not in str(e.args):  # duplicates are expected; re-raise anything else
				raise

	foreign_affairs_req = requests.get("https://www.foreignaffairs.com")
	foreign_affairs_soup = BeautifulSoup(foreign_affairs_req.content, "html.parser")
	foreign_affairs = foreign_affairs_soup.find_all('div', {'class' : 'magazine-list-item--image-link row'})
	for headline in foreign_affairs[::-1]:
		new_article = Article()
		new_article.title = headline.find_all('h3', {'class':'article-card-title font-weight-bold ls-0 mb-0 f-sans'})[0].text
		new_article.image_url = headline.find_all('img',{'class':'b-lazy b-lazy-ratio magazine-list-item--image d-none d-md-block'})[0]['data-src']
		if len(new_article.image_url) > 199:  # very long values appear to be lazy-load placeholders, not real image URLs
			new_article.image_url = 'https://subscribe.foreignaffairs.com/FAF/pub_templates/faf/images/logo.png'
		new_article.url = headline.find_all('a', {'class':'d-block flex-grow-1'})[0]['href']
		new_article.author = headline.find_all('h4', {'class':'magazine-author font-italic ls-0 mb-0 f-serif'})[0].text
		new_article.site = "Foreign Affairs"
		new_article.site_url = "https://www.foreignaffairs.com"
		try:
			new_article.save()
		except IntegrityError as e:
			if 'UNIQUE constraint' not in str(e.args):  # duplicates are expected; re-raise anything else
				raise

	# chinapower.csis.org returns 403 unless a browser User-Agent is sent
	china_power_req = Request("https://chinapower.csis.org/podcasts/", headers = {'User-Agent' : 'Mozilla/5.0'})
	china_power_page = urlopen(china_power_req).read()
	china_power_soup = BeautifulSoup(china_power_page, "html.parser")
	china_power = china_power_soup.find_all('article')

	for headline in china_power[::-1]:
		# Find the guest: the episode title names the guest after the word
		# "with", so collect every word that follows it.
		disc = headline.find_all('h2', {'class':'entry-title'})[0].text
		list_disc = disc.split()
		record = False
		list_auth = []
		for name in list_disc:
			if record:
				list_auth.append(name)
			if name == "with":
				record = True

		new_article = Article()
		new_article.title = headline.find_all('h2', {'class':'entry-title'})[0].text
		new_article.image_url = "https://megaphone.imgix.net/podcasts/722b9c2a-e6e1-11ea-a520-3349f6671499/image/uploads_2F1598366366917-v9rdxhpawhc-bee946f884ea9a141d33af2322074d0d_2F_ART_ChinaPower.jpg?ixlib=rails-2.1.2&w=400&h=400"
		new_article.url = headline.find_all('a')[0]['href']
		if len(list_auth) != 0:
			new_article.author = " ".join(list_auth) + " & Bonnie Glaser"
		else:
			new_article.author = "Bonnie Glaser"
		new_article.site = "China Power Podcasts"
		new_article.site_url = "https://chinapower.csis.org/podcasts/"
		try:
			new_article.save()
		except IntegrityError as e:
			if 'UNIQUE constraint' not in str(e.args):  # duplicates are expected; re-raise anything else
				raise

	# War on the Rocks assigns a different div class to each article, so grab the whole post list instead
	warontherocks_req = Request("https://warontherocks.com/", headers = {'User-Agent' : 'Mozilla/5.0'})
	warontherocks_page = urlopen(warontherocks_req).read()
	warontherocks_soup = BeautifulSoup(warontherocks_page, "html.parser")
	warontherocks = warontherocks_soup.find_all('div', {'class' : 'all-posts'})

	# The War on the Rocks markup is straightforward: headings, links, images, and bylines align by index
	header_ = warontherocks[0].find_all('h3')
	link_ = warontherocks[0].find_all('a')
	img_ = warontherocks[0].find_all('img')
	writer_ = warontherocks[0].find_all('h4')

	# Walk the posts oldest-first; each post appears to contribute two anchor
	# tags, so its link sits at index 2*i-1.
	for i in range(12, 1, -1):
		new_article = Article()
		new_article.title = header_[i-1].text
		new_article.image_url = img_[i-1]['src']
		new_article.url = link_[2*i-1]['href']
		new_article.author = writer_[i-1].text
		new_article.site = "War on the Rocks"
		new_article.site_url = "https://warontherocks.com"
		try:
			new_article.save()
		except IntegrityError as e:
			if 'UNIQUE constraint' not in str(e.args):  # duplicates are expected; re-raise anything else
				raise

	"""AP_FP_req = Request("https://apnews.com/hub/foreign-policy", headers = {'User-Agent' : 'Mozilla/5.0'})
	AP_FP_page = urlopen(AP_FP_req).read()
	AP_IL_req = Request("https://apnews.com/hub/international-relations", headers = {'User-Agent' : 'Mozilla/5.0'})
	AP_IL_page = urlopen(AP_IL_req).read()
	AP_FP_soup = BeautifulSoup(AP_FP_page, "html.parser")
	AP_IL_soup = BeautifulSoup(AP_IL_page, "html.parser")
	AP = AP_FP_soup.find_all('div', {'data-key': 'feed-card-wire-story-with-image'}) + AP_IL_soup.find_all('div', {'data-key': 'feed-card-wire-story-with-image'})
	for headline in AP[::-1]:
		new_article = Article()
		new_article.title = headline.find_all('h1')[0].text
		new_article.url= "https://apnews.com" + headline.find_all('a')[0]['href']
		#img machine broke
		img = headline.find_all('img', {'class': 'image-0-2-132'})
		if len(img) == 0:
			new_article.image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0c/Associated_Press_logo_2012.svg/220px-Associated_Press_logo_2012.svg.png"
		else:
			new_article.image_url = img[0]['src']
		list_auth = (headline.find_all('span')[0].text).split(" ")
		if "GMT" in list_auth:
			new_article.author = "AP"
		else:
			new_article.author = headline.find_all('span')[0].text
		new_article.site = "Associated Press"
		new_article.site_url = "https://apnews.com"
		try:
			new_article.save() #checks for errors
		except IntegrityError as e: 
   			if 'UNIQUE constraint' in str(e.args): #a repeat article
   				pass"""

	# Lowy Institute
	LI_req = Request("https://www.lowyinstitute.org/the-interpreter/archive", headers = {'User-Agent' : 'Mozilla/5.0'})
	LI_page = urlopen(LI_req).read()
	LI_soup = BeautifulSoup(LI_page, "html.parser")
	LI = LI_soup.find_all('article')

	for headline in LI[::-1]:
		# Fall back to the alternate thumbnail class when the first is missing.
		thumbs = headline.find_all('div', {'class': 'article-thumb'})
		if not thumbs:
			thumbs = headline.find_all('div', {'class': 'article-thumb-wrap'})
		img = thumbs[0]
		# The image URL sits inside the inline style attribute between single
		# quotes; collect the characters that fall between them.
		word = []
		record = False
		for letter in list(img['style']):
			if record:
				word.append(letter)
			if letter == "'":
				if record:
					word.pop()  # drop the closing quote
					break
				record = True

		new_article = Article()
		new_article.title = headline.find_all('h2', {'class':'article-title txt-f4 txt-s6 mv-0 pv-xs'})[0].text
		new_article.url= "https://www.lowyinstitute.org" + headline.find_all('a', {'class':'txt-dn'})[0]['href']
		new_article.image_url = "".join(word)
		new_article.author = headline.find_all('a', {'class':'txt-dn'})[1].text
		new_article.site = "Lowy Institute"
		new_article.site_url = "https://www.lowyinstitute.org/the-interpreter/archive"
		
		try:
			new_article.save()
		except IntegrityError as e:
			if 'UNIQUE constraint' not in str(e.args):  # duplicates are expected; re-raise anything else
				raise

	return redirect("../")
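The same save-and-ignore-duplicates block appears once per source in refresh(). It could be factored into a small helper so only UNIQUE-constraint violations are swallowed and other database errors still surface; a sketch (save_if_new is a name invented here):

def save_if_new(article):
	# Ignore duplicate rows; let every other IntegrityError propagate.
	try:
		article.save()
	except IntegrityError as e:
		if 'UNIQUE constraint' not in str(e.args):
			raise

Each try/except pair above would then collapse to save_if_new(new_article).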