Example #1
def scrape():
    import requests
    from bs4 import BeautifulSoup
    from django.conf import settings
    import os
    import re  # needed for the re.search() call below
    import shutil
    from news.models import Headline, UserProfile
    from datetime import datetime
    #user_p, created = UserProfile.objects.get_or_create(user=request.user)
    #user_p.last_scrape = datetime.now(timezone.utc)
    #user_p.save()
    url = "https://premierleague.com"
    session = requests.Session()
    session.headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36"
    }
    content = session.get(url + "/news",
                          verify=False).content  # .content grabs all html

    soup = BeautifulSoup(content, 'html.parser')
    articles = soup.find_all("section", {"class": "featuredArticle"})

    for item in articles:
        url_suffix = item.find("a", {"class": "thumbnail thumbLong"})['href']
        news_link = url + url_suffix if not re.search(
            '^https://', url_suffix) else url_suffix
        img_src = item.find("img")['src'].strip()

        new_headline = Headline()
        new_headline.url = news_link
        new_headline.pub_date = datetime.now()
        # use img_src to get the link,
        # then use the link to get the actual image,
        # and save the image in BASE_DIR/src/static

        media_root_path = settings.MEDIA_ROOT
        local_fname = img_src.split("/")[-1].split("?")[0]
        try:
            if not local_fname.startswith(
                    "audioboomgraphics") and local_fname not in os.listdir(
                        media_root_path):
                r = session.get(img_src, stream=True, verify=False)
                with open(local_fname, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024):
                        f.write(chunk)
                cur_img_abspath = os.path.abspath(local_fname)
                shutil.move(cur_img_abspath, media_root_path)
                new_headline.image = local_fname
            elif local_fname in os.listdir(media_root_path):
                new_headline.image = local_fname
        except Exception:
            pass  # skip this article's image if the download or file move fails
        info = get_summary(news_link)
        new_headline.title = info['title']
        new_headline.summary = info['summary']
        try:
            new_headline.save()
        except Exception:
            pass  # likely a duplicate headline; skip it
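
The call to get_summary(news_link) above is to a helper that is not shown. A minimal sketch of what it might look like, assuming it fetches the article page and returns the page title and meta description under the 'title' and 'summary' keys the call site expects; everything beyond those two keys is an assumption:

import requests
from bs4 import BeautifulSoup

def get_summary(link):
    # Hypothetical stand-in for the missing helper: fetch the article
    # page and pull out a title and a short summary.
    resp = requests.get(link, timeout=10)
    soup = BeautifulSoup(resp.content, "html.parser")
    title = soup.title.string.strip() if soup.title and soup.title.string else ""
    meta = soup.find("meta", {"name": "description"})
    summary = meta["content"].strip() if meta and meta.has_attr("content") else ""
    return {"title": title, "summary": summary}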
Example #2
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bo.html)"
    }
    url = "https://timesofindia.indiatimes.com/briefs"

    content = session.get(url, verify=False).content
    soup = BeautifulSoup(content, 'html.parser')
    News = soup.find_all('div', {"class": "brief_box"})
    for article in News:
        try:
            main = article.find('h2').find('a')
        except AttributeError as e:
            print(e)
            continue  # no headline link in this box; skip it
        link = str(main['href'])
        # hrefs are root-relative; join to the domain, not to /briefs
        link = "https://timesofindia.indiatimes.com" + link
        title = main.text
        #image_src = article.find('a')
        #image_src = article.find('div', {"class":"posrel"})
        #image = image_src.find('img')['src']
        #print(image_src)
        new_headline = Headline()
        new_headline.title = title
        #new_headline.image = image
        new_headline.url = link
        new_headline.save()

    return redirect('../')
Example #3
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('article', {"class": "sc-1pw4fyi-5 RkwFH"})
    temp = ""  # fallback: reuse the previous article's image when one is missing
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        title = article.find('h4', {"class": "sc-1qoge05-0 eoIfRA"}).text
        News3 = article.find('img', {"class": "dv4r5q-2 iaqrWM"})
        if News3 is None:
            image_src = temp
        else:
            image_src = News3['srcset'].split(' ')[0]
            temp = image_src
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()

    return redirect("../")
Example #4
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"
    content = session.get(url, verify=True).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('article',
                         {"class": "sc-1pw4fyi-7 gDJTEP js_post_item"})
    for article in News:
        main = article.find_all('a')[0]
        title = article.find_all('h4')[0]
        link = main['href']
        images = main.find('img')
        if images is not None:
            if images.has_attr('srcset'):
                #print(images)
                image_src = str(main.find('img')['srcset']).split(".jpg")[0]
                print('title: ', title.text)
                print('link: ', link)

                titlet = str(title.text)
                image_src = image_src + '.jpg'
                print('image_src: ', image_src)
                if link is not None and image_src is not None and title is not None:
                    new_headline = Headline(title=titlet,
                                            image=image_src,
                                            url=link)
                    new_headline.save()
    return redirect('news')
Example #5
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"

    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('article', {"class": "js_post_item"})
    for article in News:
        title = article.find_all('a', {"class": "js_link"})[-1].text
        link = article.find("a", {"class": "js_link"}).attrs["href"]
        image_src = article.find("a", {"class": "js_link"}).find("img")
        if image_src:
            try:
                image_src = image_src.attrs["srcset"].split(" ")[-4]
            except (KeyError, IndexError):
                try:
                    image_src = image_src.attrs["data-expanded-srcset"].split(" ")[-4]
                except (KeyError, IndexError):
                    continue  # no usable image attribute; skip this article
        else:
            continue
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("../")
Example #6
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)%22%7D"
    }
    url = "https://www.ynet.co.il/home/0,7340,L-8,00.html"

    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('div',
                         {"class": "str3s str3s_small str3s_type_small"})
    Titles = soup.find_all('div', {"class": "title"})
    TitlesText = []
    for title in Titles:
        t = title.text
        TitlesText.append(t)

    i = 0
    new_headline_links = []
    for article in Headline.objects.all():
        new_headline_links.append(article.title)

    for article in News:
        main = article.find_all('a')[0]

        link = main['href']
        image_src = str(main.find('img')['src']).split(" ")[0]

        if (TitlesText[i] in new_headline_links):
            break

        if "https" in link:
            link2 = link
        else:
            link2 = "https://www.ynet.co.il/" + link

        link2 = link2.replace('#autoplay', '')
        articleContent = session.get(link2, verify=False).content
        print(link2)
        soup = BSoup(articleContent, "html.parser")

        new_headline = Headline()

        ok = "פורסם:"
        #header = soup.find_all('div', {"class":"element B3 ghcite noBottomPadding"})[0]
        dates = soup.find_all('span', string=ok)
        print(dates)

        new_headline.date = dates[1].text
        new_headline.title = TitlesText[i]
        new_headline.url = link2
        new_headline.image = image_src
        #if (new_headline.date != 'error#'):
        #    new_headline.save()
        new_headline.save()
        i = i + 1

    return redirect("../")
Example #7
def scrape(request):
    url = "https://www.theonion.com/"
    r = requests.get(url)
    soup = BSoup(r.content, 'html.parser')
    val = soup.find_all('article', {'class': "js_post_item"})
    for link in val:
        main = link.find('a')
        try:
            image_url = str(main.find('img')['data-srcset']).split(" ")[0]
            new_headline = Headline()
            new_headline.image = image_url
            new_headline.url = main['href']
            new_headline.title = link.find('h4').get_text()
            new_headline.save()
        except (TypeError, KeyError, AttributeError):
            pass  # article without the expected image or headline markup
    return redirect("../")
Example #8
def scrape(request): # scrape news articles from theonion.com
  session = requests.Session()
  session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
  url = "https://www.theonion.com/"
  content = session.get(url, verify=False).content
  soup = BSoup(content, "html.parser") # create a soup object
  News = soup.find_all('div', {"class":"curation-module__item"})
  for article in News: # iterate over the soup objects
    main = article.find_all('a')[0]
    link = main['href']
    image_src = str(main.find('img')['srcset']).split(" ")[-4]
    title = main['title']
    new_headline = Headline()
    new_headline.title = title
    new_headline.url = link
    new_headline.image = image_src
    new_headline.save()
  return redirect("../")
def scrape(request):
    Headline.objects.all().delete()

    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }

    url = "https://www.freecodecamp.org/news/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('article', {"class": "post-card"})
    for article in News:
        main = article.find_all('a')[0]
        link = "https://www.freecodecamp.org" + main['href']
        image_src = str(main.find('img')['src'])
        if not "http" in image_src:
            image_src = "https://www.freecodecamp.org" + str(
                main.find('img')['src'])
        title = str(main.find('img')['alt'])
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()

    url2 = "https://www.entrepreneur.com/topic/coders"
    content = session.get(url2, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('div', {"class": "hero"})
    for article in News:
        main = article.find_all('a')[0]
        link = "https://www.entrepreneur.com" + main['href']
        image_ = str(main.find('img')['src'])
        image_ = image_.replace('&blur=50', '')
        title = str(main.find('img')['alt'])
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_
        new_headline.save()

    return redirect("../")
Example #10
def scrape(request):
    session = requests.Session()
    session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
    url = "https://www.theonion.com/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('article', {"class":"pg455-2 gegKxg js_post_item sc-1pw4fyi-7 dCsSCd"})
    for article in News:
        main = article.find_all('a')[0]

        link = main['href']
        image_src = str(main.find('img')['data-anim-src']).split(" ")[0]
        title = main['href'][30:80]  # crude: derive a title from the URL slug
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("/")
Example #11
def scrape(request):
	session = requests.Session()
	session.headers ={"User-Agent":"Googlebot/2.1 (+http://www.google.com/bot.html)"}
	url = "https://www.indiatoday.in/world"

	#for page in range(1,5):
			#url+='?page=%d'%page

	content = session.get(url, verify=False).content
	soup = BSoup(content,"html.parser")
	#News = soup.find_all('div',{"class": "view-content"})

	#for article in News:
	News = soup.find_all('div',{"class": "catagory-listing"})

	for article in News:
	
		image_url = article.find('div',{"class": "pic"}).img['src']
		title=  article.find('div',{"class": "detail"}).h2.a.contents[0]
		link = str(url[:-6]+article.find('div',{"class": "detail"}).h2.a['href'])
		try:
			description = str(article.find('div',{"class": "detail"}).p.text)
		except AttributeError:
			description = str(article.find('div',{"class": "detail"}).p)


		new_headline = Headline()
		new_headline.title = title
		new_headline.url = link
		new_headline.image = image_url
		new_headline.description= description

		try:
			new_headline.save()
		except IntegrityError as e:
			# e.args is a tuple of arguments; match on the message string
			if 'unique constraint' in str(e):
				continue


	return redirect("../")
Example #12
def scrape(request):
    session = requests.Session()
    session.headers = {
        'User-Agent': 'Googlebot/2.1 (+http://www.google.com/bot.html)'
    }
    url = 'https://www.stirileprotv.ro/'
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('div', {"class": "curation-module__item"})
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        image_src = str(main.find('img')['srcset']).split(" ")[-4]
        title = main['title']
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("../")
Example #13
def scrape(request):
    session = requests.Session()
    session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
    url = "https://lite.cnn.com"

    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find('ul').find_all('li')
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        title = main.text
        try:
            new_headline = Headline()
            new_headline.title = title
            new_headline.url = url + link
            new_headline.save()
        except Exception as e:  # was `except e as Exception`, a syntax error
            return HttpResponse(f"Failed {e}")
    # return once, after the loop, instead of after the first article
    return HttpResponse("Success scraping data")
Example #14
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"
    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    #   sc-1pw4fyi-3 ziaet, sc-1qkakgd-0 eSMucW, sc-1whp23a-1 kphRNd, a1de4o-5 cKrhTm
    News = soup.find_all(
        'div', {"class": ["sc-1qkakgd-0 eSMucW", "sc-1whp23a-1 kphRNd"]})
    for article in News:
        main = article.find_all('a')[0]
        link = main['href']
        image_src = str(main.find('img')['srcset']).split(" ")[-4]
        title = "News"
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("../")
Example #15
def scrape(request):
  Headline.objects.all().delete()
  session = requests.Session()
  session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
  url = "https://www.theonion.com/latest"
  content = session.get(url).content
  soup = BSoup(content, "html.parser")
  News = soup.find_all('div', {"class":"cw4lnv-11 dFCKPx"})
  for article in News:
    main = article.find_all('a', href=True)
    linkx = article.find('a', {"class": "sc-1out364-0 hMndXN js_link"})
    link = linkx['href']
    imgx = main[0].find('img', src=True)
    image_src = imgx['srcset'].split(" ")[-4]
    titlex = article.find('h2', {"class": "sc-759qgu-0 cYlVdn cw4lnv-6 eXwNRE"})
    title = titlex.text
    new_headline = Headline()
    #Headline.objects.all().delete()
    new_headline.title = title
    new_headline.url = link
    new_headline.image = image_src
    new_headline.save()
  return redirect("../")
Example #16
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    # fetch through the session so the User-Agent header is actually sent
    # (the original built the session, then bypassed it with ur.urlopen)
    content = session.get("http://www.sinovision.net/").content
    soup = BSoup(content.decode("utf-8"), 'lxml')
    main_news = soup.find('div', {'class': 'centersection-r'})

    for news in main_news.find_all('li', {'class': 'rolltitle'}):

        title = news.find('a').get_text()
        main = news.find_all('a')[0]
        link = main['href']

        if Headline.objects.filter(url=link).exists():
            continue
        elif link == '':
            continue
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.save()
    return redirect("../")  # assumed target; the original had no return
Example #17
def scrape(request):
    session = requests.Session()
    session.verify = False
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://news.abplive.com/news"
    content = session.get(url).content  # session.verify=False already applies
    soup = BSoup(content, "html.parser")
    News = soup.find_all('div', {"class": "other_news"})
    for article in News:
        # skip headlines that already exist
        if not Headline.objects.filter(
                title__iexact=article.a['title']).exists():
            print("not exist")
            new_headline = Headline()
            new_headline.title = article.a['title']
            new_headline.url = article.a['href']
            new_headline.image = article.img['data-src']
            new_headline.save()
    return redirect("../")
Example #18
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://timesofindia.indiatimes.com/briefs"

    content = session.get(url, verify=False).content
    soup = BSoup(content, "html.parser")
    News = soup.find_all('div', {"class": "brief_box"})
    for i in range(min(60, len(News))):  # guard: the page may list fewer boxes
        if i % 6 == 4:
            continue
        article = News[i]
        link = "https://timesofindia.indiatimes.com" + article.h2.a['href']
        title = article.h2.text
        text = article.p.text
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.text = text
        new_headline.save()
    return redirect("../")
Example #19
def scrape(request):
    session = requests.Session()
    session.headers = {
        "User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"
    }
    url = "https://www.theonion.com/"
    #content = session.get(url).text
    page = requests.get(url)

    html = page.text
    soup = BSoup(html, "html.parser")
    News = soup.find_all('article')

    for article in News:
        link = str(article.find('a')['href'])  # href is a single URL; no split needed
        image_src = str(article.find('img')['srcset']).split(" ")[-4]
        title = str(article.find('h4').get_text())
        new_headline = Headline()
        new_headline.title = title
        new_headline.url = link
        new_headline.image = image_src
        new_headline.save()
    return redirect("/")
Example #20
def scrape(request):
	session = requests.Session()
	session.headers = {"User-Agent": "Googlebot/2.1 (+http://www.google.com/bot.html)"}
	url = "https://www.theonion.com/"

	content = session.get(url, verify=False).content 
	soup = BSoup(content, "html.parser")
	News = soup.find_all('div', {"class": "curation-module__item"})
	for article in News:
		main = article.find_all('a')[0]
		link = main['href']
		image_src = str(main.find('img')['srcset']).split(" ")[-4]
		title = main['title']
		new_headline = Headline()
		new_headline.title = title
		new_headline.url = link 
		new_headline.image = image_src
		new_headline.save()
	return redirect("../")


# DataFlair 
def news_list(request):
	headlines = Headline.objects.all()[::-1]
	context = {
		'object_list': headlines,

	}
	return render(request, "news/home.html", context)
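
Headline.objects.all()[::-1] evaluates the entire queryset just to reverse it in Python. Assuming the default auto-increment primary key tracks insertion order, letting the database sort is cheaper; a sketch:

def news_list(request):
    # order_by('-id') reverses in SQL instead of slicing in Python
    headlines = Headline.objects.order_by('-id')
    context = {'object_list': headlines}
    return render(request, "news/home.html", context)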
	
Example #21
def get_news():
    STORIES = 50
    sites = 2
    per_site = STORIES // sites

    fox_html = requests.get('http://www.foxnews.com/politics')
    fox_soup = BeautifulSoup(fox_html.text, 'lxml')

    fox_list = fox_soup.find_all('div', class_='content article-list')
    fox_count = 0
    for lst in fox_list:
        fox = lst.find_all('article', class_='article')
        for article in fox:
            if fox_count < per_site:
                headline = Headline()
                pic = article.find('a').find('img')['src']
                url = article.find('a')['href']
                title = article.find('h4', class_='title').find('a').text
                time_ago = article.find('span', class_='time').text
                leaning = 'right'

                headline.img = pic
                if url[0] == '/':
                    headline.url = "http://www.foxnews.com" + url
                else:
                    headline.url = url
                headline.title = title
                headline.leaning = leaning
                headline.time_ago_str = time_ago
                headline.save()

                fox_count += 1

    politico_html = requests.get('http://www.politico.com/politics')
    politico_soup = BeautifulSoup(politico_html.text, 'lxml')

    politico = politico_soup.find_all('article', class_='story-frag format-sm')
    politico_count = 0
    for article in politico:
        if len(article.find('a').text.split(
                " ")) > 4 and politico_count < per_site:
            headline = Headline()
            if article.find('img') is not None:
                pic = article.find('img')['src']
            else:
                pic = ""
            url = article.find('a')['href']
            title = article.find('a').text
            leaning = 'left'

            now = datetime.datetime.now()
            pub_datetime_str = article.find('time')['datetime']
            pub_datetime = datetime.datetime.strptime(pub_datetime_str,
                                                      '%Y-%m-%d %H:%M:%S')
            # total_seconds(), not .seconds: the latter wraps every 24 hours
            time_ago = int((now - pub_datetime).total_seconds() // 60)

            headline.img = pic
            headline.url = url
            headline.title = title
            headline.leaning = leaning
            headline.mins_ago = time_ago
            headline.save()

            politico_count += 1
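
The time_ago fix above matters because timedelta.seconds holds only the seconds component of the delta and wraps every 24 hours, so a story published yesterday would look minutes old. A self-contained sketch of the corrected computation (the helper name is ours; the format string matches the one used above):

import datetime

def minutes_since(pub_datetime_str, fmt='%Y-%m-%d %H:%M:%S'):
    # total_seconds() covers the full delta; .seconds would wrap daily
    pub = datetime.datetime.strptime(pub_datetime_str, fmt)
    delta = datetime.datetime.now() - pub
    return int(delta.total_seconds() // 60)

print(minutes_since('2020-01-01 12:00:00'))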