def get_opener():
    """Scrape the current opener (front-page lead article) from handelsblatt.com.

    Returns:
        opener: project `opener` object carrying headline, link, authors,
        ressort, sub-ressort, keywords and the outlet `name`.
    """
    url = 'https://www.handelsblatt.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36'
    }
    # BUG FIX: `requests.get(url, headers)` passes the dict as the positional
    # `params` argument, so the User-Agent header was never actually sent.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    # The opener is the first <h1> wrapped in an <a>; the href is site-relative,
    # so prepend the base URL (dropping its trailing slash).
    opener_link = url[:-1] + soup.find('h1').find_parent('a')['href']
    # Fetch the article page for keywords, headline and authors.
    response = requests.get(opener_link, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    headline = soup.find("title").text
    keywords = soup.find("meta", {"name": "keywords"})['content'].split(',')
    authors = [
        a['content'] for a in soup.find_all("meta", {"name": "vr:author"})
    ]
    # BUG FIX: the original tested `len(authors) == 0` and then indexed
    # authors[0], which unconditionally raises IndexError on an empty list.
    # The intent is to split a single comma-joined author string.
    if len(authors) == 1:
        authors = authors[0].split(',')
    # Ressort / sub-ressort are extracted by the project helper.
    # NOTE(review): other scrapers in this file call find_tag(soup, tag, 1);
    # this call signature is kept as-is — confirm against find_tag's definition.
    ressort = find_tag(ressort_tag, response)
    sub_ressort = find_tag(sub_ressort_tag, response)
    op = opener(headline, opener_link, authors, ressort, sub_ressort,
                keywords, name)
    return op
def get_opener():
    """Scrape the current opener (front-page lead article) from n-tv.de.

    Returns:
        opener: project `opener` object with headline, link, authors,
        ressort, sub-ressort, keywords and the outlet `name`.
    """
    url = 'https://www.n-tv.de/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36'
    }
    # BUG FIX: `requests.get(url, headers)` passes the dict as `params`,
    # not as headers — the User-Agent was never sent.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    # The lead article is the first link inside the first grouped <section>.
    opener_link = soup.find('section', {"class": "group"}).find('a')['href']
    # get keywords, headline, author from meta
    response = requests.get(opener_link, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    # Strip the trailing " - n-tv.de"-style suffix from the <title>.
    headline = soup.find("title").text.rsplit(' - ', 1)[0]
    keywords = soup.find("meta",
                         {"name": "news_keywords"})['content'].split(', ')
    authors = [a['content'] for a in soup.find_all("meta", {"name": "author"})]
    # get ressort from breadcrumb: [home, ressort, (sub-ressort), ...]
    breadcrumbs = soup.find("nav", {"class": "breadcrumb"}).find_all('a')
    ressort = breadcrumbs[1].text.strip()
    if len(breadcrumbs) > 2:
        sub_ressort = breadcrumbs[2].text.strip()
    else:
        sub_ressort = None
    op = opener(headline, opener_link, authors, ressort, sub_ressort,
                keywords, name)
    return op
def get_opener():
    """Scrape the current opener (front-page lead article) from sueddeutsche.de.

    Returns:
        opener: project `opener` object with headline, link, authors,
        ressort, sub-ressort, keywords and the outlet `name`.
    """
    url = 'https://www.sueddeutsche.de/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36'
    }
    # get link to opener
    # BUG FIX: `requests.get(url, headers)` passes the dict as `params`,
    # not as headers — the User-Agent was never sent.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    # The lead article is the first link inside <main>.
    opener_link = soup.find("main").find('a')['href']
    # get data on opener
    response = requests.get(opener_link, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    # headline, keywords and authors are in meta tags
    headline = soup.find("meta", {"property": "og:title"})['content']
    keywords = soup.find("meta", {"name": "keywords"})['content'].split(',')
    authors = [
        a['content'] for a in soup.find_all("meta", {"name": "author"})
    ]
    # Ressort lives in an inline JavaScript blob: the first [...] array in
    # the first text/javascript <script> tag is JSON with ressort/thema keys.
    data = soup.find("script", {"type": "text/javascript"})
    data = str(data).split('[')[1].split(']')[0]
    data = json.loads(data)
    ressort = data.get("ressort")
    sub_ressort = data.get("thema")
    # initialise opener
    op = opener(headline, opener_link, authors, ressort, sub_ressort,
                keywords, name)
    return op
def get_opener():
    """Scrape the current opener (front-page lead article) from bild.de.

    Returns:
        opener: project `opener` object with headline, link, authors,
        ressort, sub-ressort, keywords and the outlet `name`.
    """
    url = 'https://www.bild.de/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36'
    }
    # BUG FIX: `requests.get(url, headers)` passes the dict as `params`,
    # not as headers — the User-Agent was never sent.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    # The lead article is the first link in #innerContent; href is relative.
    opener_link = url[:-1] + soup.find(id='innerContent').find("a")['href']
    # get keywords, headline, author
    response = requests.get(opener_link, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    keywords = soup.find("meta", {"name": "keywords"})['content'].split(',')
    # Headline and authors come from the JSON-LD structured-data scripts.
    # BUG FIX: initialise before the loop — if no ld+json block is present
    # (or none carries an author) the original raised NameError below.
    headline = None
    authors = None
    data = soup.find_all('script', type='application/ld+json')
    for d in data:
        # Strip the surrounding <script ...> ... </script> markup;
        # the slice offsets match this site's fixed tag layout.
        inner_script = str(d)[36:-10]
        json_script = json.loads(inner_script)
        headline = json_script.get('headline')
        authors = json_script.get('author')
        if authors is None:
            continue
        else:
            try:
                # Person author: split "A und B" / "A, B" into a list.
                authors_temp = json_script.get('author')[0].get('name')
                authors_temp = re.split(' und |, ', authors_temp)
            except KeyError:
                # Organisation author has no 'name' key in this shape.
                authors_temp = ['Organisation']
            authors = authors_temp
            break
    if headline is None:
        # Fall back to the <title>, dropping the trailing site/ressort suffix.
        headline = soup.find("title").text
        headline = headline.rsplit('-', 2)[0]
        headline = headline.strip()
    ressort = find_tag(soup, ressort_tag, 1)
    sub_ressort = find_tag(soup, sub_ressort_tag, 1)
    op = opener(headline, opener_link, authors, ressort, sub_ressort,
                keywords, name)
    return op
def get_opener():
    """Scrape the current opener (front-page lead article) from zeit.de.

    Returns:
        opener: project `opener` object with headline, link, authors,
        ressort, sub-ressort, keywords and the outlet `name`.
    """
    url = 'https://www.zeit.de/index'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36'
    }
    # BUG FIX: `requests.get(url, headers)` passes the dict as `params`,
    # not as headers — the User-Agent was never sent.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    # The lead article is the first link inside #main.
    opener_link = soup.find(id='main').find("a")['href']
    # get keywords, headline, author
    response = requests.get(opener_link, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    keywords = soup.find("meta", {"name": "keywords"})['content'].split(', ')
    # Drop the trailing " | ZEIT..."-style suffix from the <title>.
    headline = soup.find("title").text.split(' |')[0]
    # Authors come from the JSON-LD structured-data scripts.
    # BUG FIX: initialise before the loop — if no ld+json block carries an
    # author the original left `authors` unbound (NameError at the call).
    # (Also removed a redundant re-parse of the same response.)
    authors = None
    data = soup.find_all('script', type='application/ld+json')
    for d in data:
        # Strip the surrounding <script ...> ... </script> markup;
        # the slice offsets match this site's fixed tag layout.
        inner_script = str(d)[36:-10]
        json_script = json.loads(inner_script)
        authors = json_script.get('author')
        if authors is None:
            continue
        else:
            try:
                # Author may be a list of {'name': ...} dicts ...
                authors_temp = [a.get('name') for a in authors]
            except AttributeError:
                # ... or a single dict.
                authors_temp = [authors.get('name')]
            authors = authors_temp
            break
    ressort = find_tag(soup, ressort_tag, 1)
    sub_ressort = find_tag(soup, sub_ressort_tag, 1)
    # Normalise an empty sub-ressort to None.
    if len(sub_ressort) == 0:
        sub_ressort = None
    op = opener(headline, opener_link, authors, ressort, sub_ressort,
                keywords, name)
    return op
def get_opener():
    """Scrape the current opener (front-page lead article) from faz.net.

    Returns:
        opener: project `opener` object with headline, link, authors,
        ressort, sub-ressort, keywords and the outlet `name`.
    """
    url = 'https://www.faz.net/aktuell/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36'
    }
    # BUG FIX: `requests.get(url, headers)` passes the dict as `params`,
    # not as headers — the User-Agent was never sent.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    # The lead article is the first link inside the "Home" container.
    opener_link = soup.find(class_="Home").find('a')['href']
    # get keywords, headline, author
    response = requests.get(opener_link, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    keywords = soup.find("meta", {"name": "keywords"})['content'].split(', ')
    headline = soup.find("title").text
    # Authors come from the JSON-LD structured-data scripts.
    # BUG FIX: initialise before the loop — if no ld+json block carries an
    # author the original left `authors` unbound (NameError at the call).
    authors = None
    data = soup.find_all('script', type='application/ld+json')
    for d in data:
        # Strip the surrounding <script ...> ... </script> markup;
        # the slice offsets match this site's fixed tag layout.
        inner_script = str(d)[36:-10]
        json_script = json.loads(inner_script)
        authors = json_script.get('author')
        if authors is None:
            continue
        else:
            try:
                # Author may be a list of {'name': ...} dicts ...
                authors_temp = [a.get('name') for a in authors]
            except AttributeError:
                # ... or a single dict.
                authors_temp = [authors.get('name')]
            authors = authors_temp
            break
    # Ressorts live in a JSON attribute of the Adobe analytics element.
    data = soup.find(
        class_="js-adobe-digital-data is-Invisible")['data-digital-data']
    data = json.loads(data)
    ressort = data.get('page').get('ressort')
    sub_ressort = data.get('page').get('subressort1')
    op = opener(headline, opener_link, authors, ressort, sub_ressort,
                keywords, name)
    return op
def get_opener():
    """Scrape the current opener (front-page lead article) from spiegel.de.

    Returns:
        opener: project `opener` object with headline, link, authors,
        ressort, sub-ressort, keywords and the outlet `name`.
    """
    # Fallback strategies for locating the lead-article link, tried in order.
    flist = [find_by_article, find_by_bold_text, find_by_h2_tag, find_first_a]
    url = 'https://www.spiegel.de/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36'
    }
    # BUG FIX: `requests.get(url, headers)` passes the dict as `params`,
    # not as headers — the User-Agent was never sent.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    opener_link = find_opener_link(soup, flist)
    # get keywords, headline, author
    response = requests.get(opener_link, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    # Drop the " - DER SPIEGEL" suffix (and its separator) from the <title>.
    headline = soup.find('title').text.strip().split('DER SPIEGEL')[0][:-3]
    authors = soup.find("meta", {"name": "author"})['content'].split(', ')
    # The meta author list ends with the outlet itself; drop it unless the
    # outlet is the only entry.
    authors = authors[0] if len(authors) == 1 else authors[:-1]
    keywords = soup.find("meta",
                         {"name": "news_keywords"})['content'].split(', ')
    # Structured data: <script type="application/ld+json"> holds a JSON list
    # with the articleSection and a breadcrumb ItemList.
    data = soup.find("script", {"type": "application/ld+json"})
    data = str(data).split('>', 1)[1]
    data = data.split('</script>')[0]
    data = json.loads(data)
    ressort = data[0].get('articleSection')
    sub_ressort = data[1].get("itemListElement")
    sub_ressort = [a.get('item').get('name') for a in sub_ressort]
    # Breadcrumb: [home, ressort, sub-ressort, article]; only take the
    # sub-ressort when the trail is deep enough to contain one.
    if len(sub_ressort) > 3:
        sub_ressort = sub_ressort[2]
    else:
        sub_ressort = None
    op = opener(headline, opener_link, authors, ressort, sub_ressort,
                keywords, name)
    return op
def get_opener():
    """Scrape the current opener (front-page lead article) from welt.de.

    Returns:
        opener: project `opener` object with headline, link, authors,
        ressort, sub-ressort, keywords and the outlet `name`.
    """
    # get link to opener
    url = 'https://www.welt.de/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36'
    }
    # BUG FIX: `requests.get(url, headers)` passes the dict as `params`,
    # not as headers — the User-Agent was never sent.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    # The opener headline sits in the first <section>; its enclosing <a>
    # carries a site-relative href.
    opener_link = url[:-1] + soup.find("section").find(
        class_='o-headline').find_parent('a')['href']
    # get data on opener
    response = requests.get(opener_link, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    # meta tags have headline and keywords; strip the " - WELT"-style suffix.
    headline = soup.find('title').text[:-7]
    keywords = soup.find('meta', {"name": "keywords"})['content'].split(', ')
    # Structured data: author and category live in the tagged ld+json script.
    data = str(soup.find("script", {"type": "application/ld+json",
                                    "data-qa": "StructuredData"}))\
        .split('>', 1)[1].split('</script>')[0]
    data = json.loads(data)
    authors = [data.get('author')['name']]
    ressort = data.get('category')
    # subressort is in breadcrumb: [home, ressort, sub-ressort, ...]
    breadcrumb = soup.find('div', {"class": "c-breadcrumb"})\
        .find_all('li')
    if len(breadcrumb) > 3:
        sub_ressort = breadcrumb[2].text.strip()
    else:
        sub_ressort = None
    op = opener(headline, opener_link, authors, ressort, sub_ressort,
                keywords, name)
    return op