Ejemplo n.º 1
0
def get_opener():
    """Build an ``opener`` for the lead article on handelsblatt.com.

    The lead article is the link that wraps the first ``<h1>`` on the front
    page. Headline, keywords and authors are read from the article's
    ``<title>`` and ``<meta>`` tags; ressort and sub-ressort are delegated to
    ``find_tag``.

    ``opener``, ``name``, ``find_tag``, ``ressort_tag`` and
    ``sub_ressort_tag`` are defined outside this function.

    Returns:
        The result of ``opener(headline, opener_link, authors, ressort,
        sub_ressort, keywords, name)``.
    """
    from urllib.parse import urljoin

    url = 'https://www.handelsblatt.com/'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36'
    }

    # get opener link
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is `params`, which would put the dict into the query string.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    # urljoin resolves relative hrefs and leaves absolute ones untouched.
    opener_link = urljoin(url, soup.find('h1').find_parent('a')['href'])

    # get keywords, headline, author
    response = requests.get(opener_link, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    headline = soup.find("title").text
    keywords = soup.find("meta", {"name": "keywords"})['content'].split(',')
    authors = [
        a['content'] for a in soup.find_all("meta", {"name": "vr:author"})
    ]

    # A single meta tag may hold several comma-separated names; split it.
    # (Checking for an empty list here would index into nothing.)
    if len(authors) == 1:
        authors = [a.strip() for a in authors[0].split(',')]

    # find ressorts
    ressort = find_tag(ressort_tag, response)
    sub_ressort = find_tag(sub_ressort_tag, response)

    return opener(headline, opener_link, authors, ressort, sub_ressort,
                  keywords, name)
Ejemplo n.º 2
0
def get_opener():
    """Build an ``opener`` for the lead article on n-tv.de.

    The lead article is the first ``<a>`` inside the first
    ``<section class="group">`` on the front page. Headline, keywords and
    authors come from ``<title>`` / ``<meta>`` tags; ressort and sub-ressort
    come from the breadcrumb navigation.

    ``opener`` and ``name`` are defined outside this function.

    Returns:
        The result of ``opener(headline, opener_link, authors, ressort,
        sub_ressort, keywords, name)``.
    """
    url = 'https://www.n-tv.de/'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36'
    }
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is `params`, which would put the dict into the query string.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    opener_link = soup.find('section', {"class": "group"}).find('a')['href']

    # get keywords, headline, author from meta
    response = requests.get(opener_link, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    # Drop everything after the last ' - ' in the page title.
    headline = soup.find("title").text.rsplit(' - ', 1)[0]
    keywords = soup.find("meta",
                         {"name": "news_keywords"})['content'].split(', ')
    authors = [a['content'] for a in soup.find_all("meta", {"name": "author"})]

    # get ressort from breadcrumb: second link is the ressort, third (if
    # present) the sub-ressort.
    breadcrumbs = soup.find("nav", {"class": "breadcrumb"}).find_all('a')
    ressort = breadcrumbs[1].text.strip()
    sub_ressort = breadcrumbs[2].text.strip() if len(breadcrumbs) > 2 else None

    return opener(headline, opener_link, authors, ressort, sub_ressort,
                  keywords, name)
Ejemplo n.º 3
0
def get_opener():
    """Build an ``opener`` for the lead article on sueddeutsche.de.

    The lead article is the first ``<a>`` inside ``<main>`` on the front
    page. Headline, keywords and authors come from ``<meta>`` tags; ressort
    and sub-ressort are parsed out of the first
    ``<script type="text/javascript">`` element.

    ``opener`` and ``name`` are defined outside this function.

    Returns:
        The result of ``opener(headline, opener_link, authors, ressort,
        sub_ressort, keywords, name)``.
    """
    url = 'https://www.sueddeutsche.de/'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36'
    }

    # get link to opener
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is `params`, which would put the dict into the query string.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    opener_link = soup.find("main").find('a')['href']

    # get data on opener
    response = requests.get(opener_link, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")

    # headline, keywords and authors are in meta tags
    headline = soup.find("meta", {"property": "og:title"})['content']
    keywords = soup.find("meta", {"name": "keywords"})['content'].split(',')
    authors = [
        a['content'] for a in soup.find_all("meta", {"name": "author"})
    ]

    # ressort is in JavaScript: take the text between the first '[' and the
    # next ']' of the serialised <script> tag and parse it as JSON.
    data = soup.find("script", {"type": "text/javascript"})
    data = str(data).split('[')[1].split(']')[0]
    data = json.loads(data)
    ressort = data.get("ressort")
    sub_ressort = data.get("thema")

    # initialise opener
    return opener(headline, opener_link, authors, ressort, sub_ressort,
                  keywords, name)
Ejemplo n.º 4
0
def get_opener():
    """Build an ``opener`` for the lead article on bild.de.

    The lead article is the first ``<a>`` inside the element with
    ``id="innerContent"`` on the front page. Keywords come from a ``<meta>``
    tag; headline and authors come from the article's JSON-LD
    (``application/ld+json``) scripts, with the ``<title>`` as a headline
    fallback. Ressort and sub-ressort are delegated to ``find_tag``.

    ``opener``, ``name``, ``find_tag``, ``ressort_tag`` and
    ``sub_ressort_tag`` are defined outside this function.

    Returns:
        The result of ``opener(headline, opener_link, authors, ressort,
        sub_ressort, keywords, name)``. ``authors`` is ``None`` when no
        JSON-LD script carries an ``author`` entry.
    """
    from urllib.parse import urljoin

    url = 'https://www.bild.de/'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36'
    }
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is `params`, which would put the dict into the query string.
    response = requests.get(url, headers=headers)

    soup = BeautifulSoup(response.text, features="html.parser")
    # urljoin resolves relative hrefs and leaves absolute ones untouched.
    opener_link = urljoin(url, soup.find(id='innerContent').find("a")['href'])

    # get keywords, headline, author
    response = requests.get(opener_link, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    keywords = soup.find("meta", {"name": "keywords"})['content'].split(',')
    data = soup.find_all('script', type='application/ld+json')

    # Pre-bind both names so a page without any JSON-LD script does not raise
    # UnboundLocalError further down.
    headline = None
    authors = None
    for d in data:
        # Cut the payload out between the opening tag's '>' and '</script>'
        # instead of relying on fixed character offsets.
        inner_script = str(d).split('>', 1)[1].rsplit('</script>', 1)[0]
        json_script = json.loads(inner_script)
        headline = json_script.get('headline')
        authors = json_script.get('author')
        if authors is None:
            continue

        try:
            authors_temp = authors[0].get('name')
            authors_temp = re.split(' und |, ', authors_temp)
        except KeyError:
            # `author` was a mapping rather than a list, so indexing with 0
            # failed.
            authors_temp = ['Organisation']

        authors = authors_temp
        break

    if headline is None:
        # Fall back to the page title without its last two '-' separated
        # parts.
        headline = soup.find("title").text
        headline = headline.rsplit('-', 2)[0]
        headline = headline.strip()

    ressort = find_tag(soup, ressort_tag, 1)
    sub_ressort = find_tag(soup, sub_ressort_tag, 1)

    return opener(headline, opener_link, authors, ressort, sub_ressort,
                  keywords, name)
Ejemplo n.º 5
0
def get_opener():
    """Build an ``opener`` for the lead article on zeit.de.

    The lead article is the first ``<a>`` inside the element with
    ``id="main"`` on the index page. Keywords and headline come from
    ``<meta>`` / ``<title>``; authors come from the first JSON-LD
    (``application/ld+json``) script that has an ``author`` entry. Ressort
    and sub-ressort are delegated to ``find_tag``.

    ``opener``, ``name``, ``find_tag``, ``ressort_tag`` and
    ``sub_ressort_tag`` are defined outside this function.

    Returns:
        The result of ``opener(headline, opener_link, authors, ressort,
        sub_ressort, keywords, name)``. ``authors`` is ``None`` when no
        JSON-LD script carries an ``author`` entry.
    """
    url = 'https://www.zeit.de/index'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36'
    }
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is `params`, which would put the dict into the query string.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    opener_link = soup.find(id='main').find("a")['href']

    # get keywords, headline, author
    response = requests.get(opener_link, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    keywords = soup.find("meta", {"name": "keywords"})['content'].split(', ')
    # Keep only the part of the title before the first ' |'.
    headline = soup.find("title").text.split(' |')[0]

    data = soup.find_all('script', type='application/ld+json')

    # Pre-bind so a page without any JSON-LD script does not raise
    # UnboundLocalError when `authors` is passed on below.
    authors = None
    for d in data:
        # Cut the payload out between the opening tag's '>' and '</script>'
        # instead of relying on fixed character offsets.
        inner_script = str(d).split('>', 1)[1].rsplit('</script>', 1)[0]
        json_script = json.loads(inner_script)
        authors = json_script.get('author')
        if authors is None:
            continue

        try:
            authors_temp = [a.get('name') for a in authors]
        except AttributeError:
            # `author` was a single mapping, not a list of mappings.
            authors_temp = [authors.get('name')]

        authors = authors_temp
        break

    ressort = find_tag(soup, ressort_tag, 1)
    sub_ressort = find_tag(soup, sub_ressort_tag, 1)
    if len(sub_ressort) == 0:
        sub_ressort = None

    return opener(headline, opener_link, authors, ressort, sub_ressort,
                  keywords, name)
Ejemplo n.º 6
0
def get_opener():
    """Build an ``opener`` for the lead article on faz.net.

    The lead article is the first ``<a>`` inside the first element with
    class ``Home`` on the ``/aktuell/`` page. Keywords and headline come from
    ``<meta>`` / ``<title>``; authors come from the first JSON-LD
    (``application/ld+json``) script that has an ``author`` entry. Ressort
    and sub-ressort are read from the JSON stored in the
    ``data-digital-data`` attribute of the ``js-adobe-digital-data`` element.

    ``opener`` and ``name`` are defined outside this function.

    Returns:
        The result of ``opener(headline, opener_link, authors, ressort,
        sub_ressort, keywords, name)``. ``authors`` is ``None`` when no
        JSON-LD script carries an ``author`` entry.
    """
    url = 'https://www.faz.net/aktuell/'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36'
    }
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is `params`, which would put the dict into the query string.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    opener_link = soup.find(class_="Home").find('a')['href']

    # get keywords, headline, author
    response = requests.get(opener_link, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    keywords = soup.find("meta", {"name": "keywords"})['content'].split(', ')
    headline = soup.find("title").text

    data = soup.find_all('script', type='application/ld+json')

    # Pre-bind so a page without any JSON-LD script does not raise
    # UnboundLocalError when `authors` is passed on below.
    authors = None
    for d in data:
        # Cut the payload out between the opening tag's '>' and '</script>'
        # instead of relying on fixed character offsets.
        inner_script = str(d).split('>', 1)[1].rsplit('</script>', 1)[0]
        json_script = json.loads(inner_script)
        authors = json_script.get('author')
        if authors is None:
            continue

        try:
            authors_temp = [a.get('name') for a in authors]
        except AttributeError:
            # `author` was a single mapping, not a list of mappings.
            authors_temp = [authors.get('name')]

        authors = authors_temp
        break

    # Ressort information is embedded as JSON in a data attribute.
    data = soup.find(
        class_="js-adobe-digital-data is-Invisible")['data-digital-data']
    data = json.loads(data)
    ressort = data.get('page').get('ressort')
    sub_ressort = data.get('page').get('subressort1')

    return opener(headline, opener_link, authors, ressort, sub_ressort,
                  keywords, name)
Ejemplo n.º 7
0
def get_opener():
    """Build an ``opener`` for the lead article on spiegel.de.

    The lead-article link is located by ``find_opener_link`` using a list of
    finder functions. Headline, authors and keywords come from ``<title>`` /
    ``<meta>`` tags; ressort and sub-ressort come from the first JSON-LD
    (``application/ld+json``) script on the article page.

    ``opener``, ``name``, ``find_opener_link`` and the ``find_*`` finder
    functions are defined outside this function.

    Returns:
        The result of ``opener(headline, opener_link, authors, ressort,
        sub_ressort, keywords, name)``.
    """
    flist = [find_by_article, find_by_bold_text, find_by_h2_tag, find_first_a]
    url = 'https://www.spiegel.de/'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36'
    }
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is `params`, which would put the dict into the query string.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    opener_link = find_opener_link(soup, flist)

    # get keywords, headline, author
    response = requests.get(opener_link, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    # Keep the title text before 'DER SPIEGEL' and drop the 3 characters
    # preceding it.
    headline = soup.find('title').text.strip().split('DER SPIEGEL')[0][:-3]
    authors = soup.find("meta", {"name": "author"})['content'].split(', ')
    # NOTE(review): a single entry is passed on as a plain string, several
    # entries as a list without the last one — confirm `opener` expects both.
    authors = authors[0] if len(authors) == 1 else authors[:-1]
    keywords = soup.find("meta",
                         {"name": "news_keywords"})['content'].split(', ')

    # structured data
    # <script type="application/ld+json">
    data = soup.find("script", {"type": "application/ld+json"})
    data = str(data).split('>', 1)[1]
    data = data.split('</script>')[0]
    data = json.loads(data)

    # The payload is indexed as a sequence: entry 0 carries the article
    # section, entry 1 the breadcrumb item list.
    ressort = data[0].get('articleSection')
    sub_ressort = data[1].get("itemListElement")

    sub_ressort = [a.get('item').get('name') for a in sub_ressort]
    # Only a breadcrumb with more than three entries has a sub-ressort; it is
    # the third entry.
    sub_ressort = sub_ressort[2] if len(sub_ressort) > 3 else None

    return opener(headline, opener_link, authors, ressort, sub_ressort,
                  keywords, name)
Ejemplo n.º 8
0
def get_opener():
    """Build an ``opener`` for the lead article on welt.de.

    The lead article is the link wrapping the first ``o-headline`` element
    inside the first ``<section>`` on the front page. Headline and keywords
    come from ``<title>`` / ``<meta>``; author and ressort come from the
    JSON-LD script marked ``data-qa="StructuredData"``; the sub-ressort comes
    from the breadcrumb.

    ``opener`` and ``name`` are defined outside this function.

    Returns:
        The result of ``opener(headline, opener_link, authors, ressort,
        sub_ressort, keywords, name)``.
    """
    from urllib.parse import urljoin

    # get link to opener
    url = 'https://www.welt.de/'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2866.71 Safari/537.36'
    }
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is `params`, which would put the dict into the query string.
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    # urljoin resolves relative hrefs and leaves absolute ones untouched.
    opener_link = urljoin(
        url,
        soup.find("section").find(class_='o-headline').find_parent('a')['href'])

    # get data on opener
    response = requests.get(opener_link, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")

    # meta tags have headline and keywords; the last 7 characters of the
    # title are cut off.
    headline = soup.find('title').text[:-7]
    keywords = soup.find('meta', {"name": "keywords"})['content'].split(', ')

    # structured data
    script = soup.find(
        "script", {"type": "application/ld+json", "data-qa": "StructuredData"})
    data = str(script).split('>', 1)[1].split('</script>')[0]
    data = json.loads(data)
    authors = [data.get('author')['name']]
    ressort = data.get('category')

    # subressort is in breadcrumb: third <li>, but only when there are more
    # than three entries.
    breadcrumb = soup.find('div', {"class": "c-breadcrumb"}).find_all('li')
    sub_ressort = breadcrumb[2].text.strip() if len(breadcrumb) > 3 else None

    return opener(headline, opener_link, authors, ressort, sub_ressort,
                  keywords, name)