def recovery_information_sv(url_article):
    """
        Arguments:
            - url of one article
        Returns:
            - information about the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    title = ''
    title = soup_article.find('h1', attrs={'class': 'like-h1'}).get_text()

    # date
    date = soup_article.find("time")["datetime"]

    # author
    author = []
    for span in soup_article.find_all('span', attrs={'class': 'author'}):
        author.append(span.span.get_text())

    # content
    content = ''
    for div in soup_article.find_all('div',
                                     attrs={'class': ['content', 'left']}):
        for p in div.find_all('p'):
            content += p.get_text() + ' '

    # theme
    theme = ''
    tag_meta = soup_article.find('meta', attrs={'property': 'article:tag'})
    theme = tag_meta.get('content')

    article = utils.recovery_article(title, 'Scienceetvie',
                                     author, date, content, theme)
    return(article)
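# --- Illustrative sketch (an assumption, not the project's actual helper) ---
# The shared utils module used throughout this file is not shown. Based on the
# requests.get + BeautifulSoup(data, "lxml") pattern used explicitly in the
# collect_articles functions below, utils.recovery_flux_url_rss presumably does
# something close to this hypothetical stand-in:
import requests
from bs4 import BeautifulSoup


def sketch_recovery_flux_url_rss(url):
    """Hypothetical stand-in: fetch a page and return it parsed with BeautifulSoup."""
    response = requests.get(url)
    return BeautifulSoup(response.text, "lxml")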
def get_article(url):
    soup = utils.recovery_flux_url_rss(url)
    article = soup.find("article")
    # Article title
    title = article.find("h1").get_text()
    # Empty list when there is no author, otherwise list of author(s)
    if article.find("header").find("p", class_="authorsign-label") is None:
        authors = []
    else:
        authors = article.find("header").find(
            "p", class_="authorsign-label").get_text().split(" et ")

    # Publication date of the article
    date_pub = article.find("time").get("datetime")[0:10]

    # Theme of the article
    theme = article.find("ol", class_="breadcrumb-list").find_all("li")[1]\
        .find("span").get_text()
    # Content of the article
    content = ""
    for p in article.find("div", class_="content").find_all("p"):
        content = content + p.get_text()
    # Name of the newspaper
    newspaper = soup.find("footer").find(has_copyright).find("a").get_text()

    return utils.recovery_article(title, newspaper, authors, date_pub, content,
                                  theme)
def get_article(url):
    """
        Takes a URL (url) as argument and returns a dictionary
    """
    from unidecode import unidecode
    soup = utils.recovery_flux_url_rss(url)
    article = soup.find("article")
    meta = soup.find("meta", property="og:title").get("content")
    tab = meta.split("-")
    n = len(tab)
    theme = tab[n - 2]
    title = "-".join(tab[:n - 2])
    authors = []
    regex = re.compile(r'[\n\r\t]')
    for span in article.find_all("span", class_="author--name"):
        author = regex.sub("", unidecode(span.get_text()))
        authors.append(author.strip())
    date_pub = article.find(
        "span",
        itemprop="datePublished").get("datetime")[:10].replace("-", "/")
    date_pub = str(date.datetime.strptime(date_pub, "%d/%m/%Y").date())
    content = ""
    for div in article.find_all("div",
                                class_=[
                                    "article--intro", "article--wysiwyg",
                                    "article--footnotes"
                                ]):
        for p in div.find_all("p"):
            content = content + p.get_text()
    content = regex.sub("", content)
    return utils.recovery_article(title, "Telerama", authors, date_pub,
                                  content, theme)
def collect_articles(list_dictionaries, list_url_articles, theme):
    """Add the articles (dictionaries) from a list of URL in a list of
    dictionaries
    Arguments:
        list_dictionaries {list} -- list of dictionaries
        list_url_articles {list} -- list of URL
        theme {string} -- theme related to the list of dictionaries
    """
    for url_article in list_url_articles:
        try:
            req = requests.get(url_article)
            data = req.text
            soup = BeautifulSoup(data, "lxml")

            balise_title = soup.title.string
            sep = balise_title.split(" - Le Point")
            title = sep[0]

            list_authors = []
            for div in soup.find_all('div'):
                if div.get('class') == ['mbs']:
                    for span in div.find_all('span'):
                        name = span.get_text()
                        name = re.sub('Par', '', name)
                        name = re.sub(r"\s\s+", "", name)
                        list_authors.append(name)

            dates = []
            for balise_time in soup.find_all('time'):
                for valeur in re.finditer('[0-9]{2}\/[0-9]{2}\/[0-9]{4}',
                                          str(balise_time)):
                    dates.append(
                        date.datetime.strptime(valeur.group(0), '%d/%m/%Y'))
            date_publication = date.datetime.strftime(min(dates), '%d/%m/%Y')
            date_publication = str(
                date.datetime.strptime(date_publication, "%d/%m/%Y").date())

            content = ''
            for h2 in soup.find_all('h2'):
                if h2.get('class') == ['art-chapeau']:
                    content += h2.get_text() + " "
            for div in soup.find_all('div'):
                if div.get('class') == ['art-text']:
                    for p in div.find_all('p'):
                        content += p.get_text() + " "

            new_article = utils.recovery_article(title, 'LePoint',
                                                 list_authors,
                                                 date_publication, content,
                                                 theme)
            if not utils.is_empty(new_article):
                list_dictionaries.append(new_article)

        except Exception:
            print("Error while saving the article")
def recovery_information_noob(url_article):
    """
        Arguments:
            - url of one article
        Returns:
            - information about the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    title = soup_article.title.get_text()

    # Retrieval of publication date
    find_date = soup_article.find('time', attrs={"class": "date"})
    for a in find_date.find_all('a'):
        find_valeur = re.compile('[0-9]{4}\/[0-9]{2}\/[0-9]{2}')
        for valeur in find_valeur.finditer(str(a.get("href"))):
            date_p = valeur.group(0)
            date_p = datetime.strptime(date_p, "%Y/%m/%d")\
                .strftime("%Y-%m-%d")

    # Retrieval of the author of the article
    author = []
    for div in soup_article.find_all('div'):
        if re.search('author', str(div.get("class"))):
            author.append(div.p.span.get_text())

    # Retrieval of the article theme
    theme = ""
    for nav in soup_article.find_all('nav'):
        if nav.get("class") == ['breadcrumb']:
            for ol in nav.find_all('ol'):
                for a in ol.find_all('a'):
                    theme = a.get_text()

    # Retrieving the content of the article
    contents = ""
    for div in soup_article.find_all('div'):
        if re.search('body', str(div.get("id"))):
            for aside in div.find_all('aside'):
                for p in aside.find_all('p'):
                    p.string = ""
            for p in div.find_all('p'):
                for a in p.find_all('a'):
                    if a.get("class") == ['lire']:
                        a.string = ""
                for img in p.find_all('img'):
                    p.string = ""
                contents += p.get_text() + " "

    article = utils.recovery_article(title, 'NouvelObservateur',
                                     author, date_p, contents, theme)
    return(article)
def get_article(url):
    """
    Arguments :
        - URL address
    Returns :
        - An article
        {
            "title" : str,
            "newspaper" : str,
            "author" : [str],
            "date_publi" : str,
            "content" : str,
            "theme" : str
            }
    """
    soup = utils.recovery_flux_url_rss(url)
    article = soup.find("article")
    # Article title
    title = article.find("h1").get_text()

    # Empty list when there is no author, otherwise list of author(s)
    authors = [] if article.find("header").find(
        "p", class_="authorsign-label") is None else unidecode(
        article.find("header").find(
            "p", class_="authorsign-label").get_text()).split(" et ")

    # Publication date of the article
    date_pub = article.find("time").get("datetime")[:10]

    # Theme of the article
    theme = article.find("ol", class_="breadcrumb-list")\
        .find_all("li")[1].find("span").get_text()

    # Content of the article
    content = ""
    for p in article.find("div", class_="content").find_all("p"):
        content = content + p.get_text() + " "

    # Name of the newspaper
    newspaper = soup.find("footer").find(has_copyright).find("a").get_text()
    regex = re.compile(r'[\n\r\t]')
    # Remove \n \r \t from the content
    content = regex.sub("", content)
    return utils.recovery_article(
        unidecode(title),
        unidecode(newspaper),
        authors,
        str(date_pub),
        unidecode(content),
        unidecode(theme))
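# --- Illustrative sketch (an assumption, not the project's actual helper) ---
# The docstring above documents the dictionary shape shared by these scrapers.
# utils.recovery_article presumably just packs its arguments into that shape;
# a hypothetical minimal version for reference:
def sketch_recovery_article(title, newspaper, author, date_publi, content, theme):
    """Hypothetical stand-in: pack the scraped fields into the article dict."""
    return {
        "title": title,
        "newspaper": newspaper,
        "author": author,          # list of author names, possibly empty
        "date_publi": date_publi,  # publication date as a string
        "content": content,
        "theme": theme,
    }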
def collect_articles(list_dictionaries, list_url_articles, theme):
    """Add the articles (dictionaries) from a list of URL in a list of
    dictionaries
    Arguments:
        list_dictionaries {list} -- list of dictionaries
        list_url_articles {list} -- list of URL
        theme {string} -- theme related to the list of dictionaries
    """
    for url_article in list_url_articles:
        req = requests.get(url_article)
        data = req.text
        soup = BeautifulSoup(data, "lxml")

        title = soup.title.string

        list_authors = []
        for a in soup.find_all('a'):
            if a.get("class") == ['fig-content-metas__author']:
                name = re.sub("\s\s+", "", a.get_text())
                name = re.sub("\n", "", name)
                list_authors.append(name)
        if len(list_authors) == 0:
            for span in soup.find_all('span'):
                if span.get("class") == ['fig-content-metas__author']:
                    name = re.sub("\s\s+", "", span.get_text())
                    name = re.sub("\n", "", name)
                    list_authors.append(name)

        date_publication = ''
        for marker_time in soup.find_all('time'):
            for valeur in re.finditer('[0-9]{2}\/[0-9]{2}\/[0-9]{4}',
                                      str(marker_time)):
                date_publication = valeur.group(0)
        date_publication = str(
            date.datetime.strptime(date_publication, "%d/%m/%Y").date())

        content = ''
        for p in soup.find_all('p'):
            if p.get("class") == ['fig-content__chapo']:
                content = p.get_text() + " "

        for div in soup.find_all('div'):
            if div.get("class") == ['fig-content__body']:
                for p in div.find_all('p'):
                    content += p.get_text() + " "

        new_article = utils.recovery_article(title, 'LeFigaro', list_authors,
                                             date_publication, content, theme)
        if not utils.is_empty(new_article):
            list_dictionaries.append(new_article)
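# --- Illustrative usage sketch (hypothetical URL and theme) ---
# collect_articles mutates list_dictionaries in place, appending one dictionary
# per article that is not empty according to utils.is_empty. For example:
#
#     articles = []
#     urls = ["http://www.lefigaro.fr/politique/some-article.php"]  # placeholder
#     collect_articles(articles, urls, "politique")
#     print(len(articles), "article(s) collected")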
def collect_articles(list_dictionaries, list_url_articles, theme):

    for url_article in list_url_articles:
        try:
            req = requests.get(url_article)
            data = req.text
            soup = BeautifulSoup(data, 'lxml')

            title = soup.title.string

            list_authors = []
            for a in soup.find_all('a'):
                if a.get("class") == ['fig-content-metas__author']:
                    name = re.sub("\s\s+", "", a.get_text())
                    name = re.sub("\n", "", name)
                    list_authors.append(name)
            if len(list_authors) == 0:
                for span in soup.find_all('span'):
                    if span.get("class") == ['fig-content-metas__author']:
                        name = re.sub("\s\s+", "", span.get_text())
                        name = re.sub("\n", "", name)
                        list_authors.append(name)

            date_publication = ""
            for marker_time in soup.find_all('time'):
                for valeur in re.finditer('[0-9]{2}\/[0-9]{2}\/[0-9]{4}',
                                          str(marker_time)):
                    date_publication = valeur.group(0)
            date_publication = str(
                date.datetime.strptime(date_publication, "%d/%m/%Y").date())

            content = ""
            for p in soup.find_all('p'):
                if p.get("class") == ['fig-content__chapo']:
                    content = p.get_text() + " "

            for div in soup.find_all('div'):
                if div.get("class") == ['fig-content__body']:
                    for p in div.find_all('p'):
                        content += p.get_text() + " "

            new_article = utils.recovery_article(title, 'LeFigaro',
                                                 list_authors,
                                                 date_publication, content,
                                                 theme)
            if not utils.is_empty(new_article):
                list_dictionaries.append(new_article)

        except Exception:
            print("Error while saving the article")
def collect_articles(list_dictionaries, list_url_articles, list_titre):
    """Add the articles (dictionaries) from a list of URLs to a list of
    dictionaries, skipping any article whose title is already in list_titre
    Arguments:
        list_dictionaries {list} -- list of dictionaries
        list_url_articles {list} -- list of URLs
        list_titre {list} -- list of titles already collected
    """
    j = 0
    for url_article in list_url_articles:
        j = j + 1
        soup = utils.recovery_flux_url_rss(url_article)

        for titl in soup.find_all('title'):  # find the title
            tit = titl.get_text()
            if len(tit.split('-')) == 2:
                title = tit.split('-')[0]

        authors = []
        for a in soup.find_all('a'):  # find the authors
            if a.get('href') is not None:
                if "dpi-authors" in a.get('href').split('/'):
                    tit = a.get('href').split('/')[-1]
                    authors.append(tit.split('-')[0] + ' ' + tit.split('-')[1])
        if len(authors) == 0:
            authors.append('')

        dates = []
        date_publication = []
        for balise_time in soup.find_all('time'):  # find publication's date
            if 'pubdate' in balise_time.get('class'):
                dates.append(balise_time.get('datetime').split('T')[0])
                date_publication.append(
                    balise_time.get('datetime').split('T')[0])

        theme = re.search("www.lesoir.be/(.*)/", url_article)[1]

        content = ''
        for p in soup.find_all('p'):
            if len(p.get_text().split(" ")) >= 2:
                content += p.get_text()

        new_article = utils.recovery_article(title, 'lesoir', authors,
                                             date_publication, content, theme)

        if (j == 3):
            time.sleep(71)
            j = 0

        if not utils.is_empty(new_article):
            already_collected = title in list_titre
            if len(content) > 10 and not already_collected:
                list_titre.append(title)
                list_dictionaries.append(new_article)
def recovery_information_hum(url_article):
    """
        Arguments:
            - url of one article
        Returns:
            - information about the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'og:title':
            title = meta.get("content")

    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:section':
            theme = meta.get("content")

    author = []
    for h2 in soup_article.find_all('h2'):
        for a in h2.find_all('a'):
            if re.search('auteur', str(a.get("href"))):
                author.append(a.get_text())

    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:published_time':
            raw_date = meta.get("content")
            date_p = raw_date[0:10]
            date_p = str(datetime.strptime(date_p, "%Y-%m-%d").date())

    contents = ""
    for div in soup_article.find_all('div'):
        if div.get("class") == [
                'field', 'field-name-field-news-chapo', 'field-type-text-long',
                'field-label-hidden'
        ]:
            for p in div.find_all('p'):
                contents += p.get_text()
        if div.get("class") == [
                'field', 'field-name-field-news-text', 'field-type-text-long',
                'field-label-hidden'
        ]:
            for p in div.find_all('p'):
                contents += p.get_text()

    article = utils.recovery_article(title, 'Humanite', author, date_p,
                                     contents, theme)
    return (article)
def recovery_information_fem(url_article):
    """
        Arguments:
            - url of one article
        Returns:
            - information about the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    title = soup_article.title.get_text()
    title = title.split(" - ")
    title = title[0]

    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:published_time':
            raw_date = meta.get("content")
            date_p = raw_date[0:10]

    author = []
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:author':
            author.append(meta.get("content"))

    theme = ""
    for link in soup_article.find_all('link'):
        if link.get("rel") == ['Index']:
            link_theme = link.get("href")
            part_link = link_theme.split("/")
            theme = part_link[3]

    contents = ""
    for div in soup_article.find_all('div'):
        if div.get("class") == ['chapo']:
            for p in div.find_all('p'):
                contents += p.get_text() + " "
        if div.get("class") == ['contenu']:
            for p in div.find_all('p'):
                contents += p.get_text() + " "
        if div.get("class") == ['diaporama']:
            for p in div.find_all('p'):
                contents += p.get_text() + " "
    contents = re.sub(r"\s\s+", " ", contents)

    article = utils.recovery_article(title, 'Femina', author, date_p, contents,
                                     theme)
    return (article)
def recovery_information_lg(url):
    """
        Arguments:
            url : string
        Return :
            article : dictionary
        It retrieves, for each article, the title, newspaper, author, date and theme
    """
    soup = utils.recovery_flux_url_rss(url)

    # Retrieving the title
    title = ''
    balise_title = soup.title.string
    sep = balise_title.split('—')
    title = unidecode.unidecode('—'.join(sep[:-1]))

    tag_context = soup.find('span', attrs={'class': 'context'})

    # Retrieving of author
    author = []
    author.append(tag_context.a.get_text())

    # Retrieving of publication date
    date_p = ''
    regex_date = re.search('[0-9]{2}\/[0-9]{2}\/[0-9]{4}',
                           tag_context.get_text())
    date_p = regex_date.group(0)
    date_p = str(date.datetime.strptime(date_p, '%d/%m/%Y').date())

    # Retrieving the theme
    tag_post_cat = soup.find('ul', attrs={'class': 'post-categories'})
    for li in tag_post_cat.find_all('li'):
        theme = li.get_text()

    # Retrieving the content of the article
    contents = ''
    tag_content = soup.find('div', attrs={'class': 'content'})
    if tag_content:
        for p in tag_content.find_all('p'):
            contents += p.get_text() + " "

    new_article = utils.recovery_article(title, 'LeGorafi', author, date_p,
                                         contents, theme)
    return (new_article)
def recovery_information_lt(url_article):
    """
    Arguments:
        - url of one article
    Returns:
        - information about the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    # Retrieve the title
    title = soup_article.title.string

    # Retrieve the theme
    theme = ""
    for li in soup_article.find_all('li'):
        if li.get("itemprop") == 'itemListElement':
            theme = li.a.span.get_text()

    # Retrieve the author
    author = []
    for span in soup_article.find_all('span'):
        if span.get("class") == ['author-name']:
            author.append(span.a.span.get_text())

    # Retrieve the publication date
    date_p = ""
    for time in soup_article.find_all('time'):
        date_p = time.get("datetime")
        for valeur in re.finditer('[0-9]{4}\-[0-9]{2}\-[0-9]{2}', str(time)):
            date_p = valeur.group(0)

    # Retrieve the content
    contents = ""
    for div in soup_article.find_all('div'):
        if div.get("itemprop") == 'articleBody':
            for p in div.find_all('p'):
                contents += p.get_text() + " "

    article = utils.recovery_article(title, 'LaTribune', author, date_p,
                                     contents, theme)
    return (article)
def recovery_information_fusc(url):
    """
        Arguments:
            url : string
        Return :
            article : dictionary
        It retrieves, for each article, the title, newspaper, author, date and theme
    """
    soup = utils.recovery_flux_url_rss(url)
    # retrieve title
    title = ''
    title = unidecode.unidecode(soup.title.string)
    indice = title.find('|')
    if indice != -1:
        title = title[:indice - 1]
    # retrieve the author
    author = []
    tag_author = soup.find('h3', attrs={'itemprop': 'author'})
    author.append(tag_author.get_text())

    # retrieve date
    publi_date = ''
    regex_date = re.search('[0-9]{2}\/[0-9]{2}\/[0-9]{4}', soup.time.string)
    publi_date = regex_date.group(0)
    publi_date = str(date.datetime.strptime(publi_date, '%d/%m/%Y').date())

    # retrieve content
    content = ''
    for p in soup.find_all('p'):
        # guard against <p> tags that have no class attribute
        classes = p.get('class')
        if classes and re.search('py0p5', classes[-1]):
            content += p.get_text()
    content = unidecode.unidecode(content)

    # retrieve theme
    delimiter = url.split('/')
    theme = delimiter[3]

    article = utils.recovery_article(title, 'FuturaSciences', author,
                                     publi_date, content, theme)
    return (article)
def recovery_information_ld(url):
    """
        Arguments:
            - url of one article
        Returns:
            - information about the article
    """
    soup = utils.recovery_flux_url_rss(url)
    # Retrieve the title
    for meta in soup.find_all('meta'):
        if meta.get("property") == 'og:title':
            title = meta.get("content")

    # Retrieve the publication date
    for time in soup.find_all('time'):
        if time.get("itemprop") == 'datePublished':
            date = time.get("datetime")
            for valeur in re.finditer('[0-9]{2}\/[0-9]{2}\/[0-9]{4}',
                                      str(time)):
                date = valeur.group(0)
                date = datetime.strptime(date, "%d/%m/%Y").strftime("%Y-%m-%d")
    print(date)

    # Retrieve the author
    author = []
    for div in soup.find_all('div'):
        if div.get("class") == ['article_author']:
            author.append(div.span.get_text())

    # Retrieve the content
    content = ""
    for div in soup.find_all('div'):
        if div.get("itemprop") == 'articleBody':
            for p in div.find_all('p'):
                content += p.get_text() + " "

    # Retrieve the theme
    theme = ""
    for h2 in soup.find_all('h2'):
        if h2.get("itemprop") == 'about':
            theme = h2.get_text()

    article = utils.recovery_article(title, 'La Depeche', author, date,
                                     content, theme)
    return (article)
def recovery_article_ld(url):
    """
        Arguments:
            url : string
        Return :
            article : dictionary
        It retrieves, for each article, the title, newspaper, author, date and theme
    """
    soup = utils.recovery_flux_url_rss(url)

    # Retrieve the title
    tag_meta = soup.find('meta', attrs={'property': 'og:title'})
    title = tag_meta.get('content')

    # Retrieve the publication date
    publi_date = ''
    tag_publi_date = soup.find('time', attrs={'itemprop': 'datePublished'})
    regex_date = re.search('[0-9]{2}\/[0-9]{2}\/[0-9]{4}',
                           tag_publi_date.string)
    publi_date = regex_date.group(0)

    # Retrieve the author
    author = []
    for div in soup.find_all('div', attrs={'class': 'article_author'}):
        author.append(div.span.get_text())

    # Retrieve the content
    content = ''
    for div in soup.find_all('div', attrs={'itemprop': 'articleBody'}):
        for p in div.find_all('p'):
            content += p.get_text() + ' '
    # Retrieve the theme
    theme = ""
    theme = soup.find('h2', attrs={'itemprop': 'about'}).get_text()

    # Retrieve all the information about the article
    article = utils.recovery_article(title, 'LaDepeche', author, publi_date,
                                     content, theme)

    return (article)
def recovery_information_sv(url_article):
    """
        Arguments:
            - url of one article
        Returns:
            - information about the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    # title
    title = ""
    for h1 in soup_article.find_all("h1"):
        if h1.get("class") == ["like-h1"]:
            title = h1.get_text()

    # date
    t_date = soup_article.find("time")["datetime"]

    # author
    author = []
    for span in soup_article.find_all('span'):
        if span.get("class") == ["author"]:
            author.append(span.span.get_text())

    # content
    content = ""
    for div in soup_article.find_all('div'):
        if div.get("class") == ['content', 'left']:
            for p in div.find_all('p'):
                content += p.get_text() + " "

    # theme
    theme = ""
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:tag':
            theme = meta.get("content")

    article = utils.recovery_article(title, 'Scienceetvie', author, t_date,
                                     content, theme)
    return (article)
def get_article(url):
    """
        Arguments :
            - URL address
        Returns :
            - Dictionary
    """
    from unidecode import unidecode
    soup = utils.recovery_flux_url_rss(url)
    article = soup.find("article")
    meta = soup.find("meta", property="og:title").get("content")
    tab = meta.split("-")
    n = len(tab)

    theme = tab[n - 2]

    title = "-".join(tab[:n - 2])

    authors = []
    regex = re.compile(r'[\n\r\t]')
    for span in article.find_all("span", class_="author--name"):
        author = regex.sub("", unidecode(span.get_text()))
        authors.append(author.strip())

    date_pub = article.find(
        "span", itemprop="datePublished").get("datetime")[:10].split("-")
    date_pub = date_pub[2] + "-" + date_pub[1] + "-" + date_pub[0]

    content = ""
    for div in article.find_all("div",
                                class_=[
                                    "article--intro", "article--wysiwyg",
                                    "article--footnotes"
                                ]):
        for p in div.find_all("p"):
            content = content + p.get_text()
    content = regex.sub("", content)
    return utils.recovery_article(title, "Telerama", authors, date_pub,
                                  content, theme)
def collect_article(article_link):
    """Extact informations from all these articles

    Arguments:
        article_link {string} -- url of an article

    Returns:
        dict -- dict of all the information
    """

    if "video" in article_link or "/apps/" in article_link or "checknews" in article_link or not re.search(
            r"\d\d\d\d/\d\d/\d\d", article_link):
        return None

    else:
        req = requests.get(article_link)
        data = req.text
        soup = BeautifulSoup(data, "lxml")
        try:
            theme = re.search("http://www.liberation.fr/(.*)", article_link)[1]
            theme = theme.split('/')[0]
        except Exception:
            theme = ''

        if soup.find("div",
                     class_="direct-headband") or article_link != req.url:
            return None
        else:
            balise_title = soup.find("h1")
            balise_title = balise_title.get_text()
            balise_title = re.sub(r"\s\s+", "", balise_title)

            newspaper = "Liberation"
            title = unidecode.unidecode(balise_title)

            date_p = ""
            authors = []
            for span in soup.find_all('span'):
                if span.get("class") == ['author']:
                    if (span.a):
                        author = span.a.string
                        if author:
                            authors.append(author)
                if span.get("class") == ['date']:
                    if (span.time):
                        date_p = date.datetime.strptime(
                            span.time.get("datetime"),
                            "%Y-%m-%dT%H:%M:%S").date()
                        date_p = date_p.strftime("%d/%m/%Y")
            date_p = str(date.datetime.strptime(date_p, "%d/%m/%Y").date())
            if not authors:
                authors = ["liberation"]

            content = ""
            for div in soup.find_all('div'):
                for p in div.find_all('p'):
                    content += p.get_text() + " "
            content = re.sub("<>", "", content)
            content = unidecode.unidecode(content)

            new_article = utils.recovery_article(title, newspaper, authors,
                                                 date_p, content, theme)

            return new_article
def recovery_information_equi(url_article):
    """
        Arguments:
            - url of one article
        Returns:
            - information about the article
    """
    try:
        soup_article = utils.recovery_flux_url_rss(url_article)

        # Retrieving of title
        balise_title = soup_article.title.get_text()
        sep = balise_title.split(" - ")
        title = sep[0]
        title = title.encode("latin1").decode()

        # Retrieving of the author
        author = []
        aut = ""
        for meta in soup_article.find_all('meta'):
            if meta.get("name") == 'Author':
                aut = meta.get("content")
                aut = aut.encode("latin1").decode()
                author.append(aut)

        # Retrieving of date of publication
        date_p = ""
        for div in soup_article.find_all('div'):
            if div.get("class") == ['article__date']:
                for t in div.find_all('time'):
                    if t.get("itemprop") == 'datePublished':
                        raw_date = t.get("datetime")
                        date_p = raw_date[0:10]

        # Retrieving of the article theme
        theme = ""
        for div in soup_article.find_all('div'):
            if div.get("class") == ['navigation__sousmenu']:
                theme = div.get("libelle")

        # Retrieving the content of the article
        contents = ""
        for div in soup_article.find_all('div'):
            if div.get("itemprop") == 'mainEntityOfPage':
                for div2 in div.find_all('div'):
                    for valeur in re.finditer('lire-aussi',
                                              str(div2.get("class"))):
                        div2.string = ""
                    for valeur2 in re.finditer('paragraphe__exergue',
                                               str(div2.get("class"))):
                        div2.string = ""
                for span in div.find_all('span'):
                    span.string = ""
                for block in div.find_all('blockquote'):
                    block.string = ""
                for p in div.find_all('p'):
                    if p.get("data-type") == 'Accroche':
                        contents = p.get_text()
                        contents = re.sub(r"\s\s+", " ", contents)
                        contents = contents.replace('°', ' ° ')

        article = utils.recovery_article(title, 'Equipe', author, date_p,
                                         contents, theme)

    except Exception:
        article = utils.recovery_article('', 'Equipe', '', '', '', '')
    return (article)
def get_information(article_link):
    """Extact informations from all these articles

    Arguments:
        article_link {string} -- url of an article

    Returns:
        dict -- dict of all the information
    """

    if "video" in article_link or "/apps/" in article_link or "checknews" in\
            article_link or not re.search(r"\d\d\d\d/\d\d/\d\d", article_link):
        return None

    else:

        date_article = re.search(r"\d{4}/\d{2}/\d{2}", article_link)[0]
        date_article = date.datetime.strptime(date_article, "%Y/%m/%d")

        diff_date = date.datetime.now() - date_article

        if diff_date.days > 7:
            return None

        else:
            req = requests.get(article_link)
            req.encoding = "utf-8"
            data = req.text
            soup = BeautifulSoup(data, "lxml")

            if soup.find("div",
                         class_="direct-headband") or article_link != req.url:
                return None
            else:
                balise_title = soup.find("h1")
                balise_title = balise_title.get_text()
                balise_title = re.sub(r"\s\s+", "", balise_title)

                newspaper = "Liberation"
                title = unidecode.unidecode(balise_title)

                author = ""
                for span in soup.find_all('span'):
                    if span.get("class") == ['author']:
                        if (span.a):
                            author = span.a.string
                    if span.get("class") == ['date']:
                        if (span.time):
                            date_p = date.datetime.strptime(
                                span.time.get("datetime"),
                                "%Y-%m-%dT" + "%H:%M:%S").date()
                            date_p = date_p.strftime("%Y-%m-%d")
                print(date_p)

                content = ""
                for div in soup.find_all('div'):
                    for p in div.find_all('p'):
                        content += p.get_text() + " "
                content = re.sub("<>", "", content)
                content = unidecode.unidecode(content)

                new_article = utils.recovery_article(title, newspaper,
                                                     [author], date_p, content,
                                                     " ")

                return new_article
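# --- Note on assumed imports (not shown in these snippets) ---
# The functions above rely on module-level imports that fall outside this
# excerpt; based on the names actually used, they presumably include
# something close to the following (hypothetical reconstruction):
#
#     import re
#     import time
#     import requests
#     import datetime as date        # used as date.datetime.strptime(...)
#     from datetime import datetime  # used directly in some snippets
#     from bs4 import BeautifulSoup
#     import unidecode               # and/or: from unidecode import unidecode
#     import utils                   # shared helpers: recovery_flux_url_rss,
#                                    # recovery_article, is_empty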