Example #1
def get_rss_infos():
    """Get all articles link

    Returns:
        list -- list of articles url
    """

    url_rss_lib = "http://www.liberation.fr/rss"
    soup = utils.recovery_flux_url_rss(url_rss_lib)

    rss_items = soup.find_all("li")

    rss_list = []

    link_rss = []

    for ri in rss_items:
        if ri.get("class") == ['rss-item']:
            rss_list.append(ri.a.get('href'))

    for rl in rss_list:
        soup = utils.recovery_flux_url_rss(rl)
        entre = soup.find_all('entry')
        for e in entre:
            link_rss.append(e.link.get('href'))

    return link_rss
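
# Every example in this listing relies on a shared utils module that is not
# shown. A minimal sketch of utils.recovery_flux_url_rss, assuming it simply
# fetches a URL with requests and parses it with BeautifulSoup (the real
# helper may add retries, headers or a different parser):
def recovery_flux_url_rss(url):
    """Fetch a page or RSS feed and return it as a BeautifulSoup object."""
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")
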
def recovery_link_new_articles_ld(url_rss):
    # We retrieve the rss feeds for each article page.
    # Each HTML-coded article is scanned with beautiful soup.
    soup = utils.recovery_flux_url_rss(url_rss)
    list_link = []
    for link in soup.find_all("a"):
        if link.get("class") == ["rss"]:
            url = link.get("href")
            url = "https://www.ladepeche.fr/" + url
            soup = utils.recovery_flux_url_rss(url)
            items = soup.find_all("item")
            # We retrieve all articles
            for item in items:
                list_link.append(re.search(r"<link/>(.*)", str(item))[1])
    return (list_link)
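
# Why the re.search(r"<link/>(.*)", str(item)) pattern above (and in several
# examples below) works: if utils.recovery_flux_url_rss parses the feed with
# an HTML parser, as it presumably does, <link> is treated as a void element,
# so the URL becomes a text node after an empty <link/> tag and
# item.link.get_text() returns "". A self-contained illustration:
def _demo_link_regex():
    import re
    from bs4 import BeautifulSoup

    rss = "<item><link>http://example.com/a1</link>\n<title>t</title></item>"
    item = BeautifulSoup(rss, "html.parser").find("item")
    print(item.link.get_text())                     # -> "" (empty)
    print(re.search(r"<link/>(.*)", str(item))[1])  # -> http://example.com/a1
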
def is_article(url):
    """
        Prend en argument une adresse url et retourne
        vrai s'il est une article et faux sinon
    """
    soup = utils.recovery_flux_url_rss(url)
    return soup.find("div", class_="article--text") is not None
def recovery_old_articles_LD(
        file_target='/var/www/html/projet2018/data/clean/robot/' +
    str(datetime.datetime.now().date())):
    """
        it create a json for each article
    """
    list_category = [
        'grand-sud', 'actu', 'faits-divers', 'economie', 'sports', 'sante',
        'tv-people', 'sorties'
    ]
    links_article = []
    list_articles = []
    for cat in list_category:
        for i in range(1, 100):
            try:
                url = 'https://www.ladepeche.fr/recherche/?p=' + str(i)\
                        + '&c=' + cat + '&plus-infos=1'
                soup = utils.recovery_flux_url_rss(url)
            except Exception:
                break

            # Collect the article links of every result page, not only the
            # last page fetched
            for h2 in soup.find_all('h2'):
                for item in h2.find_all('a'):
                    link = 'https://www.ladepeche.fr' + str(item.get('href'))
                    links_article.append(link)

        for link in links_article:
            new_article = recovery_article_ld(link)
            if not utils.is_empty(new_article):
                list_articles.append(new_article)
        utils.create_json(file_target, list_articles, "Ladepeche", "LD")
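
# The crawlers above and below also call utils.is_empty and utils.create_json,
# which are not shown in this listing. The signatures are taken from the calls
# in these examples; the bodies below are only assumed sketches (the real
# helpers may validate or format things differently).
def is_empty(article):
    # Assumption: an article dictionary counts as empty when it has no
    # usable text content
    return not article or not article.get("content")


def create_json(file_target, articles, subfolder, prefix):
    # Assumption: dump a batch of article dictionaries into a timestamped
    # JSON file under file_target/subfolder, using prefix in the file name
    import json
    import os
    import time

    directory = os.path.join(file_target, subfolder)
    os.makedirs(directory, exist_ok=True)
    path = os.path.join(directory,
                        prefix + "_" + str(int(time.time())) + ".json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(articles, f, ensure_ascii=False)
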
Example #5
def get_article(url):
    """
        Prend en argument une adresse url (url) et retourne un dictionnaire
    """
    from unidecode import unidecode
    soup = utils.recovery_flux_url_rss(url)
    article = soup.find("article")
    meta = soup.find("meta", property="og:title").get("content")
    tab = meta.split("-")
    n = len(tab)
    theme = tab[n - 2]
    title = "-".join(tab[:n - 2])
    authors = []
    regex = re.compile(r'[\n\r\t]')
    for span in article.find_all("span", class_="author--name"):
        author = regex.sub("", unidecode(span.get_text()))
        authors.append(author.strip())
    date_pub = article.find(
        "span",
        itemprop="datePublished").get("datetime")[:10].replace("-", "/")
    date_pub = str(date.datetime.strptime(date_pub, "%d/%m/%Y").date())
    content = ""
    for div in article.find_all("div",
                                class_=[
                                    "article--intro", "article--wysiwyg",
                                    "article--footnotes"
                                ]):
        for p in div.find_all("p"):
            content = content + p.get_text()
    content = regex.sub("", content)
    return utils.recovery_article(title, "Telerama", authors, date_pub,
                                  content, theme)
def get_article(url):
    soup = utils.recovery_flux_url_rss(url)
    article = soup.find("article")
    # Article title
    title = article.find("h1").get_text()
    # Empty list when there is no author, otherwise a list of author(s)
    if article.find("header").find("p", class_="authorsign-label") is None:
        authors = []
    else:
        authors = article.find("header").find(
            "p", class_="authorsign-label").get_text().split(" et ")

    # Publication date of the article
    date_pub = article.find("time").get("datetime")[0:10]

    # Article theme
    theme = article.find("ol", class_="breadcrumb-list").find_all("li")[1]\
        .find("span").get_text()
    # Article content
    content = ""
    for p in article.find("div", class_="content").find_all("p"):
        content = content + p.get_text()
    # Newspaper name
    newspaper = soup.find("footer").find(has_copyright).find("a").get_text()

    return utils.recovery_article(title, newspaper, authors, date_pub, content,
                                  theme)
Example #7
def recovery_link_new_articles_noob_crawler():
    """
        Arguments:
            - url of the page containing feed links for
            the different categories
        Returns :
            - list of urls of the different categories
    """
    list_category = ["politique", "monde", "economie", "culture",
                     "editos-et-chroniques", "debat"]

    article_noob = []
    for cat in list_category:
        # We retrieve the URL feeds for each page of article
        # Each HTML-coded article is analyzed with beautiful soup
        for i in range(2, 8):
            url_rss_noob = "http://www.nouvelobs.com/" + cat +\
                "/page-" + str(i) + ".html"

            soup_url = utils.recovery_flux_url_rss(url_rss_noob)

            # We retrieve all the articles for a given page
            for h3 in soup_url.find_all('h3'):
                if h3.get("class") == ['title']:
                    if re.search(r'^/', str(h3.a.get("href"))):
                        new_article = "http://www.nouvelobs.com" +\
                            h3.a.get("href")
                        article_noob.append(new_article)

    return(article_noob)
def recovery_link_old_articles_equi(url_rss):
    """
        Argument:
            url_rss : string
        Return:
            link_article = list
        Retrieving links of new articles thanks to the rss feed
    """
    list_category = [
        "Athletisme", "Aussi/Aviron", "Auto-moto", "Aussi/Badminton",
        "Aussi/Baseball", "Basket", "Aussi/Biathlon", "Aussi/Boxe",
        "Aussi/Canoe-kayak", "Cyclisme", "Aussi/Equitation", "Aussi/Escrime",
        "Adrenaline/Escalade", "Football", "Aussi/Football-americain",
        "Formule-1", "Golf", "Aussi/Gymnastique", "Aussi/Halterophilie",
        "Handball", "Hippisme", "Aussi/Hockey-sur-gazon", "Aussi/Judo",
        "Natation", "Basket/NBA", "Aussi/Pentathlon-moderne", "Rugby",
        "Sports-de-combat", "Sports-us", "Aussi/Squash", "Adrenaline/Surf",
        "Tennis", "Aussi/Tennis-de-table", "Aussi/Tir", "Aussi/Tir-a-l-arc",
        "Aussi/Triathlon", "Aussi/Mma", "Voile", "Aussi/Volley-ball",
        "Natation/Water-polo", "Aussi/Jeux-paralympiques"
    ]

    # We retrieve the URL feeds for each page of category
    link_article = []
    for cat in list_category:
        url_rss_cat = url_rss + cat + "/"
        soup = utils.recovery_flux_url_rss(url_rss_cat)
        # We retrieve all the articles for a given page
        for div in soup.find_all('div'):
            if div.get("class") == ['home__colead__split']:
                new_article = "https://www.lequipe.fr" + div.a.get("href")
                link_article.append(new_article)
    return (link_article)
def recovery_link_new_articles_hum_crawler():
    """
        Arguments:
            - url of the page containing feed links for
            the different categories
        Returns :
            - list of urls of the different categories

    """
    list_category = [
        "politique", "société", "social-eco", "culture", "sports", "monde",
        "environnement", "rubriques/en-debat"
    ]

    article_humanite = []
    for cat in list_category:
        # We retrieve the URL feeds for each page of article
        # Each HTML-coded article is analyzed with beautiful soup
        for i in range(2, 10):
            try:
                url_rss_humanite = "https://humanite.fr/" + cat + "?page=" +\
                    str(i) + "/feed/"
                soup_url = utils.recovery_flux_url_rss(url_rss_humanite)
                # We retrieve all the articles for a given page
                for div in soup_url.find_all('div'):
                    if re.search('field-name-field-news-chapo',
                                 str(div.get("class"))):
                        for a in div.find_all('a'):
                            article_humanite.append(a.get("href"))
            except Exception:
                break

    return (article_humanite)
def recovery_new_articles_equipe(file_target="data/clean/robot/" +
                                 str(date.datetime.now().date()) + "/"):
    """
        Returns:
            - creation of a json for each new article
    """
    file_json = []
    i = 0
    list_url = recovery_link_new_articles_equipe("https://www.lequipe.fr/rss/")
    for url in list_url:
        soup_url = utils.recovery_flux_url_rss(url)
        items = soup_url.find_all("item")
        article_equipe = []

        # We're picking up every new article in a list
        for item in items:
            article_equipe.append(re.search(r"<link/>(.*)", str(item))[1])
        # Each article is analyzed one by one
        for article in article_equipe:
            new_article = recovery_information_equipe(article)
            if utils.is_empty(new_article) is False:
                file_json.append(new_article)
            i += 1
        if i == 20:
            utils.create_json(file_target, file_json, "Equipe_rss/", "equi")
            i = 0
            file_json = []

    utils.create_json(file_target, file_json, "Equipe_rss/", "equi")
def recovery_information_sv(url_article):
    """
        Arguments:
            - url of one article
        Returns:
            - informations of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    title = ''
    title = soup_article.find('h1', attrs={'class': 'like-h1'}).get_text()

    # date
    date = soup_article.find("time")["datetime"]

    # author
    author = []
    for span in soup_article.find_all('span', attrs={'class': 'author'}):
        author.append(span.span.get_text())

    # content
    content = ''
    for div in soup_article.find_all('div',
                                     attrs={'class': ['content', 'left']}):
        for p in div.find_all('p'):
            content += p.get_text() + ' '

    # theme
    theme = ''
    tag_meta = soup_article.find('meta', attrs={'property': 'article:tag'})
    theme = tag_meta.get('content')

    article = utils.recovery_article(title, 'Scienceetvie',
                                     author, date, content, theme)
    return(article)
def get_article_of_category(url):
    result = []
    soup = utils.recovery_flux_url_rss(url)
    articles = soup.find_all('article')
    for article in articles:
        url_article = "http://www.20minutes.fr" + article.find("a").get("href")
        # Insert the new article into the result list
        if (is_article(url_article)):
            result.append(get_article(url_article))
    return result
Example #13
def is_article(url):
    """
        Arguments :
            - URL address
        Returns :
            - True if the page contains an article
            - False otherwise
    """
    soup = utils.recovery_flux_url_rss(url)
    article = soup.find("article")
    return article is not None
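
# A minimal usage sketch of the shared crawl pattern (the URL below is only a
# placeholder): check a page with is_article before extracting it with
# get_article, exactly as the get_article_of_category examples do.
def _demo_is_article():
    candidate = "http://www.20minutes.fr/societe/placeholder-article"
    if is_article(candidate):
        print(get_article(candidate)["title"])
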
Example #14
def recovery_link_new_articles(url_rss):
    """
        Argument:
            url_rss : string
        Return:
            retrieving links of new articles thanks to the rss feed
    """
    soup = utils.recovery_flux_url_rss(url_rss)
    list_link = []
    for link in soup.find_all('a', attrs={'class': 'first-capitalize'}):
        list_link.append('https://www.futura-sciences.com' + link.get('href'))
    return (list_link)
Example #15
def recovery_information_noob(url_article):
    """
        Arguments:
            - url of one article
        Returns:
            - informations of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    title = soup_article.title.get_text()

    # Retrieval of publication date
    find_date = soup_article.find('time', attrs={"class": "date"})
    for a in find_date.find_all('a'):
        find_valeur = re.compile(r'[0-9]{4}/[0-9]{2}/[0-9]{2}')
        for valeur in find_valeur.finditer(str(a.get("href"))):
            date_p = valeur.group(0)
            date_p = datetime.strptime(date_p, "%Y/%m/%d")\
                .strftime("%Y-%m-%d")

    # Retrieval of the author of the article
    author = []
    for div in soup_article.find_all('div'):
        if re.search('author', str(div.get("class"))):
            author.append(div.p.span.get_text())

    # Retrieval of the article theme
    theme = ""
    for nav in soup_article.find_all('nav'):
        if nav.get("class") == ['breadcrumb']:
            for ol in nav.find_all('ol'):
                for a in ol.find_all('a'):
                    theme = a.get_text()

    # Retrieving the content of the article
    contents = ""
    for div in soup_article.find_all('div'):
        if re.search('body', str(div.get("id"))):
            for aside in div.find_all('aside'):
                for p in aside.find_all('p'):
                    p.string = ""
            for p in div.find_all('p'):
                for a in p.find_all('a'):
                    if a.get("class") == ['lire']:
                        a.string = ""
                for img in p.find_all('img'):
                    p.string = ""
                contents += p.get_text() + " "

    article = utils.recovery_article(title, 'NouvelObservateur',
                                     author, date_p, contents, theme)
    return(article)
Example #16
def get_article_of_category(url):
    """
    Prend en parametre une catégorie et retour toutes les articles de cette
    catégorie
    """
    result = []
    soup = utils.recovery_flux_url_rss(url)
    articles = soup.find_all("div", class_="item--body")
    for article in articles:
        url_article = article.find("a").get("href")
        if is_article(url_article):
            result.append(get_article(url_article))
    return result
Example #17
def recovery_new_articles_lpt(
    file_target="C:/Users/cmisid/Documents/TableauDeBord/LESOIR/" +
    str(date.datetime.now().date()) + "/"):

    list_url_articles = []
    j = 0
    for i in range(0, 1650, 10):
        j = j + 1
        url1 = ('http://www.lesoir.be/archives/recherche?datefilter=lastyear'
                '&sort=date+desc&start=' + str(i) + '&word=terrorisme')
        soup1 = utils.recovery_flux_url_rss(url1)

        for a in soup1.find_all('a'):
            tit = a.get('href')
            if '/archive/' in tit.split('d'):
                url = 'http://www.lesoir.be' + tit
                list_url_articles.append(url)


        # Second search: same archive query with the keyword "attentat"
        url2 = ('http://www.lesoir.be/archives/recherche?datefilter=lastyear'
                '&sort=date+desc&start=' + str(i) + '&word=attentat')
        soup2 = utils.recovery_flux_url_rss(url2)

        for a in soup2.find_all('a'):
            tit = a.get('href')
            if '/archive/' in tit.split('d'):
                url = 'http://www.lesoir.be' + tit
                list_url_articles.append(url)

        if (j == 3):
            time.sleep(71)
            j = 0

    list_dictionaries = []
    list_titre = []
    collect_articles(list_dictionaries, list_url_articles, list_titre)
    utils.create_json(file_target, list_dictionaries, "lesoir/", "lsr")
Example #18
def collect_articles(list_dictionaries, list_url_articles, list_titre):
    j = 0
    for url_article in list_url_articles:
        j = j + 1
        soup = utils.recovery_flux_url_rss(url_article)

        for titl in soup.find_all('title'):  # find the title
            tit = titl.get_text()
            if len(tit.split('-')) == 2:
                title = tit.split('-')[0]

        authors = []
        for a in soup.find_all('a'):  # find the authors
            if a.get('href') is not None:
                if "dpi-authors" in a.get('href').split('/'):
                    tit = a.get('href').split('/')[-1]
                    authors.append(tit.split('-')[0] + ' ' + tit.split('-')[1])
        if len(authors) == 0:
            authors.append('')

        dates = []
        date_publication = []
        for balise_time in soup.find_all('time'):  # find publication's date
            if 'pubdate' in balise_time.get('class'):
                dates.append(balise_time.get('datetime').split('T')[0])
                date_publication.append(
                    balise_time.get('datetime').split('T')[0])

        theme = re.search("www.lesoir.be/(.*)/", url_article)[1]

        content = ''
        for p in soup.find_all('p'):
            if len(p.get_text().split(" ")) >= 2:
                content += p.get_text()

        new_article = utils.recovery_article(title, 'lesoir', authors,
                                             date_publication, content, theme)

        if (j == 3):
            time.sleep(71)
            j = 0

        if not utils.is_empty(new_article):
            erreur = "non"
            for tit in list_titre:
                if title == tit:
                    erreur = "oui"
            if len(content) > 10 and erreur == "non":
                list_titre.append(title)
                list_dictionaries.append(new_article)
Example #19
def recovery_link_new_articles_lg(url_rss):
    """
        Argument:
            url_rss : string
        Return:
            link_article = list
        Retrieving links of new articles thanks to the rss feed
    """
    soup = utils.recovery_flux_url_rss(url_rss)
    items = soup.find_all('item')
    links_article_gorafi = []
    for item in items:
        links_article_gorafi.append(re.search(r"<link/>(.*)", str(item))[1])
    return (links_article_gorafi)
Example #20
def get_article(url):
    """
    Arguments :
        - URL address
    Returns :
        - An article
        {
            "title" : str,
            "newspaper" : str,
            "author" : [str],
            "date_publi" : str,
            "content" : str,
            "theme" : str
            }
    """
    soup = utils.recovery_flux_url_rss(url)
    article = soup.find("article")
    # Article title
    title = article.find("h1").get_text()

    # Empty list when there is no author, otherwise a list of author(s)
    authors = [] if article.find("header").find(
        "p", class_="authorsign-label") is None else unidecode(
        article.find("header").find(
            "p", class_="authorsign-label").get_text()).split(" et ")

    # Publication date of the article
    date_pub = article.find("time").get("datetime")[:10]

    # Article theme
    theme = article.find("ol", class_="breadcrumb-list")\
        .find_all("li")[1].find("span").get_text()

    # Article content
    content = ""
    for p in article.find("div", class_="content").find_all("p"):
        content = content + p.get_text() + " "

    # Newspaper name
    newspaper = soup.find("footer").find(has_copyright).find("a").get_text()
    regex = re.compile(r'[\n\r\t]')
    # Remove the \n, \r and \t characters from the content
    content = regex.sub("", content)
    return utils.recovery_article(
        unidecode(title),
        unidecode(newspaper),
        authors,
        str(date_pub),
        unidecode(content),
        unidecode(theme))
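
# The dictionary layout documented in the docstring above matches the one
# built by hand in fileJson further down, so utils.recovery_article presumably
# just packages the six fields. A sketch under that assumption (the real
# helper may also clean or validate the values):
def recovery_article(title, newspaper, authors, date_publi, content, theme):
    return {
        "title": title,
        "newspaper": newspaper,
        "author": authors,
        "date_publi": date_publi,
        "content": content,
        "theme": theme,
    }
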
def get_article_of_category(url):
    """
        Arguments :
            - Category
        Returns :
            - All articles of this category
    """
    result = []
    soup = utils.recovery_flux_url_rss(url)
    articles = soup.find_all("div", class_="item--body")
    for article in articles:
        url_article = article.find("a").get("href")
        if is_article(url_article):
            result.append(get_article(url_article))
    return result
def recovery_new_articles_lt(file_target="C:/Users/lea/Desktop/PROJET/" +
                             str(date.datetime.now().date()) + "/"):

    list_category = [
        "actualites/economie/economie", "Entreprises-secteurs",
        "media-telecom-entreprise", "finance-patrimoine-investir", "opinions",
        "regions/economie-en-region"
    ]
    file_json = []
    articles_latribune = []
    # We retrieve the URL feeds for each page of article
    for cat in list_category:
        url_latribune = "https://www.latribune.fr/" + cat + ".html"
        soup_url = utils.recovery_flux_url_rss(url_latribune)

        for ul in soup_url.find_all("ul"):
            if ul.get("class") == ['pagination-archive', 'pages']:
                for li in ul.find_all("li"):
                    for a in li.find_all("a"):
                        link = a.get("href")
                        link2 = "https://www.latribune.fr" + link
                        soup_url = utils.recovery_flux_url_rss(link2)

                        for div in soup_url.find_all("div"):
                            for valeur in re.finditer('title-river',
                                                      str(div.get("class"))):
                                for a in div.find_all('a'):
                                    articles_latribune.append(a.get("href"))

    # Each article is analyzed one by one
    for article in articles_latribune:
        new_article = recovery_information_lt(article)
        if utils.is_empty(new_article) is False:
            file_json.append(new_article)

    utils.create_json(file_target, file_json, "latribune_crawler/", "lt")
def fileJson(article_latribune):
    file_json = []
    for article in article_latribune:
        soup = utils.recovery_flux_url_rss(article)
        # Retrieve the title
        title = soup.title.string

        # Retrieve the theme
        for li in soup.find_all('li'):
            if li.get("itemprop") == 'itemListElement':
                theme = li.a.span.get_text()

        # Retrieve the author
        author = []
        for span in soup.find_all('span'):
            if span.get("class") == ['author-name']:
                author.append(span.a.span.get_text())

        # Retrieve the publication date
        for time in soup.find_all('time'):
            if time.get("itemprop") == 'datePublished':
                date = time.get("itemprop")
                for valeur in re.finditer(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                                          str(time)):
                    date = valeur.group(0)
                    date = datetime.strptime(date, "%d/%m/%Y")\
                        .strftime("%Y-%m-%d")
        print(date)

        # Retrieve the content
        content = ""
        for div in soup.find_all('div'):
            if div.get("itemprop") == 'articleBody':
                for p in div.find_all('p'):
                    content += p.get_text() + " "

        new_article = {
            "title": title,
            "newspaper": "La tribune",
            "author": author,
            "date_publi": date,
            "content": content,
            "theme": theme
        }
        # add each new article in the "file_json" table
        if utils.is_empty(new_article) is False:
            file_json.append(new_article)
    return (file_json)
def recovery_information_hum(url_article):
    """
        Arguments:
            - url of one article
        Returns:
            - informations of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'og:title':
            title = meta.get("content")

    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:section':
            theme = meta.get("content")

    author = []
    for h2 in soup_article.find_all('h2'):
        for a in h2.find_all('a'):
            if re.search('auteur', str(a.get("href"))):
                author.append(a.get_text())

    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:published_time':
            raw_date = meta.get("content")
            date_p = raw_date[0:10]
            date_p = str(datetime.strptime(date_p, "%Y-%m-%d").date())

    contents = ""
    for div in soup_article.find_all('div'):
        if div.get("class") == [
                'field', 'field-name-field-news-chapo', 'field-type-text-long',
                'field-label-hidden'
        ]:
            for p in div.find_all('p'):
                contents += p.get_text()
        if div.get("class") == [
                'field', 'field-name-field-news-text', 'field-type-text-long',
                'field-label-hidden'
        ]:
            for p in div.find_all('p'):
                contents += p.get_text()

    article = utils.recovery_article(title, 'Humanite', author, date_p,
                                     contents, theme)
    return (article)
def recovery_information_fem(url_article):
    """
        Arguments:
            - url of one article
        Returns:
            - informations of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    title = soup_article.title.get_text()
    title = title.split(" - ")
    title = title[0]

    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:published_time':
            raw_date = meta.get("content")
            date_p = raw_date[0:10]

    author = []
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:author':
            author.append(meta.get("content"))

    theme = ""
    for link in soup_article.find_all('link'):
        if link.get("rel") == ['Index']:
            link_theme = link.get("href")
            part_link = link_theme.split("/")
            theme = part_link[3]

    contents = ""
    for div in soup_article.find_all('div'):
        if div.get("class") == ['chapo']:
            for p in div.find_all('p'):
                contents += p.get_text() + " "
        if div.get("class") == ['contenu']:
            for p in div.find_all('p'):
                contents += p.get_text() + " "
        if div.get("class") == ['diaporama']:
            for p in div.find_all('p'):
                contents += p.get_text() + " "
    contents = re.sub(r"\s\s+", " ", contents)

    article = utils.recovery_article(title, 'Femina', author, date_p, contents,
                                     theme)
    return (article)
Example #26
def add_articles(file_target="data/clean/robot/" +
                 str(date.datetime.now().date()) + "/"):
    """
        it create a json for each new article
    """
    soup = utils.recovery_flux_url_rss(
        "http://www.20minutes.fr/feeds/rss-actu-france.xml")
    items = soup.find_all("item")
    articles = []
    for item in items:
        # Retrieve the article link
        url = re.search(r"<link/>(.*)<pubdate>", str(item)).group(1)
        if is_article(url):
            new_article = get_article(url)
            # Keep only non-empty articles (same check as the other crawlers)
            if not utils.is_empty(new_article):
                articles.append(new_article)
    utils.create_json(file_target, articles, "Minutes/", "min")
def recovery_link_new_articles_hum_rss(url_rss):
    """
        Arguments:
            - url of the page containing feed links for
            the different categories
        Returns :
            - list of urls of the different categories

    """
    soup = utils.recovery_flux_url_rss(url_rss)

    items = soup.find_all("item")
    article_humanite = []
    # Retrieving all urls of new RSS feeds of different categories
    for item in items:
        article_humanite.append(re.search(r"<link/>(.*)", str(item))[1])

    return (article_humanite)
def recovery_old_articles_sv(
    file_target="C:/Users/Laetitia/Desktop/Groupe4_Robot" +
    str(date.datetime.now().date()) + "/"):
    """
        Returns:
            - creation of a json for each new article
    """

    list_category = [
        "corps-et-sante", "nature-et-enviro", "ciel-et-espace",
        "technos-et-futur", "cerveau-et-intelligence", "science-et-culture"
    ]

    file_json = []
    i = 0
    for cat in list_category:
        # We retrieve the URL feeds for each page of article
        # Each HTML-coded article is analyzed with beautiful soup
        url_rss_sv = "https://www.science-et-vie.com/" + cat

        soup_url = utils.recovery_flux_url_rss(url_rss_sv)

        article_sv = []
        # We retrieve all the articles for a given page
        for div in soup_url.find_all("div"):
            if div.get("class") == ["title"]:
                for item in div.find_all("a"):
                    links = "https://www.science-et-vie.com/" + \
                        str(item.get("href"))
                    article_sv.append(links)

        # Each article is analyzed one by one
        for article in article_sv:
            new_article = recovery_information_sv(article)
            if utils.is_empty(new_article) is False:
                file_json.append(new_article)
            i += 1
        if i == 20:
            utils.create_json(file_target, file_json, "ScienceEtVie_crawler/",
                              "sv")
            i = 0
            file_json = []

    utils.create_json(file_target, file_json, "ScienceEtVie_crawler/", "sv")
def recovery_link_new_articles_fem():
    """
        Arguments:
            - url of the page containing feed links for
            the different categories
        Returns :
            - list of urls of the different categories
    """

    list_category = ["Beaute/Coiffure", "Beaute/Beaute-People",
                     "Beaute/Parfums", "Beaute/Soins-visage-et-corps",
                     "Beaute/Maquillage", "Mode/Tendances", "Mode/Defiles",
                     "Mode/Lingerie", "Mode/Mode-People",
                     "Cuisine/Recettes-de-chefs",
                     "Cuisine/Shopping-et-conseils",
                     "Cuisine/Idees-de-recettes-par-theme",
                     "Psychologie/Psycho", "Psychologie/Societe",
                     "Psychologie/Argent-Droit", "People/Vie-des-people",
                     "Culture/Series", "Culture/Musique",
                     "Culture/Cinema-et-DVD", "Culture/Sorties",
                     "Loisirs/Jardinage", "Loisirs/Voyages",
                     "Loisirs/Tendace-deco", "Sexo/Sexualite", "Sexo/Amour",
                     "Sante-Forme/Bien-etre", "Sante-Forme/Sport",
                     "Sante-Forme/Regimes-Nutrition", "Sante-Forme/Sante",
                     "Famille/Grossesse", "Famille/Bebe", "Famille/Enfant",
                     "Famille/Adolescent"]
    article_fem = []
    for category in list_category:
        for i in range(2, 45):
            try:
                url_rss_fem = "http://www.femina.fr/" +\
                    category + "/page-" + str(i)
                soup_url = utils.recovery_flux_url_rss(url_rss_fem)

                for h2 in soup_url.find_all('h2'):
                    for a in h2.find_all('a'):
                        article_fem.append(a.get("href"))
                for h3 in soup_url.find_all('h3'):
                    for a in h3.find_all('a'):
                        article_fem.append(a.get("href"))
            except Exception:
                break

    return(article_fem)
Example #30
def recovery_information_lg(url):
    """
        Arguments:
            url : string
        Return :
            article : dictionary
        It retrieve for each article the title, newspaper, author, date, theme
    """
    soup = utils.recovery_flux_url_rss(url)

    # Retrieving the title
    title = ''
    balise_title = soup.title.string
    sep = balise_title.split('—')
    title = unidecode.unidecode('—'.join(sep[:-1]))

    tag_context = soup.find('span', attrs={'class': 'context'})

    # Retrieving of author
    author = []
    author.append(tag_context.a.get_text())

    # Retrieving of publication date
    date_p = ''
    regex_date = re.search(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                           tag_context.get_text())
    date_p = regex_date.group(0)
    date_p = str(date.datetime.strptime(date_p, '%d/%m/%Y').date())

    # Retrieving the theme
    tag_post_cat = soup.find('ul', attrs={'class': 'post-categories'})
    for li in tag_post_cat.find_all('li'):
        theme = li.get_text()

    # Retrieving the content of the article
    contents = ''
    tag_content = soup.find('div', attrs={'class': 'content'})
    if tag_content:
        for p in tag_content.find_all('p'):
            contents += p.get_text() + " "

    new_article = utils.recovery_article(title, 'LeGorafi', author, date_p,
                                         contents, theme)
    return (new_article)