Example #1
def recuperation_info_lmde(file_target="/Users/sofian/Documents/Projet_att/" +
                           str(date.datetime.now().date()) + "/"):
    """Collect Le Monde article URLs from several searches, then record the
    scraped articles as JSON files.
    Arguments:
        file_target {string} -- path where the articles will be recorded
    """
    list_url_articles = []
    j = 0
    # Collect the articles returned by the search "impact attentat"
    for i in range(1, 16):
        j = j + 1
        url = 'http://www.lemonde.fr/recherche/?keywords=attentat+impact&page_num=' + str(i) +'&operator=and&exclude_keywords=&qt=recherche_texte_titre&author=&period=since_1944&start_day=01&start_month=01&start_year=1944&end_day=18&end_month=02&end_year=2018&sort=desc'
        soup = utils.recovery_flux_url_rss(url)

        for h3 in soup.find_all('h3'):
            for a in h3.find_all('a'):
                url = 'http://www.lemonde.fr' + a.get("href")
                list_url_articles.append(url)

        # Pause every three requests so the site does not throttle us
        if j == 3:
            time.sleep(61)
            j = 0

    # Collect the articles returned by the search "attentat"
    for i in range(1, 600):
        j = j + 1
        url = 'http://www.lemonde.fr/recherche/?keywords=attentat&page_num=' + str(i) + "&operator=and&exclude_keywords=&qt=recherche_texte_titre&author=&period=since_1944&start_day=01&start_month=01&start_year=1944&end_day=31&end_month=01&end_year=2018&sort=desc"
        soup = utils.recovery_flux_url_rss(url)

        for h3 in soup.find_all('h3'):
            for a in h3.find_all('a'):
                url = 'http://www.lemonde.fr' + a.get("href")
                list_url_articles.append(url)

        if j == 3:
            time.sleep(61)
            j = 0

    # Collect the articles returned by the search "terrorisme"
    for i in range(1, 800):
        j = j + 1
        url = 'http://www.lemonde.fr/recherche/?keywords=terrorisme&page_num=' + str(i) + '&operator=and&exclude_keywords=&qt=recherche_texte_titre&author=&period=since_1944&start_day=01&start_month=01&start_year=1944&end_day=18&end_month=02&end_year=2018&sort=desc'

        soup = utils.recovery_flux_url_rss(url)

        for h3 in soup.find_all('h3'):
            for a in h3.find_all('a'):
                url = 'http://www.lemonde.fr' + a.get("href")
                list_url_articles.append(url)

        if j == 3:
            time.sleep(61)
            j = 0

    list_dictionaries = []

    info_articles(list_dictionaries, list_url_articles)
    utils.create_json(file_target, list_dictionaries, "LeMonde/", "lmde")
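A minimal usage sketch for the function above, assuming the module-level imports shown in Example #6 (utils_v0 as utils, datetime as date) plus time and re; the output directory below is only illustrative, not from the original code.

import time
import re
import datetime as date
import utils_v0 as utils  # assumed helper module, as in Example #6

# Illustrative output directory (hypothetical path); create_json is called
# with "LeMonde/" and "lmde", as in the function above.
file_target = "/tmp/Projet_att/" + str(date.datetime.now().date()) + "/"
recuperation_info_lmde(file_target=file_target)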
Example #2
def recovery_new_articles_lpt(file_target="/Users/sofian/Documents/Projet_att/" +
                              str(date.datetime.now().date()) + "/"):
    """Procedure that calls all the others functions and procedures in order to
    collect articles from a newspaper in a file
    Arguments:
        file_target {string} -- path where the articles will be recorded
    """    
    list_url_articles = []
    # Loop over pages 1 through 94 of the "attentats" search
    for i in range(1, 95):
        url = 'http://www.lepoint.fr/recherche/index.php?query=attentats&page=' + str(i)
        soup = utils.recovery_flux_url_rss(url)  # fetch the HTML of the page

        # Collect every article URL found inside
        # <div class="image-search-wrap"> <figure> ... </figure> </div>
        for div in soup.find_all('div'):
            if re.search('image-search-wrap', str(div.get("class"))):
                for fig in div.find_all('figure'):
                    url = "http://www.lepoint.fr" + fig.a.get("href")
                    list_url_articles.append(url)
        # Sleep 61 seconds so the site does not flag us as a bot
        time.sleep(61)
    list_dictionaries = []

    # Extract each article's information via collect_articles
    collect_articles(list_dictionaries, list_url_articles)

    # Write one JSON file per article
    utils.create_json(file_target, list_dictionaries, "LePoint/", "lpt")
Example #3
def collect_articles(list_dictionaries, list_url_articles):
    """Add the articles (dictionaries) from a list of URL in a list of
    dictionaries
    Arguments:
        list_dictionaries {list} -- list of dictionaries
        list_url_articles {list} -- list of URL
        theme {string} -- theme related to the list of dictionaries
    """
    # For each article URL, extract the title, publication date, authors,
    # theme and content
    for url_article in list_url_articles:
        soup = utils.recovery_flux_url_rss(url_article)  # fetch the HTML of the article

        balise_title = soup.title.string  # page title, e.g. "... - Le Point"
        sep = balise_title.split(" - Le Point")
        title = sep[0]

        list_authors = []
        # The authors sit in <span> tags nested inside a <div class="mbs">
        for div in soup.find_all('div'):
            if div.get('class') == ['mbs']:
                for span in div.find_all('span'):
                    name = span.get_text()
                    name = re.sub('Par', '', name)
                    name = re.sub('\n', '', name)
                    list_authors.append(name)

        dates = []
        # The publication dates sit in <time> tags
        for balise_time in soup.find_all('time'):
            for valeur in re.finditer(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                                      str(balise_time)):
                dates.append(date.datetime.strptime(valeur.group(0),
                                                    '%d/%m/%Y'))
        date_publication = date.datetime.strftime(min(dates), '%d/%m/%Y')
        date_publication = str(date.datetime.strptime(date_publication,
                                                      "%d/%m/%Y").date())

        # The theme is derived from the article URL: an article under
        # "http://www.lepoint.fr/sport/..." gets the theme "sport"
        theme = re.search("www.lepoint.fr/(.*)/", url_article)[1]

        # The article body sits in <h2 class="art-chapeau"> and <div class="art-text">
        content = ''
        for h2 in soup.find_all('h2'):
            if h2.get('class') == ['art-chapeau']:
                content += h2.get_text() + " "
        for div in soup.find_all('div'):
            if div.get('class') == ['art-text']:
                for p in div.find_all('p'):
                    content += p.get_text() + " "

        new_article = utils.recovery_article(title, 'LePoint',
                                             list_authors,
                                             date_publication, content,
                                             theme)
        if not utils.is_empty(new_article):
            list_dictionaries.append(new_article)
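The date handling above (a dd/mm/yyyy regex over the <time> tags, then keeping the earliest match) can be tried on a standalone fragment; the HTML below is made up, only the date format matters:

import re
import datetime as date
from bs4 import BeautifulSoup

html = '<time>Publie le 02/11/2017, modifie le 03/11/2017</time>'
soup = BeautifulSoup(html, "html.parser")

dates = []
for balise_time in soup.find_all('time'):
    for valeur in re.finditer(r'[0-9]{2}/[0-9]{2}/[0-9]{4}', str(balise_time)):
        dates.append(date.datetime.strptime(valeur.group(0), '%d/%m/%Y'))

# Keep the earliest match as the publication date, normalised to YYYY-MM-DD
# (equivalent to the strftime/strptime round trip in the function above)
date_publication = str(min(dates).date())
print(date_publication)  # 2017-11-02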
Example #4
def recovery_new_articles_lpt(
        file_target="/Users/sofian/Documents/Projet_att/" +
        str(date.datetime.now().date()) + "/"):
    """Procedure that calls all the other functions and procedures in order to
    collect articles from a newspaper into a file
    Arguments:
        file_target {string} -- path where the articles will be recorded
    """
    list_url_articles = []
    j = 0
    for i in range(1, 90):
        j = j + 1
        url = ('http://www.lepoint.fr/recherche/index.php?query=attentats'
               '&sort=pertinence&page=' + str(i))
        soup = utils.recovery_flux_url_rss(url)

        for div in soup.find_all('div'):
            if re.search('image-search-wrap', str(div.get("class"))):
                for fig in div.find_all('figure'):
                    url = "http://www.lepoint.fr" + fig.a.get("href")
                    list_url_articles.append(url)
        if j == 3:
            time.sleep(61)
            j = 0
    for i in range(1, 90):
        j = j + 1
        url = ('http://www.lepoint.fr/recherche/index.php?query=attentat'
               '&sort=pertinence&page=' + str(i))
        soup = utils.recovery_flux_url_rss(url)

        for div in soup.find_all('div'):
            if re.search('image-search-wrap', str(div.get("class"))):
                for fig in div.find_all('figure'):
                    url = "http://www.lepoint.fr" + fig.a.get("href")
                    list_url_articles.append(url)
        if j == 3:
            time.sleep(5)
            j = 0

    for i in range(1, 90):
        j = j + 1
        url = ('http://www.lepoint.fr/recherche/index.php?query=terrorisme'
               '&sort=pertinence&page=' + str(i))
        soup = utils.recovery_flux_url_rss(url)

        for div in soup.find_all('div'):
            if re.search('image-search-wrap', str(div.get("class"))):
                for fig in div.find_all('figure'):
                    url = "http://www.lepoint.fr" + fig.a.get("href")
                    list_url_articles.append(url)
        if j == 3:
            time.sleep(61)
            j = 0

    for i in range(1, 90):
        j = j + 1
        url = ('http://www.lepoint.fr/recherche/index.php?query=terroriste'
               '&sort=pertinence&page=' + str(i))
        soup = utils.recovery_flux_url_rss(url)

        for div in soup.find_all('div'):
            if re.search('image-search-wrap', str(div.get("class"))):
                for fig in div.find_all('figure'):
                    url = "http://www.lepoint.fr" + fig.a.get("href")
                    list_url_articles.append(url)
        if j == 3:
            time.sleep(61)
            j = 0

    for i in range(1, 90):
        j = j + 1
        url = ('http://www.lepoint.fr/recherche/index.php?query=terroristes'
               '&sort=pertinence&page=' + str(i))
        soup = utils.recovery_flux_url_rss(url)

        for div in soup.find_all('div'):
            if re.search('image-search-wrap', str(div.get("class"))):
                for fig in div.find_all('figure'):
                    url = "http://www.lepoint.fr" + fig.a.get("href")
                    list_url_articles.append(url)

        if j == 3:
            time.sleep(61)
            j = 0

    list_dictionaries = []

    collect_articles(list_dictionaries, list_url_articles)

    utils.create_json(file_target, list_dictionaries, "LePoint/", "lpt")
Example #5
def collect_articles(list_dictionaries, list_url_articles):
    """Add the articles (dictionaries) from a list of URL in a list of
    dictionaries
    Arguments:
        list_dictionaries {list} -- list of dictionaries
        list_url_articles {list} -- list of URL
        theme {string} -- theme related to the list of dictionaries
    """
    try:
        j = 0
        titre = []
        for url_article in list_url_articles:
            j = j + 1
            soup = utils.recovery_flux_url_rss(
                url_article)  # fetch the HTML of the article

            balise_title = soup.title.string  # page title of the article
            sep = balise_title.split(" - Le Point")
            title = sep[0]

            list_authors = []
            # The authors sit in <span> tags nested inside a <div class="mbs">
            for div in soup.find_all('div'):
                if div.get('class') == ['mbs']:
                    for span in div.find_all('span'):
                        name = span.get_text()
                        name = re.sub('Par', '', name)
                        name = re.sub('\n', '', name)
                        list_authors.append(name)

            dates = []
            # The publication dates sit in <time> tags
            for balise_time in soup.find_all('time'):
                for valeur in re.finditer(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                                          str(balise_time)):
                    dates.append(
                        date.datetime.strptime(valeur.group(0), '%d/%m/%Y'))
            date_publication = date.datetime.strftime(min(dates), '%d/%m/%Y')
            date_publication = str(
                date.datetime.strptime(date_publication, "%d/%m/%Y").date())

            theme = re.search("www.lepoint.fr/(.*)/", url_article)[1]

            content = ''
            for h2 in soup.find_all('h2'):
                if h2.get('class') == ['art-chapeau']:
                    content += h2.get_text() + " "
            for div in soup.find_all('div'):
                if div.get('class') == ['art-text']:
                    for p in div.find_all('p'):
                        content += p.get_text() + " "

            new_article = utils.recovery_article(title, 'LePoint',
                                                 list_authors,
                                                 date_publication, content,
                                                 theme)
            # Pause every five articles so the site does not throttle us
            if j == 5:
                time.sleep(61)
                j = 0

            erreur = "non"
            for tit in titre:
                if title == tit:
                    erreur = "oui"
            if len(content) > 10 and erreur == "non":
                titre.append(title)
                list_dictionaries.append(new_article)

    except Exception as e:
        print("Probleme :", e)
Example #6
import utils_v0 as utils
import re
from datetime import datetime
import datetime as date


file_target = "/Users/sofian/Documents/Projet_att/" + str(date.datetime.now().date()) + "/"

article_noob = []
for i in range(1, 30):
    url_rss_noob = "http://recherche.nouvelobs.com/?p=" + str(i) + "&q=attentat&c=bnJlc3VsdHMlM0QxMCUyNnN0YXJ0JTNEMjgwJTI2bG9naWMlM0RzbHJlZm9udGUtZ2xvYmFsZSUyNnElM0RhdHRlbnRhdCUyQiUyNTI4Tk9UJTJCY29ycG9yYXRlJTI1MkZ0cmVlJTI1M0FUb3AlMjUyRnR5cGUlMjUyRmRlcGVjaGVzJTJCQU5EJTJCTk9UJTJCY29ycG9yYXRlJTI1MkZ0cmVlJTI1M0FUb3AlMjUyRnR5cGUlMjUyRnJlZGlyZWN0aW9uJTI1MjklMkJBTkQlMkIlMjUyOGNvcnBvcmF0ZSUyNTJGdHJlZSUyNTNBVG9wJTI1MkZteXNvdXJjZSUyNTJGbm91dmVsb2JzLmNvbSUyQk9SJTJCY29ycG9yYXRlJTI1MkZ0cmVlJTI1M0FUb3AlMjUyRm15c291cmNlJTI1MkZsZXBsdXMlMkJPUiUyQmNvcnBvcmF0ZSUyNTJGdHJlZSUyNTNBVG9wJTI1MkZteXNvdXJjZSUyNTJGb2JzZXNzaW9uJTJCT1IlMkJjb3Jwb3JhdGUlMjUyRnRyZWUlMjUzQVRvcCUyNTJGbXlzb3VyY2UlMjUyRnRlbGVvYnMuY29tJTJCT1IlMkJjb3Jwb3JhdGUlMjUyRnRyZWUlMjUzQVRvcCUyNTJGbXlzb3VyY2UlMjUyRmJpYmxpb2JzJTI1Mjk%3D"
    soup_url = utils.recovery_flux_url_rss(url_rss_noob)
    for h2 in soup_url.find_all('h2'):
        if h2.get("class") == ['title']:
            if re.search('www.nouvelobs.com', str(h2.a.get("href"))):
                article_noob.append(h2.a.get("href"))

for i in range(1, 30):
    url_rss_noob = "http://recherche.nouvelobs.com/?p=" + str(i) + "&q=terrorisme&c=bnJlc3VsdHMlM0QxMCUyNnN0YXJ0JTNEMjkwJTI2bG9naWMlM0RzbHJlZm9udGUtZ2xvYmFsZSUyNnElM0R0ZXJyb3Jpc21lJTJCJTI1MjhOT1QlMkJjb3Jwb3JhdGUlMjUyRnRyZWUlMjUzQVRvcCUyNTJGdHlwZSUyNTJGZGVwZWNoZXMlMkJBTkQlMkJOT1QlMkJjb3Jwb3JhdGUlMjUyRnRyZWUlMjUzQVRvcCUyNTJGdHlwZSUyNTJGcmVkaXJlY3Rpb24lMjUyOSUyQkFORCUyQiUyNTI4Y29ycG9yYXRlJTI1MkZ0cmVlJTI1M0FUb3AlMjUyRm15c291cmNlJTI1MkZub3V2ZWxvYnMuY29tJTJCT1IlMkJjb3Jwb3JhdGUlMjUyRnRyZWUlMjUzQVRvcCUyNTJGbXlzb3VyY2UlMjUyRmxlcGx1cyUyQk9SJTJCY29ycG9yYXRlJTI1MkZ0cmVlJTI1M0FUb3AlMjUyRm15c291cmNlJTI1MkZvYnNlc3Npb24lMkJPUiUyQmNvcnBvcmF0ZSUyNTJGdHJlZSUyNTNBVG9wJTI1MkZteXNvdXJjZSUyNTJGdGVsZW9icy5jb20lMkJPUiUyQmNvcnBvcmF0ZSUyNTJGdHJlZSUyNTNBVG9wJTI1MkZteXNvdXJjZSUyNTJGYmlibGlvYnMlMjUyOQ%3D%3D"
    soup_url = utils.recovery_flux_url_rss(url_rss_noob)
    for h2 in soup_url.find_all('h2'):
        if h2.get("class") == ['title']:
            if re.search('www.nouvelobs.com', str(h2.a.get("href"))):
                article_noob.append(h2.a.get("href"))

# analyze each article
titre = []
for url_article in article_noob:

    try:
        soup_article = utils.recovery_flux_url_rss(url_article)
Example #7
def info_articles(list_dictionaries, list_url_articles):
    """Add the Le Monde articles (dictionaries) from a list of URLs to a list
    of dictionaries
    Arguments:
        list_dictionaries {list} -- list of dictionaries
        list_url_articles {list} -- list of URLs
    """
    try:
        j = 0
        titre = []
        for url_article in list_url_articles:
            j = j + 1
            soup = utils.recovery_flux_url_rss(url_article)

            title = soup.find('title').string
            title = title.lower()

            newspaper = "Le Monde"

            # Article theme
            theme = ""
            for li in soup.find_all('li'):
                for val in re.finditer('ariane', str(li.get("class"))):
                    theme = li.a.get_text()

            # Author of the article
            if(soup.find("span", class_="auteur")):
                if(soup.find("span", class_="auteur").a):
                    author = soup.find("span",class_="auteur").find("a").get_text()
                else:
                    author = soup.find("span", class_="auteur").get_text()
                author = re.sub(r"\s\s+", " ", author)
                author = re.sub(r"^ ", "", author)
            else:
                author = ""

            # publication date
            date_p = ""
            for tim in soup.find_all('time'):
                if tim.get("itemprop") == 'datePublished':
                    date_t = tim.get('datetime')
                    date_p = date_t[0:10]
                    date_p = datetime.strptime(date_p, "%Y-%m-%d").strftime("%d/%m/%Y")

            # Article content
            content = ""
            for div in soup.find_all('div'):
                if div.get("id") == 'articleBody':
                    for p in div.find_all('p'):
                        if p.get("class") == ['lire']:
                            p.string = ""
                    content += div.get_text() + " "

            new_article = utils.recovery_article(title, newspaper, author,
                                                 date_p, content, theme)

            # Pause every three articles so the site does not throttle us
            if j == 3:
                time.sleep(61)
                j = 0

            erreur = "non"
            for tit in titre:
                if title == tit:
                    erreur = "oui"
            if len(content) > 10 and erreur == "non":
                titre.append(title)
                list_dictionaries.append(new_article)

    except Exception as e:
        print("Probleme :", e)