Example 1
def recuperation_info_lmde(
        file_target="data/clean/robot/" + str(date.datetime.now().date()) + "/"):
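    """
        Creates a JSON file for each new Le Monde article found
        in the RSS feeds.
    """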
    source = "lemonde/"
    url_rss_lib = "http://www.lemonde.fr/rss/"
    abbreviation = "lmde"
    url = "http://www.lemonde.fr"

    list_articles = []
    i = 0

    listRSS = linkRSS(url_rss_lib)
    for article_link in listRSS:
        if "/article/" in article_link:
            i += 1
            list_articles.append(info_articles(article_link))

            if i == 20:
                utilsg4.create_json(
                    file_target, list_articles, source, abbreviation)
                i = 0
                list_articles = []

    # links = recent(url)
    # list_articles.extend(articlesList(links))
    utilsg4.create_json(file_target, list_articles, source, abbreviation)
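
The snippet above leans on project helpers that are not shown (linkRSS, info_articles, utilsg4.create_json). A minimal sketch of what linkRSS could look like, assuming the RSS index page links directly to .xml feeds and that lxml is available for BeautifulSoup's "xml" parser; names and structure are assumptions, not the project's actual code (a possible create_json is sketched after Example 4 below):

import requests
from bs4 import BeautifulSoup


def linkRSS(url_rss_lib):
    """Collect every article link from the feeds listed on the RSS index page."""
    index = BeautifulSoup(requests.get(url_rss_lib).content, "html.parser")
    links = []
    for anchor in index.find_all("a"):
        href = anchor.get("href", "")
        if href.endswith(".xml"):
            # Parse each feed as XML so that <link> keeps its text content
            feed = BeautifulSoup(requests.get(href).content, "xml")
            links.extend(item.link.text for item in feed.find_all("item")
                         if item.link is not None)
    return links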
Example 2
def recovery_new_articles_fusc(file_target='/var/www/html/projet2018/data/clean/robot/'):
    """
        Creates a JSON file for each new Futura-Sciences article.
    """
    links = recovery_link_new_articles('https://www.futura-sciences.com/' +
                                       'flux-rss/')
    list_articles = []
    for article in links:
        new_article = recovery_information_fusc(article)
        if not utils.is_empty(new_article):
            list_articles.append(new_article)
    utils.create_json(file_target, list_articles, 'FuturaSciences', 'fusc')
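
utils.is_empty is used here (and in several crawlers below) to drop articles whose extraction came back incomplete. Its real implementation is not shown; a plausible sketch, assuming articles are dicts and that the key names below exist (they are hypothetical):

def is_empty(article):
    """Return True when the recovered article lacks its essential fields."""
    required = ("title", "author", "date_publi", "content")  # hypothetical keys
    return not article or any(not article.get(key) for key in required)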
Example 3
def recovery_new_article_lg(file_target="/var/www/html/projet2018/data/clean/robot/"):
    """
        Retrieves new articles from the RSS feed and creates
        a JSON file for each of them.
    """
    url_rss = "http://www.legorafi.fr/feed/"
    links_article = recovery_link_new_articles_lg(url_rss)
    list_article = []
    for link_article in links_article:
        new_article = recovery_information_lg(link_article)
        if new_article["theme"] != "Magazine":
            list_article.append(new_article)
    utils.create_json(file_target, list_article, "LeGorafi", "lg")
Example 4
def recovery_new_articles_hum(file_target="data/clean/robot/" +
                              str(date.datetime.now().date()) + "/"):
    """
        Returns:
            - creation of a json for each new article
    """
    file_json = []
    # Each URL is analyzed one by one
    article_humanite = recovery_link_new_articles_hum("https://www.humanite" +
                                                      ".fr/rss/actu.rss")
    for article in article_humanite:
        file_json.append(recovery_information_hum(article))

    utils.create_json(file_target, file_json, "Humanite/",
                      "hum")
Example 5
def add_articles(file_target="data/clean/robot/" +
                 str(date.datetime.now().date()) + "/"):
    """
        it create a json for each new article
    """
    soup = utils.recovery_flux_url_rss(
        "http://www.20minutes.fr/feeds/rss-actu-france.xml")
    items = soup.find_all("item")
    articles = []
    for item in items:
        # Retrieve each article's link
        url = re.search(r"<link/>(.*)<pubdate>", str(item)).group(1)
        if is_article(url):
            articles.append(get_article(url))
    utils.create_json(file_target, articles, "Minutes/", "min")
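
The regex in add_articles works because utils.recovery_flux_url_rss presumably parses the feed with html.parser: that parser treats <link> as a void tag and lowercases <pubDate>, so the URL ends up as loose text between "<link/>" and "<pubdate>" in str(item). A small self-contained demonstration of that assumption:

import re

from bs4 import BeautifulSoup

item_xml = "<item><link>http://example.com/a1</link><pubDate>Mon</pubDate></item>"
item = BeautifulSoup(item_xml, "html.parser").item
# str(item) is "<item><link/>http://example.com/a1<pubdate>Mon</pubdate></item>"
url = re.search(r"<link/>(.*)<pubdate>", str(item)).group(1)
print(url)  # http://example.com/a1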
Example 6
def recovery_new_articles_ld(file_target="data/clean/robot/" +
                             str(date.datetime.now().date()) + "/"):
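    """
        Creates a JSON file for each new La Dépêche article found
        in the RSS feeds.
    """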

    links = recovery_link_new_articles_ld(
        "https://www.ladepeche.fr/services/flux-rss/")

    list_articles = []
    i = 0
    for article in links:
        new_article = recovery_information_ld(article)
        list_articles.append(new_article)
        i += 1
        if i == 50:
            utilsg4.create_json(file_target, list_articles, "ladepeche/", "LD")

            i = 0
            list_articles = []

    utilsg4.create_json(file_target, list_articles, "ladepeche/", "LD")
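
Le Monde (Example 1) and La Dépêche both flush partial batches (20 and 50 articles respectively) so the whole crawl is not held in memory before anything is written. The same counter-and-reset pattern can be expressed with a small generic helper; the helper below is a sketch of mine, not part of the project:

def in_batches(iterable, size):
    """Yield successive lists of at most `size` items from `iterable`."""
    batch = []
    for element in iterable:
        batch.append(element)
        if len(batch) == size:
            yield batch
            batch = []
    if batch:
        yield batch


# Hypothetical rewrite of the La Dépêche loop using that helper:
# for batch in in_batches((recovery_information_ld(a) for a in links), 50):
#     utilsg4.create_json(file_target, batch, "ladepeche/", "LD")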
Example 7
def add_articles(
        file_target="/home/etudiant/Documents/ProjetSID/Groupe4_Robot/Telerama/Art/" +
        str(date.datetime.now().date()) + "/"):
    """
        Creates a JSON file for each new Télérama article.
    """
    categories = {
        "cinema": 40,
        "scenes": 30,
        "enfants": 3,
        "idees": 30,
    }
    articles = []
    for category, nbre in categories.items():
        for i in range(0, nbre):
            url = "http://www.telerama.fr/" + \
                category + "/articles?page=" + str(i)
            articles.extend(get_article_of_category(url))
            utils.create_json(file_target, articles, "Telerama/", "tera")
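
Note that in this Télérama crawler utils.create_json is called inside the inner page loop while articles keeps growing, so earlier articles are written again on every pass. If a single write per crawl is the intent (an assumption about intent, not the original behavior), the call can move out of the loops:

    # Hypothetical variant: accumulate everything, then write once at the end.
    articles = []
    for category, nbre in categories.items():
        for i in range(0, nbre):
            url = "http://www.telerama.fr/" + \
                category + "/articles?page=" + str(i)
            articles.extend(get_article_of_category(url))
    utils.create_json(file_target, articles, "Telerama/", "tera")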
Example 8
def recovery_old_articles_sv(
        file_target="C:/Users/Laetitia/Desktop/Groupe4_Robot" + str(
            date.datetime.now().date()) + "/"):
    """
        Creates a JSON file for each article found on the
        Science et Vie category pages.
    """

    list_category = [
        "corps-et-sante",
        "nature-et-enviro",
        "ciel-et-espace",
        "technos-et-futur",
        "cerveau-et-intelligence",
        "science-et-culture"]

    file_json = []
    for cat in list_category:
        # We retrieve the URL feeds for each page of article
        # Each HTML-coded article is analyzed with beautiful soup
        url_rss_sv = "https://www.science-et-vie.com/" + cat

        soup_url = utils.recovery_flux_url_rss(url_rss_sv)

        article_sv = []
        # We retrieve all the articles for a given page
        for div in soup_url.find_all("div"):
            if div.get("class") == ["title"]:
                for item in div.find_all("a"):
                    links = "https://www.science-et-vie.com/" + \
                        str(item.get("href"))
                    article_sv.append(links)

        # Each article is analyzed one by one
        for article in article_sv:
            file_json.append(recovery_information_sv(article))

    utils.create_json(file_target, file_json, "Scienceetvie_crawler/",
                      "sv")
Example 9
def recovery_new_articles_fem(file_target="data/clean/robot/" +
                              str(date.datetime.now().date()) + "/"):
    """
        Returns:
            - creation of a json for each new article
    """
    file_json = []
    i = 0
    article_fem = recovery_link_new_articles_fem()
    for article in article_fem:
        new_article = recovery_information_fem(article)
        if not utils.is_empty(new_article):
            file_json.append(new_article)
            i += 1
        if i == 20:
            utils.create_json(file_target, file_json, "Femina_crawler/",
                              "fem")
            i = 0
            file_json = []

    utils.create_json(file_target, file_json, "Femina_crawler/",
                      "fem")
Example 10
def recovery_new_articles_noob_rss(file_target="data/clean/robot/" +
                                   str(date.datetime.now().date()) + "/"):
    """
        Returns:
            - creation of a json for each new article
    """
    file_json = []
    i = 0
    # Each URL is analyzed one by one
    list_url = recovery_link_new_articles_noob_rss("http://www.nouvelobs." +
                                                   "com/rss/")
    for url in list_url:
        soup_url = utils.recovery_flux_url_rss(url)
        items = soup_url.find_all("item")
        article_noob = []

        # Collect every new article link in a list
        for item in items:
            link_article = re.search(r"<link/>(.*)", str(item))[1]
            link_article = link_article.split("<description>")
            link_article = link_article[0]
            article_noob.append(link_article)
            if re.search(r"/galeries-photos/", link_article):
                article_noob.remove(link_article)
        # Each article is analyzed one by one
        for article in article_noob:
            new_article = recovery_information_noob(article)
            if not utils.is_empty(new_article):
                file_json.append(new_article)
            i += 1
            # Write a batch of 20 articles to disk, as in the other crawlers
            if i == 20:
                utils.create_json(file_target, file_json,
                                  "NouvelObs_rss/", "noob")
                i = 0
                file_json = []

    utils.create_json(file_target, file_json, "NouvelObs/", "noob")
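
An alternative that avoids the regex-and-split dance on str(item) is the feedparser library, which exposes each entry's link directly; a sketch (it would replace the call to utils.recovery_flux_url_rss and assumes feedparser is installed):

import feedparser


def links_from_feed(url):
    """Return the article links of one RSS feed, skipping photo galleries."""
    entries = feedparser.parse(url).entries
    return [entry.get("link", "") for entry in entries
            if "/galeries-photos/" not in entry.get("link", "")]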