def recovery_new_articles_lfi(file_target="data/clean/robot/" +
                              str(date.datetime.now().date()) + "/"):
    """Procedure that calls all the others functions and procedures in order
    to collect articles from a newspaper in a file

    Arguments:
        file_target {string} -- path where the articles will be recorded
    """
    for url_theme in collect_url_themes("http://www.lefigaro.fr/"):
        # Derive a short theme label from the theme URL path.
        theme = re.search("http://www.lefigaro.fr/(.*)", url_theme)[1]
        theme = re.sub("/", "", theme)
        print(theme)
        article_urls = []
        for url_sub_theme in collect_url_sub_themes(url_theme):
            collect_url_articles(article_urls, url_sub_theme)
        dictionaries = []
        collect_articles(dictionaries, article_urls, theme)
        time.sleep(3)  # small pause between themes
        utils.create_json(file_target, dictionaries, 'leFigaro/', 'lfi')
def recuperation_info_libe(file_target="data/clean/robot/" +
                           str(date.datetime.now().date()) + "/"):
    """Main fonction that get all articles url, extract the informations and
    create a JSON File.

    Fixes: the original called ``get_information(lr)`` twice per link (once
    for the emptiness test, once more to store the result), fetching every
    article twice; the first result is now reused.

    Arguments:
        file_target {string} -- path where the JSON files are written
    """
    source = "liberation/"
    list_articles = []
    i = 0
    for lr in get_rss_infos():
        i += 1
        # Only follow links that point to liberation.fr itself.
        if "www.liberation.fr" in lr:
            informations = get_information(lr)
        else:
            informations = None
        if informations and utils.is_empty(informations) is False:
            list_articles.append(informations)
        # Flush a buffer every 50 links processed.
        if i > 49:
            i = 0
            utils.create_json(file_target, list_articles, source, "libe")
            list_articles = []
    utils.create_json(file_target, list_articles, source, "libe")
def recovery_new_articles_noob_crawler(file_target="data/clean/robot/" +
                                       str(date.datetime.now().date()) +
                                       "/"):
    """Crawl new Nouvel Obs articles and write them out as JSON.

    Returns:
        - creation of a json for each new article
    """
    batch = []
    counter = 0
    # Each article is analized one by one
    for article in recovery_link_new_articles_noob_crawler():
        extracted = recovery_information_noob(article)
        if utils.is_empty(extracted) is False:
            batch.append(extracted)
            counter += 1
        # Dump every 20 collected articles, then start a fresh batch.
        if counter == 20:
            utils.create_json(file_target, batch, "NouvelObs_crawler/",
                              "noob")
            counter = 0
            batch = []
    utils.create_json(file_target, batch, "NouvelObs_crawler/", "noob")
def recovery_new_articles_equipe(file_target="data/clean/robot/" +
                                 str(date.datetime.now().date()) + "/"):
    """Collect new L'Equipe articles from the RSS feeds and dump JSON batches.

    Fixes: the original called ``recovery_information_equipe`` twice per
    article (once for the emptiness test and once more inside ``append``),
    fetching every article twice; the first result is now reused.

    Returns:
        - creation of a json for each new article
    """
    file_json = []
    i = 0
    list_url = recovery_link_new_articles_equipe("https://www.lequipe.fr/rss/")
    for url in list_url:
        soup_url = utils.recovery_flux_url_rss(url)
        items = soup_url.find_all("item")
        # We're picking up every new article in a list
        article_equipe = [re.search(r"<link/>(.*)", str(item))[1]
                          for item in items]
        # Each article is analized one by one
        for article in article_equipe:
            new_article = recovery_information_equipe(article)
            if utils.is_empty(new_article) is False:
                file_json.append(new_article)
                i += 1
            # Flush a batch of 20 articles to disk.
            if i == 20:
                utils.create_json(file_target, file_json, "Equipe_rss/",
                                  "equi")
                i = 0
                file_json = []
    utils.create_json(file_target, file_json, "Equipe_rss/", "equi")
def recovery_old_articles_LD(
        file_target='/var/www/html/projet2018/data/clean/robot/' +
        str(datetime.datetime.now().date())):
    """it create a json for each article

    Fixes: the original used a bare ``except:``, which also swallows
    ``KeyboardInterrupt``/``SystemExit``; it now catches ``Exception`` only.
    """
    list_category = [
        'grand-sud', 'actu', 'faits-divers', 'economie', 'sports', 'sante',
        'tv-people', 'sorties'
    ]
    links_article = []
    list_articles = []
    for cat in list_category:
        # Walk the paginated search results; stop when a page fails to load.
        for i in range(1, 100):
            try:
                url = 'https://www.ladepeche.fr/recherche/?p=' + str(i)\
                    + '&c=' + cat + '&plus-infos=1'
                soup = utils.recovery_flux_url_rss(url)
            except Exception:
                break
            for h2 in soup.find_all('h2'):
                for item in h2.find_all('a'):
                    link = 'https://www.ladepeche.fr' + str(item.get('href'))
                    links_article.append(link)
    # Extract every collected link once, keeping non-empty articles only.
    for link in links_article:
        new_article = recovery_article_ld(link)
        if not utils.is_empty(new_article):
            list_articles.append(new_article)
    utils.create_json(file_target, list_articles, "Ladepeche", "LD")
def recovery_new_articles_lpt(file_target="data/clean/robot/" +
                              str(date.datetime.now().date()) + "/"):
    """Procedure that calls all the others functions and procedures in order
    to collect articles from a newspaper in a file

    Arguments:
        file_target {string} -- path where the articles will be recorded
    """
    for url_theme in collect_url_themes('http://www.lepoint.fr/rss/'):
        theme = re.search("http://www.lepoint.fr/(.*)/rss.xml", url_theme)[1]
        print("---------------------------" + theme +
              "------------------------")
        article_urls = []
        # First listing page, then the paginated index_2..index_9 pages.
        collect_url_articles(article_urls, url_theme)
        for index_page in range(2, 10):
            collect_url_articles(
                article_urls,
                url_theme + "index_" + str(index_page) + ".php")
        dictionaries = []
        collect_articles(dictionaries, article_urls, theme)
        time.sleep(3)  # small pause between themes
        utils.create_json(file_target, dictionaries, "LePoint/", "lpt")
def recuperation_info_lt(file_target="data/clean/robot/" +
                         str(date.datetime.now().date()) + "/"):
    """Fetch La Tribune articles from the RSS page and write them as JSON."""
    sources = "Latribune/"
    articles = article_lt(
        "http://www.latribune.fr/rss/rubriques/actualite.html")
    file_json = fileJson(articles)
    # Make sure the target directory exists before writing.
    if not os.path.exists(file_target + sources):
        os.makedirs(file_target + sources)
    # Call the create_json function
    utils.create_json(file_target, file_json, sources, "lt")
def recovery_new_articles_fusc(
        file_target='/var/www/html/projet2018/data/clean/robot/' +
        str(date.datetime.now().date()) + "/"):
    """ it create a json for each new article """
    rss_page = 'https://www.futura-sciences.com/' + 'flux-rss/'
    collected = []
    for article_link in recovery_link_new_articles(rss_page):
        item = recovery_information_fusc(article_link)
        # Keep only articles whose extraction produced content.
        if not utils.is_empty(item):
            collected.append(item)
    utils.create_json(file_target, collected, 'FuturaSciences', 'fusc')
def recovery_new_article_lg():
    """ Retrieving new articles thanks to the rss feed and create for each
    article a json """
    file_target = "/var/www/html/projet2018/data/clean/robot/"
    url_rss = "http://www.legorafi.fr/feed/"
    collected = []
    for link_article in recovery_link_new_articles_lg(url_rss):
        article = recovery_information_lg(link_article)
        if not utils.is_empty(article):
            print(article)
            collected.append(article)
    utils.create_json(file_target, collected, 'LeGorafi', 'lg')
def recovery_old_article_equi(file_target="data/clean/robot/"):
    """Crawl old L'Equipe articles and flush them as JSON every 20 items."""
    url_rss = "https://www.lequipe.fr/"
    batch = []
    counter = 0
    for link in recovery_link_old_articles_equi(url_rss):
        info = recovery_information_equi(link)
        if utils.is_empty(info) is False:
            batch.append(info)
            counter += 1
        # Flush a batch of 20 articles, then start over.
        if counter == 20:
            utils.create_json(file_target, batch, "Equip_old/", "equi")
            counter = 0
            batch = []
    utils.create_json(file_target, batch, "Equip_old/", "equi")
def add_articles(file_target="data/clean/robot/" +
                 str(date.datetime.now().date()) + "/"):
    """ it create a json for each new article

    Fixes: the original appended an article when ``utils.is_empty`` returned
    True — an inverted test that kept only EMPTY articles and dropped every
    real one; the condition is now negated.
    """
    soup = utils.recovery_flux_url_rss(
        "http://www.20minutes.fr/feeds/rss-actu-france.xml")
    items = soup.find_all("item")
    articles = []
    for item in items:
        # Grab the article link from the RSS item markup.
        url = re.search(r"<link/>(.*)<pubdate>", str(item)).group(1)
        if is_article(url):
            new_article = get_article(url)
            if not utils.is_empty(new_article):
                articles.append(new_article)
    utils.create_json(file_target, articles, "Minutes/", "min")
def add_articles(file_target="/var/www/html/projet2018/data/clean/robot/" +
                 str(date.datetime.now().date()) + "/"):
    """ it creates a json for each new article """
    # Number of listing pages to walk for every Telerama category.
    categories = {
        "cinema": 40,
        "scenes": 30,
        "enfants": 3,
        "idees": 30,
    }
    articles = []
    for category, nbre in categories.items():
        for page in range(0, nbre):
            listing_url = ("http://www.telerama.fr/" + category +
                           "/articles?page=" + str(page))
            articles.extend(get_article_of_category(listing_url))
    utils.create_json(file_target, articles, "Telerama/", "tera")
def recovery_old_article_equi(
        file_target="/var/www/html/projet2018/data/clean/robot/" +
        str(date.datetime.now().date()) + "/"):
    """Crawl old L'Equipe articles and flush them as JSON every 20 items."""
    url_rss = "https://www.lequipe.fr/"
    pending = []
    count = 0
    for link in recovery_link_old_articles_equi(url_rss):
        info = recovery_information_equi(link)
        if utils.is_empty(info) is False:
            pending.append(info)
            count += 1
        # Write out a full batch of 20 and reset the buffer.
        if count == 20:
            utils.create_json(file_target, pending, "Equipe/", "equi")
            count = 0
            pending = []
    utils.create_json(file_target, pending, "Equipe/", "equi")
def recovery_old_articles_sv(
        file_target="C:/Users/Laetitia/Desktop/Groupe4_Robot" +
        str(date.datetime.now().date()) + "/"):
    """Crawl Science & Vie category pages and dump articles as JSON batches.

    Fixes: the original called ``recovery_information_sv`` twice per article
    (once for the emptiness test, once more inside ``append``), fetching and
    parsing every article twice; the first result is now reused.

    Returns:
        - creation of a json for each new article
    """
    list_category = [
        "corps-et-sante", "nature-et-enviro", "ciel-et-espace",
        "technos-et-futur", "cerveau-et-intelligence", "science-et-culture"
    ]
    file_json = []
    i = 0
    for cat in list_category:
        # We retrieve the URL feeds for each page of article
        # Each HTML-coded article is analyzed with beautiful soup
        url_rss_sv = "https://www.science-et-vie.com/" + cat
        soup_url = utils.recovery_flux_url_rss(url_rss_sv)
        article_sv = []
        # We retrieve all the articles for a given page
        for div in soup_url.find_all("div"):
            if div.get("class") == ["title"]:
                for item in div.find_all("a"):
                    links = "https://www.science-et-vie.com/" + \
                        str(item.get("href"))
                    article_sv.append(links)
        # Each article is analized one by one
        for article in article_sv:
            new_article = recovery_information_sv(article)
            if utils.is_empty(new_article) is False:
                file_json.append(new_article)
                i += 1
            if i == 20:
                utils.create_json(file_target, file_json,
                                  "ScienceEtVie_crawler/", "sv")
                i = 0
                file_json = []
    utils.create_json(file_target, file_json, "ScienceEtVie_crawler/", "sv")
def recovery_old_articles_fusc(
        file_target='/var/www/html/projet2018/data/clean/robot/'):
    """ it create a json for each article

    Fixes: ``links_article`` was re-initialised on every sitemap page, so
    links gathered on earlier pages were discarded before being processed;
    links are now accumulated across all 201 pages and processed once.
    """
    url = "https://www.futura-sciences.com/sitemap-html/actualites/"
    url_fusc = "https://www.futura-sciences.com"
    links_article = []
    # Walk the 201 sitemap pages and gather every article link.
    for ii in range(1, 202):
        soup = utils.recovery_flux_url_rss(url + str(ii) + "/")
        for tag_div_link in soup.find_all(
                'div', attrs={"class": "has-divider-bottom latest-item"}):
            links_article.append(url_fusc + tag_div_link.a.get('href'))
    list_articles = []
    for link_article in links_article:
        new_article = recovery_information_fusc(link_article)
        if not utils.is_empty(new_article):
            list_articles.append(new_article)
    utils.create_json(file_target, list_articles, 'FuturaSciences', 'fusc')
def recovery_old_article_lg(
        file_target="/var/www/html/projet2018/data/clean/robot/" +
        str(date.datetime.now().date()) + "/"):
    """ it create a json for each new article """
    url_rss = 'http://www.legorafi.fr/category/'
    pending = []
    count = 0
    for link in recovery_link_old_articles_lg(url_rss):
        info = recovery_information_lg(link)
        if not utils.is_empty(info):
            pending.append(info)
            count += 1
        # Flush to disk every 20 collected articles.
        if count == 20:
            utils.create_json(file_target, pending, 'LeGorafi', 'lg')
            count = 0
            pending = []
    utils.create_json(file_target, pending, 'LeGorafi', 'lg')
def recovery_new_articles_fem(file_target="data/clean/robot/" +
                              str(date.datetime.now().date()) + "/"):
    """Crawl new Femina articles and write them out as JSON batches.

    Returns:
        - creation of a json for each new article
    """
    batch = []
    counter = 0
    for article in recovery_link_new_articles_fem():
        extracted = recovery_information_fem(article)
        if utils.is_empty(extracted) is False:
            batch.append(extracted)
            counter += 1
        # Dump every 20 collected articles, then start a fresh batch.
        if counter == 20:
            utils.create_json(file_target, batch, "Femina_crawler/", "fem")
            counter = 0
            batch = []
    utils.create_json(file_target, batch, "Femina_crawler/", "fem")
def add_articles(
        file_target="/home/etudiant/Documents/ProjetSID/Groupe4_Robot/" +
        "Telerama/Art/" + str(date.datetime.now().date()) + "/"):
    """ it create a json for each new article

    Fixes: the sibling Telerama/20minutes collectors ``extend`` the article
    list with the result of ``get_article_of_category`` (it returns a LIST
    of articles), but this variant ``append``-ed that list, nesting a list
    inside the article list; non-empty results are now merged with
    ``extend``.
    """
    # Number of listing pages to walk for every category.
    categories = {
        "cinema": 5,
        "scenes": 5,
        "enfants": 5,
        "idees": 5,
    }
    articles = []
    for category, nbre in categories.items():
        for i in range(0, nbre):
            url = "http://www.telerama.fr/" + \
                category + "/articles?page=" + str(i)
            page_articles = get_article_of_category(url)
            if utils.is_empty(page_articles) is False:
                articles.extend(page_articles)
    utils.create_json(file_target, articles, "Telerama/", "tera")
def recovery_new_articles_ld(file_target="data/clean/robot/" +
                             str(date.datetime.now().date()) + "/"):
    """Collect new La Depeche articles and flush JSON batches of 50."""
    links = recovery_link_new_articles_ld(
        "https://www.ladepeche.fr/services/flux-rss/")
    batch = []
    count = 0
    for article in links:
        info = recovery_information_ld(article)
        if utils.is_empty(info) is False:
            batch.append(info)
            count += 1
        # Flush a batch of 50 articles, then start over.
        if count == 50:
            utils.create_json(file_target, batch, "ladepeche/", "LD")
            count = 0
            batch = []
    utils.create_json(file_target, batch, "ladepeche/", "LD")
def recovery_new_articles_hum_crawler(file_target="data/clean/robot/" +
                                      str(date.datetime.now().date()) + "/"):
    """Crawl new L'Humanite articles and write them out as JSON batches.

    Returns:
        - creation of a json for each new article
    """
    batch = []
    counter = 0
    # Each url is analized one by one
    for article in recovery_link_new_articles_hum_crawler():
        extracted = recovery_information_hum(article)
        if utils.is_empty(extracted) is False:
            batch.append(extracted)
            counter += 1
        # Dump every 20 collected articles, then start a fresh batch.
        if counter == 20:
            utils.create_json(file_target, batch, "Humanite/", "hum")
            counter = 0
            batch = []
    utils.create_json(file_target, batch, "Humanite/", "hum")
def recovery_old_articles_sv(
        file_target='/var/www/html/projet2018/data/clean/robot/' +
        str(date.datetime.now().date()) + "/"):
    """Crawl Science & Vie categories and dump articles as JSON batches.

    Fixes: the original called ``recovery_information_sv`` twice per article
    (once for the emptiness test, once more inside ``append``), fetching and
    parsing every article twice; the first result is now reused.

    Returns:
        - creation of a json for each new article
    """
    list_category = [
        'corps-et-sante', 'nature-et-enviro', 'ciel-et-espace',
        'technos-et-futur', 'cerveau-et-intelligence', 'science-et-culture'
    ]
    list_articles = []
    i = 0
    for cat in list_category:
        url_rss_sv = 'https://www.science-et-vie.com/' + cat
        soup_url = utils.recovery_flux_url_rss(url_rss_sv)
        article_sv = []
        # We retrieve all the articles for a given page
        for div in soup_url.find_all('div', attrs={'class': 'title'}):
            for item in div.find_all("a"):
                links = 'https://www.science-et-vie.com/' + \
                    str(item.get('href'))
                article_sv.append(links)
        # Each article is analized one by one
        for article in article_sv:
            new_article = recovery_information_sv(article)
            if not utils.is_empty(new_article):
                list_articles.append(new_article)
                i += 1
            if i == 20:
                utils.create_json(file_target, list_articles,
                                  'ScienceEtVie/', 'sv')
                i = 0
                list_articles = []
    utils.create_json(file_target, list_articles, 'ScienceEtVie/', 'sv')
def recovery_old_article_minutes(
        file_target="/var/www/html/projet2018/data/clean/robot/" +
        str(date.datetime.now().date()) + "/"):
    """Crawl the 20 Minutes navigation menu and dump category articles."""
    # Target directory of the articles.
    source = "Minutes/"
    wanted_themes = [
        "default", "entertainment", "sport", "economy", "hightech", "planet"
    ]
    soup = utils.recovery_flux_url_rss("http://www.20minutes.fr")
    articles = []
    for category in soup.find("nav", class_="header-nav").find_all("li"):
        url = category.find("a").get("href")
        theme = unidecode(category['data-theme'])
        # Only collect the main editorial themes.
        if theme in wanted_themes:
            articles.extend(get_article_of_category(url))
    utils.create_json(file_target, articles, source, "min")
def recovery_new_articles_lpt(
        file_target="C:/Users/cmisid/Documents/TableauDeBord/LESOIR/" +
        str(date.datetime.now().date()) + "/"):
    """Collect Le Soir archive articles about terrorism and dump them as JSON.

    Fixes: ``a.get('href')`` returns ``None`` for anchors without an href
    attribute, which crashed ``tit.split('d')`` with ``AttributeError``;
    such anchors are now skipped.
    """
    list_url_articles = []
    j = 0
    # Walk the archive search results (10 results per page) for two keywords.
    for i in range(0, 1650, 10):
        j = j + 1
        url1 = 'http://www.lesoir.be/archives/recherche?datefilter=lastyear&sort=date+desc&start=' + str(i) + '&word=terrorisme'
        soup1 = utils.recovery_flux_url_rss(url1)
        for a in soup1.find_all('a'):
            tit = a.get('href')
            # '/archive/' is the prefix before the first 'd' of '/archive/d...'
            if tit and '/archive/' in tit.split('d'):
                url = 'http://www.lesoir.be' + tit
                list_url_articles.append(url)
        ######################
        url2 = 'http://www.lesoir.be/archives/recherche?datefilter=lastyear&sort=date+desc&start=' + str(i) + '&word=attentat'
        soup2 = utils.recovery_flux_url_rss(url2)
        for a in soup2.find_all('a'):
            tit = a.get('href')
            if tit and '/archive/' in tit.split('d'):
                url = 'http://www.lesoir.be' + tit
                list_url_articles.append(url)
        # Pause every 3 pages to avoid hammering the server.
        if (j == 3):
            time.sleep(71)
            j = 0
    list_dictionaries = []
    list_titre = []
    collect_articles(list_dictionaries, list_url_articles, list_titre)
    utils.create_json(file_target, list_dictionaries, "lesoir/", "lsr")
def recovery_new_articles_libe(
        file_target="/var/www/html/projet2018/data/clean/robot/" +
        str(date.datetime.now().date()) + "/"):
    """Procedure that calls all the others functions and procedures in order
    to collect articles from a newspaper in a file

    Fixes: comparison to ``None`` now uses ``is not None`` (PEP 8) instead
    of ``!= None``.

    Arguments:
        file_target {string} -- path where the articles will be recorded
    """
    list_dictionaries = []
    list_url_articles = collect_url_articles('http://www.liberation.fr/')
    number_articles = 0
    for url_article in list_url_articles:
        article = collect_article(url_article)
        if article is not None:
            list_dictionaries.append(article)
        # Buffer: flush every 50 processed URLs.
        number_articles += 1
        if number_articles % 50 == 0:
            utils.create_json(file_target, list_dictionaries,
                              'Liberation/', 'libe')
            list_dictionaries.clear()
    utils.create_json(file_target, list_dictionaries, 'Liberation/', 'libe')
def recovery_new_articles_noob_rss(file_target="data/clean/robot/" +
                                   str(date.datetime.now().date()) + "/"):
    """Collect new Nouvel Obs articles from the RSS feeds and dump JSON.

    Fixes: the final flush wrote to "NouvelObs/" while the in-loop flushes
    wrote to "NouvelObs_rss/", splitting one run's output across two
    folders; both now use "NouvelObs_rss/". The photo-gallery filter
    pattern is also a raw string without the redundant ``\\/``/``\\-``
    escapes (which emit DeprecationWarnings on modern Python).

    Returns:
        - creation of a json for each new article
    """
    file_json = []
    i = 0
    # Each url is analized one by one
    list_url = recovery_link_new_articles_noob_rss("http://www.nouvelobs." +
                                                   "com/rss/")
    for url in list_url:
        soup_url = utils.recovery_flux_url_rss(url)
        items = soup_url.find_all("item")
        article_noob = []
        # We're picking up every new article in a list
        for item in items:
            link_article = re.search(r"<link/>(.*)", str(item))[1]
            link_article = link_article.split("<description>")[0]
            article_noob.append(link_article)
            # Photo galleries are not articles: drop them again.
            if re.search(r"/galeries-photos/", link_article):
                article_noob.remove(link_article)
        # Each article is analized one by one
        for article in article_noob:
            new_article = recovery_information_noob(article)
            if utils.is_empty(new_article) is False:
                file_json.append(new_article)
                i += 1
            if i == 20:
                utils.create_json(file_target, file_json,
                                  "NouvelObs_rss/", "noob")
                i = 0
                file_json = []
    utils.create_json(file_target, file_json, "NouvelObs_rss/", "noob")
def recovery_new_articles_lt(file_target="C:/Users/lea/Desktop/PROJET/" +
                             str(date.datetime.now().date()) + "/"):
    """Crawl La Tribune category archive pages and dump articles as JSON."""
    list_category = [
        "actualites/economie/economie", "Entreprises-secteurs",
        "media-telecom-entreprise", "finance-patrimoine-investir",
        "opinions", "regions/economie-en-region"
    ]
    file_json = []
    articles_latribune = []
    # We retrieve the URL feeds for each page of article
    for cat in list_category:
        url_latribune = "https://www.latribune.fr/" + cat + ".html"
        soup_url = utils.recovery_flux_url_rss(url_latribune)
        for ul in soup_url.find_all("ul"):
            if ul.get("class") != ['pagination-archive', 'pages']:
                continue
            for li in ul.find_all("li"):
                for a in li.find_all("a"):
                    page_url = "https://www.latribune.fr" + a.get("href")
                    soup_url = utils.recovery_flux_url_rss(page_url)
                    # Article links live in divs whose class names
                    # contain 'title-river'.
                    for div in soup_url.find_all("div"):
                        for valeur in re.finditer('title-river',
                                                  str(div.get("class"))):
                            for a_tag in div.find_all('a'):
                                articles_latribune.append(a_tag.get("href"))
    # Each article is analized one by one
    for article in articles_latribune:
        new_article = recovery_information_lt(article)
        if utils.is_empty(new_article) is False:
            file_json.append(new_article)
    utils.create_json(file_target, file_json, "latribune_crawler/", "lt")
soup_url = utils.recovery_flux_url_rss(url_rss_sv) article_sv = [] # We retrieve all the articles for a given page for div in soup_url.find_all('div', attrs={'class': 'title'}): for item in div.find_all("a"): links = 'https://www.science-et-vie.com/' + \ str(item.get('href')) article_sv.append(links) # Each article is analized one by one for article in article_sv: new_article = recovery_information_sv(article) if not utils.is_empty(new_article): list_articles.append(recovery_information_sv(article)) i += 1 if i == 20: utils.create_json(file_target, list_articles, 'ScienceEtVie/', 'sv') i = 0 list_articles = [] utils.create_json(file_target, list_articles, 'ScienceEtVie/', 'sv') if __name__ == '__main__': recovery_old_articles_sv() # /var/www/html/projet2018/data/clean/robot/
# content content = "" for h2 in soup_article.find_all('h2'): if h2.get("class") == ['article-full__header']: content = h2.get_text() + " " for div in soup_article.find_all('div'): if div.get("class") == ['article-full__body-content']: for b in div.find_all('b'): b.string = "" for a in div.find_all('a'): a.string = "" content += div.get_text() + " " data = [{ "title": title, "newspaper": "leparisien", "author": author, "date_publi": date_p, "theme": categorie, "content": content }] # Mis sous json les articles erreur = "non" for tit in titre: if title == tit: erreur = "oui" if len(content) > 10 and erreur == "non": titre.append(title) utilsg4.create_json(fileTarget, data, "leparisien", "lp")