def get_rss_infos():
    """Get all article links.

    Returns:
        list -- list of article urls
    """
    url_rss_lib = "http://www.liberation.fr/rss"
    soup = utils.recovery_flux_url_rss(url_rss_lib)
    rss_items = soup.find_all("li")
    rss_list = []
    link_rss = []
    for ri in rss_items:
        if ri.get("class") == ['rss-item']:
            rss_list.append(ri.a.get('href'))
    for rl in rss_list:
        soup = utils.recovery_flux_url_rss(rl)
        entre = soup.find_all('entry')
        for e in entre:
            link_rss.append(e.link.get('href'))
    return link_rss
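
# Every crawler below relies on utils.recovery_flux_url_rss(url). Its implementation
# is not shown in this section; as an assumption, a minimal sketch of such a helper
# could look like this (name, timeout and error handling are illustrative only):
import requests
from bs4 import BeautifulSoup


def recovery_flux_url_rss_sketch(url):
    """Fetch a page or RSS feed and return it parsed with BeautifulSoup."""
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")
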
def recovery_link_new_articles_ld(url_rss):
    # We retrieve the rss feeds for each article page.
    # Each HTML-coded article is scanned with beautiful soup.
    soup = utils.recovery_flux_url_rss(url_rss)
    list_link = []
    for link in soup.find_all("a"):
        if link.get("class") == ["rss"]:
            url = "https://www.ladepeche.fr/" + link.get("href")
            soup_feed = utils.recovery_flux_url_rss(url)
            items = soup_feed.find_all("item")
            # We retrieve all articles
            for item in items:
                list_link.append(re.search(r"<link/>(.*)", str(item))[1])
    return list_link
def is_article(url):
    """Take a url as argument and return True if the page is an article,
    False otherwise."""
    soup = utils.recovery_flux_url_rss(url)
    return soup.find("div", class_="article--text") is not None
def recovery_old_articles_LD(
        file_target='/var/www/html/projet2018/data/clean/robot/' +
        str(datetime.datetime.now().date())):
    """Create a json file for each article."""
    list_category = [
        'grand-sud', 'actu', 'faits-divers', 'economie', 'sports', 'sante',
        'tv-people', 'sorties'
    ]
    links_article = []
    list_articles = []
    for cat in list_category:
        for i in range(1, 100):
            try:
                url = 'https://www.ladepeche.fr/recherche/?p=' + str(i)\
                    + '&c=' + cat + '&plus-infos=1'
                soup = utils.recovery_flux_url_rss(url)
            except Exception:
                break
            for h2 in soup.find_all('h2'):
                for item in h2.find_all('a'):
                    link = 'https://www.ladepeche.fr' + str(item.get('href'))
                    links_article.append(link)
    for link in links_article:
        new_article = recovery_article_ld(link)
        if not utils.is_empty(new_article):
            list_articles.append(new_article)
    utils.create_json(file_target, list_articles, "Ladepeche", "LD")
def get_article(url):
    """Take a url as argument and return an article dictionary."""
    from unidecode import unidecode
    soup = utils.recovery_flux_url_rss(url)
    article = soup.find("article")
    # Theme and title are both carried by the og:title meta tag
    meta = soup.find("meta", property="og:title").get("content")
    tab = meta.split("-")
    n = len(tab)
    theme = tab[n - 2]
    title = "-".join(tab[:n - 2])
    # Authors
    authors = []
    regex = re.compile(r'[\n\r\t]')
    for span in article.find_all("span", class_="author--name"):
        author = regex.sub("", unidecode(span.get_text()))
        authors.append(author.strip())
    # Publication date
    date_pub = article.find(
        "span",
        itemprop="datePublished").get("datetime")[:10].replace("-", "/")
    date_pub = str(date.datetime.strptime(date_pub, "%d/%m/%Y").date())
    # Content of the article
    content = ""
    for div in article.find_all("div", class_=[
            "article--intro", "article--wysiwyg", "article--footnotes"]):
        for p in div.find_all("p"):
            content = content + p.get_text()
    content = regex.sub("", content)
    return utils.recovery_article(title, "Telerama", authors, date_pub,
                                  content, theme)
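
# get_article() above splits the og:title meta on "-" and keeps the second-to-last
# segment as the theme, the rest as the title. A hypothetical og:title value
# (illustrative only, not scraped from the site) shows the intent:
meta_example = "Concerts, spectacles - Sortir - Telerama.fr"
tab_example = meta_example.split("-")
theme_example = tab_example[len(tab_example) - 2]             # " Sortir "
title_example = "-".join(tab_example[:len(tab_example) - 2])  # "Concerts, spectacles "
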
def get_article(url):
    soup = utils.recovery_flux_url_rss(url)
    article = soup.find("article")
    # Article title
    title = article.find("h1").get_text()
    # Empty list when there is no author, otherwise list of author(s)
    if article.find("header").find("p", class_="authorsign-label") is None:
        authors = []
    else:
        authors = article.find("header").find(
            "p", class_="authorsign-label").get_text().split(" et ")
    # Publication date of the article
    date_pub = article.find("time").get("datetime")[0:10]
    # Theme of the article
    theme = article.find("ol", class_="breadcrumb-list").find_all("li")[1]\
        .find("span").get_text()
    # Content of the article
    content = ""
    for p in article.find("div", class_="content").find_all("p"):
        content = content + p.get_text()
    # Name of the newspaper
    newspaper = soup.find("footer").find(has_copyright).find("a").get_text()
    return utils.recovery_article(title, newspaper, authors, date_pub,
                                  content, theme)
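
# utils.recovery_article() is used by every crawler in this section but its body is
# not shown. Based on the dictionary documented in the get_article() variant below and
# built explicitly in fileJson(), a plausible sketch, offered as an assumption only:
def recovery_article_sketch(title, newspaper, authors, date_publi, content, theme):
    """Assemble the scraped fields into the article dictionary used by the crawlers."""
    return {
        "title": title,
        "newspaper": newspaper,
        "author": authors,
        "date_publi": date_publi,
        "content": content,
        "theme": theme
    }
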
def recovery_link_new_articles_noob_crawler():
    """
    Returns:
        - list of article urls collected from the category pages
    """
    list_category = ["politique", "monde", "economie", "culture",
                     "editos-et-chroniques", "debat"]
    article_noob = []
    for cat in list_category:
        # We retrieve the URL feeds for each page of article
        # Each HTML-coded article is analyzed with beautiful soup
        for i in range(2, 8):
            url_rss_noob = "http://www.nouvelobs.com/" + cat +\
                "/page-" + str(i) + ".html"
            soup_url = utils.recovery_flux_url_rss(url_rss_noob)
            # We retrieve all the articles for a given page
            for h3 in soup_url.find_all('h3'):
                if h3.get("class") == ['title']:
                    if re.search(r'^/', str(h3.a.get("href"))):
                        new_article = "http://www.nouvelobs.com" +\
                            h3.a.get("href")
                        article_noob.append(new_article)
    return article_noob
def recovery_link_old_articles_equi(url_rss):
    """
    Argument:
        url_rss : string
    Return:
        link_article : list
    Retrieving links of new articles thanks to the rss feed
    """
    list_category = [
        "Athletisme", "Aussi/Aviron", "Auto-moto", "Aussi/Badminton",
        "Aussi/Baseball", "Basket", "Aussi/Biathlon", "Aussi/Boxe",
        "Aussi/Canoe-kayak", "Cyclisme", "Aussi/Equitation", "Aussi/Escrime",
        "Adrenaline/Escalade", "Football", "Aussi/Football-americain",
        "Formule-1", "Golf", "Aussi/Gymnastique", "Aussi/Halterophilie",
        "Handball", "Hippisme", "Aussi/Hockey-sur-gazon", "Aussi/Judo",
        "Natation", "Basket/NBA", "Aussi/Pentathlon-moderne", "Rugby",
        "Sports-de-combat", "Sports-us", "Aussi/Squash", "Adrenaline/Surf",
        "Tennis", "Aussi/Tennis-de-table", "Aussi/Tir", "Aussi/Tir-a-l-arc",
        "Aussi/Triathlon", "Aussi/Mma", "Voile", "Aussi/Volley-ball",
        "Natation/Water-polo", "Aussi/Jeux-paralympiques"
    ]
    # We retrieve the URL feeds for each page of category
    link_article = []
    for cat in list_category:
        url_rss_cat = url_rss + cat + "/"
        soup = utils.recovery_flux_url_rss(url_rss_cat)
        # We retrieve all the articles for a given page
        for div in soup.find_all('div'):
            if div.get("class") == ['home__colead__split']:
                new_article = "https://www.lequipe.fr" + div.a.get("href")
                link_article.append(new_article)
    return link_article
def recovery_link_new_articles_hum_crawler():
    """
    Returns:
        - list of article urls collected from the category pages
    """
    list_category = [
        "politique", "société", "social-eco", "culture", "sports", "monde",
        "environnement", "rubriques/en-debat"
    ]
    article_humanite = []
    for cat in list_category:
        # We retrieve the URL feeds for each page of article
        # Each HTML-coded article is analyzed with beautiful soup
        for i in range(2, 10):
            try:
                url_rss_humanite = "https://humanite.fr/" + cat + "?page=" +\
                    str(i) + "/feed/"
                soup_url = utils.recovery_flux_url_rss(url_rss_humanite)
                # We retrieve all the articles for a given page
                for div in soup_url.find_all('div'):
                    if re.search('field-name-field-news-chapo',
                                 str(div.get("class"))):
                        for a in div.find_all('a'):
                            article_humanite.append(a.get("href"))
            except Exception:
                break
    return article_humanite
def recovery_new_articles_equipe(file_target="data/clean/robot/" +
                                 str(date.datetime.now().date()) + "/"):
    """
    Returns:
        - creation of a json for each new article
    """
    file_json = []
    i = 0
    list_url = recovery_link_new_articles_equipe(
        "https://www.lequipe.fr/rss/")
    for url in list_url:
        soup_url = utils.recovery_flux_url_rss(url)
        items = soup_url.find_all("item")
        article_equipe = []
        # We're picking up every new article in a list
        for item in items:
            article_equipe.append(re.search(r"<link/>(.*)", str(item))[1])
        # Each article is analyzed one by one
        for article in article_equipe:
            new_article = recovery_information_equipe(article)
            if not utils.is_empty(new_article):
                file_json.append(new_article)
                i += 1
            # Write the articles out in batches of 20
            if i == 20:
                utils.create_json(file_target, file_json, "Equipe_rss/",
                                  "equi")
                i = 0
                file_json = []
    utils.create_json(file_target, file_json, "Equipe_rss/", "equi")
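
# utils.create_json() is called with a target directory, the list of article
# dictionaries, a feed name and a short file prefix. Its real implementation is not
# shown in this section; the following is only a minimal sketch of a helper that would
# fit these call sites, assuming one timestamped JSON file is written per call.
import json
import os
from datetime import datetime as dt


def create_json_sketch(file_target, list_articles, feed_name, prefix):
    """Dump the collected articles into a timestamped JSON file under file_target."""
    directory = os.path.join(file_target, feed_name)
    os.makedirs(directory, exist_ok=True)
    filename = prefix + "_" + dt.now().strftime("%Y%m%d%H%M%S%f") + ".json"
    with open(os.path.join(directory, filename), "w", encoding="utf-8") as output:
        json.dump(list_articles, output, ensure_ascii=False)
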
def recovery_information_sv(url_article):
    """
    Arguments:
        - url of one article
    Returns:
        - information of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)
    # title
    title = soup_article.find('h1', attrs={'class': 'like-h1'}).get_text()
    # date
    date = soup_article.find("time")["datetime"]
    # author
    author = []
    for span in soup_article.find_all('span', attrs={'class': 'author'}):
        author.append(span.span.get_text())
    # content
    content = ''
    for div in soup_article.find_all('div',
                                     attrs={'class': ['content', 'left']}):
        for p in div.find_all('p'):
            content += p.get_text() + ' '
    # theme, taken from the content attribute of the article:tag meta
    tag_meta = soup_article.find('meta', attrs={'property': 'article:tag'})
    theme = tag_meta.get('content')
    article = utils.recovery_article(title, 'Scienceetvie', author, date,
                                     content, theme)
    return article
def get_article_of_category(url):
    result = []
    soup = utils.recovery_flux_url_rss(url)
    articles = soup.find_all('article')
    for article in articles:
        url_article = "http://www.20minutes.fr" + article.find("a").get("href")
        # Append the new article to the result list
        if is_article(url_article):
            result.append(get_article(url_article))
    return result
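
# A possible way to drive get_article_of_category() above; the listing URL below is an
# illustrative placeholder, not necessarily a real 20minutes.fr category page.
def demo_collect_20minutes_category(category_url="http://www.20minutes.fr/sport/"):
    """Collect every detected article on one category listing page and report a count."""
    articles = get_article_of_category(category_url)
    print(len(articles), "articles collected from", category_url)
    return articles
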
def is_article(url):
    """
    Arguments:
        - URL address
    Returns:
        - True if the page contains an article
        - False otherwise
    """
    soup = utils.recovery_flux_url_rss(url)
    article = soup.find("article")
    return article is not None
def recovery_link_new_articles(url_rss):
    """
    Argument:
        url_rss : string
    Return:
        Retrieving links of new articles thanks to the rss feed
    """
    soup = utils.recovery_flux_url_rss(url_rss)
    list_link = []
    for link in soup.find_all('a', attrs={'class': 'first-capitalize'}):
        list_link.append('https://www.futura-sciences.com' + link.get('href'))
    return list_link
def recovery_information_noob(url_article):
    """
    Arguments:
        - url of one article
    Returns:
        - information of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)
    title = soup_article.title.get_text()
    # Retrieval of the publication date
    find_date = soup_article.find('time', attrs={"class": "date"})
    for a in find_date.find_all('a'):
        find_valeur = re.compile(r'[0-9]{4}/[0-9]{2}/[0-9]{2}')
        for valeur in find_valeur.finditer(str(a.get("href"))):
            date_p = valeur.group(0)
            date_p = datetime.strptime(date_p, "%Y/%m/%d")\
                .strftime("%Y-%m-%d")
    # Retrieval of the author of the article
    author = []
    for div in soup_article.find_all('div'):
        if re.search('author', str(div.get("class"))):
            author.append(div.p.span.get_text())
    # Retrieval of the article theme
    theme = ""
    for nav in soup_article.find_all('nav'):
        if nav.get("class") == ['breadcrumb']:
            for ol in nav.find_all('ol'):
                for a in ol.find_all('a'):
                    theme = a.get_text()
    # Retrieving the content of the article
    contents = ""
    for div in soup_article.find_all('div'):
        if re.search('body', str(div.get("id"))):
            # Blank out asides, "lire" links and images before extracting text
            for aside in div.find_all('aside'):
                for p in aside.find_all('p'):
                    p.string = ""
            for p in div.find_all('p'):
                for a in p.find_all('a'):
                    if a.get("class") == ['lire']:
                        a.string = ""
                for img in p.find_all('img'):
                    p.string = ""
                contents += p.get_text() + " "
    article = utils.recovery_article(title, 'NouvelObservateur', author,
                                     date_p, contents, theme)
    return article
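
# recovery_information_noob() pulls the publication date out of a link href matching
# the YYYY/MM/DD pattern and reformats it. A small illustration on a made-up href (not
# a real nouvelobs.com URL):
import re
from datetime import datetime

href_example = "http://www.nouvelobs.com/politique/2018/03/16/"  # made-up href
match = re.search(r'[0-9]{4}/[0-9]{2}/[0-9]{2}', href_example)
if match:
    date_example = datetime.strptime(match.group(0),
                                     "%Y/%m/%d").strftime("%Y-%m-%d")
    # date_example == "2018-03-16"
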
def get_article_of_category(url):
    """Take a category page url as argument and return all the articles of
    that category."""
    result = []
    soup = utils.recovery_flux_url_rss(url)
    articles = soup.find_all("div", class_="item--body")
    for article in articles:
        url_article = article.find("a").get("href")
        if is_article(url_article):
            result.append(get_article(url_article))
    return result
def recovery_new_articles_lpt(
        file_target="C:/Users/cmisid/Documents/TableauDeBord/LESOIR/" +
        str(date.datetime.now().date()) + "/"):
    list_url_articles = []
    j = 0
    for i in range(0, 1650, 10):
        j = j + 1
        url1 = ('http://www.lesoir.be/archives/recherche?datefilter=lastyear'
                '&sort=date+desc&start=' + str(i) + '&word=terrorisme')
        soup1 = utils.recovery_flux_url_rss(url1)
        for a in soup1.find_all('a'):
            tit = a.get('href')
            if tit and '/archive/' in tit.split('d'):
                url = 'http://www.lesoir.be' + tit
                list_url_articles.append(url)
        url2 = ('http://www.lesoir.be/archives/recherche?datefilter=lastyear'
                '&sort=date+desc&start=' + str(i) + '&word=attentat')
        soup2 = utils.recovery_flux_url_rss(url2)
        for a in soup2.find_all('a'):
            tit = a.get('href')
            if tit and '/archive/' in tit.split('d'):
                url = 'http://www.lesoir.be' + tit
                list_url_articles.append(url)
        # Pause every third page of results to avoid hammering the site
        if j == 3:
            time.sleep(71)
            j = 0
    list_dictionaries = []
    list_titre = []
    collect_articles(list_dictionaries, list_url_articles, list_titre)
    utils.create_json(file_target, list_dictionaries, "lesoir/", "lsr")
def collect_articles(list_dictionaries, list_url_articles, list_titre):
    j = 0
    for url_article in list_url_articles:
        j = j + 1
        soup = utils.recovery_flux_url_rss(url_article)
        # Find the title
        for titl in soup.find_all('title'):
            tit = titl.get_text()
            if len(tit.split('-')) == 2:
                title = tit.split('-')[0]
        # Find the authors
        authors = []
        for a in soup.find_all('a'):
            if a.get('href') is not None:
                if "dpi-authors" in a.get('href').split('/'):
                    tit = a.get('href').split('/')[-1]
                    authors.append(tit.split('-')[0] + ' ' +
                                   tit.split('-')[1])
        if len(authors) == 0:
            authors.append('')
        # Find the publication date
        dates = []
        date_publication = []
        for balise_time in soup.find_all('time'):
            if 'pubdate' in balise_time.get('class'):
                dates.append(balise_time.get('datetime').split('T')[0])
                date_publication.append(
                    balise_time.get('datetime').split('T')[0])
        theme = re.search("www.lesoir.be/(.*)/", url_article)[1]
        content = ''
        for p in soup.find_all('p'):
            if len(p.get_text().split(" ")) >= 2:
                content += p.get_text()
        new_article = utils.recovery_article(title, 'lesoir', authors,
                                             date_publication, content,
                                             theme)
        # Pause every third article to avoid hammering the site
        if j == 3:
            time.sleep(71)
            j = 0
        # Keep only non-empty articles with enough content and a new title
        if not utils.is_empty(new_article):
            erreur = "non"
            for tit in list_titre:
                if title == tit:
                    erreur = "oui"
            if len(content) > 10 and erreur == "non":
                list_titre.append(title)
                list_dictionaries.append(new_article)
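
# The duplicate check above walks list_titre for every article. A set-based variant
# (an alternative sketch, not the project's code) gives the same behaviour with
# constant-time membership tests:
def is_new_title(title, seen_titles):
    """Return True and remember the title if it has not been collected yet."""
    if title in seen_titles:
        return False
    seen_titles.add(title)
    return True
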
def recovery_link_new_articles_lg(url_rss):
    """
    Argument:
        url_rss : string
    Return:
        links_article_gorafi : list
    Retrieving links of new articles thanks to the rss feed
    """
    soup = utils.recovery_flux_url_rss(url_rss)
    items = soup.find_all('item')
    links_article_gorafi = []
    for item in items:
        links_article_gorafi.append(re.search(r"<link/>(.*)", str(item))[1])
    return links_article_gorafi
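
# Several crawlers above extract feed links with re.search(r"<link/>(.*)", str(item)).
# The reason is that BeautifulSoup's html.parser treats <link> as a void element, so
# the URL inside an RSS <item> ends up as text right after a self-closing <link/>.
# A small illustration on a made-up feed fragment (not real feed content):
import re
from bs4 import BeautifulSoup

feed_snippet = ("<item><title>Exemple</title>"
                "<link>http://example.org/a1</link>\n</item>")
item_example = BeautifulSoup(feed_snippet, "html.parser").find("item")
url_example = re.search(r"<link/>(.*)", str(item_example))[1]
# url_example == "http://example.org/a1"
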
def get_article(url):
    """
    Arguments:
        - URL address
    Returns:
        - An article
        {
            "title": str,
            "newspaper": str,
            "author": [str],
            "date_publi": str,
            "content": str,
            "theme": str
        }
    """
    soup = utils.recovery_flux_url_rss(url)
    article = soup.find("article")
    # Article title
    title = article.find("h1").get_text()
    # Empty list when there is no author, otherwise list of author(s)
    authors = [] if article.find("header").find(
        "p", class_="authorsign-label") is None else unidecode(
            article.find("header").find(
                "p", class_="authorsign-label").get_text()).split(" et ")
    # Publication date of the article
    date_pub = article.find("time").get("datetime")[:10]
    # Theme of the article
    theme = article.find("ol", class_="breadcrumb-list")\
        .find_all("li")[1].find("span").get_text()
    # Content of the article
    content = ""
    for p in article.find("div", class_="content").find_all("p"):
        content = content + p.get_text() + " "
    # Name of the newspaper
    newspaper = soup.find("footer").find(has_copyright).find("a").get_text()
    # Strip \n \r \t from the content
    regex = re.compile(r'[\n\r\t]')
    content = regex.sub("", content)
    return utils.recovery_article(
        unidecode(title), unidecode(newspaper), authors, str(date_pub),
        unidecode(content), unidecode(theme))
def get_article_of_category(url):
    """
    Arguments:
        - Category page url
    Returns:
        - All articles of this category
    """
    result = []
    soup = utils.recovery_flux_url_rss(url)
    articles = soup.find_all("div", class_="item--body")
    for article in articles:
        url_article = article.find("a").get("href")
        if is_article(url_article):
            result.append(get_article(url_article))
    return result
def recovery_new_articles_lt(file_target="C:/Users/lea/Desktop/PROJET/" +
                             str(date.datetime.now().date()) + "/"):
    list_category = [
        "actualites/economie/economie", "Entreprises-secteurs",
        "media-telecom-entreprise", "finance-patrimoine-investir",
        "opinions", "regions/economie-en-region"
    ]
    file_json = []
    articles_latribune = []
    # We retrieve the URL feeds for each page of article
    for cat in list_category:
        url_latribune = "https://www.latribune.fr/" + cat + ".html"
        soup_url = utils.recovery_flux_url_rss(url_latribune)
        for ul in soup_url.find_all("ul"):
            if ul.get("class") == ['pagination-archive', 'pages']:
                for li in ul.find_all("li"):
                    for a in li.find_all("a"):
                        link = a.get("href")
                        link2 = "https://www.latribune.fr" + link
                        soup_page = utils.recovery_flux_url_rss(link2)
                        for div in soup_page.find_all("div"):
                            for valeur in re.finditer(
                                    'title-river', str(div.get("class"))):
                                for a in div.find_all('a'):
                                    articles_latribune.append(a.get("href"))
    # Each article is analyzed one by one
    for article in articles_latribune:
        new_article = recovery_information_lt(article)
        if not utils.is_empty(new_article):
            file_json.append(new_article)
    utils.create_json(file_target, file_json, "latribune_crawler/", "lt")
def fileJson(article_latribune):
    file_json = []
    for article in article_latribune:
        soup = utils.recovery_flux_url_rss(article)
        # Retrieve the title
        title = soup.title.string
        # Retrieve the theme
        for li in soup.find_all('li'):
            if li.get("itemprop") == 'itemListElement':
                theme = li.a.span.get_text()
        # Retrieve the author
        author = []
        for span in soup.find_all('span'):
            if span.get("class") == ['author-name']:
                author.append(span.a.span.get_text())
        # Retrieve the publication date
        for time_tag in soup.find_all('time'):
            if time_tag.get("itemprop") == 'datePublished':
                for valeur in re.finditer(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                                          str(time_tag)):
                    date = valeur.group(0)
                    date = datetime.strptime(date, "%d/%m/%Y")\
                        .strftime("%Y-%m-%d")
        # Retrieve the content
        content = ""
        for div in soup.find_all('div'):
            if div.get("itemprop") == 'articleBody':
                for p in div.find_all('p'):
                    content += p.get_text() + " "
        new_article = {
            "title": title,
            "newspaper": "La tribune",
            "author": author,
            "date_publi": date,
            "content": content,
            "theme": theme
        }
        # Add each new article to the file_json list
        if not utils.is_empty(new_article):
            file_json.append(new_article)
    return file_json
def recovery_information_hum(url_article):
    """
    Arguments:
        - url of one article
    Returns:
        - information of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)
    # Title
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'og:title':
            title = meta.get("content")
    # Theme
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:section':
            theme = meta.get("content")
    # Authors
    author = []
    for h2 in soup_article.find_all('h2'):
        for a in h2.find_all('a'):
            if re.search('auteur', str(a.get("href"))):
                author.append(a.get_text())
    # Publication date
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:published_time':
            raw_date = meta.get("content")
            date_p = raw_date[0:10]
            date_p = str(datetime.strptime(date_p, "%Y-%m-%d").date())
    # Content
    contents = ""
    for div in soup_article.find_all('div'):
        if div.get("class") == [
                'field', 'field-name-field-news-chapo',
                'field-type-text-long', 'field-label-hidden'
        ]:
            for p in div.find_all('p'):
                contents += p.get_text()
        if div.get("class") == [
                'field', 'field-name-field-news-text',
                'field-type-text-long', 'field-label-hidden'
        ]:
            for p in div.find_all('p'):
                contents += p.get_text()
    article = utils.recovery_article(title, 'Humanite', author, date_p,
                                     contents, theme)
    return article
def recovery_information_fem(url_article):
    """
    Arguments:
        - url of one article
    Returns:
        - information of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)
    # Title
    title = soup_article.title.get_text()
    title = title.split(" - ")
    title = title[0]
    # Publication date
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:published_time':
            raw_date = meta.get("content")
            date_p = raw_date[0:10]
    # Authors
    author = []
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:author':
            author.append(meta.get("content"))
    # Theme, taken from the Index link of the page
    theme = ""
    for link in soup_article.find_all('link'):
        if link.get("rel") == ['Index']:
            link_theme = link.get("href")
            part_link = link_theme.split("/")
            theme = part_link[3]
    # Content
    contents = ""
    for div in soup_article.find_all('div'):
        if div.get("class") == ['chapo']:
            for p in div.find_all('p'):
                contents += p.get_text() + " "
        if div.get("class") == ['contenu']:
            for p in div.find_all('p'):
                contents += p.get_text() + " "
        if div.get("class") == ['diaporama']:
            for p in div.find_all('p'):
                contents += p.get_text() + " "
    contents = re.sub(r"\s\s+", " ", contents)
    article = utils.recovery_article(title, 'Femina', author, date_p,
                                     contents, theme)
    return article
def add_articles(file_target="data/clean/robot/" +
                 str(date.datetime.now().date()) + "/"):
    """Create a json file for each new article."""
    soup = utils.recovery_flux_url_rss(
        "http://www.20minutes.fr/feeds/rss-actu-france.xml")
    items = soup.find_all("item")
    articles = []
    for item in items:
        # Retrieve the link of each article
        url = re.search(r"<link/>(.*)<pubdate>", str(item)).group(1)
        if is_article(url):
            new_article = get_article(url)
            if not utils.is_empty(new_article):
                articles.append(new_article)
    utils.create_json(file_target, articles, "Minutes/", "min")
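
# utils.is_empty() decides whether a scraped article is worth keeping. Its definition
# is not part of this section; as an assumption only, a plausible sketch would treat
# an article with no title or no content as empty.
def is_empty_sketch(article):
    """Return True when the article dictionary has no usable title or content."""
    if article is None:
        return True
    return not article.get("title") or not article.get("content")
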
def recovery_link_new_articles_hum_rss(url_rss):
    """
    Arguments:
        - url of the page containing feed links for the different categories
    Returns:
        - list of urls of the different categories
    """
    soup = utils.recovery_flux_url_rss(url_rss)
    items = soup.find_all("item")
    article_humanite = []
    # Retrieving all urls of new RSS feeds of different categories
    for item in items:
        article_humanite.append(re.search(r"<link/>(.*)", str(item))[1])
    return article_humanite
def recovery_old_articles_sv(
        file_target="C:/Users/Laetitia/Desktop/Groupe4_Robot" +
        str(date.datetime.now().date()) + "/"):
    """
    Returns:
        - creation of a json for each new article
    """
    list_category = [
        "corps-et-sante", "nature-et-enviro", "ciel-et-espace",
        "technos-et-futur", "cerveau-et-intelligence", "science-et-culture"
    ]
    file_json = []
    i = 0
    for cat in list_category:
        # We retrieve the URL feeds for each page of article
        # Each HTML-coded article is analyzed with beautiful soup
        url_rss_sv = "https://www.science-et-vie.com/" + cat
        soup_url = utils.recovery_flux_url_rss(url_rss_sv)
        article_sv = []
        # We retrieve all the articles for a given page
        for div in soup_url.find_all("div"):
            if div.get("class") == ["title"]:
                for item in div.find_all("a"):
                    links = "https://www.science-et-vie.com/" + \
                        str(item.get("href"))
                    article_sv.append(links)
        # Each article is analyzed one by one
        for article in article_sv:
            new_article = recovery_information_sv(article)
            if not utils.is_empty(new_article):
                file_json.append(new_article)
                i += 1
            # Write the articles out in batches of 20
            if i == 20:
                utils.create_json(file_target, file_json,
                                  "ScienceEtVie_crawler/", "sv")
                i = 0
                file_json = []
    utils.create_json(file_target, file_json, "ScienceEtVie_crawler/", "sv")
def recovery_link_new_articles_fem():
    """
    Returns:
        - list of article urls collected from the category pages
    """
    list_category = ["Beaute/Coiffure", "Beaute/Beaute-People",
                     "Beaute/Parfums", "Beaute/Soins-visage-et-corps",
                     "Beaute/Maquillage", "Mode/Tendances", "Mode/Defiles",
                     "Mode/Lingerie", "Mode/Mode-People",
                     "Cuisine/Recettes-de-chefs",
                     "Cuisine/Shopping-et-conseils",
                     "Cuisine/Idees-de-recettes-par-theme",
                     "Psychologie/Psycho", "Psychologie/Societe",
                     "Psychologie/Argent-Droit", "People/Vie-des-people",
                     "Culture/Series", "Culture/Musique",
                     "Culture/Cinema-et-DVD", "Culture/Sorties",
                     "Loisirs/Jardinage", "Loisirs/Voyages",
                     "Loisirs/Tendace-deco", "Sexo/Sexualite", "Sexo/Amour",
                     "Sante-Forme/Bien-etre", "Sante-Forme/Sport",
                     "Sante-Forme/Regimes-Nutrition", "Sante-Forme/Sante",
                     "Famille/Grossesse", "Famille/Bebe", "Famille/Enfant",
                     "Famille/Adolescent"]
    article_fem = []
    for category in list_category:
        for i in range(2, 45):
            try:
                url_rss_fem = "http://www.femina.fr/" +\
                    category + "/page-" + str(i)
                soup_url = utils.recovery_flux_url_rss(url_rss_fem)
                for h2 in soup_url.find_all('h2'):
                    for a in h2.find_all('a'):
                        article_fem.append(a.get("href"))
                for h3 in soup_url.find_all('h3'):
                    for a in h3.find_all('a'):
                        article_fem.append(a.get("href"))
            except Exception:
                break
    return article_fem
def recovery_information_lg(url):
    """
    Arguments:
        url : string
    Return:
        article : dictionary
    Retrieve, for each article, the title, newspaper, author, date, content
    and theme.
    """
    soup = utils.recovery_flux_url_rss(url)
    # Retrieving the title
    balise_title = soup.title.string
    sep = balise_title.split('—')
    title = unidecode.unidecode('—'.join(sep[:-1]))
    tag_context = soup.find('span', attrs={'class': 'context'})
    # Retrieving the author
    author = []
    author.append(tag_context.a.get_text())
    # Retrieving the publication date
    regex_date = re.search(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                           tag_context.get_text())
    date_p = regex_date.group(0)
    date_p = str(date.datetime.strptime(date_p, '%d/%m/%Y').date())
    # Retrieving the theme
    tag_post_cat = soup.find('ul', attrs={'class': 'post-categories'})
    for li in tag_post_cat.find_all('li'):
        theme = li.get_text()
    # Retrieving the content of the article
    contents = ''
    tag_content = soup.find('div', attrs={'class': 'content'})
    if tag_content:
        for p in tag_content.find_all('p'):
            contents += p.get_text() + " "
    new_article = utils.recovery_article(title, 'LeGorafi', author, date_p,
                                         contents, theme)
    return new_article
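
# A possible way to combine the two Gorafi helpers above: fetch the article links from
# the RSS feed, then scrape each one. The feed URL is an assumption; the real one is
# supplied by the caller elsewhere in the project.
def demo_collect_gorafi(url_rss="http://www.legorafi.fr/feed/"):
    """Collect every article referenced by the Gorafi RSS feed."""
    collected = []
    for link in recovery_link_new_articles_lg(url_rss):
        article = recovery_information_lg(link)
        if not utils.is_empty(article):
            collected.append(article)
    return collected
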