def recovery_information_sv(url_article):
    """
    Arguments:
        - url of one article
    Returns:
        - informations of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    # title
    title = soup_article.find('h1', attrs={'class': 'like-h1'}).get_text()

    # date
    date = soup_article.find("time")["datetime"]

    # author
    author = []
    for span in soup_article.find_all('span', attrs={'class': 'author'}):
        author.append(span.span.get_text())

    # content
    content = ''
    for div in soup_article.find_all('div',
                                     attrs={'class': ['content', 'left']}):
        for p in div.find_all('p'):
            content += p.get_text() + ' '

    # theme: read the content *attribute* of the meta tag
    # (get_text('content') would only join the tag's children,
    # which is empty for <meta>)
    tag_meta = soup_article.find('meta', attrs={'property': 'article:tag'})
    theme = tag_meta.get('content')

    article = utils.recovery_article(title, 'Scienceetvie', author, date,
                                     content, theme)
    return article
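# NOTE: the utils helpers are assumed, not shown in this module. Every scraper
# calls utils.recovery_flux_url_rss to download and parse a page, and
# utils.recovery_article to build the article dictionary. A minimal sketch of
# what they likely look like (the dictionary shape is taken from the
# get_article docstring further down; this is an assumption, not the actual
# implementation):

import requests
from bs4 import BeautifulSoup


def recovery_flux_url_rss(url):
    """Fetch a page and return its parsed BeautifulSoup tree (assumed)."""
    response = requests.get(url)
    return BeautifulSoup(response.text, "lxml")


def recovery_article(title, newspaper, authors, date_publi, content, theme):
    """Assemble the article dictionary every scraper returns (assumed shape)."""
    return {
        "title": title,
        "newspaper": newspaper,
        "author": authors,
        "date_publi": date_publi,
        "content": content,
        "theme": theme,
    }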
def get_article(url):
    soup = utils.recovery_flux_url_rss(url)
    article = soup.find("article")

    # Article title
    title = article.find("h1").get_text()

    # Empty list when there is no author, otherwise list of author(s)
    if article.find("header").find("p", class_="authorsign-label") is None:
        authors = []
    else:
        authors = article.find("header").find(
            "p", class_="authorsign-label").get_text().split(" et ")

    # Publication date of the article
    date_pub = article.find("time").get("datetime")[0:10]

    # Theme of the article
    theme = article.find("ol", class_="breadcrumb-list").find_all("li")[1]\
        .find("span").get_text()

    # Content of the article
    content = ""
    for p in article.find("div", class_="content").find_all("p"):
        content = content + p.get_text()

    # Name of the newspaper
    newspaper = soup.find("footer").find(has_copyright).find("a").get_text()

    return utils.recovery_article(title, newspaper, authors, date_pub,
                                  content, theme)
def get_article(url): """ Prend en argument une adresse url (url) et retourne un dictionnaire """ from unidecode import unidecode soup = utils.recovery_flux_url_rss(url) article = soup.find("article") meta = soup.find("meta", property="og:title").get("content") tab = meta.split("-") n = len(tab) theme = tab[n - 2] title = "-".join(tab[:n - 2]) authors = [] regex = re.compile(r'[\n\r\t]') for span in article.find_all("span", class_="author--name"): author = regex.sub("", unidecode(span.get_text())) authors.append(author.strip()) date_pub = article.find( "span", itemprop="datePublished").get("datetime")[:10].replace("-", "/") date_pub = str(date.datetime.strptime(date_pub, "%d/%m/%Y").date()) content = "" for div in article.find_all("div", class_=[ "article--intro", "article--wysiwyg", "article--footnotes" ]): for p in div.find_all("p"): content = content + p.get_text() content = regex.sub("", content) return utils.recovery_article(title, "Telerama", authors, date_pub, content, theme)
def collect_articles(list_dictionaries, list_url_articles, theme):
    """Add the articles (dictionaries) from a list of URL in a list of
    dictionaries

    Arguments:
        list_dictionaries {list} -- list of dictionaries
        list_url_articles {list} -- list of URL
        theme {string} -- theme related to the list of dictionaries
    """
    for url_article in list_url_articles:
        try:
            req = requests.get(url_article)
            data = req.text
            soup = BeautifulSoup(data, "lxml")

            # The <title> tag ends with " - Le Point"; keep what precedes it
            balise_title = soup.title.string
            sep = balise_title.split(" - Le Point")
            title = sep[0]

            list_authors = []
            for div in soup.find_all('div'):
                if div.get('class') == ['mbs']:
                    for span in div.find_all('span'):
                        name = span.get_text()
                        name = re.sub('Par', '', name)
                        name = re.sub(r"\s\s+", "", name)
                        list_authors.append(name)

            # Keep the earliest date found in <time> tags (publication
            # date, as opposed to later modification dates)
            dates = []
            for balise_time in soup.find_all('time'):
                for valeur in re.finditer(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                                          str(balise_time)):
                    dates.append(
                        date.datetime.strptime(valeur.group(0), '%d/%m/%Y'))
            date_publication = date.datetime.strftime(min(dates), '%d/%m/%Y')
            date_publication = str(
                date.datetime.strptime(date_publication, "%d/%m/%Y").date())

            content = ''
            for h2 in soup.find_all('h2'):
                if h2.get('class') == ['art-chapeau']:
                    content += h2.get_text() + " "
            for div in soup.find_all('div'):
                if div.get('class') == ['art-text']:
                    for p in div.find_all('p'):
                        content += p.get_text() + " "

            new_article = utils.recovery_article(title, 'LePoint',
                                                 list_authors,
                                                 date_publication, content,
                                                 theme)
            if not utils.is_empty(new_article):
                list_dictionaries.append(new_article)
        except Exception:
            print("Error while saving the article")
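# A minimal usage sketch (the URL is a placeholder, not a real article):
# failed downloads are swallowed by the try/except above, so the list only
# ever grows with articles that parsed successfully.
articles = []
collect_articles(articles, [
    "https://www.lepoint.fr/politique/exemple-article.php",
], "politique")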
def recovery_information_noob(url_article):
    """
    Arguments:
        - url of one article
    Returns:
        - informations of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)
    title = soup_article.title.get_text()

    # Retrieval of publication date
    date_p = ""
    find_date = soup_article.find('time', attrs={"class": "date"})
    for a in find_date.find_all('a'):
        find_valeur = re.compile(r'[0-9]{4}/[0-9]{2}/[0-9]{2}')
        for valeur in find_valeur.finditer(str(a.get("href"))):
            date_p = valeur.group(0)
            date_p = datetime.strptime(date_p, "%Y/%m/%d")\
                .strftime("%Y-%m-%d")

    # Retrieval of the author of the article
    author = []
    for div in soup_article.find_all('div'):
        if re.search('author', str(div.get("class"))):
            author.append(div.p.span.get_text())

    # Retrieval of the article theme
    theme = ""
    for nav in soup_article.find_all('nav'):
        if nav.get("class") == ['breadcrumb']:
            for ol in nav.find_all('ol'):
                for a in ol.find_all('a'):
                    theme = a.get_text()

    # Retrieving the content of the article, after blanking asides,
    # "lire" (read-more) links and image captions
    contents = ""
    for div in soup_article.find_all('div'):
        if re.search('body', str(div.get("id"))):
            for aside in div.find_all('aside'):
                for p in aside.find_all('p'):
                    p.string = ""
            for p in div.find_all('p'):
                for a in p.find_all('a'):
                    if a.get("class") == ['lire']:
                        a.string = ""
                for img in p.find_all('img'):
                    p.string = ""
                contents += p.get_text() + " "

    article = utils.recovery_article(title, 'NouvelObservateur', author,
                                     date_p, contents, theme)
    return article
def get_article(url): """ Arguments : - URL address Returns : - An article { "title" : str, "newspaper" : str, "author" : [str], "date_publi" : str, "content" : str, "theme" : str } """ soup = utils.recovery_flux_url_rss(url) article = soup.find("article") # Titre de l'article title = article.find("h1").get_text() # tableau vide quand il y'a pas d'autheur sinon tableau de(s) auteur(s) authors = [] if article.find("header") .find( "p", class_="authorsign-label") is None else unidecode( article.find("header") .find( "p", class_="authorsign-label").get_text()).split(" et ") # Date de publication de l'article date_pub = article.find("time").get("datetime")[:10] # Theme de l'article theme = article.find("ol", class_="breadcrumb-list")\ .find_all("li")[1].find("span").get_text() # Contenu de l'article content = "" for p in article.find("div", class_="content").find_all("p"): content = content + p.get_text() + " " # Nom du journal newspaper = soup.find("footer").find(has_copyright).find("a").get_text() regex = re.compile(r'[\n\r\t]') # Elever les \n \r \t du contenu content = regex.sub("", content) return utils.recovery_article( unidecode(title), unidecode(newspaper), authors, str(date_pub), unidecode(content), unidecode(theme))
def collect_articles(list_dictionaries, list_url_articles, theme):
    """Add the articles (dictionaries) from a list of URL in a list of
    dictionaries

    Arguments:
        list_dictionaries {list} -- list of dictionaries
        list_url_articles {list} -- list of URL
        theme {string} -- theme related to the list of dictionaries
    """
    for url_article in list_url_articles:
        req = requests.get(url_article)
        data = req.text
        soup = BeautifulSoup(data, "lxml")

        title = soup.title.string

        # Authors appear in <a> tags, or failing that in <span> tags,
        # carrying the fig-content-metas__author class
        list_authors = []
        for a in soup.find_all('a'):
            if a.get("class") == ['fig-content-metas__author']:
                name = re.sub(r"\s\s+", "", a.get_text())
                name = re.sub("\n", "", name)
                list_authors.append(name)
        if len(list_authors) == 0:
            for span in soup.find_all('span'):
                if span.get("class") == ['fig-content-metas__author']:
                    name = re.sub(r"\s\s+", "", span.get_text())
                    name = re.sub("\n", "", name)
                    list_authors.append(name)

        date_publication = ''
        for marker_time in soup.find_all('time'):
            for valeur in re.finditer(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                                      str(marker_time)):
                date_publication = valeur.group(0)
        date_publication = str(
            date.datetime.strptime(date_publication, "%d/%m/%Y").date())

        content = ''
        for p in soup.find_all('p'):
            if p.get("class") == ['fig-content__chapo']:
                content = p.get_text() + " "
        for div in soup.find_all('div'):
            if div.get("class") == ['fig-content__body']:
                for p in div.find_all('p'):
                    content += p.get_text() + " "

        new_article = utils.recovery_article(title, 'LeFigaro', list_authors,
                                             date_publication, content,
                                             theme)
        if not utils.is_empty(new_article):
            list_dictionaries.append(new_article)
def collect_articles(list_dictionaries, list_url_articles, theme):
    for url_article in list_url_articles:
        try:
            req = requests.get(url_article)
            data = req.text
            soup = BeautifulSoup(data, 'lxml')

            title = soup.title.string

            list_authors = []
            for a in soup.find_all('a'):
                if a.get("class") == ['fig-content-metas__author']:
                    name = re.sub(r"\s\s+", "", a.get_text())
                    name = re.sub("\n", "", name)
                    list_authors.append(name)
            if len(list_authors) == 0:
                for span in soup.find_all('span'):
                    if span.get("class") == ['fig-content-metas__author']:
                        name = re.sub(r"\s\s+", "", span.get_text())
                        name = re.sub("\n", "", name)
                        list_authors.append(name)

            date_publication = ""
            for marker_time in soup.find_all('time'):
                for valeur in re.finditer(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                                          str(marker_time)):
                    date_publication = valeur.group(0)
            date_publication = str(
                date.datetime.strptime(date_publication, "%d/%m/%Y").date())

            content = ""
            for p in soup.find_all('p'):
                if p.get("class") == ['fig-content__chapo']:
                    content = p.get_text() + " "
            for div in soup.find_all('div'):
                if div.get("class") == ['fig-content__body']:
                    for p in div.find_all('p'):
                        content += p.get_text() + " "

            new_article = utils.recovery_article(title, 'LeFigaro',
                                                 list_authors,
                                                 date_publication, content,
                                                 theme)
            if not utils.is_empty(new_article):
                list_dictionaries.append(new_article)
        except Exception:
            print("Error while saving the article")
def collect_articles(list_dictionaries, list_url_articles, list_titre):
    j = 0
    for url_article in list_url_articles:
        j = j + 1
        soup = utils.recovery_flux_url_rss(url_article)

        # Find the title ("title - section" pattern)
        title = ''
        for titl in soup.find_all('title'):
            tit = titl.get_text()
            if len(tit.split('-')) == 2:
                title = tit.split('-')[0]

        # Find the authors from the dpi-authors links
        authors = []
        for a in soup.find_all('a'):
            if a.get('href') is not None:
                if "dpi-authors" in a.get('href').split('/'):
                    tit = a.get('href').split('/')[-1]
                    authors.append(tit.split('-')[0] + ' ' +
                                   tit.split('-')[1])
        if len(authors) == 0:
            authors.append('')

        # Find the publication date
        date_publication = []
        for balise_time in soup.find_all('time'):
            if balise_time.get('class') and \
                    'pubdate' in balise_time.get('class'):
                date_publication.append(
                    balise_time.get('datetime').split('T')[0])

        theme = re.search("www.lesoir.be/(.*)/", url_article)[1]

        content = ''
        for p in soup.find_all('p'):
            if len(p.get_text().split(" ")) >= 2:
                content += p.get_text()

        new_article = utils.recovery_article(title, 'lesoir', authors,
                                             date_publication, content,
                                             theme)

        # Pause every three requests to avoid being rate-limited
        if j == 3:
            time.sleep(71)
            j = 0

        # Only keep articles with real content whose title was not
        # already collected
        if not utils.is_empty(new_article):
            if len(content) > 10 and title not in list_titre:
                list_titre.append(title)
                list_dictionaries.append(new_article)
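# A minimal usage sketch (hypothetical URL): the same list_titre can be
# reused across calls so already-seen titles are skipped, and the sleep
# above keeps the request rate low.
seen_titles = []
articles = []
collect_articles(articles, [
    "http://www.lesoir.be/economie/exemple-article",
], seen_titles)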
def recovery_information_hum(url_article):
    """
    Arguments:
        - url of one article
    Returns:
        - informations of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    title = ""
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'og:title':
            title = meta.get("content")

    theme = ""
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:section':
            theme = meta.get("content")

    author = []
    for h2 in soup_article.find_all('h2'):
        for a in h2.find_all('a'):
            if re.search('auteur', str(a.get("href"))):
                author.append(a.get_text())

    date_p = ""
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:published_time':
            raw_date = meta.get("content")
            date_p = raw_date[0:10]
            date_p = str(datetime.strptime(date_p, "%Y-%m-%d").date())

    # The body is split between a "chapo" (lead) block and a text block
    contents = ""
    for div in soup_article.find_all('div'):
        if div.get("class") == [
                'field', 'field-name-field-news-chapo',
                'field-type-text-long', 'field-label-hidden']:
            for p in div.find_all('p'):
                contents += p.get_text()
        if div.get("class") == [
                'field', 'field-name-field-news-text',
                'field-type-text-long', 'field-label-hidden']:
            for p in div.find_all('p'):
                contents += p.get_text()

    article = utils.recovery_article(title, 'Humanite', author, date_p,
                                     contents, theme)
    return article
def recovery_information_fem(url_article):
    """
    Arguments:
        - url of one article
    Returns:
        - informations of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    # The <title> ends with " - <site name>"; keep the first part
    title = soup_article.title.get_text()
    title = title.split(" - ")[0]

    date_p = ""
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:published_time':
            raw_date = meta.get("content")
            date_p = raw_date[0:10]

    author = []
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:author':
            author.append(meta.get("content"))

    theme = ""
    for link in soup_article.find_all('link'):
        if link.get("rel") == ['Index']:
            link_theme = link.get("href")
            part_link = link_theme.split("/")
            theme = part_link[3]

    # The body is split between "chapo" (lead), "contenu" (main text)
    # and "diaporama" (slideshow) blocks
    contents = ""
    for div in soup_article.find_all('div'):
        if div.get("class") == ['chapo']:
            for p in div.find_all('p'):
                contents += p.get_text() + " "
        if div.get("class") == ['contenu']:
            for p in div.find_all('p'):
                contents += p.get_text() + " "
        if div.get("class") == ['diaporama']:
            for p in div.find_all('p'):
                contents += p.get_text() + " "
    contents = re.sub(r"\s\s+", " ", contents)

    article = utils.recovery_article(title, 'Femina', author, date_p,
                                     contents, theme)
    return article
def recovery_information_lg(url):
    """
    Arguments:
        url : string
    Return :
        article : dictionary

    It retrieves for each article the title, newspaper, author, date, theme
    """
    soup = utils.recovery_flux_url_rss(url)

    # Retrieving the title (everything before the last em dash)
    balise_title = soup.title.string
    sep = balise_title.split('—')
    title = unidecode.unidecode('—'.join(sep[:-1]))

    tag_context = soup.find('span', attrs={'class': 'context'})

    # Retrieving the author
    author = []
    author.append(tag_context.a.get_text())

    # Retrieving the publication date
    regex_date = re.search(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                           tag_context.get_text())
    date_p = regex_date.group(0)
    date_p = str(date.datetime.strptime(date_p, '%d/%m/%Y').date())

    # Retrieving the theme
    theme = ''
    tag_post_cat = soup.find('ul', attrs={'class': 'post-categories'})
    for li in tag_post_cat.find_all('li'):
        theme = li.get_text()

    # Retrieving the content of the article
    contents = ''
    tag_content = soup.find('div', attrs={'class': 'content'})
    if tag_content:
        for p in tag_content.find_all('p'):
            contents += p.get_text() + " "

    new_article = utils.recovery_article(title, 'LeGorafi', author, date_p,
                                         contents, theme)
    return new_article
def recovery_information_lt(url_article):
    """
    Arguments:
        - url of one article
    Returns:
        - informations of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    # Retrieve the title
    title = soup_article.title.string

    # Retrieve the theme
    theme = ""
    for li in soup_article.find_all('li'):
        if li.get("itemprop") == 'itemListElement':
            theme = li.a.span.get_text()

    # Retrieve the author
    author = []
    for span in soup_article.find_all('span'):
        if span.get("class") == ['author-name']:
            author.append(span.a.span.get_text())

    # Retrieve the publication date
    date_p = ""
    for time in soup_article.find_all('time'):
        date_p = time.get("datetime")
        for valeur in re.finditer(r'[0-9]{4}-[0-9]{2}-[0-9]{2}', str(time)):
            date_p = valeur.group(0)

    # Retrieve the content
    contents = ""
    for div in soup_article.find_all('div'):
        if div.get("itemprop") == 'articleBody':
            for p in div.find_all('p'):
                contents += p.get_text() + " "

    article = utils.recovery_article(title, 'LaTribune', author, date_p,
                                     contents, theme)
    return article
def recovery_information_fusc(url):
    """
    Arguments:
        url : string
    Return :
        article : dictionary

    It retrieves for each article the title, newspaper, author, date, theme
    """
    soup = utils.recovery_flux_url_rss(url)

    # Retrieve the title (drop the "| site" suffix if present)
    title = unidecode.unidecode(soup.title.string)
    indice = title.find('|')
    if indice != -1:
        title = title[:indice - 1]

    # Retrieve the author
    author = []
    tag_author = soup.find('h3', attrs={'itemprop': 'author'})
    author.append(tag_author.get_text())

    # Retrieve the date
    regex_date = re.search(r'[0-9]{2}/[0-9]{2}/[0-9]{4}', soup.time.string)
    publi_date = regex_date.group(0)
    publi_date = str(date.datetime.strptime(publi_date, '%d/%m/%Y').date())

    # Retrieve the content (paragraphs whose last class matches 'py0p5');
    # skip paragraphs without a class attribute to avoid a TypeError
    content = ''
    for p in soup.find_all('p'):
        if p.get('class') is None:
            continue
        for p2 in re.finditer('py0p5', p.get('class')[-1]):
            content += p.get_text()
    content = unidecode.unidecode(content)

    # Retrieve the theme from the URL path
    delimiter = url.split('/')
    theme = delimiter[3]

    article = utils.recovery_article(title, 'FuturaSciences', author,
                                     publi_date, content, theme)
    return article
def recovery_information_ld(url):
    soup = utils.recovery_flux_url_rss(url)

    # Retrieve the title
    title = ""
    for meta in soup.find_all('meta'):
        if meta.get("property") == 'og:title':
            title = meta.get("content")

    # Retrieve the publication date
    date = ""
    for time in soup.find_all('time'):
        if time.get("itemprop") == 'datePublished':
            for valeur in re.finditer(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                                      str(time)):
                date = valeur.group(0)
                date = datetime.strptime(date, "%d/%m/%Y")\
                    .strftime("%Y-%m-%d")

    # Retrieve the author
    author = []
    for div in soup.find_all('div'):
        if div.get("class") == ['article_author']:
            author.append(div.span.get_text())

    # Retrieve the content
    content = ""
    for div in soup.find_all('div'):
        if div.get("itemprop") == 'articleBody':
            for p in div.find_all('p'):
                content += p.get_text() + " "

    # Retrieve the theme
    theme = ""
    for h2 in soup.find_all('h2'):
        if h2.get("itemprop") == 'about':
            theme = h2.get_text()

    article = utils.recovery_article(title, 'La Depeche', author, date,
                                     content, theme)
    return article
def recovery_article_ld(url):
    """
    Arguments:
        url : string
    Return :
        article : dictionary

    It retrieves for each article the title, newspaper, author, date, theme
    """
    soup = utils.recovery_flux_url_rss(url)

    # Retrieve the title
    tag_meta = soup.find('meta', attrs={'property': 'og:title'})
    title = tag_meta.get('content')

    # Retrieve the publication date
    tag_publi_date = soup.find('time', attrs={'itemprop': 'datePublished'})
    regex_date = re.search(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                           tag_publi_date.string)
    publi_date = regex_date.group(0)

    # Retrieve the author
    author = []
    for div in soup.find_all('div', attrs={'class': 'article_author'}):
        author.append(div.span.get_text())

    # Retrieve the content
    content = ''
    for div in soup.find_all('div', attrs={'itemprop': 'articleBody'}):
        for p in div.find_all('p'):
            content += p.get_text() + ' '

    # Retrieve the theme
    theme = soup.find('h2', attrs={'itemprop': 'about'}).get_text()

    # Gather all the information about the article
    article = utils.recovery_article(title, 'LaDepeche', author, publi_date,
                                     content, theme)
    return article
def recovery_information_sv(url_article):
    """
    Arguments:
        - url of one article
    Returns:
        - informations of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    # title
    title = ""
    for h1 in soup_article.find_all("h1"):
        if h1.get("class") == ["like-h1"]:
            title = h1.get_text()

    # date
    t_date = soup_article.find("time")["datetime"]

    # author
    author = []
    for span in soup_article.find_all('span'):
        if span.get("class") == ["author"]:
            author.append(span.span.get_text())

    # content
    content = ""
    for div in soup_article.find_all('div'):
        if div.get("class") == ['content', 'left']:
            for p in div.find_all('p'):
                content += p.get_text() + " "

    # theme
    theme = ""
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:tag':
            theme = meta.get("content")

    # Pass the extracted date (t_date), not the date module
    article = utils.recovery_article(title, 'Scienceetvie', author, t_date,
                                     content, theme)
    return article
def get_article(url): """ Arguments : - URL address Returns : - Dictionnary """ from unidecode import unidecode soup = utils.recovery_flux_url_rss(url) article = soup.find("article") meta = soup.find("meta", property="og:title").get("content") tab = meta.split("-") n = len(tab) theme = tab[n - 2] title = "-".join(tab[:n - 2]) authors = [] regex = re.compile(r'[\n\r\t]') for span in article.find_all("span", class_="author--name"): author = regex.sub("", unidecode(span.get_text())) authors.append(author.strip()) date_pub = article.find( "span", itemprop="datePublished").get("datetime")[:10].split("-") date_pub = date_pub[2] + "-" + date_pub[1] + "-" + date_pub[0] content = "" for div in article.find_all("div", class_=[ "article--intro", "article--wysiwyg", "article--footnotes" ]): for p in div.find_all("p"): content = content + p.get_text() content = regex.sub("", content) return utils.recovery_article(title, "Telerama", authors, date_pub, content, theme)
def collect_article(article_link):
    """Extract informations from all these articles

    Arguments:
        article_link {string} -- url of an article

    Returns:
        dict -- dict of all informations
    """
    # Skip videos, apps, live fact-checking pages and URLs without a date
    if "video" in article_link or "/apps/" in article_link \
            or "checknews" in article_link \
            or not re.search(r"\d\d\d\d/\d\d/\d\d", article_link):
        return None
    else:
        req = requests.get(article_link)
        data = req.text
        soup = BeautifulSoup(data, "lxml")
        try:
            theme = re.search("http://www.liberation.fr/(.*)",
                              article_link)[1]
            theme = theme.split('/')[0]
        except Exception:
            theme = ''

        # Skip live articles and redirected URLs
        if soup.find("div", class_="direct-headband") \
                or article_link != req.url:
            return None
        else:
            balise_title = soup.find("h1")
            balise_title = balise_title.get_text()
            balise_title = re.sub(r"\s\s+", "", balise_title)
            newspaper = "Liberation"
            title = unidecode.unidecode(balise_title)

            date_p = ""
            authors = []
            for span in soup.find_all('span'):
                if span.get("class") == ['author']:
                    if span.a:
                        author = span.a.string
                        if author:
                            authors.append(author)
                if span.get("class") == ['date']:
                    if span.time:
                        date_p = date.datetime.strptime(
                            span.time.get("datetime"),
                            "%Y-%m-%dT%H:%M:%S").date()
                        date_p = date_p.strftime("%d/%m/%Y")
                        date_p = str(date.datetime.strptime(
                            date_p, "%d/%m/%Y").date())
            if not authors:
                authors = ["liberation"]

            content = ""
            for div in soup.find_all('div'):
                for p in div.find_all('p'):
                    content += p.get_text() + " "
            content = re.sub("<>", "", content)
            content = unidecode.unidecode(content)

            new_article = utils.recovery_article(title, newspaper, authors,
                                                 date_p, content, theme)
            return new_article
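# A minimal usage sketch (hypothetical URL): collect_article returns None for
# videos, apps, live pages and redirects, so filter those out before storing.
links = ["http://www.liberation.fr/france/2018/03/01/exemple-article"]
articles = [a for a in (collect_article(u) for u in links) if a is not None]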
def recovery_information_equi(url_article):
    """
    Arguments:
        - url of one article
    Returns:
        - informations of the article
    """
    try:
        soup_article = utils.recovery_flux_url_rss(url_article)

        # Retrieving the title
        balise_title = soup_article.title.get_text()
        sep = balise_title.split(" - ")
        title = sep[0]
        title = title.encode("latin1").decode()

        # Retrieving the author
        author = []
        for meta in soup_article.find_all('meta'):
            if meta.get("name") == 'Author':
                aut = meta.get("content")
                aut = aut.encode("latin1").decode()
                author.append(aut)

        # Retrieving the date of publication
        date_p = ""
        for div in soup_article.find_all('div'):
            if div.get("class") == ['article__date']:
                for t in div.find_all('time'):
                    if t.get("itemprop") == 'datePublished':
                        raw_date = t.get("datetime")
                        date_p = raw_date[0:10]

        # Retrieving the article theme
        theme = ""
        for div in soup_article.find_all('div'):
            if div.get("class") == ['navigation__sousmenu']:
                theme = div.get("libelle")

        # Retrieving the content of the article, blanking "lire-aussi"
        # (read-also) boxes, pull quotes, spans and blockquotes first
        contents = ""
        for div in soup_article.find_all('div'):
            if div.get("itemprop") == 'mainEntityOfPage':
                for div2 in div.find_all('div'):
                    for valeur in re.finditer('lire-aussi',
                                              str(div2.get("class"))):
                        div2.string = ""
                    for valeur2 in re.finditer('paragraphe__exergue',
                                               str(div2.get("class"))):
                        div2.string = ""
                for span in div.find_all('span'):
                    span.string = ""
                for block in div.find_all('blockquote'):
                    block.string = ""
                for p in div.find_all('p'):
                    if p.get("data-type") == 'Accroche':
                        contents = p.get_text()
        contents = re.sub(r"\s\s+", " ", contents)
        contents = contents.replace('°', ' ° ')

        article = utils.recovery_article(title, 'Equipe', author, date_p,
                                         contents, theme)
    except Exception:
        article = utils.recovery_article('', 'Equipe', '', '', '', '')
    return article
def get_information(article_link):
    """Extract informations from all these articles

    Arguments:
        article_link {string} -- url of an article

    Returns:
        dict -- dict of all informations
    """
    # Skip videos, apps, live fact-checking pages and URLs without a date
    if "video" in article_link or "/apps/" in article_link or "checknews" in\
            article_link or not re.search(r"\d\d\d\d/\d\d/\d\d",
                                          article_link):
        return None
    else:
        # Only keep articles published within the last week
        date_article = re.search(r"\d{4}/\d{2}/\d{2}", article_link)[0]
        date_article = date.datetime.strptime(date_article, "%Y/%m/%d")
        diff_date = date.datetime.now() - date_article
        if diff_date.days > 7:
            return None
        else:
            req = requests.get(article_link)
            req.encoding = "utf-8"
            data = req.text
            soup = BeautifulSoup(data, "lxml")

            # Skip live articles and redirected URLs
            if soup.find("div", class_="direct-headband") \
                    or article_link != req.url:
                return None
            else:
                balise_title = soup.find("h1")
                balise_title = balise_title.get_text()
                balise_title = re.sub(r"\s\s+", "", balise_title)
                newspaper = "Liberation"
                title = unidecode.unidecode(balise_title)

                author = ""
                date_p = ""
                for span in soup.find_all('span'):
                    if span.get("class") == ['author']:
                        if span.a:
                            author = span.a.string
                    if span.get("class") == ['date']:
                        if span.time:
                            date_p = date.datetime.strptime(
                                span.time.get("datetime"),
                                "%Y-%m-%dT%H:%M:%S").date()
                            date_p = date_p.strftime("%Y-%m-%d")

                content = ""
                for div in soup.find_all('div'):
                    for p in div.find_all('p'):
                        content += p.get_text() + " "
                content = re.sub("<>", "", content)
                content = unidecode.unidecode(content)

                new_article = utils.recovery_article(title, newspaper,
                                                     [author], date_p,
                                                     content, " ")
                return new_article