def unes(targetURL):
    """Scrape front-page headlines from Le Dauphine Libere.

    Fetches *targetURL*, extracts every headline anchor of class ``titre``
    (text + href), and forwards the (title, link) pairs to
    ``methode_extraction.extraction`` along with the paper's metadata.

    :param targetURL: URL of the newspaper's front page to fetch.
    :returns: None; results are handed off to the extraction helper.
    """
    # `response` instead of `file` — avoid shadowing the builtin.
    response = urllib.urlopen(targetURL)
    data = response.read().decode('utf8')
    response.close()
    doc = lxml.html.document_fromstring(data)
    titres = doc.xpath("//a[@class='titre']/text()")
    liens = doc.xpath("//a[@class='titre']/@href")
    # Pair each headline with its link; same element set, so order matches.
    titres = zip(titres, liens)
    quotidien = {'nom': 'Le Dauphiné Libéré', 'URL': targetURL}
    methode_extraction.extraction(targetURL, quotidien, titres)
def unes(targetURL):
    """Scrape front-page headlines from Le Journal du Net.

    Fetches *targetURL*, extracts article links and titles from
    ``h2.app_title`` anchors, and forwards the (title, link) pairs to
    ``methode_extraction.extraction`` with the paper's metadata.

    :param targetURL: URL of the newspaper's front page to fetch.
    :returns: None; results are handed off to the extraction helper.
    """
    # `response` instead of `file` — avoid shadowing the builtin.
    response = urllib.urlopen(targetURL)
    data = response.read().decode('utf8')
    response.close()
    # Parse once; the original re-parsed the same data a second time for
    # the titles, which was pure wasted work.
    doc = lxml.html.document_fromstring(data)
    articles_href = doc.xpath('//h2[@class="app_title"]/a/@href')
    article_titles = doc.xpath('//h2[@class="app_title"]/a/text()')
    titres = zip(article_titles, articles_href)
    quotidien = {'nom': 'Le Journal du Net', 'URL': targetURL}
    # NOTE(review): base URL passed as '' here, unlike some sibling
    # scrapers that pass targetURL — presumably because hrefs are
    # already absolute; confirm against methode_extraction.
    methode_extraction.extraction('', quotidien, titres)
def unes(targetURL):
    """Scrape front-page headlines from Le Parisien.

    Fetches *targetURL*, collects article links and titles from ``h1``,
    ``h2`` and ``h3`` anchors inside ``<article>`` elements, and forwards
    the (title, link) pairs to ``methode_extraction.extraction``.

    :param targetURL: URL of the newspaper's front page to fetch.
    :returns: None; results are handed off to the extraction helper.
    """
    # `response` instead of `file` — avoid shadowing the builtin.
    response = urllib.urlopen(targetURL)
    data = response.read().decode('utf8')
    response.close()
    # Parse once; the original re-parsed the same data a second time.
    doc = lxml.html.document_fromstring(data)
    articles_href = (doc.xpath('//article/h1/a/@href') +
                     doc.xpath('//article//h2/a/@href') +
                     doc.xpath('//article//h3/a/@href'))
    # NOTE(review): h1 uses `a//text()` while h2/h3 use `a/text()` — if an
    # h1 anchor has nested markup this yields extra fragments and could
    # desynchronise titles from hrefs; kept as-is, verify on live markup.
    article_titles = (doc.xpath('//article/h1/a//text()') +
                      doc.xpath('//article//h2/a/text()') +
                      doc.xpath('//article//h3/a/text()'))
    titres = zip(article_titles, articles_href)
    quotidien = {'nom': 'Le Parisien', 'URL': targetURL}
    methode_extraction.extraction('', quotidien, titres)
def unes(targetURL):
    """Scrape front-page headlines from Courrier International.

    Fetches *targetURL*, collects article links from ``main/div/a`` and
    titles from the ``h1`` inside each linked ``<article>``, then forwards
    the (title, link) pairs to ``methode_extraction.extraction``.

    :param targetURL: URL of the newspaper's front page to fetch.
    :returns: None; results are handed off to the extraction helper.
    """
    # `response` instead of `file` — avoid shadowing the builtin.
    response = urllib.urlopen(targetURL)
    data = response.read().decode('utf8')
    response.close()
    # Parse once; the original re-parsed the same data a second time.
    doc = lxml.html.document_fromstring(data)
    articles_href = doc.xpath('//main/div/a/@href')
    article_titles = doc.xpath('//main/div/a/article//h1/text()')
    titres = zip(article_titles, articles_href)
    quotidien = {'nom': 'Courrier International', 'URL': targetURL}
    methode_extraction.extraction(targetURL, quotidien, titres)
def unes(targetURL):
    """Scrape front-page headlines from Sud Ouest.

    Fetches *targetURL*, collects article links and titles from the
    "essentiel" and "default" article sections, and forwards the
    (title, link) pairs to ``methode_extraction.extraction``.

    :param targetURL: URL of the newspaper's front page to fetch.
    :returns: None; results are handed off to the extraction helper.
    """
    # BUGFIX: the original ignored targetURL and fetched the hard-coded
    # "http://www.sudouest.fr/" while still recording targetURL in the
    # metadata dict — fetch the URL the caller actually asked for so the
    # scraped page and the recorded URL agree.
    response = urllib.urlopen(targetURL)
    data = response.read().decode('utf8')
    response.close()
    # Parse once; the original re-parsed the same data a second time.
    doc = lxml.html.document_fromstring(data)
    articles_href = (doc.xpath('//section[@class="articles essentiel "]//div[@class="article-wrapper"]/a/@href') +
                     doc.xpath('//section[@class="articles default "]//div[@class="article-wrapper"]/a/@href'))
    article_titles = (doc.xpath('//section[@class="articles essentiel "]//div[@class="article-wrapper"]/a/h2/text()') +
                      doc.xpath('//section[@class="articles default "]//div[@class="article-wrapper"]/a/h2/text()'))
    titres = zip(article_titles, articles_href)
    quotidien = {'nom': 'Sud Ouest', 'URL': targetURL}
    methode_extraction.extraction(targetURL, quotidien, titres)
def unes(targetURL):
    """Scrape front-page headlines from Les Echos.

    Fetches *targetURL*, collects article links from figure anchors and
    titles from ``div.titre`` anchors inside small/medium article cards,
    then forwards the (title, link) pairs to
    ``methode_extraction.extraction``.

    :param targetURL: URL of the newspaper's front page to fetch.
    :returns: None; results are handed off to the extraction helper.
    """
    # `response` instead of `file` — avoid shadowing the builtin.
    response = urllib.urlopen(targetURL)
    data = response.read().decode('utf8')
    response.close()
    # Parse once; the original re-parsed the same data a second time.
    doc = lxml.html.document_fromstring(data)
    articles_href = doc.xpath(
        '//div/article[@class="article-small article-medium"]//figure/a/@href')
    # NOTE(review): hrefs come from <figure> anchors and titles from
    # div.titre anchors — assumes each card has exactly one of each, in
    # document order; verify against the live markup.
    article_titles = doc.xpath(
        '//div/article[@class="article-small article-medium"]//div[@class="titre"]/a/text()')
    titres = zip(article_titles, articles_href)
    quotidien = {'nom': 'Les Echos', 'URL': targetURL}
    methode_extraction.extraction('', quotidien, titres)
def unes(targetURL):
    """Scrape front-page headlines from La Tribune.

    Fetches *targetURL*, collects headlines and links from the main
    article, title-wrapper and title-river sections, and forwards the
    (title, link) pairs to ``methode_extraction.extraction``.

    :param targetURL: URL of the newspaper's front page to fetch.
    :returns: None; results are handed off to the extraction helper.
    """
    # `response` instead of `file` — avoid shadowing the builtin.
    response = urllib.urlopen(targetURL)
    data = response.read().decode('utf8')
    response.close()
    doc = lxml.html.document_fromstring(data)
    articles = doc.xpath("//div[@class='main-article']//article/h2/a/text()")
    articles += doc.xpath('//div[@class="title-wrapper"]//a/text()')
    articles += doc.xpath('//div[@class="title-river"]//a/text()')
    # NOTE(review): titles use `//article` (any depth) but hrefs use
    # `/article` (direct child) under main-article — if the markup nests
    # articles deeper, counts diverge and zip silently mispairs; kept
    # as-is, confirm against the live page.
    liens = doc.xpath('//div[@class="main-article"]/article/h2/a/@href')
    liens += doc.xpath('//div[@class="title-wrapper"]//a/@href')
    liens += doc.xpath('//div[@class="title-river"]//a/@href')
    titres = zip(articles, liens)
    quotidien = {'nom': 'La Tribune', 'URL': targetURL}
    methode_extraction.extraction('', quotidien, titres)
def unes(targetURL):
    """Scrape front-page headlines from Le Point.

    Fetches *targetURL*, collects article links from the "en continu"
    list and keep-cols figures, titles from lead/column headings, and
    forwards the (title, link) pairs to ``methode_extraction.extraction``.

    :param targetURL: URL of the newspaper's front page to fetch.
    :returns: None; results are handed off to the extraction helper.
    """
    # `response` instead of `file` — avoid shadowing the builtin.
    response = urllib.urlopen(targetURL)
    data = response.read().decode('utf8')
    response.close()
    # Parse once; the original re-parsed the same data a second time.
    doc = lxml.html.document_fromstring(data)
    articles_href = (doc.xpath('//article[@class="en-continu-li"]//a/@href') +
                     doc.xpath('//div[@class="row keep-cols"]/figure/a/@href'))
    # NOTE(review): title selectors target different elements than the
    # href selectors; pairing relies on both lists matching one-to-one in
    # document order — verify against the live markup.
    article_titles = (doc.xpath('//h2[@class="art-lead"]/text()') +
                      doc.xpath('//div[@class="col plm"]//a/h2[@class="art-title"]/text()'))
    titres = zip(article_titles, articles_href)
    quotidien = {'nom': 'Le Point', 'URL': targetURL}
    methode_extraction.extraction(targetURL, quotidien, titres)
def unes(targetURL):
    """Scrape front-page headlines from Le Figaro.

    Fetches *targetURL*, collects links and titles from the profile
    headline and ``fig-profil`` section anchors, and forwards the
    (title, link) pairs to ``methode_extraction.extraction``.

    :param targetURL: URL of the newspaper's front page to fetch.
    :returns: None; results are handed off to the extraction helper.
    """
    # `response` instead of `file` — avoid shadowing the builtin.
    response = urllib.urlopen(targetURL)
    data = response.read().decode('utf8')
    response.close()
    # Parse once; the original re-parsed the same data a second time.
    doc = lxml.html.document_fromstring(data)
    articles_href = (doc.xpath('//h1[@class="fig-profil-headline"]/a/@href') +
                     doc.xpath('//section[contains(@class, "fig-profil ")]/div/h2/a/@href'))
    article_titles = (doc.xpath('//h1[@class="fig-profil-headline"]/a/text()') +
                      doc.xpath('//section[contains(@class, "fig-profil ")]/div/h2/a/text()'))
    titres = zip(article_titles, articles_href)
    quotidien = {'nom': 'Le Figaro', 'URL': targetURL}
    methode_extraction.extraction('', quotidien, titres)