Python examples of methode_extraction.extraction

Example #1

0

Show file

def unes(targetURL):
    """Scrape headline titles and links from targetURL and pass them on.

    Downloads the page, parses it with lxml, collects the text and the href
    of every <a> element whose class attribute is exactly 'titre', and hands
    the (title, link) pairs to methode_extraction.extraction.

    Args:
        targetURL: URL of the page to download. It is also stored under the
            'URL' key of the newspaper dict and passed as the first argument
            of methode_extraction.extraction.

    Returns:
        None; the pairs are handed to methode_extraction.extraction.
    """
    response = urllib.urlopen(targetURL)
    try:
        data = response.read().decode('utf8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()

    doc = lxml.html.document_fromstring(data)
    titles = doc.xpath("//a[@class='titre']/text()")
    links = doc.xpath("//a[@class='titre']/@href")

    # zip() stops at the shorter list, so unmatched entries are dropped.
    titres = zip(titles, links)
    quotidien = {'nom': 'Le Dauphiné Libéré', 'URL': targetURL}
    methode_extraction.extraction(targetURL, quotidien, titres)

Example #2

0

Show file

File: JournalduNet.py Project: laetitiacstm/alaune

def unes(targetURL):
    """Scrape headline titles and links from targetURL and pass them on.

    Downloads the page, parses it once with lxml, collects the href and the
    text of every <a> directly under an <h2 class="app_title">, and hands the
    (title, link) pairs to methode_extraction.extraction.

    Args:
        targetURL: URL of the page to download; also stored under the 'URL'
            key of the newspaper dict.

    Returns:
        None; the pairs are handed to methode_extraction.extraction.
    """
    response = urllib.urlopen(targetURL)
    try:
        data = response.read().decode('utf8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()

    # One parse is enough: XPath queries do not modify the tree.
    doc = lxml.html.document_fromstring(data)
    articles_href = doc.xpath('//h2[@class="app_title"]/a/@href')
    article_titles = doc.xpath('//h2[@class="app_title"]/a/text()')

    # zip() stops at the shorter list, so unmatched entries are dropped.
    titres = zip(article_titles, articles_href)
    quotidien = {'nom': 'Le Journal du Net', 'URL': targetURL}
    # NOTE(review): the first argument is an empty string here; presumably a
    # URL prefix for relative links -- confirm against methode_extraction.
    methode_extraction.extraction('', quotidien, titres)

Example #3

0

Show file

File: LeParisien.py Project: laetitiacstm/alaune

def unes(targetURL):
    """Scrape headline titles and links from targetURL and pass them on.

    Downloads the page, parses it once with lxml, gathers the links found
    under <h1>, <h2> and <h3> headings inside <article> elements (in that
    order), and hands the (title, link) pairs to
    methode_extraction.extraction.

    Args:
        targetURL: URL of the page to download; also stored under the 'URL'
            key of the newspaper dict.

    Returns:
        None; the pairs are handed to methode_extraction.extraction.
    """
    response = urllib.urlopen(targetURL)
    try:
        data = response.read().decode('utf8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()

    # One parse is enough: XPath queries do not modify the tree.
    doc = lxml.html.document_fromstring(data)

    articles_href = (
        doc.xpath('//article/h1/a/@href')
        + doc.xpath('//article//h2/a/@href')
        + doc.xpath('//article//h3/a/@href')
    )
    # NOTE(review): the <h1> query uses 'a//text()' (all descendant text
    # nodes) while <h2>/<h3> use 'a/text()'; a link with nested markup could
    # yield several title strings and shift the pairing -- confirm intended.
    article_titles = (
        doc.xpath('//article/h1/a//text()')
        + doc.xpath('//article//h2/a/text()')
        + doc.xpath('//article//h3/a/text()')
    )

    # zip() stops at the shorter list, so unmatched entries are dropped.
    titres = zip(article_titles, articles_href)
    quotidien = {'nom': 'Le Parisien', 'URL': targetURL}
    # NOTE(review): the first argument is an empty string here; presumably a
    # URL prefix for relative links -- confirm against methode_extraction.
    methode_extraction.extraction('', quotidien, titres)

Example #4

0

Show file

def unes(targetURL):
    """Scrape headline titles and links from targetURL and pass them on.

    Downloads the page, parses it once with lxml, collects the href of each
    <a> matched by '//main/div/a' and the <h1> text found inside the
    <article> nested in those links, and hands the (title, link) pairs to
    methode_extraction.extraction.

    Args:
        targetURL: URL of the page to download. It is also stored under the
            'URL' key of the newspaper dict and passed as the first argument
            of methode_extraction.extraction.

    Returns:
        None; the pairs are handed to methode_extraction.extraction.
    """
    response = urllib.urlopen(targetURL)
    try:
        data = response.read().decode('utf8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()

    # One parse is enough: XPath queries do not modify the tree.
    doc = lxml.html.document_fromstring(data)
    articles_href = doc.xpath('//main/div/a/@href')
    article_titles = doc.xpath('//main/div/a/article//h1/text()')

    # zip() stops at the shorter list, so unmatched entries are dropped.
    titres = zip(article_titles, articles_href)
    quotidien = {'nom': 'Courrier International', 'URL': targetURL}
    methode_extraction.extraction(targetURL, quotidien, titres)

Example #5

0

Show file

def unes(targetURL):
    """Scrape headline titles and links from targetURL and pass them on.

    Downloads the page, parses it once with lxml, collects the links and
    <h2> titles inside the "articles essentiel " and "articles default "
    sections (in that order), and hands the (title, link) pairs to
    methode_extraction.extraction.

    Args:
        targetURL: URL of the page to download. It is also stored under the
            'URL' key of the newspaper dict and passed as the first argument
            of methode_extraction.extraction.

    Returns:
        None; the pairs are handed to methode_extraction.extraction.
    """
    # Fetch the URL the caller supplied rather than a hard-coded address, so
    # the downloaded page matches the URL recorded in `quotidien` below.
    response = urllib.urlopen(targetURL)
    try:
        data = response.read().decode('utf8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()

    # One parse is enough: XPath queries do not modify the tree.
    doc = lxml.html.document_fromstring(data)

    # The trailing space inside the class values is part of the match.
    articles_href = (
        doc.xpath('//section[@class="articles essentiel "]//div[@class="article-wrapper"]/a/@href')
        + doc.xpath('//section[@class="articles default "]//div[@class="article-wrapper"]/a/@href')
    )
    article_titles = (
        doc.xpath('//section[@class="articles essentiel "]//div[@class="article-wrapper"]/a/h2/text()')
        + doc.xpath('//section[@class="articles default "]//div[@class="article-wrapper"]/a/h2/text()')
    )

    # zip() stops at the shorter list, so unmatched entries are dropped.
    titres = zip(article_titles, articles_href)
    quotidien = {'nom': 'Sud Ouest', 'URL': targetURL}
    methode_extraction.extraction(targetURL, quotidien, titres)

Example #6

0

Show file

File: LesEchos.py Project: laetitiacstm/alaune

def unes(targetURL):
    """Scrape headline titles and links from targetURL and pass them on.

    Downloads the page, parses it once with lxml, and looks inside every
    <article class="article-small article-medium"> that is a direct child of
    a <div>: links come from the <a> under <figure>, titles from the <a>
    under <div class="titre">. The (title, link) pairs are handed to
    methode_extraction.extraction.

    Args:
        targetURL: URL of the page to download; also stored under the 'URL'
            key of the newspaper dict.

    Returns:
        None; the pairs are handed to methode_extraction.extraction.
    """
    response = urllib.urlopen(targetURL)
    try:
        data = response.read().decode('utf8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()

    # One parse is enough: XPath queries do not modify the tree.
    doc = lxml.html.document_fromstring(data)
    articles_href = doc.xpath(
        '//div/article[@class="article-small article-medium"]//figure/a/@href')
    article_titles = doc.xpath(
        '//div/article[@class="article-small article-medium"]//div[@class="titre"]/a/text()'
    )

    # Links and titles come from different elements; zip() stops at the
    # shorter list, so unmatched entries are dropped.
    titres = zip(article_titles, articles_href)
    quotidien = {'nom': 'Les Echos', 'URL': targetURL}
    # NOTE(review): the first argument is an empty string here; presumably a
    # URL prefix for relative links -- confirm against methode_extraction.
    methode_extraction.extraction('', quotidien, titres)

Example #7

0

Show file

File: LaTribune.py Project: laetitiacstm/alaune

def unes(targetURL):
    """Scrape headline titles and links from targetURL and pass them on.

    Downloads the page, parses it with lxml, gathers link texts and hrefs
    from the 'main-article', 'title-wrapper' and 'title-river' <div> blocks
    (in that order), and hands the (title, link) pairs to
    methode_extraction.extraction.

    Args:
        targetURL: URL of the page to download; also stored under the 'URL'
            key of the newspaper dict.

    Returns:
        None; the pairs are handed to methode_extraction.extraction.
    """
    response = urllib.urlopen(targetURL)
    try:
        data = response.read().decode('utf8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()

    doc = lxml.html.document_fromstring(data)

    # NOTE(review): the main-article title query uses '//article' (any
    # descendant) while the link query below uses '/article' (direct child);
    # the two may match different sets -- confirm this is intended.
    articles = doc.xpath("//div[@class='main-article']//article/h2/a/text()")
    articles += doc.xpath('//div[@class="title-wrapper"]//a/text()')
    articles += doc.xpath('//div[@class="title-river"]//a/text()')

    liens = doc.xpath('//div[@class="main-article"]/article/h2/a/@href')
    liens += doc.xpath('//div[@class="title-wrapper"]//a/@href')
    liens += doc.xpath('//div[@class="title-river"]//a/@href')

    # zip() stops at the shorter list, so unmatched entries are dropped.
    titres = zip(articles, liens)
    quotidien = {'nom': 'La Tribune', 'URL': targetURL}
    # NOTE(review): the first argument is an empty string here; presumably a
    # URL prefix for relative links -- confirm against methode_extraction.
    methode_extraction.extraction('', quotidien, titres)

Example #8

0

Show file

File: LePoint.py Project: laetitiacstm/alaune

def unes(targetURL):
    """Scrape headline titles and links from targetURL and pass them on.

    Downloads the page, parses it once with lxml, collects links from the
    'en-continu-li' articles and the 'row keep-cols' figures, collects
    titles from the 'art-lead' and 'art-title' <h2> headings, and hands the
    (title, link) pairs to methode_extraction.extraction.

    Args:
        targetURL: URL of the page to download. It is also stored under the
            'URL' key of the newspaper dict and passed as the first argument
            of methode_extraction.extraction.

    Returns:
        None; the pairs are handed to methode_extraction.extraction.
    """
    response = urllib.urlopen(targetURL)
    try:
        data = response.read().decode('utf8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()

    # One parse is enough: XPath queries do not modify the tree.
    doc = lxml.html.document_fromstring(data)

    articles_href = (
        doc.xpath('//article[@class="en-continu-li"]//a/@href')
        + doc.xpath('//div[@class="row keep-cols"]/figure/a/@href')
    )
    article_titles = (
        doc.xpath('//h2[@class="art-lead"]/text()')
        + doc.xpath('//div[@class="col plm"]//a/h2[@class="art-title"]/text()')
    )

    # NOTE(review): links and titles are selected through unrelated element
    # paths, so the pairing relies on both lists having the same order and
    # length; zip() silently drops any surplus entries.
    titres = zip(article_titles, articles_href)
    quotidien = {'nom': 'Le Point', 'URL': targetURL}
    methode_extraction.extraction(targetURL, quotidien, titres)

Example #9

0

Show file

def unes(targetURL):
    """Scrape headline titles and links from targetURL and pass them on.

    Downloads the page, parses it once with lxml, collects the link under
    <h1 class="fig-profil-headline"> followed by the <h2> links inside
    sections whose class contains 'fig-profil ', and hands the
    (title, link) pairs to methode_extraction.extraction.

    Args:
        targetURL: URL of the page to download; also stored under the 'URL'
            key of the newspaper dict.

    Returns:
        None; the pairs are handed to methode_extraction.extraction.
    """
    response = urllib.urlopen(targetURL)
    try:
        data = response.read().decode('utf8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()

    # One parse is enough: XPath queries do not modify the tree.
    doc = lxml.html.document_fromstring(data)

    # The trailing space in "fig-profil " is part of the contains() match.
    articles_href = (
        doc.xpath('//h1[@class="fig-profil-headline"]/a/@href')
        + doc.xpath('//section[contains(@class, "fig-profil ")]/div/h2/a/@href')
    )
    article_titles = (
        doc.xpath('//h1[@class="fig-profil-headline"]/a/text()')
        + doc.xpath('//section[contains(@class, "fig-profil ")]/div/h2/a/text()')
    )

    # zip() stops at the shorter list, so unmatched entries are dropped.
    titres = zip(article_titles, articles_href)
    quotidien = {'nom': 'Le Figaro', 'URL': targetURL}
    # NOTE(review): the first argument is an empty string here; presumably a
    # URL prefix for relative links -- confirm against methode_extraction.
    methode_extraction.extraction('', quotidien, titres)