class IMDBSearchResultsExtractor:
    """Scrape an IMDB title search results page.

    The page is downloaded once in the constructor; the ``extract*`` methods
    then query it through ``self.extractor``.
    """

    def __init__(self, url):
        """Download ``url`` and wrap its content in an ``ExtractorHTML``.

        Args:
            url: Address handed to ``urllib.urlopen``.

        Errors raised while opening or reading the page are not caught here.
        """
        logger.debug("Create IMDB Search Result Extractor")
        page = urllib.urlopen(url)
        try:
            html = page.read()
        finally:
            # The handle is only needed for the read; close it even when the
            # read fails so the connection is not leaked.
            page.close()
        cleaner = CustomCleaner.CustomedCleaner_HTML()
        self.extractor = ExtractorHTML(html, cleaner)
        # logger.debug(self.extractor.cleanString)
        logger.debug(
            "IMDB Search Result Extractor created for webpage {} ".format(url))

    def extractNumberOfResults(self):
        """Return the total number of search results as an int.

        Takes the first text returned for ``//div[@id="left"]``, drops the
        commas and converts the last run of digits found in it.

        Raises:
            IndexError: if the XPath yields no text or the text has no digits.
        """
        logger.debug("Extract Number of Search Results")
        text = self.extractor.extractXpathText('//div[@id="left"]')[0]
        # Commas are removed first so a count such as "1,234" stays one number.
        nb = int(re.findall(r'\d+', text.replace(",", ""))[-1])
        logger.debug("Number of Results: {}".format(nb))
        return nb

    def extractIds(self):
        """Return the list of ``tt<digits>`` ids found in the title links.

        One id (the first match) is taken from every href returned for
        ``//td[@class="title"]//a/@href``.

        Raises:
            IndexError: if one of the hrefs contains no ``tt<digits>`` token.
        """
        logger.debug("Extract IMDB ids")
        links = self.extractor.extractXpathElement(
            '//td[@class="title"]//a/@href')
        # A list comprehension always yields a real list, whereas map() is
        # lazy on Python 3 and would log as "<map object ...>".
        ids = [re.findall(r'tt\d+', link)[0] for link in links]
        logger.debug("IMDB ids: {}".format(ids))
        return ids

    def extractPositions(self):
        """Return the list of result positions as ints.

        The first run of digits is taken from every text returned for
        ``//td[@class="number"]``.

        Raises:
            IndexError: if one of the texts contains no digits.
        """
        logger.debug("Extract positions")
        labels = self.extractor.extractXpathText('//td[@class="number"]')
        positions = [int(re.findall(r'\d+', label)[0]) for label in labels]
        logger.debug("Positions: {}".format(positions))
        return positions
def __init__(self, url):
    """Download ``url`` and wrap its content in an ``ExtractorHTML``.

    Args:
        url: Address handed to ``urllib.urlopen``.

    The resulting extractor is stored on ``self.extractor``. Errors raised
    while opening or reading the page are not caught here.
    """
    logger.debug("Create IMDB Search Result Extractor")
    page = urllib.urlopen(url)
    try:
        html = page.read()
    finally:
        # Close the handle even if the read fails, so the connection is not
        # left open for the lifetime of the object.
        page.close()
    cleaner = CustomCleaner.CustomedCleaner_HTML()
    self.extractor = ExtractorHTML(html, cleaner)
    # logger.debug(self.extractor.cleanString)
    logger.debug(
        "IMDB Search Result Extractor created for webpage {} ".format(url))
def __init__(self, url):
    """Fetch ``url`` and build the HTML extractor over the downloaded page.

    Args:
        url: Address handed to ``urllib.urlopen``.

    The resulting extractor is stored on ``self.extractor``. Errors raised
    while opening or reading the page propagate to the caller.
    """
    # Local import: keeps this method self-contained without touching the
    # module's import block.
    from contextlib import closing

    logger.debug("Create IMDB Search Result Extractor")
    # closing() guarantees page.close() runs once the body has been read,
    # including when read() raises.
    with closing(urllib.urlopen(url)) as page:
        html = page.read()
    cleaner = CustomCleaner.CustomedCleaner_HTML()
    self.extractor = ExtractorHTML(html, cleaner)
    # logger.debug(self.extractor.cleanString)
    logger.debug(
        "IMDB Search Result Extractor created for webpage {} ".format(url))
def createExtractorEngine(self):
    """Build the extractor proper.

    Called when the page is "downloaded but not extracted". The page is
    loaded first; ``self.extractor`` is only created when
    ``self.isExtractable`` is truthy after that load.
    """
    raw_page = self.loadPage()
    # Nothing to build for a page flagged as non-extractable.
    if not self.isExtractable:
        return
    self.extractor = ExtractorHTML(
        raw_page, CustomCleaner.CustomedCleaner_HTML())
class IMDBPersonSearchResultsExtractor:
    """Scrape an IMDB person search results page.

    The page is downloaded once in the constructor; the ``extract*`` methods
    then query it through ``self.extractor``.
    """

    def __init__(self, url):
        """Download ``url`` and wrap its content in an ``ExtractorHTML``.

        Args:
            url: Address handed to ``urllib.urlopen``.

        Errors raised while opening or reading the page are not caught here.
        """
        logger.debug("Create IMDB Person Search Result Extractor")
        page = urllib.urlopen(url)
        try:
            html = page.read()
        finally:
            # The handle is only needed for the read; close it even when the
            # read fails so the connection is not leaked.
            page.close()
        cleaner = CustomCleaner.CustomedCleaner_HTML()
        self.extractor = ExtractorHTML(html, cleaner)
        # logger.debug(self.extractor.cleanString)
        logger.debug(
            "IMDB Person Search Result Extractor created for webpage {} ".
            format(url))

    def extractNumberOfResults(self):
        """Return the total number of search results as an int.

        Takes the first text returned for ``//div[@id="left"]``, drops the
        commas and converts the last run of digits found in it.

        Raises:
            IndexError: if the XPath yields no text or the text has no digits.
        """
        logger.debug("Extract Number of Search Results")
        text = self.extractor.extractXpathText('//div[@id="left"]')[0]
        # Commas are removed first so a count such as "1,234" stays one number.
        nb = int(re.findall(r'\d+', text.replace(",", ""))[-1])
        logger.debug("Number of Results: {}".format(nb))
        return nb

    def extractIds(self):
        """Return the list of ``nm<digits>`` ids found in the name links.

        One id (the first match) is taken from every href returned for
        ``//td[@class="name"]//a/@href``.

        Raises:
            IndexError: if one of the hrefs contains no ``nm<digits>`` token.
        """
        logger.debug("Extract IMDB ids")
        links = self.extractor.extractXpathElement(
            '//td[@class="name"]//a/@href')
        # A list comprehension always yields a real list, whereas map() is
        # lazy on Python 3 and would log as "<map object ...>".
        ids = [re.findall(r'nm\d+', link)[0] for link in links]
        logger.debug("IMDB ids: {}".format(ids))
        return ids

    def extractPositions(self):
        """Return the list of result positions (priorities) as ints.

        The first run of digits is taken from every text returned for
        ``//td[@class="number"]``.

        Raises:
            IndexError: if one of the texts contains no digits.
        """
        logger.debug("Extract positions (priorities)")
        labels = self.extractor.extractXpathText('//td[@class="number"]')
        positions = [int(re.findall(r'\d+', label)[0]) for label in labels]
        logger.debug("Positions: {}".format(positions))
        return positions
#! /usr/bin/env python
# -*- coding: latin-1 -*-
# Manual check script: downloads the Le Monde home page, wraps it in an
# ExtractorHTML and runs the link extraction on it.
import Extractor.superExtractor
from Extractor.extractorHTML import ExtractorHTML
import urllib
import Extractor.customisedCleaner as CustomCleaner

page = urllib.urlopen('http://www.lemonde.fr/')
try:
    t = page.read()
finally:
    # The handle is only needed for the read; close it instead of leaving the
    # connection open until interpreter exit.
    page.close()
cleaner = CustomCleaner.CustomedCleaner_HTML()
Ex = ExtractorHTML(t, cleaner)

# Title extraction
# Ex.extractTitle()

# H1, H2, H3 tag extraction
# Ex.extractH1()
# Ex.extractH2()
# Ex.extractH3()

# Strong and Em tag extraction
# Ex.extractStrong()
# Ex.extractEm()

# Link extraction (links + link contents)
Ex.extractLink()
#! /usr/bin/env python
# -*- coding: latin-1 -*-
# Manual check script: fetches the Le Monde home page, builds an ExtractorHTML
# over it and runs the link extraction.
import Extractor.superExtractor
from Extractor.extractorHTML import ExtractorHTML
from contextlib import closing
import urllib
import Extractor.customisedCleaner as CustomCleaner

# closing() makes sure the HTTP handle is released right after the read,
# including when read() raises.
with closing(urllib.urlopen('http://www.lemonde.fr/')) as page:
    t = page.read()
cleaner = CustomCleaner.CustomedCleaner_HTML()
Ex = ExtractorHTML(t, cleaner)

# Extract the title
# Ex.extractTitle()

# Extract the H1, H2, H3 tags
# Ex.extractH1()
# Ex.extractH2()
# Ex.extractH3()

# Extract the Strong and Em tags
# Ex.extractStrong()
# Ex.extractEm()

# Extract the links and their contents
Ex.extractLink()