Example #1
import logging
import re
import urllib

import Extractor.customisedCleaner as CustomCleaner
from Extractor.extractorHTML import ExtractorHTML

logger = logging.getLogger(__name__)  # assumed: the original module defines its own logger


class IMDBSearchResultsExtractor:

    def __init__(self, url):
        logger.debug("Create IMDB Search Result Extractor")

        # Download the search results page and build an extractor on the cleaned HTML.
        page = urllib.urlopen(url)
        t = page.read()
        cleaner = CustomCleaner.CustomedCleaner_HTML()
        self.extractor = ExtractorHTML(t, cleaner)
        #logger.debug(self.extractor.cleanString)
        logger.debug("IMDB Search Result Extractor created for webpage {}".format(url))

    def extractNumberOfResults(self):
        logger.debug("Extract Number of Search Results")

        # The header block lists a result range and the total; the total is the last number.
        text = self.extractor.extractXpathText('//div[@id="left"]')[0]
        nb = int(re.findall(r'\d+', text.replace(",", ""))[-1])
        logger.debug("Number of Results: {}".format(nb))
        return nb

    def extractIds(self):
        logger.debug("Extract IMDB ids")

        # Title links carry the IMDB id in the form "tt1234567".
        links = self.extractor.extractXpathElement('//td[@class="title"]//a/@href')
        ids = map(lambda s: re.findall(r'tt\d+', s)[0], links)
        logger.debug("IMDB ids: {}".format(ids))
        return ids

    def extractPositions(self):
        logger.debug("Extract positions")

        # The "number" cells hold each result's rank.
        labels = self.extractor.extractXpathText('//td[@class="number"]')
        positions = map(lambda s: int(re.findall(r'\d+', s)[0]), labels)
        logger.debug("Positions: {}".format(positions))
        return positions
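
A minimal usage sketch of the class above. The search URL and the logging configuration are illustrative assumptions, not part of the original module.

import logging
logging.basicConfig(level=logging.DEBUG)   # assumed logging setup

search_url = 'http://www.imdb.com/search/title?title=matrix'   # hypothetical URL
ex = IMDBSearchResultsExtractor(search_url)

total = ex.extractNumberOfResults()
ids = ex.extractIds()
positions = ex.extractPositions()

# Pair each title id with its rank in the result list.
ranked = dict(zip(ids, positions))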
Example #2
    def createExtractorEngine(self):
        """
        Create the extractor proper.
        Called when the page is "downloaded but not extracted".
        """
        t = self.loadPage()
        if self.isExtractable:
            cleaner = CustomCleaner.CustomedCleaner_HTML()
            self.extractor = ExtractorHTML(t, cleaner)
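
loadPage, isExtractable and the surrounding page class are not shown in this excerpt, so the sketch below guesses a minimal host class purely to illustrate the "downloaded but not extracted" flow; every name other than createExtractorEngine, CustomCleaner and ExtractorHTML is a hypothetical assumption.

# Hypothetical host class, only to show how createExtractorEngine might be driven.
class DownloadedPage(object):
    def __init__(self, html):
        self._html = html
        self.isExtractable = True
        self.extractor = None              # "downloaded but not extracted"

    def loadPage(self):
        return self._html                  # the real class presumably reads from disk or cache

    def createExtractorEngine(self):
        t = self.loadPage()
        if self.isExtractable:
            cleaner = CustomCleaner.CustomedCleaner_HTML()
            self.extractor = ExtractorHTML(t, cleaner)

p = DownloadedPage('<html><body>hello</body></html>')
if p.extractor is None:                    # extract lazily, only when needed
    p.createExtractorEngine()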
Example #3
# Requires the same imports and logger as Example #1 (logging, re, urllib,
# ExtractorHTML, CustomCleaner).
class IMDBPersonSearchResultsExtractor:

    def __init__(self, url):
        logger.debug("Create IMDB Person Search Result Extractor")

        # Download the name-search results page and build an extractor on the cleaned HTML.
        page = urllib.urlopen(url)
        t = page.read()
        cleaner = CustomCleaner.CustomedCleaner_HTML()
        self.extractor = ExtractorHTML(t, cleaner)
        #logger.debug(self.extractor.cleanString)
        logger.debug("IMDB Person Search Result Extractor created for webpage {}".format(url))

    def extractNumberOfResults(self):
        logger.debug("Extract Number of Search Results")

        # The header block lists a result range and the total; the total is the last number.
        text = self.extractor.extractXpathText('//div[@id="left"]')[0]
        nb = int(re.findall(r'\d+', text.replace(",", ""))[-1])
        logger.debug("Number of Results: {}".format(nb))
        return nb

    def extractIds(self):
        logger.debug("Extract IMDB ids")

        # Name links carry the IMDB id in the form "nm1234567".
        links = self.extractor.extractXpathElement('//td[@class="name"]//a/@href')
        ids = map(lambda s: re.findall(r'nm\d+', s)[0], links)
        logger.debug("IMDB ids: {}".format(ids))
        return ids

    def extractPositions(self):
        logger.debug("Extract positions (priorities)")

        # The "number" cells hold each result's rank.
        labels = self.extractor.extractXpathText('//td[@class="number"]')
        positions = map(lambda s: int(re.findall(r'\d+', s)[0]), labels)
        logger.debug("Positions: {}".format(positions))
        return positions
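
As with the title extractor, a minimal usage sketch; the search URL is a hypothetical placeholder.

person_url = 'http://www.imdb.com/find?s=nm&q=nolan'   # hypothetical URL
pe = IMDBPersonSearchResultsExtractor(person_url)
print pe.extractNumberOfResults()
# Keep the best-ranked (lowest position) person id.
best_id = min(zip(pe.extractPositions(), pe.extractIds()))[1]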
Example #4
#! /usr/bin/env python
# -*- coding: latin-1 -*-

import Extractor.superExtractor
from Extractor.extractorHTML import ExtractorHTML

import urllib

import Extractor.customisedCleaner as CustomCleaner

# Download the lemonde.fr front page and build an extractor on the cleaned HTML.
page = urllib.urlopen('http://www.lemonde.fr/')
t = page.read()

cleaner = CustomCleaner.CustomedCleaner_HTML()

Ex = ExtractorHTML(t, cleaner)
# Extract the title
#Ex.extractTitle()

# Extract the H1, H2, H3 tags
#Ex.extractH1()
#Ex.extractH2()
#Ex.extractH3()

# Extract the Strong and Em tags
#Ex.extractStrong()
#Ex.extractEm()

# Extract the links and their contents
Ex.extractLink()
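
The same ExtractorHTML object also accepts xpath queries, as in the IMDB examples above; a short sketch follows (the //h1 xpath is an illustrative guess about lemonde.fr's markup, not something the original script does).

# Sketch: reuse the xpath helper seen in the IMDB examples on this page.
headlines = Ex.extractXpathText('//h1')   # hypothetical xpath
for h in headlines:
    print h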