def __init__(self):
    """Build the collaborators and counters of the IMDB person downloader.

    Creates a ``Downloader`` and an ``IMDBPersonStatusConnector``, fetches
    the logger named by ``DownloaderConfig``, starts ``failed_requests`` at
    zero and copies the one-page and global request limits from the
    configuration onto the instance.
    """
    config = DownloaderConfig

    self.downloader = Downloader()
    self.failed_requests = 0
    self.logger = initLogger.getLogger(
        config.IMDB_PERSON_DOWNLOADER_LOGGER_NAME)
    self.connector = IMDBPersonStatusConnector()

    # Request limits are read once, at construction time.
    self.onepage_limit = config.IMDB_PERSON_DOWNLOADER_ONEPAGE_REQUESTS_LIMIT
    self.global_limit = config.IMDB_PERSON_DOWNLOADER_GLOBAL_REQUESTS_LIMIT

    self.logger.debug("IMDB Person Downloader Created")
# /usr/bin/env python # -*- coding: latin-1 -*- #################################################################### #importe les modules internes import Logger.init_logger as initLogger #Initialise le logger import Logger.logger_config as loggerConfig import FilmExtractor_config as FilmExtractorConfig from IMDBExtractor.IMDBExtractor import * from cinema.models import * logger = initLogger.getLogger(FilmExtractorConfig.IMDB_FILM_EXTRACTOR_LOGGER_NAME) ################################################################### ################################################################ # # IMDB_*Extract Family # ################################################################## """ Les fonctions de ce module créent les objets nécessaires à l'extraction et remplissent la DB en appelant les fonction du module FilmExtractor_utils. Il existe une fonction par type de page""" def IMDB_filmExtract(film_id):
######################## #importe les modules internes import Logger.init_logger as initLogger #Initialise le logger from Extractor.superExtractor import SuperExtractor #Charge la super classe import Logger.logger_config as loggerConfig import Extractor.extractor_config as extractorConfig #Importe les modules exterieures à l'application from lxml import etree from lxml.html.clean import Cleaner import StringIO logger = initLogger.getLogger(extractorConfig.EXTRACTOR_HTML_LOGGER_NAME) ######################### """ La classe ExtractorHTML herite de la classe SuperExtractor. Elle permet de définir les fonctions nécessaire à l'extraction de données dans un document HTML, dont la chaine de caractère est donnée en paramètre de l'objet""" class ExtractorHTML(SuperExtractor): def __init__(self, htmlString, cleaner): SuperExtractor.__init__(self, htmlString) self.htmlString = self.string.replace("\n", "").replace("\r", "") self.parser = etree.HTMLParser() self.cleaner = cleaner self.cleanString = self.cleaner.clean_html(self.htmlString) #Définit l'arbre sur lequel se feront toutes les XPath extraction self.tree = etree.parse(StringIO.StringIO(self.cleanString),
from Extractor.extractorHTML import ExtractorHTML import Extractor.customisedCleaner as CustomCleaner from cinema.models import * from Connector.IMDBStatusConnector import * from FilmExtractor_utils.define_entities import * import re import urllib import random import codecs logger = initLogger.getLogger(IMDBExtractorConfig.IMDB_EXTRACTOR_LOGGER_NAME) ################################################################### import md5 ################################################################# class IMDBExtractor: """ Chaque page nécessite un extractor qui lui est propre : Film Personne (la structure de Actor/Writer/Director est identique) Company
#################################################################### #importe les modules internes import Logger.init_logger as initLogger #Initialise le logger import Logger.logger_config as loggerConfig import FilmExtractor_utils_config as FilmExtractorUtilsConfig from cinema.models import * logger = initLogger.getLogger(FilmExtractorUtilsConfig.UTILS_EXTRACTOR_LOGGER_NAME) ################################################################### ########################################################## # # DEFINE FAMILY # ########################################################## """Crée/Renvoie les objects pour intéragir avec la base de données Django.""" def defineFilm(film_id): try : f = Film.objects.get(imdb_id=film_id) return f
from Extractor.extractorHTML import ExtractorHTML import Extractor.customisedCleaner as CustomCleaner from Connector.IMDBStatusConnector import IMDBFilmStatusConnector import spider_config as SpiderConfig import urllib from urllib import FancyURLopener import re import random # Logger for this module logger = initLogger.getLogger(SpiderConfig.IMDB_SPIDER_LOGGER_NAME) #################################################################### # Custom User-Agent to load IMDB search results class IMDBSpiderURLopener(FancyURLopener): version = "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11" urllib._urlopener = IMDBSpiderURLopener() #################################################################### def searchURL(year, start_pos): logger.debug("Compute the url for IMDB search:")
'--imdb-awards', dest='imdb_awards', help="Si présent, lance l'extraction d'awards des films", action='store_true') parser.add_argument( '-actor', '--imdb-actors', dest='imdb_actors', help="Si présent, lance l'extraction des acteurs des films", action='store_true') #Crée le tableau global qui donne accès aux arguments passés en paramètres sur la ligne de commande initConfig.args = parser.parse_args() ########## # Crée les loggers & co logger = initLogger.getLogger(initConfig.SCRAPER_INIT_LOGGER_NAME) logger.debug('Logger {} créé'.format(initConfig.SCRAPER_INIT_LOGGER_NAME)) ################# # Vide le fichier de log si demandé debug_file = initConfig.RUN_TIME_FOLDER + loggerConfig.LOG_FILE if initConfig.args.fresh_debug: logger.info('Vide le fichier {}...'.format(debug_file)) open(debug_file, 'w').close() if initConfig.args.imdb_spider: logger.info('Lancement du Spider') import Spider.IMDBSpider if initConfig.args.imdb_priority_spider: logger.info('Lancement du Spider de Priorités')
import Logger.init_logger as initLogger #Initialise le logger import Logger.logger_config as loggerConfig import IMDBExtractor_config as IMDBExtractorConfig import Connector.IMDBStatusConnector import FilmExtractor.IMDB_Extractor from status.models import * from cinema.models import * import threading import time import random logger = initLogger.getLogger( IMDBExtractorConfig.EXTRACTOR_PERSON_PIC_LOGGER_NAME) ################################################################### year_min = 1980 year_max = 1989 priority_max = 1000 film_conn = Connector.IMDBStatusConnector.IMDBFilmStatusConnector() film_id_tab = film_conn.getExtractedFiltered(year_min, year_max, priority_max) for film_id in film_id_tab: logger.debug('Film en cours d extraction : {}'.format(film_id)) Connector.IMDBStatusConnector.IMDBFilmStatusConnector().setExtractedStatus( film_id, "0") FilmExtractor.IMDB_Extractor.IMDB_actorsDirectorsExtract(film_id) Connector.IMDBStatusConnector.IMDBFilmStatusConnector().setExtractedStatus(
# Internal modules
import Logger.init_logger as initLogger  # Logger factory
import Logger.logger_config as loggerConfig
import FilmExtractor_config as FilmExtractorConfig
import Connector.IMDBStatusConnector
import FilmExtractor.IMDB_Extractor
from status.models import *
from cinema.models import *
import threading
import time

logger = initLogger.getLogger(FilmExtractorConfig.EXTRACTOR_IMDB_INIT_LOGGER_NAME)
###################################################################

# Module-level selection values.
# NOTE(review): presumably a year range and a priority ceiling; they are not
# used in the visible part of this module -- confirm against the callers.
year_min = 2000
year_max = 2012
priority_max = 1000


def extractOneMovie(imdb_id):
    """Call ``IMDB_SuperExtractor`` for the film identified by *imdb_id*."""
    run_extraction = FilmExtractor.IMDB_Extractor.IMDB_SuperExtractor
    run_extraction(imdb_id)


def extractOneMovieAwards(imdb_id):
    """Call ``IMDB_awardsExtract`` for the film identified by *imdb_id*."""
    run_awards_extraction = FilmExtractor.IMDB_Extractor.IMDB_awardsExtract
    run_awards_extraction(imdb_id)


def setUnextractedToOneMovie(imdb_id):
    """Set the extracted status of *imdb_id* to ``"0"``.

    A new ``IMDBFilmStatusConnector`` is created for the call.
    ``"0"`` presumably means "not extracted", going by the function name.
    """
    status_connector = Connector.IMDBStatusConnector.IMDBFilmStatusConnector()
    status_connector.setExtractedStatus(imdb_id, "0")
# /usr/bin/env python # -*- coding: latin-1 -*- #################################################################### #importe les modules internes import Logger.init_logger as initLogger #Initialise le logger import Logger.logger_config as loggerConfig import FilmExtractor_utils_config as FilmExtractorUtilsConfig from cinema.models import * logger = initLogger.getLogger( FilmExtractorUtilsConfig.UTILS_EXTRACTOR_LOGGER_NAME) ################################################################### ########################################################## # # DEFINE FAMILY # ########################################################## """Crée/Renvoie les objects pour intéragir avec la base de données Django.""" def defineFilm(film_id): try: f = Film.objects.get(imdb_id=film_id) return f except Film.DoesNotExist:
from Extractor.extractorHTML import ExtractorHTML import Extractor.customisedCleaner as CustomCleaner from cinema.models import * from Connector.IMDBStatusConnector import * from FilmExtractor_utils.define_entities import * import re import urllib import random import codecs logger = initLogger.getLogger(IMDBExtractorConfig.IMDB_EXTRACTOR_LOGGER_NAME) ################################################################### import md5 ################################################################# class IMDBExtractor: """ Chaque page nécessite un extractor qui lui est propre : Film Personne (la structure de Actor/Writer/Director est identique) Company Keyword
def __init__(self):
    """Fetch the downloader logger and install the custom URL opener.

    Assigning ``urllib._urlopener`` changes the ``urllib`` module itself,
    so the opener is replaced for the whole process, not just for this
    instance.
    """
    # Logger
    logger_name = DownloaderConfig.DOWNLOADER_LOGGER_NAME
    self.logger = initLogger.getLogger(logger_name)

    # Module-wide side effect: every instantiation installs a new opener.
    urllib._urlopener = IMDBSpiderURLopener()
# /usr/bin/env python
# -*- coding: latin-1 -*-
####################################################################
# Internal modules
import Logger.init_logger as initLogger  # Logger factory
import Logger.logger_config as loggerConfig
import UserAgent.userAgent_config as userAgentConfig
from urllib import FancyURLopener
import urllib

logger = initLogger.getLogger(userAgentConfig.USER_AGENT_LOGGER_NAME)
###################################################################


####################################################################
# Custom User-Agent to load IMDB search results
class CustomURLopener(FancyURLopener):
    """``FancyURLopener`` subclass that overrides the ``version`` string."""

    version = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11"


# Importing this module replaces the opener stored on the urllib module.
logger.debug('Modification du User Agent')
urllib._urlopener = CustomURLopener()
####################################################################
def __init__(self):
    """Store on ``self.logger`` the logger named by
    ``ConnectorConfig.IMDB_COMPANY_STATUS_CONNECTOR_LOGGER_NAME``.
    """
    logger_name = ConnectorConfig.IMDB_COMPANY_STATUS_CONNECTOR_LOGGER_NAME
    self.logger = initLogger.getLogger(logger_name)
# /usr/bin/env python # -*- coding: latin-1 -*- #################################################################### #importe les modules internes import Logger.init_logger as initLogger #Initialise le logger import Logger.logger_config as loggerConfig import FilmExtractor_config as FilmExtractorConfig from IMDBExtractor.IMDBExtractor import * from cinema.models import * logger = initLogger.getLogger( FilmExtractorConfig.IMDB_FILM_EXTRACTOR_LOGGER_NAME) ################################################################### ################################################################ # # IMDB_*Extract Family # ################################################################## """ Les fonctions de ce module créent les objets nécessaires à l'extraction et remplissent la DB en appelant les fonction du module FilmExtractor_utils. Il existe une fonction par type de page""" def IMDB_filmExtract(film_id): logger.debug( "Lancement de l'extraction de la Page film pour le film {}".format( film_id))
from Extractor.extractorHTML import ExtractorHTML
import Extractor.customisedCleaner as CustomCleaner
from Connector.IMDBStatusConnector import IMDBFilmStatusConnector
import spider_config as SpiderConfig
import urllib
from urllib import FancyURLopener
import re
import random

# Logger for this module
logger = initLogger.getLogger(SpiderConfig.IMDB_SPIDER_LOGGER_NAME)


####################################################################
# Custom User-Agent to load IMDB search results
class IMDBSpiderURLopener(FancyURLopener):
    """``FancyURLopener`` subclass that overrides the ``version`` string."""

    version = "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11"


# Importing this module replaces the opener stored on the urllib module.
urllib._urlopener = IMDBSpiderURLopener()
####################################################################
# -*- coding: latin-1 -*-
####################################################################
# Internal modules
import Logger.init_logger as initLogger  # Logger factory
import Logger.logger_config as loggerConfig
import UserAgent.userAgent_config as userAgentConfig
from urllib import FancyURLopener
import urllib

logger = initLogger.getLogger(userAgentConfig.USER_AGENT_LOGGER_NAME)
###################################################################


####################################################################
# Custom User-Agent to load IMDB search results
class CustomURLopener(FancyURLopener):
    """``FancyURLopener`` subclass that overrides the ``version`` string."""

    version = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11"


# Importing this module replaces the opener stored on the urllib module.
logger.debug('Modification du User Agent')
urllib._urlopener = CustomURLopener()
####################################################################
def __init__(self):
    """Store on ``self.logger`` the logger named by
    ``ConnectorConfig.IMDB_COMPANY_STATUS_CONNECTOR_LOGGER_NAME``.
    """
    logger_name = ConnectorConfig.IMDB_COMPANY_STATUS_CONNECTOR_LOGGER_NAME
    self.logger = initLogger.getLogger(logger_name)
def __init__(self):
    """Fetch the downloader logger and install the custom URL opener.

    Assigning ``urllib._urlopener`` changes the ``urllib`` module itself,
    so the opener is replaced for the whole process, not just for this
    instance.
    """
    # Logger
    logger_name = DownloaderConfig.DOWNLOADER_LOGGER_NAME
    self.logger = initLogger.getLogger(logger_name)

    # Module-wide side effect: every instantiation installs a new opener.
    urllib._urlopener = IMDBSpiderURLopener()
from downloader import Downloader import downloader_config as DownloaderConfig import urllib from urllib import FancyURLopener import re import random import os import time # Logger for this module logger = initLogger.getLogger(DownloaderConfig.IMDB_DOWNLOADER_LOGGER_NAME) #################################################################### # Pages local PATHs def personPath(imdb_id): path = "{0}{1}.html".format(DownloaderConfig.IMDB_PERSON_ROOT, imdb_id) return path #################################################################### # Page URLs def personURL(imdb_id): url = "http://www.imdb.com/name/{0}/".format(imdb_id)
######################## #importe les modules internes import Logger.init_logger as initLogger #Initialise le logger from Extractor.superExtractor import SuperExtractor #Charge la super classe import Logger.logger_config as loggerConfig import Extractor.extractor_config as extractorConfig #Importe les modules exterieures à l'application from lxml import etree from lxml.html.clean import Cleaner import StringIO logger = initLogger.getLogger(extractorConfig.EXTRACTOR_HTML_LOGGER_NAME) ######################### """ La classe ExtractorHTML herite de la classe SuperExtractor. Elle permet de définir les fonctions nécessaire à l'extraction de données dans un document HTML, dont la chaine de caractère est donnée en paramètre de l'objet""" class ExtractorHTML(SuperExtractor): def __init__(self,htmlString,cleaner): SuperExtractor.__init__(self,htmlString) self.htmlString = self.string.replace("\n","").replace("\r","") self.parser = etree.HTMLParser() self.cleaner = cleaner self.cleanString = self.cleaner.clean_html(self.htmlString)
# Internal modules
import Logger.init_logger as initLogger  # Logger factory
import Logger.logger_config as loggerConfig
import IMDBExtractor_config as IMDBExtractorConfig
import Connector.IMDBStatusConnector
import FilmExtractor.IMDB_Extractor
from status.models import *
from cinema.models import *
import threading
import time
import random

logger = initLogger.getLogger(
    IMDBExtractorConfig.EXTRACTOR_PERSON_PIC_LOGGER_NAME)
###################################################################

# Arguments handed to getExtractedFiltered() below.
year_min = 1980
year_max = 1989
priority_max = 1000

film_conn = Connector.IMDBStatusConnector.IMDBFilmStatusConnector()
film_id_tab = film_conn.getExtractedFiltered(
    year_min, year_max, priority_max)

for film_id in film_id_tab:
    logger.debug('Film en cours d extraction : {}'.format(film_id))
    # The status is set to "0" before the actors/directors extraction and to
    # "1" after it returns; if the extraction raises, the film keeps "0".
    # A fresh connector is instantiated for each status update.
    Connector.IMDBStatusConnector.IMDBFilmStatusConnector().setExtractedStatus(
        film_id, "0")
    FilmExtractor.IMDB_Extractor.IMDB_actorsDirectorsExtract(film_id)
    Connector.IMDBStatusConnector.IMDBFilmStatusConnector().setExtractedStatus(
        film_id, "1")
parser.add_argument('-imdb_ex', '--imdb-extractor', dest = 'imdb_extractor', help = "Si présent, lance l'extraction des fichiers HTML en provenace de IMDB", action='store_true') parser.add_argument('-imdb_fsp', '--imdb-spider', dest = 'imdb_spider', help = "Si présent, lance le spider pour IMDB", action='store_true') parser.add_argument('-imdb_fpsp', '--imdb-priority-spider', dest = 'imdb_priority_spider', help = "Si présent, lance le spider pour les priorités de film IMDB", action='store_true') parser.add_argument('-imdb_psp', '--imdb-person-spider', dest = 'imdb_person_spider', help = "Si présent, lance le spider pour les personnes IMDB", action='store_true') parser.add_argument('-imdb_fdw', '--imdb-film-downloader', dest = 'imdb_film_downloader', help = "Si présent, lance le downloader des films IMDB", action='store_true') parser.add_argument('-imdb_pdw', '--imdb-person-downloader', dest = 'imdb_person_downloader', help = "Si présent, lance le downloader des personnes IMDB", action='store_true') parser.add_argument('-imdb_cdw', '--imdb-company-downloader', dest = 'imdb_company_downloader', help = "Si présent, lance le downloader des entreprises IMDB", action='store_true') parser.add_argument('-pic', '--imdb-person-picture', dest = 'imdb_person_picture', help = "Si présent, lance l'extraction d'image des personnes", action='store_true') parser.add_argument('-aw', '--imdb-awards', dest = 'imdb_awards', help = "Si présent, lance l'extraction d'awards des films", action='store_true') parser.add_argument('-actor', '--imdb-actors', dest = 'imdb_actors', help = "Si présent, lance l'extraction des acteurs des films", action='store_true') #Crée le tableau global qui donne accès aux arguments passés en paramètres sur la ligne de commande initConfig.args = parser.parse_args() ########## # Crée les loggers & co logger = initLogger.getLogger (initConfig.SCRAPER_INIT_LOGGER_NAME) logger.debug('Logger {} créé'.format(initConfig.SCRAPER_INIT_LOGGER_NAME)) ################# # 
Vide le fichier de log si demandé debug_file = initConfig.RUN_TIME_FOLDER + loggerConfig.LOG_FILE if initConfig.args.fresh_debug: logger.info ('Vide le fichier {}...'.format(debug_file)) open(debug_file, 'w').close() if initConfig.args.imdb_spider: logger.info ('Lancement du Spider') import Spider.IMDBSpider if initConfig.args.imdb_priority_spider: logger.info ('Lancement du Spider de Priorités')
#! /usr/bin/env python
# -*- coding: latin-1 -*-
# Check that the required modules can be imported before the application
# starts.
import sys

import Logger.init_logger as initLogger  # Logger factory
import init_config as initConfig

logger = initLogger.getLogger(initConfig.IMPORTS_LOGGER_NAME)

# NOTE(review): MySQLdb is imported without a guard, so a missing MySQLdb
# surfaces as a plain ImportError traceback rather than a logged message.
import MySQLdb

try:
    import lxml
except ImportError:
    # Catch only a failed import: a bare ``except`` would also intercept
    # KeyboardInterrupt/SystemExit and report them as "lxml not found".
    logger.critical("Le module lxml est nécessaire pour l'application, mais n'a pas été trouvé. Installation requise")
    # sys.exit is used instead of the ``exit`` helper, which is injected by
    # the ``site`` module and is absent when Python runs with ``-S``.
    sys.exit(1)
# Internal modules
import Logger.init_logger as initLogger  # Logger factory
import Logger.logger_config as loggerConfig
import FilmExtractor_config as FilmExtractorConfig
import Connector.IMDBStatusConnector
import FilmExtractor.IMDB_Extractor
from status.models import *
from cinema.models import *
import threading
import time

logger = initLogger.getLogger(
    FilmExtractorConfig.EXTRACTOR_IMDB_INIT_LOGGER_NAME)
###################################################################

# Module-level selection values.
# NOTE(review): presumably a year range and a priority ceiling; they are not
# used in the visible part of this module -- confirm against the callers.
year_min = 2000
year_max = 2012
priority_max = 1000


def extractOneMovie(imdb_id):
    """Call ``IMDB_SuperExtractor`` for the film identified by *imdb_id*."""
    run_extraction = FilmExtractor.IMDB_Extractor.IMDB_SuperExtractor
    run_extraction(imdb_id)


def extractOneMovieAwards(imdb_id):
    """Call ``IMDB_awardsExtract`` for the film identified by *imdb_id*."""
    run_awards_extraction = FilmExtractor.IMDB_Extractor.IMDB_awardsExtract
    run_awards_extraction(imdb_id)