def setCookie(self, path=False):
    """ Set the cookie handler. """
    if path:
        self.__url_cookiepath = path
    try:
        import cookielib
    except ImportError:
        try:
            import ClientCookie
        except ImportError:
            # neither cookie module is available: plain urllib2, no cookies
            urlopen = urllib2.urlopen
            Request = urllib2.Request
        else:
            urlopen = ClientCookie.urlopen
            Request = ClientCookie.Request
            self.__url_cookie = ClientCookie.MozillaCookieJar()
            if path and os.path.isfile(path):
                #noinspection PyBroadException
                try:
                    self.__url_cookie.load(path)
                except Exception:
                    pass
            opener = ClientCookie.build_opener(ClientCookie.HTTPCookieProcessor(self.__url_cookie))
            ClientCookie.install_opener(opener)
    else:
        # cookielib imported fine: use urllib2 with a cookielib jar
        # (branch added; without it, Request/urlopen were never assigned
        # on the cookielib path and the assignments below raised NameError)
        urlopen = urllib2.urlopen
        Request = urllib2.Request
        self.__url_cookie = cookielib.MozillaCookieJar()
        if path and os.path.isfile(path):
            #noinspection PyBroadException
            try:
                self.__url_cookie.load(path)
            except Exception:
                pass
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.__url_cookie))
        urllib2.install_opener(opener)
    self.__url_request = Request
    self.__url_urlopen = urlopen
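# Usage sketch (hedged): setCookie is a method of the surrounding downloader
# class, which is not shown in this fragment; "Handler" below is only a
# stand-in name for it, and the path is illustrative.
#
#   h = Handler()
#   h.setCookie("/tmp/cookies.dat")
#   # After this call, self.__url_request / self.__url_urlopen point at the
#   # cookie-aware Request/urlopen pair installed above.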
def read_body_and_headers(url, post=None, headers=None, follow_redirects=False, timeout=None):
    _log("read_body_and_headers " + url)

    # avoid the shared mutable default argument
    if headers is None:
        headers = []

    if post is not None:
        _log("read_body_and_headers post=" + post)

    if len(headers) == 0:
        headers.append(["User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:18.0) Gecko/20100101 Firefox/18.0"])

    # Start cookie lib
    ficherocookies = os.path.join(get_data_path(), 'cookies.dat')
    _log("read_body_and_headers cookies_file=" + ficherocookies)

    cj = None
    ClientCookie = None
    cookielib = None

    # Let's see if cookielib is available
    try:
        _log("read_body_and_headers importing cookielib")
        import cookielib
    except ImportError:
        _log("read_body_and_headers cookielib not available")
        # If importing cookielib fails
        # let's try ClientCookie
        try:
            _log("read_body_and_headers importing ClientCookie")
            import ClientCookie
        except ImportError:
            _log("read_body_and_headers ClientCookie not available")
            # ClientCookie isn't available either
            urlopen = urllib2.urlopen
            Request = urllib2.Request
        else:
            _log("read_body_and_headers ClientCookie available")
            # imported ClientCookie
            urlopen = ClientCookie.urlopen
            Request = ClientCookie.Request
            cj = ClientCookie.MozillaCookieJar()
    else:
        _log("read_body_and_headers cookielib available")
        # importing cookielib worked
        urlopen = urllib2.urlopen
        Request = urllib2.Request
        cj = cookielib.MozillaCookieJar()
        # This is a subclass of FileCookieJar
        # that has useful load and save methods

    if cj is not None:
        # we successfully imported
        # one of the two cookie handling modules
        _log("read_body_and_headers Cookies enabled")

        if os.path.isfile(ficherocookies):
            _log("read_body_and_headers Reading cookie file")
            # if we have a cookie file already saved
            # then load the cookies into the Cookie Jar
            try:
                cj.load(ficherocookies)
            except:
                _log("read_body_and_headers Wrong cookie file, deleting...")
                os.remove(ficherocookies)

        # Now we need to get our Cookie Jar
        # installed in the opener;
        # for fetching URLs
        if cookielib is not None:
            _log("read_body_and_headers opener using urllib2 (cookielib)")
            # if we use cookielib
            # then we get the HTTPCookieProcessor
            # and install the opener in urllib2
            if not follow_redirects:
                opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=http_debug_log_enabled), urllib2.HTTPCookieProcessor(cj), NoRedirectHandler())
            else:
                opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=http_debug_log_enabled), urllib2.HTTPCookieProcessor(cj))
            urllib2.install_opener(opener)
        else:
            _log("read_body_and_headers opener using ClientCookie")
            # if we use ClientCookie
            # then we get the HTTPCookieProcessor
            # and install the opener in ClientCookie
            opener = ClientCookie.build_opener(ClientCookie.HTTPCookieProcessor(cj))
            ClientCookie.install_opener(opener)

    # -------------------------------------------------
    # Cookies installed, launch the request
    # -------------------------------------------------

    # Timer
    inicio = time.clock()

    # Dictionary for the headers
    txheaders = {}

    # Build the request
    if post is None:
        _log("read_body_and_headers GET request")
    else:
        _log("read_body_and_headers POST request")

    # Add the headers
    _log("read_body_and_headers ---------------------------")
    for header in headers:
        _log("read_body_and_headers header %s=%s" % (str(header[0]), str(header[1])))
        txheaders[header[0]] = header[1]
    _log("read_body_and_headers ---------------------------")

    req = Request(url, post, txheaders)
    if timeout is None:
        handle = urlopen(req)
    else:
        # Available in Python 2.6 onwards --> handle = urlopen(req, timeout=timeout)
        # For all versions:
        try:
            import socket
            deftimeout = socket.getdefaulttimeout()
            socket.setdefaulttimeout(timeout)
            handle = urlopen(req)
            socket.setdefaulttimeout(deftimeout)
        except:
            import sys
            for line in sys.exc_info():
                _log("%s" % line)
            raise  # re-raise: "handle" would be undefined below

    # Update the cookie store (only if a cookie jar could be created)
    if cj is not None:
        cj.save(ficherocookies)

    # Read the data and close
    if handle.info().get('Content-Encoding') == 'gzip':
        buf = StringIO(handle.read())
        f = gzip.GzipFile(fileobj=buf)
        data = f.read()
    else:
        data = handle.read()

    info = handle.info()
    _log("read_body_and_headers Response")

    returnheaders = []
    _log("read_body_and_headers ---------------------------")
    for header in info:
        _log("read_body_and_headers " + header + "=" + info[header])
        returnheaders.append([header, info[header]])
    handle.close()
    _log("read_body_and_headers ---------------------------")

    '''
    # Launch the request
    try:
        response = urllib2.urlopen(req)
    # If it fails, retry replacing special characters
    except:
        req = urllib2.Request(url.replace(" ", "%20"))
        # Add the headers
        for header in headers:
            req.add_header(header[0], header[1])
        response = urllib2.urlopen(req)
    '''

    # Elapsed time
    fin = time.clock()
    _log("read_body_and_headers Downloaded in %d seconds " % (fin - inicio + 1))

    return data, returnheaders
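# Usage sketch for read_body_and_headers (hedged: the URL and extra header
# are illustrative placeholders, not part of this module):
#
#   data, response_headers = read_body_and_headers(
#       "http://example.com/page",
#       headers=[["Referer", "http://example.com/"]],
#       timeout=10)
#   for name, value in response_headers:
#       print name, "=", value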
def downloadpage(url,
                 post=None,
                 headers=[['User-Agent', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; es-ES; rv:1.9.2.12) Gecko/20101026 Firefox/3.6.12']],
                 follow_redirects=True,
                 timeout=socket.getdefaulttimeout()):
    logger.info("[scrapertools.py] downloadpage")
    logger.info("[scrapertools.py] url=" + url)

    if post is not None:
        logger.info("[scrapertools.py] post=" + post)
    else:
        logger.info("[scrapertools.py] post=None")

    # ---------------------------------
    # Install the cookies
    # ---------------------------------

    # Initialize the cookie library
    ficherocookies = os.path.join(config.get_setting("cookies.dir"), 'cookies.dat')
    logger.info("[scrapertools.py] ficherocookies=" + ficherocookies)

    cj = None
    ClientCookie = None
    cookielib = None

    # Let's see if cookielib is available
    try:
        logger.info("[scrapertools.py] Importing cookielib")
        import cookielib
    except ImportError:
        logger.info("[scrapertools.py] cookielib not available")
        # If importing cookielib fails
        # let's try ClientCookie
        try:
            logger.info("[scrapertools.py] Importing ClientCookie")
            import ClientCookie
        except ImportError:
            logger.info("[scrapertools.py] ClientCookie not available")
            # ClientCookie isn't available either
            urlopen = urllib2.urlopen
            Request = urllib2.Request
        else:
            logger.info("[scrapertools.py] ClientCookie available")
            # imported ClientCookie
            urlopen = ClientCookie.urlopen
            Request = ClientCookie.Request
            cj = ClientCookie.MozillaCookieJar()
    else:
        logger.info("[scrapertools.py] cookielib available")
        # importing cookielib worked
        urlopen = urllib2.urlopen
        Request = urllib2.Request
        cj = cookielib.MozillaCookieJar()
        # This is a subclass of FileCookieJar
        # that has useful load and save methods

    if cj is not None:
        # we successfully imported
        # one of the two cookie handling modules
        logger.info("[scrapertools.py] Cookies enabled")

        if os.path.isfile(ficherocookies):
            logger.info("[scrapertools.py] Reading cookie file")
            # if we have a cookie file already saved
            # then load the cookies into the Cookie Jar
            try:
                cj.load(ficherocookies)
            except:
                logger.info("[scrapertools.py] Cookie file exists but is unreadable, deleting it")
                os.remove(ficherocookies)

        # Now we need to get our Cookie Jar
        # installed in the opener;
        # for fetching URLs
        if cookielib is not None:
            logger.info("[scrapertools.py] opener using urllib2 (cookielib)")
            # if we use cookielib
            # then we get the HTTPCookieProcessor
            # and install the opener in urllib2
            if not follow_redirects:
                opener = urllib2.build_opener(
                    urllib2.HTTPHandler(debuglevel=DEBUG_LEVEL),
                    urllib2.HTTPCookieProcessor(cj), NoRedirectHandler())
            else:
                opener = urllib2.build_opener(
                    urllib2.HTTPHandler(debuglevel=DEBUG_LEVEL),
                    urllib2.HTTPCookieProcessor(cj))
            urllib2.install_opener(opener)
        else:
            logger.info("[scrapertools.py] opener using ClientCookie")
            # if we use ClientCookie
            # then we get the HTTPCookieProcessor
            # and install the opener in ClientCookie
            opener = ClientCookie.build_opener(ClientCookie.HTTPCookieProcessor(cj))
            ClientCookie.install_opener(opener)

    # -------------------------------------------------
    # Cookies installed, launch the request
    # -------------------------------------------------

    # Timer
    inicio = time.clock()

    # Dictionary for the headers
    txheaders = {}

    # Build the request
    if post is None:
        logger.info("[scrapertools.py] GET request")
    else:
        logger.info("[scrapertools.py] POST request")

    # Add the headers
    logger.info("[scrapertools.py] ---------------------------")
    for header in headers:
        logger.info("[scrapertools.py] header %s=%s" % (str(header[0]), str(header[1])))
        txheaders[header[0]] = header[1]
    logger.info("[scrapertools.py] ---------------------------")

    req = Request(url, post, txheaders)
    try:
        if timeout is None:
            handle = urlopen(req)
        else:
            # For all Python versions:
            deftimeout = socket.getdefaulttimeout()
            socket.setdefaulttimeout(timeout)
            handle = urlopen(req)
            socket.setdefaulttimeout(deftimeout)

        # Update the cookie store
        #Exception
        #cj.save(ficherocookies)

        # Read the data and close
        if handle.info().get('Content-Encoding') == 'gzip':
            logger.info("[scrapertools.py] gzipped")
            import StringIO
            data = handle.read()
            compressedstream = StringIO.StringIO(data)
            import gzip
            gzipper = gzip.GzipFile(fileobj=compressedstream)
            data = gzipper.read()
            gzipper.close()
        else:
            logger.info("[scrapertools.py] normal")
            data = handle.read()
    except urllib2.HTTPError, e:
        logger.info("error " + repr(e))
        import traceback
        traceback.print_exc()
        # on an HTTP error, return the error body the server sent
        data = e.read()
        #logger.info("data="+repr(data))

    return data
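# Usage sketch for downloadpage (hedged: URL and POST payload are
# illustrative placeholders):
#
#   html = downloadpage("http://example.com/login",
#                       post="user=foo&password=bar",
#                       follow_redirects=False,
#                       timeout=15)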
def downloadpageGzip(url):

    # Initialize the cookie library
    ficherocookies = os.path.join(config.get_data_path(), 'cookies.dat')
    logger.info("Cookiefile=" + ficherocookies)
    inicio = time.clock()

    cj = None
    ClientCookie = None
    cookielib = None

    # Let's see if cookielib is available
    try:
        import cookielib
    except ImportError:
        # If importing cookielib fails
        # let's try ClientCookie
        try:
            import ClientCookie
        except ImportError:
            # ClientCookie isn't available either
            urlopen = urllib2.urlopen
            Request = urllib2.Request
        else:
            # imported ClientCookie
            urlopen = ClientCookie.urlopen
            Request = ClientCookie.Request
            cj = ClientCookie.MozillaCookieJar()
    else:
        # importing cookielib worked
        urlopen = urllib2.urlopen
        Request = urllib2.Request
        cj = cookielib.MozillaCookieJar()
        # This is a subclass of FileCookieJar
        # that has useful load and save methods

    # ---------------------------------
    # Install the cookies
    # ---------------------------------

    if cj is not None:
        # we successfully imported
        # one of the two cookie handling modules

        if os.path.isfile(ficherocookies):
            # if we have a cookie file already saved
            # then load the cookies into the Cookie Jar
            try:
                cj.load(ficherocookies)
            except:
                logger.info("[scrapertools.py] Cookie file exists but is unreadable, deleting it")
                os.remove(ficherocookies)

        # Now we need to get our Cookie Jar
        # installed in the opener;
        # for fetching URLs
        if cookielib is not None:
            # if we use cookielib
            # then we get the HTTPCookieProcessor
            # and install the opener in urllib2
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            urllib2.install_opener(opener)
        else:
            # if we use ClientCookie
            # then we get the HTTPCookieProcessor
            # and install the opener in ClientCookie
            opener = ClientCookie.build_opener(ClientCookie.HTTPCookieProcessor(cj))
            ClientCookie.install_opener(opener)

    theurl = url
    # an example url that sets a cookie,
    # try different urls here and see the cookie collection you can make !

    parsedurl = urlparse.urlparse(url)
    logger.info("parsedurl=" + str(parsedurl))

    txheaders = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'es-es,es;q=0.8,en-us;q=0.5,en;q=0.3',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept-Encoding': 'gzip,deflate',
        'Keep-Alive': '300',
        'Connection': 'keep-alive',
        'Referer': parsedurl[0] + "://" + parsedurl[1]
    }
    logger.info(str(txheaders))

    # fake a user agent, some websites (like google) don't like automated exploration
    req = Request(theurl, None, txheaders)
    handle = urlopen(req)
    cj.save(ficherocookies)  # save the cookies again

    data = handle.read()
    handle.close()
    fin = time.clock()
    logger.info("[scrapertools.py] Downloaded 'Gzipped data' in %d seconds " % (fin - inicio + 1))

    # Decompress the gzipped data
    try:
        fin = inicio
        import StringIO
        compressedstream = StringIO.StringIO(data)
        import gzip
        gzipper = gzip.GzipFile(fileobj=compressedstream)
        data1 = gzipper.read()
        gzipper.close()
        fin = time.clock()
        logger.info("[scrapertools.py] 'Gzipped data' decompressed in %d seconds " % (fin - inicio + 1))
        return data1
    except:
        # not actually gzipped: return the raw bytes
        return data
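# Usage sketch (hedged: illustrative URL). downloadpageGzip advertises
# "Accept-Encoding: gzip,deflate", transparently decompresses the body, and
# falls back to the raw bytes if the response is not gzipped:
#
#   html = downloadpageGzip("http://example.com/compressed-page")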
#
# Author : t3rmin4t0r
# Mail   : gopalv82 -AT- yahoo.com
# Site   : http://t3.dotgnu.info/
#
import sys, os, string
import ast
import base64
import hashlib
import urllib2

import ClientCookie
from SOAPpy import WSDL, HTTPTransport, Config, SOAPAddress

# imports added for the recognizeApi client below, which uses
# ast/base64/hashlib and PIL's Image/ImageDraw
from PIL import Image, ImageDraw

Config.cookieJar = ClientCookie.MozillaCookieJar()
# Uncomment the following line if you have cookies.txt
# Config.cookieJar.load("cookies.txt")


class CookieTransport(HTTPTransport):
    def call(self, addr, data, namespace, soapaction=None, encoding=None,
             http_proxy=None, config=Config):

        if not isinstance(addr, SOAPAddress):
            addr = SOAPAddress(addr, config)

        cookie_cutter = ClientCookie.HTTPCookieProcessor(config.cookieJar)
        hh = ClientCookie.HTTPHandler()
class recognizeApi(object):
    """Class to handle requests to the recognize.im API.

    :param client_id: Your unique client ID. You can find it in the
        Account tab after logging in at recognize.im.
    :type client_id: str.
    :param api_key: Your unique API key. You can find it in the Account
        tab after logging in at recognize.im.
    :type api_key: str.
    :param clapi_key: Your unique secret client key. You can find it in
        the Account tab after logging in at recognize.im.
    :type clapi_key: str.
    """
    wsdl = "http://clapi.itraff.pl/wsdl"
    rest = "http://recognize.im/v2/recognize/"
    Config.cookieJar = ClientCookie.MozillaCookieJar()

    def __init__(self, client_id, api_key, clapi_key):
        self.client_id = client_id
        self.clapi_key = clapi_key
        self.api_key = api_key
        self._server = WSDL.Proxy(self.wsdl, transport=CookieTransport)
        result = self._server.auth(client_id, clapi_key, None)

    def convertOutput(self, soap):
        """Converts SOAPpy.Types.structType to dict.

        :param soap: The SOAP structure to convert.
        :type soap: SOAPpy.Types.structType.
        :returns: dict -- the server response converted to dict.
        """
        d = {}
        if type(soap).__name__ == 'instance' and 'item' in soap._keys():
            soap = soap[0]
        if type(soap).__name__ == 'list':
            for i in range(0, len(soap)):
                if type(soap[i]['value']).__name__ == 'instance':
                    d[soap[i]['key']] = self.convertOutput(soap[i]['value'])
                else:
                    d[soap[i]['key']] = soap[i]['value']
        elif type(soap).__name__ == 'instance':
            d[soap['key']] = soap['value']
        return d

    def imageInsert(self, image_id, image_name, path):
        """Add a new picture to your pictures list.

        :param image_id: A unique identifier of the inserted image.
        :type image_id: str.
        :param image_name: A label you want to assign to the inserted image.
        :type image_name: str.
        :param path: Path to the image file.
        :type path: str.
        :returns: dict -- the server response.
        """
        image = open(path, "rb").read()
        encoded = base64.b64encode(image)
        result = self._server.imageInsert(image_id, image_name, encoded)
        return self.convertOutput(result)

    def indexBuild(self):
        """You need to call the indexBuild method in order to apply all your
        recent changes (since the previous call of this method), including
        adding new images and deleting images.

        :returns: dict -- the server response.
        """
        result = self._server.indexBuild()
        return self.convertOutput(result)

    def callback(self, callback_url):
        """There are some situations when we might need to call one of your
        methods. For example, when we finish applying changes we may need to
        let you know that all your images are ready to be recognized.

        :param callback_url: The URL to the method you want us to call.
        :type callback_url: str.
        :returns: dict -- the server response.
        """
        result = self._server.callback(callback_url)
        return self.convertOutput(result)

    def imageDelete(self, image_id):
        """If you don't need an image to be recognizable anymore, you have to
        remove that image from the database. You can do this by calling the
        imageDelete method, passing the ID of the image you want to remove.
        You can also remove all of your images with one call of this method;
        to do so, pass a null value as the parameter.

        :param image_id: ID of the image you would like to remove (this is
            the same ID you pass as an argument to the imageInsert method).
            Pass a null value if you want to remove all of your images.
        :type image_id: str.
        :returns: dict -- the server response.
        """
        result = self._server.imageDelete(image_id)
        return self.convertOutput(result)

    def imageUpdate(self, image_id, new_image_id, new_image_name):
        """There may be some situations when you would like to change the
        name or ID of an image stored in the database. You can do this by
        calling the imageUpdate method.

        :param image_id: ID of the image whose data you would like to change
            (this is the same ID you pass as an argument to the imageInsert
            method).
        :type image_id: str.
        :param new_image_id: New ID of an image.
        :type new_image_id: str.
        :param new_image_name: New name of an image.
        :type new_image_name: str.
        :returns: dict -- the server response.
        """
        data = {"id": new_image_id, "name": new_image_name}
        result = self._server.imageUpdate(image_id, data)
        return self.convertOutput(result)

    def indexStatus(self):
        """You may be curious about the progress of applying your changes.
        To check it, call the indexStatus method.

        :returns: dict -- the server response.
        """
        result = self._server.indexStatus()
        return self.convertOutput(result)

    def userLimits(self):
        """When using our API you are limited with regard to the number of
        images and the number of scans (recognition operations). The limits
        depend on the type of account you have. To check how many more
        images you can add and how many scans you have left, use the
        userLimits method.

        :returns: dict -- the server response.
        """
        result = self._server.userLimits()
        return self.convertOutput(result)

    def imageCount(self):
        """Returns the number of images in your list.

        :returns: dict -- the server response.
        """
        result = self._server.imageCount()
        return self.convertOutput(result)

    def imageGet(self, image_id):
        """Returns detailed information about an image.

        :param image_id: ID of the image.
        :type image_id: str.
        :returns: dict -- the server response.
        """
        result = self._server.imageGet(image_id)
        return self.convertOutput(result)

    def modeGet(self):
        """Returns the recognition mode.

        :returns: dict -- the server response.
        """
        result = self._server.modeGet()
        return self.convertOutput(result)

    def modeChange(self, mode):
        """Changes the recognition mode.

        :returns: dict -- the server response.
        """
        result = self._server.modeChange(mode)
        return self.convertOutput(result)

    def recognize(self, path, getAll=False, multi=False, shelf=False):
        """Sends an image recognition request.

        :param path: Path to the image file.
        :type path: str.
        :returns: dict -- the server response.
        """
        # fetch image data
        size = os.stat(path).st_size / 1024.0  # KB
        image = Image.open(path)
        width, height = image.size
        area = width * height / 10.0**6  # Mpix

        # check image data against the per-mode limits
        if multi:
            if (size > MULTIIR_MAX_FILE_SIZE or
                    width < MULTIIR_MIN_DIMENSION or
                    height < MULTIIR_MIN_DIMENSION or
                    area < MULTIIR_MIN_IMAGE_AREA or
                    area > MULTIIR_MAX_IMAGE_AREA):
                return "Image does not meet the requirements of multi mode query image.\n"
        elif shelf:
            if (size > SHELFIR_MAX_FILE_SIZE or
                    width < SHELFIR_MIN_DIMENSION or
                    height < SHELFIR_MIN_DIMENSION or
                    area < SHELFIR_MIN_IMAGE_AREA or
                    area > SHELFIR_MAX_IMAGE_AREA):
                return "Image does not meet the requirements of shelf mode query image.\n"
        else:
            if (size > SINGLEIR_MAX_FILE_SIZE or
                    width < SINGLEIR_MIN_DIMENSION or
                    height < SINGLEIR_MIN_DIMENSION or
                    area < SINGLEIR_MIN_IMAGE_AREA or
                    area > SINGLEIR_MAX_IMAGE_AREA):
                return "Image does not meet the requirements of single mode query image.\n"

        # build the REST URL for the requested mode
        url = self.rest
        if multi:
            url += 'multi/'
        elif shelf:
            url += 'shelf/'
        else:
            url += 'single/'
        if getAll:
            url += 'all/'
        url += self.client_id

        imageData = open(path, "rb").read()

        # the request is signed with an MD5 hash of the API key plus the image data
        m = hashlib.md5()
        m.update(self.api_key)
        m.update(imageData)
        md5hash = m.hexdigest()

        headers = {'content-type': 'image/jpeg',
                   'x-itraff-hash': md5hash}
        request = urllib2.Request(url, imageData, headers)
        response = urllib2.urlopen(request)
        result = response.read()
        return ast.literal_eval(result)

    def drawFrames(self, path, result):
        """Draws frames on an image.

        :param path: Path to the image file.
        :type path: str.
        :param result: Recognition results.
        :type result: dict.
        :returns: Image -- the image with frames drawn on it.
        """
        if result['status'] == 0:
            image = Image.open(path)
            draw = ImageDraw.Draw(image)
            for obj in result['objects']:
                loc = obj['location']
                # draw the four edges of the bounding quadrilateral
                draw.line((loc[0]['x'], loc[0]['y'], loc[1]['x'], loc[1]['y']), fill=(255, 0, 0, 255), width=5)
                draw.line((loc[1]['x'], loc[1]['y'], loc[2]['x'], loc[2]['y']), fill=(255, 0, 0, 255), width=5)
                draw.line((loc[2]['x'], loc[2]['y'], loc[3]['x'], loc[3]['y']), fill=(255, 0, 0, 255), width=5)
                draw.line((loc[3]['x'], loc[3]['y'], loc[0]['x'], loc[0]['y']), fill=(255, 0, 0, 255), width=5)
            return image
        else:
            return None
def downloadpagewithcookies(url):
    # ---------------------------------
    # Install the cookies
    # ---------------------------------

    # Initialize the cookie library
    ficherocookies = os.path.join(config.get_data_path(), 'cookies.dat')
    if DEBUG:
        logger.info("[scrapertools.py] Cookiefile=" + ficherocookies)

    cj = None
    ClientCookie = None
    cookielib = None

    # Let's see if cookielib is available
    try:
        import cookielib
    except ImportError:
        # If importing cookielib fails
        # let's try ClientCookie
        try:
            import ClientCookie
        except ImportError:
            # ClientCookie isn't available either
            urlopen = urllib2.urlopen
            Request = urllib2.Request
        else:
            # imported ClientCookie
            urlopen = ClientCookie.urlopen
            Request = ClientCookie.Request
            cj = ClientCookie.MozillaCookieJar()
    else:
        # importing cookielib worked
        urlopen = urllib2.urlopen
        Request = urllib2.Request
        cj = cookielib.MozillaCookieJar()
        # This is a subclass of FileCookieJar
        # that has useful load and save methods

    if cj is not None:
        # we successfully imported
        # one of the two cookie handling modules

        if os.path.isfile(ficherocookies):
            # if we have a cookie file already saved
            # then load the cookies into the Cookie Jar
            try:
                cj.load(ficherocookies)
            except:
                if DEBUG:
                    logger.info("[scrapertools.py] Cookie file exists but is unreadable, deleting it")
                os.remove(ficherocookies)

        # Now we need to get our Cookie Jar
        # installed in the opener;
        # for fetching URLs
        if cookielib is not None:
            # if we use cookielib
            # then we get the HTTPCookieProcessor
            # and install the opener in urllib2
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            urllib2.install_opener(opener)
        else:
            # if we use ClientCookie
            # then we get the HTTPCookieProcessor
            # and install the opener in ClientCookie
            opener = ClientCookie.build_opener(ClientCookie.HTTPCookieProcessor(cj))
            ClientCookie.install_opener(opener)

    theurl = url
    # an example url that sets a cookie,
    # try different urls here and see the cookie collection you can make !

    txheaders = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Host': 'www.meristation.com',
        'Accept-Language': 'es-es,es;q=0.8,en-us;q=0.5,en;q=0.3',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Keep-Alive': '300',
        'Connection': 'keep-alive'
    }

    # fake a user agent, some websites (like google) don't like automated exploration
    req = Request(theurl, None, txheaders)
    handle = urlopen(req)
    cj.save(ficherocookies)  # save the cookies again

    data = handle.read()
    handle.close()
    return data
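# Usage sketch (hedged: illustrative URL). Note the hard-coded
# "Host: www.meristation.com" header above, so this helper is really only
# suited to that site:
#
#   html = downloadpagewithcookies("http://www.meristation.com/")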
def downloadpage(url,
                 post=None,
                 headers=[['User-Agent', 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; es-ES; rv:1.9.2.12) Gecko/20101026 Firefox/3.6.12']],
                 follow_redirects=True,
                 timeout=socket.getdefaulttimeout()):
    if DEBUG:
        logger.info("[scrapertools.py] downloadpage")
        logger.info("[scrapertools.py] url=" + url)
        if post is not None:
            logger.info("[scrapertools.py] post=" + post)
        else:
            logger.info("[scrapertools.py] post=None")

    # ---------------------------------
    # Install the cookies
    # ---------------------------------

    # Initialize the cookie library
    ficherocookies = os.path.join(config.get_setting("cookies.dir"), 'cookies.dat')
    if DEBUG:
        logger.info("[scrapertools.py] ficherocookies=" + ficherocookies)

    cj = None
    ClientCookie = None
    cookielib = None

    # Let's see if cookielib is available
    try:
        if DEBUG:
            logger.info("[scrapertools.py] Importing cookielib")
        import cookielib
    except ImportError:
        if DEBUG:
            logger.info("[scrapertools.py] cookielib not available")
        # If importing cookielib fails
        # let's try ClientCookie
        try:
            if DEBUG:
                logger.info("[scrapertools.py] Importing ClientCookie")
            import ClientCookie
        except ImportError:
            if DEBUG:
                logger.info("[scrapertools.py] ClientCookie not available")
            # ClientCookie isn't available either
            urlopen = urllib2.urlopen
            Request = urllib2.Request
        else:
            if DEBUG:
                logger.info("[scrapertools.py] ClientCookie available")
            # imported ClientCookie
            urlopen = ClientCookie.urlopen
            Request = ClientCookie.Request
            cj = ClientCookie.MozillaCookieJar()
    else:
        if DEBUG:
            logger.info("[scrapertools.py] cookielib available")
        # importing cookielib worked
        urlopen = urllib2.urlopen
        Request = urllib2.Request
        cj = cookielib.MozillaCookieJar()
        # This is a subclass of FileCookieJar
        # that has useful load and save methods

    if cj is not None:
        # we successfully imported
        # one of the two cookie handling modules
        if DEBUG:
            logger.info("[scrapertools.py] Cookies enabled")

        if os.path.isfile(ficherocookies):
            if DEBUG:
                logger.info("[scrapertools.py] Reading cookie file")
            # if we have a cookie file already saved
            # then load the cookies into the Cookie Jar
            try:
                cj.load(ficherocookies)
            except:
                if DEBUG:
                    logger.info("[scrapertools.py] Cookie file exists but is unreadable, deleting it")
                os.remove(ficherocookies)

        # Now we need to get our Cookie Jar
        # installed in the opener;
        # for fetching URLs
        if cookielib is not None:
            if DEBUG:
                logger.info("[scrapertools.py] opener using urllib2 (cookielib)")
            # if we use cookielib
            # then we get the HTTPCookieProcessor
            # and install the opener in urllib2
            if not follow_redirects:
                opener = urllib2.build_opener(
                    urllib2.HTTPHandler(debuglevel=DEBUG_LEVEL),
                    urllib2.HTTPCookieProcessor(cj), NoRedirectHandler())
            else:
                opener = urllib2.build_opener(
                    urllib2.HTTPHandler(debuglevel=DEBUG_LEVEL),
                    urllib2.HTTPCookieProcessor(cj))
            urllib2.install_opener(opener)
        else:
            if DEBUG:
                logger.info("[scrapertools.py] opener using ClientCookie")
            # if we use ClientCookie
            # then we get the HTTPCookieProcessor
            # and install the opener in ClientCookie
            opener = ClientCookie.build_opener(ClientCookie.HTTPCookieProcessor(cj))
            ClientCookie.install_opener(opener)

    # -------------------------------------------------
    # Cookies installed, launch the request
    # -------------------------------------------------

    # Timer
    inicio = time.clock()

    # Dictionary for the headers
    txheaders = {}

    # Build the request
    if post is None:
        if DEBUG:
            logger.info("[scrapertools.py] GET request")
    else:
        if DEBUG:
            logger.info("[scrapertools.py] POST request")

    # Add the headers
    if DEBUG:
        logger.info("[scrapertools.py] ---------------------------")
    for header in headers:
        if DEBUG:
            logger.info("[scrapertools.py] header %s=%s" % (str(header[0]), str(header[1])))
        txheaders[header[0]] = header[1]
    if DEBUG:
        logger.info("[scrapertools.py] ---------------------------")

    req = Request(url, post, txheaders)
    if timeout is None:
        handle = urlopen(req)
    else:
        # Available in Python 2.6 onwards --> handle = urlopen(req, timeout=timeout)
        # For all versions:
        deftimeout = socket.getdefaulttimeout()
        try:
            socket.setdefaulttimeout(timeout)
            handle = urlopen(req)
        except:
            import sys
            for line in sys.exc_info():
                logger.error("%s" % line)
            raise  # re-raise: "handle" would be undefined below
        socket.setdefaulttimeout(deftimeout)

    # Update the cookie store (only if a cookie jar could be created)
    if cj is not None:
        cj.save(ficherocookies)

    # Read the data and close
    data = handle.read()
    info = handle.info()

    if DEBUG:
        logger.info("[scrapertools.py] Response")
        logger.info("[scrapertools.py] ---------------------------")
        for header in info:
            logger.info("[scrapertools.py] " + header + "=" + info[header])
    handle.close()
    if DEBUG:
        logger.info("[scrapertools.py] ---------------------------")

    '''
    # Launch the request
    try:
        response = urllib2.urlopen(req)
    # If it fails, retry replacing special characters
    except:
        req = urllib2.Request(url.replace(" ", "%20"))
        # Add the headers
        for header in headers:
            req.add_header(header[0], header[1])
        response = urllib2.urlopen(req)
    '''

    # Elapsed time
    fin = time.clock()
    if DEBUG:
        logger.info("[scrapertools.py] Downloaded in %d seconds " % (fin - inicio + 1))
    return data
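# Usage sketch for this DEBUG-guarded downloadpage variant (hedged:
# illustrative values). With DEBUG enabled every request/response header is
# logged; when timeout is not given it falls back to the process-wide socket
# default captured in the signature:
#
#   data = downloadpage("http://example.com/feed", timeout=30)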