import cStringIO
import gzip
import urllib2

# Assumes a project-local `decoder` module exposing decode(url, headers, body, encoding).


def load_body(url, encoding=None):
    """Fetch `url` and return its decoded body, or None on any failure."""
    try:
        req = urllib2.Request(url)
        req.add_header('User-Agent',
                       "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.10) "
                       "Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10")
        response = urllib2.urlopen(req)
        # Transparently decompress gzip-encoded responses.
        ce = response.headers.get('Content-Encoding', None)
        if ce and ce.lower().find('gzip') != -1:
            body = cStringIO.StringIO(response.read())
            body = gzip.GzipFile(fileobj=body, mode='rb').read()
        else:
            body = response.read()
        # Let the decoder pick a character encoding from the Content-Type header,
        # unless the caller forced one explicitly.
        content_type = response.headers.get('Content-Type', None)
        body, _ = decoder.decode(url, {'Content-Type': content_type}, body,
                                 encoding=encoding)
        return body
    except Exception:
        # Any network or decoding error is treated as "no body".
        return None
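
# A minimal usage sketch (illustration only, not part of the original module):
# load_body swallows all errors and returns None, so callers must check for that.
if __name__ == '__main__':
    html = load_body('http://example.com/', encoding='utf-8')
    if html is None:
        print 'fetch or decode failed'
    else:
        print 'fetched %d decoded characters' % len(html)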
# Requires `import datetime` plus the project helpers timestamp2datetime,
# DomainDecodingCache and decoder referenced below.
def _decode_doc(self, url, message):
    """Decode a crawled document, reusing a cached encoding while it is still fresh."""
    if message["crawl_type"] == "dynamic":
        # Dynamically rendered pages are always stored as UTF-8.
        encoding = "utf-8"
    elif message["encoding"] is not None and message["encoding_created_time"] is not None and \
            datetime.datetime.utcnow() - timestamp2datetime(message["encoding_created_time"]) < \
            datetime.timedelta(seconds=self._settings["encoding_expiry_duration"]):
        # Reuse the previously detected encoding while it has not expired.
        encoding = message["encoding"]
    else:
        encoding = None
    if encoding is None:
        # Fall back to the encoding cached for the whole domain, if any.
        encoding = DomainDecodingCache.get_domain_decoding(message["full_domain"])
    content_type = message["headers"].get('Content-Type', None)
    decoded_doc, used_encoding = decoder.decode(url, {'Content-Type': content_type},
                                                message["doc"], encoding)
    if message['encoding'] is None:
        # Cache the detected encoding and record when it was determined.
        message['encoding'] = used_encoding
        message['encoding_created_time'] = datetime.datetime.utcnow()
    return decoded_doc, used_encoding
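
# Sketch of the message dict shape _decode_doc expects, inferred from the keys it
# reads above (the values are illustrative assumptions, not real crawl data):
example_message = {
    "crawl_type": "static",                # "dynamic" forces utf-8
    "encoding": None,                       # filled in after the first successful decode
    "encoding_created_time": None,          # when the cached encoding was detected
    "headers": {"Content-Type": "text/html; charset=gb2312"},
    "doc": "<html>...</html>",             # raw, still-undecoded body
    "full_domain": "example.com",          # key into DomainDecodingCache
}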