Beispiel #1
0
def load_body(url, encoding=None):
    """Fetch *url* and return its decoded body, or None on failure.

    The request is sent with a desktop Firefox User-Agent header. When the
    response's Content-Encoding header mentions gzip, the payload is
    decompressed before being passed, together with the response's
    Content-Type header, to ``decoder.decode``.

    Args:
        url: Address to fetch.
        encoding: Optional encoding forwarded to ``decoder.decode``.

    Returns:
        The first element of the pair returned by ``decoder.decode``, or
        ``None`` if fetching, decompressing or decoding raised an exception.
    """
    try:
        req = urllib2.Request(url)
        req.add_header('User-Agent', "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10")
        response = urllib2.urlopen(req)
        try:
            content_encoding = response.headers.get('Content-Encoding', None)
            raw = response.read()
            content_type = response.headers.get('Content-Type', None)
        finally:
            # Release the connection even when reading the response fails.
            response.close()

        if content_encoding and 'gzip' in content_encoding.lower():
            compressed = cStringIO.StringIO(raw)
            raw = gzip.GzipFile(fileobj=compressed, mode='rb').read()

        body, _ = decoder.decode(url, {'Content-Type': content_type}, raw, encoding=encoding)
        return body
    except Exception:
        # Best-effort fetch: ordinary failures yield None, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return None
    def _decode_doc(self, url, message):
        """Decode ``message["doc"]`` and return ``(decoded_doc, used_encoding)``.

        The encoding hint passed to ``decoder.decode`` is chosen as follows:

        * ``"utf-8"`` when ``message["crawl_type"]`` is ``"dynamic"``;
        * otherwise ``message["encoding"]``, provided both it and
          ``message["encoding_created_time"]`` are set and the time elapsed
          since that creation time is below
          ``self._settings["encoding_expiry_duration"]`` seconds;
        * otherwise whatever
          ``DomainDecodingCache.get_domain_decoding(message["full_domain"])``
          returns.

        Side effect: when ``message['encoding']`` is None, the encoding that
        was actually used and the current UTC time are written back into
        *message*.
        """
        hint = None
        if message["crawl_type"] == "dynamic":
            hint = "utf-8"
        elif message["encoding"] is not None and message["encoding_created_time"] is not None:
            age = datetime.datetime.utcnow() - timestamp2datetime(message["encoding_created_time"])
            max_age = datetime.timedelta(seconds=self._settings["encoding_expiry_duration"])
            if age < max_age:
                hint = message["encoding"]

        # No usable per-document encoding: ask the per-domain cache.
        if hint is None:
            hint = DomainDecodingCache.get_domain_decoding(message["full_domain"])

        headers = {'Content-Type': message["headers"].get('Content-Type', None)}
        decoded_doc, used_encoding = decoder.decode(url, headers, message["doc"], hint)

        if message['encoding'] is None:
            message['encoding'] = used_encoding
            # NOTE(review): this writes 'encoding_create_time', while the
            # expiry check above reads 'encoding_created_time' (and converts
            # it with timestamp2datetime) -- looks like a key typo; confirm
            # against the message schema before changing.
            message['encoding_create_time'] = datetime.datetime.utcnow()

        return decoded_doc, used_encoding