Code example #1
    def __init__(self, extractor='DefaultExtractor', **kwargs):
        if kwargs.get('url'):
            request     = urllib.request.Request(kwargs['url'], headers=self.headers)
            connection  = urllib.request.urlopen(request)
            self.data   = connection.read()
            encoding    = connection.headers['content-type'].lower().split('charset=')[-1]
            if encoding.lower() == 'text/html':
                encoding = charade.detect(self.data)['encoding']
            self.data = str(self.data, encoding)
        elif kwargs.get('html'):
            self.data = kwargs['html']
            if not isinstance(self.data, str):
                self.data = str(self.data, charade.detect(self.data)['encoding'])
        else:
            raise Exception('No text or url provided')

        try:
            # make it thread-safe
            if threading.activeCount() > 1:
                if jpype.isThreadAttachedToJVM() == False:
                    jpype.attachThreadToJVM()
            lock.acquire()
            
            self.extractor = jpype.JClass(
                "de.l3s.boilerpipe.extractors."+extractor).INSTANCE
        finally:
            lock.release()
    
        reader = StringReader(self.data)
        self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
        self.extractor.process(self.source)
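
The constructor above, and the variants in the examples that follow, all share the same decoding strategy: take the charset from the HTTP Content-Type header and, when the header carries no charset (the split on 'charset=' then just returns 'text/html'), fall back to charade.detect on the raw bytes. A minimal standalone sketch of that pattern, assuming only the requests and charade packages; fetch_and_decode is an illustrative name, not part of any project listed here:

import charade
import requests

def fetch_and_decode(url):
    # Illustrative sketch: charset from the Content-Type header, charade as the fallback.
    response = requests.get(url, timeout=10)
    raw = response.content
    # 'text/html; charset=utf-8' -> 'utf-8'; with no charset the split yields 'text/html'
    encoding = response.headers.get('content-type', 'text/html').lower().split('charset=')[-1]
    if encoding == 'text/html':
        encoding = charade.detect(raw)['encoding'] or 'utf-8'
    return raw.decode(encoding, errors='replace')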
Code example #2
    def __init__(self, extractor='DefaultExtractor', **kwargs):
        if kwargs.get('url'):
            request = urllib2.Request(kwargs['url'], headers=self.headers)
            connection = urllib2.urlopen(request, timeout=10)
            self.data = connection.read()
            encoding = connection.headers['content-type'].lower().split(
                'charset=')[-1]
            if encoding.lower() == 'text/html':
                encoding = charade.detect(self.data)['encoding']
            if encoding is None:
                encoding = 'utf-8'
            self.data = str(self.data, encoding, errors='ignore')
        elif kwargs.get('html'):
            self.data = kwargs['html']
            if not isinstance(self.data, str):
                self.data = str(self.data,
                                charade.detect(self.data)['encoding'])
        else:
            raise Exception('No text or url provided')

        try:
            # make it thread-safe
            if threading.activeCount() > 1:
                if jpype.isThreadAttachedToJVM() == False:
                    jpype.attachThreadToJVM()
            lock.acquire()

            self.extractor = jpype.JClass("de.l3s.boilerpipe.extractors." +
                                          extractor).INSTANCE
        finally:
            lock.release()

        reader = StringReader(self.data)
        self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
        self.extractor.process(self.source)
Code example #3
    def __init__(self, extractor='DefaultExtractor', **kwargs):
        if kwargs.get('url'):

            request     = urllib2.Request(kwargs['url'], headers=self.headers)

            # Version without headers
            # request     = urllib2.Request(kwargs['url'])

            connection  = urllib2.urlopen(request)

            self.data   = connection.read()

            encoding    = connection.headers['content-type'].lower().split('charset=')[-1]

            # Try requests
            # request     = requests.get(kwargs['url'], headers=self.headers, verify=False)

            # self.data   = request.text
            # encoding    = request.headers['content-type'].lower().split('charset=')[-1]

            if encoding.lower() == 'text/html':
                encoding = charade.detect(self.data)['encoding']

                try:

                    self.data = unicode(self.data, encoding, errors='replace')

                except LookupError as e:

                    print e
                    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT

        elif kwargs.get('html'):
            self.data = kwargs['html']

            if not isinstance(self.data, unicode):
                self.data = unicode(self.data, charade.detect(self.data)['encoding'], errors='replace')
                import ipdb; ipdb.set_trace()  # XXX BREAKPOINT

        else:
            raise Exception('No text or url provided')

        try:
            # make it thread-safe
            if threading.activeCount() > 1:
                if jpype.isThreadAttachedToJVM() == False:
                    jpype.attachThreadToJVM()
            lock.acquire()

            self.extractor = jpype.JClass(
                "de.l3s.boilerpipe.extractors."+extractor).INSTANCE
        finally:
            lock.release()

        reader = StringReader(self.data)
        self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
        self.extractor.process(self.source)
Code example #4
    def __init__(self, extractor='DefaultExtractor', **kwargs):
        if kwargs.get('url'):
            # Correctly encode url
            url = unicode(kwargs['url'])
            if re_rus.search(url):
                url = re_http.sub("", url)
                url = re_slash.sub("", url)
                url = url.encode("idna")
                url = "http://" + url

            # Set header
            h = {'User-Agent': self.headers[0], 'Accept': '*/*'}

            # Download the page
            request = urllib2.Request(url, headers=h)
            connection = urllib2.urlopen(request)
            self.data = connection.read()
            encoding = connection.headers['content-type'].lower().split(
                'charset=')[-1]

            # Decode the page contents in the correct encoding
            if self.data is None:
                raise Exception('Html data cannot be extracted.')
            if encoding.lower() == 'text/html':
                encoding = charade.detect(self.data)['encoding']
            old = encoding
            encoding = re_enc_error.sub("", encoding)
            encoding = re_enc_error2.sub("", encoding)
            encoding = re_enc_win.sub("windows-1251", encoding)
            if re_enc_def.search(encoding): encoding = DEFAULT_ENCODING
            self.data = unicode(self.data, encoding, "ignore")
            connection.close()

        elif kwargs.get('html'):
            self.data = kwargs['html']
            if not isinstance(self.data, unicode):
                self.data = unicode(self.data,
                                    charade.detect(self.data)['encoding'])
        else:
            raise Exception('No text or url provided')

        try:
            # make it thread-safe
            if threading.activeCount() > 1:
                if jpype.isThreadAttachedToJVM() == False:
                    jpype.attachThreadToJVM()
            lock.acquire()

            self.extractor = jpype.JClass("de.l3s.boilerpipe.extractors." +
                                          extractor).INSTANCE
        finally:
            lock.release()

        reader = StringReader(self.data)
        self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
        self.extractor.process(self.source)
Code example #5
    def __init__(self, extractor='DefaultExtractor', **kwargs):

        if kwargs.get('logger'):
            self.logger = kwargs['logger']
        else:
            self.logger = None

        if kwargs.get('url'):
            request = urllib2.Request(kwargs['url'], headers=self.headers)
            try:
                connection = urllib2.urlopen(request)
            except:
                connection = None
                if self.logger is not None:
                    self.logger.exception(
                        'boilerpipe extractor failed on urlopen() for uri %s' %
                        kwargs['url'])

            if connection is not None:
                self.data = connection.read()
                encoding = connection.headers['content-type'].lower().split(
                    'charset=')[-1]
                if encoding.lower() == 'text/html':
                    encoding = charade.detect(self.data)['encoding']
                self.data = unicode(self.data, encoding)
            else:
                if self.logger is not None:
                    self.logger.debug(
                        'boilerpipe execution continues with empty document')
                self.data = u''

        elif kwargs.get('html'):
            self.data = kwargs['html']
            if not isinstance(self.data, unicode):
                self.data = unicode(self.data,
                                    charade.detect(self.data)['encoding'])
        else:
            raise Exception('No text or url provided')

        try:
            # make it thread-safe
            if threading.activeCount() > 1:
                if jpype.isThreadAttachedToJVM() == False:
                    jpype.attachThreadToJVM()
            lock.acquire()

            self.extractor = jpype.JClass("de.l3s.boilerpipe.extractors." +
                                          extractor).INSTANCE
        finally:
            lock.release()

        reader = StringReader(self.data)
        self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
        self.extractor.process(self.source)
Code example #6
    def __init__(self, extractor='DefaultExtractor', **kwargs):
        if kwargs.get('url'):
            # Correctly encode url  
            url = unicode(kwargs['url'])
            if re_rus.search(url):
                url = re_http.sub("", url)
                url = re_slash.sub("", url)
                url = url.encode("idna")
                url = "http://" + url

            # Set header 
            h = {'User-Agent':self.headers[0], 'Accept':'*/*'}
            
            # Download the page
            request     = urllib2.Request(url, headers=h)
            connection  = urllib2.urlopen(request)
            self.data   = connection.read()
            encoding    = connection.headers['content-type'].lower().split('charset=')[-1]

            # Decode the page contents in the correct encoding
            if self.data is None:
                raise Exception('Html data cannot be extracted.')
            if encoding.lower() == 'text/html':
                encoding = charade.detect(self.data)['encoding']
            old = encoding
            encoding = re_enc_error.sub("", encoding)
            encoding = re_enc_error2.sub("", encoding)
            encoding = re_enc_win.sub("windows-1251", encoding)
            if re_enc_def.search(encoding): encoding = DEFAULT_ENCODING
            self.data = unicode(self.data, encoding, "ignore")
            connection.close()

        elif kwargs.get('html'):
            self.data = kwargs['html']
            if not isinstance(self.data, unicode):
                self.data = unicode(self.data, charade.detect(self.data)['encoding'])
        else:
            raise Exception('No text or url provided')

        try:
            # make it thread-safe
            if threading.activeCount() > 1:
                if jpype.isThreadAttachedToJVM() == False:
                    jpype.attachThreadToJVM()
            lock.acquire()
            
            self.extractor = jpype.JClass(
                "de.l3s.boilerpipe.extractors."+extractor).INSTANCE
        finally:
            lock.release()
    
        reader = StringReader(self.data)
        self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
        self.extractor.process(self.source)
Code example #7
    def __init__(self, extractor='DefaultExtractor', **kwargs):

        if kwargs.get('logger'):
            self.logger = kwargs['logger']
        else:
            self.logger = None

        if kwargs.get('url'):
            request     = urllib2.Request(kwargs['url'], headers=self.headers)
            try:
                connection  = urllib2.urlopen(request)
            except:
                connection = None
                if self.logger is not None:
                    self.logger.exception( 'boilerpipe extractor failed on urlopen() for uri %s' % kwargs['url'] )

            if connection is not None:
                self.data   = connection.read()
                encoding    = connection.headers['content-type'].lower().split('charset=')[-1]
                if encoding.lower() == 'text/html':
                    encoding = charade.detect(self.data)['encoding']
                self.data = unicode(self.data, encoding)
            else:
                if self.logger is not None:
                    self.logger.debug('boilerpipe execution continues with empty document')
                self.data = u''

        elif kwargs.get('html'):
            self.data = kwargs['html']
            if not isinstance(self.data, unicode):
                self.data = unicode(self.data, charade.detect(self.data)['encoding'])
        else:
            raise Exception('No text or url provided')

        try:
            # make it thread-safe
            if threading.activeCount() > 1:
                if jpype.isThreadAttachedToJVM() == False:
                    jpype.attachThreadToJVM()
            lock.acquire()
            
            self.extractor = jpype.JClass(
                "de.l3s.boilerpipe.extractors."+extractor).INSTANCE
        finally:
            lock.release()
    
        reader = StringReader(self.data)
        self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
        self.extractor.process(self.source)
Code example #8
File: document.py Project: jelmer/breadability
def decode_html(html):
    """
    Converts bytes stream containing an HTML page into Unicode.
    Tries to guess character encoding from meta tag of by "charade" library.
    """
    if isinstance(html, unicode):
        return html

    match = CHARSET_META_TAG_PATTERN.search(html)
    if match:
        declared_encoding = match.group(1).decode("ASCII")
        # treat an unknown encoding as if it wasn't declared at all
        with ignored(LookupError):
            return html.decode(declared_encoding, "ignore")

    # try to enforce UTF-8 firstly
    with ignored(UnicodeDecodeError):
        return html.decode("utf8")

    text = TAG_MARK_PATTERN.sub(to_bytes(" "), html)
    diff = text.decode("utf8", "ignore").encode("utf8")
    sizes = len(diff), len(text)

    # 99% of text is UTF-8
    if abs(len(text) - len(diff)) < max(sizes) * 0.01:
        return html.decode("utf8", "ignore")

    # try detect encoding
    encoding = "utf8"
    encoding_detector = charade.detect(text)
    if encoding_detector["encoding"]:
        encoding = encoding_detector["encoding"]

    return html.decode(encoding, "ignore")
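
The ratio test in the middle of decode_html is the part worth noting: tags are stripped, the remaining bytes are decoded as UTF-8 with errors ignored and re-encoded, and if that lossy round trip drops less than 1% of the bytes the page is treated as UTF-8 without calling the detector at all. A self-contained sketch of just that heuristic (Python 3 literals, the project's TAG_MARK_PATTERN/to_bytes helpers left out; looks_like_utf8 is an illustrative name):

def looks_like_utf8(data, tolerance=0.01):
    # True when a lossy UTF-8 round trip loses at most `tolerance` of the bytes.
    recoded = data.decode("utf8", "ignore").encode("utf8")
    sizes = len(recoded), len(data)
    return abs(len(data) - len(recoded)) < max(sizes) * tolerance

# looks_like_utf8('héllo wörld'.encode('utf-8'))  -> True  (nothing is lost)
# looks_like_utf8('héllo'.encode('latin-1'))      -> False (the 0xE9 byte is dropped)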
Code example #9
def decode(content, language):
    """Decode subtitle `content` in a specified `language`

    :param bytes content: content of the subtitle
    :param language: language of the subtitle
    :type language: :class:`babelfish.Language`
    :return: the decoded `content`
    :rtype: string

    """
    # always try utf-8 first
    encodings = ['utf-8']

    # add language-specific encodings
    if language.alpha3 == 'zho':
        encodings.extend(['gb18030', 'big5'])
    elif language.alpha3 == 'jpn':
        encodings.append('shift-jis')
    elif language.alpha3 == 'ara':
        encodings.append('windows-1256')
    elif language.alpha3 == 'heb':
        encodings.append('windows-1255')
    else:
        encodings.append('latin-1')

    # try to decode
    for encoding in encodings:
        try:
            return content.decode(encoding)
        except UnicodeDecodeError:
            pass

    # fallback on charade
    logger.warning('Could not decode content with encodings %r', encodings)
    return content.decode(charade.detect(content)['encoding'], 'replace')
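
A short usage sketch for decode above, assuming the babelfish package referenced in the docstring is installed; the file name is hypothetical:

from babelfish import Language

with open('episode.srt', 'rb') as f:   # hypothetical subtitle file
    raw = f.read()
text = decode(raw, Language('jpn'))    # tries utf-8, then shift-jis, then the charade fallback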
Code example #10
File: subtitle.py Project: doron1/subliminal
def decode(content, language):
    """Decode subtitle `content` in a specified `language`

    :param bytes content: content of the subtitle
    :param language: language of the subtitle
    :type language: :class:`babelfish.Language`
    :return: the decoded `content`
    :rtype: string

    """
    # always try utf-8 first
    encodings = ['utf-8']

    # add language-specific encodings
    if language.alpha3 == 'zho':
        encodings.extend(['gb18030', 'big5'])
    elif language.alpha3 == 'jpn':
        encodings.append('shift-jis')
    elif language.alpha3 == 'ara':
        encodings.append('windows-1256')
    else:
        encodings.append('latin-1')

    # try to decode
    for encoding in encodings:
        try:
            return content.decode(encoding)
        except UnicodeDecodeError:
            pass

    # fallback on charade
    logger.warning('Could not decode content with encodings %r', encodings)
    return content.decode(charade.detect(content)['encoding'], 'replace')
Code example #11
File: podnapisi.py Project: vipere/nzbget-subliminal
    def download_subtitle(self, subtitle):
        soup = self.get(subtitle.link, is_xml=False)
        pre_link = soup.find('a', href=self.pre_link_re)
        if not pre_link:
            raise ProviderError('Cannot find the pre-download link')
        pre_link = self.server + \
            self.pre_link_re.match(pre_link['href']).group('link')

        # Continue following the link
        soup = self.get(
            pre_link,
            headers={
                'Referer': self.server,
            },
            is_xml=False,
        )

        link = soup.find('a', href=self.link_re)
        if not link:
            raise ProviderError('Cannot find the download link')
        try:
            r = self.session.get(self.server + self.link_re.match(link['href']).group('link'), timeout=10)
        except requests.Timeout:
            raise ProviderNotAvailable('Timeout after 10 seconds')
        if r.status_code != 200:
            raise ProviderNotAvailable('Request failed with status code %d' % r.status_code)
        with contextlib.closing(zipfile.ZipFile(io.BytesIO(r.content))) as zf:
            if len(zf.namelist()) > 1:
                raise ProviderError('More than one file to unzip')
            subtitle_bytes = zf.read(zf.namelist()[0])
        subtitle_text = subtitle_bytes.decode(charade.detect(subtitle_bytes)['encoding'], 'replace')
        if not is_valid_subtitle(subtitle_text):
            raise InvalidSubtitle
        return subtitle_text
Code example #12
File: __init__.py Project: rshiva/python-boilerpipe
    def __init__(self, extractor='DefaultExtractor', **kwargs):
        if kwargs.get('url'):
            response = requests.request('GET', kwargs['url'], headers=self.headers)
            self.data = response.text
        elif kwargs.get('html'):
            self.data = kwargs['html']
            if not isinstance(self.data, unicode):
                self.data = unicode(self.data, charade.detect(self.data)['encoding'])
        else:
            raise Exception('No text or url provided')

        try:
            # make it thread-safe
            if threading.activeCount() > 1:
                if jpype.isThreadAttachedToJVM() == False:
                    jpype.attachThreadToJVM()
            lock.acquire()
            
            self.extractor = jpype.JClass(
                "de.l3s.boilerpipe.extractors."+extractor).INSTANCE
        finally:
            lock.release()
    
        reader = StringReader(self.data)
        self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
        self.extractor.process(self.source)
Code example #13
File: downloadqueue.py Project: qitta/libhugin
    def _bytes_to_unicode(self, byte_data):
        """
        Decode an HTTP byte response to unicode.

        Tries to decode the bytestream with the encoding guessed by charade.
        If that fails, decoding is retried as UTF-8, and finally the data is
        parsed with BeautifulSoup as a last resort.

        :param byte_data: A bytestream.
        :returns: A unicode string

        """

        try:
            encoding = charade.detect(byte_data).get('encoding')
            return byte_data.decode(encoding)
        except (TypeError, AttributeError, UnicodeError) as e:
            print('Error decoding bytes with charade.', e)

        try:
            return byte_data.decode('utf-8')
        except (TypeError, AttributeError, UnicodeError) as e:
            print('Error decoding bytes to utf-8.', e)

        try:
            return str(BeautifulSoup(byte_data))
        except Exception as e:
            print('Exception in downloadqueue while trying to encode with BeautifulSoup:', e)
Code example #14
File: thesubdb.py Project: peterlindeman/subliminal
 def download_subtitle(self, subtitle):
     params = {"action": "download", "hash": subtitle.hash, "language": subtitle.language.alpha2}
     r = self.get(params)
     if r.status_code != 200:
         raise ProviderError("Request failed with status code %d" % r.status_code)
     subtitle_text = r.content.decode(charade.detect(r.content)["encoding"], "replace")
     if not is_valid_subtitle(subtitle_text):
         raise InvalidSubtitle
     subtitle.content = subtitle_text
Code example #15
def detect(s):
	'''
	>>> detect('ascii')
	{'confidence': 1.0, 'encoding': 'ascii'}
	>>> detect('abcdé')
	{'confidence': 0.505, 'encoding': 'utf-8'}
	>>> detect(bytes('abcdé', 'utf-8'))
	{'confidence': 0.505, 'encoding': 'utf-8'}
	>>> detect(bytes('\222\222\223\225', 'latin-1'))
	{'confidence': 0.5, 'encoding': 'windows-1252'}
	'''
	try:
		if isinstance(s, str):
			return charade.detect(s.encode())
		else:
			return charade.detect(s)
	except UnicodeDecodeError:
		return charade.detect(s.encode('utf-8'))
Code example #16
File: thesubdb.py Project: alexbabintsev/subliminal
 def download_subtitle(self, subtitle):
     params = {'action': 'download', 'hash': subtitle.hash, 'language': subtitle.language.alpha2}
     r = self.get(params)
     if r.status_code != 200:
         raise ProviderError('Request failed with status code %d' % r.status_code)
     subtitle_text = r.content.decode(charade.detect(r.content)['encoding'])
     if not is_valid_subtitle(subtitle_text):
         raise InvalidSubtitle
     return subtitle_text
Code example #17
def bp_extract(url):
    request = urllib2.Request(url, headers=headers)
    connection = urllib2.urlopen(request)
    data = connection.read()
    encoding = connection.headers['content-type'].lower().split('charset=')[-1]
    encoding = charade.detect(data)['encoding']

    extr = Extractor(extractor='ArticleExtractor', url=url)
    return extr.getText().encode(encoding).decode('iso-8859-15')
Code example #18
def trans_url_to_utf8(url):

    debug('trans url to utf-8 encoding')
    if isinstance(url, unicode):
        url = url.encode('utf-8')
    encoding = charade.detect(url)['encoding']
    if encoding.lower() in ('gb2312', 'gbk'):
        encoding = 'gb18030'
    url = url.decode(encoding).encode('utf-8')
    return url
Code example #19
File: cleaners.py Project: Karmak23/soup-strainer
def set_encoding(soup):

    try:
        enc = charade.detect(soup.get_text())['encoding']

    except:
        log.warning('Could not detect encoding, good luck!')
        return soup

    return soup.encode(enc)
Code example #20
def trans_url_to_utf8(url):

    debug('trans url to utf-8 encoding')
    if isinstance(url, unicode):
        url = url.encode('utf-8')
    encoding = charade.detect(url)['encoding']
    if encoding.lower() in ('gb2312', 'gbk'):
        encoding = 'gb18030'
    url = url.decode(encoding).encode('utf-8')
    return url
Code example #21
File: srtfile.py Project: GbalsaC/bitnamiP
    def _detect_encoding(cls, path):
        sample = open(path).read(1024)

        for bom, encoding in BOMS:
            if sample.startswith(bom):
                return encoding

        report = charade.detect(sample)
        encoding = report.get('encoding')
        if not encoding:
            return cls.DEFAULT_ENCODING
        return cls._normalize_encoding(encoding)
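
The BOMS table used above is not included in the snippet. Purely as an illustration of the shape the code expects, an ordered sequence of (BOM bytes, encoding name) pairs with the longer UTF-32 marks listed before UTF-16 so the prefix match picks the right one, it might look like the following; the actual constant in the project may differ:

import codecs

BOMS = (
    (codecs.BOM_UTF32_BE, 'utf_32_be'),
    (codecs.BOM_UTF32_LE, 'utf_32_le'),
    (codecs.BOM_UTF8, 'utf_8_sig'),
    (codecs.BOM_UTF16_BE, 'utf_16_be'),
    (codecs.BOM_UTF16_LE, 'utf_16_le'),
)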
Code example #22
File: addic7ed.py Project: alexbabintsev/subliminal
 def download_subtitle(self, subtitle):
     try:
         r = self.session.get(self.server + subtitle.download_link, timeout=10,
                              headers={'Referer': self.server + subtitle.referer})
     except requests.Timeout:
         raise ProviderNotAvailable('Timeout after 10 seconds')
     if r.status_code != 200:
         raise ProviderNotAvailable('Request failed with status code %d' % r.status_code)
     subtitle_text = r.content.decode(charade.detect(r.content)['encoding'])
     if not is_valid_subtitle(subtitle_text):
         raise InvalidSubtitle
     return subtitle_text
Code example #23
File: agl.py Project: longzhaoai/autoxd
    def serial(result, fname="temp.bin"):
        if charade.detect(fname)['encoding'] == 'utf-8':
            fname = convert(fname)

        root_dir = os.path.dirname(__file__)
        fname = root_dir + "\\" + fname
        f = open(fname, "wb")
        p = cPickle.Pickler(f)
        p.clear_memo()
        p.fast = True
        p.dump(result)
        f.close()
Code example #24
    def _detect_encoding(cls, path):
        sample = open(path).read(1024)

        for bom, encoding in BOMS:
            if sample.startswith(bom):
                return encoding

        report = charade.detect(sample)
        encoding = report.get('encoding')
        if not encoding:
            return cls.DEFAULT_ENCODING
        return cls._normalize_encoding(encoding)
Code example #25
File: __init__.py Project: Caimany/python-boilerpipe
    def __init__(self, extractor='DefaultExtractor', **kwargs):
        if kwargs.get('url'):
            request     = urllib2.Request(kwargs['url'], headers=self.headers)
            connection  = urllib2.urlopen(request)
            self.data   = connection.read()
            encoding    = connection.headers['content-type'].lower().split('charset=')[-1]
            if encoding.lower() == 'text/html':
                encoding = charade.detect(self.data)['encoding']
            # self.data = unicode(self.data, 'gbk')
            #self.data = self.data.decode(encoding, 'ignore')
            try:
                self.data = unicode(self.data, charade.detect(self.data)['encoding'])
            except UnicodeError:
                encoding = charade.detect(self.data)['encoding']
                self.data = self.data.decode(encoding, 'ignore')
        elif kwargs.get('html'):
            self.data = kwargs['html']
            if not isinstance(self.data, unicode):
                try:
                    self.data = unicode(self.data, 'gbk')
                #self.data = unicode(self.data, charade.detect(self.data)['encoding'])
                #try:
                #    self.data = unicode(self.data, charade.detect(self.data)['encoding'])
                except UnicodeError:
                    encoding = charade.detect(self.data)['encoding']
                    print "charset is :", encoding
                    self.data = self.data.decode(encoding, 'ignore')
        ## Extractor(extractor='ArticleExtractor',file='/tmp/a.html')
        elif kwargs.get('file'):
            Path = kwargs['file']
            f = open(Path, 'r')
            self.data = f.read()
            f.close()
            if not isinstance(self.data, unicode):
                try:
                    self.data = unicode(self.data, charade.detect(self.data)['encoding'])
                except UnicodeError:
                    encoding = charade.detect(self.data)['encoding']
                    self.data = self.data.decode(encoding, 'ignore')

        else:
            raise Exception('No text or url provided')

        try:
            # make it thread-safe
            if threading.activeCount() > 1:
                if jpype.isThreadAttachedToJVM() == False:
                    jpype.attachThreadToJVM()
            lock.acquire()

            self.extractor = jpype.JClass(
                "de.l3s.boilerpipe.extractors."+extractor).INSTANCE
        finally:
            lock.release()

        reader = StringReader(self.data)
        self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
        self.extractor.process(self.source)
Code example #26
File: helpers.py Project: rahulroxx/python-emails
def guess_text_charset(text, is_html=False):
    if is_html:
        rules = isinstance(text, bytes) and RULES_B or RULES_U
        for meta in rules.re_meta.findall(text):
            if rules.re_is_http_equiv.findall(meta):
                for content in rules.re_parse_http_equiv.findall(meta):
                    for charset in rules.re_charset.findall(content):
                        return to_native(charset)
            else:
                for charset in rules.re_charset.findall(meta):
                    return to_native(charset)
    # guess by chardet
    if isinstance(text, bytes):
        return to_native(charade.detect(text)['encoding'])
Code example #27
 def download_subtitle(self, subtitle):
     try:
         response = self.server.DownloadSubtitles(self.token, [subtitle.id])
     except xmlrpclib.ProtocolError:
         raise ProviderNotAvailable
     if response['status'] != '200 OK':
         raise ProviderError('Download failed with status %r' % response['status'])
     if not response['data']:
         raise ProviderError('Nothing to download')
     subtitle_bytes = zlib.decompress(base64.b64decode(response['data'][0]['data']), 47)
     subtitle_text = subtitle_bytes.decode(charade.detect(subtitle_bytes)['encoding'])
     if not is_valid_subtitle(subtitle_text):
         raise InvalidSubtitle
     return subtitle_text
Code example #28
File: helpers.py Project: hubaimaster/aws-interface
def guess_text_charset(text, is_html=False):
    if is_html:
        rules = isinstance(text, bytes) and RULES_B or RULES_U
        for meta in rules.re_meta.findall(text):
            if rules.re_is_http_equiv.findall(meta):
                for content in rules.re_parse_http_equiv.findall(meta):
                    for charset in rules.re_charset.findall(content):
                        return to_native(charset)
            else:
                for charset in rules.re_charset.findall(meta):
                    return to_native(charset)
    # guess by chardet
    if isinstance(text, bytes):
        return to_native(chardet.detect(text)['encoding'])
Code example #29
File: addic7ed.py Project: mindw/subliminal
 def download_subtitle(self, subtitle):
     try:
         r = self.session.get(self.server + subtitle.download_link, timeout=10,
                              headers={'Referer': self.server + subtitle.referer})
     except requests.Timeout:
         raise ProviderNotAvailable('Timeout after 10 seconds')
     if r.status_code != 200:
         raise ProviderNotAvailable('Request failed with status code %d' % r.status_code)
     if r.headers['Content-Type'] == 'text/html':
         raise ProviderNotAvailable('Download limit exceeded')
     subtitle_text = r.content.decode(charade.detect(r.content)['encoding'], 'replace')
     if not is_valid_subtitle(subtitle_text):
         raise InvalidSubtitle
     subtitle.content = subtitle_text
Code example #30
 def download_subtitle(self, subtitle):
     params = {
         'action': 'download',
         'hash': subtitle.hash,
         'language': subtitle.language.alpha2
     }
     r = self.get(params)
     if r.status_code != 200:
         raise ProviderError('Request failed with status code %d' %
                             r.status_code)
     subtitle_text = r.content.decode(charade.detect(r.content)['encoding'])
     if not is_valid_subtitle(subtitle_text):
         raise InvalidSubtitle
     return subtitle_text
Code example #31
File: mediafile.py Project: imenem/beets
    def _convert_encoding(self, str):
        try:
            encoded = str.encode("iso-8859-1")
        except (UnicodeEncodeError, # Encoding is determined correctly by Mutagen
                AttributeError):    # Object has no method encode()
            return str

        charset = charade.detect(encoded)

        try:
            return encoded.decode(charset['encoding'])
        except (TypeError,          # Charade can not determines encoding
                UnicodeDecodeError, # Encoding is determined incorrectly by Charade
                LookupError):       # Encoding determined by Charade is not found
            return str
Code example #32
 def download_subtitle(self, subtitle):
     try:
         r = self.session.get(self.server + '/download-{subtitle_id}.html'.format(subtitle_id=subtitle.id),
                              timeout=10)
     except requests.Timeout:
         raise ProviderNotAvailable('Timeout after 10 seconds')
     if r.status_code != 200:
         raise ProviderNotAvailable('Request failed with status code %d' % r.status_code)
     with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
         if len(zf.namelist()) > 1:
             raise ProviderError('More than one file to unzip')
         subtitle_bytes = zf.read(zf.namelist()[0])
     subtitle_text = subtitle_bytes.decode(charade.detect(subtitle_bytes)['encoding'])
     if not is_valid_subtitle(subtitle_text):
         raise InvalidSubtitle
     return subtitle_text
Code example #33
File: tvsubtitles.py Project: mindw/subliminal
 def download_subtitle(self, subtitle):
     try:
         r = self.session.get(self.server + '/download-{subtitle_id}.html'.format(subtitle_id=subtitle.id),
                              timeout=10)
     except requests.Timeout:
         raise ProviderNotAvailable('Timeout after 10 seconds')
     if r.status_code != 200:
         raise ProviderNotAvailable('Request failed with status code %d' % r.status_code)
     with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
         if len(zf.namelist()) > 1:
             raise ProviderError('More than one file to unzip')
         subtitle_bytes = zf.read(zf.namelist()[0])
     subtitle_text = subtitle_bytes.decode(charade.detect(subtitle_bytes)['encoding'], 'replace')
     if not is_valid_subtitle(subtitle_text):
         raise InvalidSubtitle
     subtitle.content = subtitle_text
Code example #34
File: __init__.py Project: EliotBerriot/1flow
def detect_encoding_from_requests_response(response):
    """ :param:`response` beiing a :module:`requests` response, this function
        will try to detect the encoding as much as possible. Fist, the "normal"
        response encoding will be tried, else the headers will be parsed, and
        finally the ``<head>`` of the ``<html>`` content will be parsed. If
        nothing succeeds, we will rely on :module:`charade` to guess from the
        content.

        .. todo:: we have to check if content-type is HTML before parsing the
            headers. For now you should use this function only on responses
            which you are sure they will contain HTML.
    """

    if getattr(response, 'encoding', None):
        return response.encoding

    # In case the headers don't contain an content-type, we get()
    # 'text/html' as a fallback value, which will trigger the same
    # behaviour as having a content-type header with no charset value.
    encoding = response.headers.get(
        'content-type', 'text/html').lower().split('charset=')[-1]

    if encoding.lower() == 'text/html':
        # HTTP headers don't contain any encoding.
        # Search in page head, then try to detect from data.

        html_content = BeautifulSoup(response.content, 'lxml')

        for meta_header in html_content.head.findAll('meta'):
            for attribute, value in meta_header.attrs.items():
                if attribute.lower() == 'http-equiv':
                    if value.lower() == 'content-type':
                        content  = meta_header.attrs.get('content')
                        encoding = content.lower().split('charset=')[-1]
                        break

        if encoding.lower() == 'text/html':
            # If we couldn't find an encoding in the HTML <head>,
            # try to detect it manually with charade. This can
            # possibly fail, too… In this case, OMG… We are alone.
            try:
                return charade.detect(response.content)['encoding']

            except:
                LOGGER.critical('Could not detect encoding of %s', response)

    return None
Code example #35
 def download_subtitle(self, subtitle):
     try:
         response = self.server.DownloadSubtitles(self.token, [subtitle.id])
     except xmlrpclib.ProtocolError:
         raise ProviderNotAvailable
     if response['status'] != '200 OK':
         raise ProviderError('Download failed with status %r' %
                             response['status'])
     if not response['data']:
         raise ProviderError('Nothing to download')
     subtitle_bytes = zlib.decompress(
         base64.b64decode(response['data'][0]['data']), 47)
     subtitle_text = subtitle_bytes.decode(
         charade.detect(subtitle_bytes)['encoding'], 'replace')
     if not is_valid_subtitle(subtitle_text):
         raise InvalidSubtitle
     return subtitle_text
Code example #36
File: podnapisi.py Project: mindw/subliminal
 def download_subtitle(self, subtitle):
     soup = self.get(subtitle.link, is_xml=False)
     link = soup.find('a', href=self.link_re)
     if not link:
         raise ProviderError('Cannot find the download link')
     try:
         r = self.session.get(self.server + self.link_re.match(link['href']).group('link'), timeout=10)
     except requests.Timeout:
         raise ProviderNotAvailable('Timeout after 10 seconds')
     if r.status_code != 200:
         raise ProviderNotAvailable('Request failed with status code %d' % r.status_code)
     with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
         if len(zf.namelist()) > 1:
             raise ProviderError('More than one file to unzip')
         subtitle_bytes = zf.read(zf.namelist()[0])
     subtitle_text = subtitle_bytes.decode(charade.detect(subtitle_bytes)['encoding'], 'replace')
     if not is_valid_subtitle(subtitle_text):
         raise InvalidSubtitle
     subtitle.content = subtitle_text
Code example #37
    def __init__(self, data_path, verbose=False):
        # data_path is the path to TAC_2014_BiomedSumm folder
        self.docs = {}
        self.verbose = verbose
        for topic_path in listfulldir(os.path.join(data_path, 'data')):
            topic = os.path.split(topic_path)[1].lower()
            self.docs.setdefault(topic, {})
            for doc_path in listfulldir(os.path.join(topic_path,
                                                     'Documents_Text')):
                doc = os.path.split(doc_path)[1][:-4].lower()

                with codecs.open(doc_path, mode='rb',
                                 encoding='utf-8', errors='strict') as df:
                    try:
                        self.docs[topic][doc] = df.read().replace('\r', '')
                    except UnicodeDecodeError:
                        with file(doc_path, mode='rb') as df:
                            frmt = charade.detect(df.read())['encoding']
                        with codecs.open(doc_path, mode='rb', encoding=frmt,
                                         errors='strict') as df:
                            self.docs[topic][doc] = df.read().replace('\r', '')
        if self.verbose:
            print('list of topics: %s' % '; '.join(self.docs.keys()))
            dnames = set(chain(*[d.keys() for d in self.docs.itervalues()]))
            print('list of doc_name: %s' % '; '.join(dnames))

        # create name aliases for inconsistencies caused by ES
        for topic in self.docs.keys():
            for doc in self.docs[topic].keys():
                if doc.find(',') >= 0 or doc.find('\'') >= 0:
                    new_doc = doc.replace(',', '').replace('\'', '"')
                    self.docs[topic][new_doc] = self.docs[topic][doc]

        self.para_index = {}
        for topic in self.docs:
            self.para_index.setdefault(topic, {})
            for doc, data in self.docs[topic].iteritems():
                paragraphs = para_tokenize(self.docs[topic][doc])
                soff = [(s, e) for s, e in sorted(paragraphs['offsets'],
                                                  key=lambda x: x[0],
                                                  reverse=True)]
                self.para_index[topic][doc] = OrderedDict(soff)
Code example #38
def text_reader(file):
    try:
        with open(file, 'rb') as f_obj:
            origin = f_obj.read()
            chartype = charade.detect(origin)
            try:
                if 'GB' in chartype['encoding']:
                    article = origin.decode('gbk')
                else:
                    article = origin.decode(chartype['encoding'])
            except (UnicodeDecodeError, TypeError):
                error_box.append('Error - File decode failed: ' + file)
                article = None
    except FileNotFoundError:
        error_box.append('Error - File does not exist: ' + file)
        article = None
    if article:
        while '=' in article:
            article = article.replace('=', '等于')
    return article
Code example #39
    def serial(result, fname="temp.bin"):
        if isinstance(result, pd.DataFrame) or isinstance(result, pd.Panel):
            fname = str(fname).replace('.searial', '.df')
        elif isinstance(result, np.ndarray):
            fname = str(fname).replace('.searial', '.csv')

        if charade.detect(fname)['encoding'] == 'utf-8':
            fname = convert(fname)
        if isinstance(result, pd.DataFrame) or isinstance(result, pd.Panel):
            result.to_pickle(fname)
            #result.to_csv(fname)
        elif isinstance(result, np.ndarray):
            np.savetxt(fname, result, delimiter=',', fmt='%.3f')
        else:
            f = open(fname, "wb")
            p = cPickle.Pickler(f)
            p.clear_memo()
            p.fast = True
            p.dump(result)
            f.close()
Code example #40
def determine_encoding(page):
    encoding = "utf8"
    text = TAG_MARK_PATTERN.sub(to_bytes(" "), page)

    # don't venture to guess
    if not text.strip() or len(text) < 10:
        return encoding

    # try enforce UTF-8
    diff = text.decode(encoding, "ignore").encode(encoding)
    sizes = len(diff), len(text)

    # 99% of UTF-8
    if abs(len(text) - len(diff)) < max(sizes) * 0.01:
        return encoding

    # try detect encoding
    encoding_detector = charade.detect(text)
    if encoding_detector["encoding"]:
        encoding = encoding_detector["encoding"]

    return encoding
Code example #41
File: htmlPage.py Project: gromoteur/gromoteur
    def dec(raw):
        print("mmm", charade.detect(raw)['encoding'])
        encoding = None
        for enc in ('utf-8', "CP1252", 'utf-16', 'utf-32'):
            try:
                sdec = raw.decode(enc)
                encoding = enc
                break
                #print "good",encoding

                #return sdec
            except UnicodeDecodeError:
                print("error", enc)
        if encoding:
            print("found encoding", encoding)
            #print sdec
            if "é" in sdec:
                print("ooooooooooooooooooo")
        else:
            findEncodingInfo(raw)
            decode(raw)
            print("chardet.detect(raw)", chardet.detect(raw))
Code example #42
File: document.py Project: iAcquire/breadability
def determine_encoding(page):
    encoding = "utf8"
    text = TAG_MARK_PATTERN.sub(to_bytes(" "), page)

    # don't venture to guess
    if not text.strip() or len(text) < 10:
        return encoding

    # try enforce UTF-8
    diff = text.decode(encoding, "ignore").encode(encoding)
    sizes = len(diff), len(text)

    # 99% of UTF-8
    if abs(len(text) - len(diff)) < max(sizes) * 0.01:
        return encoding

    # try detect encoding
    encoding_detector = charade.detect(text)
    if encoding_detector["encoding"]:
        encoding = encoding_detector["encoding"]

    return encoding
Code example #43
File: http.py Project: bobquest33/sparks
def detect_encoding_from_requests_response(response, meta=False, deep=False):
    """ Try to detect encoding as much as possible.

    :param:`response` being a :module:`requests` response, this function
    will try to detect the encoding as much as possible. First, the "normal"
    response encoding will be tried, else the headers will be parsed, and
    finally the ``<head>`` of the ``<html>`` content will be parsed. If
    nothing succeeds, we will rely on :module:`charade` to guess from the
    content.

    .. todo:: we have to check if content-type is HTML before parsing the
        headers. For now you should use this function only on responses
        which you are sure they will contain HTML.
    """

    if getattr(response, 'encoding', None) and not (meta or deep):

        # To understand, please read
        # http://docs.python-requests.org/en/latest/user/advanced/#encodings
        if response.encoding.lower() != 'iso-8859-1':
            if __debug__:
                LOGGER.debug(u'detect_encoding_from_requests_response(): '
                             u'detected %s via `requests` module.',
                             response.encoding)

            return response.encoding

    # If requests doesn't bring us any encoding or returns 'iso-8859-1',
    # we have 3 fallback options:
    # - inspect the server headers ourselves. This is fast, but rarely
    #   they exist (that's probably why requests failed), and sometimes
    #   they disagree with META tags,
    # - look up the META tags. This is fast too, but sometimes the tag
    #   is not present or the value is wrong too,
    # - detect it via `charade`. Quite slower, but gives accurate results.

    content_type = response.headers.get('content-type', None)

    # If found and no deeper search is wanted, return it.
    if content_type is not None and 'charset' in content_type \
            and not (meta or deep):

        encoding = content_type.lower().split('charset=')[-1]

        if __debug__:
            LOGGER.debug(u'detect_encoding_from_requests_response(): '
                         u'detected %s via server headers.',
                         encoding)

        return encoding

    # HTTP headers don't contain any encoding.
    # Search in page head, then try to detect from data.

    html_content = BeautifulSoup(response.content, 'lxml')

    found = False
    try:
        metas = html_content.head.findAll('meta')

    except AttributeError:
        # Happens on non-HTML pages (eg. RSS feed, other XML resources…)
        metas = []

    for meta_header in metas:
        for attribute, value in meta_header.attrs.items():

            if attribute.lower() == 'charset':
                encoding = value
                found = True
                break

            elif attribute.lower() == 'http-equiv':
                if value.lower() == 'content-type':
                    # OMG o_O took time to find this one :
                    #
                    # In [73]: meta_header
                    # Out[73]: <meta content="text/html; charset=utf-8" …
                    # In [74]: meta_header.get('content')
                    # Out[74]: u'text/html; charset=iso-8859-1'
                    #
                    # We cannot rely on get('content') and need to
                    # fallback to good ol' RE searching. Thanks BS4.
                    content = unicode(meta_header).lower()
                    if 'charset' in content:
                        encoding = re.search('charset=([\w-]*)',
                                             content, re.I | re.U).group(1)
                        found = True
                        break
        if found:
            break

    # If no deeper search is wanted, return it now.
    if found and encoding not in ('text/html', '', None) and not deep:

        if __debug__:
            LOGGER.debug(u'detect_encoding_from_requests_response(): '
                         u'detected %s via HTML meta tags.',
                         encoding)

        return encoding

    try:
        charade_result = charade.detect(response.content)

    except:
        pass

    else:
        if __debug__:
            LOGGER.debug(u'detect_encoding_from_requests_response(): '
                         u'detected %s via `charade` module (with %s%% '
                         u'confidence).',
                         charade_result['encoding'],
                         charade_result['confidence'] * 100)
        return charade_result['encoding']

    LOGGER.critical('detect_encoding_from_requests_response(): could not '
                    u'detect encoding of %s via all test methods.', response)
    return None
Code example #44
File: TextPad.py Project: YuriiKot/NotePad-on-Python
 def CodingDetermine(self):
     
     coding = charade.detect(self.textEdit.toPlainText())
     ExMess = QtGui.QMessageBox.question(self, u'Coding!', str(coding.items()), QtGui.QMessageBox.Yes, QtGui.QMessageBox.No)
Code example #45
File: main.py Project: sambanshee/textgen-fun
                        help="Tries, default: 10",
                        type=int,
                        default="10")
    parser.add_argument("-r",
                        "--remove_punct",
                        help="Remove punctuation,default: False",
                        type=bool,
                        default=False)
    args = parser.parse_args()

    print "Reading all files from %s" % args.dir

    try:
        files = open_dir(args.dir)
        result = read_files(files)
        enc = charade.detect(result)
        syscodepage = sys.stdout.encoding
        if args.remove_punct:
            print "Removing punctuation per %s" % args.remove_punct
            chars = re.escape(string.punctuation)
            new_result = re.sub(r'[' + chars + ']', ' ',
                                result.decode(enc['encoding']))
        else:
            new_result = result.decode(enc['encoding'])
    except:
        raise

    print "Tokenizing text..."

    if args.remove_punct:
        tokens = nltk.word_tokenize(new_result.lower())
Code example #46
File: srtfile.py Project: mtruneck/vocap
 def _detect_encoding(cls, path):
     report = charade.detect(open(path).read())
     encoding = report.get('encoding')
     if not encoding:
         return cls.DEFAULT_ENCODING
     return cls._normalize_encoding(encoding)
Code example #47
File: utils.py Project: yopming/Gandolf
def encoding_detect(file_path):
    """ get file's encoding """
    file_buf = open(file_path, 'rb').read()
    result = charade.detect(file_buf)
    return result['encoding']
Code example #48
# tested windows-1250 to windows-1258 (1259 doesn't exist)
# utf-16 gives error: UnicodeError: UTF-16 stream does not start with BOM
# macRoman macGreek macturkish maclatin2
# latin-1 latin2 - latin10   nb  iso-8859-1 == latin-1  iso-8859-5 to 8
# UTF-16LE UTF-16BE utf_32_le utf_32_be
# ISO-8859-7
# cp500 cp737 cp850 cp852 cp855 cp857 cp858 cp869 cp875 cp1026 cp1140
# greek == iso-8859-7
# ascii (lol)
#

import ftfy


rawdata = open(dir + file, 'rb').read()
result = charade.detect(rawdata)
print ftfy.guess_bytes(rawdata)[0]
print rawdata
print result
'''


with codecs.open(dir + file, mode='r', encoding='utf-8') as infile:
#with io.open(dir + file, mode='rb') as infile:
#    data = infile.read().encode('windows-1250')
        #.decode('latin1')

    #print data
    for line in infile:

        #line = line.replace(u'ˆ', u'à')
Code example #49
File: main.py Project: sambanshee/textgen-fun
if __name__ == '__main__':
  parser = argparse.ArgumentParser(description="Generate Markov chains from directory with text files")
  parser.add_argument("-d", "--dir", help="directory with files", required=True)
  parser.add_argument("-n", "--ngram", help="n-Gram, default: 3 (trigram)", type=int, default="3")
  parser.add_argument("-w", "--words", help="Words, default: 100", type=int, default="100")
  parser.add_argument("-t", "--tries", help="Tries, default: 10", type=int, default="10")
  parser.add_argument("-r", "--remove_punct", help="Remove punctuation,default: False", type=bool, default=False)
  args = parser.parse_args()
  
  print "Reading all files from %s" % args.dir
  
  try:
    files = open_dir(args.dir)
    result = read_files(files)
    enc = charade.detect(result)
    syscodepage =  sys.stdout.encoding
    if args.remove_punct:
      print "Removing punctuation per %s" % args.remove_punct
      chars = re.escape(string.punctuation)
      new_result = re.sub(r'[' + chars + ']', ' ', result.decode(enc['encoding']))
    else:
      new_result = result.decode(enc['encoding'])
  except:
    raise 
  
  print "Tokenizing text..."
  
  if args.remove_punct:
    tokens = nltk.word_tokenize(new_result.lower())
  else:
Code example #50
File: htmlPage.py Project: gromoteur/gromoteur
    def _openLink(self, link):
        if verbose:
            print("trying: --------------- " + link.encode("utf-8") +
                  " ---------------------")
        try:
            r = self.groopener.open(link)
            self.urls = []
            self.mime = r.headers.get('content-type', None)
            #			.info().gettype()
            if verbose: print("self.mime", self.mime)
            if self.mime and self.mime.startswith(
                    "text"):  ######### text, html, or xml

                #				network.read()
                if verbose:
                    print("reading text or html: " + link.encode("utf-8") +
                          " ok")
                #				if self.followRedirect:match=redirectre.search(self.source)
                #				while self.followRedirect and link not in self.urls and redirectre.search(self.source):
                #					link = urljoin(link, redirectre.search(self.source).groups()[0].strip())
                #					network=self.groopener.open(link)
                #					self.source = network.read()
                #					self.urls+=[link]
                self.urls = [x.url for x in r.history]
                if link[-1] == "/":
                    link = link[:
                                -1]  # strangely enough normal pages without redirects are not added to the history
                if link not in self.urls and link + "/" not in self.urls:
                    self.urls += [link]  #
                if self.defaultEncoding.strip(
                ):  # an encoding should be forced. use BeautifulSoup
                    self.source = r.content
                    self.soup = BeautifulSoup(
                        self.source,
                        "html.parser",
                        from_encoding=self.defaultEncoding)
                    self.encoding = self.soup.original_encoding
                    self.source = str(self.soup)
                else:
                    self.source = r.content
                    self.encoding = charade.detect(r.content)['encoding']
                    if self.encoding:
                        self.soup = BeautifulSoup(self.source,
                                                  "html.parser",
                                                  from_encoding=self.encoding)

                        #if u"ĂŠ" in unicode(self.soup) or u"é" in unicode(self.soup):
                        if "Ă" in str(
                                self.soup
                        ):  # key for finding most wrong windows encodings inside utf-8

                            for g in isoUtf8Garbage:  # check whether it's at least one other typical character
                                if "Ă" + g in str(self.soup):
                                    if verbose:
                                        print(
                                            "Ă + some typical garbage - it's in fact utf-8"
                                        )
                                    # typical errors when something is in fact utf-8 but it's decoded as western
                                    self.encoding = "utf-8"
                                    self.soup = BeautifulSoup(
                                        self.source, "html.parser"
                                    )  # .decode("utf-8",'replace')
                                    break

                    else:
                        self.soup = BeautifulSoup(self.source, "html.parser")
                        self.encoding = r.encoding
                    if verbose:
                        print(
                            "htmlPage self.soup.contains_replacement_characters",
                            self.soup.contains_replacement_characters)
                        print("self.soup.original_encoding",
                              self.soup.original_encoding)
                    #self.encoding = r.encoding
                    self.source = str(self.soup)
                    #if verbose:print self.source.encode("utf-8")

                self.links = self._getLinks()
                self.source = self.getCleanHtml()
                self.text = self.html2text()
                title = None
                if self.soup.head: title = self.soup.head.title
                if not title: title = self.soup.title
                if title: self.title = title.text
                self.computeStat()

            elif self.mime == "application/pdf" and self.takePdf:  ######### pdf
                self.source = r.content  # TODO: how about encoding of pdf pages?
                if verbose:
                    print("reading pdf: " + link.encode("utf-8") + " ok")
                try:
                    self.text = self.pdf2text(self.source)
                except:
                    self.text = ""
                self.computeStat()
                self.title = link.split("/")[-1]
                if self.title.endswith(".pdf"): self.title = self.title[:-4]
                self.source = "pdf"  # TODO: what to do with the pdf source code??? can't put it easily into sqlite!

            elif verbose:
                print("wrong mime type", self.mime)
        except IOError:
            if verbose:
                print("timeout with ", link.encode("utf-8"))
            self.error = "Timeout"

        except Exception as msg:
            if verbose:
                print(traceback.format_exc())
                print(msg)
                print("problem with ", link.encode("utf-8"))
                self.error = str(traceback.format_exc()) + " " + str(msg)
            else:
                self.error = "other exception:" + str(msg)
Code example #51
File: webpage.py Project: cgoliver/zminerva
 def _use_htmlbytes(self, htmlbytes):
     encoding = charade.detect(htmlbytes)["encoding"]
     self.htmldata = htmlbytes.decode(encoding=encoding)
Code example #52
def is_utf8(s):
    return charade.detect(s)['encoding'] == 'utf-8'
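
One caveat worth keeping in mind with a check like is_utf8: charade reports pure-ASCII input as 'ascii' rather than 'utf-8', so ASCII-only byte strings fail the test even though they are valid UTF-8. For example (Python 3 literals):

is_utf8('naïve café'.encode('utf-8'))   # expected True: multi-byte sequences detected as utf-8
is_utf8(b'plain ascii text')            # expected False: charade reports 'ascii' here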