def detect_encoding(self):
    """
    Try to guess the encoding of ``self.data``.

    The first successful detection is stored on the instance as
    ``_detect_encoding`` and reused by later calls.

    Returns: the value produced by ``charlockholmes.detect`` (described by
             the original documentation as a mapping with encoding,
             confidence and type entries -- TODO confirm against the
             library), or None when ``self.data`` is empty.
    """
    # Serve the memoized answer when a previous call already stored one.
    try:
        return self._detect_encoding
    except AttributeError:
        pass

    # Nothing to inspect: report "unknown" without caching anything.
    if not self.data:
        return None

    guess = charlockholmes.detect(self.data)
    self._detect_encoding = guess
    return guess
def detect_encoding(self):
    """
    Try to guess the encoding of ``self.data``.

    The first successful detection is stored on the instance as
    ``_detect_encoding`` and reused by later calls.

    Text input is encoded to UTF-8 (unencodable characters are ignored)
    before detection; byte input is handed to the detector unchanged.

    Returns: the value produced by ``charlockholmes.detect`` (described by
             the original documentation as a mapping with encoding,
             confidence and type entries -- TODO confirm against the
             library), or None when ``self.data`` is empty.
    """
    if hasattr(self, '_detect_encoding'):
        return self._detect_encoding

    data = self.data
    if not data:
        return None

    # Only text needs encoding. Calling .encode() on raw bytes fails on
    # Python 3 (no such method) and on Python 2 triggers an implicit ASCII
    # decode that raises for any non-ASCII byte.
    if not isinstance(data, (bytes, bytearray)):
        data = data.encode('utf-8', 'ignore')

    self._detect_encoding = charlockholmes.detect(data)
    return self._detect_encoding
def get_and_extract(url, timeout=5):
    """Fetch ``url`` asynchronously and run ``extract`` on the decoded body.

    This is a generator-style coroutine: it yields the pending fetch and
    delivers its result by raising ``gen.Return`` (presumably it is wrapped
    by a coroutine decorator outside this block -- confirm).

    Args:
        url: address to fetch.
        timeout: forwarded to the HTTP client as ``connect_timeout``.

    Returns (via ``gen.Return``):
        ``{}`` when the fetch fails (the failure is logged), otherwise the
        value of ``extract(body)`` for the decoded response body.
    """
    http_client = AsyncHTTPClient()
    try:
        resp = yield http_client.fetch(url, connect_timeout=timeout)
    except Exception:
        # Best effort: log and return an empty result. ``Exception`` (not a
        # bare except) keeps KeyboardInterrupt/SystemExit/GeneratorExit
        # propagating.
        logging.exception('fetch: %s', url)
        raise gen.Return({})

    # The header may be missing entirely; treat that as "no charset given".
    content_type = resp.headers.get('Content-Type') or ''
    segs = charset_regexp.findall(content_type)
    if segs:
        charset = segs[0]
    else:
        # The detector may return nothing, or an entry whose encoding is
        # None; fall back to UTF-8 instead of passing None to decode().
        detected = detect(resp.body) or {}
        charset = detected.get('encoding') or 'utf8'

    try:
        body = resp.body.decode(charset, errors='ignore')
    except LookupError:
        # Unknown/misspelled charset name: decode as UTF-8 instead.
        body = resp.body.decode('utf8', errors='ignore')
    raise gen.Return(extract(body))
"file/test.sh", { 'confidence': 21, 'type': 'text', 'language': 'es', 'encoding': 'ISO-8859-1' } ], "elf": ["file/test", { 'confidence': 100, 'type': 'binary' }], "bz2": ["file/test.tar.bz2", { 'confidence': 100, 'type': 'binary' }], "gz": ["file/test.tar.gz", { 'confidence': 100, 'type': 'binary' }], } for test in TEST_FILES: file_path = TEST_FILES[test][0] file_result = TEST_FILES[test][1] content = open(file_path, "rb").read() test_result = detect(content) if test_result == file_result: print(file_path + ": OK") else: print(file_path + ": ERROR")
"file/test.c", {'confidence': 50, 'type': 'text', 'language': 'en', 'encoding': 'ISO-8859-1'} ], "sh": [ "file/test.sh", {'confidence': 21, 'type': 'text', 'language': 'es', 'encoding': 'ISO-8859-1'} ], "elf": [ "file/test", {'confidence': 100, 'type': 'binary'} ], "bz2": [ "file/test.tar.bz2", {'confidence': 100, 'type': 'binary'} ], "gz": [ "file/test.tar.gz", {'confidence': 100, 'type': 'binary'} ], } for test in TEST_FILES: file_path = TEST_FILES[test][0] file_result = TEST_FILES[test][1] content = open(file_path).read() test_result = detect(content) if test_result == file_result: print file_path + ": OK" else: print file_path + ": ERROR"