def test_speed(self):
    """Benchmark cchardet (and chardet, when installed) on a Shift-JIS sample."""
    try:
        import chardet
        has_chardet = True
    except ImportError:
        has_chardet = False
    import time

    do_times = 5
    path = r"testdata/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt"
    with open(path, 'rb') as f:
        msg = f.read()

    if has_chardet:
        # Average several runs of chardet and report a calls-per-second rate.
        elapsed = 0
        for _ in range(do_times):
            started = time.time()
            chardet.detect(msg)
            elapsed += (time.time() - started)
        print('chardet:', 1 / (elapsed / do_times), 'call(s)/s')

    # Time cchardet the same way for comparison.
    elapsed = 0
    for _ in range(do_times):
        started = time.time()
        cchardet.detect(msg)
        elapsed += (time.time() - started)
    print('cchardet:', 1 / (elapsed / do_times), 'call(s)/s')
def get(qstring):
    """ Builds and returns a JSON reply of all information and requested data

    NOTE(review): Python 2 code (urllib2, headers.getparam) — confirm the
    project targets Python 2.

    :param qstring: raw query string; recognised keys are ``url`` (target to
        fetch), ``headers`` ("true" to echo response headers back) and
        ``callback`` (JSONP wrapper name).
    :returns: JSON string (or JSONP expression when ``callback`` is given)
        containing ``content``, ``headers`` and ``status``.
    """
    args = dict(urlparse.parse_qsl(qstring))
    reply = {}
    reply["headers"] = {}
    reply["status"] = {}
    if "url" in args and _validateUrl(args["url"]):
        reply["status"]["url"] = args["url"]
        # Default to plain http when no scheme was supplied.
        if not args["url"].startswith("http://") and not args["url"].startswith("https://") :
            args["url"] = "http://"+args["url"]
        # Spoof a browser User-Agent; some sites reject the urllib2 default.
        hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
        req = urllib2.Request(args["url"],headers=hdr)
        try:
            response = urllib2.urlopen(req)
            charset = response.headers.getparam('charset')
            if charset is not None:
                # Declared charset first, then a cchardet guess, then raw bytes.
                resp = response.read()
                try:
                    reply["content"] = resp.decode(charset)
                except:
                    try:
                        reply["content"] = resp.decode(cchardet.detect(resp)['encoding'])
                    except:
                        reply["content"] = resp
            else:
                # No declared charset: try a cchardet guess, fall back to raw bytes.
                resp = response.read()
                try:
                    reply["content"] = resp.decode(cchardet.detect(resp)['encoding'])
                except:
                    reply["content"] = resp
            reply["status"]["http_code"] = response.code
            if "headers" in args and args["headers"] == "true":
                reply["headers"] = dict(response.info())
        except (urllib2.HTTPError, urllib2.URLError) as e:
            # Last resort: retry once with a cookie-enabled opener.
            try:
                reply['content'] = urllib2.build_opener(urllib2.HTTPCookieProcessor).open(args["url"]).read()
            except:
                reply["status"]["reason"] = str(e.reason)
                reply["content"] = None
            reply["status"]["http_code"] = e.code if hasattr(e,'code') else 0
    else:
        reply["content"] = None
        reply["status"]["http_code"] = 400
        reply["status"]["reason"] = "The url parameter value is missing or invalid"
    # Attach callback to reply if jsonp request
    if "callback" in args:
        return "{0}({1})".format(args["callback"], json.dumps(reply))
    return json.dumps(reply, ensure_ascii=False)
def test_github_issue_20(self):
    """Regression test for https://github.com/PyYoshi/cChardet/issues/20"""
    payload = b'\x8f'
    # Both the one-shot API and the incremental detector must accept the byte
    # without crashing.
    cchardet.detect(payload)
    detector = cchardet.UniversalDetector()
    detector.feed(payload)
    detector.close()
def detect(self, data, safe=False):
    """Detect the encoding of *data* with cchardet.

    cchardet historically raised TypeError when handed text instead of
    bytes, so on TypeError we retry once with ``str(data)``.

    :param data: the payload to inspect.
    :param safe: when True, return None on any failure instead of raising.
    :returns: cchardet's result dict, or None in safe mode on failure.

    Fixes: the original's inner ``except Exception: raise`` was a no-op,
    and a failure on the retry path escaped even when ``safe=True``; safe
    mode now suppresses failures from both attempts.
    """
    try:
        try:
            return cchardet.detect(data)
        except TypeError:
            # Give it one last try with a coerced value before giving up.
            return cchardet.detect(str(data))
    except Exception:  # pragma: no cover
        if safe:
            return None
        raise
def _convert_to_unicode(self, content):
    """Recursively convert *content* to unicode.

    Byte strings are decoded using a chardet guess (utf-8 when the guess is
    empty); dict values are converted recursively.  Lists, tuples and other
    iterables are returned unchanged — we don't need those yet.

    :param content: content to convert
    :type content: object
    :rtype: object
    """
    if isinstance(content, unicode):
        return content
    elif isinstance(content, str):
        guess = chardet.detect(content)
        encoding = guess["encoding"] or "utf-8"
        try:
            return content.decode(encoding)
        except UnicodeError as e:
            logger.error("Unicode error: {0!r}. Using 'replace'".format(e))
            return content.decode(encoding, 'replace')
    else:
        if isinstance(content, dict):
            for key in content.keys():
                content[key] = self._convert_to_unicode(content[key])
        return content
def versioned_static(file_path):
    """Return the static URL for *file_path* with a content-hash query string.

    The file's bytes are normalised to UTF-8 (when a text encoding can be
    detected) so the hash is stable, then the first 7 hex chars of the SHA-1
    are appended as ``?v=...``.  When the file cannot be found the plain URL
    is returned and a warning logged.

    Fixes: cchardet can return ``encoding=None`` for binary data, which made
    ``decode(None)`` raise TypeError — not caught by ``except ValueError`` —
    and after a caught failure the bytes hit ``.encode('utf-8')``, which
    fails on Python 3.  Decoding failures now leave the raw bytes to be
    hashed as-is.
    """
    full_path = find(file_path)
    url = static(file_path)

    if not full_path:
        logger.warning('Could not find static file: {0}'.format(file_path))
        return url

    with open(full_path, 'rb') as file_contents:
        file_data = file_contents.read()

    # Normalise encoding; on any detection/decoding failure, hash raw bytes.
    try:
        encoding = cchardet.detect(file_data)['encoding']
        if encoding:
            file_data = file_data.decode(encoding).encode('utf-8')
    except (ValueError, TypeError, LookupError):
        pass

    # 7 chars of sha1 hex
    sha1_hex = sha1(file_data).hexdigest()[:7]
    return url + '?v=' + sha1_hex
def _write(self, raw_data, filename):
    """
    Write raw data to a compressed file.

    @arg raw_data: The raw_data to be compressed and written
    @type raw_data: byte string
    @arg filename: The intended name of the outfile
    @type filename: unicode

    @return: outfile ; The full path and name of the file written
    @rtype: unicode
    """
    # Only trust the detector above 0.5 confidence; otherwise assume UTF-8.
    guess = chardet.detect(raw_data)
    encoding = guess['encoding'] if guess['confidence'] > 0.5 else 'utf-8'

    # Transcode to UTF-8 unless the detected encoding already is an alias.
    if not util.is_utf8_alias(encoding):
        raw_data = raw_data.decode(encoding).encode('utf-8')

    # Compress the data to save disk space.
    compressor = bz2.BZ2Compressor()
    compressed = compressor.compress(raw_data) + compressor.flush()

    out_handle = open(self._nametofile(filename), "wb")
    out_handle.write(compressed)
    out_handle.close()

    return out_handle.name  # the full path to the file
def lazy_chardet_encoding(data):
    """Return the chardet-detected encoding of *data* as a text string.

    Falls back to '' when detection finds nothing.

    Fixes: the original called ``.encode('ascii', 'ignore')`` on a *bytes*
    encoding name, which raises AttributeError on Python 3 (bytes has no
    ``encode``); a bytes name must be *decoded* to text instead.
    """
    chardet_encoding = chardet.detect(data)['encoding']
    if not chardet_encoding:
        chardet_encoding = ''
    if isinstance(chardet_encoding, bytes_):
        # Decode (not encode) a bytes encoding name to ASCII text.
        chardet_encoding = chardet_encoding.decode('ascii', 'ignore')
    return chardet_encoding
def d(url, bytes_data):
    """Guess the encoding of *bytes_data* fetched from *url*, with caching.

    Returns '' when chardet is unavailable or the guess has confidence at or
    below 0.8.  Results are cached per URL under Fetcher.lock.

    Fixes: the lock is now released in a ``finally`` block — previously any
    exception between acquire() and release() (e.g. inside chardet or
    lookup_encoding) left the lock held forever, deadlocking other callers.
    Also ``chardet == None`` is replaced with the identity test ``is None``.
    """
    if chardet is None:
        return ''
    Fetcher.lock.acquire()
    try:
        if url not in Fetcher.encoding_cache:
            cd_r = chardet.detect(bytes_data)
            cd_confidence = cd_r['confidence']
            cd_encoding = cd_r['encoding']
            if cd_confidence > 0.8:
                ret = Fetcher.lookup_encoding(cd_encoding)
                print('\n%s\nchardet[encoding:%s, confidence:%.5f]' % \
                    (url, ret, cd_confidence) )
            else:
                ret = ''
            # add to cache
            Fetcher.encoding_cache[url] = ret
        else:
            ret = Fetcher.encoding_cache[url]
    finally:
        Fetcher.lock.release()
    return ret
def convert_encoding(data, new_coding='UTF-8'):
    """Re-encode the byte string *data* into *new_coding*.

    Returns *data* unchanged when it is already in the target encoding or
    when no encoding could be detected.

    Fixes: the original called ``data.decode(encoding, data)`` — the second
    positional argument of ``bytes.decode`` is the error-handler *name*, so
    passing the data itself raises TypeError.  Also guards against cchardet
    returning ``encoding=None`` (which crashed ``.upper()``).
    """
    encoding = cchardet.detect(data)['encoding']
    if encoding and new_coding.upper() != encoding.upper():
        data = data.decode(encoding).encode(new_coding)
    return data
def test_detect_zh_gb18030(self):
    """Detection of a GB18030-encoded Chinese sample."""
    expected = "GB18030"
    sample_path = r"testdata/zh/GB18030/wikitop_zh_GB18030.txt"
    with open(sample_path, 'rb') as sample:
        payload = sample.read()
    result = cchardet.detect(payload)
    eq_(expected.lower(), result['encoding'].lower())
def main(path, n, encoding=None, no_more=False):
    """Show the first *n* lines of *path*.

    The text is decoded with *encoding* when given, otherwise with a chardet
    guess (requiring > 0.7 confidence); output is paged through ``more``
    unless *no_more* is set.  File-access errors are reported, not raised.
    """
    try:
        with open(path, 'rb') as f:
            head_lines = fileecho.head(f, n)
        blob = b'\n'.join(head_lines)
        guess = chardet.detect(blob)
        if encoding is not None:
            codec = encoding
        elif guess['confidence'] > 0.7:
            codec = guess['encoding']
        else:
            # Too uncertain to decode safely; ask the user to specify one.
            color.print_warn('Not Known encoding, may be %s.\n'
                             'Please point it explictly' % guess['encoding'])
            return
        text = blob.decode(codec, errors='ignore')
        if no_more:
            color.print_info(text)
        else:
            more(text, print_color=True)
    except FileNotFoundError:
        color.print_err('%s not found' % path)
    except PermissionError:
        color.print_err('Permission denied: %s' % path)
def test_detect_bg_iso88595(self):
    """Detection of an ISO-8859-5-encoded Bulgarian sample."""
    expected = "ISO-8859-5"
    sample_path = r"testdata/bg/ISO-8859-5/wikitop_bg_ISO-8859-5.txt"
    with open(sample_path, 'rb') as sample:
        payload = sample.read()
    result = cchardet.detect(payload)
    eq_(expected.lower(), result['encoding'].lower())
def stdEncode(sourceFile,destinationDir,suffix='',ext='txt',sourceEncoding='utf-16',destinationEncoding='utf-8', detectEncoding=True, overwrite=False, verbose = True):
    """Re-encode *sourceFile* to UTF-8, writing the result under *destinationDir*.

    The output file name replaces the original extension with suffix + ext.
    Existing outputs are skipped unless *overwrite* is set.

    NOTE(review): Python 2 code — ``string.decode`` / ``string.encode`` on
    the result of a text-mode read only works on Python 2; confirm the
    target interpreter.
    NOTE(review): ``DecodeError`` is not defined in this block — presumably
    a project-level exception; if not in scope, the handler itself raises
    NameError.
    NOTE(review): ``destinationEncoding`` is accepted but never used; the
    output is hard-coded to utf-8.
    """
    print(destinationDir)
    print(sourceFile)
    # make destination dir if necessary
    if not(os.path.exists(destinationDir)):
        os.mkdir(destinationDir)
    # Replace the original extension (1-6 alphanumerics) with suffix + ext.
    destFile = re.sub('(\.[a-zA-Z0-9]{1,6})', suffix+'.'+ext, sourceFile)
    destPath = os.path.join(destinationDir,destFile)
    if not(overwrite):
        if os.path.exists(destPath):
            if(verbose):
                print(destPath + " already exists; skipping.")
            return
    with open(sourceFile, mode = "r") as infile:
        try:
            string = infile.read()
            if detectEncoding==True:
                # Override the declared source encoding with a cchardet guess.
                sourceEncoding = cchardet.detect(string)['encoding']
                #print(sourceEncoding)
            string = string.decode(sourceEncoding)
            with open(destPath,"w") as outfile:
                outfile.write(string.encode('utf-8'))
        except DecodeError:
            return
def test_detect_de_windows1252(self):
    """Detection of a WINDOWS-1252-encoded German sample."""
    expected = "WINDOWS-1252"
    sample_path = r"testdata/de/WINDOWS-1252/wikitop_de_WINDOWS-1252.txt"
    with open(sample_path, 'rb') as sample:
        payload = sample.read()
    result = cchardet.detect(payload)
    print(result)
    eq_(expected.lower(), result['encoding'].lower())
def test_detect_de_utf8(self):
    """Detection of a UTF-8-encoded German sample."""
    expected = "UTF-8"
    sample_path = r"testdata/de/UTF-8/wikitop_de_UTF-8.txt"
    with open(sample_path, 'rb') as sample:
        payload = sample.read()
    result = cchardet.detect(payload)
    print(result)
    eq_(expected.lower(), result['encoding'].lower())
def test_detect_cz_iso88592(self):
    """Detection of an ISO-8859-2-encoded Czech sample."""
    expected = "ISO-8859-2"
    sample_path = r"testdata/cz/ISO-8859-2/wikitop_cz_ISO-8859-2.txt"
    with open(sample_path, 'rb') as sample:
        payload = sample.read()
    result = cchardet.detect(payload)
    print(result)
    eq_(expected.lower(), result['encoding'].lower())
def test_detect_el_iso88597(self):
    """Detection of an ISO-8859-7-encoded Greek sample."""
    expected = "ISO-8859-7"
    sample_path = r"testdata/el/ISO-8859-7/wikitop_el_ISO-8859-7.txt"
    with open(sample_path, 'rb') as sample:
        payload = sample.read()
    result = cchardet.detect(payload)
    print(result)
    eq_(expected.lower(), result['encoding'].lower())
def test_detect_tr_iso88599(self):
    """Detection of an ISO-8859-9-encoded Turkish sample."""
    expected = "ISO-8859-9"
    sample_path = r"testdata/tr/ISO-8859-9/wikitop_tr_ISO-8859-9.txt"
    with open(sample_path, 'rb') as sample:
        payload = sample.read()
    result = cchardet.detect(payload)
    print(result)
    eq_(expected.lower(), result['encoding'].lower())
def test_detect_th_tis620_2(self):
    """Detection of a TIS-620-encoded Thai sample."""
    expected = "TIS-620"
    sample_path = r"testdata/th/TIS-620/wikitop_th_TIS-620.txt"
    with open(sample_path, 'rb') as sample:
        payload = sample.read()
    result = cchardet.detect(payload)
    print(result)
    eq_(expected.lower(), result['encoding'].lower())
def test_detect_ru_maccyrillic(self):
    """Detection of a MacCyrillic-encoded Russian sample."""
    expected = "MAC-CYRILLIC"
    sample_path = r"testdata/ru/X-MAC-CYRILLIC/wikitop_ru_MACCYRILLIC.txt"
    with open(sample_path, 'rb') as sample:
        payload = sample.read()
    result = cchardet.detect(payload)
    print(result)
    eq_(expected.lower(), result['encoding'].lower())
def test_detect_ru_ibm855(self):
    """Detection of an IBM855-encoded Russian sample."""
    expected = "IBM855"
    sample_path = r"testdata/ru/IBM855/wikitop_ru_IBM855.txt"
    with open(sample_path, 'rb') as sample:
        payload = sample.read()
    result = cchardet.detect(payload)
    print(result)
    eq_(expected.lower(), result['encoding'].lower())
def _detect_encoding(self):
    """Sniff the encoding using the entire file."""
    with open(self.resource['path'], 'rb') as stream:
        contents = stream.read()
    guessed = cchardet.detect(contents)['encoding']
    logging.info('Detected %s encoding with cchardet', guessed)
    return guessed
def detect(self, data, safe=False):
    """Detect the encoding of *data* via cchardet.

    :param data: payload to inspect.
    :param safe: when True, return None on detection failure instead of
        raising.
    :returns: cchardet's result dict, or None in safe mode on failure.

    Fixes: the bare ``except:`` is narrowed to ``except Exception`` so that
    safe mode no longer swallows SystemExit / KeyboardInterrupt.
    """
    try:
        return cchardet.detect(data)
    except Exception:
        if safe:
            return None
        raise
def fetch_html_with_response(self, response):
    """Store *response* and its script-stripped HTML body on the instance."""
    detected = cchardet.detect(response.content)['encoding']
    response.encoding = detected
    body = response.text
    self.response = response
    # Built for ad handling: remove every <script>...</script> element.
    self.html_body = re.sub('<script.*?<\/script>', '', body)
def test_detect_ru_koi8r(self):
    """Detection of a KOI8-R-encoded Russian sample."""
    expected = "KOI8-R"
    sample_path = r"testdata/ru/KOI8-R/wikitop_ru_KOI8-R.txt"
    with open(sample_path, 'rb') as sample:
        payload = sample.read()
    result = cchardet.detect(payload)
    print(result)
    eq_(expected.lower(), result['encoding'].lower())
def _file_encoding(file_name):
    """Return the cchardet-guessed encoding of *file_name*.

    Wraps I/O failures in the project's Error exception.
    """
    try:
        with open(file_name, 'rb') as handle:
            guess = cchardet.detect(handle.read())
    except IOError as e:
        raise Error(e)
    return guess.get('encoding')
def guess_encoding(self):
    """
    Makes an expensive guess of the charset with the chardet library
    """
    # TODO: would it be faster to look only in the first N thousand bytes?
    result = cchardet.detect(self.doc.source_data)
    name = result.get("encoding")
    if name:
        codec = webencodings.lookup(name)
        if codec:
            return codec.codec_info
def guess_encoding(filename_or_bytes):
    """Detect the encoding of a bytes blob, or of a file given by path."""
    if not isinstance(filename_or_bytes, bytes):
        # TODO: Use UniversalDetector to detect encoding incrementally (for large files)
        # http://chardet.readthedocs.org/en/latest/usage.html#example-using-the-detect-function
        with open(filename_or_bytes, 'rb') as f:
            return guess_encoding(f.read())
    return chardet.detect(filename_or_bytes)['encoding']
def test_ascii(self):
    """Pure-ASCII input must be detected as 'ascii'."""
    result = cchardet.detect(b'abcdefghijklmnopqrstuvwxyz')
    found = result['encoding'].lower()
    eq_('ascii', found, 'Expected %s, but got %s' % ('ascii', found))
def get_encoding(auto, raw):
    """
    Automatically detect character encoding.

    Arguments:
        auto (str): auto-detection of character encoding - can be either
            'chardet', 'cchardet', False, or True (the latter will pick the
            fastest available option)
        raw (bytes): array of bytes to detect from

    Returns:
        A string specifying the character encoding.

    NOTE(review): passing auto=False reaches ``auto.lower()`` below and
    raises AttributeError; and when auto=True but neither library imports,
    the function returns None — confirm callers handle both.
    """
    if auto is True:
        # Prefer cchardet (faster); fall back to chardet; give up with None.
        try:
            import cchardet as chardet
        except ImportError:
            try:
                import chardet
            except ImportError:
                logger.debug(
                    "chardet or cchardet is recommended for automatic"
                    " detection of character encodings. Instead trying some"
                    " common encodings."
                )
                return None
            else:
                logger.debug("get_encoding Using chardet")
                method = "chardet"
        else:
            logger.debug("get_encoding Using cchardet")
            method = "cchardet"
    elif auto.lower() == "chardet":
        import chardet
        logger.debug("get_encoding Using chardet")
        method = "chardet"
    elif auto.lower() == "cchardet":
        # cchardet exposes the same detect() API, so alias it to chardet.
        import cchardet as chardet
        logger.debug("get_encoding Using cchardet")
        method = "cchardet"
    result = chardet.detect(raw)
    logger.debug(
        "{} method detected encoding of {} at confidence {}".format(
            method, result["encoding"], result["confidence"]
        )
    )
    return result["encoding"]
def quick_detect_encoding(string):
    """
    Tries to detect the encoding of the passed string.

    Uses cchardet. Fallbacks to detect_encoding.
    """
    assert isinstance(string, bytes)
    try:
        guess = cchardet.detect(string)
        if guess:
            return guess.get('encoding') or detect_encoding(string)
    except Exception:
        pass
    return detect_encoding(string)
def set_response_encoding(request):
    """Set the encoding if it isn't set already. Use cchardet for added
    performance.

    :param request: a requests ``Response`` object (despite the parameter
        name); its ``encoding`` attribute may be adjusted in place.
        ``None``/falsy input is a no-op.
    """
    if request:
        # If the encoding is iso-8859-1, switch it to cp1252 (a superset)
        if request.encoding == 'ISO-8859-1':
            request.encoding = 'cp1252'
        if request.encoding is None:
            # Requests detects the encoding when the item is GET'ed using
            # HTTP headers, and then when r.text is accessed, if the encoding
            # hasn't been set by that point. By setting the encoding here, we
            # ensure that it's done by cchardet, if it hasn't been done with
            # HTTP headers. This way it is done before r.text is accessed
            # (which would do it with vanilla chardet). This is a big
            # performance boon, and can be removed once requests is upgraded
            if isinstance(request.content, text_type):
                # Content is already text: re-encode to bytes, since the
                # detector works on raw bytes.
                as_bytes = request.content.encode()
                request.encoding = chardet.detect(as_bytes)['encoding']
            else:
                request.encoding = chardet.detect(request.content)['encoding']
def decode_response(response, chunk_size=65536):
    """Read the first chunk of server response and decode it"""
    guessed_encoding = chardet.detect(response.content[:chunk_size])['encoding']
    LOGGER.debug('response/guessed encoding: %s / %s', response.encoding, guessed_encoding)
    if guessed_encoding is None:
        # Nothing detected: defer to requests' own decoding.
        return response.text
    try:
        return response.content.decode(guessed_encoding)
    except UnicodeDecodeError:
        # The guess was wrong; defer to requests' own decoding.
        return response.text
def _get_encoded_text(self):
    """Return ``(encoding, decoded_text)`` for self.text, or ``(None, '')``.

    Tries the cchardet guess first, then common fallbacks.

    Fixes: the original fallback list spelled 'windows-1252' with a
    non-breaking hyphen (U+2011), an unknown codec name that always raised
    LookupError — that fallback could never succeed.  The bare ``except``
    is also narrowed to ``except Exception``.
    """
    if self.successfully_read:
        encoding = cchardet.detect(self.text)['encoding']
        if encoding is None:
            encoding = "utf-8"
        if len(self.text) > 0:
            # We convert even if the text is detected to be UTF-8 so that, if
            # detection was wrong and conversion fails, the error is caught
            # here and the next candidate is tried.
            for enc in (encoding, 'utf-8', 'iso-8859-1', 'windows-1252'):
                try:
                    return enc, self.text.decode(enc)
                except Exception:
                    pass
    return None, ''
def guess_encoding(text, default='utf-8'):
    """Guess string encoding.

    Given a piece of text, apply character encoding detection to
    guess the appropriate encoding of the text.
    """
    result = chardet.detect(text)
    if not result:
        return default
    encoding = result.get('encoding')
    if encoding is None:
        return default
    encoding = encoding.lower().strip()
    # 'ascii' is too weak a claim to be useful; fall back to the default.
    return encoding if encoding != 'ascii' else default
def separete_file_sentences(self):
    """Split the file at self._file_path into sentences on self._sentences.

    The file's encoding is guessed with chardet, the text is re-read with
    that encoding, and non-empty NLTK sentence tokens are appended.
    """
    with open(self._file_path, "rb") as f:
        raw = f.read()
    guess = chardet.detect(raw)
    #print(guess)
    with codecs.open(self._file_path, "r", encoding=guess["encoding"]) as text_file:
        contents = text_file.read()
    for sentence in nltk.tokenize.sent_tokenize(contents):
        if len(sentence) > 0:
            self._sentences.append(sentence)
def infer(self):
    """https://github.com/frictionlessdata/datapackage-py#resource

    Infer missing descriptor fields (name, format, mediatype, encoding,
    schema, profile) and rebuild the resource.

    Fixes: cchardet can report ``encoding=None`` for undetectable sample
    bytes, which crashed on ``.lower()``; None now falls back to utf-8.
    """
    descriptor = deepcopy(self.__current_descriptor)

    # Blank -> Stop
    if self.__source_inspection.get('blank'):
        return descriptor

    # Name
    if not descriptor.get('name'):
        descriptor['name'] = self.__source_inspection['name']

    # Only for non inline/storage
    if not self.inline and not self.__storage:

        # Format
        if not descriptor.get('format'):
            descriptor['format'] = self.__source_inspection['format']

        # Mediatype
        if not descriptor.get('mediatype'):
            descriptor['mediatype'] = 'text/%s' % descriptor['format']

        # Encoding: sample a little over 1000 bytes of the raw stream.
        if not descriptor.get('encoding'):
            contents = b''
            with self.raw_iter(stream=True) as stream:
                for chunk in stream:
                    contents += chunk
                    if len(contents) > 1000:
                        break
            # None (undetectable) falls back to utf-8; ascii is widened too.
            encoding = (cchardet.detect(contents)['encoding'] or 'utf-8').lower()
            descriptor['encoding'] = 'utf-8' if encoding == 'ascii' else encoding

    # Schema
    if not descriptor.get('schema'):
        if self.tabular:
            descriptor['schema'] = self.__get_table().infer()

    # Profile
    if descriptor.get('profile') == config.DEFAULT_RESOURCE_PROFILE:
        if self.tabular:
            descriptor['profile'] = 'tabular-data-resource'

    # Save descriptor
    self.__current_descriptor = descriptor
    self.__build()

    return descriptor
def detect_encoding(bytesobject):
    """Read the first chunk of input and return its encoding"""
    # Fast path: data that validates as UTF-8 needs no statistical guessing.
    if isutf8(bytesobject):
        return 'UTF-8'
    guess = cchardet.detect(bytesobject)
    LOGGER.debug('guessed encoding: %s', guess['encoding'])
    # May be None when cchardet cannot decide; a full-response chardet
    # fallback could be added here if that proves too frequent.
    return guess['encoding']
def fetch(self, url: str) -> str:
    """Return the HTML at *url*, serving from storage when cached."""
    cached = self._storage.get(url)
    if cached is not None:
        logger.info(Fore.BLUE, 'Storage', f'Get<{url}>')
        return cached
    r = requests.get(url)
    body = r.content
    guess = cchardet.detect(body)
    # Fall back to UTF-8 when detection yields nothing.
    html = body.decode(guess['encoding'] or 'utf-8')
    logger.info(Fore.GREEN, 'Sent', f'{url} {len(html)} {r.status_code}')
    self._storage[url] = html
    logger.info(Fore.BLUE, 'Storage', f'Set<{url}>')
    return html
def _get_file_encoding(self, file_path):
    """Detect a file's encoding, requiring a minimum confidence level.

    Falls back to the class default when cchardet is less than 75% sure.

    Args:
        file_path (str): Path of the file.

    Returns:
        str: Encoding of the file, or self.INPUT_DEFAULT_ENCODING.
    """
    with open(file_path, 'rb') as handle:
        guess = cchardet.detect(handle.read())
    if guess['confidence'] > 0.75:
        return guess['encoding']
    return self.INPUT_DEFAULT_ENCODING
def download(self, url):
    """GET *url* with a desktop User-Agent and return the decoded body.

    Returns None for a None url or any non-200 status.

    Fixes: cchardet can return ``encoding=None`` for undetectable payloads,
    making ``bytes.decode(None)`` raise TypeError; fall back to UTF-8.
    """
    if url is None:
        return
    s = requests.Session()
    s.headers[
        'User-Agent'] = 'Mozilla / 5.0(Windows NT 10.0;WOW64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 63.0.3239.132Safari / 537.36'
    res = s.get(url)
    if res.status_code == 200:
        encoding = cchardet.detect(res.content)['encoding'] or 'utf-8'
        return res.content.decode(encoding)
    return None
def main():
    """Benchmark chardet against cchardet on a Shift-JIS sample file."""
    iterations = 100
    sample_path = r'tests/samples/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt'
    with open(sample_path, 'rb') as f:
        payload = f.read()

    # chardet: average wall time over all iterations, report calls/second.
    total = 0
    for _ in range(iterations):
        t0 = time.time()
        chardet.detect(payload)
        total += (time.time() - t0)
    print('chardet v%s:' % (chardet.__version__), 1 / (total / iterations), 'call(s)/s')

    # cchardet: identical measurement for comparison.
    total = 0
    for _ in range(iterations):
        t0 = time.time()
        cchardet.detect(payload)
        total += (time.time() - t0)
    print('cchardet v%s:' % (cchardet.__version__), 1 / (total / iterations), 'call(s)/s')
def search_page(self, query, language=None, num=None, start=0, pause=2):
    """Fetch one page of Google search results.

    :param query: Keyword
    :param language: Language (``hl`` query parameter); stripped from the
        URL when None
    :param num: requested results per page, or None for the default template
    :param start: result offset; > 0 selects the "next page" URL template
    :param pause: seconds to sleep before the request (rate limiting)
    :return: decoded HTML text, or None on any error (logged)

    NOTE(review): cchardet may report a None encoding for undetectable
    bodies; ``content.decode(None)`` raises TypeError, which the blanket
    except converts into a None return — confirm that is acceptable.
    """
    time.sleep(pause)
    domain = self.get_random_domain()
    # Pick the URL template matching the pagination/num arguments.
    if start > 0:
        url = URL_NEXT
        url = url.format(domain=domain, language=language, query=quote_plus(query), num=num, start=start)
    else:
        if num is None:
            url = URL_SEARCH
            url = url.format(domain=domain, language=language, query=quote_plus(query))
        else:
            url = URL_NUM
            url = url.format(domain=domain, language=language, query=quote_plus(query), num=num)
    if language is None:
        # Drop the unused language parameter the template rendered as None.
        url = url.replace('hl=None&', '')
    # Add headers
    headers = {'user-agent': self.get_random_user_agent()}
    try:
        # TLS verification is disabled below; silence urllib3's warning.
        requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
        r = requests.get(url=url, proxies=self.proxies, headers=headers, allow_redirects=False, verify=False, timeout=30)
        LOGGER.info(url)
        content = r.content
        charset = cchardet.detect(content)
        text = content.decode(charset['encoding'])
        return text
    except Exception as e:
        LOGGER.exception(e)
        return None
def csvencode(filename) :
    """Convert the CSV *filename* to UTF-8 via iconv (writing *_utf8.csv),
    then normalise its line endings in place.

    SECURITY: *filename* is interpolated into a shell command passed to
    os.system; a crafted file name can inject shell commands — consider
    subprocess.run with a list argument instead.
    NOTE(review): ``c`` is presumably cchardet imported as ``c`` — confirm.
    NOTE(review): the file is opened in text mode, so ``c.detect`` receives
    str, not bytes; that works on Python 2 but is likely wrong on Python 3.
    """
    #print("resp")
    #try:
    #out file name
    #splunk_path = os.environ['SPLUNK_HOME']
    #splunk_path = "/Applications/Splunk"
    # Derived output path: foo.csv -> foo_utf8.csv
    convfile = filename.replace(".csv","_utf8.csv")
    #data=pd.read_csv(filename)
    # for col in data.columns:
    #     try:
    #         if "Unnamed" not in col:
    #             data[col]=data[col].str.replace(",","")
    #     except:
    #         data[col]=data[col]
    #data.to_csv(tmpfile)
    #os.system('touch '+splunk_path+'/etc/apps/aiam-ml-core/lookups/worked')
    #os.remove(filename)
    with open(filename) as f:
        data=f.read()
    # Guess the source encoding; ISO-8859-1 is widened to CP1252 (a superset).
    enc=c.detect(data)['encoding'].upper()
    print('enc',enc)
    if 'ISO-8859-1' in enc:
        enc='CP1252'
    # Shell out to iconv to transcode into the new file.
    cmd='iconv -c -f '+enc+' -t utf-8 '+filename+' > '+convfile
    os.system(cmd)
    #reads in file
    with open(convfile, 'r+') as f:
        text = f.read()
        # #encodes in utf-8 and changes to Unix line flush
        #temp = text.encode('utf-8', 'strict')
        # Normalise CR to LF, then collapse doubled blank lines.
        temp_line = text.replace('\r', '\n')
        final = temp_line.replace('\n\n', '\n')
        # #removes prev file to overwrite
        # os.remove(tmpfile)
        # #writes out file
        # Rewrite the converted file in place.
        f.seek(0)
        f.write(final)
        f.truncate()
    print("File successfuly converted")
def decode_bytes(content):
    """Decode *content*: honour UTF-8/UTF-16 BOMs, try plain UTF-8, then a
    cchardet guess as a last resort."""
    try:
        if content.startswith(UTF8_HEADER):
            # utf-8-sig strips the BOM itself.
            return content.decode('utf-8-sig')
        if content.startswith(BIG_ENDIAN_HEADER):
            return content[len(BIG_ENDIAN_HEADER):].decode('utf-16-be')
        if content.startswith(LITTLE_ENDIAN_HEADER):
            return content[len(LITTLE_ENDIAN_HEADER):].decode('utf-16-le')
        return content.decode('utf-8')
    except Exception:
        guess = cchardet.detect(content)
        logger.info(
            f"Encoding detected to be {guess['encoding']} with confidence of {guess['confidence']}."
        )
        return content.decode(guess['encoding'])
def get_encoding(self):
    """Print the chardet-detected encoding for every file argument.

    Each positional argument may be a file or a directory.  Directories are
    walked recursively when the -r option is given; otherwise only their
    immediate files are inspected and sub-directories are skipped.
    """
    def report(file_path):
        # Read the whole file and print chardet's verdict next to its path.
        with open(file_path, "rb") as f:
            payload = f.read()
        print(file_path, chardet.detect(payload))

    for f_or_d in vars(self.args())['file']:
        # Positional argument is a file.
        if os.path.isfile(f_or_d):
            report(f_or_d)
        # Positional argument is a directory.
        elif os.path.isdir(f_or_d):
            # -r given: recurse into the directory tree.
            if self.args().recursion:
                for item in self.getallfiles(f_or_d):
                    report(item)
            # No -r: only handle files directly inside the directory.
            else:
                for entry in os.listdir(os.path.abspath(f_or_d)):
                    # Absolute path of the entry inside the input directory.
                    entry_path = os.path.join(os.path.abspath(f_or_d), entry)
                    if os.path.isfile(entry_path):
                        report(entry_path)
        else:
            print("no such file or directory")
            sys.exit()
def _get_encoding(self):
    """Work out the payload charset: Content-Type parameter first, the JSON
    default second, a chardet guess (with utf-8 fallback) last."""
    ctype = self.headers.get(hdrs.CONTENT_TYPE, '').lower()
    mtype, stype, _, params = helpers.parse_mimetype(ctype)
    declared = params.get('charset')
    if declared:
        return declared
    if mtype == 'application' and stype == 'json':
        # RFC 7159 states that the default encoding is UTF-8.
        return 'utf-8'
    return chardet.detect(self._content)['encoding'] or 'utf-8'
def GetHtml(url, Htype='get', data={}):
    """Fetch *url* via a cookie-carrying session and return the decoded body.

    :param url: target URL.
    :param Htype: HTTP verb, 'get' or 'post'.
    :param data: POST form payload (note: mutable default kept for
        backward compatibility; it is never mutated here).
    :returns: the response text, decoded per a cchardet guess.

    Fixes: the original left ``r = requests.session()`` commented out while
    still using ``r``, which raised NameError on every call; the session is
    now actually created.
    """
    # Use a session so cookies persist across requests.
    r = requests.session()
    print('session', r)
    print('cookies', r.cookies.get_dict())
    # Issue the request with the chosen method.
    if Htype == 'get':
        req = r.get(url)
    elif Htype == 'post':
        req = r.post(url, data)
    # Guess the payload encoding so req.text decodes correctly.
    codetype = cchardet.detect(req.content)
    req.encoding = codetype["encoding"]
    return req.text
def html2Unicode(bData):
    """Decode the byte string *bData* to text.

    Uses cchardet when available, falling back to chardet; without either
    library the bytes are assumed to be UTF-8.  Raises UnicodeError when the
    detector's confidence is 0.5 or lower.
    """
    try:
        import cchardet as chardet
    except ImportError:
        try:
            import chardet
        except ImportError:
            # No detector installed: assume UTF-8.
            return bData.decode('utf-8')
    guess = chardet.detect(bData)
    if guess['confidence'] > 0.5:
        return bData.decode(guess['encoding'], 'replace')
    raise UnicodeError('can not identify the encoding')
def get_html_soup(url, **kwargs):
    """GET *url* and return a BeautifulSoup of the body; errors are logged.

    A random User-Agent is injected unless the caller supplied one.  When
    the response declares no charset (or no content-type at all), the
    encoding is set from a cchardet guess before parsing.
    """
    headers = kwargs.get("headers", {})
    headers.setdefault('User-Agent', random.choice(config.HEADER.USER_AGENT))
    kwargs["headers"] = headers
    try:
        resp = requests.get(url, timeout=config.request_timeout, **kwargs)
        no_ctype = 'content-type' not in resp.headers
        no_charset = (not no_ctype) and 'charset' not in resp.headers['content-type']
        if no_ctype or no_charset:
            # html.encoding = config.encoding
            resp.encoding = cchardet.detect(resp.content)['encoding']
        return BeautifulSoup(resp.text, "lxml")
    except Exception as e:
        msg = u"get [%s] : %s" % (url, e)
        logger.get("error-log").error(msg)
def to_buffer(buffer_or_path):
    """Return ``(path, buffer)`` for *buffer_or_path*.

    A string is treated as a file path: the file's encoding is detected with
    chardet and a text-mode handle (caller-owned) is returned alongside the
    path.  Anything else is assumed to already be a buffer and is returned
    with ``path=None``.
    """
    if not isinstance(buffer_or_path, str):
        return None, buffer_or_path
    if not os.path.isfile(buffer_or_path):
        raise FileNotFoundError(
            f"no file found at given path: {buffer_or_path}")
    with open(buffer_or_path, "rb") as f:
        detected = chardet.detect(f.read())
    # Re-open in text mode with the detected encoding; the caller owns and
    # must close this handle.
    handle = open(buffer_or_path, encoding=detected["encoding"], errors="ignore")
    return buffer_or_path, handle
def get_html_by_requests(url, headers, timeout=15):
    """GET *url* and return its body decoded with a cchardet guess.

    :param url: target URL.
    :param headers: request headers dict.
    :param timeout: request timeout in seconds.
    :return: decoded text, or None on any failure (logged).

    Fixes: cchardet may report a None encoding for binary/undetectable
    payloads, which previously raised TypeError inside the blanket except;
    we now fall back to UTF-8.
    """
    try:
        response = requests.get(url=url, headers=headers, verify=False, timeout=timeout)
        response.raise_for_status()
        content = response.content
        charset = cchardet.detect(content)
        return content.decode(charset['encoding'] or 'utf-8')
    except Exception as e:
        LOGGER.exception(e)
        return None
def decode_string(string, encoding):
    """Decode *string* using *encoding*, with chardet and force_string_decode
    fallbacks.

    Fixes: when the chardet module is unavailable (falsy) and the first
    decode failed, the original fell off the ``if chardet:`` branch with
    ``value`` unbound and raised NameError; that path now falls back to
    force_string_decode as the except-branch already did.
    """
    try:
        value = string.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        if chardet:
            enc = chardet.detect(string)
            try:
                # Perfect-confidence ASCII guesses are treated as suspect;
                # decode as ASCII with errors ignored instead.
                if not (enc['confidence'] == 1 and enc['encoding'] == 'ascii'):
                    value = string.decode(enc['encoding'])
                else:
                    value = string.decode('ascii', 'ignore')
            except UnicodeDecodeError:
                value = force_string_decode(string)
        else:
            value = force_string_decode(string)
    return value
async def main(): ssa = init_ssa() #print(ssa) print('media library path:', PATH) success = [] fail = [] print('finding and converting started...') for p, w, f in os.walk(PATH): for file_name in f: if file_name[-4:].lower() == '.srt': print('processing %s' % os.path.join(p, file_name)) try: with open(os.path.join(p, file_name), 'rb') as srt_file: srt_raw = srt_file.read() encoding = cchardet.detect(srt_raw) srt = srt_raw.decode(encoding['encoding'], errors=DECODE_ERRORS) ssa_file = codecs.open(os.path.join( p, os.path.splitext(file_name)[0] + SUFFIX + '.ssa'), 'w', encoding='utf-8') #ssa_file.write(convert_ssa(parse(smi),LANG)) #convert(srt) ssa_file.write(convert(srt)) success.append(file_name) if REMOVE_OPTION: os.remove(os.path.join(p, file_name)) except: fail.append(file_name) srt_list = list(set(success) | set(fail)) print('\nfound .srt subtitles:') for srt in srt_list: print(srt) if len(success) > 0: print('\nworked .srt subtitles:') for srt in success: print(srt) if len(fail) > 0: print('\nfailed .srt subtitles:') for srt in fail: print(srt) if REMOVE_OPTION: print('\nworked srt files are removed due to removal option')
def decode_string(string: bytes, encoding: typing.Optional[str]) -> str:
    """Best-effort decode of *string*; always returns something.

    Order of attempts: the *encoding* hint, then a chardet guess (when the
    chardet module is available), then latin-1/utf-8, and finally ASCII with
    replacement or ignoring, so no input can fail.

    Args:
        string (bytes): The bytes string to be decoded.
        encoding (str, optional): An optional encoding hint.

    Returns:
        str: A decoded form of the string.
    """
    if string == b'':
        return ''

    # 1. Honour the caller's hint when it works.
    if encoding is not None:
        try:
            return string.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            pass

    # 2. With chardet available, trust its guess unless it is useless
    #    (missing fields) or trivially perfect-confidence ASCII.
    if chardet:
        guess = chardet.detect(string)
        useless = guess['confidence'] is None or guess['encoding'] is None
        trivial_ascii = guess['confidence'] == 1 and guess['encoding'] == 'ascii'
        if useless or trivial_ascii:
            return string.decode('ascii', 'replace')
        return string.decode(guess['encoding'], 'replace')

    # 3. No chardet: try common encodings, then give up to lossy ASCII.
    for candidate in ('latin1', 'utf-8'):
        try:
            return string.decode(candidate)
        except UnicodeDecodeError:
            continue
    return string.decode('ascii', 'ignore')
def get_text_v3(address, stream, mapped=False, decode=True):
    """faster way to extract strings from mdf versions 2 and 3 TextBlock

    Parameters
    ----------
    address : int
        TextBlock address
    stream : handle
        file IO handle (a memory-mapped buffer indexed directly when
        ``mapped`` is True, otherwise a seekable file object)
    mapped : bool
        select the memory-mapped access path
    decode : bool
        when True return str, otherwise the raw bytes

    Returns
    -------
    text : str
        unicode string
    """
    # Address 0 is the "no text" sentinel.
    if address == 0:
        return "" if decode else b""
    if mapped:
        # Memory-mapped path: slice the buffer directly.
        block_id = stream[address : address + 2]
        if block_id != b"TX":
            return "" if decode else b""
        (size,) = UINT16_uf(stream, address + 2)
        # Payload follows the 4-byte header; stop at the first NUL and strip
        # trailing whitespace/NUL padding.
        text_bytes = (
            stream[address : address + size][4:].split(b"\0")[0].rstrip(b" \r\t\n\0")
            if False else
            stream[address + 4 : address + size].split(b"\0")[0].rstrip(b" \r\t\n\0")
        )
    else:
        # File-object path: seek to the block, read the header then payload.
        stream.seek(address)
        block_id = stream.read(2)
        if block_id != b"TX":
            return "" if decode else b""
        size = UINT16_u(stream.read(2))[0] - 4
        text_bytes = stream.read(size).split(b"\0")[0].rstrip(b" \r\t\n\0")
    if decode:
        try:
            text = text_bytes.decode("latin-1")
        except UnicodeDecodeError:
            # NOTE(review): latin-1 maps every byte, so this branch should be
            # unreachable; `detect` is presumably chardet/cchardet — confirm.
            try:
                encoding = detect(text_bytes)["encoding"]
                text = text_bytes.decode(encoding, "ignore")
            except:
                text = "<!text_decode_error>"
    else:
        text = text_bytes

    return text
def requests_target_fetch(url):
    """GET *url* with a random User-Agent and return the decoded body.

    :param url: target URL.
    :return: decoded text, or None on any failure (logged).

    Fixes: a None encoding from cchardet (binary/undetectable content)
    previously raised TypeError inside the blanket except; fall back to
    UTF-8 instead.
    """
    try:
        headers = {'user-agent': get_random_user_agent()}
        response = requests.get(url=url, headers=headers, verify=False)
        response.raise_for_status()
        content = response.content
        charset = cchardet.detect(content)
        return content.decode(charset['encoding'] or 'utf-8')
    except Exception as e:
        LOGGER.exception(e)
        return None
def detect_encoding(sample, encoding=None):
    """Detect encoding of a byte string sample.

    An explicit *encoding* bypasses detection entirely.  Low-confidence or
    ascii results are replaced by the configured default encoding.
    """
    # Imported here to reduce tabulator's import time.
    from cchardet import detect
    if encoding is not None:
        return normalize_encoding(sample, encoding)
    guess = detect(sample)
    confidence = guess['confidence'] or 0
    detected = normalize_encoding(sample, guess['encoding'] or 'ascii')
    if confidence < config.ENCODING_CONFIDENCE:
        return config.DEFAULT_ENCODING
    if detected == 'ascii':
        return config.DEFAULT_ENCODING
    return detected
def convert_encoding(data):
    """Return ``(encoding, text)`` for *data*, or ``(None, '')`` when nothing
    decodes.

    Tries the cchardet guess first, then utf-8, iso-8859-1 and windows-1252.

    Fixes: the original fallback list spelled 'windows-1252' with a
    non-breaking hyphen (U+2011), an unknown codec name that always raised
    LookupError, so that fallback could never succeed; the bare ``except``
    is also narrowed to ``except Exception``.
    """
    encoding = cchardet.detect(data)['encoding']
    if encoding is None:
        encoding = "utf-8"
    if len(data) > 0:
        # We convert even if the text is detected to be UTF-8 so that, if
        # detection was wrong and conversion fails, the error is caught here
        # and the next candidate encoding is tried.
        for enc in (encoding, 'utf-8', 'iso-8859-1', 'windows-1252'):
            try:
                return enc, data.decode(enc)
            except Exception:
                pass
    return None, ''