def _OSdownload(SubId, SubCodec):
    log.debug("Download subtitle: %s" % SubId)
    time.sleep(6)
    if not OS_NoOp():
        return None
    try:
        Result = autosub.OPENSUBTITLESSERVER.DownloadSubtitles(autosub.OPENSUBTITLESTOKEN, [SubId])
    except:
        autosub.OPENSUBTITLESTOKEN = None
        log.error('Error from Opensubtitles download API. DownloadId is: %s' % SubId)
        return None
    if Result['status'] == '200 OK':
        try:
            CompressedData = Result['data'][0]['data'].decode('base64')
        except Exception as error:
            log.error('Error decompressing sub from opensubtitles. Message is: %s' % error)
            return None
        if not CompressedData:
            log.debug('No data returned from DownloadSubtitles API call. Skipping this one.')
            return None
        SubDataBytes = gzip.GzipFile(fileobj=io.BytesIO(CompressedData)).read()
        # Opensubtitles does not distinguish between UTF-8 and UTF-8-SIG, so we verify the encoding with chardet.
        # If Opensubtitles does not know the encoding, we assume windows-1252 is used.
        if SubCodec:
            if 'UTF' in SubCodec.upper() or SubCodec == 'Unknown':
                SubCodec = chardet.detect(SubDataBytes)['encoding']
            elif '1252' in SubCodec:
                SubCodec = u'cp1252'
            elif '850' in SubCodec:
                SubCodec = u'cp850'
        else:
            SubCodec = chardet.detect(SubDataBytes)['encoding']
            if 'UTF' not in SubCodec.upper():
                SubCodec = u'cp1252'
        try:
            SubData = SubDataBytes.decode(SubCodec, errors='replace')
        except Exception as error:
            log.error('Error decoding sub from opensubtitles. Message is: %s' % error)
            return None
        return SubData
    else:
        if Result['status'][:3] == '406':
            autosub.OPENSUBTITLESTOKEN = None
        log.error('Message : %s' % Result['status'])
        return None
def _getzip(Session, url):
    # returns the subtitle text as a unicode string, or None
    try:
        Result = Session.get(url, verify=autosub.CERTIFICATEPATH, timeout=22)
    except:
        log.debug("Zip file at %s couldn't be retrieved" % url)
        return None
    try:
        Archive = zipfile.ZipFile(io.BytesIO(Result.content))
    except Exception as error:
        log.debug("Expected a zip file but got error for link %s" % url)
        log.debug("%s is likely a dead link" % url)
        return None
    nameList = Archive.namelist()
    for name in nameList:
        # sometimes .nfo files are in the zip container
        if name.lower().endswith('srt'):
            try:
                Data = Archive.read(name)
                if Data.startswith(codecs.BOM_UTF8):
                    SubData = unicode(Data[3:], 'UTF-8')
                else:
                    Codec = chardet.detect(Data)['encoding']
                    SubData = unicode(Data, Codec)
                if SubData:
                    return SubData
            except Exception as error:
                log.error(error.message)
    log.debug("No subtitle file was found in the zip archive for %s" % url)
    return None
def follow_meta_redirects(url, redirects, **kwargs):
    urls_history[url] = True

    if redirects < 0:
        raise ValueError("Cannot resolve real url with max_redirects=%s" % max_redirects)
    redirects -= 1

    with closing(s.get(url, allow_redirects=True, stream=True, **kwargs)) as resp:
        if resp.history:
            for r in resp.history:
                urls_history[r.url] = True

        head, real_url = next(resp.iter_content(chunk_size)), resp.url

        encoding = resp.encoding
        if encoding is None:
            # detect encoding
            encoding = chardet.detect(head)['encoding']

        try:
            head = str(head, encoding, errors='replace')
        except (LookupError, TypeError):
            head = str(head, errors='replace')

        # Removing html blocks in <noscript></noscript>
        if remove_noscript:
            head = re.sub(r'<noscript[^>]*>.*</noscript[^>]*>', '', head, flags=re.DOTALL)

        redirect = None
        if 'refresh' in resp.headers:
            redirect = resp.headers['refresh']
        elif not redirect:
            for tag in get_tags(head, 'meta'):
                if tag.get('http-equiv', '') == 'refresh':
                    redirect = tag.get('content', None)

        if redirect:
            m = re.search(r'url\s*=\s*([^\s;]+)', redirect, re.I)
            if m:
                m = m.group(1)
                # fixing case url='#url here#'
                if m.startswith(('"', "'")) and m.endswith(('"', "'")):
                    m = m[1:-1]
                real_url = follow_meta_redirects(urljoin(resp.url, m), redirects)

    urls_history[real_url] = True

    return real_url
def ensure_text_type(value):
    try:
        return value.decode('utf-8')
    except AttributeError:
        # AttributeError: '<>' object has no attribute 'decode'
        # In this case assume already text_type and do nothing
        return value
    except UnicodeDecodeError:
        from requests.packages.chardet import detect
        encoding = detect(value).get('encoding') or 'utf-8'
        return value.decode(encoding)
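# A minimal usage sketch of ensure_text_type above (sample values are illustrative,
# not from the source; the chardet fallback path assumes a requests release that
# still vendors chardet under requests.packages):
print(ensure_text_type(u'already text'))            # str input returns unchanged via the AttributeError branch
print(ensure_text_type('编码测试'.encode('utf-8')))   # valid UTF-8 bytes are decoded directly
# Bytes in another encoding (e.g. cp1252 or GBK) raise UnicodeDecodeError and fall
# through to chardet-based detection before decoding.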
def host_check(host_ip):
    host, ip = host_ip
    schemes = ["http://", "https://"]
    for scheme in schemes:
        url = scheme + ip
        headers = {
            'Host': host.strip(),
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
        }
        try:
            r = requests.session()
            requests.packages.urllib3.disable_warnings()
            res = r.get(url, verify=False, headers=headers, allow_redirects=False, timeout=30)
            charset = chardet.detect(res.content)["encoding"]
            res.encoding = charset
            title = ""
            try:
                title = re.search('<title>(.*)</title>', res.text).group(1)  # get the page title
            except Exception as ex:
                title = "Failed to get title!"
            info = '%s,%s,%s,Packet size:%d,Title:%s' % (ip, host, scheme + host, len(res.text), title)
            if lock.acquire():
                try:
                    success_list.append(info)
                    pbar.echo(info)
                    pbar.update_suc()
                    with open('hosts_ok.txt', 'a+', encoding="utf-8") as f:
                        print(info + "\n")
                        f.write(info + "\n")
                finally:
                    lock.release()
        except Exception as ex:
            if lock.acquire():
                try:
                    # print ex.message
                    # logging.exception(ex)
                    error = "%s,%s,%s,Access failed!" % (ip, host, scheme + host)
                    pbar.echo(error)
                finally:
                    lock.release()
        finally:
            pbar.update()
def get_valid_response(self, response):
    html_content = response.body
    content_type = chardet.detect(html_content)
    print(content_type['encoding'])
    if content_type['encoding'] != "UTF-8":
        # normalise non-UTF-8 pages to UTF-8 bytes
        html_content = html_content.decode(content_type['encoding'])
        html_content = html_content.encode("utf-8")
    # open("qunima.html", "wb").write(html_content)
    html_content = html_content.decode("utf-8")
    html_content = html_content.replace("\n", "")
    return html_content
def follow_meta_redirects(url, redirects, **kwargs):
    urls_history[url] = True

    if redirects < 0:
        raise ValueError("Cannot resolve real url with max_redirects=%s" % max_redirects)
    redirects -= 1

    with closing(s.get(url, allow_redirects=True, stream=True, **kwargs)) as resp:
        if resp.history:
            for r in resp.history:
                urls_history[r.url] = True

        head, real_url = resp.iter_content(chunk_size).next(), resp.url

        encoding = resp.encoding
        if encoding is None:
            # detect encoding
            encoding = chardet.detect(head)['encoding']

        try:
            head = unicode(head, encoding, errors='replace')
        except (LookupError, TypeError):
            head = unicode(head, errors='replace')

        # Removing html blocks in <noscript></noscript>
        if remove_noscript:
            head = re.sub('<noscript[^>]*>.*</noscript[^>]*>', '', head, flags=re.DOTALL)

        redirect = None
        if 'refresh' in resp.headers:
            redirect = resp.headers['refresh']
        elif not redirect:
            for tag in get_tags(head, 'meta'):
                if tag.get('http-equiv', '') == 'refresh':
                    redirect = tag.get('content', None)

        if redirect:
            m = re.search('url\s*=\s*([^\s;]+)', redirect, re.I)
            if m:
                m = m.group(1)
                # fixing case url='#url here#'
                if m.startswith(('"', "'")) and m.endswith(('"', "'")):
                    m = m[1:-1]
                real_url = follow_meta_redirects(urljoin(resp.url, m), redirects)

    urls_history[real_url] = True

    return real_url
def _ensure_text_type(value):
    # copying here from conda/common/compat.py to avoid the import
    try:
        return value.decode('utf-8')
    except AttributeError:
        # AttributeError: '<>' object has no attribute 'decode'
        # In this case assume already text_type and do nothing
        return value
    except UnicodeDecodeError:
        try:
            from requests.packages.chardet import detect
        except ImportError:  # pragma: no cover
            from pip._vendor.requests.packages.chardet import detect
        encoding = detect(value).get('encoding') or 'utf-8'
        return value.decode(encoding)
def _open_resource(xml_resource, detect_encoding=False):
    if isinstance(xml_resource, basestring):
        if detect_encoding:
            encoding = chardet.detect(xml_resource)['encoding']
            if encoding in ('UTF-16LE', 'UTF-16BE'):
                xml_resource = xml_resource.decode('UTF-16').encode('utf-8')
        try:
            # https://github.com/IATI/iati-datastore/issues/160
            xml_resource_is_path = os.path.exists(xml_resource)
        except TypeError:
            xml_resource_is_path = False
        if xml_resource_is_path:
            # https://bugzilla.redhat.com/show_bug.cgi?id=874546
            f = open(xml_resource)
            lines = f.read()
            xmlfile = StringIO(lines)
        else:
            xmlfile = StringIO(xml_resource)
    else:
        # so it's an XML literal, probably from a test. It shouldn't be
        # big enough that a round trip through the serializer is a problem
        xmlfile = StringIO(ET.tostring(xml_resource))
    return xmlfile
def to_unicode(content):
    from requests.packages import chardet
    encode_name = chardet.detect(content).get('encoding')
    return unicode(content, encode_name) if encode_name else ''
# -*- coding: utf-8 -*-
# Time: 2019/6/7 17:20
# Author: laugc
# Email: [email protected]
# File: py70_chardet.py

from requests.packages import chardet

"""
Encoding detection
"""

print(chardet.detect(b'Hello, world!'))

data = '离离原上草,一岁一枯荣'.encode('gbk')
print(chardet.detect(data))

data1 = '离离原上草,一岁一枯荣'.encode('utf-8')
print(chardet.detect(data1))

data2 = '最新の主要ニュース'.encode('euc-jp')
print(chardet.detect(data2))
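# A small follow-up sketch (not from the source): detect() also returns a confidence
# score that the prints above ignore; the GBK sample is typically reported as GB2312.
result = chardet.detect(data)
if result['confidence'] > 0.5:
    print(data.decode(result['encoding'], errors='replace'))  # round-trip the bytes with the detected encoding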
def set_output(self, text, charset=None):
    #: TODO: MUST do nothing on unicode python 2.7
    if not charset:
        charset = chardet.detect(text)['encoding']
    self.output = text.decode(charset)
from requests.packages import chardet

with open('test', 'rb') as f:
    print(chardet.detect(f.read()))
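# For larger files, chardet also offers an incremental detector, so the whole file
# does not have to be read into memory just for detection. A minimal sketch that
# reuses the 'test' filename from the snippet above (assumes chardet is importable
# on its own; older requests releases bundle the same package):
from chardet.universaldetector import UniversalDetector

detector = UniversalDetector()
with open('test', 'rb') as f:
    for chunk in iter(lambda: f.read(4096), b''):
        detector.feed(chunk)      # feed the file in 4 KiB chunks
        if detector.done:         # stop as soon as the detector is confident
            break
detector.close()
print(detector.result)            # e.g. {'encoding': ..., 'confidence': ..., 'language': ...}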
def apparent_encoding(self):
    """The apparent encoding, provided by the chardet library"""
    return chardet.detect(self.content)['encoding']
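# A minimal usage sketch (not from the source; the URL is illustrative). The property
# above matches requests' Response.apparent_encoding, so the same detection is
# reachable through the public API:
import requests

resp = requests.get('https://example.com')
print(resp.apparent_encoding)             # encoding guessed from the raw body bytes
resp.encoding = resp.apparent_encoding    # make .text decode with the detected encoding
print(resp.text[:80])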