Beispiel #1
0
    def _unpack_content(self, raw_data):
        """Extract data from the body of a REST response object.

        :param raw_data: Data to be processed. This could be a
         requests.Response object, in which case the json content will be
         be returned.
        """
        if raw_data and isinstance(raw_data, bytes):
            data = raw_data.decode(
                encoding=chardet.detect(raw_data)['encoding'])
        else:
            data = raw_data

        if hasattr(raw_data, 'content'):
            if not raw_data.content:
                return None

            if isinstance(raw_data.content, bytes):
                encoding = chardet.detect(raw_data.content)["encoding"]
                data = raw_data.content.decode(encoding=encoding)
            else:
                data = raw_data.content
            try:
                return json.loads(data)
            except (ValueError, TypeError):
                return data

        return data
Beispiel #2
0
def transferencode():
	"""Read one scraped HTML page from the repo directory, print its
	chardet-detected encoding, and write a re-encoded copy to utf.html.

	Python 2 only (print statement). NOTE(review): despite the output
	filename, the content is encoded to gb2312, not utf-8 — confirm intent.
	"""
	# The base path comes from the project-level `config` module.
	with open(config.http_repo_path+'---awtrc-ict-ac-cn-index-php-mact=News,cntnt01,detail,0&cntnt01articleid=217&cntnt01detailtemplate=custom_detail&cntnt01lang=zh_CN&cntnt01returnid=79.html') as f:
		html=f.read()
	print chardet.detect(html)
	html=html.encode('gb2312')
	with open('utf.html','w') as f:
		f.write(html)
Beispiel #3
0
def encode():
    """Demo: print chardet's detection for a byte-string literal before and
    after transcoding it to UTF-8 (Python 2: print statement, str.decode).
    """
    x = '天气'
    print chardet.detect(x), type(x), x
    # Decode GB2312 bytes to unicode, then re-encode as UTF-8 bytes.
    # NOTE(review): assumes the source file/literal is GB2312-encoded; if
    # the file is saved as UTF-8 this decode garbles or raises.
    y = x.decode('GB2312').encode('utf-8')
    # y = xx.decode('').encode('utf-8')
    # z = x.encode(encoding='utf-8')
    print chardet.detect(y), type(y), y
Beispiel #4
0
 def dotask(self):
     """Ping self.host three times and print the decoded stdout/stderr.

     Builds a platform-appropriate ping command (-n on Windows, -c
     elsewhere), runs it via subprocess, and stores retcode/output/outerr
     on self. Python 2 only (print statement, unicode, reload(sys)).
     """
     self.cmd = 'ping'
     if sys.platform == "win32":
         self.cmd += ' -n 3 '
     else:
         self.cmd += ' -c 3 '
     self.cmd += self.host
     # os.popen(self.cmd, 'r', self.result)
     print "default coding type: {0}".format(sys.getdefaultencoding())
     # HACK: reload(sys) restores setdefaultencoding (removed by site.py);
     # globally changing the default encoding is discouraged.
     reload(sys)
     sys.setdefaultencoding('utf-8')
     args = shlex.split(self.cmd)
     p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     (tmpoutput, tmpouterr) = p.communicate()
     self.retcode = p.returncode
     self.output = "".join(tmpoutput)
     self.outerr = "".join(tmpouterr)
     print "return code: %d" % self.retcode
     print "stdout:"
     if self.output:
         # Console output may be in a locale codepage (e.g. GBK on Chinese
         # Windows): decode with chardet's guess when not already unicode.
         if not isinstance(self.output, unicode):
             self.output = self.output.decode(chardet.detect(self.output)['encoding'])
         print self.output
     print "stderr:"
     if self.outerr:
         if not isinstance(self.outerr, unicode):
             self.outerr = self.outerr.decode(chardet.detect(self.outerr)['encoding'])
         print self.outerr
     # NOTE(review): the triple-quote below opens a string literal that never
     # closes within this chunk, so get_str_charset and __init__ below appear
     # to be dead (stringified) code — likely scrape/paste residue. Left
     # byte-identical; confirm against the original source before relying on it.
     '''
    def get_str_charset(self, the_str):
        need_len = 400
        test_str = the_str[:need_len]
        test_str = test_str.decode('ascii', 'ignore').lower()
        cs       = 'charset='
        if cs in test_str:
            idx_start = test_str.find(cs) + len(cs)
            idx_end   = idx_start + 16
            st = test_str[idx_start:idx_end].split('\'')[0].split('\"')[0]
            if len(st) > 0:
                return st

        if len(the_str) < need_len:
            charset_info = chardet.detect(the_str)
        else:
            charset_info = chardet.detect(the_str[:need_len])
            while charset_info['confidence'] < 0.9:
                need_len += 400
                charset_info = chardet.detect(the_str[:need_len])
                if need_len > len(the_str):
                    break
        if charset_info['confidence'] >= 0.9:
            return charset_info['encoding']
        else:
            return 'utf-8'
    def __init__(self, extractor='DefaultExtractor', **kwargs):
        if kwargs.get('url'):
            request   = urllib2.urlopen(kwargs['url'])
            self.data = request.read()
            encoding  = request.headers['content-type'].lower().split('charset=')[-1]
            if encoding.lower() == 'text/html':
                encoding = chardet.detect(self.data)['encoding']
            self.data = unicode(self.data, encoding)
        elif kwargs.get('html'):
            self.data = kwargs['html']
            if not isinstance(self.data, unicode):
                self.data = unicode(self.data, chardet.detect(self.data)['encoding'])
        else:
            raise Exception('No text or url provided')

        try:
            # make it thread-safe
            if threading.activeCount() > 1:
                if jpype.isThreadAttachedToJVM() == False:
                    jpype.attachThreadToJVM()
            lock.acquire()
            
            self.extractor = jpype.JClass(
                "de.l3s.boilerpipe.extractors."+extractor).INSTANCE
        finally:
            lock.release()
    
        reader = StringReader(self.data)
        self.source = BoilerpipeSAXInput(InputSource(reader)).getTextDocument()
        self.extractor.process(self.source)
 def writeData(self):
     """Build one multi-row INSERT statement from self.prefix, self.fields
     and self.values, quoting string values after chardet-based decoding.

     Python 2 only (unicode, `except Exception, e`). NOTE(review): values
     are escaped only for single quotes — this string-built SQL is
     injection-prone; parameterized queries would be safer.
     """
     query = self.prefix
     for i in range(0,len(self.values)):
         if i > 0:
             query += ", "
         query += "("
         for j in range(0,len(self.fields)):
             if j > 0:
                 query += ", "
             if not isinstance(self.values[i][j], (str, unicode)):  # Is not string
                 query += "'" + str(self.values[i][j]) + "'"                
             elif self.values[i][j] == "NULL":
                 query += "NULL"
             elif self.values[i][j][0:12] == "GeomFromText":
                 # Spatial literal: pass through unquoted.
                 query += self.values[i][j]
             else:
                 try:
                     if self.values[i][j] == '':
                         value = u''
                     else:
                         # Guess the byte-string's charset, decode, escape quotes.
                         charset = chardet.detect(self.values[i][j])['encoding']
                         value = unicode(self.values[i][j].decode(charset).replace(u"'",u"\\'"))
                     query += u"'" + value + u"'"
                 except Exception, e:
                     # Dump diagnostics and abort the whole run on any failure.
                     print query
                     print self.values[i]
                     print chardet.detect(self.values[i][j])
                     print e
                     sys.exit(1)
         query += ")"
Beispiel #8
0
def SavePhoto(aid, pid, purl, ptitle):
  """Download photo `purl` into data/<aid>/<pid>.jpg (3 retries) and write
  its title to <pid>.txt; updates the module-global success/fail counters.

  Python 2 only (print statement, urllib2).
  """
  global g_download_success_count
  global g_download_fail_count
  global g_download_fail_list
  dirname = os.path.join("data" , aid)
  if not os.path.exists(dirname):
    os.makedirs(dirname)
  download_success = False
  # Retry the download up to 3 times with a 3-second backoff.
  for i in range(0,3):
    try:
      print purl
      photo_content = urllib2.urlopen(purl).read()
      download_success = True
      break
    except Exception: 
      print "download fail, sleep 3 seconds"
      time.sleep(3) 
  if not download_success:
    g_download_fail_count += 1
    g_download_fail_list.append(purl)
    return
  # NOTE(review): dirname is decoded to unicode for the jpg path but used
  # as a byte string for the txt path; also the jpg is written in text
  # mode 'w' rather than 'wb' — confirm on Windows.
  filename = os.path.join(dirname.decode("utf-8"), pid+ ".jpg") 
  f = open(filename, 'w')
  f.write(photo_content)
  f.close()
  filename = os.path.join(dirname, pid + '.txt')
  f = open(filename, 'w')
  print ptitle
  print chardet.detect(ptitle)
  f.write(ptitle)
  f.close()
  g_download_success_count +=1
Beispiel #9
0
def html_to_txt():
    """Merge multiple HTML files into one txt file, converting the encoding
    to utf-8 (ascii is left as-is).

    Python 2 only (print statement, unicode()). NOTE(review): when a file
    is empty the `continue` below neither closes fp nor advances `start`,
    so the loop spins forever on that file.
    """
    ft = open(YAHOO_TXT, 'w')
    start = 1
    while 1:
        filename = YAHOO_DIR+ str(start) + '.html'
        if os.path.isfile(filename):
            fp = open(filename, 'r')
            htmltxt = ''.join(fp.readlines())
            if not htmltxt or not len(htmltxt):
                continue
            fp.close()
            
            codedetect = chardet.detect(htmltxt)["encoding"]				# encoding detected before the conversion
            print codedetect
	    if not codedetect in ['utf-8', 'ascii']:
	        htmltxt = unicode(htmltxt, codedetect).encode('utf-8')
	        codedetect = chardet.detect(htmltxt)["encoding"]			# encoding detected after the conversion
                print 'change', codedetect
            
            ft.write(html2txt(htmltxt))
            print 'Success change html to txt %s' % start
            start += 1
        else:
            break
    ft.close()
Beispiel #10
0
def decode_by_charset(content):
    # type: (bytes) -> Text
    r"""
    Detect the charset encoding of a string and decodes to unicode strings.

    >>> decode_by_charset(u'\u4e2d\u6587'.encode('UTF-8'))
    '\u4e2d\u6587'
    >>> decode_by_charset(u'\u4e2d\u6587'.encode('HZ-GB-2312'))
    '\u4e2d\u6587'
    """
    encoding = chardet.detect(content)['encoding']
    # Sometimes, the content is well encoded but the last few bytes. This is
    # common in the files downloaded by old versions of OSD Lyrics. In this
    # case, chardet may fail to determine what the encoding it is. So we take
    # half of the content of it and try again.
    if not encoding and len(content) > DETECT_CHARSET_GUESS_MIN_LEN:
        logging.warning('Failed to detect encoding, try to decode a part of it')
        content_half = len(content) // 2
        slice_end = min(max(DETECT_CHARSET_GUESS_MIN_LEN, content_half), DETECT_CHARSET_GUESS_MAX_LEN)
        encoding = chardet.detect(content[:slice_end])['encoding']
        # BUG FIX: this second detection can also fail and return None, and
        # the original `'...' + encoding` concatenation then raised
        # TypeError. Lazy %s formatting renders None safely and is the
        # idiomatic logging style.
        logging.warning('guess encoding from part: %s', encoding)
    if not encoding:
        logging.warning('Failed to detect encoding, use utf-8 as fallback')
        encoding = 'utf-8'

    # When we take half of the content to determine the encoding, chardet may
    # think it be encoded with ascii, however the full content is probably
    # encoded with utf-8. As ascii is an subset of utf-8, decoding an ascii
    # string with utf-8 will always be right.
    if encoding == 'ascii':
        encoding = 'utf-8'
    return content.decode(encoding, 'replace')
Beispiel #11
0
def _try_decode_bytes_(raw_bytes: bytes) -> str:
    """helper function for decode_byte, try to decode the raw bytes

    :param raw_bytes: the bytes you get and want to decode to string
    :return: A decoded string
    """
    # Detect the encoding with only the first couple of bytes
    encoding_detect = chardet.detect(
        raw_bytes[:constants.MIN_ENCODING_DETECT])
    # get the encoding
    encoding_type = encoding_detect['encoding']

    if encoding_type is None:
        # Prefix was inconclusive; fall back to scanning all the bytes.
        encoding_detect = chardet.detect(raw_bytes)
        encoding_type = encoding_detect['encoding']
    if encoding_type is None:
        # BUG FIX: detection can fail outright and `bytes.decode(None)`
        # raised TypeError here. Fall back to UTF-8, the most common codec
        # and a superset of ASCII.
        encoding_type = 'utf-8'

    try:
        # try to decode the string using the encoding we get
        decoded_string = raw_bytes.decode(encoding_type)

    except UnicodeDecodeError:
        # if decoding failed, we use all the bytes to detect encoding
        encoding_detect = chardet.detect(raw_bytes)
        # BUG FIX: same None guard for the full-content detection.
        encoding_type = encoding_detect['encoding'] or 'utf-8'
        decoded_string = raw_bytes.decode(encoding_type)

    return decoded_string
Beispiel #12
0
 def setTotals(self):
     """Populate self.totals with counts, per-column length sums, detected
     encodings and file sizes for the input txt file and the clean XML file.

     Python 2 only (print statement). NOTE(review): both files are opened
     in text mode while chardet.detect normally expects byte strings —
     confirm this is intentional.
     """
     # Combined per-row stats from the general-information and sequence rows.
     rows = self.esc.generalInformationRows + self.esc.sequenceRows 
     
     self.totals[cu.FILE] = os.path.basename(self.inFilePath)
     self.totals[cu.QUANTITY] = self.esc.seql.generalInformation.quantity
     self.totals[cu.SEQUENCES_NUC] = self.esc.seql.quantity_nuc
     self.totals[cu.SEQUENCES_PRT] = self.esc.seql.quantity_prt
     self.totals[cu.MIXED_MODE] = self.esc.seql.quantity_mix  
     # Column sums over the row tuples: r[2]..r[5] hold length fields.
     self.totals[cu.ELEMENT_ST25_LENGTH] = sum([r[2] for r in rows])
     self.totals[cu.VALUE_LENGTH] = sum([r[3] for r in rows])
     self.totals[cu.TAG_ST26_LENGTH] = sum([r[4] for r in rows])
     self.totals[cu.ELEMENT_ST26_LENGTH] = sum([r[5] for r in rows])
     
     with open(self.inFilePath, 'r') as inf:
         s_txt = inf.read()
         self.totals[cu.CHARS_TXT_FILE] = len(s_txt)
         self.totals[cu.ENCODING_TXT] = chardet.detect(s_txt)['encoding']
     self.totals[cu.FILE_SIZE_TXT] = os.path.getsize(self.inFilePath)
      
     with open(self.cleanXmlFilePath, 'r') as f:
         s_xml = f.read()
         self.totals[cu.CHARS_XML_CLEAN_FILE] = len(s_xml)
         self.totals[cu.ENCODING_XML] = chardet.detect(s_xml)['encoding']
     self.totals[cu.FILE_SIZE_XML_CLEAN] = os.path.getsize(self.cleanXmlFilePath) 
     print self.inFilePath
     print 'encoding:', self.esc.seql.charEncoding
Beispiel #13
0
def convertor(test, encoding=""):
    """
    convert zhpy source (Chinese) to Python Source 
    
    >>> convertor("印出 'hello'")
    "print 'hello'"
    
    >>> convertor("印出 'hello'", encoding="utf8")
    "print 'hello'"
    
    more keyword test cases are in /tests folder.

    NOTE(review): Python 2 only; as written this version never returns
    `utest` — it falls off the end and returns None, so the doctests above
    cannot pass as-is. A trailing `return utest` appears to be missing.
    """
    # Keyword substitution: replace each zhpy keyword with its Python one.
    for k, v in replacedict.items():
        test = test.replace(k,v)
    
    if encoding:
        utest = test.decode(encoding)
    else:
        try:
            #detect encoding
            det = chardet.detect(test)
            if det['confidence'] >= 0.8:
                encoding = chardet.detect(test)['encoding']
            else :
                #print 'low confidence encoding detection, use utf8 encoding'
                encoding = 'utf8'
            utest = test.decode(encoding)
        except UnicodeDecodeError, e:
            print "can't recognize your language, set to utf-8"
            utest = test.decode('utf8')
        except ImportError, e:
            #no chardet mode
            utest = test.decode('utf8')
def getajax(url):
    """Render `url` in the module-level selenium `browser` and build its
    visible text, UTF-8 encoded; the text is prefixed with 'TIMEDOUT' when
    the page load times out.

    Python 2 only (`except Exception, error`). NOTE(review): `n` is
    computed but never returned — the function always returns None;
    presumably a trailing `return n` is missing.
    """
    # Default to http:// when no scheme matches the module-level `pattern`.
    if not pattern.match(url):
        url = 'http://' + url
    try:
        browser.get(url)
        n = browser.page_source
        soup = BeautifulSoup(n)
        n = soup.get_text()
        try:
            n = n.encode('utf-8')
        except:
            # Non-UTF-8 byte string: decode with chardet's guess first.
            d = chardet.detect(n)
            n = n.decode(d['encoding']).encode('utf-8')
    except TimeoutException:
        # Page load timed out: use whatever source rendered so far.
        n = browser.page_source
        soup = BeautifulSoup(n)
        n = soup.get_text()
        try:
            n = n.encode('utf-8')
        except:
            d = chardet.detect(n)
            n = n.decode(d['encoding']).encode('utf-8')
        n = 'TIMEDOUT'+n
    except WebDriverException as error:
        if 'MALFORMED_URI' in error.msg:
            n = 'MALFORMED_URI'
        else:
            raise error
    except Exception, error:
        raise error
Beispiel #15
0
    def adb(self, *args):
        """Entry point for executing an adb command.

        Joins the adb binary (plus `-s <serial>` when a serial is set) with
        *args, runs it through a shell, optionally echoes the command and
        stdout in debug mode, dumps stderr diagnostics, and returns stdout
        split into lines. Python 2 only (print statement).

        :param args: adb sub-command tokens, e.g. ('shell', 'ls')
        :return: list of non-empty stdout lines, daemon banner filtered out
        """
        if self.__serial:
            cmd = " ".join([self.__adb_name, '-s', self.__serial] + list(args))
        else:
            cmd = " ".join([self.__adb_name] + list(args))
        stdout, stderr = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()

        if self.__debug:
            print cmd
            print stdout.strip().decode(self.__output_coding)
        if stderr:
            try:
                print stderr.decode(self.__output_coding)
            except Exception as e:
                # Decoding failed: show the exception, the raw bytes and
                # chardet's guess to help diagnose the console codepage.
                print e
                print '-'*100
                print stderr
                import chardet
                print chardet.detect(stderr)
        return [i for i in stdout.splitlines() if i and not i.startswith("* daemon")]  # drop empty lines and the adb daemon startup banner
Beispiel #16
0
def process_detail_info(detail_page):
    """
        Process an item detail page: extract the payment type from the
        title plus the car brand/type attributes. Python 2 only (unicode).
    :param detail_page: HTML source of the detail page
    :return: tab-separated "pay_type\\tcar_brand\\tcar_type" string
    """
    title = pq(detail_page)(".tb-detail-hd h1").text()
    # Byte-string titles are decoded using chardet's guess (Python 2 str).
    if isinstance(title, str) and chardet.detect(title)["encoding"]:
        title = unicode(title, chardet.detect(title)["encoding"])
    pay_type = u""
    # Classify the payment type by which marker phrase appears in the title.
    if pay_types["total_pay"] in title:
        pay_type = pay_types["total_pay"]
    elif pay_types["bargin_pay"] in title:
        pay_type = pay_types["bargin_pay"]
    elif pay_types["down_pay"] in title:
        pay_type = pay_types["down_pay"]
    else:
        pay_type = pay_types["other"]
    # car_price = pq(detail_page)('.tm-price-panel .tm-price').text()  # the price is loaded dynamically later
    car_brand = pq(detail_page)("#J_attrBrandName").text()
    car_brand = str(car_brand).replace(u"品牌:", u"")
    car_type = pq(detail_page)("#J_AttrUL li").eq(1).text()
    if str(car_type).__contains__(u"车型:"):
        car_type = str(car_type).replace(u"车型:", u"")
    if str(car_type).__contains__(u"车系,"):
        car_type = str(car_type).replace(u"车系,", u"")
    # detail_quantity = pq(detail_page)('.tm-ind-panel .tm-ind-sellCount').text() # monthly sales on the detail page; currently unobtainable
    # if not detail_quantity:
    #     detail_quantity = -1
    entity = "%s\t%s\t%s" % (pay_type, car_brand, car_type)
    # print entity
    return entity
Beispiel #17
0
def convertor(text, verbose=False, encoding="", outcoding=""):
    """
    convert zhpy source (Chinese) to Python Source.

    annotator will be called automatically.

Accept args:
    test:
        source to be converted
    verbose:
        show detail message, default: False
    encoding:
        codec for encoding
    outcoding:
        codec for output encoding

    #annotator()
    >>> convertor("印出 'hello'")
    "print 'hello'"

    >>> convertor("印出 'hello'", encoding="utf8")
    "print 'hello'"

    >>> convertor('測試_範例')
    'test_p_7bc4_4f8b_v'

    more keyword test cases are in /tests folder.

    NOTE(review): Python 2 only; this chunk never returns `utext`, so the
    doctests above cannot pass as-is — the keyword-replacement step and a
    trailing return appear to be cut off from this excerpt.
    """
    # annotate if necessary
    annotator(force=False)
    #Use the provided encoding, if not exist select utf-8 as default.
    if encoding and encoding.lower() != 'utf-8':
        utext = text.decode(encoding)
    else:
        if has_chardet:
            try:
                #detect encoding
                det = chardet.detect(text)
                if verbose:
                    print "chardet", det
                if det['confidence'] >= 0.8:
                    encoding = chardet.detect(text)['encoding']
                else:
                    if verbose:
                        print """low confidence encoding detection,
                                use utf8 encoding"""
                    encoding = 'utf8'
                #prepare for unicode type support
                if isinstance(text, unicode):
                    utext = text
                else:
                    utext = text.decode(encoding)
            except UnicodeDecodeError, e:
                print "can't recognize your language, \
                        set to sys.stdout.encoding"
                utext = text.decode('utf8')
            except ImportError, e:
                if verbose:
                    print "proceed no chardet mode"
                utext = text.decode('utf8')
Beispiel #18
0
 def detect(self, line, num=450):
     """Guess the charset of `line` cheaply: short strings are detected
     whole; long ones are sampled at the start, the middle (and possibly
     one third in) and the results are cross-checked.

     Python 2 only (print statement; `l/2` relies on integer division).
     Returns the encoding name, or None on any error.
     """
     try:
         l = len(line)
         if l < 1200:
             return chardet.detect(line)['encoding']
         else:
             #first
             res1 =  chardet.detect(line[: num])
             #second
             str2 = line[l/2: l/2 + num]
             # Skip ahead to a space so the sample starts on a boundary.
             start =  str2.find(' ')
             if start == -1: start = 0
             res2 = chardet.detect(str2[start:])
             if res1['encoding'] != res2['encoding']:
                 if res1['encoding'] == 'ascii':
                     return res2['encoding']
                 else:
                     str3 = line[l/3: l/3 + num]
                     # NOTE(review): searches str2 again — probably meant str3.
                     start =  str2.find(' ')
                     if start == -1: start = 0
                     #third
                     res3 = chardet.detect(str3[start:])
                     if res3['encoding'] == res2['encoding']:
                         return res2['encoding']
                     else:
                         return res1['encoding']
             else:
                 return res1['encoding']
     except:
         print "detect error, return None"
         return None
Beispiel #19
0
	def decode(self,content,url):# decode `content`, using the url for caching
		"""Decode raw page bytes to text, caching the per-site encoding.

		Tries the previously cached self.encoding first, falls back to
		chardet detection, and stores successful encodings in redis keyed
		by the URL's netloc. Returns the decoded text, or the raw content
		unchanged when decoding fails. Python 2 only (urlparse module).
		"""
		result = content
		if not ALWAYS_CHAR_DETECT and self.encoding:# encoding cached from last time
			try:
				result = content.decode(self.encoding)
			except UnicodeDecodeError: # decode error: fall back to automatic detection
				encoding = chardet.detect(content)['encoding']
				try:
					result = content.decode(encoding,'ignore')#show error,use : export LANG="en_US.UTF-8"
				except UnicodeDecodeError: # still failing: return content unconverted
					self.encoding = None
					result = content
				except TypeError:
					self.encoding = None
					result = content
				else:# remember for next time, to save work
					self.encoding = encoding
					# persist to redis
					netloc = urlparse.urlsplit(url)[1]
					r.set(netloc,encoding)
		else:# no encoding information yet
			netloc = urlparse.urlsplit(url)[1]
			self.encoding = chardet.detect(content)['encoding']
			# decode using the detected encoding
			try:
				result = content.decode(self.encoding)
			except UnicodeDecodeError: # on error, return content unconverted
				result = content
			else:
			# persist to redis
				r.set(netloc,self.encoding)

		return result
Beispiel #20
0
    def get_text_content(self):
        """Return this item's content as unicode text.

        text/* content is decoded using chardet's guess; application/pdf is
        extracted with slate page by page. Returns 'NOT_A_TEXT_FILE' when
        no encoding is detected, and None for unsupported content types or
        PDF extraction failures. Python 2 only (unicode()).
        """
        if self.content_type.lower().startswith('text/'):
            file_contents = self.get_content() # unicode(self.get_content(), errors='xmlcharrefreplace')
            
            result = chardet.detect(file_contents)
                
            if result['encoding'] != None:
                try:
                    return unicode(file_contents, result['encoding'])
                except:
                    # Any decode failure is reported inline as text.
                    return unicode('Error: ' + str(sys.exc_info()[1]))
            else:
                return 'NOT_A_TEXT_FILE'
        elif self.content_type.lower().startswith('application/pdf'):
            try:
                f = slate.PDF(open(self.content_path))
            
                file_contents = '' 

                for page in f:
                    file_contents += page

                result = chardet.detect(file_contents)
                    
                return unicode(file_contents, result['encoding'])
            except:
#                traceback.print_exc()
                pass

        return None
def merge_subtitles(in_filename1, in_filename2, out_filename):
    """Merge two subtitle files into one styled ASS file.

    Events from the first file keep the default style (shifted up by the
    alternate font size); events from the second get the 'Alternate' style.
    Line breaks inside events are flattened to spaces.

    :param in_filename1: path of the primary subtitle file
    :param in_filename2: path of the secondary subtitle file
    :param out_filename: path of the merged .ass output
    """
    # Detect file encodings. BUG FIX: read in binary mode (chardet operates
    # on bytes, and text-mode read would already need an encoding) and use
    # context managers — the original leaked both file handles.
    with open(in_filename1, 'rb') as f:
        encoding1 = chardet.detect(f.read())['encoding']
    with open(in_filename2, 'rb') as f:
        encoding2 = chardet.detect(f.read())['encoding']

    # create aeidon projects and load both subtitle tracks
    project1 = aeidon.Project()
    project2 = aeidon.Project()
    project1.open_main(in_filename1, encoding1)
    project2.open_main(in_filename2, encoding2)

    # setup output format from the module-level ASS `header` template;
    # margins/font sizes are parsed from fixed header line positions.
    out_format = aeidon.files.new(aeidon.formats.ASS, out_filename, "utf_8")
    out_format.header = header
    header_lines = header.split('\n')
    default_margin_v = int(header_lines[6].split(',')[-1])
    alternate_fontsize = int(header_lines[7].split(',')[2])
    event_margin_v = default_margin_v + alternate_fontsize

    # modify event entries
    for subtitle in project1.subtitles:
        subtitle.main_text = subtitle.main_text.replace('\n', ' ')
        subtitle.ssa.margin_v = event_margin_v
    for subtitle in project2.subtitles:
        subtitle.main_text = subtitle.main_text.replace('\n', ' ')
        subtitle.ssa.style = 'Alternate'

    project1.subtitles.extend(project2.subtitles)
    project1.save_main(out_format)
Beispiel #22
0
def check_enc_fixed(url):
    """Fetch `url`, reconcile the sniffed encoding with the declared HTML
    encoding, and print the first 200 chars of the extracted text.

    Python 2 only (print statement). Prefers the declared encoding when
    chardet's guess merely echoes UnicodeDammit's with confidence below
    THRESHOLD_OF_CHARDETECT.
    """
    print "\n\n"
    print "That is url {}".format(url)
    r = requests.get(url)
    ud = UnicodeDammit(r.content, is_html=True)
    print "\t\t\t\t\t\t", ud.original_encoding == ud.declared_html_encoding
    if not ud.original_encoding == ud.declared_html_encoding:
        print ("Origignal encoding: {} vs declared_html_encoding: {}"
               "".format(ud.original_encoding, ud.declared_html_encoding))
        print "Detected encoding: {!r}". format(chardet.detect(r.content))

    enc = ud.original_encoding.lower()
    declared_enc = ud.declared_html_encoding
    if declared_enc:
        declared_enc = declared_enc.lower()
    # possible misrecognition of the encoding
    if (declared_enc and enc != declared_enc):
        detect_dict = chardet.detect(r.content)
        det_conf = detect_dict["confidence"]
        det_enc = detect_dict["encoding"].lower()
        if enc == det_enc and det_conf < THRESHOLD_OF_CHARDETECT:
            enc = declared_enc
    print "CHOOSED ENCODING: {}".format(enc)
    # if the page contains any characters that do not fit the chosen
    # encoding we ignore them
    content = r.content.decode(enc, "ignore").encode(enc)
    htmlparser = etree.HTMLParser(encoding=enc)
    root = etree.HTML(content, parser=htmlparser)
    etree.strip_elements(root, html.etree.Comment, "script", "style")
    text = html.tostring(root, method="text", encoding=unicode)

    text = re.sub('\s+', ' ', text)
    print text[:200]
Beispiel #23
0
def html_to_txt():
    ft = open(BAIDU_TXT,'w')
    start = 1
    while 1:
        filename = BAIDU_DIR+str(start)+'.html'
        if os.path.isfile(filename):
            fp = open(filename, 'r')
            htmltxt = ''.join(fp.readlines())
            if not htmltxt or not len(htmltxt):
                continue
            fp.close()

            codedetect = chardet.detect(htmltxt)["encoding"]
            print codedetext
            if not codedetect in ['utf-8', 'ascii']:
                htmltxt = unicode(htmltxt, codedetect).encode('utf-8')
                codedetect = chardet.detect(htmltxt)["encoding"]
                print 'change', codedetect

            ft.write(html2txt(htmltxt))
            print 'Success change html to txt %s' % start
            start+=1
        else:
            break
    ft.close()
Beispiel #24
0
def charpick(files):
    """Read each named file under a fixed raw-data directory, transcode it
    from ISO-8859-2 to UTF-8, print chardet's verdict, and insert the text
    into the MySQL `url.urlsoft2` table.

    Python 2 only (print statement). NOTE(review): the hard-coded Windows
    path and the blanket ISO-8859-2 assumption look like one-off test
    scaffolding — verify before reuse.
    """
    path = r"D:\testdata\testsoft1\raw"
    path = path.replace('\\', '/')
    conn = mysql.connector.connect(host='127.0.0.1', user='******', password='******', port='3306', database='url',
                                   use_unicode=True)
    cursor = conn.cursor()

    for filename in files:
        #print chardet.detect(filename)
        with open(path + '/' + filename) as f1:
            data0 = f1.read()
            #print chardet.detect(data)
            data1=data0.decode("ISO-8859-2")
            data=data1.encode("utf-8")
            print chardet.detect(data)

            # Parameterized insert — the driver handles the quoting.
            cursor.execute("insert into urlsoft2 (url) values (%s)",[data])

    conn.commit()
    cursor.close()
    conn.close()
Beispiel #25
0
def getURL(url):
    """Fetch `url` with cookie support, transcode the body to the
    filesystem encoding, then print the cookies and the HTML content.

    Python 2 only (print statement, urllib2/cookielib). NOTE(review):
    nothing is returned, and `headers`/`data` are placeholder-empty.
    """
    cJar = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cJar)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)

    headers = {"": "", "": ""}

    data = {}
    getdata = urllib.urlencode(data)

    req = urllib2.Request(
        url=url,
        # data=getdata,
        headers=headers,
    )
    result = urllib2.urlopen(req).read()

    typeEncode = sys.getfilesystemencoding()  ## system default encoding
    print chardet.detect(result)
    infoencode = chardet.detect(result).get("encoding", "utf-8")  ## auto-detect the page encoding via the third-party module
    if infoencode:
        result = result.decode(infoencode, "ignore").encode(typeEncode)  ## decode to unicode first, then re-encode with the system encoding for output

    for ind, cookie in enumerate(cJar):
        print "%d - %s" % (ind, cookie)

    print "HTML Content:"
    print result
 def win_ping(self,inter='test',host=None,num=2,times=2,expect=1):
     """Keyword: ping `host` for up to `times` rounds of `num` packets and
     compare the outcome against `expect` (1 = success expected).

     Success is a 0%-loss marker in ping's output (English or Chinese
     locale). Raises ExpectError on mismatch. Python 2 only (print
     statement, unicode()).
     """
     print("run keyword:%s"%(sys._getframe().f_code.co_name))
     inter=_unicode_to_utf(inter)
     host=_unicode_to_utf(host)
     num=_unicode_to_utf(num)
     times=_unicode_to_utf(times)
     expect=_unicode_to_utf(expect)
     msgs=[]
     stat=0
     for i in range(int(times)) :
         tmp_msgs=[]
         cmd='ping '+str(host)+' -n '+str(num)
         print("%s"%cmd)
         p=subprocess.Popen(cmd,stdin=subprocess.PIPE,stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True)
         p.wait()
         tmp_msgs.append(p.stdout.read())
         tmp_msgs.append(p.stderr.read())
         p.terminate()
         msgs.extend(tmp_msgs)
         # 0%-loss marker, matching both English and Chinese ping output.
         reobj=re.compile('\(0% loss\)|\(0% 丢失\)')
         tmp_msgs=' '.join(tmp_msgs)
         # Console output codepage varies; normalize to utf-8 via chardet.
         tmp_msgs=unicode(tmp_msgs,chardet.detect(tmp_msgs)['encoding']).encode('utf-8')
         if reobj.search(tmp_msgs) :
             stat = 1
             break
     print("%s"%unicode('\n'.join(msgs),chardet.detect('\n'.join(msgs))['encoding']))
     if expect != 'None' and expect != None :
         if int(stat) == int(expect) :
             print("Expect is %s, actually %s"%('success' if int(expect) == 1 else 'fail','success' if int(expect) == 1 else 'fail'))
         else:
             raise ExpectError(message="Expect is %s, actually %s"%("Success" if int(expect) == 1 else "failed","success" if int(stat) == 1 else "failed"))
Beispiel #27
0
def open(fn, write, codecname = 'mskanji', autodetect = 'safedetect'):
    """Open *fn* for text I/O, optionally auto-detecting its encoding.

    :param fn: path of the file to open
    :param write: True to open for writing (no detection), False to read
    :param codecname: fallback/default codec name
    :param autodetect: 'no' (use codecname), 'chardet' (detect the whole
        file), 'safedetect' (detect only the first bytes and trust only a
        UTF family result), or 'mskanji' (like 'chardet' but map SHIFT_JIS
        to mskanji)
    :return: a text-mode file object; for the detecting modes the whole
        file is buffered in memory and wrapped in a TextIOWrapper
    :raises LookupError: for an unknown autodetect mode
    """
    if write or (autodetect == 'no'):
        # Plain open with the caller-supplied codec; no detection.
        return io.open(fn, 'w' if write else 'r', encoding=codecname)
    elif autodetect in ('chardet', 'mskanji', 'safedetect'):
        with io.open(fn, 'rb') as f:
            buf = f.read()
        try:
            if autodetect == 'safedetect':
                # Read only the BOM and a few early characters.
                result = chardet.detect(buf[:10])
            else:
                result = chardet.detect(buf)
            confidence = result.get('confidence', 0.0)
            encoding = result.get('encoding', None)
            if encoding is None or confidence < 0.1:
                encoding = codecname
            elif encoding.upper().startswith('UTF-8'):
                encoding = 'utf-8-sig'  # UTF-8 -> UTF-8-SIG (handles BOM)
            elif encoding.upper().startswith('UTF-16'):
                encoding = 'UTF-16'  # UTF-16LE -> UTF-16 (handles BOM)
            elif autodetect == 'safedetect':
                # safedetect trusts only UTF results; otherwise fall back.
                if not encoding.upper().startswith('UTF'):
                    encoding = codecname
            elif autodetect == 'mskanji' and encoding.upper() == 'SHIFT_JIS':
                encoding = 'mskanji'
        except Exception:
            # BUG FIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; any detection failure falls
            # back to the default codec.
            encoding = codecname

        # Re-serve the buffered bytes as a named text stream.
        file_obj = io.BytesIO(buf)
        setattr(file_obj, 'name', fn)

        return io.TextIOWrapper(file_obj, encoding=encoding)
    else:
        raise LookupError(u'unknown autodetect mode: %s' % autodetect)
Beispiel #28
0
def printlog(key,out):
    """Sort the module-level resultList by `key` and export it to an Excel
    sheet via xlwt, transcoding GB2312/ascii titles to UTF-8.

    Python 2 only (print statement). Titles in other encodings are skipped
    with a manual-entry warning printed to the console.
    """
    print "=================== Order by " + key + " ======================="
    print "[.] Start to output!"
    resList =  sorted(resultList,key = lambda e:e.__getitem__(key))
    xlwtwork = xlwt.Workbook(encoding='utf-8')
    ws = xlwtwork.add_sheet('info')
    ws.write(0, 0, '网站状态')
    ws.write(0, 1, '网站地址')
    ws.write(0, 2, '允许HTTP请求方法')
    ws.write(0, 3, '网站标题')
    order = 1
    for res in resList:
        ws.write(order, 0, res['status'])
        ws.write(order, 1, res['target'])
        ws.write(order, 2, res['head_allow'])
        # Normalize the title to UTF-8 based on chardet's guess.
        if chardet.detect(res['title'])['encoding'] == 'GB2312':
            ws.write(order, 3, res['title'].decode('GB2312').encode('utf-8'))
        elif chardet.detect(res['title'])['encoding'] == 'ascii':
            ws.write(order, 3, res['title'].decode('ascii').encode('utf-8'))
        elif chardet.detect(res['title'])['encoding'] == 'utf-8':
            ws.write(order, 3, res['title'])
        else:
            print '[-](手工补录)不支持 ' + res['target'] + ' title编码格式:' + chardet.detect(res['title'])['encoding'] + ' 输出到表格'
        order = order + 1
    xlwtwork.save(out)
    print "[.] End output!"
    print "======================================================="
Beispiel #29
0
def whatisthis(s):
    """Print whether `s` is a byte string, a unicode string, or neither,
    along with chardet's detection result where applicable (Python 2).
    """
    if isinstance(s,str):
        print "Ordinary string : ", chardet.detect(s)
    elif isinstance(s,unicode):
        # chardet only accepts byte strings, so just report the type.
        print "Unicode string, chardet Expected a bytes object, not a unicode object"
    else:
        print "Not a string : ",chardet.detect(s)
Beispiel #30
0
def check(dir):
  """Print an index plus chardet's confidence and encoding for every file
  directly inside `dir`.

  Python 2 only (print statement). NOTE(review): file handles are never
  closed, and `dir`/`file` shadow builtins.
  """
  count = 0
  for file in os.listdir(dir):
    checkfile = open(os.path.join(dir,file),"r")
    content = checkfile.read()
    print str(count)+' '+str(chardet.detect(content)['confidence'])+' '+str(chardet.detect(content)['encoding'])
    count+=1
Beispiel #31
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Demo of Python 3 str/bytes conversions and chardet detection.

# A str literal is already unicode text in Python 3.
str1 = "我们"
print(str1)
print(type(str1))

# Encoding a str yields a bytes object.
str1 = "我们"
str_utf8 = str1.encode('utf-8')
print(str_utf8)
print(type(str_utf8))

# encode()/decode() with the same codec round-trips back to the str.
str_decode = str1.encode('utf-8').decode('utf-8')
print(str_decode)
print(type(str_decode))

# chardet guesses the codec of raw bytes.
import chardet
str_gbk = "我们".encode('gbk')
print(chardet.detect(str_gbk))

# Transcoding utf-8 -> gbk requires decoding to str first; bytes have no
# further encode step of their own.
str_utf8 = "我们".encode('utf-8')
str_gbk = str_utf8.decode('utf-8').encode('gbk')
print(str_gbk)
Beispiel #32
0
 def code_detecter(self, data):
     """Return chardet's best-guess encoding for *data* (bytes).

     Falls back to "windows-1251" when detection fails (e.g. non-bytes
     input raising TypeError).
     """
     try:
         return chardet.detect(data)['encoding']
     # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate
     except Exception:
         return "windows-1251"
Beispiel #33
0
 def get_encoding_type(self, file):
     """Log the file name (via print_blue) and return its chardet-detected encoding."""
     print_blue(file)
     with open(file, 'rb') as handle:
         payload = handle.read()
     detection = detect(payload)
     return detection['encoding']
    return extract(lda, count_feature_names, no_top_words)

# NOTE(review): SURVEY_FILE_NAME and LDA() are defined elsewhere in this file.
with open(SURVEY_FILE_NAME) as content_file:
    responses = []
    reader = csv.reader(content_file)
    for row in reader:
        # keep only rows whose second column is non-empty
        if row[1]: responses.append(row[1])
    LDA_clusters = LDA(responses)
# Create CSV and json
with open('PROB_'+SURVEY_FILE_NAME,'w+') as out_file, open('TOPICS_'+SURVEY_FILE_NAME.replace('csv','json'),'w+') as jf:
    writer = csv.writer(out_file, quoting=csv.QUOTE_ALL)
    for idx, a_resp in enumerate(responses):
        # NOTE(review): `probabilities` is never assigned in this chunk —
        # confirm it is produced elsewhere (e.g. by LDA()) or this raises NameError.
        writer.writerow([a_resp]+probabilities[idx])
    json.dump(LDA_clusters,jf, indent=4)

#-----------------------------------------------------------------------------------------------------------------------------
# Detect the file's encoding from the raw bytes first, then reopen as text
# with the detected codec.
with open(SURVEY2_FILE_NAME,'rb') as f: result = chardet.detect(f.read())
with open(SURVEY2_FILE_NAME, encoding=result['encoding']) as content_file:
    responses = []
    reader = csv.reader(content_file)
    for row in reader:
        # keep only rows whose second column is non-empty
        if row[1]: responses.append(row[1])
    LDA_clusters = LDA(responses)
# Create CSV
with open('PROB_'+SURVEY2_FILE_NAME,'w+') as out_file, open('TOPICS_'+SURVEY2_FILE_NAME.replace('csv','json'),'w+') as jf:
    writer = csv.writer(out_file, quoting=csv.QUOTE_ALL)
    for idx, a_resp in enumerate(responses):
        # NOTE(review): `probabilities` is never assigned in this chunk — verify
        # it is set elsewhere before this loop runs.
        writer.writerow([a_resp]+probabilities[idx])
    json.dump(LDA_clusters,jf, indent=4)

Beispiel #35
0
		except Exception,e:
			# print ('unknow error')
			logger.error('unknow error %s: request fail url:%s'% (str(e), url))
	elif detect_flag == "web_detect":
		driver = webdriver.PhantomJS(executable_path=phantomjs_dir)
		driver.get(url)
		time.sleep(5)
		html_src = driver.page_source
		# print html_src
		# print 'type:%s' % type(html_src)
		driver.close()
		html = html_src.encode('utf-8')          # 动态加载完是unicode编码,转化为utf-8
		page_html = html
	if page_html:
		html_srccode = chardet.detect(page_html)['encoding']
		# page_html = html_encode(page_html)
		page_html = page_html.decode(encoding=html_srccode, errors='replace')
		page_html = page_html.replace('\n', '').replace(' ','')  # 去换行和空格
		# print page_html
	return page_html
def structtime_to_timestamp(time_sourse):
    """Convert 'YYYY-MM-DD HH:MM:SS' (or bare 'YYYY-MM-DD') to a Unix
    timestamp string, interpreted in the local timezone (time.mktime)."""
    try:
        parsed = time.strptime(time_sourse, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # fall back to a date-only string with no time component
        parsed = time.strptime(time_sourse, '%Y-%m-%d')
    return str(int(time.mktime(parsed)))


def html_encode(html_1):
	encoding_dict = chardet.detect(html_1)
Beispiel #36
0
    def jiexi(self, name):
        """Parse a BloodHound-style JSON-lines export and append a text report.

        *name* is the path of the JSON file; the section dumped (and the
        output attribute used) is chosen by which keyword appears in the
        file name, in the same priority order as the original if/elif chain.

        Fixes vs. original: the input handles are closed (the original
        leaked both the detection read and `dk`), each output file is opened
        once per line instead of once per printed field, the six copy-pasted
        branches are collapsed into one helper, and the 'ous'/'users'
        progress messages now name the file actually written (they
        previously showed self.name4).
        """
        with open(name, 'rb') as raw:
            end = chardet.detect(raw.read())
        # JSON section key -> attribute holding that section's output path
        sections = (
            ('computers', self.name),
            ('domains', self.name2),
            ('gpos', self.name3),
            ('groups', self.name4),
            ('ous', self.name5),
            ('users', self.name6),
        )
        with open(name, 'r', encoding=end['encoding']) as dk:
            for j in dk.readlines():
                data = json.loads("".join(j.split('\n')))
                for key, out_name in sections:
                    if key in name:
                        print('[+] wait write file:{}'.format(out_name))
                        self._write_report(out_name, end['encoding'],
                                           data[key], data['meta'])
                        break

    def _write_report(self, out_name, encoding, entries, meta):
        """Append one section's entries plus the trailing meta block to *out_name*."""
        with open(out_name, 'a', encoding=encoding) as out:
            for c in entries:
                print('=' * 90, file=out)
                print('{}:{}'.format('ObjectIdentifier', c['ObjectIdentifier']),
                      file=out)
                # each entry in Aces is a dict; blank line after each one
                for k in c['Aces']:
                    for r in k:
                        print('{}:{}'.format(r, k[r]), file=out)
                    print('', file=out)
                properties = c['Properties']
                for p in properties:
                    print('{}:{}'.format(p, properties[p]), file=out)
                print('', file=out)
            for m in meta:
                print('{}:{}'.format(m, meta[m]), file=out)
Beispiel #37
0
import chardet

# chardet works on bytes; pure ASCII input is reported as ascii
de = chardet.detect(b'Hello World')
print(de)
# UTF-8-encoded Chinese text
data = '中文'.encode('utf-8')
de = chardet.detect(data)
print(de)
Beispiel #38
0
def encoding_file():
    """Prompt for a file name (via input_file_name) and return
    (file_name, chardet-detected encoding)."""
    file_name = input_file_name()
    with open(file_name, 'rb') as handle:
        detection = chardet.detect(handle.read())
    return file_name, detection['encoding']
import os
import glob
import chardet

# Report the chardet-detected encoding of every .csv file in a directory.
path = input('input a csv_file path:')
os.chdir(path)

csv_files = glob.glob('*.csv')
for csv_file in csv_files:
    # use a context manager so each handle is closed (the original leaked them)
    with open(csv_file, "rb") as handle:
        raw = handle.read()
    print(csv_file, chardet.detect(raw))
Beispiel #40
0
# -*-coding:utf-*
'''
利用request下载页面,自动检测页面编码
'''

from urllib import request
import chardet

if __name__ == '__main__':
    url = 'http://finance.eastmoney.com/news/1345,20180923951189228.html'

    rsp = request.urlopen(url)

    html = rsp.read()

    # let chardet guess the page encoding from the raw bytes
    cs = chardet.detect(html)
    print(type(cs))
    print(cs)

    # chardet always includes the 'encoding' key but its VALUE may be None
    # for undetectable input; the original `cs.get("encoding", "utf-8")`
    # would then pass None to decode(). Coalesce with `or` instead.
    html = html.decode(cs.get("encoding") or "utf-8")
    print(html)
Beispiel #41
0
# -*- coding: utf-8 -*-
# @File  : Spider.py
# @Author: Zhuozhuo.Geng
# @Date  : 2018/2/12
# @Desc  :
from urllib import request
import chardet
import re

if __name__ == '__main__':
    response = request.urlopen('http://fanyi.baidu.com')

    htmlPage = response.read()
    # Detect the page encoding. detect() may report None for 'encoding',
    # and the original default of -1 would crash decode(), so fall back
    # to utf-8 instead.
    chardetRes = chardet.detect(htmlPage)
    encodeHtml = chardetRes.get('encoding') or 'utf-8'

    # decode into its own variable instead of clobbering chardetRes
    pageText = htmlPage.decode(encodeHtml)

    # collect every href target on the page
    REGX = r'href="(.*?)"'
    urlSet = re.findall(REGX, pageText, re.S)

    # print(pageText)
    print(urlSet)
Beispiel #42
0
 def chardet_dammit(s):
     """Return chardet's best-guess encoding name for the byte string *s*."""
     detection = chardet.detect(s)
     return detection['encoding']
Beispiel #43
0
    def __init__(s, text, fromFile=0, stop_words=('-')):
        """Build a word-adjacency digraph from *text*.

        Python 2 code (uses `unicode`, integer `/`, and the pre-2.0
        networkx `s.node` / `s.edge` dict API).

        :param s: the graph instance itself (named `s` instead of `self`)
        :param text: the text, or a file path when *fromFile* is truthy
        :param fromFile: when truthy, *text* is read from that path
        :param stop_words: NOTE(review): `('-')` is a plain string, not a
            1-tuple — confirm TextWords treats it as intended
        """
        nx.DiGraph.__init__(s)
        if (fromFile):
            with open(text) as f:
                text = f.read()
        try:
            # detect the encoding from three samples: head, middle and tail
            encoding = chardet.detect(text[:289] + ' ' +
                                      text[len(text) / 2 - 144:len(text) / 2 +
                                           144] + ' ' +
                                      text[-289:])['encoding']
        except (TypeError):
            encoding = 'utf-8'
            print('Error: Encoding not detected, utf-8 selected\n')
        if (encoding):
            ss = TextWords(
                text.decode(encoding), stop_words=stop_words
            )  #.decode(encoding))#text.decode('cp1251').split()#
        else:
            ss = TextWords(unicode(text, 'utf-8'), stop_words=stop_words)
        s.node_list = dict()
        s.node_property = dict()
        s.edge_list = dict()
        s.edge_property = dict()
        _S = None
        try:
            # seed the graph with the first word; _S remembers it
            s.add_node(ss[0].lower())
            _S = ss[0].lower()
        except (IndexError):
            # empty text: leave the graph empty
            return None
        # NOTE(review): these four dicts were already initialised above —
        # this second reset appears redundant; confirm before removing.
        s.node_list = dict()
        s.node_property = dict()
        s.edge_list = dict()
        s.edge_property = dict()
        # register default attributes for nodes and edges
        s.add_properties('node',
                         weight=1,
                         textPositionFirst=-1,
                         textPositionLast=-1,
                         textPositionAvg=0,
                         edges_in=0,
                         edges_out=0,
                         sweight=1.0)
        s.add_properties('edge',
                         weight=1,
                         textPositionFirst=-1,
                         textPositionLast=-1,
                         textPositionAvg=0,
                         sweight=1.0,
                         is_tree_edge=False)
        s.node[_S]['weight'] = 1
        s.node[_S]['textPositionFirst'] = 1
        s.node[_S]['textPositionAvg'] = 1
        s.node[_S]['textPositionLast'] = 1
        # walk consecutive word pairs, accumulating node/edge statistics;
        # t is the 0-based index of the current word
        t = 0
        for tt in ss:
            t += 1
            try:
                ss[t] = ss[t]
            except (IndexError):
                break
            #print(t,' ',ss.ii)##########################
            s.add_node(ss[t])
            s._add_node(ss[t], 'weight', 1, 1)
            if not (s.node[ss[t]].get('textPositionFirst')):
                s.node[ss[t]]['textPositionFirst'] = t + 1
            s._add_node(ss[t], 'textPositionAvg', t + 1, t + 1)
            s.node[ss[t]]['textPositionLast'] = t + 1

            # edge from the previous word to the current one
            s.add_edge(ss[t - 1], ss[t])
            s._add_edge(ss[t - 1], ss[t])
            if not (s.edge[ss[t - 1]][ss[t]].get('textPositionFirst')):
                s.edge[ss[t - 1]][ss[t]]['textPositionFirst'] = t
            s._add_edge(ss[t - 1], ss[t], 'textPositionAvg', 1 + t, t + 1)
            s.edge[ss[t - 1]][ss[t]]['textPositionLast'] = t
        # finalise per-node derived stats (averages, degrees, inverse weight)
        for v in s.node.keys():
            s.node[v]['textPositionAvg'] /= float(s.node[v]['weight'])
            s.node[v]['edges_out'] = len(s.edge[v])
            s.node[v]['edges_in'] = 0
            s.node[v]['sweight'] = 1.0 / s.node[v]['weight']
        # finalise per-edge derived stats and in-degrees
        for e in s.edge:
            for ee in s.edge[e]:
                s.edge[e][ee]['textPositionAvg'] /= float(
                    s.edge[e][ee]['weight'])
                s.node[ee]['edges_in'] += 1
                s.edge[e][ee]['is_tree_edge'] = s.node[e][
                    'textPositionFirst'] < s.node[ee]['textPositionFirst']
                s.edge[e][ee]['sweight'] = 1.0 / s.edge[e][ee]['weight']
        s.sort_edge()
        s.sort_node()
Beispiel #44
0
print('''---------------------chardet---------------------
''')

# String encodings are a perennial headache, especially when handling
# sloppy third-party web pages. The third-party chardet library is made
# for exactly this: detecting an encoding is simple and easy.
#
# *********** Installing chardet ***********
#
# If Anaconda is installed, chardet is already available. Otherwise,
# install it from the command line via pip:
#
# $ pip install chardet
# If installation fails with "Permission denied", retry with sudo.

# *********** Using chardet ***********
# Given a bytes object, detecting its encoding takes a single call:
print("chardet.detect(b'Hello, world!')=\t", chardet.detect(b'Hello, world!'))

# Try detecting GBK-encoded Chinese:
data = '离离原上草,一岁一枯荣'.encode('gbk')
print("chardet.detect(data)=\t", chardet.detect(data))

# Detect UTF-8:
data = '离离原上草,一岁一枯荣'.encode('utf-8')
print("chardet.detect(data)=\t", chardet.detect(data))

# Now try Japanese:
data = '最新の主要ニュース'.encode('euc-jp')
print("chardet.detect(data)=\t", chardet.detect(data))

# As shown, detection with chardet is simple. Once the encoding is known,
# decode to str for further processing. chardet supports Chinese, Japanese,
# Korean and many other languages.
def get_encoding_type(file):
    """Return the chardet-detected encoding of the file at path *file*."""
    with open(file, 'rb') as handle:
        payload = handle.read()
    detection = detect(payload)
    return detection['encoding']
Beispiel #46
0
def get_encode(filepath):
    """Detect a file's encoding from its first 1024 bytes (cheap heuristic).

    Uses a context manager so the handle is closed even if read() raises
    (the original's explicit close() would be skipped on an exception).
    """
    with open(filepath, 'rb') as f:
        head = f.read(1024)
    return chardet.detect(head)['encoding']
Beispiel #47
0
    raw_input(u'按回车键退出……'.encode(sys.stdin.encoding))
    sys.exit()

# Python 2 script: validate a list file and check every listed name exists
# in the source directory before copying/comparing.
if not os.path.isfile(list_file_code):
    print list_file, u'文件不存在。'
    raw_input(u'按回车键退出……'.encode(sys.stdin.encoding))
    sys.exit()

if path_src_code == path_dst_code:
    print u'源路径和目标路径相同,请检查后重新输入。'
    raw_input(u'按回车键退出……'.encode(sys.stdin.encoding))
    sys.exit()

# print u'list文件:',list_file
f_list = open(list_file_code, 'r')
# detect the list file's encoding from its full contents
fcode = chardet.detect(f_list.read())['encoding']
# print 'File encoding:',fcode
# heuristic: anything chardet doesn't call utf-8 is assumed to be gbk
if fcode != 'utf-8':
    fcode = 'gbk'
f_list.seek(0)
list_all = []
for list_line in f_list:
    list_line = list_line.strip(' \n\r')
    # re-encode for console output (sys.stdin.encoding)
    list_line_encode = list_line.decode(fcode).encode(sys.stdin.encoding)
    if not list_line:
        continue
    # every listed file must exist in the source directory
    if list_line not in os.listdir(path_src_code):
        print u'源路径 ', path_src, u'中没有文件', list_line_encode
        raw_input(u'按回车键退出……'.encode(sys.stdin.encoding))
        sys.exit()
    if list_line not in os.listdir(path_dst_code):
Beispiel #48
0
# -chardet[检测编码]
import chardet

# ASCII bytes
b = b'hello catface!'
c = chardet.detect(b)
print('chardet:', c)

s1 = '国破山河在,城春草木深'
s3 = '最新の主要ニュース'
# Chinese text encoded as GBK
r = s1.encode('gbk')
c = chardet.detect(r)
print('chardet:', c)

# the same text encoded as UTF-8
r = s1.encode('utf-8')
c = chardet.detect(r)
print('chardet:', c)

# Japanese text encoded as EUC-JP
r = s3.encode('euc-jp')
c = chardet.detect(r)
print('chardet:', c)
#!/usr/bin/env python2
# -*- encoding: utf-8 -*-

import chardet
import re
import os

for n in os.listdir('.'):
    encoding = chardet.detect(n)['encoding']
    if re.match(r"ascii|utf", encoding):
        continue
    print '%s: %s (%s)' % (n, chardet.detect(n)['encoding'], chardet.detect(n)['confidence'])
                            "".join(extended_event_descriptor_multi)).strip()

                    if not (extended_event_descriptor):
                        extended_event_descriptor = short_event_descriptor
                        extended_event_codepage = short_event_codepage

                    if name_event_descriptor:
                        try:
                            if name_event_codepage:
                                if name_event_codepage != 'utf-8':
                                    name_event_descriptor = name_event_descriptor.decode(
                                        name_event_codepage).encode("utf-8")
                                else:
                                    name_event_descriptor.decode('utf-8')
                            else:
                                encdata = chardet.detect(name_event_descriptor)
                                enc = encdata['encoding'].lower()
                                confidence = str(encdata['confidence'])
                                emcDebugOut(
                                    "[META] Detected name_event encoding-type: "
                                    + enc + " (" + confidence + ")")
                                if enc == "utf-8":
                                    name_event_descriptor.decode(enc)
                                else:
                                    name_event_descriptor = name_event_descriptor.decode(
                                        enc).encode('utf-8')
                        except (UnicodeDecodeError, AttributeError), e:
                            emcDebugOut("[META] Exception in readEitFile: " +
                                        str(e))
                    self.eit['name'] = name_event_descriptor
Beispiel #51
0
            time.sleep(1)
    html = page.read()
    page.close()
    return html


argv0_list = sys.argv[0].split("\\")
script_name = argv0_list[len(argv0_list) - 1]
script_name = script_name[0:-3]
tmp_record = script_name + '@' + str(os.getpid()) + '.txt'

try:
    for url in urls:
        content = load_html(url)
        #encoding = extract(str(content).lower(), 'charset=', '"')
        encoding = chardet.detect(content)['encoding']
        #print('-'*50)
        #print( "Encoding type = %s" % encoding )
        #print('-'*50)
        if encoding:
            # note that Python3 does not read the html code as string
            # but as html code bytearray, convert to string with
            content = content.decode(encoding, 'ignore').replace(u'\xa9', u'')
        else:
            print("Debug: Encoding type not found!")
        match = re.search("\'(\w{8,}-\w{4,}-\w{4,}-\w{4,}-\w{12,})\'",
                          str(content))
        if match is None and urlparse(url).path.startswith('/m/'):
            try:
                from selenium import webdriver
                dcap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
Beispiel #52
0
def GetEncodeString(str):
    """Best-effort: transcode the byte string *str* to UTF-8.

    Returns the input unchanged when detection or transcoding fails.
    The parameter is named `str` in the original API and is kept for
    backward compatibility even though it shadows the builtin.
    """
    try:
        str = str.decode(chardet.detect(str)["encoding"]).encode("utf-8")
    # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate
    except Exception:
        pass
    return str
 def _detect_encoding(self, file_name: str):
     """Return the chardet-detected encoding of the file at *file_name*."""
     with open(file_name, 'rb') as handle:
         payload = handle.read()
     return chardet.detect(payload)['encoding']
Beispiel #54
0
def get_encoding_type(file):
    """Detect and return the encoding codec of the file at path *file*."""
    with open(file, 'rb') as handle:
        return detect(handle.read())['encoding']
Beispiel #55
0
    except:
        title_num = trans_int(title_s[0])
    finally:
        if title_num >= 100:
            title_num = str(title_num)
        elif title_num >= 10:
            title_num = '0' + str(title_num)
        elif title_num >= 1:
            title_num = '00' + str(title_num)
    title = re.sub('第.*章', '第' + title_num + '章', title)
    article = str(soup.find(id="contents"))
    with open('D:/soft/test/' + title + '.htm', 'w', encoding='utf-8') as file:
        file.write(article)
        print('%s done' % title)


if __name__ == '__main__':
    # index page of the novel; each chapter link is scraped below
    origin_url = 'https://www.aszw.org/book/35/35222/'
    response = requests.get(origin_url)
    # trust chardet's detection over the server-declared charset
    response.encoding = chardet.detect(response.content)['encoding']
    html = response.text
    lists = re.findall('<td.*?href="(.*?)">(.*?)</a></td>', html, re.S)
    # chapter title -> relative chapter URL
    contents = dict(
        zip([lists[i][1] for i in range(len(lists))],
            [lists[i][0] for i in range(len(lists))]))
    # fetch chapters concurrently with 5 worker processes
    p = Pool(processes=5)
    for key in contents.keys():
        p.apply_async(get_article, args=(origin_url + contents[key], ))
    p.close()
    p.join()
Beispiel #56
0
def clean_cetc(institution, title):
    """Normalise a CETC institute name, converting Chinese numerals to digits.

    Python 2 code (uses dict.has_key and str.decode). Per the inline
    examples: '中国电子科技集团第十四研究所...' -> '...14所'. Returns the
    (partially) cleaned *institution* string; on any error the current
    institution value is returned unchanged.
    """
    new_name = ''
    if_no_title_name = ''
    flag = False            # True when *title* occurs in the institution name
    flag2 = False           # True when at least one digit/numeral was found
    index = 0               # count of consecutive Chinese numerals collected
    temp = ''               # digits collected for the current numeral run
    no_title_name_flag = False
    #set separator
    separator = '所'
    institution = institution.split(';')[0]
    if separator in institution:
        '''
		e.x.
		input:中国电子科技集团,第十四研究所,江苏,南京,210013
		output:电子科技集团14所
		'''
        institution = institution.strip().split(separator)[0].replace(
            ',', '').replace(',', '').replace(' ', '').split(';')[0]
        no_title_name_flag = True
    else:
        '''
		e.x.
		input:中国电子科技集团第十四研究
		output:电子科技集团14所
		'''
        institution = institution.strip().split(" ")[0].split(",")[0].split(
            ',')[0]
    print(institution)
    # Chinese numeral -> digit; '十' maps to ' ' as a tens placeholder that
    # the run-termination logic below expands (十四 -> 14, 二十 -> 20).
    n = {
        u'一': '1',
        u'二': '2',
        u'三': '3',
        u'四': '4',
        u'五': '5',
        u'六': '6',
        u'七': '7',
        u'八': '8',
        u'九': '9',
        u'十': ' '
    }
    if title in institution:
        new_name += title
        flag = True

    # re-append the separator so the numeral run is always terminated
    if flag or no_title_name_flag:
        institution = institution + separator
    try:
        for c in institution.decode(chardet.detect(institution)['encoding']):
            if n.has_key(c):
                temp += str(n[c])
                index += 1
                flag2 = True

            else:
                # 十四 => 14, 十 => 10, 二十 => 20
                if index != 0:
                    # leading '十' means an implicit leading 1; trailing '十'
                    # means an implicit trailing 0
                    if temp[0] == ' ':
                        temp = '1' + temp
                    if temp[-1] == ' ':
                        temp = temp + '0'
                    temp = temp.replace(' ', '')
                    new_name += temp
                    if_no_title_name += temp
                    #reset value
                    index = 0
                    temp = ''
                if_no_title_name += c
            # literal ASCII digits pass straight through
            if c <= '9' and c >= '0':
                new_name += str(c)
                flag2 = True

        if flag and flag2:
            return new_name + separator
        return if_no_title_name
    except Exception as e:
        # detection/decoding failed: return the partially cleaned input
        return institution
Beispiel #57
0
    def __parseCsvFile(self, handle):
        """
        Parse a CSV file. Does not reset the file handle to start.

        @arg handle: CSV file. Must be a seekable binary file object.
        @type handle: file object

        @return: list of lists
        @rtype: list
        """
        # sniff the encoding from the first BUFFER_SIZE bytes
        buf = handle.read(BUFFER_SIZE)
        result = chardet.detect(buf)
        handle.seek(0)

        # only trust chardet above 50% confidence; otherwise assume utf-8
        if result['confidence'] > 0.5:
            encoding = unicode(result['encoding'])
        else:
            encoding = 'utf-8'

        # Python 2.7 makes it extraordinarily hard to do this correctly. We
        # have a binary file object containing lines of text in a certain
        # encoding with unknown style of line-endings.
        #
        # We want to correctly decode the file contents, accept any style of
        # line-endings, parse the lines with the `csv` module, and return
        # unicode strings.
        #
        # 1. `codecs.getreader` does not have a universal newlines mode.
        # 2. `io.TextIOWrapper` cannot be wrapped around our file object,
        #    since it is required to be an `io.BufferedIOBase`, which it
        #    usually will not be.
        # 3. The `csv` module cannot read unicode.
        #
        # Ugh.
        #
        # So, we use a stream wrapper that consumes byte strings, decodes to
        # unicode, normalises newlines, and produces the result UTF-8 encoded.
        # That's what we feed the `csv` module. We decode what it gives back
        # to unicode strings. What a mess.
        handle = _UniversalNewlinesByteStreamIter(handle,
                                                  encoding=encoding,
                                                  buffer_size=BUFFER_SIZE)

        try:
            buf = handle.read(BUFFER_SIZE)
        except UnicodeDecodeError:
            self.__output.addMessage(
                __file__, 3, 'EBPARSE',
                'Could not decode file (using %s encoding).' % encoding)
            return None

        # Default dialect
        dialect = 'excel'

        # The idea is that for new-style batch input files we have only
        # one column and the sniffer cannot find a delimiter.

        try:
            # Todo: delimiters in config file
            dialect = csv.Sniffer().sniff(buf, delimiters="\t ;|,")
            dialect.skipinitialspace = True
        except csv.Error:
            # sniffing failure is expected for single-column files; keep
            # the default 'excel' dialect
            #self.__output.addMessage(__file__, 4, "EBPARSE", e)
            #return None
            pass
        #except

        #Watch out for : delimiter FIXME and for the . delimiter
#        if dialect.delimiter == ":":
#            dialect.delimiter = "\t"

        handle.seek(0)
        reader = csv.reader(handle, dialect)

        # decode the csv module's UTF-8 output back to unicode strings
        ret = []
        try:
            for i in reader:
                ret.append([c.decode('utf-8') for c in i])
        except UnicodeDecodeError:
            self.__output.addMessage(
                __file__, 3, 'EBPARSE',
                'Could not decode file (using %s encoding).' % encoding)
            return None

        return ret
Beispiel #58
0
from hanziconv import HanziConv
reload(sys)
sys.setdefaultencoding("utf-8")

countsig = 0
count1 = 0
count = 0
listnew = '/home/hongliang/Downloads/workspace/mirrorfunctions'
for root, dirs, files in os.walk(listnew):
    for fn in files:
        a = os.path.join(root, fn)

        raw = open(a)
        content = raw.read()
        raw.close
        if chardet.detect(content)['encoding'] == 'utf-8':
            count1 += 1
            print a, 'success'
        elif chardet.detect(content)['encoding'] == 'UTF-8-SIG':
            print a, 'sig success'
            countsig += 1
        else:
            print a
            content = content.decode('gbk').encode('utf-8')

        content = HanziConv.toSimplified(content)

        raw = open(a, 'w')
        raw.write(content)
        raw.close
# use natural language toolkit
import re
import numpy as np
from random import randint
import pandas as pd
import string
import chardet
# detect the CSV's encoding first so pandas can read it correctly
with open('data_for_spam.csv', 'rb') as f:
    result = chardet.detect(f.read())  # or readline if the file is large

dataset = pd.read_csv('data_for_spam.csv', encoding=result['encoding'])
x = dataset.iloc[:, 0]  # message text
y = dataset.iloc[:, 1]  # label
x = x.to_dict()

# normalise each message: lowercase, strip digits and punctuation
X = []
for d in range(len(x)):
    b = x[d].lower()
    sentence = re.sub(r'\d+', '', b)
    sentence = re.sub('[' + string.punctuation + ']', '', sentence)
    X.append(sentence)

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
count_vect = CountVectorizer()
a = count_vect.fit_transform(X)
# NOTE(review): the toarray() result is discarded — confirm this line is needed
a.toarray()
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
Beispiel #60
0
    def form_valid(self, form):
        """Import assets from an uploaded CSV file.

        Detects the file's encoding, maps the CSV header (verbose field
        names) back to Asset model field names, then creates or updates
        one Asset per data row. Returns a JSON response summarizing the
        created / updated / failed hostnames.
        """
        # Optional target node from the query string; defaults to the root node.
        node_id = self.request.GET.get("node_id")
        node = get_object_or_none(Node, id=node_id) if node_id else Node.root()
        f = form.cleaned_data['file']
        # Sniff the encoding from the raw bytes, then rewind so the
        # second read() starts from the beginning of the file.
        det_result = chardet.detect(f.read())
        f.seek(0)  # reset file seek index

        # Decode with the detected encoding and strip a leading UTF-8 BOM
        # so the first header cell is not polluted by it.
        file_data = f.read().decode(det_result['encoding']).strip(codecs.BOM_UTF8.decode())
        csv_file = StringIO(file_data)
        reader = csv.reader(csv_file)
        csv_data = [row for row in reader]
        # All importable Asset fields (everything except date_created).
        fields = [
            field for field in Asset._meta.fields
            if field.name not in [
                'date_created'
            ]
        ]
        # The header row holds verbose names; map each back to the model
        # field name. An unrecognized column yields None and aborts below.
        header_ = csv_data[0]
        mapping_reverse = {field.verbose_name: field.name for field in fields}
        attr = [mapping_reverse.get(n, None) for n in header_]
        if None in attr:
            data = {'valid': False,
                    'msg': 'Must be same format as '
                           'template or export file'}
            return self.render_json_response(data)

        created, updated, failed = [], [], []
        assets = []
        for row in csv_data[1:]:
            # Skip rows that are entirely empty cells.
            if set(row) == {''}:
                continue

            # Pair header field names with this row's cells, then coerce
            # the values that need non-string types.
            asset_dict_raw = dict(zip(attr, row))
            asset_dict = dict()
            for k, v in asset_dict_raw.items():
                v = v.strip()
                if k == 'is_active':
                    # NOTE(review): v is always a str here, so the `0`
                    # member of this list can never match — confirm intent.
                    v = False if v in ['False', 0, 'false'] else True
                elif k == 'admin_user':
                    v = get_object_or_none(AdminUser, name=v)
                elif k in ['port', 'cpu_count', 'cpu_cores']:
                    try:
                        v = int(v)
                    except ValueError:
                        # Non-numeric cell: fall through to '' so the
                        # key is dropped below instead of imported.
                        v = ''
                elif k == 'domain':
                    v = get_object_or_none(Domain, name=v)

                # Empty values are omitted entirely (no overwrite/clear).
                if v != '':
                    asset_dict[k] = v

            # Prefer lookup by explicit id; otherwise treat as a create.
            asset = None
            asset_id = asset_dict.pop('id', None)
            if asset_id:
                asset = get_object_or_none(Asset, id=asset_id)
            if not asset:
                try:
                    # Guard against duplicate hostnames before creating.
                    if len(Asset.objects.filter(hostname=asset_dict.get('hostname'))):
                        raise Exception(_('already exists'))
                    with transaction.atomic():
                        asset = Asset.objects.create(**asset_dict)
                        if node:
                            asset.nodes.set([node])
                        created.append(asset_dict['hostname'])
                        assets.append(asset)
                except Exception as e:
                    failed.append('%s: %s' % (asset_dict['hostname'], str(e)))
            else:
                # Update path: apply every non-empty value to the
                # existing asset, then save.
                for k, v in asset_dict.items():
                    if v != '':
                        setattr(asset, k, v)
                try:
                    asset.save()
                    updated.append(asset_dict['hostname'])
                except Exception as e:
                    failed.append('%s: %s' % (asset_dict['hostname'], str(e)))

        # Summary payload consumed by the upload front-end.
        data = {
            'created': created,
            'created_info': 'Created {}'.format(len(created)),
            'updated': updated,
            'updated_info': 'Updated {}'.format(len(updated)),
            'failed': failed,
            'failed_info': 'Failed {}'.format(len(failed)),
            'valid': True,
            'msg': 'Created: {}. Updated: {}, Error: {}'.format(
                len(created), len(updated), len(failed))
        }
        return self.render_json_response(data)