def validate_file(self, filename):
    """Detect the character encoding of *filename* and validate it.

    Feeds the file to chardet line by line and checks the detected
    encoding against the accepted list.

    Returns the upper-cased encoding name on success.
    Raises UploadCheckError when detection fails or the encoding is
    not accepted.
    """
    from chardet.universaldetector import UniversalDetector
    detector = UniversalDetector()
    detector.reset()
    with open(filename, "rb") as to_convert:
        for line in to_convert:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    encoding = detector.result.get("encoding")
    valid_encodings = ["ASCII", "ISO-8859-1", "UTF-8"]
    if encoding is None:
        message = "Please check that you are uploading a CSV file."
        # logger.error, not logger.exception: we are not inside an
        # exception handler, so there is no traceback to attach and
        # logger.exception would log a spurious "NoneType: None".
        self.logger.error(message)
        raise UploadCheckError(message)
    if encoding.upper() not in valid_encodings:
        message = "File encoding %s not valid. Valid encodings: %s" % (
            encoding, ", ".join(valid_encodings))
        self.logger.error(message)
        raise UploadCheckError(message)
    return encoding.upper()
def convert_encoding(student_path):
    """Re-encode C/C++ sources under <student_path>/__submit__ to UTF-8.

    Only files ending in .c/.cpp/.cc are touched. Files that are already
    UTF-8, or whose encoding cannot be detected at all, are left as-is
    (the original crashed on a None detection result).
    """
    temp_path = os.path.join(student_path, "__submit__")
    for file_name in os.listdir(temp_path):
        file_path = os.path.join(temp_path, file_name)
        # endswith accepts a tuple -- replaces the chained 'or' tests
        if not file_name.lower().endswith(('.c', '.cpp', '.cc')):
            continue
        detector = UniversalDetector()
        detector.reset()
        with open(file_path, 'rb') as fin:
            # iterate the file directly instead of materialising all
            # lines with readlines()
            for line in fin:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        source_encoding = detector.result['encoding']
        target_encoding = 'utf-8'
        # Compare case-insensitively: chardet may report 'UTF-8'
        # with varying case. Skip undetectable files (None).
        if source_encoding is None or source_encoding.lower() == target_encoding:
            continue
        with codecs.open(file_path, 'r', source_encoding) as fin:
            data = fin.read()
        if not data:
            print(file_path)
            continue
        with codecs.open(file_path, 'w', target_encoding) as fout:
            fout.write(data)
def switch_char(filenames):
    """Convert each file in *filenames* to UTF-8 in place.

    Returns True when every file was processed, False if any file
    failed (the exception is printed, keeping the original best-effort
    behaviour).
    """
    detector = UniversalDetector()
    try:
        for filename in filenames:
            detector.reset()
            # Context manager closes the handle even when detection
            # finishes early -- the original leaked it.
            with open(filename, 'rb') as fh:
                for line in fh:
                    detector.feed(line)
                    if detector.done:
                        break
            detector.close()
            file_encod = detector.result['encoding']
            if file_encod is None:
                # Coerce BEFORE printing: concatenating None raised a
                # TypeError in the original, aborting the whole run for
                # any undetectable file.
                file_encod = 'utf-8'
            print(filename + '的文件编码是:' + file_encod)
            if file_encod == 'ascii':
                pass
            elif file_encod != 'utf-8':
                with open(filename, 'rb') as fr:
                    lines = fr.readlines()
                with open(filename, 'w', encoding='utf-8') as fw:
                    for lineb in lines:
                        if lineb is not None:  # 'is not', not '!='
                            linestr = lineb.decode(encoding=file_encod)
                            fw.write(linestr)
    except Exception as e:
        print(e)
        return False
    return True
def determine_encoding(file_path):
    """Guess a file's encoding from at most its first three lines.

    Based on:
    https://stackoverflow.com/questions/46037058/using-chardet-to-find-encoding-of-very-large-file

    Returns the detected encoding name, or 'utf-8' when detection does
    not converge within three lines (or the file is empty).
    """
    detected = 'utf-8'
    detector = UniversalDetector()
    detector.reset()
    with open(file_path, 'rb') as stream:
        line_no = 0
        for row in stream:
            line_no += 1
            detector.feed(row)
            if detector.done:
                detected = detector.result['encoding']
                break
            if line_no == 3:
                break
    detector.close()
    return detected
def detect_character_code(file_list):
    """Detect the character encoding of every file in *file_list*.

    :param file_list: iterable of file paths to examine
    :return: dict mapping file path -> detected encoding name
    """
    encodings_by_file = {}
    detector = UniversalDetector()
    for path in file_list:
        detector.reset()
        with open(path, 'rb') as f_in:
            for chunk in f_in.readlines():
                detector.feed(chunk)
                if detector.done:
                    break
        detector.close()
        encodings_by_file[path] = detector.result['encoding']
    return encodings_by_file
def convertToUTF8(filename, out_enc='utf-8'):
    """Convert *filename* in place to *out_enc* (default UTF-8).

    The source encoding is guessed with chardet from the first 1 KiB.
    Files that are already UTF-8, or whose encoding cannot be detected,
    are left untouched.
    """
    in_enc = 'unknown'  # keeps the error message printable if open() fails
    try:
        # 'with' replaces the original's manual open/close pairs and the
        # stray finally that could close an unbound/already-closed handle.
        with open(filename, 'rb') as f:
            sample = b' ' + f.read(1024)
            u = UniversalDetector()
            u.reset()
            u.feed(sample)
            u.close()
            f.seek(0)
            raw = f.read()
        detected = u.result['encoding']
        if detected is None:
            # Undetectable content: decoding with None raised TypeError
            # in the original.
            return
        in_enc = detected
        if 'utf-8' != in_enc:
            new_content = raw.decode(in_enc, 'ignore')
            with open(filename, 'w', encoding=out_enc) as fout:
                fout.write(new_content)
        #print('Success:' + filename + ' converted from ' + in_enc + ' to ' + out_enc)
    except IOError:
        print('Error:' + filename + ' failed to convert from ' + in_enc + ' to ' + out_enc)
def read_email(email_path):
    """Parse an e-mail file into a dict of headers plus derived fields.

    Returns a dict containing the raw message text ('original_msg'),
    the payload ('payload'), text extracted from the payload ('text'),
    every header as a key/value pair, and -- when parseable -- the
    'Date' header normalised to ISO format.
    """
    detector = UniversalDetector()
    result = {}
    with open(email_path, 'rb') as fp:
        msg = email.message_from_binary_file(fp, policy=default)
    try:
        # First try the platform default encoding; fall back to chardet
        # only when that fails to decode.
        with open(email_path) as f:
            original = f.read()
    except UnicodeDecodeError:
        detector.reset()
        with open(email_path, 'rb') as f:
            for line in f.readlines():
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        encoding = detector.result['encoding']
        with open(email_path, encoding=encoding) as f:
            original = f.read()
    result['original_msg'] = original
    result['payload'] = msg.get_payload()
    # parse_html_payload is a project helper; presumably converts an
    # HTML payload to plain text -- TODO confirm against its definition.
    result['text'] = parse_html_payload(result['payload'])
    try:
        for key, value in msg.items():
            result[key] = value
    except Exception as e:
        logger.error('Problem parsing email: {}\n{}'.format(email_path, e))
    try:
        result['Date'] = dateparser.parse(result['Date']).isoformat()
    except Exception as e:
        # NOTE(review): this logs result.get('date') (lower case) while
        # the header key set above is 'Date' -- likely always logs None;
        # confirm before relying on this message.
        logger.error('Problem converting date: {}\n{}'.format(
            result.get('date'), e))
    return result
def Convert_Auto(filename, out_enc="utf-8"):
    """Re-encode a text file in place, auto-detecting its current encoding.

    Requires the chardet library.

    Input Parameter:
        filename: full path and file name, e.g. c:/dir1/file.txt
        out_enc: target encoding. Default as 'utf-8'
    Output Parameter:
        None
    """
    in_enc = 'unknown'  # defined up-front so the except-path print never
                        # hits an unbound name (original bug when open fails)
    try:
        with open(filename, 'rb') as f:
            b = b' '
            b += f.read(1024)
            u = UniversalDetector()
            u.reset()
            u.feed(b)
            u.close()
            f.seek(0)
            b = f.read()
        detected = u.result['encoding']
        if detected is None:
            # Undetectable content: decoding with None would raise TypeError.
            print("Error: " + filename + " FAIL to converted from " + in_enc +
                  " to " + out_enc + " !")
            return
        in_enc = detected
        new_content = b.decode(in_enc, 'ignore')
        with open(filename, 'w', encoding=out_enc) as f:
            f.write(new_content)
        print("Success: " + filename + " converted from " + in_enc + " to " +
              out_enc + " !")
    except IOError:
        print("Error: " + filename + " FAIL to converted from " + in_enc +
              " to " + out_enc + " !")
class Encoding:
    """Thin reusable wrapper around chardet's UniversalDetector."""

    def __init__(self):
        self.detector = UniversalDetector()

    def detect(self, data):
        """Return chardet's result dict for the given byte buffer."""
        det = self.detector
        det.reset()
        det.feed(data)
        det.close()
        return det.result
def detect_charset(filename):
    """Detect the charset of *filename*; returns chardet's result dict."""
    detector = UniversalDetector()
    detector.reset()
    # Context manager closes the file even on early exit -- the original
    # opened the file inline and never closed the handle.
    with open(filename, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result
def detectcharset(filename):
    """Detect the charset of *filename*; returns chardet's result dict."""
    detector = UniversalDetector()
    detector.reset()
    # open() replaces the Python-2-only file() builtin (NameError on
    # Python 3), and the context manager closes the leaked handle.
    with open(filename, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result
def get_charset(fp):
    """Detect the charset of the file at path *fp*; returns chardet's result dict."""
    detector = UniversalDetector()
    detector.reset()
    # open() replaces the Python-2-only file() builtin (NameError on
    # Python 3), and the context manager closes the leaked handle.
    with open(fp, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result
class Encoding:
    """Stateful encoding detector built on chardet."""

    def __init__(self):
        # one detector instance reused across detect() calls
        self.detector = UniversalDetector()

    def detect(self, data):
        """Feed *data* to chardet and return its detection result."""
        self.detector.reset()
        self.detector.feed(data)
        self.detector.close()
        return self.detector.result
def get_encoding_type(current_file):
    """Detect and return the encoding of *current_file*.

    Also prints '<basename>: <encoding>' for logging. Returns None when
    chardet cannot determine an encoding.
    """
    from chardet.universaldetector import UniversalDetector
    detector = UniversalDetector()
    detector.reset()
    # 'with' closes the handle the original leaked
    with open(current_file, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    encoding = detector.result['encoding']
    # str() guards against None: concatenating None crashed the original.
    print(current_file.split('\\')[-1] + ": " + str(encoding))
    return encoding
def check_chart(path):
    """Return the detected encoding name for the file at *path*."""
    detector = UniversalDetector()
    detector.reset()
    # 'with' closes the handle the original leaked
    with open(path, 'rb') as fh:
        for each in fh:
            detector.feed(each)
            if detector.done:
                break
    detector.close()
    fileencoding = detector.result['encoding']
    return fileencoding
def detect_encoding(file_path):
    """Detect the encoding of *file_path*; returns chardet's result dict."""
    detector = UniversalDetector()
    detector.reset()
    with open(file_path, mode='rb') as stream:
        for raw_line in stream:
            detector.feed(raw_line)
            # chardet flags .done once its confidence is high enough
            if detector.done:
                break
    detector.close()
    return detector.result
def determine_encoding(filename):
    """Detect the encoding of '<filename>.txt'.

    Note that the '.txt' suffix is appended to the supplied name.
    Returns chardet's full result dict (encoding + confidence).
    """
    detector = UniversalDetector()
    detector.reset()
    with open(filename + '.txt', 'rb') as infile:
        for chunk in infile:
            detector.feed(chunk)
            # detection process ends automatically when confidence is
            # high enough
            if detector.done:
                break
    detector.close()
    return detector.result
def detect_encoding():
    """Return the detected encoding name of CONFIG_FILE."""
    detector = UniversalDetector()
    detector.reset()
    with open(CONFIG_FILE, 'rb') as f:
        for raw in f.readlines():
            detector.feed(raw)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']
def detect_encode(file):
    """Detect and return the encoding of *file* (None when undetectable)."""
    from chardet.universaldetector import UniversalDetector
    detector = UniversalDetector()
    detector.reset()
    with open(file, 'rb') as handle:
        for chunk in handle:
            detector.feed(chunk)
            if detector.done:
                break
    detector.close()
    return detector.result.get('encoding')
def getFileEncoding(file_name):
    """Return chardet's detection result dict for *file_name*."""
    detector = UniversalDetector()
    detector.reset()
    with open(file_name, 'rb') as stream:
        for raw in stream:
            detector.feed(raw)
            if detector.done:
                break
    detector.close()
    return detector.result
def get_coding(filename):
    """Return the detected encoding name of *filename*."""
    detector = UniversalDetector()
    detector.reset()
    # 'with' closes the handle the original leaked
    with open(filename, "rb") as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    result = detector.result
    return result["encoding"]
def get_encoding(file_path):
    """Return the detected encoding of *file_path*.

    Only a fully-confident detection (confidence == 1.0) is trusted;
    anything else yields the literal string 'Unknown encoding'.
    """
    detector = UniversalDetector()
    detector.reset()
    # 'with' closes the handle the original leaked
    with open(file_path, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    if detector.result['confidence'] == 1.0:
        return str(detector.result['encoding'])
    else:
        return 'Unknown encoding'
def get_encoding_type(current_file: str):
    """Detect and return the encoding of *current_file*.

    Prints progress messages; the returned encoding name may be None
    when detection fails.
    """
    print("Determining file encoding: [%s]" % current_file, 'info')
    detector = UniversalDetector()
    detector.reset()
    # 'with' closes the handle the original leaked
    with open(current_file, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    encoding = detector.result['encoding']
    # str() guards against None: concatenating None crashed the original.
    print(current_file.split('\\')[-1] + ": " + str(encoding))
    return encoding
def detectfiledir(filedir, separator):
    """Print the detected encoding of every file matching glob *filedir*.

    *separator* is accepted for interface compatibility but unused.
    """
    detector = UniversalDetector()
    for filename in glob.glob(filedir):
        print(filename + "=", )
        detector.reset()
        # 'with' closes each handle -- the original leaked one per file
        with open(filename, 'rb') as fh:
            for line in fh:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        print(detector.result.get('encoding'))
def detect_encoding(file_path):
    """Return chardet's detection result dict for *file_path*."""
    detector = UniversalDetector()
    detector.reset()
    with open(file_path, mode='rb') as source:
        for raw_line in source:
            detector.feed(raw_line)
            if detector.done:
                break
    detector.close()
    return detector.result
def autoDetectEncoding(txtFilePath):
    # Detect the encoding of txtFilePath with chardet while showing a
    # progress bar. Python 2 code (print statements, time.clock).
    # Strategy: feed lines until chardet is confident or a wall-clock
    # budget expires; if detection is still unconfident and the file has
    # more content, retry from the start with a larger budget. The final
    # result is appended to ./output/detecEncodingMethod.txt and returned.
    with open(txtFilePath) as f:
        detector = UniversalDetector()
        whole_time = 300  # seconds allowed for one detection pass
        while True:
            counter = 0
            print '=' * 20 + "Auto detect encoding method!" + '=' * 20
            #file_size = os.stat(txtFilePath).st_size
            #print file_size
            read_size = 0
            print "file:" + txtFilePath
            bar = ProgressBar(
                widgets=[Bar('=', '[', ']'), ' ', Percentage(), ' ', ETA()],
                maxval=1).start()
            start_time = clock()
            # NOTE(review): opened in text mode, so chardet receives str
            # lines decoded by the platform default -- works on Python 2
            # only; confirm the interpreter before porting.
            for line in f.readlines():
                detector.feed(line)
                #read_size += sys.getsizeof(line)
                spend_time = clock() - start_time
                bar.update(int(spend_time) / whole_time)
                if spend_time > whole_time:
                    # budget exhausted for this pass
                    break
                #if counter%1000 == 0:
                #    print "lineNum:" + str(counter)
                #counter += 1
                if detector.done:
                    break
            print '=' * 20 + "Detecting done!" + '=' * 20
            #print float(detector.result['confidence'])
            #print detector.done
            #detector.close()
            if not detector.done:
                # more data left in the file?
                if f.readline():
                    detector.close()
                    if float(detector.result['confidence']) < 0.9:
                        print detector.result
                        print "Get more confidence!"
                        # retry from the start with a larger time budget
                        detector.reset()
                        f.seek(0)
                        whole_time += 10
                    else:
                        break
                else:
                    break
            else:
                break
        f.close()
        # append the final detection result to the log file
        with open('./output/detecEncodingMethod.txt', 'a') as f1:
            f1.write('\n' + txtFilePath + '\n' + str(detector.result))
        return detector.result
def __init__(self, f, encoding=None, **kwargs):
    """Wrap *f* in a csv.reader, guessing the text encoding when not given.

    A chunk of self.chunk_size bytes is fed to chardet; an ASCII result
    (or a failed detection) is promoted to UTF-8, which is a strict
    superset. The stream is rewound afterwards.
    """
    if not encoding:
        sniffer = UniversalDetector()
        sniffer.reset()
        sniffer.feed(f.read(self.chunk_size))
        sniffer.close()
        guessed = sniffer.result['encoding']
        if guessed and guessed != 'ascii':
            encoding = guessed
        else:
            encoding = 'utf-8'
        f.seek(0)
    self.csv_reader = csv.reader(f, **kwargs)
    self.encoding = encoding
def detect_encoding(file_path):
    """Feed *file_path* line by line to chardet and return its result dict."""
    detector = UniversalDetector()
    detector.reset()
    with open(file_path, mode="rb") as binary_file:
        for data in binary_file:
            detector.feed(data)
            if detector.done:
                break
    detector.close()
    return detector.result
def get_encoding_type(pathname):
    """Detect the encoding of the file at *pathname*.

    Returns the encoding name, or the literal string 'FileNotFoundError'
    when the file does not exist (legacy sentinel kept for callers).
    """
    detector = UniversalDetector()
    detector.reset()
    try:
        with open(pathname, 'rb') as fp:
            for raw in fp:
                detector.feed(raw)
                if detector.done:
                    break
    except FileNotFoundError:
        return 'FileNotFoundError'
    detector.close()
    return detector.result['encoding']
def detectEncodeOfMFiles(filenames):
    """Map each filename in *filenames* to its detected encoding name."""
    from chardet.universaldetector import UniversalDetector
    m_files_encodes = dict()
    detector = UniversalDetector()
    for filename in filenames:
        detector.reset()
        # 'with' closes each handle -- the original leaked one per file
        with open(filename, 'rb') as fh:
            for line in fh:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        m_files_encodes[filename] = detector.result['encoding']
    return m_files_encodes
def get_file_encoding(filename):
    """Autodetect the encoding of the given file.

    Returns chardet's result dict. At most 2000 lines are examined.
    """
    detector = UniversalDetector()
    detector.reset()
    i = 0
    # open() replaces the Python-2-only file() builtin (NameError on
    # Python 3); the context manager closes the leaked handle.
    with open(filename, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            i += 1
            # done or hit max lines (it takes ages to read a large file)
            if detector.done or i >= 2000:
                break
    detector.close()
    return detector.result
def detect_encoding(self):
    """Detect the encoding of self.file_des, caching consumed lines.

    Every line read from the stream is appended to self.cache so the
    caller can replay it after detection. Returns chardet's result dict.
    """
    detector = UniversalDetector()
    detector.reset()
    for raw in self.file_des:
        detector.feed(raw)
        self.cache.append(raw)
        if detector.done:
            break
    detector.close()
    return detector.result
def detect_encoding(self):
    """Run chardet over self.file_des; consumed lines go to self.cache."""
    det = UniversalDetector()
    det.reset()
    for line in self.file_des:
        det.feed(line)
        # remember the line: detection consumes the underlying stream
        self.cache.append(line)
        if det.done:
            break
    det.close()
    return det.result
def detect_encoding(filename):
    """Return the detected encoding name of '<filename>.txt'."""
    # Detector object for encoding detection
    detector = UniversalDetector()
    detector.reset()
    # Note: the '.txt' extension is appended to the supplied name.
    with open(filename + '.txt', 'rb') as infile:
        for raw in infile:
            detector.feed(raw)
            if detector.done:
                # detection ends automatically once confidence is high
                break
    detector.close()
    return detector.result['encoding']
def detectEncodeOfMFiles(filenames):
    """Return {filename: detected encoding name} for every given file."""
    from chardet.universaldetector import UniversalDetector
    m_files_encodes = dict()
    detector = UniversalDetector()
    for filename in filenames:
        detector.reset()
        # context manager fixes the per-file handle leak of the original
        with open(filename, 'rb') as source:
            for line in source:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        m_files_encodes[filename] = detector.result['encoding']
    return m_files_encodes
def get_encoding(filename):
    """Return a normalised lower-case encoding name for *filename*.

    Names in the UTF family get '-' replaced by '_' (e.g. 'utf_8').
    Returns None when chardet cannot detect an encoding.
    """
    detector = UniversalDetector()
    detector.reset()
    # 'with' closes the handle the original leaked
    with open(filename, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    if detector.result['encoding']:
        temp_encoding = detector.result['encoding'].lower()
        if "utf" in temp_encoding:
            temp_encoding = temp_encoding.replace("-", "_")
        return temp_encoding
    else:
        return None
def detcect_encoding_v2(filepath):
    """Return (encoding_name, confidence_percent) for *filepath*.

    Undetectable files yield ('unknown', 99.0). The misspelled function
    name is kept because callers depend on it.
    """
    detector = UniversalDetector()
    detector.reset()
    # 'with' closes the handle the original leaked
    with open(filepath, 'rb') as fh:
        for each in fh:
            detector.feed(each)
            if detector.done:
                break
    detector.close()
    fileencoding = detector.result['encoding']
    confidence = detector.result['confidence']
    if fileencoding is None:
        fileencoding = 'unknown'
        confidence = 0.99
    return fileencoding, confidence * 100
class TxtEncoding:
    """Reusable chardet-based file encoding detector."""

    def __init__(self):
        # inspired by https://chardet.readthedocs.org/en/latest/usage.html#example-detecting-encodings-of-multiple-files
        self.detector = UniversalDetector()

    def detectEncoding(self, fname):
        '''Detect the encoding of file fname.
        Returns a dictionary with {'encoding', 'confidence'} fields.'''
        det = self.detector
        det.reset()
        with open(fname, 'rb') as stream:
            for raw in stream:
                det.feed(raw)
                if det.done:
                    break
        det.close()
        return det.result
def get_file_encoding(filename):
    """Autodetect the encoding of the given file.

    Returns chardet's result dict, examining at most 2000 lines.
    NOTE: for a missing file this returns the *string* "ascii" rather
    than a result dict -- kept for backward compatibility; callers must
    handle both shapes.
    """
    detector = UniversalDetector()
    detector.reset()
    i = 0
    if not os.path.exists(os.path.expanduser(filename)):
        return "ascii"
    # open() replaces the Python-2-only file() builtin (NameError on
    # Python 3); the context manager closes the leaked handle.
    with open(filename, "rb") as fh:
        for line in fh:
            detector.feed(line)
            i += 1
            # done or hit max lines (it takes ages to read a large file)
            if detector.done or i >= 2000:
                break
    detector.close()
    return detector.result
def is_utf8(fi):
    '''try to detect if a file is utf_8 using chardet'''
    # Open in BINARY mode: chardet's feed() expects raw bytes. The
    # original opened in text mode, which crashes on Python 3 and hides
    # the raw byte stream from the detector.
    detector = UniversalDetector()
    detector.reset()
    with open(fi, 'rb') as ff:
        for line in ff:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    if detector.result['encoding'] == 'utf-8':
        return True
    else:
        # legacy behaviour: falsy None (not False) when not UTF-8
        return None
class Encoding(object):
    """chardet wrapper with an optional non-raising detect()."""

    def __init__(self):
        self.detector = UniversalDetector()

    def _detect(self, data):
        # Core detection: one-shot feed of the whole buffer.
        self.detector.reset()
        self.detector.feed(data)
        self.detector.close()
        return self.detector.result

    def detect(self, data, safe=False):
        """Detect the encoding of *data*.

        When *safe* is true, detection errors yield None instead of
        propagating.
        """
        try:
            return self._detect(data)
        except Exception:
            # was a bare 'except:', which in safe mode also swallowed
            # SystemExit and KeyboardInterrupt
            if safe:
                return None
            raise
def set_source(self, name): import os # source _dependent_ initialization goes here if name is None or not os.path.isfile(name): return False self.__source_name = name # auto-detect file-encoding (optional) try: from chardet.universaldetector import UniversalDetector detector = UniversalDetector() detector.reset() lines = 0 for line in file(self.__source_name, 'rb'): detector.feed(line) lines += 1 if detector.done or lines == 50: break detector.close() encoding = string.replace( string.lower( detector.result['encoding'] ), '-', '' ) except: encoding = 'utf_8' # remove - and _ for better detection encoding = string.replace( encoding, '_', '' ) model = self.gtk.get_widget('e_encoding').get_model() itempos = 0 for item in model: pos1 = string.find( string.replace( string.lower(str(item[0])), '_', '' ) , encoding ) if pos1 == 0: break itempos += 1 self.gtk.get_widget('e_encoding').set_active(itempos) # run dialog response = self.gtk.get_widget('d_import').run() if response == gtk.RESPONSE_OK: return True else: return False
def set_source(self, name):
    """Initialise the CSV importer for source file *name*.

    Best-effort auto-detection of the file encoding (preselected in the
    'e_encoding' combo) and of the CSV dialect -- delimiter, quote char,
    line terminator and header row -- via csv.Sniffer, then runs the
    import dialog. Returns True when the user confirms, False otherwise.
    Python 2 / PyGTK code (uses the 'file' builtin and 'string' module).
    """
    # source _dependent_ initialization goes here
    if name is None or not os.path.isfile(name):
        return False
    self.__source_name = name
    # auto-detect file-encoding (optional)
    try:
        from chardet.universaldetector import UniversalDetector
        detector = UniversalDetector()
        detector.reset()
        lines = 0
        # sample at most 50 lines
        for line in file(self.__source_name, 'rb'):
            detector.feed(line)
            lines += 1
            if detector.done or lines == 50:
                break
        detector.close()
        encoding = string.lower(detector.result['encoding'])
    except:
        log.exception('')
        encoding = 'utf_8'
    # strip separator characters before matching combo entries
    # (self._encoding_cleanup is presumably a compiled regex -- confirm)
    encoding = self._encoding_cleanup.sub('', encoding)
    model = self.gtk.get_widget('e_encoding').get_model()
    itempos = 0
    # select the first entry whose cleaned name starts with the
    # detected encoding
    for item in model:
        pos1 = string.find(
            self._encoding_cleanup.sub('', string.lower(str(item[0]))),
            encoding)
        if pos1 == 0:
            break
        itempos += 1
    self.gtk.get_widget('e_encoding').set_active(itempos)
    # auto-detect CSV import settings (optional)
    try:
        import csv
        sniffer = csv.Sniffer()
        csvfilesize = os.path.getsize(self.__source_name)
        # sniff at most 64 KiB
        if csvfilesize > 65535:
            csvfilesize = 65535
        csvfile = file(self.__source_name, 'rb')
        try:
            # quote char, line terminator and field delimiter
            proposed_dialect = sniffer.sniff(csvfile.read(csvfilesize))
            self.gtk.get_widget('e_delimiter').set_text(proposed_dialect.delimiter)
            self.gtk.get_widget('e_quotechar').set_text(proposed_dialect.quotechar)
            if proposed_dialect.lineterminator == '\r\n':
                self.gtk.get_widget('e_lineterminator').set_active(1)
            # first row with column headers
            csvfile.seek(0)
            if sniffer.has_header(csvfile.read(csvfilesize)):
                self.gtk.get_widget('e_startrow').set_text('1')
            else:
                self.gtk.get_widget('e_startrow').set_text('0')
        finally:
            csvfile.close()
    except:
        log.exception('')
    # run dialog
    response = self.gtk.get_widget('d_import').run()
    if response == gtk.RESPONSE_OK:
        return True
    else:
        return False
class FileOpener:
    """Open text files, detecting the encoding with chardet when enabled.

    With use_chardet=True the encoding is guessed per file; otherwise a
    module-level list of candidate 'encodings' is tried in order.
    """

    def __init__(self, use_chardet):
        # whether open() uses chardet-based detection
        self.use_chardet = use_chardet
        if use_chardet:
            self.init_chardet()

    def init_chardet(self):
        """Import chardet lazily and create the shared detector."""
        try:
            from chardet.universaldetector import UniversalDetector
        except ImportError:
            raise Exception("There's no chardet installed to import from. "
                            "Please, install it and check your PYTHONPATH "
                            "environment variable")
        self.encdetector = UniversalDetector()

    def open(self, filename):
        """Return (lines, encoding) for *filename* using the configured strategy."""
        if self.use_chardet:
            return self.open_with_chardet(filename)
        else:
            return self.open_with_internal(filename)

    def open_with_chardet(self, filename):
        """Detect the encoding with chardet, then read all lines with it."""
        self.encdetector.reset()
        with open(filename, 'rb') as f:
            for line in f:
                self.encdetector.feed(line)
                if self.encdetector.done:
                    break
        self.encdetector.close()
        encoding = self.encdetector.result['encoding']
        # NOTE(review): if the open() below raises before binding a new
        # 'f', the finally closes the already-closed binary handle from
        # the with-block above -- harmless double close, but worth
        # confirming intent.
        try:
            f = open(filename, encoding=encoding)
            lines = f.readlines()
        except UnicodeDecodeError:
            print('ERROR: Could not detect encoding: %s' % filename,
                  file=sys.stderr)
            raise
        except LookupError:
            print('ERROR: %s -- Don\'t know how to handle encoding %s' % (
                filename, encoding), file=sys.stderr)
            raise
        finally:
            f.close()
        return lines, encoding

    def open_with_internal(self, filename):
        """Try each entry of the module-level 'encodings' list in order."""
        curr = 0
        global encodings
        while True:
            try:
                f = open(filename, 'r', encoding=encodings[curr])
                lines = f.readlines()
                break
            except UnicodeDecodeError:
                # quiet_level / QuietLevels are module globals -- this
                # method only works inside its original module
                if not quiet_level & QuietLevels.ENCODING:
                    print('WARNING: Decoding file %s' % filename,
                          file=sys.stderr)
                    print('WARNING: using encoding=%s failed. '
                          % encodings[curr], file=sys.stderr)
                    # NOTE(review): prints the *current* (failed)
                    # encoding as the "next" one; curr is only advanced
                    # on the line below.
                    print('WARNING: Trying next encoding: %s'
                          % encodings[curr], file=sys.stderr)
                curr += 1
            finally:
                f.close()
        if not lines:
            print('ERROR: Could not detect encoding: %s' % filename,
                  file=sys.stderr)
            raise Exception('Unknown encoding')
        encoding = encodings[curr]
        return lines, encoding
'https://raw.githubusercontent.com/BYVoid/uchardet/master/test/zh/big5.txt', 'https://raw.githubusercontent.com/BYVoid/uchardet/master/test/zh/euc-tw.txt', 'https://raw.githubusercontent.com/BYVoid/uchardet/master/test/zh/gb18030.txt', 'https://raw.githubusercontent.com/BYVoid/uchardet/master/test/zh/utf-8.txt' ] detector = UniversalDetector() for url in urls : purl = os.path.basename (url) print ("URL %-20s : " % purl, end=""), try : usock = urlopen(url) detector.reset (); for line in usock.readlines(): detector.feed(line) if detector.done: break detector.close() usock.close() # det member => encoding(string), confidence(.2f) print ("encoding: %-15s, confidence: %.2f" % (detector.result['encoding'], detector.result['confidence'])) except HTTPError as e : print (e) sys.exit (0) #
#!/usr/bin/env python #coding=utf-8 ''' chardet #检测字符编码 glob ''' from chardet.universaldetector import UniversalDetector as UD detect=UD() for filename in glob.glob('*.py'): print filename.ljust(60) detect.reset() for line in file(filename,'rb'): detect.feed(line) if detect.done: break detect.close() print detect.result #---------------------------------------------------------------------- ''' epydoc @author: 作者 @license: 版权 @contact: 联系 @todo: 改进 @version: 版本 @var v: 模块变量V说明 @type v: 模块变量类型V说明 @param p: 参数P说明 @type v: 参数P类型说明
class EncodingGrep(object): def __init__(self): self.__options = self.parseCmd() try: from chardet.universaldetector import UniversalDetector self._detector = UniversalDetector() except: self._detector = None self._pat = None self.action() def action(self): # Todo List: # -r -n -H -e --exclude-dir --exclude --include --encoding if not self._detector: if self.__options.guess: print('Warning: Please run "pip install chardet" to enable coding detection.') self.__options.guess = False # construct search pattern patlist = [] if self.__options.file: # read patterns from file with open(self.__options.file, 'rb') as fh: data = fh.read() if self._detector: self._detector.reset() self._detector.feed(data) self._detector.close() data = data.decode( self._detector.result['encoding'] or 'UTF-8', errors='ignore') else: data = data.decode('UTF-8', errors='ignore') patlist = [x for x in data.splitlines() if len(x) > 0] else: patlist = [x for x in self.__options.regexp if len(x) > 0] if self.__options.word_regexp: patlist = [r'\b' + x + r'\b' for x in patlist] elif self.__options.line_regexp: patlist = [r'^' + x + r'$' for x in patlist] pattxt = '|'.join(patlist) if self.__options.ignore_case: self._pat = re.compile(pattxt.encode('utf-8'), re.I) else: self._pat = re.compile(pattxt.encode('utf-8')) # search for item in self.__options.glob: self.grepDir(os.path.abspath(item)) def grepDir(self, path): if os.path.exists(path): for item in os.listdir(path): fullpath = os.path.join(path, item) if os.path.isdir(fullpath): if self.__options.recursive: for subitem in self.__options.exclude_dir: if fnmatch.fnmatch(item, subitem): break else: self.grepDir(fullpath) else: for subitem in self.__options.exclude: if fnmatch.fnmatch(item, subitem): break else: for subitem in self.__options.include: if fnmatch.fnmatch(item, subitem): self.grepFile(fullpath) break def grepFile(self, path): # read file with correct encoding if self.__options.guess or len(self.__options.encoding) > 0: with open(path, 
'rb') as fh: data = fh.read() if self.__options.guess: self._detector.reset() self._detector.feed(data) self._detector.close() data = data.decode( self._detector.result['encoding'] or 'UTF-8', errors='ignore') else: for testcode in self.__options.encoding: try: data.decode(testcode) data = data.decode(testcode, errors='ignore') break except: pass else: data = data.decode('utf-8', errors='ignore') else: with open(path, 'r', errors='ignore') as fh: data = fh.read() # save hit line index data = data.splitlines() matchlist = [] for idx, line in enumerate(data): if self._pat.search(line.encode('utf-8')): matchlist.append(idx) # invert matchlist when --invert_match is set if self.__options.invert_match: matchlist = [x for x in range(len(data)) if x not in matchlist] # output result for item in matchlist: outline = '{path}:{line}:{code}'.format( path=path, line=item + 1, code=data[item]) if self.__options.stdout: outline = outline.encode(sys.stdout.encoding, 'ignore').decode( sys.stdout.encoding, 'ignore') print(outline) def parseCmd(self): parser = ArgumentParser(add_help=False) parser.add_argument('glob', nargs='*', help='File(s)/Dir(s)') # Generic Program Information parser.add_argument( '--help', dest='help', action='store_true', default=False, help= 'Print a usage message briefly summarizing the command-line options and the bug-reporting address, then exit.' ) parser.add_argument( '-V', '--version', dest='version', action='store_true', default=False, help= 'Print the version number of eg to the standard output stream.') # Matching Control parser.add_argument( '-e', '--regexp', dest='regexp', action='append', default=[], help='Search patterns') parser.add_argument( '-f', '--file', dest='file', action='store', help= 'Obtain patterns from file, one per line. The empty file contains zero patterns, and therefore matches nothing.' 
) parser.add_argument( '-i', '--ignore-case', dest='ignore_case', action='store_true', default=False, help= 'Ignore case distinctions in both the patterns and the input files.' ) parser.add_argument( '-v', '--invert-match', dest='invert_match', action='store_true', default=False, help='Invert the sense of matching, to select non-matching lines.') parser.add_argument( '-w', '--word-regexp', dest='word_regexp', action='store_true', default=False, help= 'Select only those lines containing matches that form whole words.') parser.add_argument( '-x', '--line-regexp', dest='line_regexp', action='store_true', default=False, help= 'Select only those matches that exactly match the whole line.') # General Output Control parser.add_argument( '-c', '--count', dest='count', action='store_true', default=False, help= 'Suppress normal output; instead print a count of matching lines for each input file. With the ‘-v’, ‘--invert-match’ option, count non-matching lines.' ) parser.add_argument( '--color', dest='color', action='store', default='never', help= 'Surround the matched (non-empty) strings, matching lines, context lines, file names, line numbers, byte offsets, and separators (for fields and groups of context lines) with escape sequences to display them in color on the terminal. The colors are defined by the environment variable GREP_COLORS and default to ‘ms=01;31:mc=01;31:sl=:cx=:fn=35:ln=32:bn=32:se=36’ for bold red matched text, magenta file names, green line numbers, green byte offsets, cyan separators, and default terminal colors otherwise. COLOR is ‘never’, ‘always’, or ‘auto’.' ) parser.add_argument( '-L', '--files-without-match', dest='files_without_match', action='store_true', default=False, help= 'Suppress normal output; instead print the name of each input file from which no output would normally have been printed. The scanning of every file will stop on the first match.' 
) parser.add_argument( '-l', '--files-with-matches', dest='files_with_matches', action='store_true', default=False, help= 'Suppress normal output; instead print the name of each input file from which output would normally have been printed. The scanning of every file will stop on the first match.' ) parser.add_argument( '-m', '--max-count', dest='max_count', action='store', type=int, default=-1, help= 'Stop reading a file after num matching lines. If the input is standard input from a regular file, and num matching lines are output, grep ensures that the standard input is positioned just after the last matching line before exiting, regardless of the presence of trailing context lines. This enables a calling process to resume a search.' ) parser.add_argument( '-o', '--only-matching', dest='only_matching', action='store_true', default=False, help= 'Print only the matched (non-empty) parts of matching lines, with each such part on a separate output line.' ) parser.add_argument( '-q', '--quiet', dest='quiet', action='store_true', default=False, help= 'Quiet; do not write anything to standard output. Exit immediately with zero status if any match is found, even if an error was detected.' ) parser.add_argument( '-s', '--no-message', dest='no_message', action='store_true', default=False, help= 'Suppress error messages about nonexistent or unreadable files.') # Output Line Prefix parser.add_argument( '-b', '--byte-offset', dest='byte_offset', action='store_true', default=False, help= 'Print the 0-based byte offset within the input file before each line of output. If ‘-o’ (‘--only-matching’) is specified, print the offset of the matching part itself. ' ) parser.add_argument( '-H', '--with-filename', dest='with_filename', action='store_true', default=False, help= 'Print the file name for each match. This is the default when there is more than one file to search.' 
) parser.add_argument( '-h', '--no-filename', dest='no_filename', action='store_true', default=False, help= 'Suppress the prefixing of file names on output. This is the default when there is only one file (or only standard input) to search.' ) parser.add_argument( '--label', dest='label', action='store', help= 'Display input actually coming from standard input as input coming from file LABEL.' ) parser.add_argument( '-n', '--line-number', dest='line_number', action='store_true', default=False, help= 'Prefix each line of output with the 1-based line number within its input file.' ) parser.add_argument( '-T', '--initial-tab', dest='initial_tab', action='store_true', default=False, help= 'Make sure that the first character of actual line content lies on a tab stop, so that the alignment of tabs looks normal. This is useful with options that prefix their output to the actual content: ‘-H’, ‘-n’, and ‘-b’. In order to improve the probability that lines from a single file will all start at the same column, this also causes the line number and byte offset (if present) to be printed in a minimum-size field width. ' ) parser.add_argument( '-u', '--unix-byte-offsets', dest='unix_byte_offsets', action='store_true', default=False, help='Report Unix-style byte offsets.') parser.add_argument( '-Z', '--null', dest='null', action='store_true', default=False, help= 'Output a zero byte (the ASCII NUL character) instead of the character that normally follows a file name.' 
) # Context Line Control parser.add_argument( '-A', '--after-context', dest='after_context', action='store', default=0, type=int, help='Print num lines of trailing context after matching lines.') parser.add_argument( '-B', '--before-context', dest='before_context', action='store', default=0, type=int, help='Print num lines of leading context before matching lines.') parser.add_argument( '-C', '--context', dest='context', action='store', default=0, type=int, help='Print num lines of leading and trailing output context.') # File and Directory Selection parser.add_argument( '-a', '--text', dest='text', action='store_true', default=False, help= 'Process a binary file as if it were text; this is equivalent to the ‘--binary-files=text’ option. ' ) parser.add_argument( '--binary-files', dest='binary_files', action='store', help= 'If the first few bytes of a file indicate that the file contains binary data, assume that the file is of type type. By default, type is ‘binary’, and grep normally outputs either a one-line message saying that a binary file matches, or no message if there is no match. If type is ‘without-match’, grep assumes that a binary file does not match; this is equivalent to the ‘-I’ option. If type is ‘text’, grep processes a binary file as if it were text; this is equivalent to the ‘-a’ option. Warning: ‘--binary-files=text’ might output binary garbage, which can have nasty side effects if the output is a terminal and if the terminal driver interprets some of it as commands. ' ) parser.add_argument( '-D', '--devices', dest='devices', action='store', help= 'If an input file is a device, FIFO, or socket, use action to process it. By default, action is ‘read’, which means that devices are read just as if they were ordinary files. If action is ‘skip’, devices, FIFOs, and sockets are silently skipped. ' ) parser.add_argument( '-d', '--directories', dest='directories', action='store', help= 'If an input file is a directory, use action to process it. 
By default, action is ‘read’, which means that directories are read just as if they were ordinary files (some operating systems and file systems disallow this, and will cause grep to print error messages for every directory or silently skip them). If action is ‘skip’, directories are silently skipped. If action is ‘recurse’, grep reads all files under each directory, recursively; this is equivalent to the ‘-r’ option. ' ) parser.add_argument( '--exclude', dest='exclude', action='append', default=[], help= 'Skip files whose base name matches glob (using wildcard matching). A file-name glob can use ‘*’, ‘?’, and ‘[’...‘]’ as wildcards, and \ to quote a wildcard or backslash character literally. ' ) parser.add_argument( '--exclude-from', dest='exclude_from', action='append', default=[], help= 'Skip files whose base name matches any of the file-name globs read from file (using wildcard matching as described under ‘--exclude’). ' ) parser.add_argument( '--exclude-dir', dest='exclude_dir', action='append', default=[], help= 'Exclude directories matching the pattern dir from recursive directory searches. ' ) parser.add_argument( '-I', dest='I', action='store_true', default=False, help= 'Process a binary file as if it did not contain matching data; this is equivalent to the ‘--binary-files=without-match’ option. ' ) parser.add_argument( '--include', dest='include', action='append', default=[], help= 'Search only files whose base name matches glob (using wildcard matching as described under ‘--exclude’). ' ) parser.add_argument( '-r', '--recursive', dest='recursive', action='store_true', default=False, help= 'For each directory mentioned on the command line, read and process all files in that directory, recursively. This is the same as the ‘--directories=recurse’ option. ' ) # Other Options parser.add_argument( '--line-buffered', dest='line_buffered', action='store_true', default=False, help= 'Use line buffering on output. This can cause a performance penalty. 
' ) parser.add_argument( '--mmap', dest='mmap', action='store_true', default=False, help= 'If possible, use the mmap system call to read input, instead of the default read system call. In some situations, ‘--mmap’ yields better performance. However, ‘--mmap’ can cause undefined behavior (including core dumps) if an input file shrinks while grep is operating, or if an I/O error occurs. ' ) parser.add_argument( '-U', '--binary', dest='binary', action='store_true', default=False, help= 'Treat the file(s) as binary. By default, under MS-DOS and MS-Windows, grep guesses the file type by looking at the contents of the first 32kB read from the file. If grep decides the file is a text file, it strips the CR characters from the original file contents (to make regular expressions with ^ and $ work correctly). Specifying ‘-U’ overrules this guesswork, causing all files to be read and passed to the matching mechanism verbatim; if the file is a text file with CR/LF pairs at the end of each line, this will cause some regular expressions to fail. This option has no effect on platforms other than MS-DOS and MS-Windows. ' ) parser.add_argument( '-z', '--null-data', dest='null_data', action='store_true', default=False, help= 'Treat the input as a set of lines, each terminated by a zero byte (the ASCII NUL character) instead of a newline. Like the ‘-Z’ or ‘--null’ option, this option can be used with commands like ‘sort -z’ to process arbitrary file names. 
' ) # Encoding Options parser.add_argument( '-E', '--encoding', dest='encoding', action='append', default=[], help='Encoding for reading text files.') parser.add_argument( '-G', '--guess', dest='guess', action='store_true', default=False, help='Guess encoding for text files.') parser.add_argument( '-S', '--stdout', dest='stdout', action='store_true', default=False, help='Encoding output text with system default encoding.') options = parser.parse_args() if options.help: parser.print_help() parser.exit() elif options.version: print('Version: 0.1') parser.exit() else: return options
class content_parser(HTMLParser):
    """Extract indexable words/attributes from an email message (Python 2).

    Feeds HTML payloads through HTMLParser, recodes text to unicode using
    the message charset or chardet auto-detection, and collects:
      words  -- set of lowercase tokens, hosts, urls and mail addresses
      attrs  -- set of 'to:addr' / 'from:addr' attribute strings
    NOTE(review): 'elliptics' below refers to an external storage system
    not visible from this file -- confirm against the surrounding project.
    """

    def __init__(self, mailbox, id):
        HTMLParser.__init__(self)
        # one shared detector instance, reset per detect_encoding() call
        self.detector = UniversalDetector()
        # if this ID lives in elliptics
        self.id = id
        # message timestamp (epoch seconds), filled by feed_email()
        self.timestamp = 0
        # all indexes are related to given mailbox
        # if it is None, 'To' address is used
        self.set_mailbox(mailbox)
        self.words = set()
        self.attrs = set()
        # '' means "unknown yet": recode() will auto-detect per text chunk
        self.encoding = ''
        # NOTE(review): patterns are not raw strings; '\w', '\d', '\.' rely
        # on Python passing unknown escapes through unchanged
        self.url = re.compile('(\w+)(\.\w+)+(:\d+)?(/+\w+)+')
        self.host = re.compile('(\w+)(\.\w+)+(:\d+)?')
        self.mail = re.compile('(\w+)([\.!\-_\+]\w+)*@(\w+)([\.!\-_\+]\w+)*')

    def set_mailbox(self, mailbox):
        """Remember which mailbox the generated indexes belong to."""
        self.mailbox = mailbox

    def detect_encoding(self, text):
        """Run chardet over *text* (a byte string) and return the encoding
        name, or None if detection failed."""
        self.detector.reset()
        self.detector.feed(text)
        self.detector.close()
        return self.detector.result['encoding']

    def set_encoding(self, enc):
        """Force the charset used by recode(); pass '' to re-enable
        auto-detection."""
        self.encoding = enc

    def set_encoding_from_email(self, msg):
        """Take the charset from the email part headers, falling back to
        the Charset object's input codec when Content-Type has none."""
        enc = msg.get_content_charset()
        if not enc:
            charset = msg.get_charset()
            if charset:
                enc = charset.input_codec
        self.set_encoding(enc)

    def recode(self, text):
        """Return *text* as a unicode string.

        Uses the configured encoding, auto-detecting when unset; binary
        payloads become u'', undecodable charsets decode with errors
        ignored.
        """
        enc = self.encoding
        if not enc:
            enc = self.detect_encoding(text)
            if enc == 'binary':
                return u''
        if not enc or enc == 'unknown-8bit':
            return unicode(text, errors='ignore')
        #print text.decode(enc), enc
        # NOTE(review): unicode(...) wrapper is redundant -- text.decode(enc)
        # already yields unicode
        return unicode(text.decode(enc))

    def parse_regexps(self, decoded):
        """Add every host, url and mail-address match found in *decoded*
        to the word set."""
        for m in self.host.finditer(decoded):
            s = m.group(0)
            self.words.add(s)
            #print "host: %s" % s.decode('unicode_internal').encode('utf8')
        for m in self.url.finditer(decoded):
            s = m.group(0)
            self.words.add(s)
            #print "url: %s" % s.decode('unicode_internal').encode('utf8')
        for m in self.mail.finditer(decoded):
            s = m.group(0)
            self.words.add(s)
            #print "mail: %s" % s.decode('unicode_internal').encode('utf8')

    # tags may contain meaningful data too
    def handle_starttag(self, tag, attrs):
        # index attribute values and the tag name itself
        for a in attrs:
            self.handle_data(a[1])
        self.handle_data(tag)

    def handle_endtag(self, tag):
        pass

    def handle_data(self, data):
        """HTMLParser callback: recode text content, lowercase it, and
        index both the whole chunk and its regexp matches."""
        decoded = self.recode(data).lower()
        if len(decoded) != 0:
            self.words.add(decoded)
            self.parse_regexps(decoded)

    def feed_email(self, reader):
        """Parse a whole email from *reader*: index Subject words and
        To/Cc/Bcc/From addresses, record the Date timestamp, resolve
        mailbox and message ID, then feed every non-multipart payload
        through the HTML parser.

        Raises NameError when no mailbox or message ID can be determined.
        """
        p = Parser()
        msg = p.parse(reader)
        from email.header import decode_header

        def parse_header(h):
            # decode each RFC 2047 fragment of header *h* to unicode
            if not h or len(h) == 0:
                return []
            ret = []
            for x in decode_header(h):
                if not x[1]:
                    ret.append(x[0])
                    #print x[0]
                else:
                    #print x[0].decode(x[1]).encode('utf8')
                    ret.append(x[0].decode(x[1]))
            return ret

        def get_mail_addr(ret):
            # return the first real (non-local) address found, else None
            for r in ret:
                addr = parseaddr(r)
                if len(addr[1]) != 0:
                    # sanity check to find non-local addresses, i.e. not 'username', but something like '*****@*****.**')
                    m = re.match("([^@|\s]+@[^@]+\.[^@|\s]+)", addr[1])
                    if m:
                        return m.group(1)
            return None

        def feed_mail_addr(ret, prefix):
            # index all header fragments, plus one 'prefix:addr' attribute
            for r in ret:
                self.words.add(r)
            addr = get_mail_addr(ret)
            if addr:
                self.attrs.add(prefix + ':' + addr)

        for r in parse_header(msg['Subject']):
            self.words.add(r)
        feed_mail_addr(parse_header(msg['Cc']), 'to')
        feed_mail_addr(parse_header(msg['Bcc']), 'to')
        feed_mail_addr(parse_header(msg['From']), 'from')
        to_header = parse_header(msg['To'])
        feed_mail_addr(to_header, 'to')
        self.timestamp = mktime_tz(parsedate_tz(msg['Date']))
        # this address will be used to modify every index,
        # i.e. this scripts only updates indexes which belong to given mailbox
        if not self.mailbox:
            self.set_mailbox(get_mail_addr(to_header))
            if not self.mailbox:
                raise NameError("No mailbox name has been provided: there is no 'To' header and nothing was provided via command line, exiting")
        if not self.id or len(self.id) == 0:
            msg_id = msg['Message-Id']
            if not msg_id:
                raise NameError("There is no 'Message-Id' header and no ID has been specified via command line, exiting")
            # @get_mail_addr() performs sanity check on its arguments
            self.id = get_mail_addr([msg_id])
            if not self.id or len(self.id) == 0:
                raise NameError("Could not detect ID in 'Message-Id' header and "
                        "no ID has been provided via command line, exiting")

        def feed_check_multipart(msg):
            # recurse through multipart containers, feed leaf payloads
            if not msg.is_multipart():
                self.set_encoding_from_email(msg)
                self.feed(msg.get_payload(decode=True))
            else:
                # these are multipart parts as email.Message objects
                for m in msg.get_payload():
                    feed_check_multipart(m)

        feed_check_multipart(msg)
#!/usr/bin/python
"""Print the chardet-detected character encoding of the file named in argv[1]."""
import sys
import glob
from chardet.universaldetector import UniversalDetector

detector = UniversalDetector()
# Fix: the original used the removed-in-Py3 file() builtin and never closed
# the handle; a context manager reads and releases it deterministically.
with open(sys.argv[1], 'rb') as fh:
    contents = fh.read()
detector.feed(contents)
detector.close()
# print(...) with a single argument behaves identically on Python 2 and 3.
print(detector.result['encoding'])
def Load(self, fullpathfilename = u"", encoding=None):
    """
    Load the user list from the given file and parse its structure.
    If the filename is omitted, the file most recently Loaded or Saved is read.
    When *encoding* is given, the file is read with that codec; when omitted,
    the encoding is auto-detected with chardet.

    :param string fullpathfilename: file to read
    :param string encoding: file encoding (officially "Windows-1252: 'cp1252'"
                            or "UTF-8 with BOM: 'utf-8-sig'")
    :raises IOError: on an empty filename or a missing file
    :raises SyntaxError: when OnCreateLineObject() returns a non-UserLine
    :returns: self
    """
    if len(fullpathfilename) != 0:
        self._fullpathfilename = fullpathfilename.lstrip().rstrip()
    if len(self._fullpathfilename) == 0:
        raise IOError, "Invalid filename."
    if not os.path.exists(self._fullpathfilename):
        raise IOError, "No such file."
    if self.ChildCount() != 0:
        # If children were already attached, remove them all first.
        for child in self.EachChilds():
            self.DeleteChild(child)
    # --------------------------------------------------
    # Encoding detection
    # --------------------------------------------------
    if encoding is None:
        detector = UniversalDetector()
        detector.reset()
        for line in file(self._fullpathfilename, 'rb'):
            detector.feed(line)
            if detector.done:
                break
        detector.close()
        encoding = detector.result["encoding"]
    self._encoding = encoding
    #print u"Load encoding: %s" % (self._encoding)
    # --------------------------------------------------
    # Default handler for decoding errors during load
    # --------------------------------------------------
    if getattr(self, "OnDecodingErrorFromLoad", None) == None:
        def _onDecodingErrorFromLoad(linecount, linestring, encoding):
            # The command prompt renders cp932 (Microsoft's Shift-JIS
            # extension), so an implicit code conversion happens on print.
            # Printing linestring would therefore fail for characters that
            # do not exist in cp932. This cannot be worked around, so the
            # default behavior is to not print the line content.
            print u"UNICODE(%s) decoding error! skip line: %s" % (encoding, linecount)
            return None
        self.OnDecodingErrorFromLoad = _onDecodingErrorFromLoad
    # --------------------------------------------------
    # Default factory for line objects
    # --------------------------------------------------
    if getattr(self, "OnCreateLineObject", None) == None:
        def _onCreateLineObject(linecount, linestring):
            # Create the line object.
            return UserLine(linestring)
        self.OnCreateLineObject = _onCreateLineObject
    # --------------------------------------------------
    # Read the userlist and expand it into objects
    # --------------------------------------------------
    thisProcedure = self
    thisOperation = UserOperation()
    thisProcedure.AddChild(thisOperation)
    linecount = 0
    fileuserlist = open(self._fullpathfilename, "rU")
    #fileuserlist = codecs.open(self._fullpathfilename, "rU", "shift_jis")
    try:
        for linestring in fileuserlist:
            if linecount == 0:
                # Strip a possible BOM from the very first line.
                linestring = CommonLib.CutBomString(linestring)
            linecount += 1
            # --------------------------------------------------
            # Drop characters that fail the encoding conversion.
            # --------------------------------------------------
            try:
                linestring = u"%s" % (unicode(linestring, self._encoding).encode("utf-8"))
            except UnicodeDecodeError:
                # Hand copies to the hook so it cannot mutate loop state.
                param_linecount = copy.copy(linecount)
                param_linestring = copy.copy(linestring)
                param_encoding = copy.copy(self._encoding)
                linestring = self.OnDecodingErrorFromLoad(param_linecount, param_linestring, param_encoding)
                if (isinstance(linestring, str)) or (isinstance(linestring, unicode)):
                    # The hook returned replacement text; retry the
                    # conversion once, emptying the line on failure.
                    try:
                        linestring = u"%s" % (unicode(linestring, self._encoding).encode("utf-8"))
                    except UnicodeDecodeError:
                        linestring = u""
                else:
                    linestring = u""
            # Create the line object.
            #thisLine = UserLine(linestring)
            param_linecount = copy.copy(linecount)
            thisLine = self.OnCreateLineObject(param_linecount, linestring)
            if not isinstance(thisLine, UserLine):
                raise SyntaxError, "OnCreateLineObject() is an invalid object to return."
            if thisLine.IsType(EnumCommandType.RULE):
                # Start a new user operation at each rule definition.
                thisOperation = UserOperation()
                thisProcedure.AddChild(thisOperation)
            # Append the line to the current user operation.
            thisOperation.AddChild(thisLine)
    finally:
        # Close the userlist file.
        fileuserlist.close()
    return self
"""Detection smoke test: report chardet's verdict for each matched file."""
import sys, glob
sys.path.insert(0, '..')
from chardet.universaldetector import UniversalDetector

count = 0
u = UniversalDetector()
for f in glob.glob(r'D:\temp\test\mlalloc.h'):
    print(f.ljust(60), end=' ')
    u.reset()
    # Fix: the original opened the file without ever closing it, leaking
    # one handle per iteration; 'with' guarantees the close.
    with open(f, 'rb') as fh:
        for line in fh:
            u.feed(line)
            if u.done:
                break
    u.close()
    result = u.result
    if result['encoding']:
        print(result['encoding'], 'with confidence', result['confidence'])
    else:
        print('******** no result')
    count += 1
print(count, 'tests')