def validate_file(self, filename):
    """Detect the character encoding of *filename* and validate it.

    Feeds the file to chardet line by line and checks the detected
    encoding against the accepted list.

    Returns the upper-cased encoding name on success.
    Raises UploadCheckError when detection fails or the encoding is
    not accepted.
    """
    from chardet.universaldetector import UniversalDetector
    detector = UniversalDetector()
    detector.reset()
    with open(filename, "rb") as to_convert:
        for line in to_convert:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    encoding = detector.result.get("encoding")
    valid_encodings = ["ASCII", "ISO-8859-1", "UTF-8"]
    if encoding is None:
        message = "Please check that you are uploading a CSV file."
        # logger.error, not logger.exception: we are not inside an
        # exception handler, so there is no traceback to attach and
        # logger.exception would log a spurious "NoneType: None".
        self.logger.error(message)
        raise UploadCheckError(message)
    if encoding.upper() not in valid_encodings:
        message = "File encoding %s not valid. Valid encodings: %s" % (
            encoding, ", ".join(valid_encodings))
        self.logger.error(message)
        raise UploadCheckError(message)
    return encoding.upper()
def convert_encoding(student_path):
    """Re-encode C/C++ sources under <student_path>/__submit__ to UTF-8.

    Only files ending in .c/.cpp/.cc are touched. Files that are already
    UTF-8, or whose encoding cannot be detected at all, are left as-is
    (the original crashed on a None detection result).
    """
    temp_path = os.path.join(student_path, "__submit__")
    for file_name in os.listdir(temp_path):
        file_path = os.path.join(temp_path, file_name)
        # endswith accepts a tuple -- replaces the chained 'or' tests
        if not file_name.lower().endswith(('.c', '.cpp', '.cc')):
            continue
        detector = UniversalDetector()
        detector.reset()
        with open(file_path, 'rb') as fin:
            # iterate the file directly instead of materialising all
            # lines with readlines()
            for line in fin:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        source_encoding = detector.result['encoding']
        target_encoding = 'utf-8'
        # Compare case-insensitively: chardet may report 'UTF-8'
        # with varying case. Skip undetectable files (None).
        if source_encoding is None or source_encoding.lower() == target_encoding:
            continue
        with codecs.open(file_path, 'r', source_encoding) as fin:
            data = fin.read()
        if not data:
            print(file_path)
            continue
        with codecs.open(file_path, 'w', target_encoding) as fout:
            fout.write(data)
def switch_char(filenames):
    """Convert each file in *filenames* to UTF-8 in place.

    Returns True when every file was processed, False if any file
    failed (the exception is printed, keeping the original best-effort
    behaviour).
    """
    detector = UniversalDetector()
    try:
        for filename in filenames:
            detector.reset()
            # Context manager closes the handle even when detection
            # finishes early -- the original leaked it.
            with open(filename, 'rb') as fh:
                for line in fh:
                    detector.feed(line)
                    if detector.done:
                        break
            detector.close()
            file_encod = detector.result['encoding']
            if file_encod is None:
                # Coerce BEFORE printing: concatenating None raised a
                # TypeError in the original, aborting the whole run for
                # any undetectable file.
                file_encod = 'utf-8'
            print(filename + '的文件编码是:' + file_encod)
            if file_encod == 'ascii':
                pass
            elif file_encod != 'utf-8':
                with open(filename, 'rb') as fr:
                    lines = fr.readlines()
                with open(filename, 'w', encoding='utf-8') as fw:
                    for lineb in lines:
                        if lineb is not None:  # 'is not', not '!='
                            linestr = lineb.decode(encoding=file_encod)
                            fw.write(linestr)
    except Exception as e:
        print(e)
        return False
    return True
def determine_encoding(file_path):
    """Guess a file's encoding from at most its first three lines.

    Based on:
    https://stackoverflow.com/questions/46037058/using-chardet-to-find-encoding-of-very-large-file

    Returns the detected encoding name, or 'utf-8' when detection does
    not converge within three lines (or the file is empty).
    """
    detected = 'utf-8'
    detector = UniversalDetector()
    detector.reset()
    with open(file_path, 'rb') as stream:
        line_no = 0
        for row in stream:
            line_no += 1
            detector.feed(row)
            if detector.done:
                detected = detector.result['encoding']
                break
            if line_no == 3:
                break
    detector.close()
    return detected
def detect_character_code(file_list):
    """Detect the character encoding of every file in *file_list*.

    :param file_list: iterable of file paths to examine
    :return: dict mapping file path -> detected encoding name
    """
    encodings_by_file = {}
    detector = UniversalDetector()
    for path in file_list:
        detector.reset()
        with open(path, 'rb') as f_in:
            for chunk in f_in.readlines():
                detector.feed(chunk)
                if detector.done:
                    break
        detector.close()
        encodings_by_file[path] = detector.result['encoding']
    return encodings_by_file
def convertToUTF8(filename, out_enc='utf-8'):
    """Convert *filename* in place to *out_enc* (default UTF-8).

    The source encoding is guessed with chardet from the first 1 KiB.
    Files that are already UTF-8, or whose encoding cannot be detected,
    are left untouched.
    """
    in_enc = 'unknown'  # keeps the error message printable if open() fails
    try:
        # 'with' replaces the original's manual open/close pairs and the
        # stray finally that could close an unbound/already-closed handle.
        with open(filename, 'rb') as f:
            sample = b' ' + f.read(1024)
            u = UniversalDetector()
            u.reset()
            u.feed(sample)
            u.close()
            f.seek(0)
            raw = f.read()
        detected = u.result['encoding']
        if detected is None:
            # Undetectable content: decoding with None raised TypeError
            # in the original.
            return
        in_enc = detected
        if 'utf-8' != in_enc:
            new_content = raw.decode(in_enc, 'ignore')
            with open(filename, 'w', encoding=out_enc) as fout:
                fout.write(new_content)
        #print('Success:' + filename + ' converted from ' + in_enc + ' to ' + out_enc)
    except IOError:
        print('Error:' + filename + ' failed to convert from ' + in_enc + ' to ' + out_enc)
def read_email(email_path):
    """Parse an e-mail file into a dict of headers plus derived fields.

    Returns a dict containing the raw message text ('original_msg'),
    the payload ('payload'), text extracted from the payload ('text'),
    every header as a key/value pair, and -- when parseable -- the
    'Date' header normalised to ISO format.
    """
    detector = UniversalDetector()
    result = {}
    with open(email_path, 'rb') as fp:
        msg = email.message_from_binary_file(fp, policy=default)
    try:
        # First try the platform default encoding; fall back to chardet
        # only when that fails to decode.
        with open(email_path) as f:
            original = f.read()
    except UnicodeDecodeError:
        detector.reset()
        with open(email_path, 'rb') as f:
            for line in f.readlines():
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        encoding = detector.result['encoding']
        with open(email_path, encoding=encoding) as f:
            original = f.read()
    result['original_msg'] = original
    result['payload'] = msg.get_payload()
    # parse_html_payload is a project helper; presumably converts an
    # HTML payload to plain text -- TODO confirm against its definition.
    result['text'] = parse_html_payload(result['payload'])
    try:
        for key, value in msg.items():
            result[key] = value
    except Exception as e:
        logger.error('Problem parsing email: {}\n{}'.format(email_path, e))
    try:
        result['Date'] = dateparser.parse(result['Date']).isoformat()
    except Exception as e:
        # NOTE(review): this logs result.get('date') (lower case) while
        # the header key set above is 'Date' -- likely always logs None;
        # confirm before relying on this message.
        logger.error('Problem converting date: {}\n{}'.format(
            result.get('date'), e))
    return result
def Convert_Auto(filename, out_enc="utf-8"):
    """Re-encode a text file in place, auto-detecting its current encoding.

    Requires the chardet library.

    Input Parameter:
        filename: full path and file name, e.g. c:/dir1/file.txt
        out_enc: target encoding. Default as 'utf-8'
    Output Parameter:
        None
    """
    in_enc = 'unknown'  # defined up-front so the except-path print never
                        # hits an unbound name (original bug when open fails)
    try:
        with open(filename, 'rb') as f:
            b = b' '
            b += f.read(1024)
            u = UniversalDetector()
            u.reset()
            u.feed(b)
            u.close()
            f.seek(0)
            b = f.read()
        detected = u.result['encoding']
        if detected is None:
            # Undetectable content: decoding with None would raise TypeError.
            print("Error: " + filename + " FAIL to converted from " + in_enc +
                  " to " + out_enc + " !")
            return
        in_enc = detected
        new_content = b.decode(in_enc, 'ignore')
        with open(filename, 'w', encoding=out_enc) as f:
            f.write(new_content)
        print("Success: " + filename + " converted from " + in_enc + " to " +
              out_enc + " !")
    except IOError:
        print("Error: " + filename + " FAIL to converted from " + in_enc +
              " to " + out_enc + " !")
class Encoding:
    """Thin reusable wrapper around chardet's UniversalDetector."""

    def __init__(self):
        self.detector = UniversalDetector()

    def detect(self, data):
        """Return chardet's result dict for the given byte buffer."""
        det = self.detector
        det.reset()
        det.feed(data)
        det.close()
        return det.result
def detect_charset(filename):
    """Detect the charset of *filename*; returns chardet's result dict."""
    detector = UniversalDetector()
    detector.reset()
    # Context manager closes the file even on early exit -- the original
    # opened the file inline and never closed the handle.
    with open(filename, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result
def detectcharset(filename):
    """Detect the charset of *filename*; returns chardet's result dict."""
    detector = UniversalDetector()
    detector.reset()
    # open() replaces the Python-2-only file() builtin (NameError on
    # Python 3), and the context manager closes the leaked handle.
    with open(filename, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result
def get_charset(fp):
    """Detect the charset of the file at path *fp*; returns chardet's result dict."""
    detector = UniversalDetector()
    detector.reset()
    # open() replaces the Python-2-only file() builtin (NameError on
    # Python 3), and the context manager closes the leaked handle.
    with open(fp, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result
class Encoding:
    """Stateful encoding detector built on chardet."""

    def __init__(self):
        # one detector instance reused across detect() calls
        self.detector = UniversalDetector()

    def detect(self, data):
        """Feed *data* to chardet and return its detection result."""
        self.detector.reset()
        self.detector.feed(data)
        self.detector.close()
        return self.detector.result
def get_encoding_type(current_file):
    """Detect and return the encoding of *current_file*.

    Also prints '<basename>: <encoding>' for logging. Returns None when
    chardet cannot determine an encoding.
    """
    from chardet.universaldetector import UniversalDetector
    detector = UniversalDetector()
    detector.reset()
    # 'with' closes the handle the original leaked
    with open(current_file, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    encoding = detector.result['encoding']
    # str() guards against None: concatenating None crashed the original.
    print(current_file.split('\\')[-1] + ": " + str(encoding))
    return encoding
def check_chart(path):
    """Return the detected encoding name for the file at *path*."""
    detector = UniversalDetector()
    detector.reset()
    # 'with' closes the handle the original leaked
    with open(path, 'rb') as fh:
        for each in fh:
            detector.feed(each)
            if detector.done:
                break
    detector.close()
    fileencoding = detector.result['encoding']
    return fileencoding
def detect_encoding(file_path):
    """Detect the encoding of *file_path*; returns chardet's result dict."""
    detector = UniversalDetector()
    detector.reset()
    with open(file_path, mode='rb') as stream:
        for raw_line in stream:
            detector.feed(raw_line)
            # chardet flags .done once its confidence is high enough
            if detector.done:
                break
    detector.close()
    return detector.result
def determine_encoding(filename):
    """Detect the encoding of '<filename>.txt'.

    Note that the '.txt' suffix is appended to the supplied name.
    Returns chardet's full result dict (encoding + confidence).
    """
    detector = UniversalDetector()
    detector.reset()
    with open(filename + '.txt', 'rb') as infile:
        for chunk in infile:
            detector.feed(chunk)
            # detection process ends automatically when confidence is
            # high enough
            if detector.done:
                break
    detector.close()
    return detector.result
def detect_encoding():
    """Return the detected encoding name of CONFIG_FILE."""
    detector = UniversalDetector()
    detector.reset()
    with open(CONFIG_FILE, 'rb') as f:
        for raw in f.readlines():
            detector.feed(raw)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']
def detect_encode(file):
    """Detect and return the encoding of *file* (None when undetectable)."""
    from chardet.universaldetector import UniversalDetector
    detector = UniversalDetector()
    detector.reset()
    with open(file, 'rb') as handle:
        for chunk in handle:
            detector.feed(chunk)
            if detector.done:
                break
    detector.close()
    return detector.result.get('encoding')
def getFileEncoding(file_name):
    """Return chardet's detection result dict for *file_name*."""
    detector = UniversalDetector()
    detector.reset()
    with open(file_name, 'rb') as stream:
        for raw in stream:
            detector.feed(raw)
            if detector.done:
                break
    detector.close()
    return detector.result
def get_coding(filename):
    """Return the detected encoding name of *filename*."""
    detector = UniversalDetector()
    detector.reset()
    # 'with' closes the handle the original leaked
    with open(filename, "rb") as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    result = detector.result
    return result["encoding"]
def get_encoding(file_path):
    """Return the detected encoding of *file_path*.

    Only a fully-confident detection (confidence == 1.0) is trusted;
    anything else yields the literal string 'Unknown encoding'.
    """
    detector = UniversalDetector()
    detector.reset()
    # 'with' closes the handle the original leaked
    with open(file_path, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    if detector.result['confidence'] == 1.0:
        return str(detector.result['encoding'])
    else:
        return 'Unknown encoding'
def get_encoding_type(current_file: str):
    """Detect and return the encoding of *current_file*.

    Prints progress messages; the returned encoding name may be None
    when detection fails.
    """
    print("Determining file encoding: [%s]" % current_file, 'info')
    detector = UniversalDetector()
    detector.reset()
    # 'with' closes the handle the original leaked
    with open(current_file, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    encoding = detector.result['encoding']
    # str() guards against None: concatenating None crashed the original.
    print(current_file.split('\\')[-1] + ": " + str(encoding))
    return encoding
def detectfiledir(filedir, separator):
    """Print the detected encoding of every file matching glob *filedir*.

    *separator* is accepted for interface compatibility but unused.
    """
    detector = UniversalDetector()
    for filename in glob.glob(filedir):
        print(filename + "=", )
        detector.reset()
        # 'with' closes each handle -- the original leaked one per file
        with open(filename, 'rb') as fh:
            for line in fh:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        print(detector.result.get('encoding'))
def detect_encoding(file_path):
    """Return chardet's detection result dict for *file_path*."""
    detector = UniversalDetector()
    detector.reset()
    with open(file_path, mode='rb') as source:
        for raw_line in source:
            detector.feed(raw_line)
            if detector.done:
                break
    detector.close()
    return detector.result
def autoDetectEncoding(txtFilePath):
    # Detect the encoding of txtFilePath with chardet while showing a
    # progress bar. Python 2 code (print statements, time.clock).
    # Strategy: feed lines until chardet is confident or a wall-clock
    # budget expires; if detection is still unconfident and the file has
    # more content, retry from the start with a larger budget. The final
    # result is appended to ./output/detecEncodingMethod.txt and returned.
    with open(txtFilePath) as f:
        detector = UniversalDetector()
        whole_time = 300  # seconds allowed for one detection pass
        while True:
            counter = 0
            print '=' * 20 + "Auto detect encoding method!" + '=' * 20
            #file_size = os.stat(txtFilePath).st_size
            #print file_size
            read_size = 0
            print "file:" + txtFilePath
            bar = ProgressBar(
                widgets=[Bar('=', '[', ']'), ' ', Percentage(), ' ', ETA()],
                maxval=1).start()
            start_time = clock()
            # NOTE(review): opened in text mode, so chardet receives str
            # lines decoded by the platform default -- works on Python 2
            # only; confirm the interpreter before porting.
            for line in f.readlines():
                detector.feed(line)
                #read_size += sys.getsizeof(line)
                spend_time = clock() - start_time
                bar.update(int(spend_time) / whole_time)
                if spend_time > whole_time:
                    # budget exhausted for this pass
                    break
                #if counter%1000 == 0:
                #    print "lineNum:" + str(counter)
                #counter += 1
                if detector.done:
                    break
            print '=' * 20 + "Detecting done!" + '=' * 20
            #print float(detector.result['confidence'])
            #print detector.done
            #detector.close()
            if not detector.done:
                # more data left in the file?
                if f.readline():
                    detector.close()
                    if float(detector.result['confidence']) < 0.9:
                        print detector.result
                        print "Get more confidence!"
                        # retry from the start with a larger time budget
                        detector.reset()
                        f.seek(0)
                        whole_time += 10
                    else:
                        break
                else:
                    break
            else:
                break
        f.close()
        # append the final detection result to the log file
        with open('./output/detecEncodingMethod.txt', 'a') as f1:
            f1.write('\n' + txtFilePath + '\n' + str(detector.result))
        return detector.result
def __init__(self, f, encoding=None, **kwargs):
    """Wrap *f* in a csv.reader, guessing the text encoding when not given.

    A chunk of self.chunk_size bytes is fed to chardet; an ASCII result
    (or a failed detection) is promoted to UTF-8, which is a strict
    superset. The stream is rewound afterwards.
    """
    if not encoding:
        sniffer = UniversalDetector()
        sniffer.reset()
        sniffer.feed(f.read(self.chunk_size))
        sniffer.close()
        guessed = sniffer.result['encoding']
        if guessed and guessed != 'ascii':
            encoding = guessed
        else:
            encoding = 'utf-8'
        f.seek(0)
    self.csv_reader = csv.reader(f, **kwargs)
    self.encoding = encoding
def detect_encoding(file_path):
    """Feed *file_path* line by line to chardet and return its result dict."""
    detector = UniversalDetector()
    detector.reset()
    with open(file_path, mode="rb") as binary_file:
        for data in binary_file:
            detector.feed(data)
            if detector.done:
                break
    detector.close()
    return detector.result
def get_encoding_type(pathname):
    """Detect the encoding of the file at *pathname*.

    Returns the encoding name, or the literal string 'FileNotFoundError'
    when the file does not exist (legacy sentinel kept for callers).
    """
    detector = UniversalDetector()
    detector.reset()
    try:
        with open(pathname, 'rb') as fp:
            for raw in fp:
                detector.feed(raw)
                if detector.done:
                    break
    except FileNotFoundError:
        return 'FileNotFoundError'
    detector.close()
    return detector.result['encoding']
def detectEncodeOfMFiles(filenames):
    """Map each filename in *filenames* to its detected encoding name."""
    from chardet.universaldetector import UniversalDetector
    m_files_encodes = dict()
    detector = UniversalDetector()
    for filename in filenames:
        detector.reset()
        # 'with' closes each handle -- the original leaked one per file
        with open(filename, 'rb') as fh:
            for line in fh:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        m_files_encodes[filename] = detector.result['encoding']
    return m_files_encodes
def get_file_encoding(filename):
    """Autodetect the encoding of the given file.

    Returns chardet's result dict. At most 2000 lines are examined.
    """
    detector = UniversalDetector()
    detector.reset()
    i = 0
    # open() replaces the Python-2-only file() builtin (NameError on
    # Python 3); the context manager closes the leaked handle.
    with open(filename, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            i += 1
            # done or hit max lines (it takes ages to read a large file)
            if detector.done or i >= 2000:
                break
    detector.close()
    return detector.result
def detect_encoding(self):
    """Detect the encoding of self.file_des, caching consumed lines.

    Every line read from the stream is appended to self.cache so the
    caller can replay it after detection. Returns chardet's result dict.
    """
    detector = UniversalDetector()
    detector.reset()
    for raw in self.file_des:
        detector.feed(raw)
        self.cache.append(raw)
        if detector.done:
            break
    detector.close()
    return detector.result
def detect_encoding(self):
    """Run chardet over self.file_des; consumed lines go to self.cache."""
    det = UniversalDetector()
    det.reset()
    for line in self.file_des:
        det.feed(line)
        # remember the line: detection consumes the underlying stream
        self.cache.append(line)
        if det.done:
            break
    det.close()
    return det.result
def detect_encoding(filename):
    """Return the detected encoding name of '<filename>.txt'."""
    # Detector object for encoding detection
    detector = UniversalDetector()
    detector.reset()
    # Note: the '.txt' extension is appended to the supplied name.
    with open(filename + '.txt', 'rb') as infile:
        for raw in infile:
            detector.feed(raw)
            if detector.done:
                # detection ends automatically once confidence is high
                break
    detector.close()
    return detector.result['encoding']
def detectEncodeOfMFiles(filenames):
    """Return {filename: detected encoding name} for every given file."""
    from chardet.universaldetector import UniversalDetector
    m_files_encodes = dict()
    detector = UniversalDetector()
    for filename in filenames:
        detector.reset()
        # context manager fixes the per-file handle leak of the original
        with open(filename, 'rb') as source:
            for line in source:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        m_files_encodes[filename] = detector.result['encoding']
    return m_files_encodes
def get_encoding(filename):
    """Return a normalised lower-case encoding name for *filename*.

    Names in the UTF family get '-' replaced by '_' (e.g. 'utf_8').
    Returns None when chardet cannot detect an encoding.
    """
    detector = UniversalDetector()
    detector.reset()
    # 'with' closes the handle the original leaked
    with open(filename, 'rb') as fh:
        for line in fh:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    if detector.result['encoding']:
        temp_encoding = detector.result['encoding'].lower()
        if "utf" in temp_encoding:
            temp_encoding = temp_encoding.replace("-", "_")
        return temp_encoding
    else:
        return None
def detcect_encoding_v2(filepath):
    """Return (encoding_name, confidence_percent) for *filepath*.

    Undetectable files yield ('unknown', 99.0). The misspelled function
    name is kept because callers depend on it.
    """
    detector = UniversalDetector()
    detector.reset()
    # 'with' closes the handle the original leaked
    with open(filepath, 'rb') as fh:
        for each in fh:
            detector.feed(each)
            if detector.done:
                break
    detector.close()
    fileencoding = detector.result['encoding']
    confidence = detector.result['confidence']
    if fileencoding is None:
        fileencoding = 'unknown'
        confidence = 0.99
    return fileencoding, confidence * 100
class TxtEncoding:
    """Reusable chardet-based file encoding detector."""

    def __init__(self):
        # inspired by https://chardet.readthedocs.org/en/latest/usage.html#example-detecting-encodings-of-multiple-files
        self.detector = UniversalDetector()

    def detectEncoding(self, fname):
        '''Detect the encoding of file fname.
        Returns a dictionary with {'encoding', 'confidence'} fields.'''
        det = self.detector
        det.reset()
        with open(fname, 'rb') as stream:
            for raw in stream:
                det.feed(raw)
                if det.done:
                    break
        det.close()
        return det.result
def get_file_encoding(filename):
    """Autodetect the encoding of the given file.

    Returns chardet's result dict, examining at most 2000 lines.
    NOTE: for a missing file this returns the *string* "ascii" rather
    than a result dict -- kept for backward compatibility; callers must
    handle both shapes.
    """
    detector = UniversalDetector()
    detector.reset()
    i = 0
    if not os.path.exists(os.path.expanduser(filename)):
        return "ascii"
    # open() replaces the Python-2-only file() builtin (NameError on
    # Python 3); the context manager closes the leaked handle.
    with open(filename, "rb") as fh:
        for line in fh:
            detector.feed(line)
            i += 1
            # done or hit max lines (it takes ages to read a large file)
            if detector.done or i >= 2000:
                break
    detector.close()
    return detector.result
def is_utf8(fi):
    '''try to detect if a file is utf_8 using chardet'''
    # Open in BINARY mode: chardet's feed() expects raw bytes. The
    # original opened in text mode, which crashes on Python 3 and hides
    # the raw byte stream from the detector.
    detector = UniversalDetector()
    detector.reset()
    with open(fi, 'rb') as ff:
        for line in ff:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    if detector.result['encoding'] == 'utf-8':
        return True
    else:
        # legacy behaviour: falsy None (not False) when not UTF-8
        return None
class Encoding(object):
    """chardet wrapper with an optional non-raising detect()."""

    def __init__(self):
        self.detector = UniversalDetector()

    def _detect(self, data):
        # Core detection: one-shot feed of the whole buffer.
        self.detector.reset()
        self.detector.feed(data)
        self.detector.close()
        return self.detector.result

    def detect(self, data, safe=False):
        """Detect the encoding of *data*.

        When *safe* is true, detection errors yield None instead of
        propagating.
        """
        try:
            return self._detect(data)
        except Exception:
            # was a bare 'except:', which in safe mode also swallowed
            # SystemExit and KeyboardInterrupt
            if safe:
                return None
            raise
def set_source(self, name): import os # source _dependent_ initialization goes here if name is None or not os.path.isfile(name): return False self.__source_name = name # auto-detect file-encoding (optional) try: from chardet.universaldetector import UniversalDetector detector = UniversalDetector() detector.reset() lines = 0 for line in file(self.__source_name, 'rb'): detector.feed(line) lines += 1 if detector.done or lines == 50: break detector.close() encoding = string.replace( string.lower( detector.result['encoding'] ), '-', '' ) except: encoding = 'utf_8' # remove - and _ for better detection encoding = string.replace( encoding, '_', '' ) model = self.gtk.get_widget('e_encoding').get_model() itempos = 0 for item in model: pos1 = string.find( string.replace( string.lower(str(item[0])), '_', '' ) , encoding ) if pos1 == 0: break itempos += 1 self.gtk.get_widget('e_encoding').set_active(itempos) # run dialog response = self.gtk.get_widget('d_import').run() if response == gtk.RESPONSE_OK: return True else: return False
def set_source(self, name):
    """Initialise the CSV importer for source file *name*.

    Best-effort auto-detection of the file encoding (preselected in the
    'e_encoding' combo) and of the CSV dialect -- delimiter, quote char,
    line terminator and header row -- via csv.Sniffer, then runs the
    import dialog. Returns True when the user confirms, False otherwise.
    Python 2 / PyGTK code (uses the 'file' builtin and 'string' module).
    """
    # source _dependent_ initialization goes here
    if name is None or not os.path.isfile(name):
        return False
    self.__source_name = name
    # auto-detect file-encoding (optional)
    try:
        from chardet.universaldetector import UniversalDetector
        detector = UniversalDetector()
        detector.reset()
        lines = 0
        # sample at most 50 lines
        for line in file(self.__source_name, 'rb'):
            detector.feed(line)
            lines += 1
            if detector.done or lines == 50:
                break
        detector.close()
        encoding = string.lower(detector.result['encoding'])
    except:
        log.exception('')
        encoding = 'utf_8'
    # strip separator characters before matching combo entries
    # (self._encoding_cleanup is presumably a compiled regex -- confirm)
    encoding = self._encoding_cleanup.sub('', encoding)
    model = self.gtk.get_widget('e_encoding').get_model()
    itempos = 0
    # select the first entry whose cleaned name starts with the
    # detected encoding
    for item in model:
        pos1 = string.find(
            self._encoding_cleanup.sub('', string.lower(str(item[0]))),
            encoding)
        if pos1 == 0:
            break
        itempos += 1
    self.gtk.get_widget('e_encoding').set_active(itempos)
    # auto-detect CSV import settings (optional)
    try:
        import csv
        sniffer = csv.Sniffer()
        csvfilesize = os.path.getsize(self.__source_name)
        # sniff at most 64 KiB
        if csvfilesize > 65535:
            csvfilesize = 65535
        csvfile = file(self.__source_name, 'rb')
        try:
            # quote char, line terminator and field delimiter
            proposed_dialect = sniffer.sniff(csvfile.read(csvfilesize))
            self.gtk.get_widget('e_delimiter').set_text(proposed_dialect.delimiter)
            self.gtk.get_widget('e_quotechar').set_text(proposed_dialect.quotechar)
            if proposed_dialect.lineterminator == '\r\n':
                self.gtk.get_widget('e_lineterminator').set_active(1)
            # first row with column headers
            csvfile.seek(0)
            if sniffer.has_header(csvfile.read(csvfilesize)):
                self.gtk.get_widget('e_startrow').set_text('1')
            else:
                self.gtk.get_widget('e_startrow').set_text('0')
        finally:
            csvfile.close()
    except:
        log.exception('')
    # run dialog
    response = self.gtk.get_widget('d_import').run()
    if response == gtk.RESPONSE_OK:
        return True
    else:
        return False
class FileOpener:
    """Open text files, detecting the encoding with chardet when enabled.

    With use_chardet=True the encoding is guessed per file; otherwise a
    module-level list of candidate 'encodings' is tried in order.
    """

    def __init__(self, use_chardet):
        # whether open() uses chardet-based detection
        self.use_chardet = use_chardet
        if use_chardet:
            self.init_chardet()

    def init_chardet(self):
        """Import chardet lazily and create the shared detector."""
        try:
            from chardet.universaldetector import UniversalDetector
        except ImportError:
            raise Exception("There's no chardet installed to import from. "
                            "Please, install it and check your PYTHONPATH "
                            "environment variable")
        self.encdetector = UniversalDetector()

    def open(self, filename):
        """Return (lines, encoding) for *filename* using the configured strategy."""
        if self.use_chardet:
            return self.open_with_chardet(filename)
        else:
            return self.open_with_internal(filename)

    def open_with_chardet(self, filename):
        """Detect the encoding with chardet, then read all lines with it."""
        self.encdetector.reset()
        with open(filename, 'rb') as f:
            for line in f:
                self.encdetector.feed(line)
                if self.encdetector.done:
                    break
        self.encdetector.close()
        encoding = self.encdetector.result['encoding']
        # NOTE(review): if the open() below raises before binding a new
        # 'f', the finally closes the already-closed binary handle from
        # the with-block above -- harmless double close, but worth
        # confirming intent.
        try:
            f = open(filename, encoding=encoding)
            lines = f.readlines()
        except UnicodeDecodeError:
            print('ERROR: Could not detect encoding: %s' % filename,
                  file=sys.stderr)
            raise
        except LookupError:
            print('ERROR: %s -- Don\'t know how to handle encoding %s' % (
                filename, encoding), file=sys.stderr)
            raise
        finally:
            f.close()
        return lines, encoding

    def open_with_internal(self, filename):
        """Try each entry of the module-level 'encodings' list in order."""
        curr = 0
        global encodings
        while True:
            try:
                f = open(filename, 'r', encoding=encodings[curr])
                lines = f.readlines()
                break
            except UnicodeDecodeError:
                # quiet_level / QuietLevels are module globals -- this
                # method only works inside its original module
                if not quiet_level & QuietLevels.ENCODING:
                    print('WARNING: Decoding file %s' % filename,
                          file=sys.stderr)
                    print('WARNING: using encoding=%s failed. '
                          % encodings[curr], file=sys.stderr)
                    # NOTE(review): prints the *current* (failed)
                    # encoding as the "next" one; curr is only advanced
                    # on the line below.
                    print('WARNING: Trying next encoding: %s'
                          % encodings[curr], file=sys.stderr)
                curr += 1
            finally:
                f.close()
        if not lines:
            print('ERROR: Could not detect encoding: %s' % filename,
                  file=sys.stderr)
            raise Exception('Unknown encoding')
        encoding = encodings[curr]
        return lines, encoding
'https://raw.githubusercontent.com/BYVoid/uchardet/master/test/zh/big5.txt', 'https://raw.githubusercontent.com/BYVoid/uchardet/master/test/zh/euc-tw.txt', 'https://raw.githubusercontent.com/BYVoid/uchardet/master/test/zh/gb18030.txt', 'https://raw.githubusercontent.com/BYVoid/uchardet/master/test/zh/utf-8.txt' ] detector = UniversalDetector() for url in urls : purl = os.path.basename (url) print ("URL %-20s : " % purl, end=""), try : usock = urlopen(url) detector.reset (); for line in usock.readlines(): detector.feed(line) if detector.done: break detector.close() usock.close() # det member => encoding(string), confidence(.2f) print ("encoding: %-15s, confidence: %.2f" % (detector.result['encoding'], detector.result['confidence'])) except HTTPError as e : print (e) sys.exit (0) #
#!/usr/bin/env python #coding=utf-8 ''' chardet #检测字符编码 glob ''' from chardet.universaldetector import UniversalDetector as UD detect=UD() for filename in glob.glob('*.py'): print filename.ljust(60) detect.reset() for line in file(filename,'rb'): detect.feed(line) if detect.done: break detect.close() print detect.result #---------------------------------------------------------------------- ''' epydoc @author: 作者 @license: 版权 @contact: 联系 @todo: 改进 @version: 版本 @var v: 模块变量V说明 @type v: 模块变量类型V说明 @param p: 参数P说明 @type v: 参数P类型说明
class EncodingGrep(object): def __init__(self): self.__options = self.parseCmd() try: from chardet.universaldetector import UniversalDetector self._detector = UniversalDetector() except: self._detector = None self._pat = None self.action() def action(self): # Todo List: # -r -n -H -e --exclude-dir --exclude --include --encoding if not self._detector: if self.__options.guess: print('Warning: Please run "pip install chardet" to enable coding detection.') self.__options.guess = False # construct search pattern patlist = [] if self.__options.file: # read patterns from file with open(self.__options.file, 'rb') as fh: data = fh.read() if self._detector: self._detector.reset() self._detector.feed(data) self._detector.close() data = data.decode( self._detector.result['encoding'] or 'UTF-8', errors='ignore') else: data = data.decode('UTF-8', errors='ignore') patlist = [x for x in data.splitlines() if len(x) > 0] else: patlist = [x for x in self.__options.regexp if len(x) > 0] if self.__options.word_regexp: patlist = [r'\b' + x + r'\b' for x in patlist] elif self.__options.line_regexp: patlist = [r'^' + x + r'$' for x in patlist] pattxt = '|'.join(patlist) if self.__options.ignore_case: self._pat = re.compile(pattxt.encode('utf-8'), re.I) else: self._pat = re.compile(pattxt.encode('utf-8')) # search for item in self.__options.glob: self.grepDir(os.path.abspath(item)) def grepDir(self, path): if os.path.exists(path): for item in os.listdir(path): fullpath = os.path.join(path, item) if os.path.isdir(fullpath): if self.__options.recursive: for subitem in self.__options.exclude_dir: if fnmatch.fnmatch(item, subitem): break else: self.grepDir(fullpath) else: for subitem in self.__options.exclude: if fnmatch.fnmatch(item, subitem): break else: for subitem in self.__options.include: if fnmatch.fnmatch(item, subitem): self.grepFile(fullpath) break def grepFile(self, path): # read file with correct encoding if self.__options.guess or len(self.__options.encoding) > 0: with open(path, 
'rb') as fh: data = fh.read() if self.__options.guess: self._detector.reset() self._detector.feed(data) self._detector.close() data = data.decode( self._detector.result['encoding'] or 'UTF-8', errors='ignore') else: for testcode in self.__options.encoding: try: data.decode(testcode) data = data.decode(testcode, errors='ignore') break except: pass else: data = data.decode('utf-8', errors='ignore') else: with open(path, 'r', errors='ignore') as fh: data = fh.read() # save hit line index data = data.splitlines() matchlist = [] for idx, line in enumerate(data): if self._pat.search(line.encode('utf-8')): matchlist.append(idx) # invert matchlist when --invert_match is set if self.__options.invert_match: matchlist = [x for x in range(len(data)) if x not in matchlist] # output result for item in matchlist: outline = '{path}:{line}:{code}'.format( path=path, line=item + 1, code=data[item]) if self.__options.stdout: outline = outline.encode(sys.stdout.encoding, 'ignore').decode( sys.stdout.encoding, 'ignore') print(outline) def parseCmd(self): parser = ArgumentParser(add_help=False) parser.add_argument('glob', nargs='*', help='File(s)/Dir(s)') # Generic Program Information parser.add_argument( '--help', dest='help', action='store_true', default=False, help= 'Print a usage message briefly summarizing the command-line options and the bug-reporting address, then exit.' ) parser.add_argument( '-V', '--version', dest='version', action='store_true', default=False, help= 'Print the version number of eg to the standard output stream.') # Matching Control parser.add_argument( '-e', '--regexp', dest='regexp', action='append', default=[], help='Search patterns') parser.add_argument( '-f', '--file', dest='file', action='store', help= 'Obtain patterns from file, one per line. The empty file contains zero patterns, and therefore matches nothing.' 
) parser.add_argument( '-i', '--ignore-case', dest='ignore_case', action='store_true', default=False, help= 'Ignore case distinctions in both the patterns and the input files.' ) parser.add_argument( '-v', '--invert-match', dest='invert_match', action='store_true', default=False, help='Invert the sense of matching, to select non-matching lines.') parser.add_argument( '-w', '--word-regexp', dest='word_regexp', action='store_true', default=False, help= 'Select only those lines containing matches that form whole words.') parser.add_argument( '-x', '--line-regexp', dest='line_regexp', action='store_true', default=False, help= 'Select only those matches that exactly match the whole line.') # General Output Control parser.add_argument( '-c', '--count', dest='count', action='store_true', default=False, help= 'Suppress normal output; instead print a count of matching lines for each input file. With the ‘-v’, ‘--invert-match’ option, count non-matching lines.' ) parser.add_argument( '--color', dest='color', action='store', default='never', help= 'Surround the matched (non-empty) strings, matching lines, context lines, file names, line numbers, byte offsets, and separators (for fields and groups of context lines) with escape sequences to display them in color on the terminal. The colors are defined by the environment variable GREP_COLORS and default to ‘ms=01;31:mc=01;31:sl=:cx=:fn=35:ln=32:bn=32:se=36’ for bold red matched text, magenta file names, green line numbers, green byte offsets, cyan separators, and default terminal colors otherwise. COLOR is ‘never’, ‘always’, or ‘auto’.' ) parser.add_argument( '-L', '--files-without-match', dest='files_without_match', action='store_true', default=False, help= 'Suppress normal output; instead print the name of each input file from which no output would normally have been printed. The scanning of every file will stop on the first match.' 
) parser.add_argument( '-l', '--files-with-matches', dest='files_with_matches', action='store_true', default=False, help= 'Suppress normal output; instead print the name of each input file from which output would normally have been printed. The scanning of every file will stop on the first match.' ) parser.add_argument( '-m', '--max-count', dest='max_count', action='store', type=int, default=-1, help= 'Stop reading a file after num matching lines. If the input is standard input from a regular file, and num matching lines are output, grep ensures that the standard input is positioned just after the last matching line before exiting, regardless of the presence of trailing context lines. This enables a calling process to resume a search.' ) parser.add_argument( '-o', '--only-matching', dest='only_matching', action='store_true', default=False, help= 'Print only the matched (non-empty) parts of matching lines, with each such part on a separate output line.' ) parser.add_argument( '-q', '--quiet', dest='quiet', action='store_true', default=False, help= 'Quiet; do not write anything to standard output. Exit immediately with zero status if any match is found, even if an error was detected.' ) parser.add_argument( '-s', '--no-message', dest='no_message', action='store_true', default=False, help= 'Suppress error messages about nonexistent or unreadable files.') # Output Line Prefix parser.add_argument( '-b', '--byte-offset', dest='byte_offset', action='store_true', default=False, help= 'Print the 0-based byte offset within the input file before each line of output. If ‘-o’ (‘--only-matching’) is specified, print the offset of the matching part itself. ' ) parser.add_argument( '-H', '--with-filename', dest='with_filename', action='store_true', default=False, help= 'Print the file name for each match. This is the default when there is more than one file to search.' 
) parser.add_argument( '-h', '--no-filename', dest='no_filename', action='store_true', default=False, help= 'Suppress the prefixing of file names on output. This is the default when there is only one file (or only standard input) to search.' ) parser.add_argument( '--label', dest='label', action='store', help= 'Display input actually coming from standard input as input coming from file LABEL.' ) parser.add_argument( '-n', '--line-number', dest='line_number', action='store_true', default=False, help= 'Prefix each line of output with the 1-based line number within its input file.' ) parser.add_argument( '-T', '--initial-tab', dest='initial_tab', action='store_true', default=False, help= 'Make sure that the first character of actual line content lies on a tab stop, so that the alignment of tabs looks normal. This is useful with options that prefix their output to the actual content: ‘-H’, ‘-n’, and ‘-b’. In order to improve the probability that lines from a single file will all start at the same column, this also causes the line number and byte offset (if present) to be printed in a minimum-size field width. ' ) parser.add_argument( '-u', '--unix-byte-offsets', dest='unix_byte_offsets', action='store_true', default=False, help='Report Unix-style byte offsets.') parser.add_argument( '-Z', '--null', dest='null', action='store_true', default=False, help= 'Output a zero byte (the ASCII NUL character) instead of the character that normally follows a file name.' 
) # Context Line Control parser.add_argument( '-A', '--after-context', dest='after_context', action='store', default=0, type=int, help='Print num lines of trailing context after matching lines.') parser.add_argument( '-B', '--before-context', dest='before_context', action='store', default=0, type=int, help='Print num lines of leading context before matching lines.') parser.add_argument( '-C', '--context', dest='context', action='store', default=0, type=int, help='Print num lines of leading and trailing output context.') # File and Directory Selection parser.add_argument( '-a', '--text', dest='text', action='store_true', default=False, help= 'Process a binary file as if it were text; this is equivalent to the ‘--binary-files=text’ option. ' ) parser.add_argument( '--binary-files', dest='binary_files', action='store', help= 'If the first few bytes of a file indicate that the file contains binary data, assume that the file is of type type. By default, type is ‘binary’, and grep normally outputs either a one-line message saying that a binary file matches, or no message if there is no match. If type is ‘without-match’, grep assumes that a binary file does not match; this is equivalent to the ‘-I’ option. If type is ‘text’, grep processes a binary file as if it were text; this is equivalent to the ‘-a’ option. Warning: ‘--binary-files=text’ might output binary garbage, which can have nasty side effects if the output is a terminal and if the terminal driver interprets some of it as commands. ' ) parser.add_argument( '-D', '--devices', dest='devices', action='store', help= 'If an input file is a device, FIFO, or socket, use action to process it. By default, action is ‘read’, which means that devices are read just as if they were ordinary files. If action is ‘skip’, devices, FIFOs, and sockets are silently skipped. ' ) parser.add_argument( '-d', '--directories', dest='directories', action='store', help= 'If an input file is a directory, use action to process it. 
By default, action is ‘read’, which means that directories are read just as if they were ordinary files (some operating systems and file systems disallow this, and will cause grep to print error messages for every directory or silently skip them). If action is ‘skip’, directories are silently skipped. If action is ‘recurse’, grep reads all files under each directory, recursively; this is equivalent to the ‘-r’ option. ' ) parser.add_argument( '--exclude', dest='exclude', action='append', default=[], help= 'Skip files whose base name matches glob (using wildcard matching). A file-name glob can use ‘*’, ‘?’, and ‘[’...‘]’ as wildcards, and \ to quote a wildcard or backslash character literally. ' ) parser.add_argument( '--exclude-from', dest='exclude_from', action='append', default=[], help= 'Skip files whose base name matches any of the file-name globs read from file (using wildcard matching as described under ‘--exclude’). ' ) parser.add_argument( '--exclude-dir', dest='exclude_dir', action='append', default=[], help= 'Exclude directories matching the pattern dir from recursive directory searches. ' ) parser.add_argument( '-I', dest='I', action='store_true', default=False, help= 'Process a binary file as if it did not contain matching data; this is equivalent to the ‘--binary-files=without-match’ option. ' ) parser.add_argument( '--include', dest='include', action='append', default=[], help= 'Search only files whose base name matches glob (using wildcard matching as described under ‘--exclude’). ' ) parser.add_argument( '-r', '--recursive', dest='recursive', action='store_true', default=False, help= 'For each directory mentioned on the command line, read and process all files in that directory, recursively. This is the same as the ‘--directories=recurse’ option. ' ) # Other Options parser.add_argument( '--line-buffered', dest='line_buffered', action='store_true', default=False, help= 'Use line buffering on output. This can cause a performance penalty. 
' ) parser.add_argument( '--mmap', dest='mmap', action='store_true', default=False, help= 'If possible, use the mmap system call to read input, instead of the default read system call. In some situations, ‘--mmap’ yields better performance. However, ‘--mmap’ can cause undefined behavior (including core dumps) if an input file shrinks while grep is operating, or if an I/O error occurs. ' ) parser.add_argument( '-U', '--binary', dest='binary', action='store_true', default=False, help= 'Treat the file(s) as binary. By default, under MS-DOS and MS-Windows, grep guesses the file type by looking at the contents of the first 32kB read from the file. If grep decides the file is a text file, it strips the CR characters from the original file contents (to make regular expressions with ^ and $ work correctly). Specifying ‘-U’ overrules this guesswork, causing all files to be read and passed to the matching mechanism verbatim; if the file is a text file with CR/LF pairs at the end of each line, this will cause some regular expressions to fail. This option has no effect on platforms other than MS-DOS and MS-Windows. ' ) parser.add_argument( '-z', '--null-data', dest='null_data', action='store_true', default=False, help= 'Treat the input as a set of lines, each terminated by a zero byte (the ASCII NUL character) instead of a newline. Like the ‘-Z’ or ‘--null’ option, this option can be used with commands like ‘sort -z’ to process arbitrary file names. 
' ) # Encoding Options parser.add_argument( '-E', '--encoding', dest='encoding', action='append', default=[], help='Encoding for reading text files.') parser.add_argument( '-G', '--guess', dest='guess', action='store_true', default=False, help='Guess encoding for text files.') parser.add_argument( '-S', '--stdout', dest='stdout', action='store_true', default=False, help='Encoding output text with system default encoding.') options = parser.parse_args() if options.help: parser.print_help() parser.exit() elif options.version: print('Version: 0.1') parser.exit() else: return options
class content_parser(HTMLParser):
    """Extract indexable words/attributes from an email message (Python 2).

    Feeds HTML payloads through HTMLParser, recodes text to unicode using
    the message charset or chardet auto-detection, and collects:
      words  -- set of lowercase tokens, hosts, urls and mail addresses
      attrs  -- set of 'to:addr' / 'from:addr' attribute strings
    NOTE(review): 'elliptics' below refers to an external storage system
    not visible from this file -- confirm against the surrounding project.
    """

    def __init__(self, mailbox, id):
        HTMLParser.__init__(self)
        # one shared detector instance, reset per detect_encoding() call
        self.detector = UniversalDetector()
        # if this ID lives in elliptics
        self.id = id
        # message timestamp (epoch seconds), filled by feed_email()
        self.timestamp = 0
        # all indexes are related to given mailbox
        # if it is None, 'To' address is used
        self.set_mailbox(mailbox)
        self.words = set()
        self.attrs = set()
        # '' means "unknown yet": recode() will auto-detect per text chunk
        self.encoding = ''
        # NOTE(review): patterns are not raw strings; '\w', '\d', '\.' rely
        # on Python passing unknown escapes through unchanged
        self.url = re.compile('(\w+)(\.\w+)+(:\d+)?(/+\w+)+')
        self.host = re.compile('(\w+)(\.\w+)+(:\d+)?')
        self.mail = re.compile('(\w+)([\.!\-_\+]\w+)*@(\w+)([\.!\-_\+]\w+)*')

    def set_mailbox(self, mailbox):
        """Remember which mailbox the generated indexes belong to."""
        self.mailbox = mailbox

    def detect_encoding(self, text):
        """Run chardet over *text* (a byte string) and return the encoding
        name, or None if detection failed."""
        self.detector.reset()
        self.detector.feed(text)
        self.detector.close()
        return self.detector.result['encoding']

    def set_encoding(self, enc):
        """Force the charset used by recode(); pass '' to re-enable
        auto-detection."""
        self.encoding = enc

    def set_encoding_from_email(self, msg):
        """Take the charset from the email part headers, falling back to
        the Charset object's input codec when Content-Type has none."""
        enc = msg.get_content_charset()
        if not enc:
            charset = msg.get_charset()
            if charset:
                enc = charset.input_codec
        self.set_encoding(enc)

    def recode(self, text):
        """Return *text* as a unicode string.

        Uses the configured encoding, auto-detecting when unset; binary
        payloads become u'', undecodable charsets decode with errors
        ignored.
        """
        enc = self.encoding
        if not enc:
            enc = self.detect_encoding(text)
            if enc == 'binary':
                return u''
        if not enc or enc == 'unknown-8bit':
            return unicode(text, errors='ignore')
        #print text.decode(enc), enc
        # NOTE(review): unicode(...) wrapper is redundant -- text.decode(enc)
        # already yields unicode
        return unicode(text.decode(enc))

    def parse_regexps(self, decoded):
        """Add every host, url and mail-address match found in *decoded*
        to the word set."""
        for m in self.host.finditer(decoded):
            s = m.group(0)
            self.words.add(s)
            #print "host: %s" % s.decode('unicode_internal').encode('utf8')
        for m in self.url.finditer(decoded):
            s = m.group(0)
            self.words.add(s)
            #print "url: %s" % s.decode('unicode_internal').encode('utf8')
        for m in self.mail.finditer(decoded):
            s = m.group(0)
            self.words.add(s)
            #print "mail: %s" % s.decode('unicode_internal').encode('utf8')

    # tags may contain meaningful data too
    def handle_starttag(self, tag, attrs):
        # index attribute values and the tag name itself
        for a in attrs:
            self.handle_data(a[1])
        self.handle_data(tag)

    def handle_endtag(self, tag):
        pass

    def handle_data(self, data):
        """HTMLParser callback: recode text content, lowercase it, and
        index both the whole chunk and its regexp matches."""
        decoded = self.recode(data).lower()
        if len(decoded) != 0:
            self.words.add(decoded)
            self.parse_regexps(decoded)

    def feed_email(self, reader):
        """Parse a whole email from *reader*: index Subject words and
        To/Cc/Bcc/From addresses, record the Date timestamp, resolve
        mailbox and message ID, then feed every non-multipart payload
        through the HTML parser.

        Raises NameError when no mailbox or message ID can be determined.
        """
        p = Parser()
        msg = p.parse(reader)
        from email.header import decode_header

        def parse_header(h):
            # decode each RFC 2047 fragment of header *h* to unicode
            if not h or len(h) == 0:
                return []
            ret = []
            for x in decode_header(h):
                if not x[1]:
                    ret.append(x[0])
                    #print x[0]
                else:
                    #print x[0].decode(x[1]).encode('utf8')
                    ret.append(x[0].decode(x[1]))
            return ret

        def get_mail_addr(ret):
            # return the first real (non-local) address found, else None
            for r in ret:
                addr = parseaddr(r)
                if len(addr[1]) != 0:
                    # sanity check to find non-local addresses, i.e. not 'username', but something like '*****@*****.**')
                    m = re.match("([^@|\s]+@[^@]+\.[^@|\s]+)", addr[1])
                    if m:
                        return m.group(1)
            return None

        def feed_mail_addr(ret, prefix):
            # index all header fragments, plus one 'prefix:addr' attribute
            for r in ret:
                self.words.add(r)
            addr = get_mail_addr(ret)
            if addr:
                self.attrs.add(prefix + ':' + addr)

        for r in parse_header(msg['Subject']):
            self.words.add(r)
        feed_mail_addr(parse_header(msg['Cc']), 'to')
        feed_mail_addr(parse_header(msg['Bcc']), 'to')
        feed_mail_addr(parse_header(msg['From']), 'from')
        to_header = parse_header(msg['To'])
        feed_mail_addr(to_header, 'to')
        self.timestamp = mktime_tz(parsedate_tz(msg['Date']))
        # this address will be used to modify every index,
        # i.e. this scripts only updates indexes which belong to given mailbox
        if not self.mailbox:
            self.set_mailbox(get_mail_addr(to_header))
            if not self.mailbox:
                raise NameError("No mailbox name has been provided: there is no 'To' header and nothing was provided via command line, exiting")
        if not self.id or len(self.id) == 0:
            msg_id = msg['Message-Id']
            if not msg_id:
                raise NameError("There is no 'Message-Id' header and no ID has been specified via command line, exiting")
            # @get_mail_addr() performs sanity check on its arguments
            self.id = get_mail_addr([msg_id])
            if not self.id or len(self.id) == 0:
                raise NameError("Could not detect ID in 'Message-Id' header and "
                        "no ID has been provided via command line, exiting")

        def feed_check_multipart(msg):
            # recurse through multipart containers, feed leaf payloads
            if not msg.is_multipart():
                self.set_encoding_from_email(msg)
                self.feed(msg.get_payload(decode=True))
            else:
                # these are multipart parts as email.Message objects
                for m in msg.get_payload():
                    feed_check_multipart(m)

        feed_check_multipart(msg)
#!/usr/bin/python
"""Print the chardet-detected character encoding of the file named in argv[1]."""
import sys
import glob
from chardet.universaldetector import UniversalDetector

detector = UniversalDetector()
# Fix: the original used the removed-in-Py3 file() builtin and never closed
# the handle; a context manager reads and releases it deterministically.
with open(sys.argv[1], 'rb') as fh:
    contents = fh.read()
detector.feed(contents)
detector.close()
# print(...) with a single argument behaves identically on Python 2 and 3.
print(detector.result['encoding'])
def Load(self, fullpathfilename = u"", encoding=None):
    """
    Load the user list from the given file and parse its structure.
    If the filename is omitted, the file most recently Loaded or Saved is read.
    When *encoding* is given, the file is read with that codec; when omitted,
    the encoding is auto-detected with chardet.

    :param string fullpathfilename: file to read
    :param string encoding: file encoding (officially "Windows-1252: 'cp1252'"
                            or "UTF-8 with BOM: 'utf-8-sig'")
    :raises IOError: on an empty filename or a missing file
    :raises SyntaxError: when OnCreateLineObject() returns a non-UserLine
    :returns: self
    """
    if len(fullpathfilename) != 0:
        self._fullpathfilename = fullpathfilename.lstrip().rstrip()
    if len(self._fullpathfilename) == 0:
        raise IOError, "Invalid filename."
    if not os.path.exists(self._fullpathfilename):
        raise IOError, "No such file."
    if self.ChildCount() != 0:
        # If children were already attached, remove them all first.
        for child in self.EachChilds():
            self.DeleteChild(child)
    # --------------------------------------------------
    # Encoding detection
    # --------------------------------------------------
    if encoding is None:
        detector = UniversalDetector()
        detector.reset()
        for line in file(self._fullpathfilename, 'rb'):
            detector.feed(line)
            if detector.done:
                break
        detector.close()
        encoding = detector.result["encoding"]
    self._encoding = encoding
    #print u"Load encoding: %s" % (self._encoding)
    # --------------------------------------------------
    # Default handler for decoding errors during load
    # --------------------------------------------------
    if getattr(self, "OnDecodingErrorFromLoad", None) == None:
        def _onDecodingErrorFromLoad(linecount, linestring, encoding):
            # The command prompt renders cp932 (Microsoft's Shift-JIS
            # extension), so an implicit code conversion happens on print.
            # Printing linestring would therefore fail for characters that
            # do not exist in cp932. This cannot be worked around, so the
            # default behavior is to not print the line content.
            print u"UNICODE(%s) decoding error! skip line: %s" % (encoding, linecount)
            return None
        self.OnDecodingErrorFromLoad = _onDecodingErrorFromLoad
    # --------------------------------------------------
    # Default factory for line objects
    # --------------------------------------------------
    if getattr(self, "OnCreateLineObject", None) == None:
        def _onCreateLineObject(linecount, linestring):
            # Create the line object.
            return UserLine(linestring)
        self.OnCreateLineObject = _onCreateLineObject
    # --------------------------------------------------
    # Read the userlist and expand it into objects
    # --------------------------------------------------
    thisProcedure = self
    thisOperation = UserOperation()
    thisProcedure.AddChild(thisOperation)
    linecount = 0
    fileuserlist = open(self._fullpathfilename, "rU")
    #fileuserlist = codecs.open(self._fullpathfilename, "rU", "shift_jis")
    try:
        for linestring in fileuserlist:
            if linecount == 0:
                # Strip a possible BOM from the very first line.
                linestring = CommonLib.CutBomString(linestring)
            linecount += 1
            # --------------------------------------------------
            # Drop characters that fail the encoding conversion.
            # --------------------------------------------------
            try:
                linestring = u"%s" % (unicode(linestring, self._encoding).encode("utf-8"))
            except UnicodeDecodeError:
                # Hand copies to the hook so it cannot mutate loop state.
                param_linecount = copy.copy(linecount)
                param_linestring = copy.copy(linestring)
                param_encoding = copy.copy(self._encoding)
                linestring = self.OnDecodingErrorFromLoad(param_linecount, param_linestring, param_encoding)
                if (isinstance(linestring, str)) or (isinstance(linestring, unicode)):
                    # The hook returned replacement text; retry the
                    # conversion once, emptying the line on failure.
                    try:
                        linestring = u"%s" % (unicode(linestring, self._encoding).encode("utf-8"))
                    except UnicodeDecodeError:
                        linestring = u""
                else:
                    linestring = u""
            # Create the line object.
            #thisLine = UserLine(linestring)
            param_linecount = copy.copy(linecount)
            thisLine = self.OnCreateLineObject(param_linecount, linestring)
            if not isinstance(thisLine, UserLine):
                raise SyntaxError, "OnCreateLineObject() is an invalid object to return."
            if thisLine.IsType(EnumCommandType.RULE):
                # Start a new user operation at each rule definition.
                thisOperation = UserOperation()
                thisProcedure.AddChild(thisOperation)
            # Append the line to the current user operation.
            thisOperation.AddChild(thisLine)
    finally:
        # Close the userlist file.
        fileuserlist.close()
    return self
"""Detection smoke test: report chardet's verdict for each matched file."""
import sys, glob
sys.path.insert(0, '..')
from chardet.universaldetector import UniversalDetector

count = 0
u = UniversalDetector()
for f in glob.glob(r'D:\temp\test\mlalloc.h'):
    print(f.ljust(60), end=' ')
    u.reset()
    # Fix: the original opened the file without ever closing it, leaking
    # one handle per iteration; 'with' guarantees the close.
    with open(f, 'rb') as fh:
        for line in fh:
            u.feed(line)
            if u.done:
                break
    u.close()
    result = u.result
    if result['encoding']:
        print(result['encoding'], 'with confidence', result['confidence'])
    else:
        print('******** no result')
    count += 1
print(count, 'tests')