def decode(filename, data):
    """Decode playlist/XML bytes to text, picking the codec from the
    file extension and, for .m3u/.pls, falling back to chardet.

    Returns a dict {'data': decoded (or raw) data, 'encoding': name}.
    """
    # Safe default so every path defines `encoding` (the original raised
    # NameError for unknown extensions).
    encoding = 'ascii'
    if '.m3u8' in filename:
        encoding = 'utf-8'
        data = data.decode(encoding)
    elif '.m3u' in filename or '.pls' in filename:
        try:
            encoding = 'ISO-8859-2'
            data = data.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            encoding = 'ascii'
            if chardet:
                u = UniversalDetector()
                u.feed(data)
                u.close()
                if u.result['confidence'] > 0.5:
                    try:
                        # BUG FIX: the original read the undefined name
                        # `result`; the detector result lives on `u`.
                        encoding = u.result['encoding']
                        data = data.decode(encoding)
                    except (UnicodeDecodeError, LookupError, TypeError):
                        encoding = 'ascii'
    elif '.xml' in filename or '.xspf' in filename:
        # XML carries its own declaration; leave the bytes untouched.
        encoding = 'utf-8'
    return {'data': data, 'encoding': encoding}
def validate_csv(f):
    """Return dialect information about given csv file.

    Sniffs the CSV dialect, then feeds every row through chardet to
    guess the file encoding.  Returns a dict with 'delimiter',
    'encoding' and 'is_valid'; on sniffing failure the delimiter and
    encoding are empty strings.
    """
    with open(f.fullpath, 'rU') as csvfile:
        is_valid = False
        try:
            dialect = csv.Sniffer().sniff(csvfile.read(1024))
        except Exception as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (f.name + f.superformat, e))
            return {
                'delimiter': '',
                'encoding': '',
                'is_valid': is_valid
            }
        u = UniversalDetector()
        dialect.strict = True
        csvfile.seek(0)
        reader = csv.reader(csvfile, dialect)
        try:
            for row in reader:
                # BUG FIX: chardet expects bytes but csv yields text rows
                # in Python 3; re-encode before feeding the detector
                # (matches the sibling validate_csv implementation).
                u.feed(dialect.delimiter.join(row).encode('utf-8'))
            is_valid = True
        except csv.Error as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (f.name + f.superformat, e))
        finally:
            u.close()
    return {
        'delimiter': dialect.delimiter,
        'encoding': u.result['encoding'],
        'is_valid': is_valid
    }
def repo_cat_file(self, repo_path, commit_hash, path):
    """Return up to 512 KiB of the blob at `path` for `commit_hash`,
    re-encoded to utf-8, with a stage-file cache in front of `git show`.

    Known binary file types short-circuit with a placeholder message.
    """
    (commit_hash, path) = self._all_to_utf8(commit_hash, path)
    if not self._path_check_chdir(repo_path, commit_hash, path):
        return ''
    path = self._get_quote_path(path)
    if path.startswith('./'):
        path = path[2:]
    file_type = path.split('.')[-1]
    if file_type in BINARY_FILE_TYPE:
        return u'二进制文件'
    stage_file = self._get_stage_file(repo_path, commit_hash, path)
    result = self._read_load_stage_file(stage_file)
    if result is not None:
        return result['blob']
    command = '/usr/bin/git show %s:%s | /usr/bin/head -c 524288' % (commit_hash, path)
    try:
        # Restore default SIGPIPE handling so `head` can cut the pipe.
        signal.signal(signal.SIGPIPE, signal.SIG_DFL)
        result = check_output(command, shell=True)
        ud = UniversalDetector()
        ud.feed(result)
        ud.close()
        if ud.result['encoding']:
            encoding = ud.result['encoding']
            # BUG FIX: the original tested `!= 'utf-8' or != 'utf8'`,
            # which is always true; use `and` so utf-8 blobs are not
            # pointlessly round-tripped through decode/encode.
            if encoding != 'utf-8' and encoding != 'utf8':
                result = result.decode(encoding).encode('utf-8')
        self._dumps_write_stage_file({'blob': result}, stage_file)
        return result
    except Exception as e:
        logger.exception(e)
def detectEncoding(self, parseMeta=True, chardet=True):
    """Work out the stream encoding: BOM, then <meta>, then chardet,
    then the configured default.  Returns (encoding, confidence)."""
    # A BOM is authoritative; detectBOM() also consumes it from the stream.
    encoding = self.detectBOM()
    if encoding is not None:
        return encoding, "certain"
    confidence = "tentative"
    # Next best: meta elements carrying encoding information.
    if parseMeta:
        encoding = self.detectEncodingMeta()
    # Guess with chardet, if available.
    if encoding is None and chardet:
        try:
            from chardet.universaldetector import UniversalDetector
            buffers = []
            detector = UniversalDetector()
            while not detector.done:
                chunk = self.rawStream.read(self.numBytesChardet)
                assert isinstance(chunk, bytes)
                if not chunk:
                    break
                buffers.append(chunk)
                detector.feed(chunk)
            detector.close()
            encoding = lookupEncoding(detector.result['encoding'])
            self.rawStream.seek(0)
        except ImportError:
            pass
    # If all else fails use the default encoding.
    if encoding is None:
        encoding = lookupEncoding(self.defaultEncoding)
    return encoding, confidence
def detect_encoding(file_path):
    """Guess the encoding of the file at *file_path* with chardet."""
    detector = UniversalDetector()
    with open(file_path, 'rb') as handle:
        for chunk in handle:
            detector.feed(chunk)
    detector.close()
    return detector.result['encoding']
def detect_encoding(f, verbose=False):
    """Detect the encoding of an open binary file object.

    The file position is saved and restored, so the caller can continue
    reading from where it left off.

    Args:
        f (obj): The file like object to detect.
        verbose (Optional[bool]): Print the raw detector result
            (default: False).

    Returns:
        dict: The chardet result, e.g.
        ``{'confidence': 0.99, 'encoding': 'utf-8'}``.
    """
    start = f.tell()
    detector = UniversalDetector()
    for chunk in f:
        detector.feed(chunk)
        if detector.done:
            break
    detector.close()
    f.seek(start)
    if verbose:
        print('result', detector.result)
    return detector.result
def detect(view, file_name, cnt):
    """Kick off encoding detection for *view*, sampling at most *cnt*
    lines of *file_name*; cached results short-circuit the detection."""
    print("detect...")
    if not file_name or not os.path.exists(file_name) or os.path.getsize(file_name) == 0:
        return
    encoding = encoding_cache.pop(file_name)
    if encoding:
        print("it is already at cache encoding_cache.json:", encoding)
        sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
        return
    sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
    detector = UniversalDetector()
    with open(file_name, 'rb') as stream:
        for raw in stream:
            # cut MS-Windows CR code
            detector.feed(raw.replace(b'\r', b''))
            cnt -= 1
            if detector.done or cnt == 0:
                break
    detector.close()
    encoding = detector.result['encoding']
    print(encoding)
    if encoding:
        encoding = encoding.upper()
    confidence = detector.result['confidence']
    print(confidence)
    sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
def get_coding(text, force_chardet=False):
    """Return the declared or detected codec of *text*, or None.

    Unless *force_chardet* is set, a PEP 263 style coding declaration in
    the first two lines wins; otherwise chardet is consulted for binary
    strings.
    """
    if not force_chardet:
        for raw_line in text.splitlines()[:2]:
            try:
                match = CODING_RE.search(to_text_string(raw_line))
            except UnicodeDecodeError:
                # to_text_string assumes utf8-like input; without knowing
                # the real encoding this can fail, so skip the line.
                continue
            if match:
                codec = match.group(1)
                # guard against false positives that would break decoding
                if codec in CODECS:
                    return codec
    # Fallback using chardet
    if is_binary_string(text):
        detector = UniversalDetector()
        for raw_line in text.splitlines()[:2]:
            detector.feed(raw_line)
            if detector.done:
                break
        detector.close()
        return detector.result['encoding']
    return None
def detect_local_charset(filepath): global VERBOSE # Open to read in binary. fp = open(filepath, "rb") detector = UniversalDetector() if VERBOSE: print "Reading file to detect encoding..." for line in fp: line = line.replace(b'\r',b'') detector.feed(line) if detector.done: break fp.close() detector.close() if VERBOSE: print "Encoding: %s" % detector.result["encoding"] print "Confidence: {0:.0f}% ".format(detector.result["confidence"]*100) if detector.result["confidence"] > 0.75: encoding = detector.result["encoding"] return encoding.replace('-','_').lower() # Format for codecs else: return None
def decode_data(data, encoding_guess, can_be_binary=True):
    """Try to decode *data* to unicode; return an (is_text, data) tuple.

    *encoding_guess* is tried first, falling back to chardet (slow) on
    failure.  When *can_be_binary* is False the initial binary sniff of
    the first 1024 bytes is skipped.  If nothing decodes, *data* is
    returned unchanged with is_text False.
    """
    if can_be_binary and is_binary_string(data[:1024]):
        return False, data
    try:
        # Try our default encoding.
        return True, data.decode(encoding_guess)
    except UnicodeDecodeError:
        pass
    # Fall back to chardet - chardet is really slow, which is why we
    # don't just do chardet from the start.
    detector = UniversalDetector()
    for piece in ichunks(80, data):
        detector.feed(piece)
        if detector.done:
            break
    detector.close()
    guessed = detector.result['encoding']
    if guessed:
        try:
            return True, data.decode(guessed)
        except (UnicodeDecodeError, LookupError):
            # Either we couldn't decode or chardet gave us an encoding
            # that python doesn't recognize (yes, it can do that).
            pass
    # Leave data as str.
    return False, data
def detect_encoding(bytes, encoding=None):
    """Detect encoding of a byte stream.

    A caller-supplied *encoding* is trusted, except that a 'utf-8' claim
    is upgraded to 'utf-8-sig' when the stream starts with a BOM.
    Otherwise chardet samples up to config.ENCODING_DETECTION_MAX_LINES
    lines; low confidence or plain ascii fall back to
    config.DEFAULT_ENCODING.  The stream is rewound before returning.
    """
    if encoding is not None:
        if encoding.lower() == 'utf-8':
            # Peek for a BOM so callers decode it away transparently.
            if bytes.read(len(codecs.BOM_UTF8)) == codecs.BOM_UTF8:
                encoding = 'utf-8-sig'
            bytes.seek(0)
        return encoding
    # Deferred import to reduce tabulator import time.
    from chardet.universaldetector import UniversalDetector
    detector = UniversalDetector()
    remaining = config.ENCODING_DETECTION_MAX_LINES
    while remaining > 0:
        detector.feed(bytes.readline())
        if detector.done:
            break
        remaining -= 1
    detector.close()
    bytes.seek(0)
    confidence = detector.result['confidence']
    encoding = detector.result['encoding']
    # Do not use if not confident
    if confidence < config.ENCODING_DETECTION_MIN_CONFIDENCE:
        encoding = config.DEFAULT_ENCODING
    # Default to utf-8 for safety
    if encoding == 'ascii':
        encoding = config.DEFAULT_ENCODING
    return encoding
def guessWithChardet(content):
    """Run chardet over an iterable of byte chunks and return its result."""
    detector = UniversalDetector()
    for chunk in content:
        detector.feed(chunk)
    detector.close()
    return detector.result
def detect_encoding(file):
    """Return the chardet-detected encoding of the file at path *file*."""
    detector = UniversalDetector()
    # BUG FIX: the original leaked the file handle (open(...).readlines()
    # with no close); `with` guarantees it is closed, and iterating the
    # handle avoids slurping the whole file before detection starts.
    with open(file, 'rb') as fp:
        for line in fp:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']
def get_file_encoding(file_name):
    # Detect the text encoding of file_name; returns "" for missing files.
    # A GB2312 guess is double-checked by actually decoding the file,
    # widening to gbk and then gb18030 (supersets of gb2312) on failure.
    if not os.path.isfile(file_name):
        return ""
    u = UniversalDetector()
    with open(file_name, "rb") as f:
        # sample at most ~500 lines
        for index, line in enumerate(f):
            u.feed(line)
            if index > 500:
                break
    u.close()
    if u.result["encoding"].lower() == "gb2312":
        try:
            _file = codecs.open(file_name, encoding="gb2312")
            _file.readlines()
            result = "gb2312"
        except Exception as e:
            print e
            try:
                _file = codecs.open(file_name, encoding="gbk")
                _file.readlines()
                result = "gbk"
            except Exception as e:
                print e
                # last resort: gb18030 covers all of gbk/gb2312
                result = "gb18030"
    else:
        result = u.result["encoding"]
    return result
def detect(view, file_name, encoding):
    """Detect *file_name*'s encoding asynchronously unless a cached or
    BOM-derived encoding already settles the question."""
    if not file_name or not os.path.exists(file_name):
        return
    if not encoding.endswith(' with BOM'):
        encoding = encoding_cache.pop(file_name)
    if encoding:
        sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
        return
    sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
    detector = UniversalDetector()
    remaining = SETTINGS['max_detect_lines']
    with open(file_name, 'rb') as stream:
        for raw in stream:
            # cut MS-Windows CR code
            detector.feed(raw.replace(b'\r', b''))
            remaining -= 1
            if detector.done or remaining == 0:
                break
    detector.close()
    encoding = detector.result['encoding']
    if encoding:
        encoding = encoding.upper()
    confidence = detector.result['confidence']
    sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
def process_buffer(buf, d):
    """Parse one raw page record: line 0 is the header, the body starts
    after the second blank line.  Stores (header, html) in *d* keyed by
    the URL taken from the header."""
    if not buf:
        return
    header = buf[0]
    url = header.split()[1]
    # advance past the two blank lines that terminate the header section
    skip = 0
    empty_lines = 0
    while empty_lines < 2:
        skip += 1
        if not buf[skip].strip():
            empty_lines += 1
    rawhtml = "".join(buf[skip + 1:])
    html = None
    try:
        html = rawhtml.decode("utf-8")
    except:
        try:
            # utf-8 failed; let chardet guess from the body lines
            detector = UniversalDetector()
            for piece in buf[skip + 1:]:
                detector.feed(piece)
                if detector.done:
                    break
            detector.close()
            html = rawhtml.decode(detector.result["encoding"])
        except:
            # last resort: drop undecodable bytes
            html = rawhtml.decode("utf-8", errors='ignore')
    assert html is not None, "Error processing %s\n" % rawhtml
    html = html.replace(r"\r", "")
    d[url] = (header, html)
def transferToEncoding(filename, toCode):
    # Convert a text file in place to the target encoding `toCode`.
    # The current encoding is guessed with chardet; directories and
    # empty files are rejected.  Returns False on early failure.
    if os.path.isdir(filename):
        print "error:not file"
        return False
    try:
        detector = UniversalDetector()
        f = open(filename, 'r')
        ls = f.readlines()
        f.close()
        # An empty file cannot be detected, so just bail out with a hint.
        if len(ls) == 0:
            print printRed(filename), printRed(' is blank file, can not detect encoding')
            return False;
        # Detect the current encoding.
        for l in ls:
            detector.feed(l)
            if detector.done:
                break
        detector.close()
        encode = gb(detector.result['encoding'])
        # Rewrite the file only if it is not already in the target encoding.
        if encode.lower() != toCode.lower():
            f = open(filename, 'w')
            print printGreen(filename) + ' ====> ' + toCode + ' SUCCESS'
            for l in ls:
                f.write(unicode(l, encode).encode(toCode))
            f.close()
        else:
            pass
    except Exception, e:
        traceback.print_exc()
        print 'exception'
def detect(view, file_name):
    # Detect file_name's encoding with chardet (bounded by the
    # 'max_detect_lines' setting) and hand the result to the view.
    # Low-confidence (<0.7) results only update the status bar and ask
    # the user to choose manually.
    if not os.path.exists(file_name):
        return
    encoding = encoding_cache.pop(file_name)
    if encoding:
        sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
        return
    sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Detecting encoding, please wait...'), 0)
    detector = UniversalDetector()
    cnt = SETTINGS['max_detect_lines']
    fp = file(file_name, 'rb')
    for line in fp:
        detector.feed(line)
        cnt -= 1
        if detector.done or cnt == 0:
            break
    fp.close()
    detector.close()
    encoding = detector.result['encoding']
    confidence = detector.result['confidence']
    if not encoding or confidence < 0.7:
        sublime.set_timeout(lambda: view.set_status('origin_encoding', 'Encoding can not be detected, please choose one manually. (%s/%.2f)' % (encoding, confidence)), 0)
        return
    encoding = encoding.upper()
    # Map detected aliases to the supersets handled better downstream.
    if encoding == 'BIG5':
        encoding = 'BIG5-HKSCS'
    elif encoding == 'GB2312':
        encoding = 'GBK'
    sublime.set_timeout(lambda: init_encoding_vars(view, encoding), 0)
def get_unicode_content(file_path, encoding=None):
    """
    Return a unicode string of the files contents using the given encoding.
    If no encoding is given then chardet will be used to determine the
    encoding.

    Note that this uses the chardet library and may cause problems, if an
    error is thrown then a utf-8 encoding is assumed and unrecognized
    characters are discarded.
    """
    from chardet.universaldetector import UniversalDetector
    try:
        if not encoding:
            detector = UniversalDetector()
            contents = ''
            with open(file_path, 'rb') as f:
                contents = f.read()
            detector.feed(contents)
            detector.close()
            determined_encoding = detector.result['encoding']
            return contents.decode(encoding=determined_encoding)
        else:
            with open(file_path, 'r') as f:
                return unicode(f.read(), encoding=encoding, errors='ignore')
    except UnicodeError:
        # Fall back to utf-8, discarding unrecognized characters.
        with open(file_path, 'r') as f:
            return unicode(f.read(), encoding='utf-8', errors='ignore')
def detect(view, file_name, encoding):
    """Asynchronously detect *file_name*'s encoding, preferring cached or
    BOM-derived results, and normalize BIG5/GB2312 to their supersets."""
    if not file_name or not os.path.exists(file_name):
        return
    if not encoding.endswith(" with BOM"):
        encoding = encoding_cache.pop(file_name)
    if encoding:
        sublime.set_timeout(lambda: init_encoding_vars(view, encoding, detect_on_fail=True), 0)
        return
    sublime.set_timeout(lambda: view.set_status("origin_encoding", "Detecting encoding, please wait..."), 0)
    detector = UniversalDetector()
    budget = SETTINGS["max_detect_lines"]
    with open(file_name, "rb") as stream:
        for raw in stream:
            # cut MS-Windows CR code
            detector.feed(raw.replace(b"\r", b""))
            budget -= 1
            if detector.done or budget == 0:
                break
    detector.close()
    encoding = detector.result["encoding"]
    if encoding:
        encoding = encoding.upper()
        if encoding == "BIG5":
            encoding = "BIG5-HKSCS"
        elif encoding == "GB2312":
            encoding = "GBK"
    confidence = detector.result["confidence"]
    sublime.set_timeout(lambda: check_encoding(view, encoding, confidence), 0)
def description_of(file_path, name='stdin', byte=1000000):
    """
    Return a string describing the probable encoding of a file,
    examining at most *byte* bytes of it.
    """
    from chardet.universaldetector import UniversalDetector
    u = UniversalDetector()
    consumed = 0
    with open(file_path, 'rb') as stream:
        for line in stream:
            if consumed + len(line) > byte:
                # feed only up to the byte budget, then stop
                u.feed(line[:byte - consumed])
                break
            u.feed(line)
            consumed += len(line)
    u.close()
    result = u.result
    if result['encoding']:
        return '%s: %s with confidence %s' % (name, result['encoding'], result['confidence'])
    else:
        return '%s: no result' % name
def sanitize_texts(directory):
    """
    Strip all header and copyright information from downloaded text files
    in the specified directory using gutenberg.strip_headers module and
    ensure proper file encodings.

    :param directory: <String> A string containing the full path to
        directory containing files to strip
    :return:
    """
    for entry in os.listdir(directory):
        file_path = os.path.join(directory, entry)
        if not os.path.isfile(file_path):
            continue
        # Detect file encoding, takes time to run
        detector = UniversalDetector()
        with open(file_path, 'rb') as inf:
            for raw in inf.readlines():
                detector.feed(raw)
                if detector.done:
                    break
        detector.close()
        encoding = detector.result['encoding']
        # Open file, strip headers, and save result
        with open(file_path, 'r', encoding=encoding) as inf:
            stripped = strip_headers(inf.read()).strip()
        os.remove(file_path)
        with open(file_path, 'w+', encoding=encoding) as outf:
            outf.write(stripped)
def safe_open(path, mode='r'):
    '''
    Retrieves a file's encoding and returns the opened file. If the opened
    file begins with a BOM, it is read before the file object is returned.
    This allows callers to not have to handle BOMs of files.

    :param str path: file path to open
    :param str mode: the mode to open the file (see :func:`open`)
    :returns file: the opened file object
    '''
    detector = UniversalDetector()
    with open(path, 'rb') as raw:
        chunk = head = raw.read(0x1000)
        while chunk and not detector.done:
            detector.feed(chunk)
            if not detector.done:
                chunk = raw.read(0x1000)
    detector.close()
    if not head:
        # empty file: nothing to detect, plain open is fine
        return open(path, mode)
    handle = codecs.open(path, mode, encoding=detector.result['encoding'])
    # skip a leading BOM so callers never see it
    for bom in (codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE, codecs.BOM_UTF8,
                codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE):
        if head.startswith(bom):
            handle.seek(len(bom))
            break
    return handle
def _guessEncoding(self, path):
    """Opens a file from the given `path` and checks the file encoding.

    The file must exist on the file system and end with the extension
    `.csv`. The file is read line by line until the encoding could be
    guessed. On a successfull identification, the widgets of this dialog
    will be updated.

    Args:
        path (string): Path to a csv file on the file system.
    """
    if os.path.exists(path) and path.lower().endswith('csv'):
        encodingDetector = UniversalDetector()
        # BUG FIX: read in binary mode — chardet expects bytes; text mode
        # would already have decoded (or mangled) the data in Python 3.
        with open(path, 'rb') as fp:
            for line in fp:
                encodingDetector.feed(line)
                if encodingDetector.done:
                    break
        encodingDetector.close()
        result = encodingDetector.result['encoding']
        if result is None:
            # nothing detected; leave the widgets untouched
            return
        result = result.replace('-', '_')
        self._encodingKey = _calculateEncodingKey(result)
        if self._encodingKey:
            index = self._encodingComboBox.findText(result.upper())
            self._encodingComboBox.setCurrentIndex(index)
def listTextBasedFiles(file):
    """Print '<path>: <mime>, <encoding>, <N> lines' for text files;
    non-text MIME types get a short explanation instead."""
    try:
        # Detect MIME type for file
        # https://github.com/kaito834/myNotes/blob/master/snippets/python/magic_from_file.py
        # https://github.com/ahupp/python-magic#usage
        f_mimetype = magic.from_file(file, mime=True)
    except Exception as e:
        print("[!] Exception: {0} ({1})".format(e, type(e)))
        # BUG FIX: without this return, f_mimetype would be undefined
        # below and the function would crash with a NameError.
        return

    # Open and count lines if MIME type of the file is text/*
    if f_mimetype.split('/')[0] == 'text':
        # Detect encoding by chardet.universaldetector.UniversalDetector()
        # https://chardet.readthedocs.io/en/latest/usage.html#advanced-usage
        detector = UniversalDetector()
        with open(file, 'rb') as f:
            for line in f.readlines():
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        with open(file, "r", encoding=detector.result['encoding']) as f:
            line_count = 0
            for line in f.readlines():
                line_count += 1
        print("{0}: {1}, {2}, {3} lines".format(file, f_mimetype, detector.result['encoding'], line_count))
    else:
        print("{0}: NOT txet based file (reason: MIME type isn't text/*: {1})".format(file, f_mimetype))
def description_of(lines, name='stdin'):
    """
    Return a string describing the probable encoding of a file or
    list of strings.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    """
    detector = UniversalDetector()
    for raw in lines:
        detector.feed(bytearray(raw))
        # shortcut out of the loop to save reading further - particularly
        # useful if we read a BOM.
        if detector.done:
            break
    detector.close()
    result = detector.result
    if PY2:
        name = name.decode(sys.getfilesystemencoding(), 'ignore')
    if result['encoding']:
        return '{0}: {1} with confidence {2}'.format(name, result['encoding'], result['confidence'])
    return '{0}: no result'.format(name)
def validate_csv(document):
    """Return dialect information about given csv file."""
    with open(document.document.uri, 'rU') as csvfile:
        is_valid = False
        try:
            dialect = csv.Sniffer().sniff(csvfile.read(1024))
        except Exception as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (document.get_filename(), e))
            return {'delimiter': '', 'encoding': '', 'is_valid': is_valid}
        universal_detector = UniversalDetector()
        dialect.strict = True
        csvfile.seek(0)
        try:
            for record in csv.reader(csvfile, dialect):
                # chardet wants bytes; csv rows come back as text
                universal_detector.feed(
                    dialect.delimiter.join(record).encode('utf-8'))
            is_valid = True
        except csv.Error as e:
            current_app.logger.debug(
                'File %s is not valid CSV: %s' % (document.get_filename(), e))
        finally:
            universal_detector.close()
    return {
        'delimiter': dialect.delimiter,
        'encoding': universal_detector.result['encoding'],
        'is_valid': is_valid
    }
def decode(string):
    """ detects string encoding and returns decoded string"""
    detector = UniversalDetector()
    detector.feed(string)
    detector.close()
    return string.decode(detector.result['encoding'])
def deserialize(file_bytes):
    """Decode *file_bytes* (utf-8, else a chardet guess) and return a csv
    reader over its lines, auto-selecting tab or semicolon delimiters.

    Raises ValueError when the encoding cannot be guessed confidently or
    when neither delimiter splits the first line.
    """
    try:
        file_string = file_bytes.decode('utf-8')
    except UnicodeDecodeError as ude:
        # utf-8 failed: ask chardet, line by line
        detector = UniversalDetector()
        for chunk in BytesIO(file_bytes):
            detector.feed(chunk)
            if detector.done:
                break
        detector.close()
        if detector.result['confidence'] < 0.5:
            raise ValueError("Failed to guess the encoding of the file (it's not utf-8). Use utf-8 encoded files.")
        try:
            file_string = file_bytes.decode(detector.result['encoding'])
        except UnicodeDecodeError:
            raise ValueError("Failed to guess the encoding of the file (it's not utf-8). Use utf-8 encoded files. "
                             "(The invalid character is '{char:#x}' at {pos})".format(pos=ude.start, char=file_bytes[ude.start]))
    csv_lines = file_string.splitlines()
    first_line = csv_lines[:1]
    tab_cells = next(csv.reader(first_line, delimiter="\t"))
    semicolon_cells = next(csv.reader(first_line, delimiter=";"))
    if len(tab_cells) > 1:
        return csv.reader(csv_lines, delimiter="\t")
    if len(semicolon_cells) > 1:
        return csv.reader(csv_lines, delimiter=";")
    raise ValueError("Csv file is not delimited by ';' or 'tab'")
def get_csv_reader(filename, charset=None):
    # Build a UnicodeReader for `filename`, guessing the character set
    # with chardet (unless `charset` is given) and the CSV dialect with
    # csv.Sniffer on the first 1 KiB.
    logger.info("Reading CSV file %s", filename)
    myfile = open(filename, "rb")
    if not charset:
        # Detect encoding
        detector = UniversalDetector()
        for line in myfile.xreadlines():
            detector.feed(line)
            if detector.result["confidence"] > 0.01:
                logger.debug("Result so far: %s", detector.result)
            if detector.done:
                break
        detector.close()
        charset = detector.result["encoding"]
        logger.info("Found encoding %s", charset)
        # Reset the file index
        myfile.seek(0)
    # Attempt to detect the dialect
    encodedfile = codecs.EncodedFile(myfile, charset)
    dialect = csv.Sniffer().sniff(encodedfile.read(1024))
    logger.info("Found dialect %s", dialect)
    # Reset the file index
    myfile.seek(0)
    return UnicodeReader(myfile, dialect=dialect, encoding=charset)
class FileOpener(object):
    # Opens text files as lists of lines, resolving the encoding either
    # with chardet (use_chardet=True) or by walking a module-level list
    # of candidate `encodings`.

    def __init__(self, use_chardet, quiet_level):
        self.use_chardet = use_chardet
        if use_chardet:
            self.init_chardet()
        self.quiet_level = quiet_level

    def init_chardet(self):
        # Import lazily so chardet stays an optional dependency.
        try:
            from chardet.universaldetector import UniversalDetector
        except ImportError:
            raise ImportError("There's no chardet installed to import from. "
                              "Please, install it and check your PYTHONPATH "
                              "environment variable")
        self.encdetector = UniversalDetector()

    def open(self, filename):
        # Dispatch to the configured strategy; returns (lines, encoding).
        if self.use_chardet:
            return self.open_with_chardet(filename)
        else:
            return self.open_with_internal(filename)

    def open_with_chardet(self, filename):
        # Guess the encoding with chardet, then re-open the file with it.
        self.encdetector.reset()
        with codecs.open(filename, 'rb') as f:
            for line in f:
                self.encdetector.feed(line)
                if self.encdetector.done:
                    break
        self.encdetector.close()
        encoding = self.encdetector.result['encoding']
        try:
            f = codecs.open(filename, 'r', encoding=encoding)
        except UnicodeDecodeError:
            print("ERROR: Could not detect encoding: %s" % filename,
                  file=sys.stderr)
            raise
        except LookupError:
            print("ERROR: Don't know how to handle encoding %s: %s"
                  % (encoding, filename,), file=sys.stderr)
            raise
        else:
            lines = f.readlines()
            f.close()
        return lines, encoding

    def open_with_internal(self, filename):
        # Try each candidate encoding in turn until one opens cleanly.
        curr = 0
        while True:
            try:
                f = codecs.open(filename, 'r', encoding=encodings[curr])
            except UnicodeDecodeError:
                if not self.quiet_level & QuietLevels.ENCODING:
                    print("WARNING: Decoding file using encoding=%s failed: %s"
                          % (encodings[curr], filename,), file=sys.stderr)
                    try:
                        print("WARNING: Trying next encoding %s"
                              % encodings[curr + 1], file=sys.stderr)
                    except IndexError:
                        pass
                curr += 1
            else:
                lines = f.readlines()
                f.close()
                break
        if not lines:
            raise Exception('Unknown encoding')
        encoding = encodings[curr]
        return lines, encoding
import sys
import glob

sys.path.insert(0, '..')
from chardet.universaldetector import UniversalDetector

# Run chardet over every file matching the glob pattern given on the
# command line and report each file's detected encoding and confidence.
count = 0
u = UniversalDetector()
# BUG FIX: the original iterated `glob.glob(src.argv[1])` (undefined
# name `src`) and then looped over an undefined `g`; use sys.argv and a
# single loop over the glob results.
for f in glob.glob(sys.argv[1]):
    print(f.ljust(60), end=' ')
    u.reset()
    with open(f, 'rb') as stream:
        for line in stream:
            u.feed(line)
            if u.done:
                break
    u.close()
    result = u.result
    if result['encoding']:
        print(result['encoding'], 'with confidence', result['confidence'])
    else:
        print('******** no result')
    count += 1
print(count, 'tests')
from chardet.universaldetector import UniversalDetector

# Feed tmp.txt line by line into chardet and print the final guess.
detector = UniversalDetector()
with open("tmp.txt", mode='rb') as handle:
    for chunk in handle:
        detector.feed(chunk)
        if detector.done:
            break
detector.close()
print(detector.result)
def read_into_dataframe(file, filename=None, nrows=100, max_characters=50):
    """ Reads a file into a DataFrame.

    Infers the file encoding and whether a header column exists

    The file can be in any format (.csv, .txt, .zip, .gif,...).
    If it's not a .csv file, it will throw an exception
    (pandas.errors.EmptyDataError). One-column .csv gives exception
    there in try...except.

    Parameters
    ----------
    file : IO
        File buffer.
    filename : str
        Filename. Used to infer compression. Default to None.
    nrows : int
        Number of rows to peek. Default to 100.
    max_characters : int
        Max characters a column name can have to be distinguished from a
        real text value. Default to 50.

    Returns
    -------
    pd.DataFrame
        The dataframe content.

    Raises
    ------
    pandas.errors.EmptyDataError

    Notes
    -----
    If no filename is given, a hex uuid will be used as the file name.
    """
    # NOTE(review): max_characters is currently unused in this body.
    # Sample up to nrows raw lines to let chardet guess the encoding.
    detector = UniversalDetector()
    for line, text in enumerate(file):
        detector.feed(text)
        if detector.done or line > nrows:
            break
    detector.close()
    encoding = detector.result.get("encoding")
    if filename is None:
        filename = uuid4().hex
    compression = infer_compression(filename, "infer")
    file.seek(0, SEEK_SET)
    pdread = TextIOWrapper(file, encoding=encoding)
    try:
        # check if the file has header.
        sniffer = csv.Sniffer()
        pdread.seek(0, SEEK_SET)
        pdreadline = pdread.readline()
        pdreadline += pdread.readline()
        has_header = sniffer.has_header(pdreadline)
        sep = None
    except csv.Error:
        # sniffing failed: assume a plain comma-separated file with header
        sep = ","
        has_header = True
    # Prefix and header
    header = "infer" if has_header else None
    prefix = None if header else "col"
    pdread.seek(0, SEEK_SET)
    df = pd.read_csv(
        pdread,
        encoding=encoding,
        compression=compression,
        sep=sep,
        engine="python",
        header=header,
        nrows=nrows,
        prefix=prefix,
    )
    return df
def parse_dat_file(dat_path, spec_csv_path, out_folder):
    """Parse a .DAT file (CSPro fixed-width text datafile) into a series of
    CSV files containing the tabular data for each table contained in the
    .DAT and described in the associated .DCD file.

    Developed for use in particular with DAT files provided in the
    "hierarchical data" from DHS, but may be more generally applicable to
    CSPro format files.

    The .DCF file must be parsed first, using DCF_Parser, and the table
    specification file it generates is used by this function to parse the
    data file. Produces one CSV data file for every table (recordtype)
    defined in the .DCF and occurring in the .DAT.
    """
    # filecode = basename without its final extension
    filecode = os.path.extsep.join(os.path.basename(dat_path).split(os.path.extsep)[:-1])
    # See if we've already done this one
    test_fn = os.path.join(out_folder, f"{filecode}.REC01.csv")
    if os.path.exists(test_fn):
        print("Already parsed " + filecode)
        return
    print("Parsing "+dat_path)
    # Read the parsed file specification in CSV form, created from the
    # .dcf file.  The first row specifies where, on all subsequent rows,
    # the "record type" is found, i.e. the identifier naming which table
    # a data row belongs to; this is constant throughout the file.  Each
    # remaining item defines one field of one table: what position the
    # field's data occupies in the fixed-width row when the row's record
    # type matches.
    with open(spec_csv_path, 'r') as dict_file:
        dict_file_reader = csv.DictReader(dict_file)
        # the record type position info must be in the first line
        recordtype_info = next(dict_file_reader)
        rt_start = int(recordtype_info['Start']) - 1
        rt_end = int(recordtype_info['Len']) + rt_start
        all_vars_this_file = [row for row in dict_file_reader]
    for field_info in all_vars_this_file:
        field_info['Start'] = int(field_info['Start'])
        field_info['Len'] = int(field_info['Len'])
    # sort them by record type (i.e. destination table) then position in
    # the row (order of fields)
    sorted_fields = sorted(all_vars_this_file,
                           key=(itemgetter('RecordTypeValue', 'Start')))
    # build a dictionary of record type (i.e. tablename) : list of its
    # fields (i.e. field infos)
    rt_field_info = {}
    for field_info in sorted_fields:
        record_tag = field_info['RecordTypeValue']
        if record_tag not in rt_field_info:
            rt_field_info[record_tag] = []
        rt_field_info[record_tag].append(field_info)
    # now parse the data file
    result = {}
    n_cols_per_table = {}
    # chardet pass first, to learn the data file's encoding
    detector = UniversalDetector()
    with open(dat_path, 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    enc = detector.result['encoding']
    with open(dat_path, 'r', encoding=enc) as data:
        for i, line in enumerate(data):
            #if i == 0 and line.startswith(codecs.BOM_UTF8):
            #    print(f"File {dat_path} appears to contain BOM; ignoring it")
            #    line = line[len(codecs.BOM_UTF8):]
            record_type = line[rt_start:rt_end]
            if record_type not in rt_field_info:
                print("Specification for recordtype '{0!s}' not found in file for {1!s} at line {2!s}".format(
                    record_type, filecode, i))
                continue
            record_spec = rt_field_info[record_type]
            if record_type not in result:
                result[record_type] = []
            # Split the column-aligned text according to the row
            # specification.  Values shorter than their fixed width are
            # space-padded, so in general we strip whitespace — EXCEPT for
            # CASEID / HHID: HHID is usually CASEID minus its last 3
            # chars, and stripping would break that association and harm
            # referential integrity.  Other joins (e.g. BIDX len 2 to
            # MIDX len 1) do need stripping to match on a single digit.
            strip_or_not = lambda data, name: data if name in ('CASEID', 'HHID') else data.strip()
            rowParts = [strip_or_not(
                (line[i['Start'] - 1: i['Start'] + i['Len'] - 1]),
                i['Name']) for i in record_spec]
            # every row of a record type must yield the same column count
            if record_type not in n_cols_per_table:
                n_cols_per_table[record_type] = len(rowParts)
            else:
                assert len(rowParts) == n_cols_per_table[record_type]
            # add as a list to the list of rows for this record type
            result[record_type].append(rowParts)  # (",".join(rowParts))
    # write one CSV per record type that actually had rows
    for record_type, field_infos in rt_field_info.items():
        if not record_type in result:
            print(f"No rows were found for record type {record_type} in file {filecode} despite DCF specification")
            continue
        field_header = [i['Name'] for i in field_infos]
        field_records = set([i['RecordName'] for i in field_infos])
        # all fields of a record type must agree on the record name
        assert len(field_records) == 1
        rec_name = field_records.pop()
        if not os.path.exists(out_folder):
            os.makedirs(out_folder)
        out_fn = os.path.join(out_folder, f"{filecode}.{rec_name}.csv")
        with open(out_fn, 'w', newline='', encoding='utf-8') as out_csv:
            csv_writer = csv.writer(out_csv)
            csv_writer.writerow(field_header)
            csv_writer.writerows(result[record_type])
class autoSettings(object):
    """Heuristically sniff the layout of a tabular text file.

    Detects row separator, column separator, decimal separator, encoding
    (via chardet) and the number of garbage/header/data lines, then loads
    the file with pandas.  State is accumulated on the instance between
    the private helpers, so call order matters.
    """

    def __init__(self):
        # Candidate row separators (bytes, because the row separator is
        # searched for in the raw byte string).
        self.row_separators = [b'\r\n', b'\n\r', b'\r', b'\n']
        # Candidate column separators.
        self.column_separators = pd.Series([' ', ';', ',', '\t', '.', '\s+'])
        # chardet detector used for encoding detection.
        self.detector = UniversalDetector()
        # Detected column separator.
        self.column_sep = ''
        # Detected row separator.
        self.row_sep = ''
        # Detected decimal separator.
        self.decimal_sep = ''
        # Detected file encoding.
        self.code_standart = ''
        # Number of garbage lines after the header.
        self.rubbish_lines_afterHead = 0
        # Number of header lines.
        self.head_lines = 0
        # Number of garbage lines before the header.
        self.rubbish_lines_toHead = 0
        # Number of meaningful data lines.
        self.meaning_data_lines = 0

    # Find the first row separator present in the byte string.
    def __separator(self, byteString):
        for sep in self.row_separators:
            if (sep in byteString) == True:
                return sep
        return None

    # Check whether any separators remain after splitting on `separator`.
    def __separatorChecking(self, separator, byteString):
        splittedS = byteString.split(sep=separator)
        for s in splittedS:
            # NOTE(review): passes the whole list `splittedS` instead of the
            # current item `s` — looks like a bug; confirm intended behavior.
            if self.__separator(splittedS) != None:
                return False
        return True

    # Determine the row separator.
    def __searchRowSeparator(self, string):
        Sep = self.__separator(string)
        if self.__separatorChecking(Sep, string) == True:
            return Sep
        else:
            if b'\r\n' in string and b'\n\r' in string:
                Sep = b'\n\r'
                if self.__separatorChecking(Sep, string) == True:
                    return Sep
            # Mixed newline styles: pick the most frequent candidate.
            _, Sep = max(
                list(
                    zip(map(string.count, self.row_separators),
                        self.row_separators)))
            return Sep

    # Split the raw bytes into rows, normalising every separator to b'\n'.
    def __splitToRows(self, string):
        for sep in self.row_separators:
            string = string.replace(sep, b'\n')
        return string.split(sep=b'\n')

    # Detect the file encoding with chardet.
    def __codeStandart(self, byte_rows):
        for line in byte_rows:
            self.detector.feed(line)
            if self.detector.done == True:
                break
        self.detector.close()
        code = self.detector.result['encoding']
        return code

    # Return True when the given line consists of numeric data cells.
    def __isStringOfNumbers(self, string, column_separator):
        meaning_data_line = [
            ch for ch in re.split(column_separator, string) if ch != ''
        ]
        for number in meaning_data_line:
            try:
                float(number)
            except ValueError:
                # Retry after stripping punctuation that often decorates
                # numbers (thousands separators, times, signs).
                try:
                    float(
                        number.replace(',', '').replace(':', '').replace(
                            ' ', '').replace('.', '').replace('-', ''))
                except ValueError:
                    return False
        return True

    # Split a line into columns using the detected separator.
    def __splitToColumns(self, string):
        # Drop empty cells produced by consecutive separators.
        columns = [ch for ch in re.split(self.column_sep, string) if ch != '']
        # NOTE(review): side effect — upgrades the detected separator to the
        # regex '\s+' when repeated whitespace is seen.
        if string != '' and re.split(
                self.column_sep,
                string).count('') != 0 and self.column_sep in [' ', '\t']:
            self.column_sep = '\s+'
        return columns

    # Split a line into columns with an explicitly given separator.
    def __splitToColumns_specSep(self, string, column_separator):
        # Drop empty cells produced by consecutive separators.
        columns = [ch for ch in re.split(column_separator, string) if ch != '']
        return columns

    # Count trailing garbage lines; works on the REVERSED list of rows.
    def __rubbish_afterData(self, rows_of_data_reverse):
        rubbishRows_afterMeaningData = -1
        number_of_columns = -1
        for line in rows_of_data_reverse:
            # Number of columns in the line.
            count = len(self.__splitToColumns(line))
            # Two consecutive numeric lines with the same column count mark
            # the end of the garbage region.
            if count != 1 and count == number_of_columns and self.__isStringOfNumbers(
                    line, self.column_sep) == True:
                return rubbishRows_afterMeaningData, number_of_columns
            else:
                rubbishRows_afterMeaningData = rubbishRows_afterMeaningData + 1
                number_of_columns = count
        # Number of garbage lines after the meaningful data.
        return rubbishRows_afterMeaningData, number_of_columns

    # Return True when the line contains letters or digits.
    def __haveStringLettersOrDigits(self, string):
        for cell in self.__splitToColumns(string):
            # NOTE(review): tests `string`, not `cell` — every iteration is
            # identical; confirm whether per-cell checking was intended.
            if re.search(r'[^\W_]', string) is None:
                return False
            else:
                continue
        return True

    # Count garbage lines between header and data; works on the REVERSED
    # list of rows.  Returns (count, remaining reversed rows).
    def __rubbish_afterHead(self, rows_of_data_reverse, number_of_columns):
        rubbishRows_afterHead = 0
        # Skip past the meaningful data lines first.
        for i, line in enumerate(rows_of_data_reverse):
            # Number of columns in the line.
            count = len(self.__splitToColumns(line))
            if count == number_of_columns and self.__isStringOfNumbers(
                    line, self.column_sep) == True:
                continue
            else:
                rows_of_data_reverse = rows_of_data_reverse[i:]
                break
        else:
            rows_of_data_reverse = []
        for line in rows_of_data_reverse:
            # Number of columns in the line.
            count = len(self.__splitToColumns(line))
            if count == number_of_columns and self.__haveStringLettersOrDigits(
                    line) == True:
                return rubbishRows_afterHead, rows_of_data_reverse
            else:
                rubbishRows_afterHead = rubbishRows_afterHead + 1
        # Number of garbage lines after the header.
        return rubbishRows_afterHead, rows_of_data_reverse

    # Count header lines; works on the REVERSED list of rows.
    def __headRows(self, rows_of_data_reverse, number_of_columns):
        headRows = 0
        for line in rows_of_data_reverse:
            count = len(self.__splitToColumns(line))
            if count == number_of_columns and self.__haveStringLettersOrDigits(
                    line) == True:
                headRows = headRows + 1
            else:
                return headRows
        return headRows

    # Determine the decimal separator from a row of numeric cells.
    def __decimalSeparator(self, numbers):
        for number in numbers:
            try:
                float(number)
                continue
            except ValueError:
                try:
                    float(number.replace(',', '.'))
                except ValueError:
                    continue
                else:
                    # Parsed only after swapping ',' for '.' — comma decimal.
                    return ','
        return '.'

    # Determine the column separator, searching from the last rows.
    def __searchColumnSeparator(self, rows_of_data):
        for i, line in enumerate(rows_of_data):
            # Skip garbage: any character outside the numeric/separator set
            # means this line is not data yet.
            if re.search(r'[^\d\t- :;.,e]', line) is None and line != '':
                rows_of_data = rows_of_data[i:]
                break
            else:
                continue
        columns_sep = ' '
        columns_count = -1
        # Work on a copy; candidates are eliminated as lines disprove them.
        column_separators = self.column_separators.copy()
        for line in rows_of_data:
            if columns_count == len(
                    self.__splitToColumns_specSep(line, columns_sep)):
                break
            else:
                l = zip(
                    list(column_separators.keys()),
                    list(
                        map(
                            lambda x: len(
                                self.__splitToColumns_specSep(line, x)),
                            column_separators)))
                for i, count in l:
                    if count <= 1 or self.__isStringOfNumbers(
                            line, column_separators[i]) == False:
                        del column_separators[i]
                    else:
                        if (columns_sep == ' '
                                or columns_sep == '\t') and re.search(
                                    r'(\s\s)', line) is not None:
                            # Repeated whitespace: fall back to regex split.
                            columns_sep = r'\s+'
                            columns_count = len(
                                self.__splitToColumns_specSep(
                                    line, columns_sep))
                        else:
                            columns_sep = self.column_separators[i]
                            columns_count = count
                        break
        return columns_sep

    def get_auto_settings(self, filename):
        """Sniff *filename* and return a dict of detected parse settings."""
        # Open the file in binary mode.
        with open(filename, 'rb') as dataBytes:
            s = dataBytes.read()
        # Sample the first and last 10000 bytes only.
        dataRows_begin = s[:10000]
        dataRows_end = s[-10000:]
        # Detect the row separator.
        self.row_sep = self.__searchRowSeparator(dataRows_begin)
        # Split into rows.
        dataRows_begin = self.__splitToRows(dataRows_begin)
        dataRows_end = self.__splitToRows(dataRows_end)
        # Detect the file encoding.
        self.code_standart = self.__codeStandart(dataRows_begin + dataRows_end)
        # Decode the sampled rows.
        dataRows_begin = [
            x.decode(self.code_standart) for x in dataRows_begin
        ]
        dataRows_end = [x.decode(self.code_standart) for x in dataRows_end]
        # Strip surrounding whitespace and drop the last/first line of each
        # sample respectively, since reading a fixed byte count may have cut
        # them in half.
        dataRows_begin = (list(map(lambda x: x.strip(), dataRows_begin)))[:-1]
        dataRows_end = (list(map(lambda x: x.strip(), dataRows_end)))[1:]
        # Detect the column separator — searched from the END of the file.
        dataRows_end_reversed = dataRows_end.copy()
        dataRows_end_reversed.reverse()
        self.column_sep = self.__searchColumnSeparator(
            dataRows_end_reversed)
        # Garbage lines after the data — also from the end of the file.
        rubbish_lines_afterData, number_of_columns = self.__rubbish_afterData(
            dataRows_end_reversed)
        # Garbage lines after the header — on the reversed beginning sample.
        dataRows_begin_reversed = dataRows_begin.copy()
        dataRows_begin_reversed.reverse()
        self.rubbish_lines_afterHead, dataRows_begin_reversed = self.__rubbish_afterHead(
            dataRows_begin_reversed, number_of_columns)
        # Header line count — on the reversed beginning sample.
        self.head_lines = self.__headRows(
            dataRows_begin_reversed[self.rubbish_lines_afterHead:],
            number_of_columns)
        # Garbage lines before the header — on the reversed beginning sample.
        self.rubbish_lines_toHead = len(
            dataRows_begin_reversed[self.head_lines +
                                    self.rubbish_lines_afterHead:])
        # Meaningful data lines = total minus all other regions.
        self.meaning_data_lines = len(
            self.__splitToRows(s)
        ) - self.rubbish_lines_toHead - self.rubbish_lines_afterHead - rubbish_lines_afterData - self.head_lines
        # Decimal separator — taken from the first meaningful numeric row.
        decimal_sep = self.__decimalSeparator(
            self.__splitToColumns(
                dataRows_end_reversed[rubbish_lines_afterData]))
        return dict(
            column_separator=self.column_sep,
            row_separator=self.row_sep.decode(self.code_standart),
            decimal_separator=decimal_sep,
            code_standart=self.code_standart,
            number_of_rows_with_rubbish_toHead=self.rubbish_lines_toHead,
            number_of_head_lines=self.head_lines,
            number_of_rows_with_rubbish_afterHead=self.rubbish_lines_afterHead,
            number_of_rows_with_meaningful_data=self.meaning_data_lines)

    # Returns True when the settings parse cleanly, False for a broken file.
    def check_settings(self, filename, settings):
        """Validate *settings* by test-loading *filename* with pandas."""
        column_sep = settings['column_separator']
        row_sep = settings['row_separator']
        decimal_sep = settings['decimal_separator']
        code_std = settings['code_standart']
        rubbish_toHead = settings['number_of_rows_with_rubbish_toHead']
        rubbish_afterHead = settings['number_of_rows_with_rubbish_afterHead']
        head = settings['number_of_head_lines']
        meaning_data = settings['number_of_rows_with_meaningful_data']
        # The regex separator requires the slower python engine.
        if column_sep == '\s+':
            engine = 'python'
        else:
            engine = 'c'
        try:
            header = pd.read_csv(
                filename,
                sep=column_sep,
                engine=engine,
                decimal=decimal_sep,
                #lineterminator=row_sep,
                warn_bad_lines=True,
                header=None,
                skiprows=rubbish_toHead,
                nrows=head,
                encoding=code_std)
            data = pd.read_csv(
                filename,
                sep=column_sep,
                header=list(range(head)),
                engine=engine,
                decimal=decimal_sep,
                #lineterminator=row_sep,
                warn_bad_lines=True,
                skip_blank_lines=False,
                skiprows=list(range(rubbish_toHead)) + list(
                    range(rubbish_toHead + head,
                          rubbish_toHead + head + rubbish_afterHead)),
                #list(range(rubbish_toHead+head+rubbish_afterHead)),
                nrows=meaning_data + head,
                encoding=code_std)
        except Exception:
            print('Exception')
            return False
        if (None not in data.values or head == 0) and len(data.columns) > 1:
            return True
        else:
            return False

    def get_data(self, filename):
        """Auto-detect settings and load *filename*.

        Returns (header_values, data_values) as numpy arrays, or None when
        the detected settings fail validation.
        """
        settings = self.get_auto_settings(filename)
        if self.check_settings(filename, settings) == False:
            return None
        else:
            column_sep = settings['column_separator']
            row_sep = settings['row_separator']
            decimal_sep = settings['decimal_separator']
            code_std = settings['code_standart']
            rubbish_toHead = settings['number_of_rows_with_rubbish_toHead']
            rubbish_afterHead = settings[
                'number_of_rows_with_rubbish_afterHead']
            head = settings['number_of_head_lines']
            meaning_data = settings['number_of_rows_with_meaningful_data']
            # The regex separator requires the slower python engine.
            if column_sep == '\s+':
                engine = 'python'
            else:
                engine = 'c'
            header = pd.read_csv(
                filename,
                sep=column_sep,
                engine=engine,
                decimal=decimal_sep,
                # lineterminator=row_sep,
                warn_bad_lines=True,
                header=None,
                skiprows=rubbish_toHead,
                nrows=head,
                encoding=code_std)
            data = pd.read_csv(
                filename,
                sep=column_sep,
                header=None,
                engine=engine,
                decimal=decimal_sep,
                # lineterminator=row_sep,
                warn_bad_lines=True,
                skip_blank_lines=False,
                skiprows=rubbish_toHead + head + rubbish_afterHead,
                nrows=meaning_data,
                encoding=code_std)
            return header.values, data.values
def determineEncoding(self, chardet=True):
    """Determine the character encoding of ``self.rawStream``.

    Sources are consulted in strict priority order — BOM, explicit
    override, transport layer, <meta> element, same-origin parent,
    "likely" hint, chardet guess, configured default, windows-1252 —
    and the first non-None hit wins.

    Returns a ``(encoding, certainty)`` tuple, where certainty is
    "certain" or "tentative".
    """
    # BOMs take precedence over everything
    # This will also read past the BOM if present
    charEncoding = self.detectBOM(), "certain"
    if charEncoding[0] is not None:
        return charEncoding

    # If we've been overriden, we've been overriden
    charEncoding = lookupEncoding(self.override_encoding), "certain"
    if charEncoding[0] is not None:
        return charEncoding

    # Now check the transport layer
    charEncoding = lookupEncoding(self.transport_encoding), "certain"
    if charEncoding[0] is not None:
        return charEncoding

    # Look for meta elements with encoding information
    charEncoding = self.detectEncodingMeta(), "tentative"
    if charEncoding[0] is not None:
        return charEncoding

    # Parent document encoding — utf-16 variants are deliberately never
    # inherited from the parent document.
    charEncoding = lookupEncoding(
        self.same_origin_parent_encoding), "tentative"
    if charEncoding[0] is not None and not charEncoding[0].name.startswith(
            "utf-16"):
        return charEncoding

    # "likely" encoding
    charEncoding = lookupEncoding(self.likely_encoding), "tentative"
    if charEncoding[0] is not None:
        return charEncoding

    # Guess with chardet, if available
    if chardet:
        try:
            from chardet.universaldetector import UniversalDetector
        except ImportError:
            pass
        else:
            buffers = []
            detector = UniversalDetector()
            # Feed numBytesChardet-sized chunks until chardet is confident
            # or the stream is exhausted.
            while not detector.done:
                buffer = self.rawStream.read(self.numBytesChardet)
                assert isinstance(buffer, bytes)
                if not buffer:
                    break
                buffers.append(buffer)
                detector.feed(buffer)
            detector.close()
            encoding = lookupEncoding(detector.result['encoding'])
            # Rewind so the stream can be re-read with the chosen encoding.
            self.rawStream.seek(0)
            if encoding is not None:
                return encoding, "tentative"

    # Try the default encoding
    charEncoding = lookupEncoding(self.default_encoding), "tentative"
    if charEncoding[0] is not None:
        return charEncoding

    # Fallback to html5lib's default if even that hasn't worked
    return lookupEncoding("windows-1252"), "tentative"
def getEncodingByContent(content):
    """Return the character encoding chardet detects for *content*."""
    probe = UniversalDetector()
    probe.feed(content)
    probe.close()
    return probe.result["encoding"]
def build_schema(infile, outfile, delimiter=None, quotechar='\"', encoding=None, dataset_name=None, base="https://iisg.amsterdam/"):
    """
    Build a CSVW schema based on the ``infile`` CSV file, and write the
    resulting JSON CSVW schema to ``outfile``.

    Takes various optional parameters for instructing the CSV reader, but
    is also quite good at guessing the right values: when ``encoding`` is
    None it is sniffed with chardet, and when ``delimiter`` is None it is
    sniffed from the header line with ``csv.Sniffer``.
    """
    url = os.path.basename(infile)
    # Get the current date and time (UTC)
    today = datetime.datetime.utcnow().strftime("%Y-%m-%d")

    if dataset_name is None:
        dataset_name = url

    if encoding is None:
        # Feed the raw file line-by-line until chardet is confident.
        detector = UniversalDetector()
        with open(infile, 'rb') as f:
            for line in f.readlines():
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        encoding = detector.result['encoding']
        logger.info("Detected encoding: {} ({} confidence)".format(
            detector.result['encoding'], detector.result['confidence']))

    if delimiter is None:
        try:  # Python 3
            with open(infile, 'r', errors='ignore') as csvfile:
                # dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=";,$\t")
                dialect = csv.Sniffer().sniff(csvfile.readline())  # read only the header instead of the entire file to determine delimiter
                csvfile.seek(0)
        except TypeError:  # Python 2
            with open(infile, 'r') as csvfile:
                # dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=";,$\t")
                dialect = csv.Sniffer().sniff(csvfile.readline())  # read only the header instead of the entire file to determine delimiter
                csvfile.seek(0)
        logger.info("Detected dialect: {} (delimiter: '{}')".format(
            dialect, dialect.delimiter))
        delimiter = dialect.delimiter

    logger.info("Delimiter is: {}".format(delimiter))

    # Normalise the base IRI: no trailing slash.
    if base.endswith('/'):
        base = base[:-1]

    # Top-level CSVW metadata skeleton; columns are filled in below.
    metadata = {
        u"@id": iribaker.to_iri(u"{}/{}".format(base, url)),
        u"@context": [u"https://raw.githubusercontent.com/CLARIAH/COW/master/csvw.json",
                      {u"@language": u"en",
                       u"@base": u"{}/".format(base)},
                      get_namespaces(base)],
        u"url": url,
        u"dialect": {u"delimiter": delimiter,
                     u"encoding": encoding,
                     u"quoteChar": quotechar
                     },
        u"dc:title": dataset_name,
        u"dcat:keyword": [],
        u"dc:publisher": {
            u"schema:name": u"CLARIAH Structured Data Hub - Datalegend",
            u"schema:url": {u"@id": u"http://datalegend.net"}
        },
        u"dc:license": {u"@id": u"http://opendefinition.org/licenses/cc-by/"},
        u"dc:modified": {u"@value": today, u"@type": u"xsd:date"},
        u"tableSchema": {
            u"columns": [],
            u"primaryKey": None,
            u"aboutUrl": u"{_row}"
        }
    }

    # NOTE(review): binary mode with csv.reader is a Python 2 pattern; on
    # Python 3 csv.reader expects text — confirm which interpreter is used.
    with io.open(infile, 'rb') as infile_file:
        r = csv.reader(infile_file, delimiter=delimiter, quotechar=quotechar)

        try:  # Python 2
            header = r.next()
        except AttributeError:  # Python 3
            header = next(r)

        logger.info(u"Found headers: {}".format(header))

        if u'' in header:
            logger.warning("WARNING: You have one or more empty column headers in your CSV file. Conversion might produce incorrect results because of conflated URIs or worse")
        if len(set(header)) < len(header):
            logger.warning("WARNING: You have two or more column headers that are syntactically the same. Conversion might produce incorrect results because of conflated URIs or worse")

        # First column is primary key
        metadata[u'tableSchema'][u'primaryKey'] = header[0]
        for head in header:
            col = {
                u"@id": iribaker.to_iri(u"{}/{}/column/{}".format(base, url, head)),
                u"name": head,
                u"titles": [head],
                u"dc:description": head,
                u"datatype": u"string"
            }
            metadata[u'tableSchema'][u'columns'].append(col)

    with open(outfile, 'w') as outfile_file:
        outfile_file.write(json.dumps(metadata, indent=True))

    logger.info("Done")
    return
def read_into_dataframe(file: IO,
                        filename: str = "",
                        nrows: int = 100,
                        max_characters: int = 50) -> pd.DataFrame:
    """Reads a file into a DataFrame.

    Infers the file encoding and whether a header column exists.

    Args:
        file (IO): file buffer.
        filename (str): filename. Used to infer compression.
        nrows (int, optional): number of rows to peek. Default: 100.
        max_characters (int, optional): max characters a column name can have
            to be distinguished from a real text value

    Returns:
        A pandas.DataFrame.
    """
    # Guess the encoding from at most `nrows` lines of the raw stream.
    detector = UniversalDetector()
    for line, text in enumerate(file):
        detector.feed(text)
        if detector.done or line > nrows:
            break
    detector.close()
    encoding = detector.result.get("encoding")

    compression = infer_compression(filename, "infer")

    file.seek(0, SEEK_SET)
    contents = file.read()

    # First pass: read a sample assuming a header row exists.
    # (Renamed from `file` — the original shadowed the function parameter.)
    with BytesIO(contents) as sample:
        df0 = pd.read_csv(
            sample,
            encoding=encoding,
            compression=compression,
            sep=None,
            engine="python",
            header="infer",
            nrows=nrows,
        )

    df0_cols = list(df0.columns)

    # Heuristic 1: header cells are all short strings (real text values
    # tend to be long).
    column_names_checker = all(isinstance(item, str) for item in df0_cols)
    if column_names_checker:
        column_names_checker = all(
            len(item) < max_characters for item in df0_cols)

    # Heuristic 2: no header cell parses as a number.  Only TypeError and
    # ValueError mean "not a number"; the original bare `except:` silently
    # swallowed every exception, hiding real bugs.
    conversion_checker = True
    for item in df0_cols:
        try:
            float(item)
            conversion_checker = False
            break
        except (TypeError, ValueError):
            pass

    # Use the inferred header only when both heuristics agree; otherwise
    # read headerless and auto-name the columns col0, col1, ...
    final_checker = column_names_checker and conversion_checker
    header = "infer" if final_checker else None
    prefix = None if header else "col"

    with BytesIO(contents) as data:
        df = pd.read_csv(
            data,
            encoding=encoding,
            compression=compression,
            sep=None,
            engine="python",
            header=header,
            prefix=prefix,
        )
    return df
def show_encode(text):
    """Return the encoding chardet detects for *text*."""
    sniffer = UniversalDetector()
    sniffer.feed(text)
    sniffer.close()
    return sniffer.result['encoding']
# Write the sample lines out to test.txt.
# (Fixed: the original called file.close() inside the `with` block — the
# context manager already closes the handle — and named the variable
# `file`, shadowing a builtin-style name.)
with open('test.txt', 'w') as out_file:
    for line in LINES_LST:
        out_file.write(f'{line}\n')

# Determine the file's encoding.
#
# For a large file, instead of reading it whole into a string and calling
# detect(), use the UniversalDetector class: read the file line by line
# and pass each line to feed().  When detection succeeds, the `done`
# attribute becomes True — use that to break out of the loop.  After the
# scan, call close(); the detection result is available via `result`.
DETECTOR = UniversalDetector()
with open('test.txt', 'rb') as test_file:
    for chunk in test_file:
        DETECTOR.feed(chunk)
        if DETECTOR.done:
            break
DETECTOR.close()
print(DETECTOR.result['encoding'])

# Re-open the file with the detected encoding.
with open('test.txt', 'r', encoding=DETECTOR.result['encoding']) as src_file:
    CONTENT = src_file.read()
print(CONTENT)
def process(self, identifier, infile):
    """Parse the LaTeX file *infile* and return a list of citation dicts.

    Each dict carries the raw bibitem text plus whatever fields
    (identifier, label, key, url, year, authors, title, publisher)
    the extract_* helpers manage to pull out.  Python 2 code.
    """
    # Need to use some guesswork to detect the file encoding of the latex files.
    u = UniversalDetector()
    # NOTE(review): this file handle is never closed — consider `with`.
    for line in open(infile, 'rb'):
        u.feed(line)
    u.close()
    result = u.result
    if result['encoding']:
        encoding = result['encoding']
        print "Detected encoding for %s: %s" % (identifier, encoding)
    else:
        encoding = self.DEFAULT_ENCODING
        print "Warning: using default encoding (%s) - as a file encoding could not be detected for %s" % (
            encoding, infile)
    file_handle = codecs.open(infile, encoding=encoding, mode='r')
    try:
        #Always re-encode files as UTF-8 for processing, as this is what ElasticSearch is expecting
        raw_data = file_handle.read().encode("UTF-8")
    except UnicodeDecodeError as e:
        #Otherwise, give up trying to read this file
        print "Error: could not re-encode %s to UTF-8: %s %s" % (
            identifier, e, infile)
        raw_data = ""
    file_handle.close()
    citations = []
    #remove latex comments, to avoid confusion in processing
    data = re.sub(r"^\s*\%.*$", "", raw_data, 0, re.MULTILINE)
    #remove whitespace and newlines
    data = re.sub(r"\s+", " ", data, 0, re.MULTILINE)
    #find the bibliography section
    match = re.search(
        r'\\begin{thebibliography(?P<bibliography>.*)\\end{thebibliography',
        data, re.DOTALL)
    if match:
        data = match.group('bibliography')
    #get a list of bibitems. Start at [1:] to ignore the stuff between
    #\begin{thebibliography} and \bibitem
    counter = 1
    for bibitem in re.split(r"\\bibitem", data)[1:]:
        #trim the string
        bibitem_trimmed = bibitem.strip()
        # cite_order preserves the citation's position in the bibliography.
        citation = {"_latex": bibitem_trimmed, "cite_order": counter}
        # Each extract_* helper consumes its part of the string and returns
        # the remainder, so the order of these calls matters.
        bibstring_to_process = bibitem_trimmed
        (arxiv_id, bibstring_to_process
         ) = self.extract_arxiv_id(bibstring_to_process)
        if arxiv_id is not None:
            citation["identifier"] = [{
                "type": "arXiv",
                "id": arxiv_id,
                "canonical": "arXiv:" + arxiv_id
            }]
        (label, key, bibstring_to_process
         ) = self.extract_label_key(bibstring_to_process)
        if (label is not None):
            citation["label"] = label
        if (key is not None):
            citation["key"] = key
        (url, bibstring_to_process) = self.extract_url(bibstring_to_process)
        if (url is not None):
            citation["url"] = url
        (year, bibstring_to_process
         ) = self.extract_year(bibstring_to_process)
        if (year is not None):
            citation["year"] = year
        (authors, bibstring_to_process
         ) = self.extract_authors(bibstring_to_process)
        if (authors is not None):
            citation["authors"] = authors
        (title, bibstring_to_process
         ) = self.extract_title(bibstring_to_process)
        if (title is not None):
            citation["title"] = title
        (publisher, bibstring_to_process
         ) = self.extract_publisher(bibstring_to_process)
        if (publisher is not None):
            citation["publisher"] = publisher
        citations.append(citation)
        counter += 1
    return citations
def parse_csv(myfile, newsletter, ignore_errors=False):
    """Parse a subscriber CSV file and return {email: subscription}.

    Sniffs the file's encoding (chardet) and CSV dialect, locates the
    name and e-mail columns by header text, and builds subscriptions for
    *newsletter*.  Raises forms.ValidationError on malformed input unless
    *ignore_errors* is True, in which case bad rows are logged and
    skipped.  Python 2 code (uses reader.next()).
    """
    from newsletter.addressimport.csv_util import UnicodeReader
    import codecs
    import csv

    # Detect encoding
    from chardet.universaldetector import UniversalDetector
    detector = UniversalDetector()
    for line in myfile.readlines():
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    charset = detector.result['encoding']

    # Reset the file index
    myfile.seek(0)

    # Attempt to detect the dialect
    encodedfile = codecs.EncodedFile(myfile, charset)
    dialect = csv.Sniffer().sniff(encodedfile.read(1024))

    # Reset the file index
    myfile.seek(0)

    logger.info('Detected encoding %s and dialect %s for CSV file',
                charset, dialect)

    myreader = UnicodeReader(myfile, dialect=dialect, encoding=charset)

    firstrow = myreader.next()

    # Find name column: any header containing "name" matches, but a header
    # containing "display" wins immediately (stops the search).
    colnum = 0
    namecol = None
    for column in firstrow:
        if "name" in column.lower() or ugettext("name") in column.lower():
            namecol = colnum

            if "display" in column.lower() or \
                    ugettext("display") in column.lower():
                break

        colnum += 1

    if namecol is None:
        raise forms.ValidationError(
            _("Name column not found. The name of this column should be "
              "either 'name' or '%s'.") % ugettext("name"))

    logger.debug("Name column found: '%s'", firstrow[namecol])

    # Find email column: first header containing "email"/"e-mail" wins.
    colnum = 0
    mailcol = None
    for column in firstrow:
        if 'email' in column.lower() or \
                'e-mail' in column.lower() or \
                ugettext("e-mail") in column.lower():

            mailcol = colnum
            break

        colnum += 1

    if mailcol is None:
        raise forms.ValidationError(
            _("E-mail column not found. The name of this column should be "
              "either 'email', 'e-mail' or '%(email)s'.") %
            {'email': ugettext("e-mail")})

    logger.debug("E-mail column found: '%s'", firstrow[mailcol])

    #assert namecol != mailcol, \
    #    'Name and e-mail column should not be the same.'
    if namecol == mailcol:
        raise forms.ValidationError(
            _("Could not properly determine the proper columns in the "
              "CSV-file. There should be a field called 'name' or "
              "'%(name)s' and one called 'e-mail' or '%(e-mail)s'.") % {
                  "name": _("name"),
                  "e-mail": _("e-mail")
              })

    logger.debug('Extracting data.')

    addresses = {}
    for row in myreader:
        # Row must be long enough to contain both columns.
        if not max(namecol, mailcol) < len(row):
            logger.warn("Column count does not match for row number %d",
                        myreader.line_num, extra=dict(data={'row': row}))

            if ignore_errors:
                # Skip this record
                continue
            else:
                raise forms.ValidationError(
                    _("Row with content '%(row)s' does not contain a name and "
                      "email field.") % {'row': row})

        name = check_name(row[namecol], ignore_errors)
        email = check_email(row[mailcol], ignore_errors)

        logger.debug("Going to add %s <%s>", name, email)

        try:
            validate_email(email)
            addr = make_subscription(newsletter, email, name)
        except ValidationError:
            # NOTE(review): when ignore_errors is True and this is the first
            # row, `addr` below may be unbound (or stale from a previous
            # iteration) — confirm intended behavior.
            if ignore_errors:
                logger.warn(
                    "Entry '%s' at line %d does not contain a valid "
                    "e-mail address.",
                    name, myreader.line_num, extra=dict(data={'row': row}))
            else:
                raise forms.ValidationError(
                    _("Entry '%s' does not contain a valid "
                      "e-mail address.") % name)

        if addr:
            if email in addresses:
                logger.warn(
                    "Entry '%s' at line %d contains a "
                    "duplicate entry for '%s'",
                    name, myreader.line_num, email,
                    extra=dict(data={'row': row}))

                if not ignore_errors:
                    raise forms.ValidationError(
                        _("The address file contains duplicate entries "
                          "for '%s'.") % email)

            addresses.update({email: addr})
        else:
            # make_subscription returned falsy: already subscribed.
            logger.warn(
                "Entry '%s' at line %d is already subscribed to "
                "with email '%s'",
                name, myreader.line_num, email,
                extra=dict(data={'row': row}))

            if not ignore_errors:
                raise forms.ValidationError(
                    _("Some entries are already subscribed to."))

    return addresses
def GetFileBody(self, get):
    """Read the file at ``get.path`` for online editing.

    Returns a dict with 'status', 'encoding' and 'data' (the file body
    decoded to text), or an error message dict when the file is missing,
    too large (> 2 MiB), or cannot be decoded.  Detected encodings are
    normalised to one of GBK / utf-8 / BIG5 before decoding.
    """
    if sys.version_info[0] == 2:
        get.path = get.path.encode('utf-8')
    if not os.path.exists(get.path):
        # Missing files are only auto-created for rewrite configs.
        if get.path.find('rewrite') == -1:
            return public.returnMsg(False, 'FILE_NOT_EXISTS', (get.path, ))
        public.writeFile(get.path, '')
    # Refuse to edit files larger than 2 MiB online.
    if os.path.getsize(get.path) > 2097152:
        return public.returnMsg(False, 'CANT_EDIT_ONLINE_FILE')
    # NOTE(review): fp is never explicitly closed — consider `with`.
    fp = open(get.path, 'rb')
    data = {}
    data['status'] = True
    try:
        if fp:
            from chardet.universaldetector import UniversalDetector
            detector = UniversalDetector()
            srcBody = b""
            # Feed every line to chardet while accumulating the raw body.
            for line in fp.readlines():
                detector.feed(line)
                srcBody += line
            detector.close()
            char = detector.result
            data['encoding'] = char['encoding']
            # Normalise chardet's guess: several detections are treated
            # as GBK, ascii/latin-1 as utf-8, Big5 as BIG5, and anything
            # else falls back to utf-8.
            if char['encoding'] == 'GB2312' or not char[
                    'encoding'] or char['encoding'] == 'TIS-620' or char[
                        'encoding'] == 'ISO-8859-9':
                data['encoding'] = 'GBK'
            if char['encoding'] == 'ascii' or char[
                    'encoding'] == 'ISO-8859-1':
                data['encoding'] = 'utf-8'
            if char['encoding'] == 'Big5':
                data['encoding'] = 'BIG5'
            # NOTE(review): this compares the RAW chardet result against the
            # normalised names, so it may override the branches above —
            # confirm intended.
            if not char['encoding'] in ['GBK', 'utf-8', 'BIG5']:
                data['encoding'] = 'utf-8'
            try:
                if sys.version_info[0] == 2:
                    data['data'] = srcBody.decode(data['encoding']).encode(
                        'utf-8', errors='ignore')
                else:
                    data['data'] = srcBody.decode(data['encoding'])
            except:
                # Decoding with the normalised encoding failed: retry with
                # chardet's original guess.
                data['encoding'] = char['encoding']
                if sys.version_info[0] == 2:
                    data['data'] = srcBody.decode(data['encoding']).encode(
                        'utf-8', errors='ignore')
                else:
                    data['data'] = srcBody.decode(data['encoding'])
        else:
            # NOTE(review): dead branch — open() never returns a falsy
            # object, and srcBody would be unbound here.
            if sys.version_info[0] == 2:
                data['data'] = srcBody.decode('utf-8').encode('utf-8')
            else:
                data['data'] = srcBody.decode('utf-8')
            data['encoding'] = u'utf-8'
        return data
    except Exception as ex:
        return public.returnMsg(
            False,
            'INCOMPATIBLE_FILECODE',
            (str(ex)),
        )
def __init__(self, open_file):
    """Parse a TransXChange XML document from *open_file*.

    The file's encoding is sniffed with chardet (falling back to the
    default parser when the input does not yield bytes), then the
    document is stream-parsed with ElementTree.iterparse, populating
    self.stops, self.operators, self.journeys, self.services, and
    self.transxchange_date.
    """
    try:
        # Sniff the encoding from the raw byte lines.
        detector = UniversalDetector()
        for line in open_file:
            detector.feed(line)
            if detector.done:
                break
        detector.close()
        encoding = detector.result['encoding']
        # ElementTree rejects 'UTF-8-SIG'; plain utf-8 handles the BOM.
        if encoding == 'UTF-8-SIG':
            encoding = 'utf-8'
        parser = ET.XMLParser(encoding=encoding)
    except TypeError:
        # Input was not byte lines — let ElementTree pick the parser.
        parser = None

    open_file.seek(0)
    iterator = ET.iterparse(open_file, parser=parser)

    self.services = {}
    # element = None
    serviced_organisations = None
    journey_pattern_sections = {}

    # Stream over end-events; element.clear() frees memory as we go.
    # The [33:] slice strips the TransXChange namespace prefix from tags.
    for _, element in iterator:
        tag = element.tag[33:]

        if tag == 'StopPoints':
            stops = (Stop(stop) for stop in element)
            self.stops = {stop.atco_code: stop for stop in stops}
            element.clear()
        elif tag == 'Routes':
            # routes = {
            #     route.get('id'): route.find('txc:Description', NS).text
            #     for route in element
            # }
            element.clear()
        elif tag == 'RouteSections':
            element.clear()
        elif tag == 'Operators':
            self.operators = element
        elif tag == 'JourneyPatternSections':
            # Keep only sections that actually have timing links.
            for section in element:
                section = JourneyPatternSection(section, self.stops)
                if section.timinglinks:
                    journey_pattern_sections[section.id] = section
            element.clear()
        elif tag == 'ServicedOrganisations':
            serviced_organisations = (ServicedOrganisation(child)
                                      for child in element)
            serviced_organisations = {
                organisation.code: organisation
                for organisation in serviced_organisations
            }
        elif tag == 'VehicleJourneys':
            try:
                self.journeys = self.__get_journeys(
                    element, serviced_organisations)
            except (AttributeError, KeyError) as e:
                logger.error(e, exc_info=True)
                return
            element.clear()
        elif tag == 'Service':
            service = Service(element, serviced_organisations,
                              journey_pattern_sections)
            self.services[service.service_code] = service
        elif tag == 'Garages':
            # print(ET.tostring(element).decode())
            element.clear()

    self.element = element

    # Latest of the creation/modification timestamps, date part only.
    self.transxchange_date = max(
        element.attrib['CreationDateTime'],
        element.attrib['ModificationDateTime'])[:10]
def analyse_HTTP(serveurHTTP, protocole_serveur, port_serveur, wordlist_bruteforce, extensions_a_bruteforcer, nb_processus):
    """Enumerate files and directories on an HTTP server.

    Seeds the scan with robots.txt Disallow entries, checks 404 handling
    and HEAD support, detects the wordlist's encoding with chardet, then
    fans the brute force out over ``nb_processus`` worker processes.
    Discovered .php/.aspx pages returning 200 are handed to
    ``analyse_formulaire_php``.
    """
    liste_dossiers_a_powned = ["/"]
    liste_dossiers_deja_testes = []
    protocole_serveur = protocole_serveur + "://"
    # Harvest seed paths from robots.txt Disallow lines.
    contenu_robots_txt = requests.get(protocole_serveur + serveurHTTP + "/robots.txt")
    if contenu_robots_txt.status_code == 200:
        print("Contenu du fichier robots.txt du serveur %s :" % serveurHTTP)
        for line in contenu_robots_txt.iter_lines():
            if "Disallow:" in str(line):
                ligne_robots_string = str(line)
                # Slice out the path between the first '/' and the trailing
                # quote of the bytes repr (lines are b'...' reprs here).
                fichier_ou_dossier_robot = ligne_robots_string[(
                    ligne_robots_string.find("/")):(
                    ligne_robots_string.rfind("'"))]
                print(fichier_ou_dossier_robot)
                # Treat extension-less entries (or anything containing '/')
                # as candidate directories.
                if "." not in fichier_ou_dossier_robot or "/" in fichier_ou_dossier_robot:
                    test_requete = requests.get(protocole_serveur + serveurHTTP
                                                + fichier_ou_dossier_robot + "/")
                    if test_requete.status_code != 404 and fichier_ou_dossier_robot[-1] == "/":
                        liste_dossiers_a_powned.append(fichier_ou_dossier_robot)
                    else:
                        liste_dossiers_a_powned.append(fichier_ou_dossier_robot + "/")
    else:
        print("Le serveur %s ne contient pas de fichier robots.txt" % serveurHTTP)
    # The scan relies on 404 responses to tell hits from misses.
    verification_gestion_fichier_non_present = requests.get(
        protocole_serveur + serveurHTTP + ":" + str(port_serveur)
        + "/TESTERREUR404_ElTito.php")
    if verification_gestion_fichier_non_present.status_code != 404:
        print(
            "Le serveur %s ne retourne pas d'erreur 404 si on essaye d'accéder à un fichier inexistant"
            % serveurHTTP)
        # NOTE(review): exits with status 0 on an error condition — callers
        # cannot distinguish this abort from success.
        exit(0)
    # Prefer HEAD when the server allows it (405 = Method Not Allowed).
    verification_methode_http = requests.head(protocole_serveur + serveurHTTP
                                              + ":" + str(port_serveur))
    if verification_methode_http.status_code != 405:
        methode_http_a_utiliser = "HEAD"
    else:
        methode_http_a_utiliser = "GET"
    # Detect the wordlist's encoding so each worker can reopen it correctly.
    detection_encodage = UniversalDetector()
    detection_encodage.reset()
    with open(wordlist_bruteforce, mode='rb') as e:
        for b in e:
            detection_encodage.feed(b)
            if detection_encodage.done:
                break
        detection_encodage.close()
    encodage_wordlist = detection_encodage.result["encoding"]
    with open(wordlist_bruteforce, encoding=encodage_wordlist) as f:
        nb_lignes_wordlist = sum(1 for _ in f)
    nb_lignes_a_lire_par_processus = nb_lignes_wordlist // nb_processus
    manager = multiprocessing.Manager()
    dictionnaire_fichier_decouverts = manager.dict()
    dictionnare_dossiers_decouverts = manager.dict()
    liste_dossier_directory_indexing = manager.list()
    # Breadth-first over the queue of directories still to brute force.
    while len(liste_dossiers_a_powned) > 0:
        jobs = []
        for i in range(nb_processus):
            tache = multiprocessing.Process(
                target=brute_force_http,
                args=(serveurHTTP, protocole_serveur, port_serveur,
                      wordlist_bruteforce, liste_dossiers_a_powned[0],
                      extensions_a_bruteforcer, i,
                      nb_lignes_a_lire_par_processus, encodage_wordlist,
                      methode_http_a_utiliser,
                      dictionnaire_fichier_decouverts,
                      dictionnare_dossiers_decouverts,
                      liste_dossier_directory_indexing))
            jobs.append(tache)
            tache.start()
        for proc in jobs:
            proc.join()
        liste_dossiers_deja_testes.append(liste_dossiers_a_powned[0])
        # Snapshot the keys explicitly: entries are deleted inside the loop.
        for dossier in list(dictionnare_dossiers_decouverts.keys()):
            if ".htpasswd" in dossier or ".htaccess" in dossier:
                # Reclassify: these are files, not directories.
                dictionnaire_fichier_decouverts[
                    dossier] = dictionnare_dossiers_decouverts.get(dossier)
                del dictionnare_dossiers_decouverts[dossier]
            elif dossier == "/server-status":
                dictionnaire_fichier_decouverts[
                    dossier] = dictionnare_dossiers_decouverts.get(dossier)
                del dictionnare_dossiers_decouverts[dossier]
            elif (dossier + "/") not in liste_dossiers_deja_testes and (
                    dossier + "/") not in liste_dossiers_a_powned:
                liste_dossiers_a_powned.append(dossier + "/")
        liste_dossiers_a_powned.pop(0)
    print(dictionnaire_fichier_decouverts)
    print(dictionnare_dossiers_decouverts)
    print(liste_dossier_directory_indexing)
    for page, message_retour in dictionnaire_fichier_decouverts.items():
        print(page)
        print(message_retour)
        # Fix: 'and' binds tighter than 'or', so the original analysed every
        # .php page regardless of its status code. Parenthesize so the 200
        # check applies to both extensions.
        if (".php" in page or ".aspx" in page) and message_retour == "200":
            analyse_formulaire_php(serveurHTTP, protocole_serveur,
                                   port_serveur, page)
def detect_file_encoding(training_file, file_encoding, max_passwords=10000):
    """Autodetect the character encoding of *training_file* with chardet.

    Feeds at most *max_passwords* lines to the detector, appends the
    detected encoding name to the *file_encoding* list (an out-parameter),
    and prints the detection confidence.

    Returns True on success (or when the user accepts the 'ascii' fallback
    after a missing chardet install), False on I/O or detection errors.
    """
    ##Try to import chardet
    #
    # If that package is not installed print out a warning and use is ok,
    # then use ascii as the default values
    #
    try:
        from chardet.universaldetector import UniversalDetector
        detector = UniversalDetector()
    except ImportError:
        print("FAILED: chardet not installed")
        print(
            "It is highly recommended that you install the 'chardet' Python package"
        )
        print(
            "or manually specify the file encoding of the training set via the command line"
        )
        print(
            "You can download chardet from https://pypi.python.org/pypi/chardet"
        )
        if get_confirmation(
                "Do you want to continue using the default encoding 'ascii'?"):
            file_encoding.append('ascii')
            return True
        else:
            # User wanted to exit instead
            print(
                "Understood. Please install chardet or specify an encoding " +
                "format on the command line")
            return False
    try:
        cur_count = 0
        # Stream the file line by line: the original called readlines(),
        # which loads the entire (potentially huge) training set into
        # memory just to feed the first max_passwords lines.
        with open(training_file, 'rb') as pw_file:
            for line in pw_file:
                detector.feed(line)
                if detector.done:
                    break
                cur_count = cur_count + 1
                if cur_count >= max_passwords:
                    break
        detector.close()
    except IOError as error:
        print("Error opening file " + training_file)
        print("Error is " + str(error))
        return False
    try:
        file_encoding.append(detector.result['encoding'])
        print("File Encoding Detected: " + str(detector.result['encoding']))
        print("Confidence for file encoding: " +
              str(detector.result['confidence']))
        print(
            "If you think another file encoding might have been used please ")
        print(
            "manually specify the file encoding and run the training program again"
        )
        print()
    except KeyError as error:
        print("Error encountered with file encoding autodetection")
        print("Error : " + str(error))
        return False
    return True
def set_source(self, name):
    """Select *name* as the CSV import source and prefill the import dialog.

    Autodetects the file encoding (via chardet, first 50 lines) and the CSV
    dialect (via csv.Sniffer), preselects matching widgets, then runs the
    import dialog. Returns True when the user confirms, False otherwise.

    NOTE(review): Python 2 / PyGTK code (``file()``, ``string.lower``).
    """
    # source _dependent_ initialization goes here
    if name is None or not os.path.isfile(name):
        return False
    IP.set_source(self, name)
    self.__source_name = name
    # auto-detect file-encoding (optional)
    try:
        from chardet.universaldetector import UniversalDetector
        detector = UniversalDetector()
        detector.reset()
        lines = 0
        # Feed at most 50 lines — enough for a confident guess on most files.
        for line in file(self.__source_name, 'rb'):
            detector.feed(line)
            lines += 1
            if detector.done or lines == 50:
                break
        detector.close()
        encoding = string.lower(detector.result['encoding'])
    except:
        # Any failure (chardet missing, unreadable file, result None)
        # falls back to UTF-8.
        log.exception('')
        encoding = 'utf_8'
    # Strip separator characters so e.g. 'utf-8' and 'utf_8' compare equal.
    encoding = self._encoding_cleanup.sub('', encoding)
    # Preselect the combo-box entry whose (cleaned) name starts with the
    # detected encoding; falls past the end if nothing matches.
    model = self.gtk.get_widget('e_encoding').get_model()
    itempos = 0
    for item in model:
        pos1 = string.find(
            self._encoding_cleanup.sub('', string.lower(str(item[0]))),
            encoding)
        if pos1 == 0:
            break
        itempos += 1
    self.gtk.get_widget('e_encoding').set_active(itempos)
    # auto-detect CSV import settings (optional)
    try:
        import csv
        sniffer = csv.Sniffer()
        # Sniff at most the first 64 KiB of the file.
        csvfilesize = os.path.getsize(self.__source_name)
        if csvfilesize > 65535:
            csvfilesize = 65535
        csvfile = file(self.__source_name, 'rb')
        try:
            # quote char, line terminator and field delimiter
            proposed_dialect = sniffer.sniff(csvfile.read(csvfilesize))
            self.gtk.get_widget('e_delimiter').set_text(
                proposed_dialect.delimiter)
            self.gtk.get_widget('e_quotechar').set_text(
                proposed_dialect.quotechar)
            if proposed_dialect.lineterminator == '\r\n':
                self.gtk.get_widget('e_lineterminator').set_active(1)
            # first row with column headers
            csvfile.seek(0)
            if sniffer.has_header(csvfile.read(csvfilesize)):
                self.gtk.get_widget('e_startrow').set_text('1')
            else:
                self.gtk.get_widget('e_startrow').set_text('0')
        finally:
            csvfile.close()
    except:
        # Sniffing is best-effort; keep whatever defaults the dialog has.
        log.exception('')
    # run dialog
    response = self.gtk.get_widget('d_import').run()
    if response == gtk.RESPONSE_OK:
        return True
    else:
        return False
##
# Resolve the directory this tool lives in so data files (the doc2vec
# model) can be located relative to it.
toolFileName = sys.argv[0]
if len(toolFileName) <= 0:
    toolDirName = os.path.dirname(os.getcwd())
elif os.path.isdir(toolFileName):
    toolDirName = toolFileName
else:
    toolDirName = os.path.dirname(toolFileName)
print(toolDirName, sys.argv[1])

# Detect the input document's encoding with chardet.
encode_detector = UniversalDetector()
encode_detector.reset()
# Close the file handle deterministically (the original leaked it via
# open(...).read()).
with open(sys.argv[1].replace('\\', '/'), 'rb') as doc_file:
    raw_doc = doc_file.read()
encode_detector.feed(raw_doc)
encode_detector.close()
# Fix: use the detector's result whenever it produced one. The original
# only trusted it when detector.done was already True and otherwise fell
# back to utf-8, even though close() finalizes a usable guess.
detected_encoding = encode_detector.result['encoding']
raw_doc = raw_doc.decode(detected_encoding or 'utf-8', errors='ignore')

doc3 = gensim.parsing.preprocess_string(raw_doc)
raw_doc = None  # release the raw text before loading the model

# .doc2vec gives better similarity: adjacent words are treated n-gram-like,
# so document similarity looks more natural.
model = gensim.models.doc2vec.Doc2Vec.load(
    os.path.join(toolDirName, 'data/doc2vec.model'))
new_doc_vec3 = model.infer_vector(doc3)
similarl_docs = sorted(model.docvecs.most_similar([new_doc_vec3], topn=topN),
                       key=lambda item: -item[1])
print('doc2vec most_similar', len(similarl_docs))
for docName, similarl in similarl_docs:
    print('{:3.5f}'.format(similarl), docName)
class FileOpener:
    """Open text files, detecting their encoding either with chardet or by
    trying the module-level ``encodings`` fallback list in order.

    ``open()`` returns ``(lines, encoding)``.
    """

    def __init__(self, use_chardet):
        self.use_chardet = use_chardet
        if use_chardet:
            self.init_chardet()

    def init_chardet(self):
        """Lazily import chardet and create the shared detector instance."""
        try:
            from chardet.universaldetector import UniversalDetector
        except ImportError:
            raise Exception("There's no chardet installed to import from. "
                            "Please, install it and check your PYTHONPATH "
                            "environment variable")
        self.encdetector = UniversalDetector()

    def open(self, filename):
        """Return (lines, encoding) for *filename* using the configured
        detection strategy."""
        if self.use_chardet:
            return self.open_with_chardet(filename)
        else:
            return self.open_with_internal(filename)

    def open_with_chardet(self, filename):
        """Detect the encoding with chardet, then reread the file as text."""
        # First pass: feed raw bytes until the detector is confident.
        self.encdetector.reset()
        with open(filename, 'rb') as f:
            for line in f:
                self.encdetector.feed(line)
                if self.encdetector.done:
                    break
        self.encdetector.close()
        encoding = self.encdetector.result['encoding']
        # Second pass: reread as text. A with-statement replaces the
        # original open/finally pair, which could call close() on the
        # already-closed binary handle when open() itself raised.
        try:
            with open(filename, encoding=encoding) as f:
                lines = f.readlines()
        except UnicodeDecodeError:
            print('ERROR: Could not detect encoding: %s' % filename,
                  file=sys.stderr)
            raise
        except LookupError:
            print('ERROR: %s -- Don\'t know how to handle encoding %s' % (
                filename, encoding), file=sys.stderr)
            raise
        return lines, encoding

    def open_with_internal(self, filename):
        """Try each fallback encoding until one decodes the file cleanly."""
        global encodings
        for curr, encoding in enumerate(encodings):
            try:
                # with-statement fixes the original's finally: f.close(),
                # which raised NameError when open() failed on the first
                # attempt (f unbound).
                with open(filename, 'r', encoding=encoding) as f:
                    lines = f.readlines()
                # Success. Note: an empty file is valid — the original's
                # 'if not lines' check wrongly raised 'Unknown encoding'
                # for zero-line files.
                return lines, encoding
            except UnicodeDecodeError:
                if not quiet_level & QuietLevels.ENCODING:
                    print('WARNING: Decoding file %s' % filename,
                          file=sys.stderr)
                    print('WARNING: using encoding=%s failed. ' % encoding,
                          file=sys.stderr)
                    # Fix: report the encoding we will actually try next
                    # (the original printed the one that just failed).
                    if curr + 1 < len(encodings):
                        print('WARNING: Trying next encoding: %s'
                              % encodings[curr + 1], file=sys.stderr)
        # Every candidate failed (also replaces the original's unguarded
        # encodings[curr] IndexError when the list was exhausted).
        raise Exception('Unknown encoding')