Example #1
def guess_encoding(
    data: Iterable[bytes], default_encoding: str = "utf-8"
) -> Tuple[str, float]:
    """
    Guess the encoding to decode bytes into corresponding string object.

    Uses chardet to attempt to progressively guess the encoding which can be used to
    decode the bytes into corresponding strings. Returns a tuple[encoding, confidence].

    Args:
        data (Iterable[bytes]): [description]
        default_encoding (str, optional): [description]. Defaults to "utf-8".

    Returns:
        Tuple[str, float]: [description]
    """
    if not CHARDET_INSTALLED:
        return _charset_guess_encoding(data=data, default_encoding=default_encoding)

    detector = UniversalDetector()
    for line in data:
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    return (
        # chardet reports None (not a missing key) when detection fails, so an
        # explicit fallback is needed rather than .get()'s default.
        detector.result.get("encoding") or default_encoding,  # type: ignore
        detector.result.get("confidence", 0.0),
    )
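
A minimal usage sketch for the helper above (the file name is hypothetical; any iterable of bytes, such as a binary file handle, satisfies the data parameter):

with open("data.csv", "rb") as f:  # hypothetical input file
    encoding, confidence = guess_encoding(f)
print(encoding, confidence)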
Example #2
 def __init__(self, api):
     self.encoding = "utf-8"
     self.lang = "lang"
     self.api = api
     self.text = self.api.ui.Input.Text(scroll=True, command=self.on_modify)
     self.text.pack(fill="both", expand=True)
     self.orig = ""
     if os.path.exists(self.api.filepath):
         try:
             from chardet import UniversalDetector
         except ImportError:
             pass
         else:
             detector = UniversalDetector()
             try:
                 with open(self.api.filepath, mode='rb') as f:
                     while True:
                         binary = f.readline()
                         if binary == b'':
                             break
                         detector.feed(binary)
                         if detector.done:
                             break
             finally:
                 detector.close()
             # Keep the utf-8/lang defaults set above if detection failed
             # (chardet reports None for undetected encodings).
             self.encoding = detector.result["encoding"] or self.encoding
             self.lang = detector.result["language"] or self.lang
         with open(api.filepath, "r", encoding=self.encoding) as f:
             self.orig = f.read()
             self.text.insert("end", self.orig)
         self.api.saved = True
     else:
         self.api.saved = False
Example #3
def chardet_detect(response):
    detector = UniversalDetector()
    for line in response.content.splitlines():
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    return detector.result
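
Example #3 assumes an object with a bytes content attribute, such as a requests response. A usage sketch (the requests call and URL are assumptions, not part of the original):

import requests

response = requests.get("https://example.com/data.csv")  # hypothetical URL
result = chardet_detect(response)
print(result["encoding"], result["confidence"])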
Example #4
def get_encoding(form):
    detector = UniversalDetector()
    for chunk in form.cleaned_data["upload_file"].chunks():
        detector.feed(chunk)
        if detector.done:
            break
    detector.close()
    return detector.result["encoding"]
Example #5
    def __EncodingDefinition(self, path):
        detector = UniversalDetector()
        # Use a context manager so the file handle is closed (the original leaked it)
        with open(path, 'rb') as f:
            for line in f:
                detector.feed(line)
                if detector.done:
                    break

        detector.close()
        return detector.result['encoding']
Example #6
def guess_file_encoding(path):
    _detector = UniversalDetector()
    with open(path, 'rb') as f:
        for line in f:  # iterate lazily instead of loading everything via readlines()
            _detector.feed(line)
            if _detector.done:
                break
    _detector.close()
    # chardet stores None rather than omitting the key, so .get()'s default
    # never fires; use "or" to apply the utf-8 fallback.
    return _detector.result.get('encoding') or 'utf-8'
Example #7
def detect_encoding(file_path):
    detector = UniversalDetector()  # the constructor already calls reset()
    with open(file_path, 'rb') as file:
        for line in file:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']
Example #8
def detect_encoding(f):
    u = UniversalDetector()
    for line in f:
        line = bytearray(line)
        u.feed(line)
        if u.done:
            break
    u.close()
    return u.result['encoding']
Example #9
 def _get_encoding(content: bytes) -> str:
     detector = UniversalDetector()
     timeout = datetime.datetime.now() + datetime.timedelta(seconds=5)
     for line in sliced(content, 2500):
         detector.feed(line)
         if detector.done or datetime.datetime.now() > timeout:
             break
     detector.close()
     return detector.result["encoding"]
Example #10
 def __fileEncoding__(self, filename):
     detector = UniversalDetector()
     with open(filename, 'rb') as fh:
         for line in fh:
             detector.feed(line)
             if detector.done:
                 break
     detector.close()
     return detector.result['encoding']
Example #11
    def __detectEncoding_file(self, filename):
        detector = UniversalDetector()
        with open(filename, 'rb') as fReader:
            text = fReader.readlines()

        for line in text:
            detector.feed(line)
            if detector.done:
                break
        detector.close()
        return detector.result['encoding']
Example #12
def detect_encod(file):
    detector = UniversalDetector()
    with open(file, 'rb') as d:
        for line in d:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result['encoding']
Example #13
    def __detectEncoding(self):
        detector = UniversalDetector()
        with open(os.path.join(self.sourcePath, self.fileNameWithExt),
                  'rb') as fReader:
            text = fReader.readlines()

        for line in text:
            detector.feed(line)
            if detector.done:
                break
        detector.close()
        return detector.result['encoding']
Example #14
def check_unicode(contents, contents_name="This"):
    detector = UniversalDetector()
    for line in contents.split(b'\n'):
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    result = detector.result
    if result['encoding'] not in ('utf-8', 'ascii'):
        # f-strings also avoid the TypeError the original raised when
        # concatenating a None encoding.
        print(f"{contents_name} is not Unicode!\nIt's detected as "
              f"{result['encoding']} with a confidence degree of "
              f"{result['confidence'] * 100}%.")
        return result['encoding']
    return 'utf-8'
Example #15
def detect_encoding(f, limit=100):
    u = UniversalDetector()
    for line in f:
        line = bytearray(line)
        u.feed(line)

        limit -= 1
        if u.done or limit < 1:
            break

    u.close()
    if u.result['encoding'] == 'ascii':
        return 'utf-8'
    else:
        return u.result['encoding']
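
The ascii-to-utf-8 substitution in Example #15 is safe because ASCII is a strict subset of UTF-8: any byte sequence that decodes as ASCII decodes identically as UTF-8. A quick check:

assert b"plain ascii".decode("ascii") == b"plain ascii".decode("utf-8")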
Example #16
    def process_upload(self, uploaded_file):
        with uploaded_file as f:
            # Accumulate file content by reading in chunks
            # We should not process the chunk straightaway because depending on the
            # chunk size last line of csv could be partial
            buf = BytesIO()
            for c in f.chunks(chunk_size=settings.UPLOAD_CHUNK_SIZE):
                buf.write(c)

            encoding_detector = UniversalDetector()
            buf.seek(0)
            encoding_detector.feed(buf.read())
            detection_result = encoding_detector.close()
            encoding = detection_result["encoding"]
            logging.info(f"Encoding detection result: {detection_result}")

            # Use the encoding detected by chardet; if decoding fails, fall back to utf-8
            try:
                buf.seek(0)
                decoded = buf.read().decode(encoding)
            except Exception:
                buf.seek(0)
                decoded = buf.read().decode("utf-8")
            # Split outside the try block: the original "finally" clause raised
            # NameError when both decode attempts failed.
            csv_lines = decoded.split("\n")

            records = self._build_records(csv_lines)

        logging.info(f"Successfully imported {len(records)} records")
        return records
    def clean_csv_file(self):
        """Validates the uploaded CSV file and creates a CSV DictReader from it."""
        # This could be an instance of InMemoryUploadedFile or TemporaryUploadedFile
        # (depending on the file size)
        file_field = self.cleaned_data['csv_file']

        # Guess the file encoding (primarily to check for a UTF-8 BOM)
        encoding_detector = UniversalDetector()
        for chunk in file_field.chunks():
            encoding_detector.feed(chunk)
            if encoding_detector.done:
                break

        detection_result = encoding_detector.close()
        self.cleaned_data['csv_file_encoding'] = detection_result['encoding']

        # Check that the file can actually be decoded using the detected encoding so that
        # we don't need to worry about encoding errors when reading the CSV
        file_field.seek(0)
        self._validate_encoding()

        file_field.seek(0)
        # Check that the CSV file has the required column
        self._validate_columns()

        file_field.seek(0)
        return file_field
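
Both helpers in Example #16 rely on UniversalDetector.close() returning the result dict, so detection_result = encoding_detector.close() is equivalent to calling close() and then reading the result attribute. A minimal check (the sample bytes are arbitrary):

from chardet.universaldetector import UniversalDetector

detector = UniversalDetector()
detector.feed(b"sample bytes")
result = detector.close()
assert result is detector.result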
Example #18
    def detect_encoding(file_path: str) -> str:
        """ Detects encoding of given file """
        detector = UniversalDetector()

        with open(file_path, 'rb') as file:
            for line in file.readlines():
                detector.feed(line)
                if detector.done:
                    break

        detector.close()

        encoding = detector.result['encoding']
        logger.debug(f'Detected encoding for file "{file_path}": {encoding}')

        return encoding
Example #19
def detectar_codif(archivo,
                   máx_líneas=None,
                   cortar=None,
                   cert=0.95,
                   auto='utf-8'):
    """
    Detecta la codificación de un fuente. (Necesario porque todavía existen programas dinosaurios que no entienden
    los milagros de unicódigo.)

    Parameters
    ----------
    archivo : str
        La dirección del fuente.
    máx_líneas : int
        El número máximo de líneas para pasar al detector. Por ejemplo, si únicamente la primera línea de tu documento
        tiene palabras (por ejemplo, un fuente .csv), no hay razón de seguir analizando las otras líneas.
    cortar : str
        Hasta dónde hay que leer cada línea. Es útil si tienes un csv donde únicamente la primera columna
        contiene texto.

    Returns
    -------
    str
        La codificación más probable.

    """

    detector = UniversalDetector()
    with open(archivo, 'rb') as d:
        for í, línea in enumerate(d):

            if cortar is None or cortar.encode() not in línea:
                detector.feed(línea)  # Feed the next line to the detector
            else:
                detector.feed(línea.split(cortar.encode())[0])

            # Stop if we have reached the maximum number of lines
            if máx_líneas is not None and í >= (máx_líneas - 1):
                break

            if detector.done:
                break  # Stop once the detector is already sure

    detector.close()  # Close the detector
    if detector.result['confidence'] > cert:
        return detector.result['encoding']
    return auto
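
A usage sketch for detectar_codif (the file name and arguments are hypothetical):

encoding = detectar_codif('datos.csv', máx_líneas=100, cortar=',')
print(encoding)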
Example #20
def get_encoding(_file):
    """
    function defines encoding of file in param
    :param _file:
    :return: encoding
    """
    print('Detecting encoding...')
    detector = UniversalDetector()
    with open(_file, 'rb') as file_descriptor:
        for line in file_descriptor:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    encoding = detector.result["encoding"]
    print('Detected encoding:', encoding)
    return encoding
Example #21
 def detect_encoding(file):
     """
     If you are having problems with character encoding, it
     may help to check the source file.  This function uses
     the chardet python library to determine the file encoding.
     Result is printed to the console.
     :param file: the path to a processed file that contains bad encoding.
     :return:
     """
     # Use a context manager (the original never closed the file) and avoid
     # shadowing the built-in input().
     with open(file, 'rb') as infile:
         print("Opened the source file: " + file)
         print(
             "Reading the file to detect character encoding.  This can take a while."
         )
         detector = UniversalDetector()
         for line in infile:
             detector.feed(line)
             if detector.done:
                 break
     detector.close()
     print()
     print(detector.result)
Example #22
def validate_utf8(file):
    base_name = basename(file)
    detector = UniversalDetector()
    with open(file, "rb") as f:
        for i, line in enumerate(f):
            detector.feed(line)
            if detector.done or i > 1000:
                break
    detector.close()
    result = detector.result
    if not (result["encoding"] == "utf-8" and result["confidence"] >= 0.99):
        warn(message=f"File {file} should encoding with UTF-8", level=1)
        sys.exit(1)
    with open(file, "r") as f:
        content = f.read()
    normalized_nfc_content = Text(content)
    if normalized_nfc_content != content:
        warn(message=f"File {base_name} should normalized to NFC",
             error_type="Format nfc-normalized-failed",
             file=base_name, level=1)
Example #23
def bytes_enc(bytes_content: bytes) -> Optional[str]:
    """バイト文字列のエンコード名を返す。
    Args: 
        bytes_content (bytes): バイト文字列
    Returns:
        str: エンコーディング名
        None: エンコーディングが不明
    """
    detector = UniversalDetector()
    buflen = 1000  # size of each buffer chunk passed to the detector
    for buf in seq_split(bytes_content, buflen):
        detector.feed(buf)
        if detector.done:
            break

    # Note: close() must be called before reading the encoding name from the
    # UniversalDetector result attribute; otherwise the encoding is not
    # reported correctly.
    detector.close()
    encdic = detector.result
    return encdic['encoding']
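
seq_split is not defined in Example #23; a minimal sketch of a compatible helper (an assumption, not the original implementation):

def seq_split(seq, n):
    # Yield successive slices of seq, each at most n items long.
    for i in range(0, len(seq), n):
        yield seq[i:i + n]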
Example #24
    def _detect_encoding(self, max_line=200):
        detector = UniversalDetector()

        with open(self._path, 'rb') as f:
            for num, line in enumerate(f, 1):
                if num > max_line:
                    break
                detector.feed(line)
                if detector.done:
                    break

            detector.close()

        if detector.result['confidence'] > 0.9:
            self._encoding = detector.result['encoding']
        else:
            # Low confidence: fall back to cp1256. Reading up to max_line lines
            # here will raise UnicodeDecodeError if the file is not actually
            # cp1256-decodable.
            with open(self._path, encoding='cp1256') as f:
                for num, line in enumerate(f, 1):
                    if num > max_line:
                        break

            self._encoding = 'cp1256'
Example #25
def file_encoding(path):
    # Byte values that encode Polish letters in ISO-8859-2 but not in
    # Windows-1250, and vice versa; counting them disambiguates the two.
    iso_unique = (b'\xb1', b'\xac', b'\xbc', b'\xa1', b'\xb6', b'\xa6')
    cp_unique = (b'\xb9', b'\xa5', b'\x9f', b'\x8f', b'\x8c', b'\x9c')

    iso_counter = 0
    cp_counter = 0

    _detector = UniversalDetector()
    with open(path, 'rb') as f:
        for line in f.readlines():
            for c in iso_unique:
                iso_counter += line.count(c)
            for c in cp_unique:
                cp_counter += line.count(c)

            _detector.feed(line)
            if _detector.done:
                break
    _detector.close()

    if _detector.result.get('confidence') < 0.95 and (cp_counter or iso_counter):
        return 'Windows-1250' if cp_counter > iso_counter else 'iso-8859-2'
    return _detector.result.get('encoding') or 'utf-8'
Example #26
    def clean_email_list(self):
        """Validates the uploaded CSV file and creates a CSV DictReader from it."""
        # This could be an instance of InMemoryUploadedFile or TemporaryUploadedFile
        # (depending on the file size)
        file_field = self.cleaned_data['email_list']

        # Guess the file encoding (primarily to check for a UTF-8 BOM)
        encoding_detector = UniversalDetector()
        for chunk in file_field.chunks():
            encoding_detector.feed(chunk)
            if encoding_detector.done:
                break

        detection_result = encoding_detector.close()
        encoding = detection_result['encoding']

        file_field.seek(0)
        csv_reader = csv.DictReader(
            io.TextIOWrapper(file_field, encoding=encoding))

        self._validate_columns(csv_reader)
        return csv_reader
Example #27
def check_encoding(binary):
    detector = UniversalDetector()
    detector.feed(binary)
    detector.close()
    return detector.result['encoding']
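
Example #27 feeds the whole buffer in a single call, which is fine for data already in memory; the incremental feed loop used elsewhere only matters when detection should stop early on large inputs. A usage sketch (the file path is hypothetical):

with open("unknown.txt", "rb") as f:
    print(check_encoding(f.read()))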
Example #28
usage = (
    "Usage: \n"
    " This script loads a data file as a Pandas DataFrame and summarizes contents.\n"
    " > python LoadDF.py filename.extension (.txt, .csv, .tsv, .xlsx)\n")

myargs = sys.argv  # read command line args
if len(myargs) == 1:  # if there are no args, then exit
    sys.exit(usage)

myfile = myargs[1]
myflag = 0
df = pd.DataFrame()
detector = UniversalDetector()
with open(myfile, "rb") as f:
    detector.feed(f.read())
detector.close()
dctResult = detector.result
myencoding = dctResult['encoding']
print("File", myfile, "encoded as", myencoding)

if re.search(r'\.txt$', myfile, re.IGNORECASE) is not None:
    print('TXT file')
    myflag += 1
    df = pd.read_csv(myfile, sep='\t', encoding=myencoding)
if re.search(r'\.tsv$', myfile, re.IGNORECASE) is not None:
    print('TSV file')
    myflag += 1
    df = pd.read_csv(myfile, sep='\t', encoding=myencoding)
if re.search(r'\.csv$', myfile, re.IGNORECASE) is not None:
    print('CSV file')
Example #29
parser.add_argument("-l",
                    help="list all encoding changes in file",
                    action='store_true')
parser.add_argument("-d", help="try to decode all Lines", action='store_true')
parser.add_argument('filename')
args = parser.parse_args()

with open(args.filename, 'rb') as infile:
    det = UniversalDetector()
    if args.l:
        print("listing encodings of file \"{}\"".format(args.filename))
        encoding = None
        for nl, line in enumerate(infile.readlines()):
            det.reset()
            det.feed(line)
            det.close()
            res = det.result
            if encoding != res["encoding"]:
                encoding = res["encoding"]
                if args.d:
                    print("{}#{}#{}({})".format(nl,
                                                line.decode(res["encoding"]),
                                                res["encoding"],
                                                res["confidence"]))
                else:
                    print("{}#{}#{}({})".format(nl, line, res["encoding"],
                                                res["confidence"]))
    else:
        i = 1000
        for line in infile.readlines():
            i -= 1