from typing import BinaryIO

from chardet.universaldetector import UniversalDetector


def detect_encoding(f: BinaryIO, limit: int = 2500) -> str:
    """
    Return the encoding of the provided input stream.

    Most of the time it's UTF-8, but if we are unable to decode the input
    natively, use `chardet` to determine the encoding heuristically.
    """
    unicode_decodable = True
    for line_no, line in enumerate(f):
        try:
            line.decode("utf-8")
        except UnicodeDecodeError:
            unicode_decodable = False
            break
        if line_no > limit:
            break

    if unicode_decodable:
        return "utf-8"

    f.seek(0)
    u = UniversalDetector()
    for line_no, line in enumerate(f):
        u.feed(line)
        if u.done or line_no > limit:
            break
    u.close()
    return u.result["encoding"]

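# A minimal usage sketch for detect_encoding above; "data.csv" is a
# hypothetical file name used purely for illustration.
with open("data.csv", "rb") as fh:
    enc = detect_encoding(fh)
    fh.seek(0)  # rewind before re-reading with the detected encoding
    text = fh.read().decode(enc)
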
def _get_dict_reader(self, resource_data_fp):
    '''
    Returns a dict reader for the given source file pointer. If the encoding of the
    source hasn't been guessed yet then this function will read from the file pointer
    until EOF and guess the encoding. The file pointer is reset to the start after
    this occurs.

    :param resource_data_fp: the file pointer to the source
    :return: a DictReader object
    '''
    if self.encoding is None:
        with ensure_reset(resource_data_fp):
            detector = UniversalDetector()
            while True:
                chunk = resource_data_fp.read(8192)
                if chunk:
                    detector.feed(chunk)
                else:
                    detector.close()
                    break
            self.encoding = detector.result[u'encoding']
        # if the detector failed to work out the encoding (unlikely) or if the encoding
        # it comes up with is ASCII, just default to UTF-8 (UTF-8 is a superset of ASCII)
        if self.encoding is None or self.encoding == u'ASCII':
            self.encoding = u'utf-8'

    # create and return the dict reader
    line_iterator = iter_universal_lines(resource_data_fp, self.encoding)
    return unicodecsv.DictReader(line_iterator, dialect=self.dialect,
                                 encoding=self.encoding)

def guessWithChardet(content):
    u = UniversalDetector()
    for line in content.split(b"\n"):
        u.feed(line)
    u.close()
    result = u.result
    return result

def detect_encoding(f):
    detector = UniversalDetector()
    detector.reset()
    for line in f:
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    return detector.result["encoding"]

def detect_encoding(path):
    detector = UniversalDetector()
    detector.reset()
    with open(path, "rb") as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    return detector.result["encoding"]

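# Example call for the path-based variant above; the path is hypothetical,
# and chardet may return None when it cannot reach a verdict.
print(detect_encoding("/tmp/unknown_encoding.txt"))  # e.g. "utf-8" or "SHIFT_JIS"
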
def detect_encoding(the_file):
    '''Detects file encoding using chardet, feeding lines until the detector
    is confident or the file is exhausted.
    '''
    detector = UniversalDetector()
    for line in the_file.readlines():
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    return detector.result

def _file_encoding(self):
    detector = UniversalDetector()
    for chunk in self.chunks:
        for line in chunk.splitlines():
            detector.feed(line)
            if detector.done:
                break
        # stop reading chunks too once the detector has reached a verdict
        if detector.done:
            break
    detector.close()
    encoding = detector.result['encoding']
    return encoding

def SetEncoding(self):
    if os.path.isfile(self.filename):
        from cchardet import UniversalDetector
        detector = UniversalDetector()
        with open(self.filename, "rb") as bigdata:
            for line in bigdata:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        result = detector.result
        # keep the current encoding if detection came back empty
        self.encoding = result.get('encoding') or self.encoding

def detect_encoding(f: BinaryIO, limit: int = 2500) -> str:
    u = UniversalDetector()
    for line in f:
        u.feed(line)
        limit -= 1
        if u.done or limit < 1:
            break
    u.close()
    encoding = u.result["encoding"]
    # chardet reports plain ASCII input as "ascii"; treat it as UTF-8 (a
    # superset), and fall back to UTF-8 when detection fails outright
    if encoding is None or encoding.lower() == "ascii":
        return "utf-8"
    return encoding

def detect_encoding_with_chardet(iostream, min_lines=10,
                                 max_lines=anycsvconfig.NO_SNIFF_LINES):
    detector = UniversalDetector()
    if max_lines != -1:
        max_lines = max(min_lines, max_lines)
    c = 0
    for line in iostream:
        c += 1
        detector.feed(line)
        # always feed at least min_lines; past that, stop once the detector is
        # done or the line budget is exhausted (note: when max_lines == -1 the
        # comparison is always true, so the loop stops right after min_lines)
        if c > min_lines and (detector.done or c >= max_lines):
            break
    detector.close()
    return detector.result  # {'encoding': ..., 'confidence': ...}

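# Sketch of consuming the full result dict returned above; unlike the other
# helpers this one also exposes chardet's confidence score. The file name is
# hypothetical and the 0.5 threshold is an arbitrary choice for the example.
result = detect_encoding_with_chardet(open("table.csv", "rb"))
if result["encoding"] is None or result["confidence"] < 0.5:
    encoding = "utf-8"  # no (or low-confidence) guess: fall back to a default
else:
    encoding = result["encoding"]
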
def detect_convert(filename):
    detector = UniversalDetector()
    detector.reset()
    cache = b''
    with open(filename, 'rb') as f:
        for line in f:
            detector.feed(line)
            cache += line
            if detector.done:
                break
        detector.close()
        encoding = detector.result['encoding'] or args.fallback_enc
        cache = cache.decode(encoding, errors='ignore')
        cache += f.read().decode(encoding, errors='ignore')
    cf = convertfunc(cache, args.locale, args.locale_only)
    return cf(cache)

def read_file(usr_fil, log=None):
    """Reads a file and returns its contents as a string.

    :param usr_fil: Required - Input file
    :param log: Optional - Log file
    :return: str: The file contents as a string
    """
    out = ""
    try:
        print(usr_fil)
        # Detect file encoding
        det = UniversalDetector()
        with open(usr_fil, 'rb') as file:
            for line in file:
                det.feed(line)
                if det.done:
                    break
        det.close()
        usr_enc = det.result.get("encoding")
        out += f"File encoding: {usr_enc}\n"
        # Open file using the detected encoding, falling back to UTF-8
        with open(usr_fil, 'r', encoding=usr_enc or 'UTF-8',
                  errors='ignore') as text_read:
            text = text_read.read()
        return text
    except IOError:
        out += "\nFile reading failed\n"
        return False
    finally:
        # Log info
        if log is not None:
            log.write(out)
        else:
            print(out, end='')

def convert(asmfile, opt, verbose, stdout) -> None:
    encoding = "utf-8"
    requires_manual_conversion = False
    try:
        with open(asmfile, "r") as f:
            text = f.readlines()
    except Exception:
        detector = UniversalDetector()
        with open(asmfile, "rb") as raw:
            for line in raw:
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        encod = detector.result
        if encod["confidence"] is not None and encod["confidence"] >= 0.5:
            encoding = encod["encoding"]
        else:
            # if confidence is low (or missing), try japanese
            encoding = "SHIFT_JIS"
        try:
            if verbose:
                print(f"Guessed encoding {encoding}, will try to parse now...")
            with open(asmfile, "r", encoding=encoding) as f:
                text = f.readlines()
        except Exception as e:
            raise e  # propagate

    bw_defs = []
    outputfile = asmfile.replace(".asm", "_sa1.asm")
    outfile = open(outputfile, "w", encoding=encoding)
    stdout.write(bytes(f"Processing file {asmfile}:\n", encoding=encoding))
    outlines = []
    if opt:
        outfile.write("incsrc conv_defines.asm\n")
    tot_conversions = 0
    for index, line in enumerate(text, start=1):
        outlines.append("")
        data_types = ["db", "dw", "dl", "dd"]
        in_comment = False
        in_data = False
        define_found = re.match(
            r"![A-Za-z\d_]+\s+=\s+((\$)?[\dA-Fa-f]{2,6})\S*", line.strip())
        words = re.split(r"([ \t;])", line.rstrip())
        if line.strip() == "" or line.lstrip().startswith(";") or define_found:
            # shortcuts for comments, blank lines and defines
            if define_found:
                requires_manual_conversion = True
                is_hex = define_found.group(2) is not None
                value = int(
                    define_found.group(1).replace("$", "0x")
                    if is_hex else define_found.group(1),
                    16 if is_hex else 10,
                )
                if value == 12:
                    stdout.write(
                        bytes(
                            f"There is define {define_found.group(0)} at line {index} which is equal to 12,"
                            f" this might be a define related to how many sprites can be loaded by the game;"
                            f" if so, change it to 22 or $16, or (even better) use the following\n"
                            f"\tif read1($00FFD5) == $23\n\t\t{define_found.group(0)}\n\telse\n\t\t"
                            f'{define_found.group(0).split("=")[0]}= {"$16" if is_hex else "22"}\n\tendif\n',
                            encoding=encoding,
                        ))
                elif is_hex and value in sprite_addr_list:
                    stdout.write(
                        bytes(
                            f"There is define {define_found.group(0)} at line {index} which is a sprite "
                            f"address; usually replacing the $ with ! works in most tools. It didn't get "
                            f"converted automatically because it might not be necessary to do so, so make "
                            f"sure to convert it manually ONLY if needed.\n",
                            encoding=encoding,
                        ))
                elif 0x0100 <= value <= 0x1FFF:
                    stdout.write(
                        bytes(
                            f"There is define {define_found.group(0)} at line {index} which might be a ram"
                            f" address; if it is, convert it by adding |!addr at the end of it, if it's not"
                            f" a ram address leave it alone\n",
                            encoding=encoding,
                        ))
            outlines[index - 1] = line.rstrip()
            continue
        ignore_next_address = False
        for og_word in words:
            stripped_word = og_word.strip()
            to_insert = ""
            if in_comment or in_data:
                pass
            elif stripped_word.startswith(";"):
                in_comment = True
            elif any(stripped_word.startswith(a) for a in data_types):
                in_data = True
            elif stripped_word.startswith("PEA") or stripped_word.startswith("PER"):
                ignore_next_address = True
            elif addr := re.findall(r"\$[\da-fA-F]{1,6}\|![a-zA-Z\d_]+\b", og_word):
                stdout.write(
                    bytes(
                        f"Possibly address {addr[0]} at line {index} was already hybrid.\n",
                        encoding=encoding,
                    ))
            elif re.findall(r"\$[^, \n()\[\]]{1,6}", og_word):
                if ignore_next_address:
                    ignore_next_address = False
                    outlines[index - 1] += og_word
                    continue
                splitted = re.split(r"([\[\](), ])", og_word)
                word_tuples = []
                for i, word in enumerate(splitted):
                    if word.startswith("$"):
                        try:
                            proc_word = eval(word.replace("$", "0x"))
                            # +\-^*~<> are some asar math ops
                            expr = re.split(r"[+\\\-^*~<>|]", word.replace("$", ""))
                            word = "${:0{}X}".format(proc_word,
                                                     max(len(e) for e in expr))
                            word_tuples.append((WordType.ADDR, word, i))
                        except SyntaxError:
                            bunch = re.split(r"([+\-^*~<>| ])", word)
                            for w in bunch:
                                if w.startswith("$"):
                                    word_tuples.append((WordType.ADDR, w, i))
                                else:
                                    word_tuples.append((WordType.OTHER, w, i))
                    elif word.startswith(","):
                        word_tuples.append((WordType.COMMA, word, i))
                    else:
                        word_tuples.append((WordType.OTHER, word, i))
                for wordtype, word, i in word_tuples:
                    if wordtype == WordType.ADDR:
                        try:
                            try:
                                comma_index = (i + 1
                                               if word_tuples[i + 1][0] == WordType.COMMA
                                               else -1)
                            except IndexError:
                                comma_index = -1
                            (ww, bwram_define_needed, converted,
                             manual_conversion) = process_word(
                                 word.replace("$", ""), stdout, encoding,
                                 index, splitted, comma_index)
                            if manual_conversion:
                                requires_manual_conversion = True
                            if converted:
                                tot_conversions += 1
                                stdout.write(
                                    bytes(f"Conversion: {word} -> {ww}\n",
                                          encoding=encoding))
                            bw_defs.append(bwram_define_needed)
                            to_insert += ww
                        except ValueError:
                            to_insert += word
                    else:
                        to_insert += word
            outlines[index - 1] += to_insert if to_insert != "" else og_word

import os
import sys

from chardet.universaldetector import UniversalDetector

detector = UniversalDetector()
arg = sys.argv[1:]
if arg:
    fns = list(filter(os.path.isfile, arg))
else:
    fns = ['-']
for fn in fns:
    if fn == '-':
        stream = sys.stdin.buffer
    else:
        stream = open(fn, 'rb')
    detector.reset()
    cache = []
    # feed lines until the detector is confident, keeping them for replay
    for line in stream:
        detector.feed(line)
        cache.append(line)
        if detector.done:
            break
    detector.close()
    encoding = detector.result['encoding'] or 'utf-8'
    # first emit the cached lines, then stream the remainder of the input
    for line in cache:
        sys.stdout.write(line.decode(encoding, errors='replace'))
    for line in stream:
        sys.stdout.write(line.decode(encoding, errors='replace'))
    stream.close()
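
# Assumed invocation of the script above (the script name is illustrative):
#   python recode_to_stdout.py notes.txt legacy.csv
#   cat legacy.csv | python recode_to_stdout.py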