Esempio n. 1
0
def detectEncoding(data):
    """Attempt to determine the encoding of a byte sequence.

  :param data: Input data on which to perform encoding detection.
  :type data: :class:`bytes`

  :return: Tuple of (encoding name, detection confidence).
  :rtype: :class:`tuple` of (:class:`str` or ``None``, :class:`float`)

  This function attempts to determine the character encoding of the input data.
  It returns a tuple with the most likely encoding (or ``None`` if the input
  data is not text) and the confidence of the detection.

  This function uses the :mod:`chardet` module, if it is available. Otherwise,
  only ``'ascii'`` is detected, and ``None`` is returned for any non-ASCII
  input.
  """

    if _haveCharDet:
        result = detect(data)
        return result["encoding"], result["confidence"]

    else:
        # Fallback: accept only printable ASCII (32-127) plus the common
        # control/whitespace characters BEL..CR (7-13).
        # BUGFIX: the original used the Python 2 str API
        # ``data.translate(None, chars)``, which raises TypeError on
        # Python 3 (and chardet itself requires bytes input).  Deleting
        # the allowed bytes leaves non-ASCII residue iff the data is not
        # plain ASCII text.
        ascii_bytes = bytes(range(7, 14)) + bytes(range(32, 128))
        if data.translate(None, delete=ascii_bytes):
            return None, 0.0

        return "ascii", 1.0
Esempio n. 2
0
def detect_encoding(bytesobject):
    """Read the first chunk of input and return its encoding.

    Valid UTF-8 is returned immediately; otherwise a detector (cchardet
    when installed, else chardet/charset_normalizer's ``detect``) is run on
    the first 5000 bytes, and re-run on the full input when the first
    guess is missing or has confidence below 0.98.
    """
    # unicode-test
    if isutf8(bytesobject):
        return 'UTF-8'
    # try one of the installed detectors on first part
    if cchardet is not None:
        guess = cchardet.detect(bytesobject[:5000])
    else:
        guess = detect(bytesobject[:5000])
    # BUGFIX: guard against a None guess *before* subscripting it for the
    # debug log (the None case was only checked afterwards, at which point
    # guess['encoding'] would already have raised TypeError).
    if guess is not None:
        LOGGER.debug('guessed encoding: %s, confidence: %s', guess['encoding'],
                     guess['confidence'])
    # fallback on full response
    if guess is None or (guess['confidence'] is not None
                         and guess['confidence'] < 0.98):
        guess = detect(bytesobject)
        LOGGER.debug('second-guessed encoding: %s, confidence: %s',
                     guess['encoding'], guess['confidence'])
    return guess['encoding']
Esempio n. 3
0
def get_file_encoding_delimiter(fpath: str) -> tuple:
    '''Return a ``(encoding, delimiter)`` tuple for the CSV file at *fpath*.

    The encoding is detected with charset_normalizer on the raw bytes; on
    any detection failure the encoding falls back to utf-8.  The delimiter
    is sniffed with :class:`csv.Sniffer`; a sniffed space delimiter is
    coerced to a tab.
    '''
    with open(fpath, mode='rb') as f_as_bytes:
        try:
            byte_contents = f_as_bytes.read()
            enc_data = charset_normalizer.detect(byte_contents)
            encoding = enc_data['encoding']
        except Exception as e:
            logging.warning(
                f'charset err: {e} when figuring out file {os.path.basename(fpath)} encoding. Defaulting to utf-8'
            )
            encoding = 'utf-8'

    # BUGFIX: detect() can succeed yet report no encoding (None), e.g. for
    # an empty or binary file; open(..., encoding=None) would then silently
    # use the locale default rather than the intended utf-8 fallback.
    if not encoding:
        encoding = 'utf-8'

    with open(fpath, mode='r', encoding=encoding) as f_text:
        text_contents = f_text.read()
        sniffer = csv.Sniffer()
        dialect = sniffer.sniff(text_contents)
        # a sniffed single-space delimiter is almost always a misdetection
        delimiter = dialect.delimiter if not dialect.delimiter == ' ' else '\t'
    return encoding, delimiter
Esempio n. 4
0
def decompress(data, enable_encoding_guessing=True):
    """
    Convert a base64-compressed subtitles file back to a string.

    :param data: the base64-encoded, gzip-compressed data
    :param bool enable_encoding_guessing: when truthy, run charset detection
        on the decompressed payload; otherwise assume UTF-8
    :return: the decoded text, or ``None`` if decoding with the detected
        encoding fails
    """

    # 16 + MAX_WBITS tells zlib to expect a gzip wrapper
    raw_subtitle = zlib.decompress(base64.b64decode(data), 16 + zlib.MAX_WBITS)
    encoding_detection = detect(
        raw_subtitle) if enable_encoding_guessing else None

    # BUGFIX: also fall back when detection ran but found no encoding —
    # raw_subtitle.decode(None) would raise an uncaught TypeError.
    if encoding_detection is None or encoding_detection['encoding'] is None:
        return raw_subtitle.decode('utf_8', errors='ignore')

    try:
        my_decoded_str = raw_subtitle.decode(encoding_detection['encoding'])
    except UnicodeDecodeError as e:
        print(e)
        return None

    return my_decoded_str
Esempio n. 5
0
def performance_compare(arguments):
    """Benchmark Chardet against Charset-Normalizer over ./char-dataset.

    Prints per-file timings and summary percentiles for both detectors.
    Returns 0 when Charset-Normalizer beats Chardet on both the average
    and the 99th-percentile delay, 1 otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Performance CI/CD check for Charset-Normalizer")

    parser.add_argument(
        '-s',
        '--size-increase',
        action="store",
        default=1,
        type=int,
        dest='size_coeff',
        help=
        "Apply artificial size increase to challenge the detection mechanism further"
    )

    args = parser.parse_args(arguments)

    timings_chardet = []
    timings_normalizer = []

    for tbt_path in glob("./char-dataset/**/*.*"):
        print(tbt_path)

        # Load the raw bytes, optionally inflated by the size coefficient
        with open(tbt_path, "rb") as fp:
            content = fp.read() * args.size_coeff

        # Time each detector in seconds, rounded to 5 decimal places
        start = time_ns()
        chardet_detect(content)
        timings_chardet.append(round((time_ns() - start) / 1000000000, 5))
        print("  --> Chardet: " + str(timings_chardet[-1]) + "s")

        start = time_ns()
        detect(content)
        timings_normalizer.append(round((time_ns() - start) / 1000000000, 5))
        print("  --> Charset-Normalizer: " + str(timings_normalizer[-1]) +
              "s")

    chardet_avg = mean(timings_chardet)
    chardet_p99 = calc_percentile(timings_chardet, 99)
    chardet_p95 = calc_percentile(timings_chardet, 95)
    chardet_p50 = calc_percentile(timings_chardet, 50)

    normalizer_avg = mean(timings_normalizer)
    normalizer_p99 = calc_percentile(timings_normalizer, 99)
    normalizer_p95 = calc_percentile(timings_normalizer, 95)
    normalizer_p50 = calc_percentile(timings_normalizer, 50)

    print("")

    # Summary tables (identical layout for both detectors)
    print("------------------------------")
    print("--> Chardet Conclusions")
    for label, value in (("Avg", chardet_avg), ("99th", chardet_p99),
                         ("95th", chardet_p95), ("50th", chardet_p50)):
        print("   --> " + label + ": " + str(value) + "s")

    print("------------------------------")
    print("--> Charset-Normalizer Conclusions")
    for label, value in (("Avg", normalizer_avg), ("99th", normalizer_p99),
                         ("95th", normalizer_p95), ("50th", normalizer_p50)):
        print("   --> " + label + ": " + str(value) + "s")

    if chardet_avg > normalizer_avg and chardet_p99 > normalizer_p99:
        return 0
    return 1
Esempio n. 6
0
    def detect_encoding(
            self,
            text: bytes,
            default_encodings: Optional[List[str]] = None) -> Tuple[str, str]:
        """
        Try to detect a file encoding from `text`, using either the chardet lib
        or by trying to decode the file.

        :param text: raw bytes to inspect.
        :param default_encodings: candidate encodings tried (in order) when
            ``self._encoding`` is ``"auto"``; defaults to ``["utf-8"]``.
        :return: tuple ``(decoded_text, encoding_used)``; both are ``None``
            when no candidate encoding could decode ``text``.
        """
        if not default_encodings:
            default_encodings = ["utf-8"]
        try:
            from charset_normalizer import detect
        except ImportError:
            # charset_normalizer unavailable: fall back to trial decoding
            detected_encoding = self.fallback_detection(text)
        else:
            detected_encoding = detect(text)
            # discard low-confidence guesses (threshold 0.48)
            if (detected_encoding["confidence"] is None
                    or detected_encoding["confidence"] < 0.48):
                detected_encoding = None
            elif detected_encoding["encoding"] == "ascii":
                # ASCII is a subset of the configured encoding; prefer that
                detected_encoding["encoding"] = self.encoding
            else:
                detected_encoding["encoding"] = detected_encoding[
                    "encoding"].lower()

        # Build the ordered list of encodings to attempt.
        encodings = []
        # Purposefully accessed the internal _encoding, as encoding is never 'auto'
        if self._encoding == "auto":
            # detected encoding first, then the defaults (deduplicated)
            if detected_encoding and detected_encoding[
                    "encoding"] not in encodings:
                encodings.append(detected_encoding["encoding"])
            for encoding in default_encodings:
                if encoding not in encodings:
                    encodings.append(encoding)
        elif detected_encoding:
            # split off a possible BOM suffix, e.g. "utf-8-sig" -> ("utf-8", "sig")
            if "-" in detected_encoding["encoding"]:
                encoding, suffix = detected_encoding["encoding"].rsplit("-", 1)
            else:
                encoding = detected_encoding["encoding"]
                suffix = None

            # Different charset, just with BOM
            if encoding == self.encoding and suffix == "sig":
                encodings.append(detected_encoding["encoding"])
            elif detected_encoding["encoding"] != self.encoding:
                # detection disagrees with the configured encoding: warn,
                # but still honour the configured encoding below
                logging.warning(
                    "trying to parse %s with encoding: %s but "
                    "detected encoding is %s (confidence: %s)",
                    self.filename,
                    self.encoding,
                    detected_encoding["encoding"],
                    detected_encoding["confidence"],
                )
            encodings.append(self.encoding)
        else:
            encodings.append(self.encoding)

        # First candidate that decodes cleanly wins; otherwise (None, None).
        for encoding in encodings:
            try:
                r_text = str(text, encoding)
                r_encoding = encoding
                break
            except UnicodeDecodeError:
                r_text = None
                r_encoding = None
        if r_encoding == "ascii":
            # report the more useful superset encoding
            r_encoding = "utf-8"
        return r_text, r_encoding
Esempio n. 7
0
def main():
    """Load racing-team volunteer records from the CSV file named on the
    command line into the database, skipping rows whose member is unknown
    or whose info record already exists."""
    descr = '''
    Update racing team info volunteer records from csv file
    '''
    parser = ArgumentParser(description=descr)
    parser.add_argument('inputfile', help='csv file with input records', default=None)
    args = parser.parse_args()

    scriptdir = dirname(__file__)
    # two levels up
    scriptfolder = dirname(dirname(scriptdir))
    configdir = join(scriptfolder, 'config')
    memberconfigfile = "members.cfg"
    memberconfigpath = join(configdir, memberconfigfile)
    userconfigfile = "users.cfg"
    userconfigpath = join(configdir, userconfigfile)

    # create app and get configuration
    # use this order so members.cfg overrrides users.cfg
    configfiles = [userconfigpath, memberconfigpath]
    app = create_app(Development(configfiles), configfiles)

    # set up database
    db.init_app(app)

    # determine input file encoding
    with open(args.inputfile, 'rb') as binaryfile:
        rawdata = binaryfile.read()
    detected = detect(rawdata)

    # need app context, open input file
    with app.app_context(), open(args.inputfile, 'r', encoding=detected['encoding'], newline='', errors='replace') as IN:
        # turn on logging
        setlogging()

        # trick local interest stuff
        g.interest = 'fsrc'

        # initialize database tables from input file
        infile = DictReader(IN)
        for row in infile:
            # first check if racing team member exists
            localuser = LocalUser.query.filter_by(name=row['name'], **localinterest_query_params()).one_or_none()
            member = RacingTeamMember.query.filter_by(localuser=localuser, **localinterest_query_params()).one_or_none() if localuser else None
            # skip rows that don't map to a known racing team member
            if not member: continue

            # this pulls timezone information off of timestamp, formatted like 'Sun Feb 25 2018 14:07:17 GMT-0500 (EST)'
            timestampasc = ' '.join(row['timestamp'].split(' ')[:-2])
            timestamp = tstamp.asc2dt(timestampasc)

            # if we already have received an info record for this member at this timestamp, skip it else we'll get duplicates
            inforec = RacingTeamInfo.query.filter_by(member=member, logtime=timestamp).one_or_none()
            if inforec: continue

            # if we've gotten here, we need to add info and volunteer records
            inforec = RacingTeamInfo(interest=localinterest(), member=member, logtime=timestamp)
            db.session.add(inforec)
            volrec = RacingTeamVolunteer(
                interest=localinterest(), 
                info=inforec, 
                eventdate = isodate.asc2dt(row['eventdate']).date(),
                eventname = row['eventname'],
                hours = row['hours'],
                comment = row['comments'],
            )
            db.session.add(volrec)

        # single commit so the whole file is applied atomically
        db.session.commit()
Esempio n. 8
0
def main():
    """Load racing-team application (and per-race result) records from the
    CSV file named on the command line into the database, skipping rows
    already present or lacking a date of birth."""
    descr = '''
    Update racing team info volunteer records from csv file
    '''
    parser = ArgumentParser(description=descr)
    parser.add_argument('inputfile',
                        help='csv file with input records',
                        default=None)
    args = parser.parse_args()

    scriptdir = dirname(__file__)
    # two levels up
    scriptfolder = dirname(dirname(scriptdir))
    configdir = join(scriptfolder, 'config')
    memberconfigfile = "members.cfg"
    memberconfigpath = join(configdir, memberconfigfile)
    userconfigfile = "users.cfg"
    userconfigpath = join(configdir, userconfigfile)

    # create app and get configuration
    # use this order so members.cfg overrrides users.cfg
    configfiles = [userconfigpath, memberconfigpath]
    app = create_app(Development(configfiles), configfiles)

    # set up database
    db.init_app(app)

    # determine input file encoding
    with open(args.inputfile, 'rb') as binaryfile:
        rawdata = binaryfile.read()
    detected = detect(rawdata)

    # translate type from old format to new
    applntype = {
        'Returning Racing Team Member': 'renewal',
        'New Racing Team Member': 'new',
    }

    # need app context, open input file
    with app.app_context(), open(args.inputfile,
                                 'r',
                                 encoding=detected['encoding'],
                                 newline='',
                                 errors='replace') as IN:
        # turn on logging
        setlogging()

        # trick local interest stuff
        g.interest = 'fsrc'

        # initialize database tables from input file
        infile = DictReader(IN)
        for row in infile:
            # this pulls timezone information off of record timestamp, formatted like 'Sun Feb 25 2018 14:07:17 GMT-0500 (EST)'
            timestampasc = ' '.join(row['time'].split(' ')[:-2])
            timestamp = tstamp.asc2dt(timestampasc)

            # if we already have received an application for this name at this timestamp, skip it else we'll get duplicates
            applnrec = RacingTeamApplication.query.filter_by(
                name=row['name'],
                logtime=timestamp,
                **localinterest_query_params()).one_or_none()
            if applnrec: continue

            # at least one record doesn't have a date of birth
            if not row['dob']:
                app.logger.warning(
                    f"racingteam_appln_init: skipping {row['name']} {row['race1-name']} {row[f'race1-date']}"
                )
                continue

            # if we've gotten here, we need to add application and result records
            dob = isodate.asc2dt(row['dob']).date()
            applnrec = RacingTeamApplication(
                interest=localinterest(),
                logtime=timestamp,
                name=row['name'],
                type=applntype[row['applntype']],
                comments=row['comments'],
                dateofbirth=dob,
                email=row['email'],
                gender=row['gender'].upper()[0],
            )
            db.session.add(applnrec)
            for race in ['race1', 'race2']:
                # originally, new members were only asked for one race
                # detect this condition and skip this result -- this should only happen for race2
                if not row[f'{race}-date']: continue

                # handle case where age grade was not calculated properly
                # this was due to deficiency in the original script, so these should be early entries
                # it's not worth adding the complexity to fix this data at this point
                # BUGFIX: the original assigned `float(...)` with a stray
                # trailing comma (creating a discarded tuple) and then
                # immediately overwrote it.  The intent is to *validate*
                # that the age grade parses as a number but store the
                # original string form.
                try:
                    float(row[f'{race}-agegrade'])
                    agegrade = row[f'{race}-agegrade']
                except ValueError:
                    agegrade = None

                # calculate age
                racedate = isodate.asc2dt(row[f'{race}-date']).date()
                thisage = age(racedate, dob)

                # add result
                resultrec = RacingTeamResult(
                    interest=localinterest(),
                    application=applnrec,
                    eventdate=racedate,
                    eventname=row[f'{race}-name'],
                    age=thisage,
                    agegrade=agegrade,
                    distance=row[f'{race}-distance'],
                    units=row[f'{race}-units'],
                    location=row[f'{race}-location'],
                    url=row[f'{race}-resultslink'],
                    time=row[f'{race}-time'],
                )
                db.session.add(resultrec)

        # single commit so the whole file is applied atomically
        db.session.commit()
Esempio n. 9
0

def _get_detector_name(module: Any) -> str:
    name = '-'.join(word.capitalize() for word in module.__name__.split('_'))
    return f'{name} v{module.__version__}'


# Human-readable labels (e.g. 'Chardet v5.2.0') used as dict keys below.
CHARDET = _get_detector_name(chardet)
C_CHARDET = _get_detector_name(cchardet)
CHARSET_NORMALIZER = _get_detector_name(charset_normalizer)
CHARAMEL = _get_detector_name(charamel)

# Uniform detector interface: callable taking bytes and returning the
# detected encoding name.
DETECTORS = {
    CHARDET: lambda c: chardet.detect(c)['encoding'],
    C_CHARDET: lambda c: cchardet.detect(c)['encoding'],
    CHARSET_NORMALIZER: lambda c: charset_normalizer.detect(c)['encoding'],
    # charamel's Detector.detect already returns the encoding directly
    CHARAMEL: charamel.Detector().detect,
}
SUPPORTED_ENCODINGS = {
    CHARDET: {
        charamel.Encoding.ASCII,
        charamel.Encoding.UTF_8,
        charamel.Encoding.UTF_16,
        charamel.Encoding.UTF_32,
        charamel.Encoding.BIG_5,
        charamel.Encoding.GB_2312,
        charamel.Encoding.HZ,
        charamel.Encoding.EUC_JP,
        charamel.Encoding.SHIFT_JIS,
        charamel.Encoding.CP_932,
        charamel.Encoding.ISO_2022_JP,
Esempio n. 10
0
def get_encoding(txt_path):
    """Return the encoding detected by charset_normalizer for *txt_path*."""
    with open(txt_path, 'rb') as handle:
        raw = handle.read()
    guess = charset_normalizer.detect(raw)
    return guess['encoding']
Esempio n. 11
0
    print(f"charset_normalizer {__version_cn__}")

    files: List[str] = get("http://127.0.0.1:8080/").json()

    print("## Testing with actual files")

    for file in files:
        r = get(
            "http://127.0.0.1:8080/" + file
        )

        if r.ok is False:
            print(f"Unable to retrieve '{file}' | HTTP/{r.status_code}")
            exit(1)

        expected_encoding = detect(r.content)["encoding"]

        if expected_encoding != r.apparent_encoding:
            print(f"Integration test failed | File '{file}' | Expected '{expected_encoding}' got '{r.apparent_encoding}'")
            exit(1)

        print(f"✅✅ '{file}' OK")

    print("## Testing with edge cases")

    # Should NOT crash
    get("http://127.0.0.1:8080/edge/empty/json").json()

    print("✅✅ Empty JSON OK")

    if get("http://127.0.0.1:8080/edge/empty/plain").apparent_encoding != "utf-8":