def detectEncoding(data):
    """Attempt to determine the encoding of a byte sequence.

    :param data: Input data on which to perform encoding detection.
    :type data: :class:`str`
    :return: Tuple of (encoding name, detection confidence).
    :rtype: :class:`tuple` of (:class:`str` or ``None``, :class:`float`)

    This function attempts to determine the character encoding of the
    input data.  It returns a tuple with the most likely encoding (or
    ``None`` if the input data is not text) and the confidence of the
    detection.

    This function uses the :mod:`chardet` module, if it is available.
    Otherwise, only ``'ascii'`` is detected, and ``None`` is returned
    for any non-ASCII input.
    """
    if _haveCharDet:
        result = detect(data)
        return result["encoding"], result["confidence"]
    else:
        # ASCII-only fallback.  The original used the Python 2 form
        # str.translate(None, deletechars), which raises TypeError on
        # Python 3 (str.translate accepts only a translation table there).
        # Allowed characters: control chars \a..\r (7-13) and printable
        # ASCII (32-127), matching the original delete-set.
        allowed = frozenset(
            map(chr, list(range(7, 14)) + list(range(32, 128))))
        if all(ch in allowed for ch in data):
            return "ascii", 1.0
        return None, 0.0
def detect_encoding(bytesobject):
    """Read the first chunk of input and return its encoding"""
    # Unicode fast path: valid UTF-8 needs no statistical detection.
    if isutf8(bytesobject):
        return 'UTF-8'
    # Try one of the installed detectors on the first part only, for speed.
    if cchardet is not None:
        guess = cchardet.detect(bytesobject[:5000])
    else:
        guess = detect(bytesobject[:5000])
    # Guard before subscripting: the original logged guess['encoding']
    # unconditionally, which would raise TypeError before the `guess is
    # None` fallback below could run.
    if guess is not None:
        LOGGER.debug('guessed encoding: %s, confidence: %s',
                     guess['encoding'], guess['confidence'])
    # Fall back on the full response when the first guess is missing or
    # not confident enough.
    if guess is None or (guess['confidence'] is not None
                         and guess['confidence'] < 0.98):
        guess = detect(bytesobject)
        LOGGER.debug('second-guessed encoding: %s, confidence: %s',
                     guess['encoding'], guess['confidence'])
    return guess['encoding']
def get_file_encoding_delimiter(fpath: str) -> tuple:
    '''returns tuple of file encoding and delimiter

    The encoding is guessed with charset_normalizer, defaulting to utf-8
    when detection raises or reports no encoding; the delimiter is sniffed
    by csv.Sniffer, with a sniffed space delimiter mapped to a tab.
    '''
    with open(fpath, mode='rb') as f_as_bytes:
        try:
            byte_contents = f_as_bytes.read()
            enc_data = charset_normalizer.detect(byte_contents)
            # The detector may report encoding=None (e.g. for an empty
            # file); passing None to open() would silently use the locale
            # encoding, so treat it like a detection failure instead.
            encoding = enc_data['encoding'] or 'utf-8'
        except Exception as e:
            logging.warning(
                f'charset err: {e} when figuring out file {os.path.basename(fpath)} encoding. Defaulting to utf-8'
            )
            encoding = 'utf-8'
    with open(fpath, mode='r', encoding=encoding) as f_text:
        text_contents = f_text.read()
    sniffer = csv.Sniffer()
    dialect = sniffer.sniff(text_contents)
    # A sniffed space delimiter is assumed to really mean tab-separated.
    delimiter = dialect.delimiter if not dialect.delimiter == ' ' else '\t'
    return encoding, delimiter
def decompress(data, enable_encoding_guessing=True):
    """
    Convert a base64-compressed subtitles file back to a string.

    :param data: the compressed data
    :param bool enable_encoding_guessing: when True, guess the text encoding
        with the detector; otherwise decode as UTF-8 ignoring errors
    :return: the decoded subtitles text, or ``None`` when decoding with the
        guessed encoding fails
    """
    # 16 + MAX_WBITS tells zlib to expect gzip-wrapped data.
    raw_subtitle = zlib.decompress(base64.b64decode(data),
                                   16 + zlib.MAX_WBITS)
    encoding_detection = detect(
        raw_subtitle) if enable_encoding_guessing is True else None
    # Lenient UTF-8 fallback when guessing is disabled OR the detector
    # could not name an encoding -- bytes.decode(None) would raise
    # TypeError, which the except clause below does not catch.
    if encoding_detection is None or not encoding_detection['encoding']:
        return raw_subtitle.decode('utf_8', errors='ignore')
    try:
        my_decoded_str = raw_subtitle.decode(encoding_detection['encoding'])
    except UnicodeDecodeError as e:
        print(e)
        return None
    return my_decoded_str
def performance_compare(arguments):
    """Time Chardet against Charset-Normalizer over the char-dataset corpus.

    :param arguments: argv-style list, parsed for the -s/--size-increase option
    :return: 0 when Charset-Normalizer beats Chardet on both average and
        99th-percentile latency, 1 otherwise
    """

    def _timed_call(func, payload):
        # Time one detection call; seconds rounded to 5 decimal places.
        before = time_ns()
        func(payload)
        return round((time_ns() - before) / 1000000000, 5)

    def _report(label, results):
        # Print the summary-statistics block for one detector.
        print("------------------------------")
        print("--> " + label + " Conclusions")
        print("   --> Avg: " + str(mean(results)) + "s")
        for pct in (99, 95, 50):
            print("   --> " + str(pct) + "th: " +
                  str(calc_percentile(results, pct)) + "s")

    parser = argparse.ArgumentParser(
        description="Performance CI/CD check for Charset-Normalizer")
    parser.add_argument(
        '-s', '--size-increase',
        action="store",
        default=1,
        type=int,
        dest='size_coeff',
        help="Apply artificial size increase to challenge the detection mechanism further"
    )
    args = parser.parse_args(arguments)

    chardet_results = []
    charset_normalizer_results = []

    for tbt_path in glob("./char-dataset/**/*.*"):
        print(tbt_path)
        # Read the raw bytes once; optionally inflate to stress detection.
        with open(tbt_path, "rb") as fp:
            content = fp.read() * args.size_coeff

        chardet_results.append(_timed_call(chardet_detect, content))
        print("  --> Chardet: " + str(chardet_results[-1]) + "s")

        charset_normalizer_results.append(_timed_call(detect, content))
        print("  --> Charset-Normalizer: " +
              str(charset_normalizer_results[-1]) + "s")

    chardet_avg_delay = mean(chardet_results)
    chardet_99p = calc_percentile(chardet_results, 99)
    charset_normalizer_avg_delay = mean(charset_normalizer_results)
    charset_normalizer_99p = calc_percentile(charset_normalizer_results, 99)

    print("")
    _report("Chardet", chardet_results)
    _report("Charset-Normalizer", charset_normalizer_results)

    # Success requires beating chardet on both mean and tail latency.
    return 0 if (chardet_avg_delay > charset_normalizer_avg_delay
                 and chardet_99p > charset_normalizer_99p) else 1
def detect_encoding(
        self,
        text: bytes,
        default_encodings: Optional[List[str]] = None) -> Tuple[str, str]:
    """
    Try to detect a file encoding from `text`, using either the chardet lib
    or by trying to decode the file.

    :param text: raw bytes whose encoding should be determined
    :param default_encodings: candidate encodings tried (in order) when the
        configured encoding is 'auto'; defaults to ["utf-8"]
    :return: (decoded text, encoding used); the decoded text may be None if
        every candidate encoding failed to decode `text`
    """
    if not default_encodings:
        default_encodings = ["utf-8"]
    try:
        from charset_normalizer import detect
    except ImportError:
        # presumably fallback_detection returns a chardet-style dict with
        # 'encoding'/'confidence' keys -- confirm against its definition
        detected_encoding = self.fallback_detection(text)
    else:
        detected_encoding = detect(text)
    # Discard low-confidence guesses; normalize the rest to lowercase.
    # NOTE(review): assumes this post-processing applies to both detection
    # paths above -- confirm against the original file's indentation.
    if (detected_encoding["confidence"] is None
            or detected_encoding["confidence"] < 0.48):
        detected_encoding = None
    elif detected_encoding["encoding"] == "ascii":
        # ASCII is a subset of most encodings; prefer the configured one.
        detected_encoding["encoding"] = self.encoding
    else:
        detected_encoding["encoding"] = detected_encoding[
            "encoding"].lower()
    # Build the ordered list of encodings to attempt.
    encodings = []
    # Purposefully accessed the internal _encoding, as encoding is never 'auto'
    if self._encoding == "auto":
        # Detected encoding first, then the defaults, without duplicates.
        if detected_encoding and detected_encoding[
                "encoding"] not in encodings:
            encodings.append(detected_encoding["encoding"])
        for encoding in default_encodings:
            if encoding not in encodings:
                encodings.append(encoding)
    elif detected_encoding:
        # Split a possible BOM suffix (e.g. 'utf-8-sig' -> 'utf-8', 'sig').
        if "-" in detected_encoding["encoding"]:
            encoding, suffix = detected_encoding["encoding"].rsplit("-", 1)
        else:
            encoding = detected_encoding["encoding"]
            suffix = None
        # Different charset, just with BOM
        if encoding == self.encoding and suffix == "sig":
            encodings.append(detected_encoding["encoding"])
        elif detected_encoding["encoding"] != self.encoding:
            # Detector disagrees with the configured encoding; warn but
            # still try the configured one.
            logging.warning(
                "trying to parse %s with encoding: %s but "
                "detected encoding is %s (confidence: %s)",
                self.filename,
                self.encoding,
                detected_encoding["encoding"],
                detected_encoding["confidence"],
            )
            encodings.append(self.encoding)
        # NOTE(review): when the detected encoding equals self.encoding
        # exactly (no 'sig' suffix), neither branch appends anything,
        # leaving `encodings` empty and the loop below never binding
        # r_text/r_encoding -> NameError at the return. Confirm and fix.
    else:
        encodings.append(self.encoding)
    # Try each candidate until one decodes cleanly.
    for encoding in encodings:
        try:
            r_text = str(text, encoding)
            r_encoding = encoding
            break
        except UnicodeDecodeError:
            r_text = None
            r_encoding = None
    if r_encoding == "ascii":
        # ASCII decoded fine, but report the more general utf-8 superset.
        r_encoding = "utf-8"
    return r_text, r_encoding
def main():
    """Load racing team volunteer info records from a csv file into the DB."""
    descr = '''
    Update racing team info volunteer records from csv file
    '''
    parser = ArgumentParser(description=descr)
    parser.add_argument('inputfile', help='csv file with input records', default=None)
    args = parser.parse_args()

    # Configuration files live two levels above this script, in config/.
    scriptdir = dirname(__file__)
    scriptfolder = dirname(dirname(scriptdir))
    configdir = join(scriptfolder, 'config')
    memberconfigpath = join(configdir, "members.cfg")
    userconfigpath = join(configdir, "users.cfg")

    # Build the app; members.cfg is listed last so it overrides users.cfg.
    configfiles = [userconfigpath, memberconfigpath]
    app = create_app(Development(configfiles), configfiles)

    # Attach the database to the app.
    db.init_app(app)

    # Sniff the input file's character encoding from its raw bytes.
    with open(args.inputfile, 'rb') as binaryfile:
        rawdata = binaryfile.read()
        detected = detect(rawdata)

    # Everything below needs an app context and the decoded csv input.
    with app.app_context(), open(args.inputfile, 'r', encoding=detected['encoding'],
                                 newline='', errors='replace') as IN:
        # turn on logging
        setlogging()

        # trick local interest stuff
        g.interest = 'fsrc'

        reader = DictReader(IN)
        for row in reader:
            # Only rows naming a known racing team member are loaded.
            localuser = LocalUser.query.filter_by(
                name=row['name'], **localinterest_query_params()).one_or_none()
            member = (RacingTeamMember.query.filter_by(
                localuser=localuser, **localinterest_query_params()).one_or_none()
                if localuser else None)
            if not member:
                continue

            # Drop the timezone suffix off timestamps formatted like
            # 'Sun Feb 25 2018 14:07:17 GMT-0500 (EST)'.
            timestampasc = ' '.join(row['timestamp'].split(' ')[:-2])
            timestamp = tstamp.asc2dt(timestampasc)

            # Skip rows whose info record already exists, to avoid duplicates.
            existing = RacingTeamInfo.query.filter_by(
                member=member, logtime=timestamp).one_or_none()
            if existing:
                continue

            # New data: create the info record and its volunteer record.
            inforec = RacingTeamInfo(interest=localinterest(), member=member,
                                     logtime=timestamp)
            db.session.add(inforec)
            volrec = RacingTeamVolunteer(
                interest=localinterest(),
                info=inforec,
                eventdate=isodate.asc2dt(row['eventdate']).date(),
                eventname=row['eventname'],
                hours=row['hours'],
                comment=row['comments'],
            )
            db.session.add(volrec)

        db.session.commit()
def main():
    """Load racing team application (and race result) records from a csv file."""
    descr = '''
    Update racing team info volunteer records from csv file
    '''
    parser = ArgumentParser(description=descr)
    parser.add_argument('inputfile', help='csv file with input records', default=None)
    args = parser.parse_args()

    scriptdir = dirname(__file__)
    # two levels up
    scriptfolder = dirname(dirname(scriptdir))
    configdir = join(scriptfolder, 'config')
    memberconfigfile = "members.cfg"
    memberconfigpath = join(configdir, memberconfigfile)
    userconfigfile = "users.cfg"
    userconfigpath = join(configdir, userconfigfile)

    # create app and get configuration
    # use this order so members.cfg overrrides users.cfg
    configfiles = [userconfigpath, memberconfigpath]
    app = create_app(Development(configfiles), configfiles)

    # set up database
    db.init_app(app)

    # determine input file encoding
    with open(args.inputfile, 'rb') as binaryfile:
        rawdata = binaryfile.read()
        detected = detect(rawdata)

    # translate type from old format to new
    applntype = {
        'Returning Racing Team Member': 'renewal',
        'New Racing Team Member': 'new',
    }

    # need app context, open input file
    with app.app_context(), open(args.inputfile, 'r', encoding=detected['encoding'],
                                 newline='', errors='replace') as IN:
        # turn on logging
        setlogging()

        # trick local interest stuff
        g.interest = 'fsrc'

        # initialize database tables from input file
        infile = DictReader(IN)
        for row in infile:
            # this pulls timezone information off of record timestamp,
            # formatted like 'Sun Feb 25 2018 14:07:17 GMT-0500 (EST)'
            timestampasc = ' '.join(row['time'].split(' ')[:-2])
            timestamp = tstamp.asc2dt(timestampasc)

            # if we already have received an application for this name at this
            # timestamp, skip it else we'll get duplicates
            applnrec = RacingTeamApplication.query.filter_by(
                name=row['name'], logtime=timestamp,
                **localinterest_query_params()).one_or_none()
            if applnrec:
                continue

            # at least one record doesn't have a date of birth
            if not row['dob']:
                app.logger.warning(
                    f"racingteam_appln_init: skipping {row['name']} {row['race1-name']} {row[f'race1-date']}"
                )
                continue

            # if we've gotten here, we need to add application and result records
            dob = isodate.asc2dt(row['dob']).date()
            applnrec = RacingTeamApplication(
                interest=localinterest(),
                logtime=timestamp,
                name=row['name'],
                type=applntype[row['applntype']],
                comments=row['comments'],
                dateofbirth=dob,
                email=row['email'],
                gender=row['gender'].upper()[0],
            )
            db.session.add(applnrec)

            for race in ['race1', 'race2']:
                # originally, new members were only asked for one race
                # detect this condition and skip this result -- this should
                # only happen for race2
                if not row[f'{race}-date']:
                    continue

                # handle case where age grade was not calculated properly
                # this was due to deficiency in the original script, so these
                # should be early entries; it's not worth adding the
                # complexity to fix this data at this point
                try:
                    # float() only validates the value is numeric; the raw csv
                    # string is what gets stored.  (Fixed: the original had a
                    # stray trailing comma -- `float(...),` -- which built a
                    # throwaway 1-tuple before being overwritten.)
                    float(row[f'{race}-agegrade'])
                    agegrade = row[f'{race}-agegrade']
                except ValueError:
                    agegrade = None

                # calculate age
                racedate = isodate.asc2dt(row[f'{race}-date']).date()
                thisage = age(racedate, dob)

                # add result
                resultrec = RacingTeamResult(
                    interest=localinterest(),
                    application=applnrec,
                    eventdate=racedate,
                    eventname=row[f'{race}-name'],
                    age=thisage,
                    agegrade=agegrade,
                    distance=row[f'{race}-distance'],
                    units=row[f'{race}-units'],
                    location=row[f'{race}-location'],
                    url=row[f'{race}-resultslink'],
                    time=row[f'{race}-time'],
                )
                db.session.add(resultrec)

        db.session.commit()
def _get_detector_name(module: Any) -> str: name = '-'.join(word.capitalize() for word in module.__name__.split('_')) return f'{name} v{module.__version__}' CHARDET = _get_detector_name(chardet) C_CHARDET = _get_detector_name(cchardet) CHARSET_NORMALIZER = _get_detector_name(charset_normalizer) CHARAMEL = _get_detector_name(charamel) DETECTORS = { CHARDET: lambda c: chardet.detect(c)['encoding'], C_CHARDET: lambda c: cchardet.detect(c)['encoding'], CHARSET_NORMALIZER: lambda c: charset_normalizer.detect(c)['encoding'], CHARAMEL: charamel.Detector().detect, } SUPPORTED_ENCODINGS = { CHARDET: { charamel.Encoding.ASCII, charamel.Encoding.UTF_8, charamel.Encoding.UTF_16, charamel.Encoding.UTF_32, charamel.Encoding.BIG_5, charamel.Encoding.GB_2312, charamel.Encoding.HZ, charamel.Encoding.EUC_JP, charamel.Encoding.SHIFT_JIS, charamel.Encoding.CP_932, charamel.Encoding.ISO_2022_JP,
def get_encoding(txt_path):
    """Return the character encoding detected for the file at *txt_path*."""
    with open(txt_path, 'rb') as handle:
        raw = handle.read()
    return charset_normalizer.detect(raw)['encoding']
print(f"charset_normalizer {__version_cn__}") files: List[str] = get("http://127.0.0.1:8080/").json() print("## Testing with actual files") for file in files: r = get( "http://127.0.0.1:8080/" + file ) if r.ok is False: print(f"Unable to retrieve '{file}' | HTTP/{r.status_code}") exit(1) expected_encoding = detect(r.content)["encoding"] if expected_encoding != r.apparent_encoding: print(f"Integration test failed | File '{file}' | Expected '{expected_encoding}' got '{r.apparent_encoding}'") exit(1) print(f"✅✅ '{file}' OK") print("## Testing with edge cases") # Should NOT crash get("http://127.0.0.1:8080/edge/empty/json").json() print("✅✅ Empty JSON OK") if get("http://127.0.0.1:8080/edge/empty/plain").apparent_encoding != "utf-8":