def build_schema(infile, outfile, delimiter=None, quotechar='"', encoding=None,
                 dataset_name=None, base="https://iisg.amsterdam/"):
    """
    Build a CSVW schema based on the ``infile`` CSV file, and write the
    resulting JSON CSVW schema to ``outfile``. Takes various optional
    parameters for instructing the CSV reader, but is also quite good at
    guessing the right values.
    """
    url = os.path.basename(infile)
    # Get the current date and time (UTC)
    today = datetime.datetime.utcnow().strftime("%Y-%m-%d")

    if dataset_name is None:
        dataset_name = url

    if encoding is None:
        detector = UniversalDetector()
        with open(infile, 'rb') as f:
            for line in f.readlines():
                detector.feed(line)
                if detector.done:
                    break
        detector.close()
        encoding = detector.result['encoding']
        logger.info("Detected encoding: {} ({} confidence)".format(
            detector.result['encoding'], detector.result['confidence']))

    if delimiter is None:
        try:
            # Python 3
            with open(infile, 'r', errors='ignore') as csvfile:
                # dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=";,$\t")
                # Read only the header instead of the entire file to determine the delimiter
                dialect = csv.Sniffer().sniff(csvfile.readline())
                csvfile.seek(0)
        except TypeError:
            # Python 2
            with open(infile, 'r') as csvfile:
                # dialect = csv.Sniffer().sniff(csvfile.read(1024), delimiters=";,$\t")
                # Read only the header instead of the entire file to determine the delimiter
                dialect = csv.Sniffer().sniff(csvfile.readline())
                csvfile.seek(0)
        logger.info("Detected dialect: {} (delimiter: '{}')".format(
            dialect, dialect.delimiter))
        delimiter = dialect.delimiter

    logger.info("Delimiter is: {}".format(delimiter))

    if base.endswith('/'):
        base = base[:-1]

    metadata = {
        u"@id": iribaker.to_iri(u"{}/{}".format(base, url)),
        u"@context": [u"https://raw.githubusercontent.com/CLARIAH/COW/master/csvw.json",
                      {u"@language": u"en", u"@base": u"{}/".format(base)},
                      get_namespaces(base)],
        u"url": url,
        u"dialect": {u"delimiter": delimiter,
                     u"encoding": encoding,
                     u"quoteChar": quotechar},
        u"dc:title": dataset_name,
        u"dcat:keyword": [],
        u"dc:publisher": {
            u"schema:name": u"CLARIAH Structured Data Hub - Datalegend",
            u"schema:url": {u"@id": u"http://datalegend.net"}
        },
        u"dc:license": {u"@id": u"http://opendefinition.org/licenses/cc-by/"},
        u"dc:modified": {u"@value": today, u"@type": u"xsd:date"},
        u"tableSchema": {
            u"columns": [],
            u"primaryKey": None,
            u"aboutUrl": u"{_row}"
        }
    }

    with io.open(infile, 'rb') as infile_file:
        r = csv.reader(infile_file, delimiter=delimiter, quotechar=quotechar)

        try:
            # Python 2
            header = r.next()
        except AttributeError:
            # Python 3
            header = next(r)

        logger.info(u"Found headers: {}".format(header))

        if u'' in header:
            logger.warning("WARNING: You have one or more empty column headers "
                           "in your CSV file. Conversion might produce incorrect "
                           "results because of conflated URIs or worse")
        if len(set(header)) < len(header):
            logger.warning("WARNING: You have two or more column headers that "
                           "are syntactically the same. Conversion might produce "
                           "incorrect results because of conflated URIs or worse")

        # First column is primary key
        metadata[u'tableSchema'][u'primaryKey'] = header[0]

        for head in header:
            col = {
                u"@id": iribaker.to_iri(u"{}/{}/column/{}".format(base, url, head)),
                u"name": head,
                u"titles": [head],
                u"dc:description": head,
                u"datatype": u"string"
            }
            metadata[u'tableSchema'][u'columns'].append(col)

    with open(outfile, 'w') as outfile_file:
        outfile_file.write(json.dumps(metadata, indent=True))

    logger.info("Done")
    return
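# Usage sketch for the function above (file names are hypothetical, not from
# the original source): derive a CSVW schema for a local CSV and write the
# JSON metadata next to it, letting encoding and delimiter be auto-detected.
build_schema('data.csv', 'data.csv-metadata.json', dataset_name='Example dataset')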
def create_search_index(self):
    search_index = set()
    detector = UniversalDetector()

    ### Project code
    try:
        search_index.add(self.proj_kod.decode("utf-8"))
    except:
        search_index.add(self.proj_kod)

    ## Project name
    for w in self.proj_name.split():
        try:
            search_index.add(w.decode("utf-8"))
        except:
            search_index.add(w)

    ### Project initiator
    if self.proj_init:
        try:
            search_index.add(self.proj_init.name.decode("utf-8"))
        except:
            search_index.add(self.proj_init.name)

    ### Project executor
    if self.executor:
        try:
            search_index.add(self.executor.name.decode("utf-8"))
        except:
            search_index.add(self.executor.name)

    ### Stages
    if self.stage:
        try:
            search_index.add(self.stage.getfullname().decode("utf-8"))
        except:
            search_index.add(self.stage.getfullname())

    ### Address
    if self.data.has_key('address'):
        for addr in self.data["address"]:
            try:
                search_index.add(addr["city"].decode("utf-8"))
            except:
                search_index.add(addr["city"])
            try:
                search_index.add(addr["street"].decode("utf-8"))
            except:
                search_index.add(addr["street"])

    ### Counterparty
    for w in self.contragent.split():
        try:
            search_index.add(w.decode("utf-8"))
        except:
            search_index.add(w)

    ### Assigned workers
    for ex in self.reestr_proj_exec_date_set.all():
        if ex.worker:
            for w in ex.worker.get_full_name().split():
                try:
                    search_index.add(w.decode("utf-8"))
                except:
                    search_index.add(w)

    ### Links to other systems
    if self.data.has_key('other_system'):
        for code in self.data['other_system']:
            try:
                search_index.add(code['other_name'].decode("utf-8"))
            except:
                search_index.add(code['other_name'])
            try:
                search_index.add(code['other_code'].decode("utf-8"))
            except:
                search_index.add(code['other_code'])

    detector.close()
    self.search_index = u"".join(list(search_index))
    self.save()
    return "ok"
def load_file(self, f_path):
    """
    Load data from a CSV file to the workspace.
    Column 0 is used for the index column.
    chardet attempts to determine encoding if file is not utf-8.

    # Attributes
        f_path(String): The filename selected via open_file
    """
    # FIXME: Reset status bar when new data is loaded.
    try:
        self.full_data = pd.read_csv(f_path, encoding='utf-8', index_col=0)
    except UnicodeDecodeError as ude:
        self.logger.warning("UnicodeDecode error opening file", exc_info=True)
        self.comms.update_statusbar.emit(
            "Attempting to determine file encoding...")
        detector = UniversalDetector()
        try:
            for line in open(f_path, 'rb'):
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            print("chardet determined encoding type to be {}".format(
                detector.result['encoding']))
            self.full_data = pd.read_csv(
                f_path, encoding=detector.result['encoding'], index_col=0)
        except Exception as e:
            self.logger.error("Error detecting encoding", exc_info=True)
            exceptionWarning("Exception has occurred.", exception=e)
    except IOError as ioe:
        self.logger.error("IOError detecting encoding", exc_info=True)
        exceptionWarning("IO exception occurred while opening file.", exception=ioe)
    except Exception as e:
        self.logger.error("Error detecting encoding", exc_info=True)
        exceptionWarning("Error occurred opening file.", exception=e)

    try:
        columns = self.full_data.columns
        self.available_columns = []
        for column in columns:
            if column.endswith("text"):
                self.available_columns.append(column)
        if self.available_columns:
            self.available_column_model.loadData(self.available_columns,
                                                 include_labels=False)
            self.available_column_model.setAllowableData(self.allowable_columns)
            # drop_cols = [col for col in self.full_data.columns if col not in self.available_columns]
            # self.full_data.drop(drop_cols, axis=1, inplace=True)
            # print("full_data columns: ", self.full_data.columns)
            self.full_text_count.setText(str(self.full_data.shape[0]))
            # self.display_selected_row(None)
            self.select_all_btn.setEnabled(True)
            self.deselect_all_btn.setEnabled(True)
            self.comms.update_statusbar.emit("CSV loaded.")
        else:
            exceptionWarning("No allowable data discovered in file.")
    except pd.errors.EmptyDataError as ede:
        exceptionWarning('Empty Data Error.\n', exception=ede)
    except Exception as e:
        self.logger.error("Error loading dataframe", exc_info=True)
        exceptionWarning("Exception occurred. PredictWidget.load_file.", exception=e)
def load_file(self, f_path):
    """
    Load data from a CSV file to the workspace.
    Column 0 is used for the index column.
    chardet attempts to determine encoding if file is not utf-8.

    # Attributes
        f_path(String): The filename selected via open_file
    """
    # FIXME: Reset status bar when new data is loaded.
    try:
        self.update_progressbar.emit(0, True)
        self.available_column_model.loadData([])
        self.select_all_btn.setEnabled(False)
        self.deselect_all_btn.setEnabled(False)
        self.full_data = pd.read_csv(f_path, encoding='utf-8', index_col=0, sep=None)
    except UnicodeDecodeError as ude:
        self.logger.warning("UnicodeDecode error opening file", exc_info=True)
        print("UnicodeDecodeError caught. File is not UTF-8 encoded. "
              "Attempting to determine file encoding...")
        self.update_statusbar.emit(
            "File is not UTF-8 encoded. Attempting to determine file encoding...")
        detector = UniversalDetector()
        try:
            for line in open(f_path, 'rb'):
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            self.update_statusbar.emit("Chardet determined encoding type to be {}".format(
                detector.result['encoding']))
            self.logger.info("Chardet determined encoding type to be {}".format(
                detector.result['encoding']))
            self.full_data = pd.read_csv(
                f_path, encoding=detector.result['encoding'], index_col=0)
        except Exception as e:
            self.logger.error("Error detecting encoding", exc_info=True)
            exceptionWarning("Exception has occurred.", exception=e)
    except IOError as ioe:
        self.logger.error("IOError detecting encoding", exc_info=True)
        exceptionWarning("IO exception occurred while opening file.", exception=ioe)
    except Exception as e:
        self.logger.error("Error detecting encoding", exc_info=True)
        exceptionWarning("Error occurred opening file.", exception=e)

    # TODO: clean up dataset by removing NA for values or index
    try:
        columns = self.full_data.columns
        self.available_columns = []
        for column in columns:
            if column.endswith(DATA_COLUMN_SUFFIX):
                label_col = column.split(TAG_DELIMITER)[0] + TRUTH_SUFFIX
                if label_col in columns:
                    self.available_columns.append(column)
                    self.available_columns.append(label_col)
        # If no data found, the model will be reset.
        if self.available_columns:
            self.available_column_model.loadData(self.available_columns)
            self.full_text_count.setText(str(self.full_data.shape[0]))
            self.display_selected_rows(None)
            self.update_statusbar.emit("CSV loaded.")
            self.select_all_btn.setEnabled(True)
            self.deselect_all_btn.setEnabled(True)
        else:
            exceptionWarning(f"No usable data found in {f_path}")
            self.logger.info(f"No usable data found in {f_path}")
            self.update_statusbar.emit("No usable data found in file")
            self.available_column_model.setCheckboxes(False)
            self.load_selected_data()
    except pd.errors.EmptyDataError as ede:
        exceptionWarning(exceptionTitle='Empty Data Error.\n', exception=ede)
    except Exception as e:
        self.logger.error("Error loading dataframe", exc_info=True)
        exceptionWarning("Exception occurred. DataLoader.load_file.", exception=e)
        tb = traceback.format_exc()
        print(tb)
    finally:
        self.update_progressbar.emit(0, False)
def __init__(self, fileName):
    self.fileName = fileName
    self.detector = UniversalDetector()
def detect_file_encoding(file_path, buffer_size=1024, max_lines=20):
    """
    Determine encoding of files within initial `max_lines` of length
    `buffer_size`.

    :param file_path: path to the file
    :type file_path: str
    :param buffer_size: buffer length for each line being read
    :type buffer_size: int
    :param max_lines: number of lines to read from file of length buffer_size
    :type max_lines: int
    :return: encoding type
    :rtype: str
    """
    detector = UniversalDetector()
    line_count = 0
    with FileOrBufferHandler(file_path, "rb") as input_file:
        chunk = input_file.read(buffer_size)
        while chunk and line_count < max_lines:
            detector.feed(chunk)
            chunk = input_file.read(buffer_size)
            line_count += 1
    detector.close()
    encoding = detector.result["encoding"]

    # Typical file representation is utf-8 instead of ascii, treat as such.
    if not encoding or encoding.lower() in ["ascii", "windows-1254"]:
        encoding = "utf-8"

    # Check if encoding can be used to decode without throwing an error
    def _decode_is_valid(encoding):
        try:
            with FileOrBufferHandler(file_path, encoding=encoding) as input_file:
                input_file.read(1024 * 1024)
            return True
        except Exception:
            return False

    if not _decode_is_valid(encoding):
        try:
            from charset_normalizer import CharsetNormalizerMatches as CnM

            # Try with small sample
            with FileOrBufferHandler(file_path, "rb") as input_file:
                raw_data = input_file.read(10000)
                result = CnM.from_bytes(
                    raw_data,
                    steps=5,
                    chunk_size=512,
                    threshold=0.2,
                    cp_isolation=None,
                    cp_exclusion=None,
                    preemptive_behaviour=True,
                    explain=False,
                )
                result = result.best()
            if result:
                if result.first():
                    encoding = result.first().encoding

            # Try again with full sample
            if not _decode_is_valid(encoding):
                with FileOrBufferHandler(file_path, "rb") as input_file:
                    raw_data = input_file.read(max_lines * buffer_size)
                    result = CnM.from_bytes(
                        raw_data,
                        steps=max_lines,
                        chunk_size=buffer_size,
                        threshold=0.2,
                        cp_isolation=None,
                        cp_exclusion=None,
                        preemptive_behaviour=True,
                        explain=False,
                    )
                    result = result.best()
                if result:
                    if result.first():
                        encoding = result.first().encoding
        except Exception:
            logger.info("Install charset_normalizer for improved file "
                        "encoding detection")

    # If no encoding is still found, default to utf-8
    if not encoding:
        encoding = "utf-8"
    return encoding.lower()
def set_source(self, name):
    # source _dependent_ initialization goes here
    if name is None or not os.path.isfile(name):
        return False
    IP.set_source(self, name)
    self.__source_name = name

    # auto-detect file-encoding (optional)
    try:
        from chardet.universaldetector import UniversalDetector
        detector = UniversalDetector()
        detector.reset()
        lines = 0
        for line in file(self.__source_name, 'rb'):
            detector.feed(line)
            lines += 1
            if detector.done or lines == 50:
                break
        detector.close()
        encoding = string.lower(detector.result['encoding'])
    except:
        log.exception('')
        encoding = 'utf_8'
    encoding = self._encoding_cleanup.sub('', encoding)
    model = self.gtk.get_widget('e_encoding').get_model()
    itempos = 0
    for item in model:
        pos1 = string.find(
            self._encoding_cleanup.sub('', string.lower(str(item[0]))),
            encoding)
        if pos1 == 0:
            break
        itempos += 1
    self.gtk.get_widget('e_encoding').set_active(itempos)

    # auto-detect CSV import settings (optional)
    try:
        import csv
        sniffer = csv.Sniffer()
        csvfilesize = os.path.getsize(self.__source_name)
        if csvfilesize > 65535:
            csvfilesize = 65535
        csvfile = file(self.__source_name, 'rb')
        try:
            # quote char, line terminator and field delimiter
            proposed_dialect = sniffer.sniff(csvfile.read(csvfilesize))
            self.gtk.get_widget('e_delimiter').set_text(
                proposed_dialect.delimiter)
            self.gtk.get_widget('e_quotechar').set_text(
                proposed_dialect.quotechar)
            if proposed_dialect.lineterminator == '\r\n':
                self.gtk.get_widget('e_lineterminator').set_active(1)
            # first row with column headers
            csvfile.seek(0)
            if sniffer.has_header(csvfile.read(csvfilesize)):
                self.gtk.get_widget('e_startrow').set_text('1')
            else:
                self.gtk.get_widget('e_startrow').set_text('0')
        finally:
            csvfile.close()
    except:
        log.exception('')

    # run dialog
    response = self.gtk.get_widget('d_import').run()
    if response == gtk.RESPONSE_OK:
        return True
    else:
        return False
# 3.2 Read the whole file to determine its encoding
f_temp = open('测试文件.txt', 'rb')  # 测试文件.txt is a GB2312-encoded text file created on Windows 7
f_content = f_temp.read()
enc = chardet.detect(f_content)
f_temp.close()
result = '测试文件.txt' + " info >>\n\t" + "Language: " + enc['language'] + \
         "\n\tEncoding: " + enc['encoding'] + "\n\tConfidence: " + str(enc['confidence'])
print(result)
# 测试文件.txt info >>  Language: Chinese  Encoding: GB2312  Confidence: 0.99

# 3.3 Advanced usage: when the document to examine is very large, use chardet's
# submodule chardet.universaldetector. It lets us feed the text in several
# passes (line by line, or in chunks of our own choosing) and stop early once a
# confidence threshold is reached. This saves resources and makes the program
# more efficient while keeping the detection result accurate.
detector = UniversalDetector()  # initialize a UniversalDetector object
detector.reset()                # clear any previous detection state
f = open('测试文件.txt', 'rb')
for line in f:
    detector.feed(line)  # feed lines into the UniversalDetector one at a time
    if detector.done:    # `done` is a boolean, False by default; it becomes True once the threshold is reached
        break
detector.close()  # call this to finalize the accumulated data
f.close()
print(detector.result)
# {'language': 'Chinese', 'encoding': 'GB2312', 'confidence': 0.99}
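# The feed/done/close pattern described above recurs in almost every snippet in
# this collection. As a consolidated sketch (not taken from any one of the
# original sources; the max_lines cap is an illustrative addition), a reusable
# helper might look like this:
from chardet.universaldetector import UniversalDetector

def detect_encoding(path, max_lines=None):
    """Detect a file's encoding by feeding it to chardet line by line."""
    detector = UniversalDetector()
    with open(path, 'rb') as f:
        for count, line in enumerate(f, start=1):
            detector.feed(line)
            if detector.done or (max_lines is not None and count >= max_lines):
                break
    detector.close()  # finalize before reading detector.result
    return detector.result  # e.g. {'encoding': 'GB2312', 'confidence': 0.99, 'language': 'Chinese'}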
def filelist(root):
    wb = px.load_workbook('./filelist.xlsx')  # Excel workbook to write into
    ws = wb.active
    START_COL = 4  # starting row
    ROW_C = 67     # column C
    ROW_M = 77     # column M
    ROW_V = 86     # column V
    ID_LIST = [65, 1, 1, 1, 1, 1, 1, 1, 1, 1]  # 65 = A
    PREV_COUNTER = 0
    PREV_DIRECTORY = ''

    ## Recursively walk the current directory
    for dirpath, dirname, filename in os.walk(root):
        for FILENAME in filename:
            ## Restrict to files with an '.html' or '.php' extension
            if fnmatch.fnmatch(FILENAME, '*.html') or fnmatch.fnmatch(FILENAME, '*.php'):
                ## Target file path
                html = os.path.join(dirpath, FILENAME)

                ## Detect the target file's character encoding
                detector = UniversalDetector()
                with open(html, mode='rb') as f:
                    for binary in f:
                        detector.feed(binary)
                        if detector.done:
                            break
                detector.close()

                ## Strip the root path, which is not needed
                PATH = html.replace(root, '')
                DIRECTORY = dirpath.replace(root, '') + '/'
                print(PATH)

                ## Extract information from the HTML
                try:
                    soup = BeautifulSoup(
                        open(html, encoding=detector.result['encoding']), "lxml")

                    ## Get the title
                    HEAD = soup.find("head")
                    TITLE_CONTENT = HEAD.find("title")
                    if TITLE_CONTENT != None:
                        TITLE = TITLE_CONTENT.text
                    else:
                        ## If there is no title, set 'null'
                        TITLE = 'null'

                    ## Get the keywords
                    META_KEYWORDS = HEAD.find('meta', attrs={'name': 'keywords'})
                    if META_KEYWORDS != None:
                        KEYWORDS = META_KEYWORDS.attrs['content']
                    else:
                        ## If there are no keywords, set 'null'
                        KEYWORDS = 'null'

                    ## Get the description
                    META_DESCRIPTION = soup.find('meta', attrs={'name': 'description'})
                    if META_DESCRIPTION != None:
                        DESCRIPTION = META_DESCRIPTION.attrs['content']
                    else:
                        ## If there is no description, set 'null'
                        DESCRIPTION = 'null'
                ## If the HTML information could not be read
                except:
                    TITLE = 'error'
                    KEYWORDS = ''
                    DESCRIPTION = ''

                ## Count the directory depth
                COUNTER = Counter(PATH)

                ## Set the page ID
                if PREV_COUNTER >= COUNTER['/']:
                    ID_LIST[(COUNTER['/'] - 1)] = ID_LIST[(COUNTER['/'] - 1)] + 1
                    for i in range(COUNTER['/'], 6):
                        ID_LIST[i] = 1
                if COUNTER['/'] == 2:
                    if PREV_DIRECTORY != DIRECTORY:
                        ID_LIST[0] = ID_LIST[0] + 1
                        ID_LIST[1] = 1

                ## Remember the previous page's information
                PREV_COUNTER = COUNTER['/']
                PREV_DIRECTORY = DIRECTORY

                ## Write the page ID
                num = 0
                if COUNTER['/'] == 1:
                    ROW_ID = 2
                else:
                    ROW_ID = COUNTER['/']
                for i in range(ROW_C, ROW_C + ROW_ID):
                    if i == ROW_C:
                        ws[str(chr(i)) + str(START_COL)].value = str(chr(ID_LIST[0]))
                    else:
                        ws[str(chr(i)) + str(START_COL)].value = ID_LIST[num]
                        num = num + 1

                ## Write the title
                for i in range(ROW_M, ROW_V):
                    if i == ROW_M + (COUNTER['/'] - 1):
                        ws[str(chr(i)) + str(START_COL)].value = TITLE
                    ## Columns to the left of this page's depth
                    if i - ROW_M < (COUNTER['/'] - 1):
                        ws[str(chr(i)) + str(START_COL)].border = Border(
                            left=Side(style='thin', color='000000'),
                        )
                    ## The column at this page's depth
                    elif i - ROW_M == (COUNTER['/'] - 1):
                        ws[str(chr(i)) + str(START_COL)].border = Border(
                            top=Side(style='thin', color='000000'),
                            left=Side(style='thin', color='000000'),
                        )
                    ## Columns to the right of this page's depth
                    else:
                        ws[str(chr(i)) + str(START_COL)].border = Border(
                            top=Side(style='thin', color='000000'),
                            bottom=Side(style='thin', color='000000'),
                        )

                ## Write the per-page information
                # ws['R' + str(START_COL)].value = DIRECTORY  # directory
                # ws['S' + str(START_COL)].value = FILENAME   # file name
                ws['W' + str(START_COL)].value = PATH         # path
                ws['X' + str(START_COL)].value = KEYWORDS     # keywords
                ws['Y' + str(START_COL)].value = DESCRIPTION  # description
                START_COL = START_COL + 1

    ## Adjust the borders of the last row
    for i in range(ROW_M, ROW_V):
        if i == ROW_M:
            ws[str(chr(i)) + str(START_COL)].border = Border(
                top=Side(style='thin', color='000000'),
                bottom=Side(style='thin', color='000000'),
                left=Side(style='thin', color='000000'),
            )
        elif i == ROW_V:
            ws[str(chr(i)) + str(START_COL)].border = Border(
                top=Side(style='thin', color='000000'),
                bottom=Side(style='thin', color='000000'),
                right=Side(style='thin', color='000000'),
            )
        else:
            ws[str(chr(i)) + str(START_COL)].border = Border(
                top=Side(style='thin', color='000000'),
                bottom=Side(style='thin', color='000000'),
            )

    ## Save the Excel workbook
    wb.save('./filelist.xlsx')
def run(self):
    confidence = 0
    size = os.stat(self.file_name).st_size
    if BINARY.search(self.file_name):
        encoding = 'BINARY'
        confidence = 1
    elif size > 1048576 and maybe_binary(self.file_name):
        encoding = 'BINARY'
        confidence = 0.7
    elif size > 1048576:
        # skip files > 1Mb
        encoding = 'Unknown'
        confidence = 1
    else:
        started_at = time.time()
        timeout = False
        detector = UniversalDetector()
        fp = open(self.file_name, 'rb')
        line = fp.readline(500)
        while line != '':
            detector.feed(line)
            if time.time() - started_at > 8:
                timeout = True
                break
            line = fp.readline(8000)
        fp.close()
        detector.close()
        if timeout == False or (timeout == True and detector.done):
            encoding = str(detector.result['encoding']).upper()
            confidence = detector.result['confidence']
        else:
            encoding = 'Unknown'
            confidence = 1

        if encoding == 'ASCII':
            encoding = 'UTF-8'
        elif encoding == None or encoding == 'NONE' or encoding == '' or \
                encoding == 'Unknown' or confidence < 0.7:
            if encoding == 'ISO-8859-2' and confidence > 0.69:
                workaround = self.test_fallback_encodings(
                    ['UTF-8', 'ISO-8859-1'])
                if workaround != False:
                    encoding = workaround
                else:
                    encoding = 'Unknown'
            elif encoding != 'ISO-8859-2' and confidence > 0.49:
                if encoding == 'WINDOWS-1252':
                    encoding = 'ISO-8859-1'
                else:
                    fallback = self.test_fallback_encodings()
                    if fallback == False:
                        encoding = 'Unknown'
                    else:
                        encoding = fallback

        # workarounds here
        if encoding == 'ISO-8859-2' or encoding == 'MACCYRILLIC':
            workaround = self.test_fallback_encodings(
                ['UTF-8', 'ISO-8859-1'])
            if workaround != False:
                encoding = workaround
        del detector

    sublime.set_timeout(lambda: self.callback(encoding, confidence), 0)
def getFileEncoding(filePath):
    detector = UniversalDetector()
    # Use a context manager so the file handle is closed after feeding.
    with open(filePath, 'rb') as f:
        detector.feed(f.read())
    detector.close()
    return detector.result['encoding']
def __init__(self, filename):
    self.detector = UniversalDetector()
    self.encodingFormat(filename)
    self.decoderFile(filename)
def read_into_dataframe(file: IO, filename: str = "", nrows: int = 100,
                        max_characters: int = 50) -> pd.DataFrame:
    """Reads a file into a DataFrame.

    Infers the file encoding and whether a header column exists.

    Args:
        file (IO): file buffer.
        filename (str): filename. Used to infer compression.
        nrows (int, optional): number of rows to peek. Default: 100.
        max_characters (int, optional): max characters a column name can have
            to be distinguished from a real text value.

    Returns:
        A pandas.DataFrame.
    """
    detector = UniversalDetector()
    for line, text in enumerate(file):
        detector.feed(text)
        if detector.done or line > nrows:
            break
    detector.close()
    encoding = detector.result.get("encoding")

    compression = infer_compression(filename, "infer")

    file.seek(0, SEEK_SET)
    contents = file.read()

    with BytesIO(contents) as file:
        df0 = pd.read_csv(
            file,
            encoding=encoding,
            compression=compression,
            sep=None,
            engine="python",
            header="infer",
            nrows=nrows,
        )

    df0_cols = list(df0.columns)

    # Check if all columns are strings and short strings
    # (real text values tend to be long)
    column_names_checker = all([type(item) == str for item in df0_cols])
    if column_names_checker:
        column_names_checker = all(
            [len(item) < max_characters for item in df0_cols])

    # Check if any column name can be converted to a float
    conversion_checker = True
    for item in df0_cols:
        try:
            item = float(item)
            conversion_checker = False
            break
        except:
            pass

    # Prefix and header
    final_checker = True if (column_names_checker and conversion_checker) else False
    header = "infer" if final_checker else None
    prefix = None if header else "col"

    with BytesIO(contents) as file:
        df = pd.read_csv(
            file,
            encoding=encoding,
            compression=compression,
            sep=None,
            engine="python",
            header=header,
            prefix=prefix,
        )
    return df
def parse_dat_file(dat_path, spec_csv_path, out_folder):
    """Parse a .DAT file (CSPro fixed-width text datafile) into a series of
    CSV files containing the tabular data for each table contained in the
    .DAT and described in the associated .DCF file.

    Developed for use in particular with DAT files provided in the
    "hierarchical data" from DHS, but may be more generally applicable to
    CSPro format files.

    The .DCF file must be parsed first, using DCF_Parser, and the table
    specification file it generates is used by this function to parse the
    data file. Produces one CSV data file for every table (recordtype)
    defined in the .DCF and occurring in the .DAT.
    """
    filecode = os.path.extsep.join(os.path.basename(dat_path).split(os.path.extsep)[:-1])

    # See if we've already done this one
    test_fn = os.path.join(out_folder, f"{filecode}.REC01.csv")
    if os.path.exists(test_fn):
        print("Already parsed " + filecode)
        return
    print("Parsing " + dat_path)

    # Read the parsed file specification in CSV form which was created by
    # parsing the .dcf file.
    # The first row specifies where, on all subsequent rows, the "record type"
    # is found, i.e. the identifier that specifies which table the row defines
    # a variable for. This is constant throughout the file.
    # Each remaining item in the parsed DCF spec defines one field from one
    # table, specifying at what position that field's data is found in the
    # fixed-width text format row when the row's record_type_info
    # (destination table name) is for this table.
    with open(spec_csv_path, 'r') as dict_file:
        dict_file_reader = csv.DictReader(dict_file)
        # the record type position info must be in the first line
        recordtype_info = next(dict_file_reader)
        rt_start = int(recordtype_info['Start']) - 1
        rt_end = int(recordtype_info['Len']) + rt_start
        all_vars_this_file = [row for row in dict_file_reader]

    for field_info in all_vars_this_file:
        field_info['Start'] = int(field_info['Start'])
        field_info['Len'] = int(field_info['Len'])

    # sort them by record type (i.e. destination table) then position in the
    # row (order of fields)
    sorted_fields = sorted(all_vars_this_file,
                           key=(itemgetter('RecordTypeValue', 'Start')))

    # build a dictionary of record type (i.e. tablename) : list of its fields
    # (i.e. field infos)
    rt_field_info = {}
    for field_info in sorted_fields:
        record_tag = field_info['RecordTypeValue']
        if record_tag not in rt_field_info:
            rt_field_info[record_tag] = []
        rt_field_info[record_tag].append(field_info)

    # now parse the data file
    result = {}
    n_cols_per_table = {}
    detector = UniversalDetector()
    with open(dat_path, 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
    detector.close()
    enc = detector.result['encoding']

    with open(dat_path, 'r', encoding=enc) as data:
        for i, line in enumerate(data):
            # if i == 0 and line.startswith(codecs.BOM_UTF8):
            #     print(f"File {dat_path} appears to contain BOM; ignoring it")
            #     line = line[len(codecs.BOM_UTF8):]
            record_type = line[rt_start:rt_end]
            if record_type not in rt_field_info:
                print("Specification for recordtype '{0!s}' not found in file for {1!s} at line {2!s}".format(
                    record_type, filecode, i))
                continue
            record_spec = rt_field_info[record_type]
            if record_type not in result:
                result[record_type] = []

            # Split the column-aligned text according to the row specification.
            # The .DAT format allows a fixed width for each column of each
            # recordtype. Should we strip the whitespace on shorter values?
            # This is difficult. In general, yes we should, because values are
            # stored as fixed-width and, where shorter than the field, are
            # padded with spaces, which would take up unnecessary space and
            # would prevent joining/comparison between surveys.
            # HOWEVER in the case of the CASEID / HHID variables we must NOT
            # strip the whitespace. The HHID is usually the CASEID with the
            # last 3 chars trimmed off, but if we trim "some" whitespace from
            # HHID here then we can break that association and damage
            # referential integrity.
            # On the other hand some joins are based on e.g. BIDX (recorded as
            # len 2) to MIDX (recorded as len 1, despite containing the same
            # data), and we need to join on a single digit found in both, so
            # BIDX would need to be stripped.
            # Define a lambda to strip or not strip accordingly, and use it in
            # a list comp to split the row into its field values.
            strip_or_not = lambda data, name: data if name in ('CASEID', 'HHID') else data.strip()
            rowParts = [strip_or_not(
                (line[i['Start'] - 1: i['Start'] + i['Len'] - 1]), i['Name'])
                for i in record_spec]
            if record_type not in n_cols_per_table:
                n_cols_per_table[record_type] = len(rowParts)
            else:
                assert len(rowParts) == n_cols_per_table[record_type]
            # add as a list to the list of rows for this record type
            result[record_type].append(rowParts)  # (",".join(rowParts))

    for record_type, field_infos in rt_field_info.items():
        if not record_type in result:
            print(f"No rows were found for record type {record_type} in file {filecode} despite DCF specification")
            continue
        field_header = [i['Name'] for i in field_infos]
        field_records = set([i['RecordName'] for i in field_infos])
        assert len(field_records) == 1
        rec_name = field_records.pop()
        if not os.path.exists(out_folder):
            os.makedirs(out_folder)
        out_fn = os.path.join(out_folder, f"{filecode}.{rec_name}.csv")
        with open(out_fn, 'w', newline='', encoding='utf-8') as out_csv:
            csv_writer = csv.writer(out_csv)
            csv_writer.writerow(field_header)
            csv_writer.writerows(result[record_type])
def __init__(self):
    self.detector = UniversalDetector()
def detect_file_encoding(training_file, file_encoding, max_passwords=10000,
                         default='utf-8'):
    print()
    print("Attempting to autodetect file encoding of the file: " +
          str(training_file), file=sys.stderr)
    print("-----------------------------------------------------------------")

    ##--Try to import chardet.
    ##--If that package is not installed, print out a warning and use the default
    try:
        from chardet.universaldetector import UniversalDetector
        detector = UniversalDetector()
    except ImportError as error:
        print("FAILED: chardet not installed", file=sys.stderr)
        print("IT IS HIGHLY RECOMMENDED THAT YOU INSTALL THE chardet PACKAGE",
              file=sys.stderr)
        print("or manually specify the file encoding of the training set via "
              "the command line", file=sys.stderr)
        print("You can download chardet from https://pypi.python.org/pypi/chardet",
              file=sys.stderr)
        print("Defaulting as " + default, file=sys.stderr)
        file_encoding.append(default)
        return True

    ##--Read through up to the number specified in 'max_passwords' to identify the character encoding
    try:
        cur_count = 0
        with open(training_file, 'rb') as file:
            for line in file.readlines():
                detector.feed(line)
                if detector.done:
                    break
                cur_count = cur_count + 1
                if cur_count >= max_passwords:
                    break
            detector.close()
    except IOError as error:
        print("Error opening file " + training_file)
        print("Error is " + str(error))
        return False

    try:
        file_encoding.append(detector.result['encoding'])
        print("File Encoding Detected: " + str(detector.result['encoding']),
              file=sys.stderr)
        print("Confidence for file encoding: " +
              str(detector.result['confidence']), file=sys.stderr)
        print("If you think another file encoding might have been used please "
              "manually specify the file encoding and run the training program again",
              file=sys.stderr)
        print()
    except KeyError as error:
        print("Error encountered with file encoding autodetection",
              file=sys.stderr)
        print("Error : " + str(error))
        return False
    return True
def File_import(Folder_path):
    df_list = []
    detector = UniversalDetector()
    file_list = glob(os.path.join(Folder_path, '*.csv'))
    for f in file_list:
        detector.reset()
        for line in open(f, 'rb'):
            detector.feed(line)
            if detector.done:
                break
        detector.close()
        print(detector.result['encoding'])
        try:
            try:
                _df = pd.DataFrame()
                _df = pd.read_csv(f, header=1, skipfooter=1,
                                  encoding=detector.result['encoding'],
                                  engine='python')
            except:
                _df = pd.DataFrame()
                _df = pd.read_csv(f, header=1, skipfooter=1,
                                  encoding='cp932', engine='python')
        except:
            print(f)
            sys.exit('error: cannot open.')
        if _df.shape[1] == 1:
            try:
                _df = pd.DataFrame()
                _df = pd.read_csv(f, header=0,
                                  encoding=detector.result['encoding'])
            except:
                _df = pd.DataFrame()
                _df = pd.read_csv(f, header=0,
                                  encoding=detector.result['encoding'])
        df_list.append(_df)

    df_165 = pd.DataFrame()
    df_167 = pd.DataFrame()
    df_append = pd.DataFrame()
    for df in df_list:
        if df.shape[1] == 104:
            df_165 = pd.concat([df_165, df], axis=0)
        elif df.shape[1] == 101:
            df_167 = pd.concat([df_167, df], axis=0)
        elif df.shape[1] == 1:
            df_append = pd.concat([df_append, df], axis=0)
            if set(df_167.columns) != set(df.columns):
                print(df_167.columns)
                print(df.columns)
    df_165.reset_index(inplace=True)
    df_167.reset_index(inplace=True)
    print('165:{0}, 167:{1}, append:{2}'.format(df_165.shape, df_167.shape,
                                                df_append.shape))
    if df_165.shape[0] == 0 and df_167.shape[0] == 0:
        sys.exit('error: 0 csv file import.')
    return df_165, df_167, df_append
def iter_files(old_kw, new_kw, root_dir, **kwargs):
    """
    Walk the root directory.

    :param old_kw: old keyword
    :param new_kw: new keyword
    :param root_dir: absolute path of the directory
    :param kwargs: custom options
    """
    if not old_kw:
        raise Exception("The original keyword is empty; please provide one!")
    isdir = os.path.isdir(root_dir)
    if not isdir:
        raise Exception("Directory not found! Please check that the path is correct!")

    keywords_dict = {}
    for k, v in kwargs.items():
        keywords_dict[k] = v

    detector = UniversalDetector()
    for root, dirs, files in os.walk(root_dir, topdown=False):
        # Replace file contents
        if keywords_dict['content'] == 1:
            # Walk the innermost level first, then work upwards
            for file_name in files:
                old_file_path = os.path.join(root, file_name)
                file_data = ""
                # Detect the file's encoding; reset the detector first
                detector.reset()
                with open(old_file_path, 'rb') as file:
                    for line in file.readlines():
                        detector.feed(line)
                        if detector.done:
                            break
                # Finalize the detection before reading the result (the
                # original omitted this close() call, which can leave the
                # result incomplete).
                detector.close()
                curr_encode = detector.result['encoding']
                # If the string to replace occurs in the file contents,
                # read the file line by line and substitute
                with open(old_file_path, 'r', encoding=curr_encode,
                          errors='ignore') as f:
                    for line in f.readlines():
                        new_line = line.replace(old_kw, new_kw)
                        file_data += new_line
                with open(old_file_path, 'w', encoding=curr_encode,
                          errors='ignore') as f:
                    f.write(file_data)

        # Rename files
        if keywords_dict['filename'] == 1:
            for file_name in files:
                old_file_path = os.path.join(root, file_name)
                # If the string to replace occurs in the file name, rename it
                if old_kw in file_name:
                    new_file_name = file_name.replace(old_kw, new_kw)
                    new_file_path = os.path.join(root, new_file_name)
                    os.rename(old_file_path, new_file_path)

        # Rename directories
        if keywords_dict['dirname'] == 1:
            for dir_name in dirs:
                old_dir_path = os.path.join(root, dir_name)
                # If the string to replace occurs in the directory name, rename it
                if old_kw in dir_name:
                    new_dir_name = dir_name.replace(old_kw, new_kw)
                    new_dir_path = os.path.join(root, new_dir_name)
                    os.rename(old_dir_path, new_dir_path)
def opener(file, prefix='texts/'):
    detector = UniversalDetector()
    detector.feed(open(prefix + file, 'rb').read())
    detector.close()
    return open(prefix + file, encoding=detector.result['encoding']).read()
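# A possible variant of opener() above (a sketch, not from the original
# source): it reads the bytes once and closes the handle explicitly, instead
# of opening the file twice and leaving both handles to the garbage collector.
def opener_once(file, prefix='texts/'):
    detector = UniversalDetector()
    with open(prefix + file, 'rb') as fh:
        data = fh.read()
    detector.feed(data)
    detector.close()
    return data.decode(detector.result['encoding'])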
def parse_csv(self, file_id, skip_header=False):
    """
    Attempt to parse a previously uploaded file as a table or spreadsheet.
    Generate rows as they're requested.

    @type file_id: unicode
    @param file_id: id of the file to parse
    @type skip_header: bool
    @param skip_header: if a line of header labels is detected, don't include
                        it in the generated rows (defaults to False)
    @rtype: generator
    @return: rows of data from the parsed file. each row is a list of elements
    @raise Parse_error: there was an error in parsing the given file
    """
    APPROX_SNIFF_SAMPLE_SIZE_BYTES = 1024 * 50

    try:
        import csv
        table_file = Upload_file.open_file(file_id)
        # necessary in case the file is opened by another call to parse_csv()
        table_file.seek(0)
        sniffer = csv.Sniffer()

        # attempt to determine the presence of a header
        lines = table_file.readlines(APPROX_SNIFF_SAMPLE_SIZE_BYTES)
        sniff_sample = "".join(lines)
        has_header = sniffer.has_header(sniff_sample)

        # attempt to determine the file's character encoding
        detector = UniversalDetector()
        for line in lines:
            detector.feed(line)
            if detector.done:
                break
        detector.close()
        encoding = detector.result.get("encoding")

        table_file.seek(0)
        reader = csv.reader(table_file)

        # skip the header if requested to do so
        if has_header and skip_header:
            reader.next()

        expected_row_length = None
        for row in reader:
            # all rows must have the same number of elements
            current_row_length = len(row)
            if current_row_length == 0:
                continue
            if expected_row_length and current_row_length != expected_row_length:
                raise Parse_error()
            else:
                expected_row_length = current_row_length
            yield [element.decode(encoding) for element in row]
    except (csv.Error, IOError, TypeError):
        raise Parse_error()
def parse_csv(myfile, newsletter, ignore_errors=False):
    from newsletter.addressimport.csv_util import UnicodeReader
    import codecs
    import csv

    # Detect encoding
    from chardet.universaldetector import UniversalDetector
    detector = UniversalDetector()
    for line in myfile.readlines():
        detector.feed(line)
        if detector.done:
            break
    detector.close()
    charset = detector.result['encoding']

    # Reset the file index
    myfile.seek(0)

    # Attempt to detect the dialect
    encodedfile = codecs.EncodedFile(myfile, charset)
    dialect = csv.Sniffer().sniff(encodedfile.read(1024))

    # Reset the file index
    myfile.seek(0)

    logger.info('Detected encoding %s and dialect %s for CSV file',
                charset, dialect)

    myreader = UnicodeReader(myfile, dialect=dialect, encoding=charset)

    firstrow = myreader.next()

    # Find name column
    colnum = 0
    namecol = None
    for column in firstrow:
        if "name" in column.lower() or ugettext("name") in column.lower():
            namecol = colnum
            if "display" in column.lower() or \
                    ugettext("display") in column.lower():
                break
        colnum += 1

    if namecol is None:
        raise forms.ValidationError(
            _("Name column not found. The name of this column should be "
              "either 'name' or '%s'.") % ugettext("name"))

    logger.debug("Name column found: '%s'", firstrow[namecol])

    # Find email column
    colnum = 0
    mailcol = None
    for column in firstrow:
        if 'email' in column.lower() or \
                'e-mail' in column.lower() or \
                ugettext("e-mail") in column.lower():
            mailcol = colnum
            break
        colnum += 1

    if mailcol is None:
        raise forms.ValidationError(
            _("E-mail column not found. The name of this column should be "
              "either 'email', 'e-mail' or '%(email)s'.") %
            {'email': ugettext("e-mail")})

    logger.debug("E-mail column found: '%s'", firstrow[mailcol])

    # assert namecol != mailcol, \
    #     'Name and e-mail column should not be the same.'
    if namecol == mailcol:
        raise forms.ValidationError(
            _("Could not properly determine the proper columns in the "
              "CSV-file. There should be a field called 'name' or "
              "'%(name)s' and one called 'e-mail' or '%(e-mail)s'.") % {
                "name": _("name"),
                "e-mail": _("e-mail")
            })

    logger.debug('Extracting data.')

    addresses = {}
    for row in myreader:
        if not max(namecol, mailcol) < len(row):
            logger.warn("Column count does not match for row number %d",
                        myreader.line_num, extra=dict(data={'row': row}))
            if ignore_errors:
                # Skip this record
                continue
            else:
                raise forms.ValidationError(
                    _("Row with content '%(row)s' does not contain a name and "
                      "email field.") % {'row': row})

        name = check_name(row[namecol], ignore_errors)
        email = check_email(row[mailcol], ignore_errors)

        logger.debug("Going to add %s <%s>", name, email)

        try:
            validate_email(email)
            addr = make_subscription(newsletter, email, name)
        except ValidationError:
            if ignore_errors:
                logger.warn(
                    "Entry '%s' at line %d does not contain a valid "
                    "e-mail address.",
                    name, myreader.line_num, extra=dict(data={'row': row}))
            else:
                raise forms.ValidationError(
                    _("Entry '%s' does not contain a valid "
                      "e-mail address.") % name)

        if addr:
            if email in addresses:
                logger.warn(
                    "Entry '%s' at line %d contains a "
                    "duplicate entry for '%s'",
                    name, myreader.line_num, email,
                    extra=dict(data={'row': row}))
                if not ignore_errors:
                    raise forms.ValidationError(
                        _("The address file contains duplicate entries "
                          "for '%s'.") % email)
            addresses.update({email: addr})
        else:
            logger.warn(
                "Entry '%s' at line %d is already subscribed to "
                "with email '%s'",
                name, myreader.line_num, email,
                extra=dict(data={'row': row}))
            if not ignore_errors:
                raise forms.ValidationError(
                    _("Some entries are already subscribed to."))

    return addresses
def sendData(self):
    """Load files, create and send segmentation"""

    # Check that there's something on input...
    if ((self.displayAdvancedSettings and not self.files) or not
            (self.file or self.displayAdvancedSettings)):
        self.infoBox.setText(u'Please select input file.', 'warning')
        self.send('Text data', None, self)
        return

    # Check that autoNumberKey is not empty (if necessary)...
    if self.displayAdvancedSettings and self.autoNumber:
        if self.autoNumberKey:
            autoNumberKey = self.autoNumberKey
        else:
            self.infoBox.setText(
                u'Please enter an annotation key for auto-numbering.',
                'warning')
            self.send('Text data', None, self)
            return
    else:
        autoNumberKey = None

    # Clear created Inputs...
    self.clearCreatedInputs()

    fileContents = list()
    annotations = list()
    counter = 1

    if self.displayAdvancedSettings:
        myFiles = self.files
    else:
        myFiles = [[self.file, self.encoding, u'', u'']]

    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=len(myFiles))

    # Open and process each file successively...
    for myFile in myFiles:
        filePath = myFile[0]
        encoding = myFile[1]
        encoding = re.sub(r"[ ]\(.+", "", encoding)
        annotation_key = myFile[2]
        annotation_value = myFile[3]

        # Try to open the file...
        self.error()
        try:
            if encoding == "(auto-detect)":
                detector = UniversalDetector()
                fh = open(filePath, 'rb')
                for line in fh:
                    detector.feed(line)
                    if detector.done:
                        break
                detector.close()
                fh.close()
                encoding = detector.result['encoding']
            fh = open(
                filePath,
                mode='rU',
                encoding=encoding,
            )
            try:
                fileContent = ""
                i = 0
                chunks = list()
                for chunk in iter(lambda: fh.read(CHUNK_LENGTH), ""):
                    chunks.append('\n'.join(chunk.splitlines()))
                    i += CHUNK_LENGTH
                    if i % (CHUNK_NUM * CHUNK_LENGTH) == 0:
                        fileContent += "".join(chunks)
                        chunks = list()
                if len(chunks):
                    fileContent += "".join(chunks)
            except UnicodeError:
                progressBar.finish()
                if len(myFiles) > 1:
                    message = u"Please select another encoding " \
                              + u"for file %s." % filePath
                else:
                    message = u"Please select another encoding."
                self.infoBox.setText(message, 'error')
                self.send('Text data', None, self)
                self.controlArea.setDisabled(False)
                return
            finally:
                fh.close()
        except IOError:
            progressBar.finish()
            if len(myFiles) > 1:
                message = u"Couldn't open file '%s'." % filePath
            else:
                message = u"Couldn't open file."
            self.infoBox.setText(message, 'error')
            self.send('Text data', None, self)
            self.controlArea.setDisabled(False)
            return

        # Remove utf-8 BOM if necessary...
        if encoding == u'utf-8':
            fileContent = fileContent.lstrip(
                codecs.BOM_UTF8.decode('utf-8'))

        # Normalize text (canonical decomposition then composition)...
        fileContent = normalize('NFC', fileContent)

        fileContents.append(fileContent)

        # Annotations...
        annotation = dict()
        if self.displayAdvancedSettings:
            if annotation_key and annotation_value:
                annotation[annotation_key] = annotation_value
            if self.importFilenames and self.importFilenamesKey:
                filename = os.path.basename(filePath)
                annotation[self.importFilenamesKey] = filename
            if self.autoNumber and self.autoNumberKey:
                annotation[self.autoNumberKey] = counter
                counter += 1
        annotations.append(annotation)
        progressBar.advance()

    # Create an LTTL.Input for each file...
    if len(fileContents) == 1:
        label = self.captionTitle
    else:
        label = None
    for index in range(len(fileContents)):
        myInput = Input(fileContents[index], label)
        segment = myInput[0]
        segment.annotations.update(annotations[index])
        myInput[0] = segment
        self.createdInputs.append(myInput)

    # If there's only one file, the widget's output is the created Input.
    if len(fileContents) == 1:
        self.segmentation = self.createdInputs[0]
    # Otherwise the widget's output is a concatenation...
    else:
        self.segmentation = Segmenter.concatenate(
            segmentations=self.createdInputs,
            label=self.captionTitle,
            copy_annotations=True,
            import_labels_as=None,
            sort=False,
            auto_number_as=None,
            merge_duplicates=False,
            progress_callback=None,
        )

    message = u'%i segment@p sent to output ' % len(self.segmentation)
    message = pluralize(message, len(self.segmentation))
    numChars = 0
    for segment in self.segmentation:
        segmentLength = len(Segmentation.get_data(segment.str_index))
        numChars += segmentLength
    message += u'(%i character@p).' % numChars
    message = pluralize(message, numChars)
    self.infoBox.setText(message)
    progressBar.finish()
    self.controlArea.setDisabled(False)

    self.send('Text data', self.segmentation, self)
    self.sendButton.resetSettingsChangedFlag()
def GetFileBody(self, get):
    if sys.version_info[0] == 2:
        get.path = get.path.encode('utf-8')
    if not os.path.exists(get.path):
        if get.path.find('rewrite') == -1:
            return public.returnMsg(False, 'FILE_NOT_EXISTS', (get.path,))
        public.writeFile(get.path, '')
    if os.path.getsize(get.path) > 2097152:
        return public.returnMsg(False, 'CANT_EDIT_ONLINE_FILE')
    fp = open(get.path, 'rb')
    data = {}
    data['status'] = True
    try:
        if fp:
            from chardet.universaldetector import UniversalDetector
            detector = UniversalDetector()
            srcBody = b""
            for line in fp.readlines():
                detector.feed(line)
                srcBody += line
            detector.close()
            char = detector.result
            data['encoding'] = char['encoding']
            if char['encoding'] == 'GB2312' or not char['encoding'] or \
                    char['encoding'] == 'TIS-620' or char['encoding'] == 'ISO-8859-9':
                data['encoding'] = 'GBK'
            if char['encoding'] == 'ascii' or char['encoding'] == 'ISO-8859-1':
                data['encoding'] = 'utf-8'
            if char['encoding'] == 'Big5':
                data['encoding'] = 'BIG5'
            if not char['encoding'] in ['GBK', 'utf-8', 'BIG5']:
                data['encoding'] = 'utf-8'
            try:
                if sys.version_info[0] == 2:
                    data['data'] = srcBody.decode(data['encoding']).encode(
                        'utf-8', errors='ignore')
                else:
                    data['data'] = srcBody.decode(data['encoding'])
            except:
                data['encoding'] = char['encoding']
                if sys.version_info[0] == 2:
                    data['data'] = srcBody.decode(data['encoding']).encode(
                        'utf-8', errors='ignore')
                else:
                    data['data'] = srcBody.decode(data['encoding'])
        else:
            if sys.version_info[0] == 2:
                data['data'] = srcBody.decode('utf-8').encode('utf-8')
            else:
                data['data'] = srcBody.decode('utf-8')
            data['encoding'] = u'utf-8'
        return data
    except Exception as ex:
        return public.returnMsg(False, 'INCOMPATIBLE_FILECODE', (str(ex),))
def getEncodingByContent(content):
    detector = UniversalDetector()
    detector.feed(content)
    detector.close()
    return detector.result["encoding"]
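# Note that feed() expects bytes, so callers pass raw (undecoded) content. A
# usage sketch for the function above (the file name is hypothetical):
with open('page.html', 'rb') as fh:  # binary mode: feed() wants bytes
    raw = fh.read()
print(getEncodingByContent(raw))  # e.g. 'utf-8'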
def main(argv):
    # Collect command options
    try:
        opts, args = getopt.getopt(argv, "ri:o:", ["ifile=", "odir="])
    except getopt.GetoptError:
        print('Failed with arguments', argv)
        sys.exit(2)

    # Read command options
    input_file = ''
    output_directory = ''
    recurse = False
    for opt, arg in opts:
        if opt == '-r':
            recurse = True
        elif opt in ("-i", "--ifile"):
            input_file = arg
        elif opt in ("-o", "--odir"):
            output_directory = arg

    # Set default values
    if input_file == '':
        input_file = '.'
    if output_directory == '':
        output_directory = './yeet/'

    # Determine absolute paths
    input_path = os.path.abspath(input_file)
    output_directory = os.path.abspath(output_directory)

    # Make sure the input file or directory exists
    if not os.path.exists(input_path):
        print('The input path {} does not exist.'.format(input_path))
        sys.exit(1)

    # Determine which files will be used as input
    inputs = []
    if os.path.isdir(input_path):
        input_directory = input_path
        for extension in ('h', 'hpp', 'c', 'cpp'):
            if recurse:
                # Check input and sub directories
                path = os.path.join(input_path, '**/*.' + extension)
                files = glob.glob(path, recursive=True)
            else:
                # Check input directory only
                path = os.path.join(input_path, '*.' + extension)
                files = glob.glob(path)
            inputs.extend(files)
    else:
        input_directory = os.path.dirname(input_path)
        inputs.append(input_path)

    yeet_table = {}
    yeet_generator = YeetGenerator()
    detector = UniversalDetector()

    for input_file in inputs:
        print('Yeeting {}'.format(input_file))

        # Attempt to find the encoding of the file that is being read.
        try:
            detector.reset()
            for line in open(input_file, 'rb'):
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
        except UnicodeDecodeError:
            print('Could not decode file {}'.format(input_file))
            detector.close()
            continue

        try:
            with open(input_file, "r", encoding=detector.result['encoding']) as fi:
                file_string = fi.read()
        except UnicodeDecodeError:
            print('Could not read file {} \n Expected encode was: {}'.format(
                input_file, detector.result['encoding']))
            continue

        yeeted_file_string = yeet_file(file_string, yeet_table, yeet_generator)
        file_name = input_file.replace(input_directory, '')[1:]
        output_file_path = os.path.join(output_directory, file_name)
        try:
            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
            yeet_header_path = get_yeet_path(output_directory, output_file_path)
            with open(output_file_path, "w") as fo:
                fo.write('#include "{}"\n'.format(yeet_header_path))
                fo.write(yeeted_file_string)
        except OSError as e:
            print('Could not yeet file {} to {}'.format(input_file, output_file_path))
            print(e)

    # Output header file with the macro definitions for all of the input files
    try:
        yeetfile = os.path.join(output_directory, 'yeet.h')
        with open(yeetfile, "w") as fo:
            for token in yeet_table.keys():
                fo.write('#define {} {}\n'.format(yeet_table[token], token))
    except OSError as e:
        print('Could not yeet file yeet.h to {}'.format(yeetfile))
        print(e)
import codecs
import textract
from chardet.universaldetector import UniversalDetector
import os
import freeling
import sys

DETECTOR = UniversalDetector()


def convert_to_utf8(filename):
    global DETECTOR
    DETECTOR.reset()
    with open(filename, 'rb') as f:
        start = f.read(3)
        f.seek(0)
        for line in f:
            DETECTOR.feed(line)
            if DETECTOR.done:
                break
    DETECTOR.close()
    encoding = DETECTOR.result["encoding"]
    if encoding != "UTF-8":
        os.system('iconv -f %s -t UTF-8 "%s" > "%s.utf8"' % (encoding, filename, filename))
        os.system('mv "%s.utf8" "%s"' % (filename, filename))
    elif start == codecs.BOM_UTF8:
        os.system('tail --bytes=+4 "%s" > "%s.utf8wobom"' % (filename, filename))
        os.system('mv "%s.utf8wobom" "%s"' % (filename, filename))
def determineEncoding(self, chardet=True):
    # BOMs take precedence over everything
    # This will also read past the BOM if present
    charEncoding = self.detectBOM(), "certain"
    if charEncoding[0] is not None:
        return charEncoding

    # If we've been overridden, we've been overridden
    charEncoding = lookupEncoding(self.override_encoding), "certain"
    if charEncoding[0] is not None:
        return charEncoding

    # Now check the transport layer
    charEncoding = lookupEncoding(self.transport_encoding), "certain"
    if charEncoding[0] is not None:
        return charEncoding

    # Look for meta elements with encoding information
    charEncoding = self.detectEncodingMeta(), "tentative"
    if charEncoding[0] is not None:
        return charEncoding

    # Parent document encoding
    charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
    if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
        return charEncoding

    # "likely" encoding
    charEncoding = lookupEncoding(self.likely_encoding), "tentative"
    if charEncoding[0] is not None:
        return charEncoding

    # Guess with chardet, if available
    if chardet:
        try:
            from chardet.universaldetector import UniversalDetector
        except ImportError:
            pass
        else:
            buffers = []
            detector = UniversalDetector()
            while not detector.done:
                buffer = self.rawStream.read(self.numBytesChardet)
                assert isinstance(buffer, bytes)
                if not buffer:
                    break
                buffers.append(buffer)
                detector.feed(buffer)
            detector.close()
            encoding = lookupEncoding(detector.result['encoding'])
            self.rawStream.seek(0)
            if encoding is not None:
                return encoding, "tentative"

    # Try the default encoding
    charEncoding = lookupEncoding(self.default_encoding), "tentative"
    if charEncoding[0] is not None:
        return charEncoding

    # Fallback to html5lib's default if even that hasn't worked
    return lookupEncoding("windows-1252"), "tentative"
import os
import urllib.request
from bs4 import BeautifulSoup
import json
from chardet.universaldetector import UniversalDetector
# https://chardet.readthedocs.io/en/latest/usage.html#example-using-the-detect-function
import html5lib  # for BeautifulSoup parser

encode_detector = UniversalDetector()

if not os.path.isfile('./config/FSF-licenses-full.json'):
    try:
        with urllib.request.urlopen(
                'https://wking.github.io/fsf-api/licenses-full.json') as res:
            body = res.read()
            encode_detector.reset()
            encode_detector.feed(body)
            if encode_detector.done:
                encode_detector.close()
                raw_doc = body.decode(
                    encode_detector.result['encoding'],
                    errors='ignore')  # .encode('utf-8', 'ignore')
            else:
                encode_detector.close()
                raw_doc = body.decode('utf-8', errors='ignore')
            f = open("./config/FSF-licenses-full.json", "w", encoding='utf-8')
            f.write(raw_doc)
            f.close()
            license_metaData = json.loads(raw_doc)
    except urllib.error.HTTPError as err:
        print('licenses.json get failed', err)
        exit(1)
    except urllib.error.URLError as err:
import urllib
from chardet.universaldetector import UniversalDetector

usock = urllib.urlopen('file:///C:/tmp/enem/sql-CRIACAO-TABELAS.SQL')
detector = UniversalDetector()
for line in usock.readlines():
    detector.feed(line)
    if detector.done:
        break
detector.close()
usock.close()
print detector.result
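# The snippet above is Python 2 (urllib.urlopen, print statement). A Python 3
# equivalent would be a sketch along these lines, using the same local file URL:
import urllib.request
from chardet.universaldetector import UniversalDetector

with urllib.request.urlopen('file:///C:/tmp/enem/sql-CRIACAO-TABELAS.SQL') as usock:
    detector = UniversalDetector()
    for line in usock:
        detector.feed(line)
        if detector.done:
            break
    detector.close()
print(detector.result)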
def show_encode(text):
    detector = UniversalDetector()
    detector.feed(text)
    detector.close()
    return detector.result['encoding']