def reader(csv, skip_guess_encoding=anycsvconfig.SKIP_GUESS_ENCODING, delimiter=None,
           sniff_lines=anycsvconfig.NO_SNIFF_LINES,
           max_file_size=anycsvconfig.MAX_FILE_SIZE,
           encoding=anycsvconfig.DEFAULT_ENCODING):
    """Open a CSV input, settle on an encoding and dialect, and return a ``Table``.

    Args:
        csv: The CSV input; handed unchanged to ``BufferedAutoEncodingStream``
            and to ``Table``. A falsy value is rejected.
        skip_guess_encoding: When true, no detection is run and ``encoding``
            is used as-is.
        delimiter: Accepted for interface compatibility; this function does
            not read it.
        sniff_lines: Used as the stream's ``max_buffer`` and as ``max_lines``
            for both encoding-detection calls.
        max_file_size: Forwarded to ``BufferedAutoEncodingStream``.
        encoding: Encoding used when detection is skipped.

    Returns:
        ``Table(csv, stream, dialect, encoding=final_encoding)``.

    Raises:
        exceptions.AnyCSVException: If ``csv`` is falsy.
    """
    if not csv:
        raise exceptions.AnyCSVException('No CSV input specified')

    stream = BufferedAutoEncodingStream(
        csv, max_buffer=sniff_lines, max_file_size=max_file_size)

    if skip_guess_encoding:
        final_encoding = encoding
    else:
        # The stream is reset after every pass so the next consumer starts
        # from the beginning again.
        detected = E.detect_encoding(stream, min_lines=10, max_lines=sniff_lines)
        stream.reset()
        final_encoding = E.prob_encoding(stream, detected, max_lines=sniff_lines)
        stream.reset()

    guessed_dialect = D.guessDialect(stream, final_encoding)
    stream.reset()

    return Table(csv, stream, guessed_dialect, encoding=final_encoding)
def test_single_file():
    """Guess the dialect of one local file and compare its delimiter.

    The input path is hard-coded and machine-specific; the stream is decoded
    as ``cp737`` and the guessed delimiter must equal the delimiter of
    ``csv.unix_dialect``.
    """
    # Aliased so the stdlib module is never confused with the path variable.
    import csv as stdlib_csv

    sample_path = "/Users/jumbrich/data/mimesis_csvs/encoding/latin.csv"
    stream = BufferedAutoEncodingStream(sample_path, max_buffer=50)
    guessed = D.guessDialect(stream, 'cp737')

    assert guessed.delimiter == stdlib_csv.unix_dialect.delimiter
def test_file_gzipped(tmpdir):
    """Guess the dialect of a generated gzipped table and compare its delimiter.

    A 200-row gzipped table is created via ``_create_table`` under
    ``tmpdir/tmp.csvs/data``; the stream is read as ``utf-8`` and the guessed
    delimiter must equal the delimiter of ``csv.unix_dialect``.
    """
    # Aliased so the stdlib module is never confused with the table variable.
    import csv as stdlib_csv

    data_dir = tmpdir.mkdir("tmp.csvs").mkdir("data")
    table_file = _create_table(data_dir, rows=200, gzipped=True)

    stream = BufferedAutoEncodingStream(table_file, max_buffer=50)
    guessed = D.guessDialect(stream, 'utf-8')

    assert guessed.delimiter == stdlib_csv.unix_dialect.delimiter
def extract_csv_meta(header, content=None, id='', skip_guess_encoding=False):
    """Collect encoding and dialect metadata for a piece of CSV content.

    Args:
        header: Passed to ``encoding.guessEncoding`` together with ``content``.
        content: The raw content. It is decoded with each candidate encoding
            when guessing is enabled, and passed through undecoded otherwise.
        id: Label that prefixes every log message emitted here.
        skip_guess_encoding: When true, no guessing is attempted and
            ``DEFAULT_ENCODING`` is recorded as the encoding in use.

    Returns:
        A dict with ``'used_enc'`` (the encoding used, or ``None`` if no
        candidate decoded the content), ``'dialect'`` (the guessed dialect, or
        ``{}`` if guessing failed) and, when guessing ran, ``'enc'`` (the raw
        result of ``encoding.guessEncoding``).
    """
    logger = logging.getLogger(__name__)
    results = {'used_enc': None, 'dialect': {}}

    # Check whether encoding guessing is possible at all.
    if not skip_guess_encoding:
        try:
            import anycsv.encoding
        except Exception:
            print(
                'Could not import "magic" library. To support encoding detection please install python-magic.'
            )
            skip_guess_encoding = True

    # Determine the encoding.
    if skip_guess_encoding:
        results['used_enc'] = DEFAULT_ENCODING
        # The content is handed to the dialect guesser undecoded here.
        content_encoded = content
        status = "META encoding"
    else:
        results['enc'] = encoding.guessEncoding(content, header)
        content_encoded = None
        status = "META "
        c_enc = None
        # Try the candidate encodings in priority order; the first one that
        # decodes the content wins.
        for k in ENC_PRIORITY:
            candidate = None
            try:
                if k in results['enc']:
                    candidate = results['enc'][k]['encoding']
                    if candidate is not None:
                        content_encoded = content.decode(encoding=candidate)
                        c_enc = candidate
                        status += " encoding"
                        break
            except Exception as e:
                # Log id first, then the encoding, matching the format string.
                # Using the cached candidate keeps this handler from raising.
                logger.debug('(%s) ERROR Tried %s encoding: %s', id, candidate, e)
        if content_encoded:
            results['used_enc'] = c_enc

    # Guess the dialect; any failure degrades to an empty dialect.
    try:
        results['dialect'] = dialect.guessDialect(content_encoded)
        status += " dialect"
    except Exception as e:
        # Exceptions have no ``.message`` attribute on Python 3; log the
        # exception itself so this fallback cannot raise.
        logger.warning('(%s) %s', id, e)
        results['dialect'] = {}

    logger.debug("(%s) %s", id, status)
    return results
def extract_csv_meta(header, content=None, id='', skip_guess_encoding=False):
    """Guess the CSV dialect of UTF-8 encoded ``content``.

    Args:
        header: Not read by this implementation.
        content: Object whose ``decode("utf-8")`` result is handed to
            ``dialect.guessDialect``.
        id: Label that prefixes the warning logged on failure.
        skip_guess_encoding: Not read by this implementation.

    Returns:
        ``{'dialect': <guessed dialect>}``, or ``{'dialect': {}}`` when
        decoding or dialect guessing raises.
    """
    log = logging.getLogger(__name__)

    try:
        guessed = dialect.guessDialect(content.decode("utf-8"))
    except Exception as err:
        # Best effort: report the failure and fall back to an empty dialect.
        log.warning('(%s) %s', id, err.args)
        guessed = {}

    return {'dialect': guessed}