예제 #1
0
def reader(csv,
           skip_guess_encoding=anycsvconfig.SKIP_GUESS_ENCODING,
           delimiter=None,
           sniff_lines=anycsvconfig.NO_SNIFF_LINES,
           max_file_size=anycsvconfig.MAX_FILE_SIZE,
           encoding=anycsvconfig.DEFAULT_ENCODING):
    """Wrap *csv* in a buffered stream, pick an encoding and dialect, and
    return a ``Table`` built from them.

    Args:
        csv: The CSV input; any falsy value is rejected.
        skip_guess_encoding: When true, *encoding* is used as-is and no
            detection is run.
        delimiter: Accepted but not used anywhere in this function.
        sniff_lines: Used as the stream's ``max_buffer`` and as ``max_lines``
            for both encoding-detection calls.
        max_file_size: Forwarded to ``BufferedAutoEncodingStream``.
        encoding: Encoding used when detection is skipped.

    Returns:
        ``Table(csv, stream, dialect, encoding=<chosen encoding>)``.

    Raises:
        exceptions.AnyCSVException: If *csv* is falsy.
    """
    if not csv:
        raise exceptions.AnyCSVException('No CSV input specified')

    stream = BufferedAutoEncodingStream(
        csv, max_buffer=sniff_lines, max_file_size=max_file_size)

    if skip_guess_encoding:
        chosen_encoding = encoding
    else:
        # The stream is reset after every pass so the next consumer starts
        # from the beginning of the buffered input again.
        candidates = E.detect_encoding(
            stream, min_lines=10, max_lines=sniff_lines)
        stream.reset()

        chosen_encoding = E.prob_encoding(
            stream, candidates, max_lines=sniff_lines)
        stream.reset()

    guessed_dialect = D.guessDialect(stream, chosen_encoding)
    stream.reset()

    return Table(csv, stream, guessed_dialect, encoding=chosen_encoding)
예제 #2
0
def test_single_file():
    """Guess the dialect of a fixed local file, reading it as cp737, and
    check the delimiter against ``csv.unix_dialect``.

    NOTE(review): the input is an absolute path on one developer's machine,
    so this only runs where that file exists.
    """
    import csv

    source_path = "/Users/jumbrich/data/mimesis_csvs/encoding/latin.csv"
    stream = BufferedAutoEncodingStream(source_path, max_buffer=50)

    guessed = D.guessDialect(stream, 'cp737')

    assert guessed.delimiter == csv.unix_dialect.delimiter
예제 #3
0
def test_file_gzipped(tmpdir):
    """Guess the dialect of a table produced by ``_create_table`` with
    ``gzipped=True`` and check the delimiter against ``csv.unix_dialect``.

    Args:
        tmpdir: Directory object offering ``mkdir`` (presumably pytest's
            ``tmpdir`` fixture -- confirm against the test setup).
    """
    import csv

    data_dir = tmpdir.mkdir("tmp.csvs").mkdir("data")
    table_file = _create_table(data_dir, rows=200, gzipped=True)

    stream = BufferedAutoEncodingStream(table_file, max_buffer=50)

    guessed = D.guessDialect(stream, 'utf-8')

    assert guessed.delimiter == csv.unix_dialect.delimiter
예제 #4
0
def extract_csv_meta(header, content=None, id='', skip_guess_encoding=False):
    """Determine the encoding and CSV dialect of *content*.

    Args:
        header: Forwarded to ``encoding.guessEncoding`` together with
            *content* (presumably response headers -- confirm with callers).
        content: The raw content. With guessing enabled it is decoded using
            the candidate encodings; otherwise it is handed to the dialect
            guesser unchanged.
        id: Identifier used only as a prefix in log messages.
        skip_guess_encoding: When true, no detection is run and
            ``DEFAULT_ENCODING`` is recorded as the used encoding.

    Returns:
        A dict with:
          * ``'used_enc'``: the encoding that was applied, or ``None`` when
            no candidate could decode the content;
          * ``'dialect'``: the result of ``dialect.guessDialect``, or ``{}``
            when guessing failed;
          * ``'enc'``: the raw ``encoding.guessEncoding`` result (only
            present when guessing ran).
    """
    logger = logging.getLogger(__name__)
    results = {'used_enc': None, 'dialect': {}}

    # Detection is best-effort: if its module cannot be imported we fall back
    # to the default encoding instead of failing.
    # NOTE(review): the message mentions python-magic while the import being
    # probed is anycsv.encoding -- confirm that this is the intended hint.
    if not skip_guess_encoding:
        try:
            import anycsv.encoding  # noqa: F401 -- availability probe only
        except Exception:
            print(
                'Could not import "magic" library. To support encoding detection please install python-magic.'
            )
            skip_guess_encoding = True

    if skip_guess_encoding:
        results['used_enc'] = DEFAULT_ENCODING
        content_encoded = content
        status = "META encoding"
    else:
        results['enc'] = encoding.guessEncoding(content, header)

        content_encoded = None
        status = "META "
        # Try the guesses in priority order and keep the first that decodes.
        for k in ENC_PRIORITY:
            if k not in results['enc']:
                continue
            candidate = results['enc'][k]['encoding']
            if candidate is None:
                continue
            try:
                content_encoded = content.decode(encoding=candidate)
            except Exception as e:
                # Arguments follow the format string: id first, then the
                # encoding that was tried.
                logger.debug('(%s) ERROR Tried %s encoding: %s',
                             id, candidate, e)
                continue
            # Recorded on every successful decode, including empty content
            # (an empty decoded string is falsy but still a valid result).
            results['used_enc'] = candidate
            status += " encoding"
            break

    try:
        results['dialect'] = dialect.guessDialect(content_encoded)
        status += " dialect"
    except Exception as e:
        # Log the exception itself: Python 3 exceptions have no ``.message``
        # attribute, so reading it here would raise inside the handler.
        logger.warning('(%s)  %s', id, e)
        results['dialect'] = {}

    logger.debug("(%s) %s", id, status)
    return results
예제 #5
0
def extract_csv_meta(header, content=None, id='', skip_guess_encoding=False):
    """Guess the CSV dialect of *content*, decoded as UTF-8.

    Args:
        header: Not used by this implementation.
        content: Object whose ``decode("utf-8")`` result is passed to
            ``dialect.guessDialect``.
        id: Identifier used only as a prefix in the warning log message.
        skip_guess_encoding: Not used by this implementation.

    Returns:
        ``{'dialect': <guessed dialect>}``; the value is ``{}`` when decoding
        or guessing raised an exception.
    """
    log = logging.getLogger(__name__)

    try:
        guessed = dialect.guessDialect(content.decode("utf-8"))
    except Exception as err:
        # Best-effort: any failure is logged and reported as an empty dialect.
        log.warning('(%s)  %s', id, err.args)
        guessed = {}

    return {'dialect': guessed}