def test_strict_type_guessing_with_large_file(self):
    """Guess the column types of the '211.csv' fixture and check all 96.

    Only string, integer, decimal and dateutil types are offered as
    candidates to ``type_guess``.
    """
    fobj = horror_fobj('211.csv')
    table = CSVTableSet(fobj).tables[0]

    # Register the offset processor before sampling again for type guessing.
    # NOTE(review): presumably ``offset + 1`` skips the header row itself --
    # confirm against headers_guess/offset_processor.
    header_offset, _headers = headers_guess(table.sample)
    table.register_processor(offset_processor(header_offset + 1))

    candidates = [StringType, IntegerType, DecimalType, DateUtilType]
    # NOTE(review): the third positional argument looks like the ``strict``
    # flag -- confirm against the type_guess signature.
    guessed = type_guess(table.sample, candidates, True)

    expected = [
        IntegerType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), IntegerType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), IntegerType(),
        StringType(), DecimalType(), DecimalType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), IntegerType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), IntegerType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), DateUtilType(), DateUtilType(), DateUtilType(),
        DateUtilType(), StringType(), StringType(), StringType(),
    ]

    assert_equal(len(guessed), 96)
    assert_equal(guessed, expected)
def test_strict_type_guessing_with_large_file(self):
    """Check the guessed types for every column of the '211.csv' fixture.

    The fixture is expected to yield 96 columns; all of them are strings
    except for the handful of positions listed below.
    """
    source = horror_fobj('211.csv')
    row_set = CSVTableSet(source).tables[0]

    first_row, _unused_headers = headers_guess(row_set.sample)
    row_set.register_processor(offset_processor(first_row + 1))

    allowed = [StringType, IntegerType, DecimalType, DateUtilType]
    result = type_guess(row_set.sample, allowed, True)
    assert_equal(len(result), 96)

    # Zero-based positions of the columns that are NOT plain strings.
    exceptions = {
        0: IntegerType,
        6: IntegerType,
        27: IntegerType,
        29: DecimalType,
        30: DecimalType,
        54: IntegerType,
        70: IntegerType,
        89: DateUtilType,
        90: DateUtilType,
        91: DateUtilType,
        92: DateUtilType,
    }
    # A fresh instance per column, StringType wherever no exception applies.
    expected = [exceptions.get(position, StringType)()
                for position in range(96)]
    assert_equal(result, expected)
def test_file_with_few_strings_among_integers(self):
    """Guess the column types of the 'mixedGLB.csv' fixture and check all 19.

    Only string, integer, decimal and dateutil types are offered as
    candidates to ``type_guess``.
    """
    fh = horror_fobj('mixedGLB.csv')
    rows = CSVTableSet(fh).tables[0]

    # NOTE(review): presumably ``offset + 1`` skips the header row itself --
    # confirm against headers_guess/offset_processor.
    offset, headers = headers_guess(rows.sample)
    rows.register_processor(offset_processor(offset + 1))

    types = [StringType, IntegerType, DecimalType, DateUtilType]
    guessed_types = type_guess(rows.sample, types, True)
    assert_equal(len(guessed_types), 19)

    # Parenthesised form is valid on both Python 2 and Python 3; the bare
    # ``print x`` statement is a SyntaxError on Python 3.
    print(guessed_types)

    assert_equal(guessed_types, [
        IntegerType(), IntegerType(), IntegerType(), IntegerType(),
        IntegerType(), IntegerType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(),
        IntegerType(),
        StringType(), StringType(),
    ])
def test_file_with_few_strings_among_integers(self):
    """Check the guessed types for the 19 columns of 'mixedGLB.csv'.

    String, integer, decimal and dateutil types are the only candidates
    passed to ``type_guess``.
    """
    fh = horror_fobj('mixedGLB.csv')
    rows = CSVTableSet(fh).tables[0]

    offset, headers = headers_guess(rows.sample)
    # The offset processor is registered before the sample is read again
    # for type guessing.
    rows.register_processor(offset_processor(offset + 1))

    types = [StringType, IntegerType, DecimalType, DateUtilType]
    guessed_types = type_guess(rows.sample, types, True)
    assert_equal(len(guessed_types), 19)

    # Was a Python-2-only ``print`` statement; the call form below prints the
    # same text on Python 2 and also parses on Python 3.
    print(guessed_types)

    assert_equal(guessed_types, [
        IntegerType(), IntegerType(), IntegerType(), IntegerType(),
        IntegerType(), IntegerType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(), StringType(), StringType(),
        StringType(), StringType(),
        IntegerType(),
        StringType(), StringType(),
    ])
def prepare_csv_rows(csv_file):
    """Return the first table of ``csv_file`` with processors registered.

    Header, offset and type processors are registered on the row set, in
    that order, before it is returned.

    Note: this assigns ``DateType.formats`` on the class, so the date
    formats are changed for every user of ``DateType``, not only this call.
    """
    table = CSVTableSet(csv_file).tables[0]
    header_offset, raw_headers = headers_guess(table.sample)

    # Drop falsy (blank) headers, then convert the rest to column names.
    column_names = []
    for raw_header in raw_headers:
        if raw_header:
            column_names.append(convert_header_to_column_name(raw_header))

    table.register_processor(headers_processor_remove_blank(column_names))
    table.register_processor(offset_processor(header_offset + 1))

    DateType.formats = create_date_formats(day_first=False)

    # We never want boolean types, so they are left out of the candidates.
    candidate_types = [StringType, DecimalType, IntegerType, DateType]
    guessed = type_guess(table.sample, types=candidate_types, strict=True)
    table.register_processor(types_processor(guessed))

    return table
def csvimport_table(name):
    """Parse the CSV data fixture ``name`` and collect its rows.

    The first table of the fixture gets header, offset and (strict) type
    processors registered and is then run through ``parse_table``.

    Returns:
        A ``(fields, rows)`` tuple. ``rows`` is the list of rows yielded by
        ``parse_table``; ``fields`` is the fields value yielded alongside the
        last row, or ``None`` when the table yields no rows at all.
    """
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    row_set = CSVTableSet(data_fixture(name)).tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    # Bind ``fields`` up front: previously an empty table left it unbound and
    # the ``return`` raised UnboundLocalError.
    fields = None
    rows = []
    for fields, row, _samples in parse_table(row_set):
        rows.append(row)
    return fields, rows
def csvimport_table(name):
    """Load the data fixture ``name`` as CSV and return its parsed content.

    Header, offset and strict type-guessing processors are registered on the
    fixture's first table before it is handed to ``parse_table``.

    Returns:
        ``(fields, rows)`` where ``rows`` lists every row produced by
        ``parse_table`` and ``fields`` is the fields value from the final
        iteration (``None`` if nothing was produced).
    """
    from messytables import CSVTableSet, type_guess
    from messytables import types_processor, headers_guess
    from messytables import headers_processor, offset_processor
    from spendb.etl.extract import parse_table

    row_set = CSVTableSet(data_fixture(name)).tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    types = type_guess(row_set.sample, strict=True)
    row_set.register_processor(types_processor(types))

    # Default for the empty-table case; without it the ``return`` below
    # raised UnboundLocalError when the loop body never ran.
    fields = None
    rows = []
    for fields, row, _samples in parse_table(row_set):
        rows.append(row)
    return fields, rows
print("file path:", s3path) print("delimiter:", delimiter) print("date_format:", date_format) # getting data generator = (i for i in smart_open.smart_open(s3path)) head_lst = list(next(generator) for _ in range(100)) head_str = map( lambda x: str(x).strip("b").strip("\"").strip('\\n').strip("\'"), head_lst) head_join = "\n".join(head_str) f = io.BytesIO(bytes(head_join, 'utf-8')) # gussing row_set = CSVTableSet(f, delimiter=delimiter).tables[0] offset, headers = headers_guess(row_set.sample) row_set.register_processor(headers_processor(headers)) row_set.register_processor(offset_processor(offset + 1)) types = type_guess(row_set.sample) #types = type_guess(row_set.sample, strict=True) print('guessed types:', types) # constructing ddl cols = [] for indx, typ in enumerate(types): if typ == StringType(): cols.append(" `a%s` string" % (indx)) elif typ == DateType(date_format): cols.append(" `a%s` date" % (indx)) elif typ == DecimalType(): cols.append(" `a%s` double" % (indx)) elif typ == IntegerType():