def test_null_process(self): fh = horror_fobj('null.csv') table_set = CSVTableSet(fh) row_set = table_set.tables[0] row_set.register_processor(null_processor(['null'])) data = list(row_set) nones = [[x.value is None for x in row] for row in data] assert_equal(nones[0], [False, True, False, False]) assert_equal(nones[1], [False, False, False, True]) assert_equal(nones[2], [False, True, False, False]) types = type_guess(row_set.sample, strict=True) expected_types = [ IntegerType(), IntegerType(), IntegerType(), IntegerType() ] assert_equal(types, expected_types) row_set.register_processor(types_processor(types)) # after applying the types, '' should become None for int columns data = list(row_set) nones = [[x.value is None for x in row] for row in data] assert_equal(nones[0], [False, True, False, False]) assert_equal(nones[1], [False, False, False, True]) assert_equal(nones[2], [False, True, True, True])
def test_strict_type_guessing_with_large_file(self): fh = horror_fobj('211.csv') rows = CSVTableSet(fh).tables[0] offset, headers = headers_guess(rows.sample) rows.register_processor(offset_processor(offset + 1)) types = [StringType, IntegerType, DecimalType, DateUtilType] guessed_types = type_guess(rows.sample, types, True) assert_equal(len(guessed_types), 96) assert_equal(guessed_types, [ IntegerType(), StringType(), StringType(), StringType(), StringType(), StringType(), IntegerType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), IntegerType(), StringType(), DecimalType(), DecimalType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), IntegerType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), IntegerType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), DateUtilType(), DateUtilType(), DateUtilType(), DateUtilType(), StringType(), StringType(), StringType()])
def test_apply_null_values(self): fh = horror_fobj('null.csv') table_set = CSVTableSet(fh) row_set = table_set.tables[0] types = type_guess(row_set.sample, strict=True) expected_types = [ IntegerType(), StringType(), IntegerType(), StringType() ] assert_equal(types, expected_types) row_set.register_processor(types_processor(types)) data = list(row_set) # treat null as non empty text and 0 as non empty integer assert [x.empty for x in data[0]] == [False, False, False, False] assert [x.empty for x in data[1]] == [False, False, False, False] assert [x.empty for x in data[2]] == [False, False, True, True] assert [x.empty for x in data[3]] == [False, False, False, False] assert [x.empty for x in data[4]] == [False, False, False, True] assert [x.empty for x in data[5]] == [False, False, False, True] # we expect None for Integers and "" for empty strings in CSV assert [x.value for x in data[2]] == [3, "null", None, ""], data[2]
def test_file_with_few_strings_among_integers(self): fh = horror_fobj('mixedGLB.csv') rows = CSVTableSet(fh).tables[0] offset, headers = headers_guess(rows.sample) rows.register_processor(offset_processor(offset + 1)) types = [StringType, IntegerType, DecimalType, DateUtilType] guessed_types = type_guess(rows.sample, types, True) assert_equal(len(guessed_types), 19) print guessed_types assert_equal(guessed_types, [ IntegerType(), IntegerType(), IntegerType(), IntegerType(), IntegerType(), IntegerType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), StringType(), IntegerType(), StringType(), StringType() ])
def test_non_strict_guessing_handles_padding(self): csv_file = StringIO.StringIO(''' 1, , 2 2, , 1.1 foo, , 1500''') rows = CSVTableSet(csv_file).tables[0] guessed_types = type_guess(rows.sample, strict=False) assert_equal(len(guessed_types), 3) assert_equal(guessed_types, [IntegerType(), StringType(), DecimalType()])
def test_guessing_uses_first_in_case_of_tie(self): csv_file = StringIO.StringIO(''' 2 1.1 1500''') rows = CSVTableSet(csv_file).tables[0] guessed_types = type_guess( rows.sample, types=[DecimalType, IntegerType], strict=False) assert_equal(guessed_types, [DecimalType()]) guessed_types = type_guess( rows.sample, types=[IntegerType, DecimalType], strict=False) assert_equal(guessed_types, [IntegerType()])
def test_read_type_guess_simple(self): fh = horror_fobj('simple.csv') table_set = CSVTableSet(fh) row_set = table_set.tables[0] types = type_guess(row_set.sample) expected_types = [DateType("%Y-%m-%d"), IntegerType(), StringType()] assert_equal(types, expected_types) row_set.register_processor(types_processor(types)) data = list(row_set) header_types = [c.type for c in data[0]] assert_equal(header_types, [StringType()] * 3) row_types = [c.type for c in data[2]] assert_equal(expected_types, row_types)
def test_type_guess(self): csv_file = StringIO.StringIO(''' 1, 2012/2/12, 2, 02 October 2011, yes, 1 2, 2012/2/12, 2, 02 October 2011, true, 1 2.4, 2012/2/12, 1, 1 May 2011, no, 0 foo, bar, 1000, , false, 0 4.3, , 42, 24 October 2012,, , 2012/2/12, 21, 24 December 2013, true, 1''') rows = CSVTableSet(csv_file).tables[0] guessed_types = type_guess(rows.sample) assert_equal(guessed_types, [ DecimalType(), DateType('%Y/%m/%d'), IntegerType(), DateType('%d %B %Y'), BoolType(), BoolType()])
def test_type_guess_strict(self): import locale locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8') csv_file = StringIO.StringIO(''' 1, 2012/2/12, 2, 2,02 October 2011,"100.234354" 2, 2012/2/12, 1.1, 0,1 May 2011,"100,000,000.12" foo, bar, 1500, 0,,"NaN" 4, 2012/2/12, 42,"-2,000",24 October 2012,"42" ,,,,,''') rows = CSVTableSet(csv_file).tables[0] guessed_types = type_guess(rows.sample, strict=True) assert_equal(guessed_types, [ StringType(), StringType(), DecimalType(), IntegerType(), DateType('%d %B %Y'), DecimalType()])
row_set.register_processor(headers_processor(headers)) row_set.register_processor(offset_processor(offset + 1)) types = type_guess(row_set.sample) #types = type_guess(row_set.sample, strict=True) print('guessed types:', types) # constructing ddl cols = [] for indx, typ in enumerate(types): if typ == StringType(): cols.append(" `a%s` string" % (indx)) elif typ == DateType(date_format): cols.append(" `a%s` date" % (indx)) elif typ == DecimalType(): cols.append(" `a%s` double" % (indx)) elif typ == IntegerType(): cols.append(" `a%s` int" % (indx)) elif typ == BoolType(): cols.append(" `a%s` boolean" % (indx)) else: raise Exception("A type of column %indx cannot be handled. %s " % (indx, typ)) cols_str = ",\n".join(cols) ddl = '''---------------------------------------------------------- CREATE EXTERNAL TABLE IF NOT EXISTS default.%s ( %s ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' WITH SERDEPROPERTIES ( "separatorChar" = "%s",
def helper(value): return any(i.test(value) for i in IntegerType.instances())