def test_strict_type_guessing_with_large_file(self):
    # Strict type guessing over the '211.csv' fixture: after skipping the
    # guessed header row, all 96 columns must be typed as listed below.
    fh = horror_fobj('211.csv')
    # NOTE(review): horror_fobj is assumed to return an open file object;
    # it is closed explicitly so the test does not leak the handle.
    try:
        rows = CSVTableSet(fh).tables[0]
        # Only the offset is needed here; the guessed header names are not.
        offset, _ = headers_guess(rows.sample)
        # Start reading right after the header row.
        rows.register_processor(offset_processor(offset + 1))
        types = [StringType, IntegerType, DecimalType, DateUtilType]
        guessed_types = type_guess(rows.sample, types=types, strict=True)

        # Every column is expected to be a StringType except the indexes
        # named here (0-based column positions).
        non_string_columns = {
            0: IntegerType,
            6: IntegerType,
            27: IntegerType,
            29: DecimalType,
            30: DecimalType,
            54: IntegerType,
            70: IntegerType,
            89: DateUtilType,
            90: DateUtilType,
            91: DateUtilType,
            92: DateUtilType,
        }
        expected_types = [
            non_string_columns.get(index, StringType)()
            for index in range(96)
        ]

        assert_equal(len(guessed_types), 96)
        assert_equal(guessed_types, expected_types)
    finally:
        fh.close()
def test_type_guess_strict(self):
    # Strict guessing over a sample with a few unparseable cells: the
    # columns containing 'foo' and 'bar' are expected to come out as
    # StringType, the others as the numeric/date types asserted below.
    import io
    import locale

    # setlocale() changes process-wide state, so remember the current
    # setting and restore it afterwards; otherwise the en_GB locale leaks
    # into every test that runs after this one.
    previous_locale = locale.setlocale(locale.LC_ALL)
    # NOTE(review): presumably required so that the thousands separators
    # and English month names in the sample parse; confirm against the
    # type implementations.
    locale.setlocale(locale.LC_ALL, 'en_GB.UTF-8')
    try:
        # A bytes stream is used, as in test_strict_guessing_handles_padding,
        # because the Python 2-only StringIO module is absent on Python 3.
        csv_file = io.BytesIO(b'''
            1, 2012/2/12, 2, 2,02 October 2011,"100.234354"
            2, 2012/2/12, 1.1, 0,1 May 2011,"100,000,000.12"
            foo, bar, 1500, 0,,"NaN"
            4, 2012/2/12, 42,"-2,000",24 October 2012,"42"
            ,,,,,''')
        rows = CSVTableSet(csv_file).tables[0]
        guessed_types = type_guess(rows.sample, strict=True)
        assert_equal(guessed_types, [
            StringType(),
            StringType(),
            DecimalType(),
            IntegerType(),
            DateType('%d %B %Y'),
            DecimalType(),
        ])
    finally:
        locale.setlocale(locale.LC_ALL, previous_locale)
def test_non_strict_guessing_handles_padding(self):
    # Non-strict guessing over three columns where the middle one holds
    # only blank padding. The first column is still expected to be an
    # IntegerType even though one of its cells is 'foo'.
    import io

    # A bytes stream is used, as in test_strict_guessing_handles_padding,
    # because the Python 2-only StringIO module is absent on Python 3.
    csv_file = io.BytesIO(b'''
        1, , 2
        2, , 1.1
        foo, , 1500''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample, strict=False)
    assert_equal(len(guessed_types), 3)
    assert_equal(guessed_types,
                 [IntegerType(), StringType(), DecimalType()])
def test_strict_guessing_handles_padding(self):
    # Strict guessing over three columns where the middle one holds only
    # blank padding. With strict=True the first column, which contains
    # 'foo', is expected to be a StringType.
    stream = io.BytesIO(b'''
        1, , 2
        2, , 1.1
        foo, , 1500''')
    table = CSVTableSet(stream).tables[0]
    expected_types = [StringType(), StringType(), DecimalType()]

    guessed_types = type_guess(table.sample, strict=True)

    # Check the column count first, then the exact types.
    assert_equal(len(guessed_types), 3)
    assert_equal(guessed_types, expected_types)
def test_guessing_uses_first_in_case_of_tie(self):
    # For the same single-column sample, the guessed type is expected to
    # follow the order of the candidate list: whichever of DecimalType or
    # IntegerType is listed first is the one returned.
    import io

    # A bytes stream is used, as in test_strict_guessing_handles_padding,
    # because the Python 2-only StringIO module is absent on Python 3.
    csv_file = io.BytesIO(b'''
        2
        1.1
        1500''')
    rows = CSVTableSet(csv_file).tables[0]

    guessed_types = type_guess(
        rows.sample, types=[DecimalType, IntegerType], strict=False)
    assert_equal(guessed_types, [DecimalType()])

    guessed_types = type_guess(
        rows.sample, types=[IntegerType, DecimalType], strict=False)
    assert_equal(guessed_types, [IntegerType()])
def test_type_guess(self):
    # type_guess with its default arguments over a six-column sample that
    # mixes numbers, two date formats, yes/no/true/false words and 0/1
    # flags, with a few blank or unparseable cells in each column.
    import io

    # A bytes stream is used, as in test_strict_guessing_handles_padding,
    # because the Python 2-only StringIO module is absent on Python 3.
    csv_file = io.BytesIO(b'''
        1, 2012/2/12, 2, 02 October 2011, yes, 1
        2, 2012/2/12, 2, 02 October 2011, true, 1
        2.4, 2012/2/12, 1, 1 May 2011, no, 0
        foo, bar, 1000, , false, 0
        4.3, , 42, 24 October 2012,,
        , 2012/2/12, 21, 24 December 2013, true, 1''')
    rows = CSVTableSet(csv_file).tables[0]
    guessed_types = type_guess(rows.sample)
    assert_equal(guessed_types, [
        DecimalType(),
        DateType('%Y/%m/%d'),
        IntegerType(),
        DateType('%d %B %Y'),
        BoolType(),
        BoolType(),
    ])
row_set = CSVTableSet(f, delimiter=delimiter).tables[0] offset, headers = headers_guess(row_set.sample) row_set.register_processor(headers_processor(headers)) row_set.register_processor(offset_processor(offset + 1)) types = type_guess(row_set.sample) #types = type_guess(row_set.sample, strict=True) print('guessed types:', types) # constructing ddl cols = [] for indx, typ in enumerate(types): if typ == StringType(): cols.append(" `a%s` string" % (indx)) elif typ == DateType(date_format): cols.append(" `a%s` date" % (indx)) elif typ == DecimalType(): cols.append(" `a%s` double" % (indx)) elif typ == IntegerType(): cols.append(" `a%s` int" % (indx)) elif typ == BoolType(): cols.append(" `a%s` boolean" % (indx)) else: raise Exception("A type of column %indx cannot be handled. %s " % (indx, typ)) cols_str = ",\n".join(cols) ddl = '''---------------------------------------------------------- CREATE EXTERNAL TABLE IF NOT EXISTS default.%s ( %s ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'