def get_column_types(data: io.BytesIO) \
        -> Tuple[List[str], List[types.CellType]]:
    """Guess the header names and column types of a CSV sample.

    The sample is parsed with messytables' CSV API; headers and types are
    best guesses based on the sampled rows of the first table only.

    This is still a WIP due to the parlous state of the DV360/CM CSV data
    formats in general.

    Arguments:
        data (io.BytesIO): sample of the CSV file

    Returns:
        (List[str], List[types.CellType]): tuple of the guessed header
        names and the guessed type of each column
    """
    first_table = messytables.CSVTableSet(data).tables[0]

    # Locate the header row, then register processors so the row set knows
    # the headers and starts on the row after them.
    header_offset, csv_headers = messytables.headers_guess(first_table.sample)
    first_table.register_processor(
        messytables.headers_processor(csv_headers))
    first_table.register_processor(
        messytables.offset_processor(header_offset + 1))

    csv_types = messytables.type_guess(first_table.sample, strict=True)
    return (csv_headers, csv_types)
def determine_messytables_types(file_handle, types=messytables.types.TYPES):
    """
    Guess the headers and column types of a CSV file with messytables and
    register the matching processors on its first table.

    :param file_handle: file handle opened in binary mode
    :param types: candidate types passed on to ``messytables.type_guess``
    :return: (headers, types, row_set)
    """
    # A table set is a collection of tables; only the first one is used.
    row_set = messytables.CSVTableSet(file_handle).tables[0]

    # The row set itself can only be iterated once, so peek via the sample.
    # NOTE: this echoes the first sampled row to stdout.
    first_row = next(row_set.sample)
    print(first_row)

    # Guess the header names and where the header row sits.
    header_offset, headers = messytables.headers_guess(row_set.sample)
    row_set.register_processor(messytables.headers_processor(headers))

    # Skip past the header so iteration begins with content.
    row_set.register_processor(
        messytables.offset_processor(header_offset + 1))

    # Guess the column types and have the row set apply them to each row
    # as it is traversed.
    guessed_types = messytables.type_guess(row_set.sample, types, strict=True)
    row_set.register_processor(messytables.types_processor(guessed_types))

    return headers, guessed_types, row_set
def headersDataTypes(CSV): '''Get column headers and data types using messytables''' table = open(path[0]+CSV, 'rb') # Creates a set of tables as file object, although it'll just be one tableset = messytables.CSVTableSet(table) rowset = tableset.tables[0] # get first and only table as iterator # guesses header names and offset of header, returns headers as list offset, headers = messytables.headers_guess(rowset.sample) print "Here is the offset", str(offset), "\nHere are the headers:\n"\ , str(headers) # test # establish headers in table rowset.register_processor(messytables.headers_processor(headers)) # begin iterator at content, rather than header rowset.register_processor(messytables.offset_processor(offset + 1)) # guess column types, return as list types = messytables.type_guess(rowset.sample, strict=True) print "Here are the data types", str(types) dtypedict = {} # empty dictionary to append columns and datatype needed # for pandas csv to dataframe conversion colcount = 0 # location to append datatypes to match columns in dict for column in types: dtypedict[headers[colcount]]=column colcount+=1 return headers, dtypedict
def is_psv(buf, log):
    '''Check whether the buffer looks like a PSV (pipe-separated) file.

    The buffer is parsed as a pipe-delimited table set and the verdict is
    whatever _is_spreadsheet returns for it (expected to be True for PSV).
    '''
    pipe_tables = messytables.CSVTableSet(StringIO.StringIO(buf),
                                          delimiter='|')
    return _is_spreadsheet(pipe_tables, 'PSV', log)
def is_csv(buf, log):
    '''Check whether the buffer looks like a CSV file.

    The buffer is parsed as a table set with messytables' default CSV
    settings and the verdict is whatever _is_spreadsheet returns for it
    (expected to be True for CSV).
    '''
    csv_tables = messytables.CSVTableSet(StringIO.StringIO(buf))
    return _is_spreadsheet(csv_tables, 'CSV', log)
def is_psv(buf):
    '''Check whether the buffer looks like a PSV (pipe-separated) file.

    The buffer is coerced to bytes, parsed as a pipe-delimited table set,
    and the verdict is whatever _is_spreadsheet returns for it (expected to
    be True for PSV).
    '''
    raw_bytes = six.ensure_binary(buf)
    pipe_tables = messytables.CSVTableSet(six.BytesIO(raw_bytes),
                                          delimiter='|')
    return _is_spreadsheet(pipe_tables, 'PSV')