Ejemplo n.º 1
0
def get_column_types(
        data: io.BytesIO) -> Tuple[List[str], List[types.CellType]]:
    """Guess the header names and column types of a CSV sample.

    The sample is wrapped in a messytables ``CSVTableSet``; its first table
    is run through ``headers_guess`` and then a strict ``type_guess``, both
    working from that table's sample rows.

    This is still a work in progress because of the poor state of the
    DV360/CM CSV data formats, so treat the result as a best guess.

    Arguments:
        data (io.BytesIO): sample of the CSV file

    Returns:
        (List[str], List[types.CellType]): tuple of the guessed header names
            and the guessed type of each column
    """
    first_table = messytables.CSVTableSet(data).tables[0]

    header_offset, header_names = messytables.headers_guess(
        first_table.sample)

    # Register the detected headers, then an offset one row past the header
    # position, before the type guess reads the sample again.
    first_table.register_processor(
        messytables.headers_processor(header_names))
    first_table.register_processor(
        messytables.offset_processor(header_offset + 1))

    column_types = messytables.type_guess(first_table.sample, strict=True)
    return header_names, column_types
Ejemplo n.º 2
0
def determine_messytables_types(file_handle, types=messytables.types.TYPES):
    """Guess the headers and column types of a CSV file with messytables.

    Side effect: the first sample row is printed to stdout.

    :param file_handle: file handle opened in binary mode
    :param types: candidate types passed on to ``messytables.type_guess``;
        defaults to ``messytables.types.TYPES``
    :return: (headers, types, row_set) -- the guessed header names, the
        guessed column types, and the first table's row set with the header,
        offset and type processors registered on it
    """
    # The handle is always treated as CSV and only the first table of the
    # resulting set is used. (messytables.any_tableset(file_handle) is the
    # alternative when the kind of file is not known.)
    first_table = messytables.CSVTableSet(file_handle).tables[0]

    # The row set itself can only be iterated once, so peeking goes through
    # its sample instead.
    first_sample_row = next(first_table.sample)
    print(first_sample_row)

    # Locate the header row and name the columns after it.
    header_offset, header_names = messytables.headers_guess(first_table.sample)
    first_table.register_processor(
        messytables.headers_processor(header_names))

    # One past the header offset, so iteration begins at the content.
    first_table.register_processor(
        messytables.offset_processor(header_offset + 1))

    # Guess the column types from the sample, then have the row set apply
    # them to every row it yields from here on.
    guessed_types = messytables.type_guess(
        first_table.sample, types, strict=True)
    first_table.register_processor(messytables.types_processor(guessed_types))

    return header_names, guessed_types, first_table
def headersDataTypes(CSV):
    '''Get column headers and data types using messytables.

    Opens ``path[0] + CSV`` in binary mode, guesses the header row and the
    column types from the first table's sample rows, and prints both guesses
    to stdout along the way.

    :param CSV: file name appended directly to ``path[0]`` (no separator is
        inserted here)
    :return: (headers, dtypedict) -- the list of guessed header names and a
        dict mapping each header name to its guessed messytables type,
        intended for a pandas CSV-to-DataFrame conversion. If more types
        than headers are guessed, the surplus types are left out.
    '''
    # NOTE(review): `path` comes from the enclosing module -- presumably
    # `sys.path`; confirm against the file's imports.
    # The context manager closes the handle on every exit path; all use of
    # the row set stays inside it so nothing reads from a closed file.
    with open(path[0] + CSV, 'rb') as table:
        # A set of tables over the file object, although it'll just be one.
        tableset = messytables.CSVTableSet(table)
        rowset = tableset.tables[0]  # first and only table, as an iterator

        # Guess the header names and the offset of the header row.
        offset, headers = messytables.headers_guess(rowset.sample)
        # Single-argument print() emits the same text under Python 2 and 3.
        print("Here is the offset %s \nHere are the headers:\n%s"
              % (str(offset), str(headers)))

        # Establish the headers in the table, then start iteration at the
        # content rather than at the header.
        rowset.register_processor(messytables.headers_processor(headers))
        rowset.register_processor(messytables.offset_processor(offset + 1))

        # Guess the column types; returned as a list in column order.
        column_types = messytables.type_guess(rowset.sample, strict=True)

    print("Here are the data types %s" % (str(column_types),))

    # Pair every header with the type guessed for the same column position.
    dtypedict = dict(zip(headers, column_types))
    return headers, dtypedict
def is_psv(buf, log):
    '''Report whether the buffer is a PSV (pipe-separated values) file.

    The buffer is parsed as a messytables CSV table set using '|' as the
    delimiter; the verdict itself is whatever `_is_spreadsheet` returns for
    that table set.
    '''
    pipe_tables = messytables.CSVTableSet(StringIO.StringIO(buf),
                                          delimiter='|')
    return _is_spreadsheet(pipe_tables, 'PSV', log)
def is_csv(buf, log):
    '''Report whether the buffer is a CSV file.

    The buffer is parsed as a messytables CSV table set with the library's
    default delimiter handling; the verdict itself is whatever
    `_is_spreadsheet` returns for that table set.
    '''
    csv_tables = messytables.CSVTableSet(StringIO.StringIO(buf))
    return _is_spreadsheet(csv_tables, 'CSV', log)
Ejemplo n.º 6
0
def is_psv(buf):
    '''Report whether the buffer is a PSV (pipe-separated values) file.

    The buffer is passed through `six.ensure_binary`, wrapped in a bytes
    stream and parsed as a messytables CSV table set using '|' as the
    delimiter; the verdict itself is whatever `_is_spreadsheet` returns.
    '''
    raw_bytes = six.ensure_binary(buf)
    pipe_tables = messytables.CSVTableSet(six.BytesIO(raw_bytes),
                                          delimiter='|')
    return _is_spreadsheet(pipe_tables, 'PSV')