def keyable_p(column):
    """Return True iff `column` could serve as a key.

    A column qualifies when it contains no nulls or NaNs, no
    non-integral numeric values, and no repeated values.
    """
    # `unique' can't cope with NaNs, so reject them early.
    for value in column:
        if value is None:
            return False
        if isinstance(value, float) and math.isnan(value):
            return False
    try:
        as_floats = [float(value) for value in column]
    except ValueError:
        # Not all values parse as numbers (e.g. free text): keyable
        # iff every value is distinct.
        return len(column) == len(unique(column))
    # Reject columns containing non-integral numeric values.
    if not all(f.is_integer() for f in as_floats):
        return False
    return len(column) == len(unique(column))
def numerical_p(column, count_cutoff, ratio_cutoff):
    """Return True iff `column` looks numerical under the given cutoffs.

    The column qualifies when its number of distinct non-NaN values
    exceeds `count_cutoff` and the ratio of distinct values to total
    values exceeds `ratio_cutoff`.
    """
    distinct = len(unique([v for v in column if not math.isnan(v)]))
    # Short-circuit keeps the ratio division from ever seeing an
    # empty column unless count_cutoff is negative (as before).
    return (distinct > count_cutoff and
        float(distinct) / float(len(column)) > ratio_cutoff)
def keyable_p(column):
    """Return True iff `column` could serve as a key.

    A column qualifies when it has no nulls or NaNs, is not composed
    entirely of integral floats, and has no repeated values.
    """
    # `unique' can't cope with NaNs, so reject them early.
    for value in column:
        if value is None:
            return False
        if isinstance(value, float) and math.isnan(value):
            return False
    # NOTE(review): a column that is all floats, all integral, is
    # rejected outright — presumably such columns are expected to have
    # been integerified upstream; confirm against call sites.
    if all(isinstance(value, float) for value in column):
        if all(float(value).is_integer() for value in column):
            return False
    return len(column) == len(unique(column))
def bayesdb_guess_stattypes(column_names, rows, null_values=None,
        numcat_count=None, numcat_ratio=None, distinct_ratio=None,
        nullify_ratio=None, overrides=None):
    """Heuristically guess statistical types for the data in `rows`.

    Return a list of statistical types corresponding to the columns
    named in the list `column_names`.

    :param set null_values: values to nullify.
    :param int numcat_count: number of distinct values below which
        columns whose values can all be parsed as numbers will be
        considered categorical anyway
    :param real numcat_ratio: ratio of distinct values to total values
        below which columns whose values can all be parsed as numbers
        will be considered categorical anyway
    :param real distinct_ratio: ratio of distinct values to total
        values above which a column will be ignored as a pseudo-key
        (only if count > numcat_count).
    :param real nullify_ratio: ratio of count of the most numerous
        value to total number of values above which the most numerous
        value should be nullified (set to 1 to turn off).
    :param list overrides: list of ``(name, stattype)``, overriding any
        guessed statistical type for columns by those names

    In addition to statistical types, the overrides may specify ``key``
    or ``ignore``.
    """
    # Fill in default arguments.
    if null_values is None:
        null_values = set(("", "N/A", "none", "None"))
    if numcat_count is None:
        numcat_count = 20
    if numcat_ratio is None:
        numcat_ratio = 0.02
    if distinct_ratio is None:
        distinct_ratio = 0.9
    if nullify_ratio is None:
        nullify_ratio = 0.9
    if overrides is None:
        overrides = []

    # Build a set of the column names.  Names are compared casefolded
    # throughout, so `ID' and `id' count as duplicates.
    column_name_set = set()
    duplicates = set()
    for name in column_names:
        if casefold(name) in column_name_set:
            duplicates.add(name)
        column_name_set.add(casefold(name))
    if 0 < len(duplicates):
        # NOTE(review): comma placement makes this repr(x), not a
        # 1-tuple; harmless since the format has a single %s.
        raise ValueError('Duplicate column names: %s' %
            (repr(list(duplicates),)))

    # Build a map for the overrides.
    #
    # XXX Support more than just stattype: allow arbitrary column
    # descriptions.
    override_map = {}
    unknown = set()
    duplicates = set()
    for name, stattype in overrides:
        # Reject overrides naming columns we don't have, and report
        # all of them at once rather than failing on the first.
        if casefold(name) not in column_name_set:
            unknown.add(name)
            continue
        if casefold(name) in override_map:
            duplicates.add(name)
            continue
        override_map[casefold(name)] = casefold(stattype)
    if 0 < len(unknown):
        raise ValueError('Unknown columns overridden: %s' %
            (repr(list(unknown)),))
    if 0 < len(duplicates):
        raise ValueError('Duplicate columns overridden: %s' %
            (repr(list(duplicates)),))

    # Sanity-check the inputs: every row must have exactly one value
    # per named column.
    ncols = len(column_names)
    assert ncols == len(unique(map(casefold, column_names)))
    for ri, row in enumerate(rows):
        if len(row) < ncols:
            raise ValueError('Row %d: Too few columns: %d < %d' %
                (ri, len(row), ncols))
        if len(row) > ncols:
            raise ValueError('Row %d: Too many columns: %d > %d' %
                (ri, len(row), ncols))

    # Find a key first, if it has been specified as an override, so
    # that the guessing pass below knows whether a key already exists.
    key = None
    duplicate_keys = set()
    for ci, column_name in enumerate(column_names):
        if casefold(column_name) in override_map:
            if override_map[casefold(column_name)] == 'key':
                if key is not None:
                    duplicate_keys.add(column_name)
                    continue
                column = [row[ci] for row in rows]
                # Use the integerified rendering of the column when
                # integerify produces one.
                ints = integerify(column)
                if ints:
                    column = ints
                if not keyable_p(column):
                    raise ValueError('Column non-unique but specified as key'
                        ': %s' % (repr(column_name),))
                key = column_name
    if 0 < len(duplicate_keys):
        raise ValueError('Multiple columns overridden as keys: %s' %
            (repr(list(duplicate_keys)),))

    # Now go through and guess the other column stattypes or use the
    # override.
    stattypes = []
    for ci, column_name in enumerate(column_names):
        if casefold(column_name) in override_map:
            stattype = override_map[casefold(column_name)]
        else:
            # Nullify configured null markers before guessing.
            column = nullify(null_values, rows, ci)
            stattype = guess_column_stattype(column,
                distinct_ratio=distinct_ratio,
                nullify_ratio=nullify_ratio,
                numcat_count=numcat_count,
                numcat_ratio=numcat_ratio,
                have_key=(key is not None))
            if stattype == 'key':
                # Only one key is guessed: record it so subsequent
                # columns see have_key=True.
                key = column_name
        stattypes.append(stattype)
    return stattypes
def keyable_p(column):
    """Return True iff `column` could serve as a key.

    A column qualifies when it contains no nulls or NaNs and no
    repeated values.
    """
    # `unique' can't cope with NaNs, so reject them early.
    for value in column:
        if value is None:
            return False
        if isinstance(value, float) and math.isnan(value):
            return False
    return len(column) == len(unique(column))
def bayesdb_guess_stattypes(column_names, rows, count_cutoff=None,
        ratio_cutoff=None, overrides=None):
    """Heuristically guess statistical types for the data in `rows`.

    Return a list of statistical types corresponding to the columns
    named in the list `column_names`.

    :param int count_cutoff: number of distinct values below which
        columns whose values can all be parsed as numbers will be
        considered categorical anyway
    :param real ratio_cutoff: ratio of distinct values to total values
        below which columns whose values can all be parsed as numbers
        will be considered categorical anyway
    :param list overrides: list of ``(name, stattype)``, overriding any
        guessed statistical type for columns by those names

    In addition to statistical types, the overrides may specify ``key``
    or ``ignore``.

    NOTE(review): this file defines bayesdb_guess_stattypes more than
    once; at import time the later definition shadows the earlier one.
    """
    # Fill in default arguments.
    if count_cutoff is None:
        count_cutoff = 20
    if ratio_cutoff is None:
        ratio_cutoff = 0.02
    if overrides is None:
        overrides = []

    # Build a set of the column names.  Names are compared casefolded
    # throughout, so `ID' and `id' count as duplicates.
    column_name_set = set()
    duplicates = set()
    for name in column_names:
        if casefold(name) in column_name_set:
            duplicates.add(name)
        column_name_set.add(casefold(name))
    if 0 < len(duplicates):
        # NOTE(review): comma placement makes this repr(x), not a
        # 1-tuple; harmless since the format has a single %s.
        raise ValueError('Duplicate column names: %s' %
            (repr(list(duplicates),)))

    # Build a map for the overrides.
    #
    # XXX Support more than just stattype: allow arbitrary column
    # descriptions.
    override_map = {}
    unknown = set()
    duplicates = set()
    for name, stattype in overrides:
        # Reject overrides naming columns we don't have, and report
        # all of them at once rather than failing on the first.
        if casefold(name) not in column_name_set:
            unknown.add(name)
            continue
        if casefold(name) in override_map:
            duplicates.add(name)
            continue
        override_map[casefold(name)] = casefold(stattype)
    if 0 < len(unknown):
        raise ValueError('Unknown columns overridden: %s' %
            (repr(list(unknown)),))
    if 0 < len(duplicates):
        raise ValueError('Duplicate columns overridden: %s' %
            (repr(list(duplicates)),))

    # Sanity-check the inputs: every row must have exactly one value
    # per named column.
    ncols = len(column_names)
    assert ncols == len(unique(map(casefold, column_names)))
    for ri, row in enumerate(rows):
        if len(row) < ncols:
            raise ValueError('Row %d: Too few columns: %d < %d' %
                (ri, len(row), ncols))
        if len(row) > ncols:
            raise ValueError('Row %d: Too many columns: %d > %d' %
                (ri, len(row), ncols))

    # Find a key first, if it has been specified as an override, so
    # that the guessing pass below knows whether a key already exists.
    key = None
    duplicate_keys = set()
    for ci, column_name in enumerate(column_names):
        if casefold(column_name) in override_map:
            if override_map[casefold(column_name)] == 'key':
                if key is not None:
                    duplicate_keys.add(column_name)
                    continue
                # Prefer the integerified rendering when integerify
                # yields one; otherwise use the raw column values.
                column = integerify(rows, ci)
                if not column:
                    column = [row[ci] for row in rows]
                if not keyable_p(column):
                    raise ValueError('Column non-unique but specified as key'
                        ': %s' % (repr(column_name),))
                key = column_name
    if 0 < len(duplicate_keys):
        raise ValueError('Multiple columns overridden as keys: %s' %
            (repr(list(duplicate_keys)),))

    # Now go through and guess the other column stattypes or use the
    # override.
    stattypes = []
    for ci, column_name in enumerate(column_names):
        if casefold(column_name) in override_map:
            stattype = override_map[casefold(column_name)]
        else:
            # Try integer, then float renderings of the column; fall
            # back to the raw values and mark them non-numericable.
            numericable = True
            column = integerify(rows, ci)
            if not column:
                column = floatify(rows, ci)
                if not column:
                    column = [row[ci] for row in rows]
                    numericable = False
            # At most one key is chosen: the first keyable column wins.
            if key is None and keyable_p(column):
                stattype = 'key'
                key = column_name
            elif numericable and \
                    numerical_p(column, count_cutoff, ratio_cutoff):
                stattype = 'numerical'
            else:
                stattype = 'categorical'
        stattypes.append(stattype)
    return stattypes